aboutsummaryrefslogtreecommitdiff
path: root/mm/vmscan.c
diff options
context:
space:
mode:
Diffstat (limited to 'mm/vmscan.c')
-rw-r--r--mm/vmscan.c176
1 files changed, 98 insertions, 78 deletions
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 749cdc110c74..76378bc257e3 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -56,6 +56,7 @@
#include <linux/khugepaged.h>
#include <linux/rculist_nulls.h>
#include <linux/random.h>
+#include <linux/mmu_notifier.h>
#include <asm/tlbflush.h>
#include <asm/div64.h>
@@ -1257,8 +1258,8 @@ retry:
THP_SWPOUT_FALLBACK, 1);
count_vm_event(THP_SWPOUT_FALLBACK);
}
- count_mthp_stat(order, MTHP_STAT_SWPOUT_FALLBACK);
#endif
+ count_mthp_stat(order, MTHP_STAT_SWPOUT_FALLBACK);
if (!add_to_swap(folio))
goto activate_locked_split;
}
@@ -1475,7 +1476,7 @@ free_it:
*/
nr_reclaimed += nr_pages;
- folio_undo_large_rmappable(folio);
+ folio_unqueue_deferred_split(folio);
if (folio_batch_add(&free_folios, folio) == 0) {
mem_cgroup_uncharge_folios(&free_folios);
try_to_unmap_flush();
@@ -1863,7 +1864,7 @@ static unsigned int move_folios_to_lru(struct lruvec *lruvec,
if (unlikely(folio_put_testzero(folio))) {
__folio_clear_lru_flags(folio);
- folio_undo_large_rmappable(folio);
+ folio_unqueue_deferred_split(folio);
if (folio_batch_add(&free_folios, folio) == 0) {
spin_unlock_irq(&lruvec->lru_lock);
mem_cgroup_uncharge_folios(&free_folios);
@@ -2128,7 +2129,7 @@ static void shrink_active_list(unsigned long nr_to_scan,
static unsigned int reclaim_folio_list(struct list_head *folio_list,
struct pglist_data *pgdat)
{
- struct reclaim_stat dummy_stat;
+ struct reclaim_stat stat;
unsigned int nr_reclaimed;
struct folio *folio;
struct scan_control sc = {
@@ -2139,12 +2140,13 @@ static unsigned int reclaim_folio_list(struct list_head *folio_list,
.no_demotion = 1,
};
- nr_reclaimed = shrink_folio_list(folio_list, pgdat, &sc, &dummy_stat, true);
+ nr_reclaimed = shrink_folio_list(folio_list, pgdat, &sc, &stat, true);
while (!list_empty(folio_list)) {
folio = lru_to_folio(folio_list);
list_del(&folio->lru);
folio_putback_lru(folio);
}
+ trace_mm_vmscan_reclaim_pages(pgdat->node_id, sc.nr_scanned, nr_reclaimed, &stat);
return nr_reclaimed;
}
@@ -2601,8 +2603,6 @@ static bool should_clear_pmd_young(void)
* shorthand helpers
******************************************************************************/
-#define LRU_REFS_FLAGS (BIT(PG_referenced) | BIT(PG_workingset))
-
#define DEFINE_MAX_SEQ(lruvec) \
unsigned long max_seq = READ_ONCE((lruvec)->lrugen.max_seq)
@@ -3137,7 +3137,6 @@ static int folio_update_gen(struct folio *folio, int gen)
unsigned long new_flags, old_flags = READ_ONCE(folio->flags);
VM_WARN_ON_ONCE(gen >= MAX_NR_GENS);
- VM_WARN_ON_ONCE(!rcu_read_lock_held());
do {
/* lru_gen_del_folio() has isolated this page? */
@@ -3294,7 +3293,8 @@ static bool get_next_vma(unsigned long mask, unsigned long size, struct mm_walk
return false;
}
-static unsigned long get_pte_pfn(pte_t pte, struct vm_area_struct *vma, unsigned long addr)
+static unsigned long get_pte_pfn(pte_t pte, struct vm_area_struct *vma, unsigned long addr,
+ struct pglist_data *pgdat)
{
unsigned long pfn = pte_pfn(pte);
@@ -3306,13 +3306,20 @@ static unsigned long get_pte_pfn(pte_t pte, struct vm_area_struct *vma, unsigned
if (WARN_ON_ONCE(pte_devmap(pte) || pte_special(pte)))
return -1;
+ if (!pte_young(pte) && !mm_has_notifiers(vma->vm_mm))
+ return -1;
+
if (WARN_ON_ONCE(!pfn_valid(pfn)))
return -1;
+ if (pfn < pgdat->node_start_pfn || pfn >= pgdat_end_pfn(pgdat))
+ return -1;
+
return pfn;
}
-static unsigned long get_pmd_pfn(pmd_t pmd, struct vm_area_struct *vma, unsigned long addr)
+static unsigned long get_pmd_pfn(pmd_t pmd, struct vm_area_struct *vma, unsigned long addr,
+ struct pglist_data *pgdat)
{
unsigned long pfn = pmd_pfn(pmd);
@@ -3324,9 +3331,15 @@ static unsigned long get_pmd_pfn(pmd_t pmd, struct vm_area_struct *vma, unsigned
if (WARN_ON_ONCE(pmd_devmap(pmd)))
return -1;
+ if (!pmd_young(pmd) && !mm_has_notifiers(vma->vm_mm))
+ return -1;
+
if (WARN_ON_ONCE(!pfn_valid(pfn)))
return -1;
+ if (pfn < pgdat->node_start_pfn || pfn >= pgdat_end_pfn(pgdat))
+ return -1;
+
return pfn;
}
@@ -3335,15 +3348,11 @@ static struct folio *get_pfn_folio(unsigned long pfn, struct mem_cgroup *memcg,
{
struct folio *folio;
- /* try to avoid unnecessary memory loads */
- if (pfn < pgdat->node_start_pfn || pfn >= pgdat_end_pfn(pgdat))
- return NULL;
-
folio = pfn_folio(pfn);
if (folio_nid(folio) != pgdat->node_id)
return NULL;
- if (folio_memcg_rcu(folio) != memcg)
+ if (folio_memcg(folio) != memcg)
return NULL;
/* file VMAs can contain anon pages from COW */
@@ -3375,8 +3384,10 @@ static bool walk_pte_range(pmd_t *pmd, unsigned long start, unsigned long end,
struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec);
DEFINE_MAX_SEQ(walk->lruvec);
int old_gen, new_gen = lru_gen_from_seq(max_seq);
+ pmd_t pmdval;
- pte = pte_offset_map_nolock(args->mm, pmd, start & PMD_MASK, &ptl);
+ pte = pte_offset_map_rw_nolock(args->mm, pmd, start & PMD_MASK, &pmdval,
+ &ptl);
if (!pte)
return false;
if (!spin_trylock(ptl)) {
@@ -3384,6 +3395,11 @@ static bool walk_pte_range(pmd_t *pmd, unsigned long start, unsigned long end,
return false;
}
+ if (unlikely(!pmd_same(pmdval, pmdp_get_lockless(pmd)))) {
+ pte_unmap_unlock(pte, ptl);
+ return false;
+ }
+
arch_enter_lazy_mmu_mode();
restart:
for (i = pte_index(start), addr = start; addr != end; i++, addr += PAGE_SIZE) {
@@ -3394,21 +3410,16 @@ restart:
total++;
walk->mm_stats[MM_LEAF_TOTAL]++;
- pfn = get_pte_pfn(ptent, args->vma, addr);
+ pfn = get_pte_pfn(ptent, args->vma, addr, pgdat);
if (pfn == -1)
continue;
- if (!pte_young(ptent)) {
- walk->mm_stats[MM_LEAF_OLD]++;
- continue;
- }
-
folio = get_pfn_folio(pfn, memcg, pgdat, walk->can_swap);
if (!folio)
continue;
- if (!ptep_test_and_clear_young(args->vma, addr, pte + i))
- VM_WARN_ON_ONCE(true);
+ if (!ptep_clear_young_notify(args->vma, addr, pte + i))
+ continue;
young++;
walk->mm_stats[MM_LEAF_YOUNG]++;
@@ -3474,21 +3485,25 @@ static void walk_pmd_range_locked(pud_t *pud, unsigned long addr, struct vm_area
/* don't round down the first address */
addr = i ? (*first & PMD_MASK) + i * PMD_SIZE : *first;
- pfn = get_pmd_pfn(pmd[i], vma, addr);
- if (pfn == -1)
+ if (!pmd_present(pmd[i]))
goto next;
if (!pmd_trans_huge(pmd[i])) {
- if (!walk->force_scan && should_clear_pmd_young())
+ if (!walk->force_scan && should_clear_pmd_young() &&
+ !mm_has_notifiers(args->mm))
pmdp_test_and_clear_young(vma, addr, pmd + i);
goto next;
}
+ pfn = get_pmd_pfn(pmd[i], vma, addr, pgdat);
+ if (pfn == -1)
+ goto next;
+
folio = get_pfn_folio(pfn, memcg, pgdat, walk->can_swap);
if (!folio)
goto next;
- if (!pmdp_test_and_clear_young(vma, addr, pmd + i))
+ if (!pmdp_clear_young_notify(vma, addr, pmd + i))
goto next;
walk->mm_stats[MM_LEAF_YOUNG]++;
@@ -3546,27 +3561,18 @@ restart:
}
if (pmd_trans_huge(val)) {
- unsigned long pfn = pmd_pfn(val);
struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec);
+ unsigned long pfn = get_pmd_pfn(val, vma, addr, pgdat);
walk->mm_stats[MM_LEAF_TOTAL]++;
- if (!pmd_young(val)) {
- walk->mm_stats[MM_LEAF_OLD]++;
- continue;
- }
-
- /* try to avoid unnecessary memory loads */
- if (pfn < pgdat->node_start_pfn || pfn >= pgdat_end_pfn(pgdat))
- continue;
-
- walk_pmd_range_locked(pud, addr, vma, args, bitmap, &first);
+ if (pfn != -1)
+ walk_pmd_range_locked(pud, addr, vma, args, bitmap, &first);
continue;
}
- walk->mm_stats[MM_NONLEAF_TOTAL]++;
-
- if (!walk->force_scan && should_clear_pmd_young()) {
+ if (!walk->force_scan && should_clear_pmd_young() &&
+ !mm_has_notifiers(args->mm)) {
if (!pmd_young(val))
continue;
@@ -3642,10 +3648,8 @@ static void walk_mm(struct mm_struct *mm, struct lru_gen_mm_walk *walk)
.p4d_entry = walk_pud_range,
.walk_lock = PGWALK_RDLOCK,
};
-
int err;
struct lruvec *lruvec = walk->lruvec;
- struct mem_cgroup *memcg = lruvec_memcg(lruvec);
walk->next_addr = FIRST_USER_ADDRESS;
@@ -3658,10 +3662,6 @@ static void walk_mm(struct mm_struct *mm, struct lru_gen_mm_walk *walk)
if (walk->seq != max_seq)
break;
- /* folio_update_gen() requires stable folio_memcg() */
- if (!mem_cgroup_trylock_pages(memcg))
- break;
-
/* the caller might be holding the lock for write */
if (mmap_read_trylock(mm)) {
err = walk_page_range(mm, walk->next_addr, ULONG_MAX, &mm_walk_ops, walk);
@@ -3669,8 +3669,6 @@ static void walk_mm(struct mm_struct *mm, struct lru_gen_mm_walk *walk)
mmap_read_unlock(mm);
}
- mem_cgroup_unlock_pages();
-
if (walk->batched) {
spin_lock_irq(&lruvec->lru_lock);
reset_batch_size(walk);
@@ -4040,13 +4038,13 @@ static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
* the PTE table to the Bloom filter. This forms a feedback loop between the
* eviction and the aging.
*/
-void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
+bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
{
int i;
unsigned long start;
unsigned long end;
struct lru_gen_mm_walk *walk;
- int young = 0;
+ int young = 1;
pte_t *pte = pvmw->pte;
unsigned long addr = pvmw->address;
struct vm_area_struct *vma = pvmw->vma;
@@ -4062,12 +4060,15 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
lockdep_assert_held(pvmw->ptl);
VM_WARN_ON_ONCE_FOLIO(folio_test_lru(folio), folio);
+ if (!ptep_clear_young_notify(vma, addr, pte))
+ return false;
+
if (spin_is_contended(pvmw->ptl))
- return;
+ return true;
/* exclude special VMAs containing anon pages from COW */
if (vma->vm_flags & VM_SPECIAL)
- return;
+ return true;
/* avoid taking the LRU lock under the PTL when possible */
walk = current->reclaim_state ? current->reclaim_state->mm_walk : NULL;
@@ -4075,6 +4076,9 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
start = max(addr & PMD_MASK, vma->vm_start);
end = min(addr | ~PMD_MASK, vma->vm_end - 1) + 1;
+ if (end - start == PAGE_SIZE)
+ return true;
+
if (end - start > MIN_LRU_BATCH * PAGE_SIZE) {
if (addr - start < MIN_LRU_BATCH * PAGE_SIZE / 2)
end = start + MIN_LRU_BATCH * PAGE_SIZE;
@@ -4086,10 +4090,6 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
}
}
- /* folio_update_gen() requires stable folio_memcg() */
- if (!mem_cgroup_trylock_pages(memcg))
- return;
-
arch_enter_lazy_mmu_mode();
pte -= (addr - start) / PAGE_SIZE;
@@ -4098,19 +4098,16 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
unsigned long pfn;
pte_t ptent = ptep_get(pte + i);
- pfn = get_pte_pfn(ptent, vma, addr);
+ pfn = get_pte_pfn(ptent, vma, addr, pgdat);
if (pfn == -1)
continue;
- if (!pte_young(ptent))
- continue;
-
folio = get_pfn_folio(pfn, memcg, pgdat, can_swap);
if (!folio)
continue;
- if (!ptep_test_and_clear_young(vma, addr, pte + i))
- VM_WARN_ON_ONCE(true);
+ if (!ptep_clear_young_notify(vma, addr, pte + i))
+ continue;
young++;
@@ -4130,16 +4127,19 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
old_gen = folio_lru_gen(folio);
if (old_gen < 0)
folio_set_referenced(folio);
- else if (old_gen != new_gen)
+ else if (old_gen != new_gen) {
+ folio_clear_lru_refs(folio);
folio_activate(folio);
+ }
}
arch_leave_lazy_mmu_mode();
- mem_cgroup_unlock_pages();
/* feedback from rmap walkers to page table walkers */
if (mm_state && suitable_to_scan(i, young))
update_bloom_filter(mm_state, max_seq, pvmw->pmd);
+
+ return true;
}
/******************************************************************************
@@ -4284,6 +4284,7 @@ static bool sort_folio(struct lruvec *lruvec, struct folio *folio, struct scan_c
int tier_idx)
{
bool success;
+ bool dirty, writeback;
int gen = folio_lru_gen(folio);
int type = folio_is_file_lru(folio);
int zone = folio_zonenum(folio);
@@ -4329,9 +4330,17 @@ static bool sort_folio(struct lruvec *lruvec, struct folio *folio, struct scan_c
return true;
}
+ dirty = folio_test_dirty(folio);
+ writeback = folio_test_writeback(folio);
+ if (type == LRU_GEN_FILE && dirty) {
+ sc->nr.file_taken += delta;
+ if (!writeback)
+ sc->nr.unqueued_dirty += delta;
+ }
+
/* waiting for writeback */
- if (folio_test_locked(folio) || folio_test_writeback(folio) ||
- (type == LRU_GEN_FILE && folio_test_dirty(folio))) {
+ if (folio_test_locked(folio) || writeback ||
+ (type == LRU_GEN_FILE && dirty)) {
gen = folio_inc_gen(lruvec, folio, true);
list_move(&folio->lru, &lrugen->folios[gen][type][zone]);
return true;
@@ -4362,7 +4371,7 @@ static bool isolate_folio(struct lruvec *lruvec, struct folio *folio, struct sca
/* see the comment on MAX_NR_TIERS */
if (!folio_test_referenced(folio))
- set_mask_bits(&folio->flags, LRU_REFS_MASK | LRU_REFS_FLAGS, 0);
+ folio_clear_lru_refs(folio);
/* for shrink_folio_list() */
folio_clear_reclaim(folio);
@@ -4447,7 +4456,8 @@ static int scan_folios(struct lruvec *lruvec, struct scan_control *sc,
trace_mm_vmscan_lru_isolate(sc->reclaim_idx, sc->order, MAX_LRU_BATCH,
scanned, skipped, isolated,
type ? LRU_INACTIVE_FILE : LRU_INACTIVE_ANON);
-
+ if (type == LRU_GEN_FILE)
+ sc->nr.file_taken += isolated;
/*
* There might not be eligible folios due to reclaim_idx. Check the
* remaining to prevent livelock if it's not making progress.
@@ -4581,6 +4591,7 @@ static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swap
return scanned;
retry:
reclaimed = shrink_folio_list(&list, pgdat, sc, &stat, false);
+ sc->nr.unqueued_dirty += stat.nr_unqueued_dirty;
sc->nr_reclaimed += reclaimed;
trace_mm_vmscan_lru_shrink_inactive(pgdat->node_id,
scanned, reclaimed, &stat, sc->priority,
@@ -4789,6 +4800,13 @@ static bool try_to_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
cond_resched();
}
+ /*
+ * If too many file cache in the coldest generation can't be evicted
+ * due to being dirty, wake up the flusher.
+ */
+ if (sc->nr.unqueued_dirty && sc->nr.unqueued_dirty == sc->nr.file_taken)
+ wakeup_flusher_threads(WB_REASON_VMSCAN);
+
/* whether this lruvec should be rotated */
return nr_to_scan < 0;
}
@@ -4963,8 +4981,8 @@ static void lru_gen_shrink_node(struct pglist_data *pgdat, struct scan_control *
blk_finish_plug(&plug);
done:
- /* kswapd should never fail */
- pgdat->kswapd_failures = 0;
+ if (sc->nr_reclaimed > reclaimed)
+ pgdat->kswapd_failures = 0;
}
/******************************************************************************
@@ -5254,11 +5272,11 @@ static void lru_gen_seq_show_full(struct seq_file *m, struct lruvec *lruvec,
for (tier = 0; tier < MAX_NR_TIERS; tier++) {
seq_printf(m, " %10d", tier);
for (type = 0; type < ANON_AND_FILE; type++) {
- const char *s = " ";
+ const char *s = "xxx";
unsigned long n[3] = {};
if (seq == max_seq) {
- s = "RT ";
+ s = "RTx";
n[0] = READ_ONCE(lrugen->avg_refaulted[type][tier]);
n[1] = READ_ONCE(lrugen->avg_total[type][tier]);
} else if (seq == min_seq[type] || NR_HIST_GENS > 1) {
@@ -5280,14 +5298,14 @@ static void lru_gen_seq_show_full(struct seq_file *m, struct lruvec *lruvec,
seq_puts(m, " ");
for (i = 0; i < NR_MM_STATS; i++) {
- const char *s = " ";
+ const char *s = "xxxx";
unsigned long n = 0;
if (seq == max_seq && NR_HIST_GENS == 1) {
- s = "LOYNFA";
+ s = "TYFA";
n = READ_ONCE(mm_state->stats[hist][i]);
} else if (seq != max_seq && NR_HIST_GENS > 1) {
- s = "loynfa";
+ s = "tyfa";
n = READ_ONCE(mm_state->stats[hist][i]);
}
@@ -5934,6 +5952,7 @@ static void shrink_node(pg_data_t *pgdat, struct scan_control *sc)
bool reclaimable = false;
if (lru_gen_enabled() && root_reclaim(sc)) {
+ memset(&sc->nr, 0, sizeof(sc->nr));
lru_gen_shrink_node(pgdat, sc);
return;
}
@@ -5984,7 +6003,8 @@ again:
set_bit(PGDAT_WRITEBACK, &pgdat->flags);
/* Allow kswapd to start writing pages during reclaim.*/
- if (sc->nr.unqueued_dirty == sc->nr.file_taken)
+ if (sc->nr.unqueued_dirty &&
+ sc->nr.unqueued_dirty == sc->nr.file_taken)
set_bit(PGDAT_DIRTY, &pgdat->flags);
/*