diff options
author | Nick Terrell <terrelln@fb.com> | 2022-12-13 16:21:55 -0800 |
---|---|---|
committer | Nick Terrell <terrelln@fb.com> | 2022-12-13 16:21:55 -0800 |
commit | 4f2c0a4acffbec01079c28f839422e64ddeff004 (patch) | |
tree | 06ada4a8a6d94a94c93944806041b8c994cebfc5 /mm/page_alloc.c | |
parent | 88a309465b3f05a100c3b81966982c0f9f5d23a6 (diff) | |
parent | 830b3c68c1fb1e9176028d02ef86f3cf76aa2476 (diff) | |
download | linux-4f2c0a4acffbec01079c28f839422e64ddeff004.tar.gz linux-4f2c0a4acffbec01079c28f839422e64ddeff004.tar.bz2 linux-4f2c0a4acffbec01079c28f839422e64ddeff004.zip |
Merge branch 'main' into zstd-linus
Diffstat (limited to 'mm/page_alloc.c')
-rw-r--r-- | mm/page_alloc.c | 1634 |
1 files changed, 893 insertions, 741 deletions
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 3589febc6d31..6e60657875d3 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -27,6 +27,7 @@ #include <linux/compiler.h> #include <linux/kernel.h> #include <linux/kasan.h> +#include <linux/kmsan.h> #include <linux/module.h> #include <linux/suspend.h> #include <linux/pagevec.h> @@ -81,6 +82,7 @@ #include "internal.h" #include "shuffle.h" #include "page_reporting.h" +#include "swap.h" /* Free Page Internal flags: for internal, non-pcp variants of free_pages(). */ typedef int __bitwise fpi_t; @@ -125,13 +127,97 @@ typedef int __bitwise fpi_t; static DEFINE_MUTEX(pcp_batch_high_lock); #define MIN_PERCPU_PAGELIST_HIGH_FRACTION (8) -struct pagesets { - local_lock_t lock; -}; -static DEFINE_PER_CPU(struct pagesets, pagesets) = { - .lock = INIT_LOCAL_LOCK(lock), -}; +#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT) +/* + * On SMP, spin_trylock is sufficient protection. + * On PREEMPT_RT, spin_trylock is equivalent on both SMP and UP. + */ +#define pcp_trylock_prepare(flags) do { } while (0) +#define pcp_trylock_finish(flag) do { } while (0) +#else + +/* UP spin_trylock always succeeds so disable IRQs to prevent re-entrancy. */ +#define pcp_trylock_prepare(flags) local_irq_save(flags) +#define pcp_trylock_finish(flags) local_irq_restore(flags) +#endif + +/* + * Locking a pcp requires a PCP lookup followed by a spinlock. To avoid + * a migration causing the wrong PCP to be locked and remote memory being + * potentially allocated, pin the task to the CPU for the lookup+lock. + * preempt_disable is used on !RT because it is faster than migrate_disable. + * migrate_disable is used on RT because otherwise RT spinlock usage is + * interfered with and a high priority task cannot preempt the allocator. + */ +#ifndef CONFIG_PREEMPT_RT +#define pcpu_task_pin() preempt_disable() +#define pcpu_task_unpin() preempt_enable() +#else +#define pcpu_task_pin() migrate_disable() +#define pcpu_task_unpin() migrate_enable() +#endif +/* + * Generic helper to lookup and a per-cpu variable with an embedded spinlock. + * Return value should be used with equivalent unlock helper. + */ +#define pcpu_spin_lock(type, member, ptr) \ +({ \ + type *_ret; \ + pcpu_task_pin(); \ + _ret = this_cpu_ptr(ptr); \ + spin_lock(&_ret->member); \ + _ret; \ +}) + +#define pcpu_spin_lock_irqsave(type, member, ptr, flags) \ +({ \ + type *_ret; \ + pcpu_task_pin(); \ + _ret = this_cpu_ptr(ptr); \ + spin_lock_irqsave(&_ret->member, flags); \ + _ret; \ +}) + +#define pcpu_spin_trylock_irqsave(type, member, ptr, flags) \ +({ \ + type *_ret; \ + pcpu_task_pin(); \ + _ret = this_cpu_ptr(ptr); \ + if (!spin_trylock_irqsave(&_ret->member, flags)) { \ + pcpu_task_unpin(); \ + _ret = NULL; \ + } \ + _ret; \ +}) + +#define pcpu_spin_unlock(member, ptr) \ +({ \ + spin_unlock(&ptr->member); \ + pcpu_task_unpin(); \ +}) + +#define pcpu_spin_unlock_irqrestore(member, ptr, flags) \ +({ \ + spin_unlock_irqrestore(&ptr->member, flags); \ + pcpu_task_unpin(); \ +}) + +/* struct per_cpu_pages specific helpers. */ +#define pcp_spin_lock(ptr) \ + pcpu_spin_lock(struct per_cpu_pages, lock, ptr) + +#define pcp_spin_lock_irqsave(ptr, flags) \ + pcpu_spin_lock_irqsave(struct per_cpu_pages, lock, ptr, flags) + +#define pcp_spin_trylock_irqsave(ptr, flags) \ + pcpu_spin_trylock_irqsave(struct per_cpu_pages, lock, ptr, flags) + +#define pcp_spin_unlock(ptr) \ + pcpu_spin_unlock(lock, ptr) + +#define pcp_spin_unlock_irqrestore(ptr, flags) \ + pcpu_spin_unlock_irqrestore(lock, ptr, flags) #ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID DEFINE_PER_CPU(int, numa_node); EXPORT_PER_CPU_SYMBOL(numa_node); @@ -150,13 +236,7 @@ DEFINE_PER_CPU(int, _numa_mem_); /* Kernel "local memory" node */ EXPORT_PER_CPU_SYMBOL(_numa_mem_); #endif -/* work_structs for global per-cpu drains */ -struct pcpu_drain { - struct zone *zone; - struct work_struct work; -}; static DEFINE_MUTEX(pcpu_drain_mutex); -static DEFINE_PER_CPU(struct pcpu_drain, pcpu_drain); #ifdef CONFIG_GCC_PLUGIN_LATENT_ENTROPY volatile unsigned long latent_entropy __latent_entropy; @@ -355,7 +435,7 @@ static unsigned long required_kernelcore_percent __initdata; static unsigned long required_movablecore __initdata; static unsigned long required_movablecore_percent __initdata; static unsigned long zone_movable_pfn[MAX_NUMNODES] __initdata; -static bool mirrored_kernelcore __meminitdata; +bool mirrored_kernelcore __initdata_memblock; /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */ int movable_zone; @@ -378,25 +458,9 @@ int page_group_by_mobility_disabled __read_mostly; */ static DEFINE_STATIC_KEY_TRUE(deferred_pages); -/* - * Calling kasan_poison_pages() only after deferred memory initialization - * has completed. Poisoning pages during deferred memory init will greatly - * lengthen the process and cause problem in large memory systems as the - * deferred pages initialization is done with interrupt disabled. - * - * Assuming that there will be no reference to those newly initialized - * pages before they are ever allocated, this should have no effect on - * KASAN memory tracking as the poison will be properly inserted at page - * allocation time. The only corner case is when pages are allocated by - * on-demand allocation and then freed again before the deferred pages - * initialization is done, but this is not likely to happen. - */ -static inline bool should_skip_kasan_poison(struct page *page, fpi_t fpi_flags) +static inline bool deferred_pages_enabled(void) { - return static_branch_unlikely(&deferred_pages) || - (!IS_ENABLED(CONFIG_KASAN_GENERIC) && - (fpi_flags & FPI_SKIP_KASAN_POISON)) || - PageSkipKASanPoison(page); + return static_branch_unlikely(&deferred_pages); } /* Returns true if the struct page for the pfn is uninitialised */ @@ -419,6 +483,8 @@ defer_init(int nid, unsigned long pfn, unsigned long end_pfn) { static unsigned long prev_end_pfn, nr_initialised; + if (early_page_ext_enabled()) + return false; /* * prev_end_pfn static that contains the end of previous zone * No need to protect because called very early in boot before smp_init. @@ -447,11 +513,9 @@ defer_init(int nid, unsigned long pfn, unsigned long end_pfn) return false; } #else -static inline bool should_skip_kasan_poison(struct page *page, fpi_t fpi_flags) +static inline bool deferred_pages_enabled(void) { - return (!IS_ENABLED(CONFIG_KASAN_GENERIC) && - (fpi_flags & FPI_SKIP_KASAN_POISON)) || - PageSkipKASanPoison(page); + return false; } static inline bool early_page_uninitialised(unsigned long pfn) @@ -481,7 +545,7 @@ static inline int pfn_to_bitidx(const struct page *page, unsigned long pfn) #ifdef CONFIG_SPARSEMEM pfn &= (PAGES_PER_SECTION-1); #else - pfn = pfn - round_down(page_zone(page)->zone_start_pfn, pageblock_nr_pages); + pfn = pfn - pageblock_start_pfn(page_zone(page)->zone_start_pfn); #endif /* CONFIG_SPARSEMEM */ return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS; } @@ -499,8 +563,12 @@ unsigned long __get_pfnblock_flags_mask(const struct page *page, bitidx = pfn_to_bitidx(page, pfn); word_bitidx = bitidx / BITS_PER_LONG; bitidx &= (BITS_PER_LONG-1); - - word = bitmap[word_bitidx]; + /* + * This races, without locks, with set_pfnblock_flags_mask(). Ensure + * a consistent read of the memory array, so that results, even though + * racy, are not corrupted. + */ + word = READ_ONCE(bitmap[word_bitidx]); return (word >> bitidx) & mask; } @@ -537,7 +605,7 @@ void set_pfnblock_flags_mask(struct page *page, unsigned long flags, { unsigned long *bitmap; unsigned long bitidx, word_bitidx; - unsigned long old_word, word; + unsigned long word; BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4); BUILD_BUG_ON(MIGRATE_TYPES > (1 << PB_migratetype_bits)); @@ -553,12 +621,8 @@ void set_pfnblock_flags_mask(struct page *page, unsigned long flags, flags <<= bitidx; word = READ_ONCE(bitmap[word_bitidx]); - for (;;) { - old_word = cmpxchg(&bitmap[word_bitidx], word, (word & ~mask) | flags); - if (word == old_word) - break; - word = old_word; - } + do { + } while (!try_cmpxchg(&bitmap[word_bitidx], &word, (word & ~mask) | flags)); } void set_pageblock_migratetype(struct page *page, int migratetype) @@ -666,7 +730,7 @@ static inline unsigned int order_to_pindex(int migratetype, int order) #ifdef CONFIG_TRANSPARENT_HUGEPAGE if (order > PAGE_ALLOC_COSTLY_ORDER) { VM_BUG_ON(order != pageblock_order); - base = PAGE_ALLOC_COSTLY_ORDER + 1; + return NR_LOWORDER_PCP_LISTS; } #else VM_BUG_ON(order > PAGE_ALLOC_COSTLY_ORDER); @@ -680,7 +744,7 @@ static inline int pindex_to_order(unsigned int pindex) int order = pindex / MIGRATE_PCPTYPES; #ifdef CONFIG_TRANSPARENT_HUGEPAGE - if (order > PAGE_ALLOC_COSTLY_ORDER) + if (pindex == NR_LOWORDER_PCP_LISTS) order = pageblock_order; #else VM_BUG_ON(order > PAGE_ALLOC_COSTLY_ORDER); @@ -734,8 +798,7 @@ static void prep_compound_head(struct page *page, unsigned int order) set_compound_page_dtor(page, COMPOUND_PAGE_DTOR); set_compound_order(page, order); atomic_set(compound_mapcount_ptr(page), -1); - if (hpage_pincount_available(page)) - atomic_set(compound_pincount_ptr(page), 0); + atomic_set(compound_pincount_ptr(page), 0); } static void prep_compound_tail(struct page *head, int tail_idx) @@ -744,6 +807,7 @@ static void prep_compound_tail(struct page *head, int tail_idx) p->mapping = TAIL_MAPPING; set_compound_head(p, head); + set_page_private(p, 0); } void prep_compound_page(struct page *page, unsigned int order) @@ -758,6 +822,14 @@ void prep_compound_page(struct page *page, unsigned int order) prep_compound_head(page, order); } +void destroy_large_folio(struct folio *folio) +{ + enum compound_dtor_id dtor = folio_page(folio, 1)->compound_dtor; + + VM_BUG_ON_FOLIO(dtor >= NR_COMPOUND_DTORS, folio); + compound_page_dtors[dtor](&folio->page); +} + #ifdef CONFIG_DEBUG_PAGEALLOC unsigned int _debug_guardpage_minorder; @@ -799,10 +871,11 @@ static inline bool set_page_guard(struct zone *zone, struct page *page, return false; __SetPageGuard(page); - INIT_LIST_HEAD(&page->lru); + INIT_LIST_HEAD(&page->buddy_list); set_page_private(page, order); /* Guard pages are not available for any usage */ - __mod_zone_freepage_state(zone, -(1 << order), migratetype); + if (!is_migrate_isolate(migratetype)) + __mod_zone_freepage_state(zone, -(1 << order), migratetype); return true; } @@ -832,7 +905,7 @@ static inline void clear_page_guard(struct zone *zone, struct page *page, * order of appearance. So we need to first gather the full picture of what was * enabled, and then make decisions. */ -void init_mem_debugging_and_hardening(void) +void __init init_mem_debugging_and_hardening(void) { bool page_poisoning_requested = false; @@ -867,6 +940,10 @@ void init_mem_debugging_and_hardening(void) else static_branch_disable(&init_on_free); + if (IS_ENABLED(CONFIG_KMSAN) && + (_init_on_alloc_enabled_early || _init_on_free_enabled_early)) + pr_info("mem auto-init: please make sure init_on_alloc and init_on_free are disabled when running KMSAN\n"); + #ifdef CONFIG_DEBUG_PAGEALLOC if (!debug_pagealloc_enabled()) return; @@ -886,40 +963,6 @@ static inline void set_buddy_order(struct page *page, unsigned int order) __SetPageBuddy(page); } -/* - * This function checks whether a page is free && is the buddy - * we can coalesce a page and its buddy if - * (a) the buddy is not in a hole (check before calling!) && - * (b) the buddy is in the buddy system && - * (c) a page and its buddy have the same order && - * (d) a page and its buddy are in the same zone. - * - * For recording whether a page is in the buddy system, we set PageBuddy. - * Setting, clearing, and testing PageBuddy is serialized by zone->lock. - * - * For recording page's order, we use page_private(page). - */ -static inline bool page_is_buddy(struct page *page, struct page *buddy, - unsigned int order) -{ - if (!page_is_guard(buddy) && !PageBuddy(buddy)) - return false; - - if (buddy_order(buddy) != order) - return false; - - /* - * zone check is done late to avoid uselessly calculating - * zone/node ids for pages that could never merge. - */ - if (page_zone_id(page) != page_zone_id(buddy)) - return false; - - VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy); - - return true; -} - #ifdef CONFIG_COMPACTION static inline struct capture_control *task_capc(struct zone *zone) { @@ -976,7 +1019,7 @@ static inline void add_to_free_list(struct page *page, struct zone *zone, { struct free_area *area = &zone->free_area[order]; - list_add(&page->lru, &area->free_list[migratetype]); + list_add(&page->buddy_list, &area->free_list[migratetype]); area->nr_free++; } @@ -986,7 +1029,7 @@ static inline void add_to_free_list_tail(struct page *page, struct zone *zone, { struct free_area *area = &zone->free_area[order]; - list_add_tail(&page->lru, &area->free_list[migratetype]); + list_add_tail(&page->buddy_list, &area->free_list[migratetype]); area->nr_free++; } @@ -1000,7 +1043,7 @@ static inline void move_to_free_list(struct page *page, struct zone *zone, { struct free_area *area = &zone->free_area[order]; - list_move_tail(&page->lru, &area->free_list[migratetype]); + list_move_tail(&page->buddy_list, &area->free_list[migratetype]); } static inline void del_page_from_free_list(struct page *page, struct zone *zone, @@ -1010,7 +1053,7 @@ static inline void del_page_from_free_list(struct page *page, struct zone *zone, if (page_reported(page)) __ClearPageReported(page); - list_del(&page->lru); + list_del(&page->buddy_list); __ClearPageBuddy(page); set_page_private(page, 0); zone->free_area[order].nr_free--; @@ -1028,18 +1071,17 @@ static inline bool buddy_merge_likely(unsigned long pfn, unsigned long buddy_pfn, struct page *page, unsigned int order) { - struct page *higher_page, *higher_buddy; - unsigned long combined_pfn; + unsigned long higher_page_pfn; + struct page *higher_page; if (order >= MAX_ORDER - 2) return false; - combined_pfn = buddy_pfn & pfn; - higher_page = page + (combined_pfn - pfn); - buddy_pfn = __find_buddy_pfn(combined_pfn, order + 1); - higher_buddy = higher_page + (buddy_pfn - combined_pfn); + higher_page_pfn = buddy_pfn & pfn; + higher_page = page + (higher_page_pfn - pfn); - return page_is_buddy(higher_page, higher_buddy, order + 1); + return find_buddy_page_pfn(higher_page, higher_page_pfn, order + 1, + NULL) != NULL; } /* @@ -1072,14 +1114,11 @@ static inline void __free_one_page(struct page *page, int migratetype, fpi_t fpi_flags) { struct capture_control *capc = task_capc(zone); - unsigned long buddy_pfn; + unsigned long buddy_pfn = 0; unsigned long combined_pfn; - unsigned int max_order; struct page *buddy; bool to_tail; - max_order = min_t(unsigned int, MAX_ORDER - 1, pageblock_order); - VM_BUG_ON(!zone_is_initialized(zone)); VM_BUG_ON_PAGE(page->flags & PAGE_FLAGS_CHECK_AT_PREP, page); @@ -1090,18 +1129,32 @@ static inline void __free_one_page(struct page *page, VM_BUG_ON_PAGE(pfn & ((1 << order) - 1), page); VM_BUG_ON_PAGE(bad_range(zone, page), page); -continue_merging: - while (order < max_order) { + while (order < MAX_ORDER - 1) { if (compaction_capture(capc, page, order, migratetype)) { __mod_zone_freepage_state(zone, -(1 << order), migratetype); return; } - buddy_pfn = __find_buddy_pfn(pfn, order); - buddy = page + (buddy_pfn - pfn); - if (!page_is_buddy(page, buddy, order)) + buddy = find_buddy_page_pfn(page, pfn, order, &buddy_pfn); + if (!buddy) goto done_merging; + + if (unlikely(order >= pageblock_order)) { + /* + * We want to prevent merge between freepages on pageblock + * without fallbacks and normal pageblock. Without this, + * pageblock isolation could cause incorrect freepage or CMA + * accounting or HIGHATOMIC accounting. + */ + int buddy_mt = get_pageblock_migratetype(buddy); + + if (migratetype != buddy_mt + && (!migratetype_is_mergeable(migratetype) || + !migratetype_is_mergeable(buddy_mt))) + goto done_merging; + } + /* * Our buddy is free or it is CONFIG_DEBUG_PAGEALLOC guard page, * merge with it and move up one order. @@ -1115,30 +1168,6 @@ continue_merging: pfn = combined_pfn; order++; } - if (order < MAX_ORDER - 1) { - /* If we are here, it means order is >= pageblock_order. - * We want to prevent merge between freepages on isolate - * pageblock and normal pageblock. Without this, pageblock - * isolation could cause incorrect freepage or CMA accounting. - * - * We don't want to hit this code for the more frequent - * low-order merging. - */ - if (unlikely(has_isolate_pageblock(zone))) { - int buddy_mt; - - buddy_pfn = __find_buddy_pfn(pfn, order); - buddy = page + (buddy_pfn - pfn); - buddy_mt = get_pageblock_migratetype(buddy); - - if (migratetype != buddy_mt - && (is_migrate_isolate(migratetype) || - is_migrate_isolate(buddy_mt))) - goto done_merging; - } - max_order = order + 1; - goto continue_merging; - } done_merging: set_buddy_order(page, order); @@ -1160,6 +1189,64 @@ done_merging: page_reporting_notify_free(order); } +/** + * split_free_page() -- split a free page at split_pfn_offset + * @free_page: the original free page + * @order: the order of the page + * @split_pfn_offset: split offset within the page + * + * Return -ENOENT if the free page is changed, otherwise 0 + * + * It is used when the free page crosses two pageblocks with different migratetypes + * at split_pfn_offset within the page. The split free page will be put into + * separate migratetype lists afterwards. Otherwise, the function achieves + * nothing. + */ +int split_free_page(struct page *free_page, + unsigned int order, unsigned long split_pfn_offset) +{ + struct zone *zone = page_zone(free_page); + unsigned long free_page_pfn = page_to_pfn(free_page); + unsigned long pfn; + unsigned long flags; + int free_page_order; + int mt; + int ret = 0; + + if (split_pfn_offset == 0) + return ret; + + spin_lock_irqsave(&zone->lock, flags); + + if (!PageBuddy(free_page) || buddy_order(free_page) != order) { + ret = -ENOENT; + goto out; + } + + mt = get_pageblock_migratetype(free_page); + if (likely(!is_migrate_isolate(mt))) + __mod_zone_freepage_state(zone, -(1UL << order), mt); + + del_page_from_free_list(free_page, zone, order); + for (pfn = free_page_pfn; + pfn < free_page_pfn + (1UL << order);) { + int mt = get_pfnblock_migratetype(pfn_to_page(pfn), pfn); + + free_page_order = min_t(unsigned int, + pfn ? __ffs(pfn) : order, + __fls(split_pfn_offset)); + __free_one_page(pfn_to_page(pfn), pfn, zone, free_page_order, + mt, FPI_NONE); + pfn += 1UL << free_page_order; + split_pfn_offset -= (1UL << free_page_order); + /* we have done the first part, now switch to second part */ + if (split_pfn_offset == 0) + split_pfn_offset = (1UL << order) - (pfn - free_page_pfn); + } +out: + spin_unlock_irqrestore(&zone->lock, flags); + return ret; +} /* * A bad page could be due to a number of fields. Instead of multiple branches, * try and check multiple fields with one check. The caller must do a detailed @@ -1205,20 +1292,20 @@ static const char *page_bad_reason(struct page *page, unsigned long flags) return bad_reason; } -static void check_free_page_bad(struct page *page) +static void free_page_is_bad_report(struct page *page) { bad_page(page, page_bad_reason(page, PAGE_FLAGS_CHECK_AT_FREE)); } -static inline int check_free_page(struct page *page) +static inline bool free_page_is_bad(struct page *page) { if (likely(page_expected_state(page, PAGE_FLAGS_CHECK_AT_FREE))) - return 0; + return false; /* Something has gone sideways, find it */ - check_free_page_bad(page); - return 1; + free_page_is_bad_report(page); + return true; } static int free_tail_pages_check(struct page *head_page, struct page *page) @@ -1271,24 +1358,43 @@ out: return ret; } -static void kernel_init_free_pages(struct page *page, int numpages, bool zero_tags) +/* + * Skip KASAN memory poisoning when either: + * + * 1. Deferred memory initialization has not yet completed, + * see the explanation below. + * 2. Skipping poisoning is requested via FPI_SKIP_KASAN_POISON, + * see the comment next to it. + * 3. Skipping poisoning is requested via __GFP_SKIP_KASAN_POISON, + * see the comment next to it. + * + * Poisoning pages during deferred memory init will greatly lengthen the + * process and cause problem in large memory systems as the deferred pages + * initialization is done with interrupt disabled. + * + * Assuming that there will be no reference to those newly initialized + * pages before they are ever allocated, this should have no effect on + * KASAN memory tracking as the poison will be properly inserted at page + * allocation time. The only corner case is when pages are allocated by + * on-demand allocation and then freed again before the deferred pages + * initialization is done, but this is not likely to happen. + */ +static inline bool should_skip_kasan_poison(struct page *page, fpi_t fpi_flags) { - int i; + return deferred_pages_enabled() || + (!IS_ENABLED(CONFIG_KASAN_GENERIC) && + (fpi_flags & FPI_SKIP_KASAN_POISON)) || + PageSkipKASanPoison(page); +} - if (zero_tags) { - for (i = 0; i < numpages; i++) - tag_clear_highpage(page + i); - return; - } +static void kernel_init_pages(struct page *page, int numpages) +{ + int i; /* s390's use of memset() could override KASAN redzones. */ kasan_disable_current(); - for (i = 0; i < numpages; i++) { - u8 tag = page_kasan_tag(page + i); - page_kasan_tag_reset(page + i); - clear_highpage(page + i); - page_kasan_tag_set(page + i, tag); - } + for (i = 0; i < numpages; i++) + clear_highpage_kasan_tagged(page + i); kasan_enable_current(); } @@ -1296,11 +1402,12 @@ static __always_inline bool free_pages_prepare(struct page *page, unsigned int order, bool check_free, fpi_t fpi_flags) { int bad = 0; - bool skip_kasan_poison = should_skip_kasan_poison(page, fpi_flags); + bool init = want_init_on_free(); VM_BUG_ON_PAGE(PageTail(page), page); trace_mm_page_free(page, order); + kmsan_free_page(page, order); if (unlikely(PageHWPoison(page)) && !order) { /* @@ -1331,7 +1438,7 @@ static __always_inline bool free_pages_prepare(struct page *page, for (i = 1; i < (1 << order); i++) { if (compound) bad += free_tail_pages_check(page, page + i); - if (unlikely(check_free_page(page + i))) { + if (unlikely(free_page_is_bad(page + i))) { bad++; continue; } @@ -1342,8 +1449,8 @@ static __always_inline bool free_pages_prepare(struct page *page, page->mapping = NULL; if (memcg_kmem_enabled() && PageMemcgKmem(page)) __memcg_kmem_uncharge_page(page, order); - if (check_free) - bad += check_free_page(page); + if (check_free && free_page_is_bad(page)) + bad++; if (bad) return false; @@ -1363,23 +1470,21 @@ static __always_inline bool free_pages_prepare(struct page *page, /* * As memory initialization might be integrated into KASAN, - * kasan_free_pages and kernel_init_free_pages must be + * KASAN poisoning and memory initialization code must be * kept together to avoid discrepancies in behavior. * * With hardware tag-based KASAN, memory tags must be set before the * page becomes unavailable via debug_pagealloc or arch_free_page. */ - if (kasan_has_integrated_init()) { - if (!skip_kasan_poison) - kasan_free_pages(page, order); - } else { - bool init = want_init_on_free(); + if (!should_skip_kasan_poison(page, fpi_flags)) { + kasan_poison_pages(page, order, init); - if (init) - kernel_init_free_pages(page, 1 << order, false); - if (!skip_kasan_poison) - kasan_poison_pages(page, order, init); + /* Memory is already initialized if KASAN did it internally. */ + if (kasan_has_integrated_init()) + init = false; } + if (init) + kernel_init_pages(page, 1 << order); /* * arch_free_page() can make the page's contents inaccessible. s390 @@ -1404,10 +1509,11 @@ static bool free_pcp_prepare(struct page *page, unsigned int order) return free_pages_prepare(page, order, true, FPI_NONE); } +/* return true if this page has an inappropriate state */ static bool bulkfree_pcp_prepare(struct page *page) { if (debug_pagealloc_enabled_static()) - return check_free_page(page); + return free_page_is_bad(page); else return false; } @@ -1428,124 +1534,83 @@ static bool free_pcp_prepare(struct page *page, unsigned int order) static bool bulkfree_pcp_prepare(struct page *page) { - return check_free_page(page); + return free_page_is_bad(page); } #endif /* CONFIG_DEBUG_VM */ -static inline void prefetch_buddy(struct page *page) -{ - unsigned long pfn = page_to_pfn(page); - unsigned long buddy_pfn = __find_buddy_pfn(pfn, 0); - struct page *buddy = page + (buddy_pfn - pfn); - - prefetch(buddy); -} - /* * Frees a number of pages from the PCP lists * Assumes all pages on list are in same zone. * count is the number of pages to free. */ static void free_pcppages_bulk(struct zone *zone, int count, - struct per_cpu_pages *pcp) + struct per_cpu_pages *pcp, + int pindex) { - int pindex = 0; - int batch_free = 0; - int nr_freed = 0; + int min_pindex = 0; + int max_pindex = NR_PCP_LISTS - 1; unsigned int order; - int prefetch_nr = READ_ONCE(pcp->batch); bool isolated_pageblocks; - struct page *page, *tmp; - LIST_HEAD(head); + struct page *page; /* * Ensure proper count is passed which otherwise would stuck in the * below while (list_empty(list)) loop. */ count = min(pcp->count, count); + + /* Ensure requested pindex is drained first. */ + pindex = pindex - 1; + + /* Caller must hold IRQ-safe pcp->lock so IRQs are disabled. */ + spin_lock(&zone->lock); + isolated_pageblocks = has_isolate_pageblock(zone); + while (count > 0) { struct list_head *list; + int nr_pages; - /* - * Remove pages from lists in a round-robin fashion. A - * batch_free count is maintained that is incremented when an - * empty list is encountered. This is so more pages are freed - * off fuller lists instead of spinning excessively around empty - * lists - */ + /* Remove pages from lists in a round-robin fashion. */ do { - batch_free++; - if (++pindex == NR_PCP_LISTS) - pindex = 0; + if (++pindex > max_pindex) + pindex = min_pindex; list = &pcp->lists[pindex]; - } while (list_empty(list)); + if (!list_empty(list)) + break; - /* This is the only non-empty list. Free them all. */ - if (batch_free == NR_PCP_LISTS) - batch_free = count; + if (pindex == max_pindex) + max_pindex--; + if (pindex == min_pindex) + min_pindex++; + } while (1); order = pindex_to_order(pindex); - BUILD_BUG_ON(MAX_ORDER >= (1<<NR_PCP_ORDER_WIDTH)); + nr_pages = 1 << order; do { - page = list_last_entry(list, struct page, lru); + int mt; + + page = list_last_entry(list, struct page, pcp_list); + mt = get_pcppage_migratetype(page); + /* must delete to avoid corrupting pcp list */ - list_del(&page->lru); - nr_freed += 1 << order; - count -= 1 << order; + list_del(&page->pcp_list); + count -= nr_pages; + pcp->count -= nr_pages; if (bulkfree_pcp_prepare(page)) continue; - /* Encode order with the migratetype */ - page->index <<= NR_PCP_ORDER_WIDTH; - page->index |= order; + /* MIGRATE_ISOLATE page should not go to pcplists */ + VM_BUG_ON_PAGE(is_migrate_isolate(mt), page); + /* Pageblock could have been isolated meanwhile */ + if (unlikely(isolated_pageblocks)) + mt = get_pageblock_migratetype(page); - list_add_tail(&page->lru, &head); - - /* - * We are going to put the page back to the global - * pool, prefetch its buddy to speed up later access - * under zone->lock. It is believed the overhead of - * an additional test and calculating buddy_pfn here - * can be offset by reduced memory latency later. To - * avoid excessive prefetching due to large count, only - * prefetch buddy for the first pcp->batch nr of pages. - */ - if (prefetch_nr) { - prefetch_buddy(page); - prefetch_nr--; - } - } while (count > 0 && --batch_free && !list_empty(list)); + __free_one_page(page, page_to_pfn(page), zone, order, mt, FPI_NONE); + trace_mm_page_pcpu_drain(page, order, mt); + } while (count > 0 && !list_empty(list)); } - pcp->count -= nr_freed; - - /* - * local_lock_irq held so equivalent to spin_lock_irqsave for - * both PREEMPT_RT and non-PREEMPT_RT configurations. - */ - spin_lock(&zone->lock); - isolated_pageblocks = has_isolate_pageblock(zone); - /* - * Use safe version since after __free_one_page(), - * page->lru.next will not point to original list. - */ - list_for_each_entry_safe(page, tmp, &head, lru) { - int mt = get_pcppage_migratetype(page); - - /* mt has been encoded with the order (see above) */ - order = mt & NR_PCP_ORDER_MASK; - mt >>= NR_PCP_ORDER_WIDTH; - - /* MIGRATE_ISOLATE page should not go to pcplists */ - VM_BUG_ON_PAGE(is_migrate_isolate(mt), page); - /* Pageblock could have been isolated meanwhile */ - if (unlikely(isolated_pageblocks)) - mt = get_pageblock_migratetype(page); - - __free_one_page(page, page_to_pfn(page), zone, order, mt, FPI_NONE); - trace_mm_page_pcpu_drain(page, order, mt); - } spin_unlock(&zone->lock); } @@ -1749,6 +1814,10 @@ void __init memblock_free_pages(struct page *page, unsigned long pfn, { if (early_page_uninitialised(pfn)) return; + if (!kmsan_memblock_free_pages(page, order)) { + /* KMSAN will take care of these pages. */ + return; + } __free_pages_core(page, order); } @@ -1800,7 +1869,7 @@ void set_zone_contiguous(struct zone *zone) unsigned long block_start_pfn = zone->zone_start_pfn; unsigned long block_end_pfn; - block_end_pfn = ALIGN(block_start_pfn + 1, pageblock_nr_pages); + block_end_pfn = pageblock_end_pfn(block_start_pfn); for (; block_start_pfn < zone_end_pfn(zone); block_start_pfn = block_end_pfn, block_end_pfn += pageblock_nr_pages) { @@ -1835,15 +1904,14 @@ static void __init deferred_free_range(unsigned long pfn, page = pfn_to_page(pfn); /* Free a large naturally-aligned chunk if possible */ - if (nr_pages == pageblock_nr_pages && - (pfn & (pageblock_nr_pages - 1)) == 0) { + if (nr_pages == pageblock_nr_pages && pageblock_aligned(pfn)) { set_pageblock_migratetype(page, MIGRATE_MOVABLE); __free_pages_core(page, pageblock_order); return; } for (i = 0; i < nr_pages; i++, page++, pfn++) { - if ((pfn & (pageblock_nr_pages - 1)) == 0) + if (pageblock_aligned(pfn)) set_pageblock_migratetype(page, MIGRATE_MOVABLE); __free_pages_core(page, 0); } @@ -1862,16 +1930,12 @@ static inline void __init pgdat_init_report_one_done(void) /* * Returns true if page needs to be initialized or freed to buddy allocator. * - * First we check if pfn is valid on architectures where it is possible to have - * holes within pageblock_nr_pages. On systems where it is not possible, this - * function is optimized out. - * - * Then, we check if a current large page is valid by only checking the validity + * We check if a current large page is valid by only checking the validity * of the head pfn. */ static inline bool __init deferred_pfn_valid(unsigned long pfn) { - if (!(pfn & (pageblock_nr_pages - 1)) && !pfn_valid(pfn)) + if (pageblock_aligned(pfn) && !pfn_valid(pfn)) return false; return true; } @@ -1883,14 +1947,13 @@ static inline bool __init deferred_pfn_valid(unsigned long pfn) static void __init deferred_free_pages(unsigned long pfn, unsigned long end_pfn) { - unsigned long nr_pgmask = pageblock_nr_pages - 1; unsigned long nr_free = 0; for (; pfn < end_pfn; pfn++) { if (!deferred_pfn_valid(pfn)) { deferred_free_range(pfn - nr_free, nr_free); nr_free = 0; - } else if (!(pfn & nr_pgmask)) { + } else if (pageblock_aligned(pfn)) { deferred_free_range(pfn - nr_free, nr_free); nr_free = 1; } else { @@ -1910,7 +1973,6 @@ static unsigned long __init deferred_init_pages(struct zone *zone, unsigned long pfn, unsigned long end_pfn) { - unsigned long nr_pgmask = pageblock_nr_pages - 1; int nid = zone_to_nid(zone); unsigned long nr_pages = 0; int zid = zone_idx(zone); @@ -1920,7 +1982,7 @@ static unsigned long __init deferred_init_pages(struct zone *zone, if (!deferred_pfn_valid(pfn)) { page = NULL; continue; - } else if (!page || !(pfn & nr_pgmask)) { + } else if (!page || pageblock_aligned(pfn)) { page = pfn_to_page(pfn); } else { page++; @@ -2260,19 +2322,8 @@ void __init init_cma_reserved_pageblock(struct page *page) } while (++p, --i); set_pageblock_migratetype(page, MIGRATE_CMA); - - if (pageblock_order >= MAX_ORDER) { - i = pageblock_nr_pages; - p = page; - do { - set_page_refcounted(p); - __free_pages(p, MAX_ORDER - 1); - p += MAX_ORDER_NR_PAGES; - } while (i -= MAX_ORDER_NR_PAGES); - } else { - set_page_refcounted(page); - __free_pages(page, pageblock_order); - } + set_page_refcounted(page); + __free_pages(page, pageblock_order); adjust_managed_page_count(page, pageblock_nr_pages); page_zone(page)->cma_pages += pageblock_nr_pages; @@ -2342,23 +2393,36 @@ static inline int check_new_page(struct page *page) return 1; } +static bool check_new_pages(struct page *page, unsigned int order) +{ + int i; + for (i = 0; i < (1 << order); i++) { + struct page *p = page + i; + + if (unlikely(check_new_page(p))) + return true; + } + + return false; +} + #ifdef CONFIG_DEBUG_VM /* * With DEBUG_VM enabled, order-0 pages are checked for expected state when * being allocated from pcp lists. With debug_pagealloc also enabled, they are * also checked when pcp lists are refilled from the free lists. */ -static inline bool check_pcp_refill(struct page *page) +static inline bool check_pcp_refill(struct page *page, unsigned int order) { if (debug_pagealloc_enabled_static()) - return check_new_page(page); + return check_new_pages(page, order); else return false; } -static inline bool check_new_pcp(struct page *page) +static inline bool check_new_pcp(struct page *page, unsigned int order) { - return check_new_page(page); + return check_new_pages(page, order); } #else /* @@ -2366,35 +2430,55 @@ static inline bool check_new_pcp(struct page *page) * when pcp lists are being refilled from the free lists. With debug_pagealloc * enabled, they are also checked when being allocated from the pcp lists. */ -static inline bool check_pcp_refill(struct page *page) +static inline bool check_pcp_refill(struct page *page, unsigned int order) { - return check_new_page(page); + return check_new_pages(page, order); } -static inline bool check_new_pcp(struct page *page) +static inline bool check_new_pcp(struct page *page, unsigned int order) { if (debug_pagealloc_enabled_static()) - return check_new_page(page); + return check_new_pages(page, order); else return false; } #endif /* CONFIG_DEBUG_VM */ -static bool check_new_pages(struct page *page, unsigned int order) +static inline bool should_skip_kasan_unpoison(gfp_t flags) { - int i; - for (i = 0; i < (1 << order); i++) { - struct page *p = page + i; + /* Don't skip if a software KASAN mode is enabled. */ + if (IS_ENABLED(CONFIG_KASAN_GENERIC) || + IS_ENABLED(CONFIG_KASAN_SW_TAGS)) + return false; - if (unlikely(check_new_page(p))) - return true; - } + /* Skip, if hardware tag-based KASAN is not enabled. */ + if (!kasan_hw_tags_enabled()) + return true; - return false; + /* + * With hardware tag-based KASAN enabled, skip if this has been + * requested via __GFP_SKIP_KASAN_UNPOISON. + */ + return flags & __GFP_SKIP_KASAN_UNPOISON; +} + +static inline bool should_skip_init(gfp_t flags) +{ + /* Don't skip, if hardware tag-based KASAN is not enabled. */ + if (!kasan_hw_tags_enabled()) + return false; + + /* For hardware tag-based KASAN, skip if requested. */ + return (flags & __GFP_SKIP_ZERO); } inline void post_alloc_hook(struct page *page, unsigned int order, gfp_t gfp_flags) { + bool init = !want_init_on_free() && want_init_on_alloc(gfp_flags) && + !should_skip_init(gfp_flags); + bool init_tags = init && (gfp_flags & __GFP_ZEROTAGS); + int i; + set_page_private(page, 0); set_page_refcounted(page); @@ -2410,19 +2494,40 @@ inline void post_alloc_hook(struct page *page, unsigned int order, /* * As memory initialization might be integrated into KASAN, - * kasan_alloc_pages and kernel_init_free_pages must be + * KASAN unpoisoning and memory initializion code must be * kept together to avoid discrepancies in behavior. */ - if (kasan_has_integrated_init()) { - kasan_alloc_pages(page, order, gfp_flags); - } else { - bool init = !want_init_on_free() && want_init_on_alloc(gfp_flags); + /* + * If memory tags should be zeroed (which happens only when memory + * should be initialized as well). + */ + if (init_tags) { + /* Initialize both memory and tags. */ + for (i = 0; i != 1 << order; ++i) + tag_clear_highpage(page + i); + + /* Note that memory is already initialized by the loop above. */ + init = false; + } + if (!should_skip_kasan_unpoison(gfp_flags)) { + /* Unpoison shadow memory or set memory tags. */ kasan_unpoison_pages(page, order, init); - if (init) - kernel_init_free_pages(page, 1 << order, - gfp_flags & __GFP_ZEROTAGS); + + /* Note that memory is already initialized by KASAN. */ + if (kasan_has_integrated_init()) + init = false; + } else { + /* Ensure page_address() dereferencing does not fault. */ + for (i = 0; i != 1 << order; ++i) + page_kasan_tag_reset(page + i); } + /* If memory is still not initialized, do it now. */ + if (init) + kernel_init_pages(page, 1 << order); + /* Propagate __GFP_SKIP_KASAN_POISON to page flags. */ + if (kasan_hw_tags_enabled() && (gfp_flags & __GFP_SKIP_KASAN_POISON)) + SetPageSkipKASanPoison(page); set_page_owner(page, order, gfp_flags); page_table_check_alloc(page, order); @@ -2469,6 +2574,9 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, del_page_from_free_list(page, zone, current_order); expand(zone, page, order, current_order, migratetype); set_pcppage_migratetype(page, migratetype); + trace_mm_page_alloc_zone_locked(page, order, migratetype, + pcp_allowed_order(order) && + migratetype < MIGRATE_PCPTYPES); return page; } @@ -2479,17 +2587,13 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, /* * This array describes the order lists are fallen back to when * the free lists for the desirable migrate type are depleted + * + * The other migratetypes do not have fallbacks. */ static int fallbacks[MIGRATE_TYPES][3] = { [MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_TYPES }, [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_TYPES }, [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_TYPES }, -#ifdef CONFIG_CMA - [MIGRATE_CMA] = { MIGRATE_TYPES }, /* Never used */ -#endif -#ifdef CONFIG_MEMORY_ISOLATION - [MIGRATE_ISOLATE] = { MIGRATE_TYPES }, /* Never used */ -#endif }; #ifdef CONFIG_CMA @@ -2554,8 +2658,8 @@ int move_freepages_block(struct zone *zone, struct page *page, *num_movable = 0; pfn = page_to_pfn(page); - start_pfn = pfn & ~(pageblock_nr_pages - 1); - end_pfn = start_pfn + pageblock_nr_pages - 1; + start_pfn = pageblock_start_pfn(pfn); + end_pfn = pageblock_end_pfn(pfn) - 1; /* Do not cross zone boundaries */ if (!zone_spans_pfn(zone, start_pfn)) @@ -2795,8 +2899,8 @@ static void reserve_highatomic_pageblock(struct page *page, struct zone *zone, /* Yoink! */ mt = get_pageblock_migratetype(page); - if (!is_migrate_highatomic(mt) && !is_migrate_isolate(mt) - && !is_migrate_cma(mt)) { + /* Only reserve normal pageblocks (i.e., they can merge with others) */ + if (migratetype_is_mergeable(mt)) { zone->nr_reserved_highatomic += pageblock_nr_pages; set_pageblock_migratetype(page, MIGRATE_HIGHATOMIC); move_freepages_block(zone, page, MIGRATE_HIGHATOMIC, NULL); @@ -2913,7 +3017,7 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype, * i.e. orders < pageblock_order. If there are no local zones free, * the zonelists will be reiterated without ALLOC_NOFRAGMENT. */ - if (alloc_flags & ALLOC_NOFRAGMENT) + if (order < pageblock_order && alloc_flags & ALLOC_NOFRAGMENT) min_order = pageblock_order; /* @@ -2996,7 +3100,7 @@ __rmqueue(struct zone *zone, unsigned int order, int migratetype, zone_page_state(zone, NR_FREE_PAGES) / 2) { page = __rmqueue_cma_fallback(zone, order); if (page) - goto out; + return page; } } retry: @@ -3009,9 +3113,6 @@ retry: alloc_flags)) goto retry; } -out: - if (page) - trace_mm_page_alloc_zone_locked(page, order, migratetype); return page; } @@ -3026,10 +3127,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, { int i, allocated = 0; - /* - * local_lock_irq held so equivalent to spin_lock_irqsave for - * both PREEMPT_RT and non-PREEMPT_RT configurations. - */ + /* Caller must hold IRQ-safe pcp->lock so IRQs are disabled. */ spin_lock(&zone->lock); for (i = 0; i < count; ++i) { struct page *page = __rmqueue(zone, order, migratetype, @@ -3037,7 +3135,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, if (unlikely(page == NULL)) break; - if (unlikely(check_pcp_refill(page))) + if (unlikely(check_pcp_refill(page, order))) continue; /* @@ -3050,7 +3148,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, * for IO devices that can merge IO requests if the physical * pages are ordered properly. */ - list_add_tail(&page->lru, list); + list_add_tail(&page->pcp_list, list); allocated++; if (is_migrate_cma(get_pcppage_migratetype(page))) __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, @@ -3073,51 +3171,48 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, * Called from the vmstat counter updater to drain pagesets of this * currently executing processor on remote nodes after they have * expired. - * - * Note that this function must be called with the thread pinned to - * a single processor. */ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp) { - unsigned long flags; int to_drain, batch; - local_lock_irqsave(&pagesets.lock, flags); batch = READ_ONCE(pcp->batch); to_drain = min(pcp->count, batch); - if (to_drain > 0) - free_pcppages_bulk(zone, to_drain, pcp); - local_unlock_irqrestore(&pagesets.lock, flags); + if (to_drain > 0) { + unsigned long flags; + + /* + * free_pcppages_bulk expects IRQs disabled for zone->lock + * so even though pcp->lock is not intended to be IRQ-safe, + * it's needed in this context. + */ + spin_lock_irqsave(&pcp->lock, flags); + free_pcppages_bulk(zone, to_drain, pcp, 0); + spin_unlock_irqrestore(&pcp->lock, flags); + } } #endif /* * Drain pcplists of the indicated processor and zone. - * - * The processor must either be the current processor and the - * thread pinned to the current processor or a processor that - * is not online. */ static void drain_pages_zone(unsigned int cpu, struct zone *zone) { - unsigned long flags; struct per_cpu_pages *pcp; - local_lock_irqsave(&pagesets.lock, flags); - pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu); - if (pcp->count) - free_pcppages_bulk(zone, pcp->count, pcp); + if (pcp->count) { + unsigned long flags; - local_unlock_irqrestore(&pagesets.lock, flags); + /* See drain_zone_pages on why this is disabling IRQs */ + spin_lock_irqsave(&pcp->lock, flags); + free_pcppages_bulk(zone, pcp->count, pcp, 0); + spin_unlock_irqrestore(&pcp->lock, flags); + } } /* * Drain pcplists of all zones on the indicated processor. - * - * The processor must either be the current processor and the - * thread pinned to the current processor or a processor that - * is not online. */ static void drain_pages(unsigned int cpu) { @@ -3130,9 +3225,6 @@ static void drain_pages(unsigned int cpu) /* * Spill all of this CPU's per-cpu pages back into the buddy allocator. - * - * The CPU has to be pinned. When zone parameter is non-NULL, spill just - * the single zone's pages. */ void drain_local_pages(struct zone *zone) { @@ -3144,24 +3236,6 @@ void drain_local_pages(struct zone *zone) drain_pages(cpu); } -static void drain_local_pages_wq(struct work_struct *work) -{ - struct pcpu_drain *drain; - - drain = container_of(work, struct pcpu_drain, work); - - /* - * drain_all_pages doesn't use proper cpu hotplug protection so - * we can race with cpu offline when the WQ can move this from - * a cpu pinned worker to an unbound one. We can operate on a different - * cpu which is alright but we also have to make sure to not move to - * a different one. - */ - migrate_disable(); - drain_local_pages(drain->zone); - migrate_enable(); -} - /* * The implementation of drain_all_pages(), exposing an extra parameter to * drain on all cpus. @@ -3183,13 +3257,6 @@ static void __drain_all_pages(struct zone *zone, bool force_all_cpus) static cpumask_t cpus_with_pcps; /* - * Make sure nobody triggers this path before mm_percpu_wq is fully - * initialized. - */ - if (WARN_ON_ONCE(!mm_percpu_wq)) - return; - - /* * Do not drain if one is already in progress unless it's specific to * a zone. Such callers are primarily CMA and memory hotplug and need * the drain to be complete when the call returns. @@ -3238,14 +3305,11 @@ static void __drain_all_pages(struct zone *zone, bool force_all_cpus) } for_each_cpu(cpu, &cpus_with_pcps) { - struct pcpu_drain *drain = per_cpu_ptr(&pcpu_drain, cpu); - - drain->zone = zone; - INIT_WORK(&drain->work, drain_local_pages_wq); - queue_work_on(cpu, mm_percpu_wq, &drain->work); + if (zone) + drain_pages_zone(cpu, zone); + else + drain_pages(cpu); } - for_each_cpu(cpu, &cpus_with_pcps) - flush_work(&per_cpu_ptr(&pcpu_drain, cpu)->work); mutex_unlock(&pcpu_drain_mutex); } @@ -3254,8 +3318,6 @@ static void __drain_all_pages(struct zone *zone, bool force_all_cpus) * Spill all the per-cpu pages from all CPUs back into the buddy allocator. * * When zone parameter is non-NULL, spill just the single zone's pages. - * - * Note that this can be extremely slow as the draining happens in a workqueue. */ void drain_all_pages(struct zone *zone) { @@ -3300,7 +3362,7 @@ void mark_free_pages(struct zone *zone) for_each_migratetype_order(order, t) { list_for_each_entry(page, - &zone->free_area[order].free_list[t], lru) { + &zone->free_area[order].free_list[t], buddy_list) { unsigned long i; pfn = page_to_pfn(page); @@ -3330,10 +3392,15 @@ static bool free_unref_page_prepare(struct page *page, unsigned long pfn, return true; } -static int nr_pcp_free(struct per_cpu_pages *pcp, int high, int batch) +static int nr_pcp_free(struct per_cpu_pages *pcp, int high, int batch, + bool free_high) { int min_nr_free, max_nr_free; + /* Free everything if batch freeing high-order pages. */ + if (unlikely(free_high)) + return pcp->count; + /* Check for PCP disabled or boot pageset */ if (unlikely(high < batch)) return 1; @@ -3354,11 +3421,12 @@ static int nr_pcp_free(struct per_cpu_pages *pcp, int high, int batch) return batch; } -static int nr_pcp_high(struct per_cpu_pages *pcp, struct zone *zone) +static int nr_pcp_high(struct per_cpu_pages *pcp, struct zone *zone, + bool free_high) { int high = READ_ONCE(pcp->high); - if (unlikely(!high)) + if (unlikely(!high || free_high)) return 0; if (!test_bit(ZONE_RECLAIM_ACTIVE, &zone->flags)) @@ -3371,24 +3439,32 @@ static int nr_pcp_high(struct per_cpu_pages *pcp, struct zone *zone) return min(READ_ONCE(pcp->batch) << 2, high); } -static void free_unref_page_commit(struct page *page, unsigned long pfn, - int migratetype, unsigned int order) +static void free_unref_page_commit(struct zone *zone, struct per_cpu_pages *pcp, + struct page *page, int migratetype, + unsigned int order) { - struct zone *zone = page_zone(page); - struct per_cpu_pages *pcp; int high; int pindex; + bool free_high; - __count_vm_event(PGFREE); - pcp = this_cpu_ptr(zone->per_cpu_pageset); + __count_vm_events(PGFREE, 1 << order); pindex = order_to_pindex(migratetype, order); - list_add(&page->lru, &pcp->lists[pindex]); + list_add(&page->pcp_list, &pcp->lists[pindex]); pcp->count += 1 << order; - high = nr_pcp_high(pcp, zone); + + /* + * As high-order pages other than THP's stored on PCP can contribute + * to fragmentation, limit the number stored when PCP is heavily + * freeing without allocation. The remainder after bulk freeing + * stops will be drained from vmstat refresh context. + */ + free_high = (pcp->free_factor && order && order <= PAGE_ALLOC_COSTLY_ORDER); + + high = nr_pcp_high(pcp, zone, free_high); if (pcp->count >= high) { int batch = READ_ONCE(pcp->batch); - free_pcppages_bulk(zone, nr_pcp_free(pcp, high, batch), pcp); + free_pcppages_bulk(zone, nr_pcp_free(pcp, high, batch, free_high), pcp, pindex); } } @@ -3398,6 +3474,9 @@ static void free_unref_page_commit(struct page *page, unsigned long pfn, void free_unref_page(struct page *page, unsigned int order) { unsigned long flags; + unsigned long __maybe_unused UP_flags; + struct per_cpu_pages *pcp; + struct zone *zone; unsigned long pfn = page_to_pfn(page); int migratetype; @@ -3420,9 +3499,16 @@ void free_unref_page(struct page *page, unsigned int order) migratetype = MIGRATE_MOVABLE; } - local_lock_irqsave(&pagesets.lock, flags); - free_unref_page_commit(page, pfn, migratetype, order); - local_unlock_irqrestore(&pagesets.lock, flags); + zone = page_zone(page); + pcp_trylock_prepare(UP_flags); + pcp = pcp_spin_trylock_irqsave(zone->per_cpu_pageset, flags); + if (pcp) { + free_unref_page_commit(zone, pcp, page, migratetype, order); + pcp_spin_unlock_irqrestore(pcp, flags); + } else { + free_one_page(zone, page, pfn, order, migratetype, FPI_NONE); + } + pcp_trylock_finish(UP_flags); } /* @@ -3431,13 +3517,15 @@ void free_unref_page(struct page *page, unsigned int order) void free_unref_page_list(struct list_head *list) { struct page *page, *next; - unsigned long flags, pfn; + struct per_cpu_pages *pcp = NULL; + struct zone *locked_zone = NULL; + unsigned long flags; int batch_count = 0; int migratetype; /* Prepare pages for freeing */ list_for_each_entry_safe(page, next, list, lru) { - pfn = page_to_pfn(page); + unsigned long pfn = page_to_pfn(page); if (!free_unref_page_prepare(page, pfn, 0)) { list_del(&page->lru); continue; @@ -3453,14 +3541,19 @@ void free_unref_page_list(struct list_head *list) free_one_page(page_zone(page), page, pfn, 0, migratetype, FPI_NONE); continue; } - - set_page_private(page, pfn); } - local_lock_irqsave(&pagesets.lock, flags); list_for_each_entry_safe(page, next, list, lru) { - pfn = page_private(page); - set_page_private(page, 0); + struct zone *zone = page_zone(page); + + /* Different zone, different pcp lock. */ + if (zone != locked_zone) { + if (pcp) + pcp_spin_unlock_irqrestore(pcp, flags); + + locked_zone = zone; + pcp = pcp_spin_lock_irqsave(locked_zone->per_cpu_pageset, flags); + } /* * Non-isolated types over MIGRATE_PCPTYPES get added @@ -3471,19 +3564,21 @@ void free_unref_page_list(struct list_head *list) migratetype = MIGRATE_MOVABLE; trace_mm_page_free_batched(page); - free_unref_page_commit(page, pfn, migratetype, 0); + free_unref_page_commit(zone, pcp, page, migratetype, 0); /* * Guard against excessive IRQ disabled times when we get * a large list of pages to free. */ if (++batch_count == SWAP_CLUSTER_MAX) { - local_unlock_irqrestore(&pagesets.lock, flags); + pcp_spin_unlock_irqrestore(pcp, flags); batch_count = 0; - local_lock_irqsave(&pagesets.lock, flags); + pcp = pcp_spin_lock_irqsave(locked_zone->per_cpu_pageset, flags); } } - local_unlock_irqrestore(&pagesets.lock, flags); + + if (pcp) + pcp_spin_unlock_irqrestore(pcp, flags); } /* @@ -3510,16 +3605,11 @@ EXPORT_SYMBOL_GPL(split_page); int __isolate_free_page(struct page *page, unsigned int order) { - unsigned long watermark; - struct zone *zone; - int mt; - - BUG_ON(!PageBuddy(page)); - - zone = page_zone(page); - mt = get_pageblock_migratetype(page); + struct zone *zone = page_zone(page); + int mt = get_pageblock_migratetype(page); if (!is_migrate_isolate(mt)) { + unsigned long watermark; /* * Obey watermarks as if the page was being allocated. We can * emulate a high-order watermark check with a raised order-0 @@ -3533,8 +3623,6 @@ int __isolate_free_page(struct page *page, unsigned int order) __mod_zone_freepage_state(zone, -(1UL << order), mt); } - /* Remove page from free list */ - del_page_from_free_list(page, zone, order); /* @@ -3545,14 +3633,16 @@ int __isolate_free_page(struct page *page, unsigned int order) struct page *endpage = page + (1 << order) - 1; for (; page < endpage; page += pageblock_nr_pages) { int mt = get_pageblock_migratetype(page); - if (!is_migrate_isolate(mt) && !is_migrate_cma(mt) - && !is_migrate_highatomic(mt)) + /* + * Only change normal pageblocks (i.e., they can merge + * with others) + */ + if (migratetype_is_mergeable(mt)) set_pageblock_migratetype(page, MIGRATE_MOVABLE); } } - return 1UL << order; } @@ -3579,8 +3669,6 @@ void __putback_isolated_page(struct page *page, unsigned int order, int mt) /* * Update NUMA hit/miss statistics - * - * Must be called with interrupts disabled. */ static inline void zone_statistics(struct zone *preferred_zone, struct zone *z, long nr_account) @@ -3605,6 +3693,43 @@ static inline void zone_statistics(struct zone *preferred_zone, struct zone *z, #endif } +static __always_inline +struct page *rmqueue_buddy(struct zone *preferred_zone, struct zone *zone, + unsigned int order, unsigned int alloc_flags, + int migratetype) +{ + struct page *page; + unsigned long flags; + + do { + page = NULL; + spin_lock_irqsave(&zone->lock, flags); + /* + * order-0 request can reach here when the pcplist is skipped + * due to non-CMA allocation context. HIGHATOMIC area is + * reserved for high-order atomic allocation, so order-0 + * request should skip it. + */ + if (order > 0 && alloc_flags & ALLOC_HARDER) + page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC); + if (!page) { + page = __rmqueue(zone, order, migratetype, alloc_flags); + if (!page) { + spin_unlock_irqrestore(&zone->lock, flags); + return NULL; + } + } + __mod_zone_freepage_state(zone, -(1 << order), + get_pcppage_migratetype(page)); + spin_unlock_irqrestore(&zone->lock, flags); + } while (check_new_pages(page, order)); + + __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order); + zone_statistics(preferred_zone, zone, 1); + + return page; +} + /* Remove page from the per-cpu list, caller must protect the list */ static inline struct page *__rmqueue_pcplist(struct zone *zone, unsigned int order, @@ -3638,10 +3763,10 @@ struct page *__rmqueue_pcplist(struct zone *zone, unsigned int order, return NULL; } - page = list_first_entry(list, struct page, lru); - list_del(&page->lru); + page = list_first_entry(list, struct page, pcp_list); + list_del(&page->pcp_list); pcp->count -= 1 << order; - } while (check_new_pcp(page)); + } while (check_new_pcp(page, order)); return page; } @@ -3649,45 +3774,68 @@ struct page *__rmqueue_pcplist(struct zone *zone, unsigned int order, /* Lock and remove page from the per-cpu list */ static struct page *rmqueue_pcplist(struct zone *preferred_zone, struct zone *zone, unsigned int order, - gfp_t gfp_flags, int migratetype, - unsigned int alloc_flags) + int migratetype, unsigned int alloc_flags) { struct per_cpu_pages *pcp; struct list_head *list; struct page *page; unsigned long flags; + unsigned long __maybe_unused UP_flags; - local_lock_irqsave(&pagesets.lock, flags); + /* + * spin_trylock may fail due to a parallel drain. In the future, the + * trylock will also protect against IRQ reentrancy. + */ + pcp_trylock_prepare(UP_flags); + pcp = pcp_spin_trylock_irqsave(zone->per_cpu_pageset, flags); + if (!pcp) { + pcp_trylock_finish(UP_flags); + return NULL; + } /* * On allocation, reduce the number of pages that are batch freed. * See nr_pcp_free() where free_factor is increased for subsequent * frees. */ - pcp = this_cpu_ptr(zone->per_cpu_pageset); pcp->free_factor >>= 1; list = &pcp->lists[order_to_pindex(migratetype, order)]; page = __rmqueue_pcplist(zone, order, migratetype, alloc_flags, pcp, list); - local_unlock_irqrestore(&pagesets.lock, flags); + pcp_spin_unlock_irqrestore(pcp, flags); + pcp_trylock_finish(UP_flags); if (page) { - __count_zid_vm_events(PGALLOC, page_zonenum(page), 1); + __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order); zone_statistics(preferred_zone, zone, 1); } return page; } /* - * Allocate a page from the given zone. Use pcplists for order-0 allocations. + * Allocate a page from the given zone. + * Use pcplists for THP or "cheap" high-order allocations. */ + +/* + * Do not instrument rmqueue() with KMSAN. This function may call + * __msan_poison_alloca() through a call to set_pfnblock_flags_mask(). + * If __msan_poison_alloca() attempts to allocate pages for the stack depot, it + * may call rmqueue() again, which will result in a deadlock. + */ +__no_sanitize_memory static inline struct page *rmqueue(struct zone *preferred_zone, struct zone *zone, unsigned int order, gfp_t gfp_flags, unsigned int alloc_flags, int migratetype) { - unsigned long flags; struct page *page; + /* + * We most definitely don't want callers attempting to + * allocate greater than order-1 page units with __GFP_NOFAIL. + */ + WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1)); + if (likely(pcp_allowed_order(order))) { /* * MIGRATE_MOVABLE pcplist could have the pages on CMA area and @@ -3696,57 +3844,24 @@ struct page *rmqueue(struct zone *preferred_zone, if (!IS_ENABLED(CONFIG_CMA) || alloc_flags & ALLOC_CMA || migratetype != MIGRATE_MOVABLE) { page = rmqueue_pcplist(preferred_zone, zone, order, - gfp_flags, migratetype, alloc_flags); - goto out; + migratetype, alloc_flags); + if (likely(page)) + goto out; } } - /* - * We most definitely don't want callers attempting to - * allocate greater than order-1 page units with __GFP_NOFAIL. - */ - WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1)); - spin_lock_irqsave(&zone->lock, flags); - - do { - page = NULL; - /* - * order-0 request can reach here when the pcplist is skipped - * due to non-CMA allocation context. HIGHATOMIC area is - * reserved for high-order atomic allocation, so order-0 - * request should skip it. - */ - if (order > 0 && alloc_flags & ALLOC_HARDER) { - page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC); - if (page) - trace_mm_page_alloc_zone_locked(page, order, migratetype); - } - if (!page) - page = __rmqueue(zone, order, migratetype, alloc_flags); - } while (page && check_new_pages(page, order)); - if (!page) - goto failed; - - __mod_zone_freepage_state(zone, -(1 << order), - get_pcppage_migratetype(page)); - spin_unlock_irqrestore(&zone->lock, flags); - - __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order); - zone_statistics(preferred_zone, zone, 1); + page = rmqueue_buddy(preferred_zone, zone, order, alloc_flags, + migratetype); out: /* Separate test+clear to avoid unnecessary atomics */ - if (test_bit(ZONE_BOOSTED_WATERMARK, &zone->flags)) { + if (unlikely(test_bit(ZONE_BOOSTED_WATERMARK, &zone->flags))) { clear_bit(ZONE_BOOSTED_WATERMARK, &zone->flags); wakeup_kswapd(zone, 0, 0, zone_idx(zone)); } VM_BUG_ON_PAGE(page && bad_range(zone, page), page); return page; - -failed: - spin_unlock_irqrestore(&zone->lock, flags); - return NULL; } #ifdef CONFIG_FAIL_PAGE_ALLOC @@ -3772,6 +3887,8 @@ __setup("fail_page_alloc=", setup_fail_page_alloc); static bool __should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) { + int flags = 0; + if (order < fail_page_alloc.min_order) return false; if (gfp_mask & __GFP_NOFAIL) @@ -3782,7 +3899,11 @@ static bool __should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) (gfp_mask & __GFP_DIRECT_RECLAIM)) return false; - return should_fail(&fail_page_alloc.attr, 1 << order); + /* See comment in __should_failslab() */ + if (gfp_mask & __GFP_NOWARN) + flags |= FAULT_NOWARN; + + return should_fail_ex(&fail_page_alloc.attr, 1 << order, flags); } #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS @@ -3936,11 +4057,15 @@ static inline bool zone_watermark_fast(struct zone *z, unsigned int order, * need to be calculated. */ if (!order) { - long fast_free; + long usable_free; + long reserved; + + usable_free = free_pages; + reserved = __zone_watermark_unusable_free(z, 0, alloc_flags); - fast_free = free_pages; - fast_free -= __zone_watermark_unusable_free(z, 0, alloc_flags); - if (fast_free > mark + z->lowmem_reserve[highest_zoneidx]) + /* reserved may over estimate high-atomic reserves. */ + usable_free -= min(usable_free, reserved); + if (usable_free > mark + z->lowmem_reserve[highest_zoneidx]) return true; } @@ -4051,13 +4176,14 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags, { struct zoneref *z; struct zone *zone; - struct pglist_data *last_pgdat_dirty_limit = NULL; + struct pglist_data *last_pgdat = NULL; + bool last_pgdat_dirty_ok = false; bool no_fallback; retry: /* * Scan zonelist, looking for a zone with enough free. - * See also __cpuset_node_allowed() comment in kernel/cpuset.c. + * See also __cpuset_node_allowed() comment in kernel/cgroup/cpuset.c. */ no_fallback = alloc_flags & ALLOC_NOFRAGMENT; z = ac->preferred_zoneref; @@ -4090,13 +4216,13 @@ retry: * dirty-throttling and the flusher threads. */ if (ac->spread_dirty_pages) { - if (last_pgdat_dirty_limit == zone->zone_pgdat) - continue; + if (last_pgdat != zone->zone_pgdat) { + last_pgdat = zone->zone_pgdat; + last_pgdat_dirty_ok = node_dirty_ok(zone->zone_pgdat); + } - if (!node_dirty_ok(zone->zone_pgdat)) { - last_pgdat_dirty_limit = zone->zone_pgdat; + if (!last_pgdat_dirty_ok) continue; - } } if (no_fallback && nr_online_nodes > 1 && @@ -4211,7 +4337,7 @@ static void warn_alloc_show_mem(gfp_t gfp_mask, nodemask_t *nodemask) if (!in_task() || !(gfp_mask & __GFP_DIRECT_RECLAIM)) filter &= ~SHOW_MEM_FILTER_NODES; - show_mem(filter, nodemask); + __show_mem(filter, nodemask, gfp_zone(gfp_mask)); } void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...) @@ -4329,7 +4455,8 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, */ /* Exhausted what can be done so it's blame time */ - if (out_of_memory(&oc) || WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL)) { + if (out_of_memory(&oc) || + WARN_ON_ONCE_GFP(gfp_mask & __GFP_NOFAIL, gfp_mask)) { *did_some_progress = 1; /* @@ -4589,19 +4716,42 @@ void fs_reclaim_release(gfp_t gfp_mask) EXPORT_SYMBOL_GPL(fs_reclaim_release); #endif +/* + * Zonelists may change due to hotplug during allocation. Detect when zonelists + * have been rebuilt so allocation retries. Reader side does not lock and + * retries the allocation if zonelist changes. Writer side is protected by the + * embedded spin_lock. + */ +static DEFINE_SEQLOCK(zonelist_update_seq); + +static unsigned int zonelist_iter_begin(void) +{ + if (IS_ENABLED(CONFIG_MEMORY_HOTREMOVE)) + return read_seqbegin(&zonelist_update_seq); + + return 0; +} + +static unsigned int check_retry_zonelist(unsigned int seq) +{ + if (IS_ENABLED(CONFIG_MEMORY_HOTREMOVE)) + return read_seqretry(&zonelist_update_seq, seq); + + return seq; +} + /* Perform direct synchronous page reclaim */ static unsigned long __perform_reclaim(gfp_t gfp_mask, unsigned int order, const struct alloc_context *ac) { unsigned int noreclaim_flag; - unsigned long pflags, progress; + unsigned long progress; cond_resched(); /* We now go into synchronous reclaim */ cpuset_memory_pressure_bump(); - psi_memstall_enter(&pflags); fs_reclaim_acquire(gfp_mask); noreclaim_flag = memalloc_noreclaim_save(); @@ -4610,7 +4760,6 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order, memalloc_noreclaim_restore(noreclaim_flag); fs_reclaim_release(gfp_mask); - psi_memstall_leave(&pflags); cond_resched(); @@ -4624,11 +4773,13 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, unsigned long *did_some_progress) { struct page *page = NULL; + unsigned long pflags; bool drained = false; + psi_memstall_enter(&pflags); *did_some_progress = __perform_reclaim(gfp_mask, order, ac); if (unlikely(!(*did_some_progress))) - return NULL; + goto out; retry: page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac); @@ -4644,6 +4795,8 @@ retry: drained = true; goto retry; } +out: + psi_memstall_leave(&pflags); return page; } @@ -4658,9 +4811,12 @@ static void wake_all_kswapds(unsigned int order, gfp_t gfp_mask, for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, highest_zoneidx, ac->nodemask) { - if (last_pgdat != zone->zone_pgdat) + if (!managed_zone(zone)) + continue; + if (last_pgdat != zone->zone_pgdat) { wakeup_kswapd(zone, gfp_mask, order, highest_zoneidx); - last_pgdat = zone->zone_pgdat; + last_pgdat = zone->zone_pgdat; + } } } @@ -4877,6 +5033,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, int compaction_retries; int no_progress_loops; unsigned int cpuset_mems_cookie; + unsigned int zonelist_iter_cookie; int reserve_flags; /* @@ -4887,11 +5044,12 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM))) gfp_mask &= ~__GFP_ATOMIC; -retry_cpuset: +restart: compaction_retries = 0; no_progress_loops = 0; compact_priority = DEF_COMPACT_PRIORITY; cpuset_mems_cookie = read_mems_allowed_begin(); + zonelist_iter_cookie = zonelist_iter_begin(); /* * The fast path uses conservative alloc_flags to succeed only until @@ -4997,7 +5155,8 @@ retry: reserve_flags = __gfp_pfmemalloc_flags(gfp_mask); if (reserve_flags) - alloc_flags = gfp_to_alloc_flags_cma(gfp_mask, reserve_flags); + alloc_flags = gfp_to_alloc_flags_cma(gfp_mask, reserve_flags) | + (alloc_flags & ALLOC_KSWAPD); /* * Reset the nodemask and zonelist iterators if memory policies can be @@ -5063,9 +5222,13 @@ retry: goto retry; - /* Deal with possible cpuset update races before we start OOM killing */ - if (check_retry_cpuset(cpuset_mems_cookie, ac)) - goto retry_cpuset; + /* + * Deal with possible cpuset update races or zonelist updates to avoid + * a unnecessary OOM kill. + */ + if (check_retry_cpuset(cpuset_mems_cookie, ac) || + check_retry_zonelist(zonelist_iter_cookie)) + goto restart; /* Reclaim has failed us, start killing things */ page = __alloc_pages_may_oom(gfp_mask, order, ac, &did_some_progress); @@ -5085,9 +5248,13 @@ retry: } nopage: - /* Deal with possible cpuset update races before we fail */ - if (check_retry_cpuset(cpuset_mems_cookie, ac)) - goto retry_cpuset; + /* + * Deal with possible cpuset update races or zonelist updates to avoid + * a unnecessary OOM kill. + */ + if (check_retry_cpuset(cpuset_mems_cookie, ac) || + check_retry_zonelist(zonelist_iter_cookie)) + goto restart; /* * Make sure that __GFP_NOFAIL request doesn't leak out and make sure @@ -5098,7 +5265,7 @@ nopage: * All existing users of the __GFP_NOFAIL are blockable, so warn * of any new users that actually require GFP_NOWAIT */ - if (WARN_ON_ONCE(!can_direct_reclaim)) + if (WARN_ON_ONCE_GFP(!can_direct_reclaim, gfp_mask)) goto fail; /* @@ -5106,7 +5273,7 @@ nopage: * because we cannot reclaim anything and only can loop waiting * for somebody to do a work for us */ - WARN_ON_ONCE(current->flags & PF_MEMALLOC); + WARN_ON_ONCE_GFP(current->flags & PF_MEMALLOC, gfp_mask); /* * non failing costly orders are a hard requirement which we @@ -5114,7 +5281,7 @@ nopage: * so that we can identify them and convert them to something * else. */ - WARN_ON_ONCE(order > PAGE_ALLOC_COSTLY_ORDER); + WARN_ON_ONCE_GFP(costly_order, gfp_mask); /* * Help non-failing allocations by giving them access to memory @@ -5158,10 +5325,7 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order, *alloc_flags |= ALLOC_CPUSET; } - fs_reclaim_acquire(gfp_mask); - fs_reclaim_release(gfp_mask); - - might_sleep_if(gfp_mask & __GFP_DIRECT_RECLAIM); + might_alloc(gfp_mask); if (should_fail_alloc_page(gfp_mask, order)) return false; @@ -5209,6 +5373,7 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid, { struct page *page; unsigned long flags; + unsigned long __maybe_unused UP_flags; struct zone *zone; struct zoneref *z; struct per_cpu_pages *pcp; @@ -5289,11 +5454,14 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid, if (unlikely(!zone)) goto failed; + /* Is a parallel drain in progress? */ + pcp_trylock_prepare(UP_flags); + pcp = pcp_spin_trylock_irqsave(zone->per_cpu_pageset, flags); + if (!pcp) + goto failed_irq; + /* Attempt the batch allocation */ - local_lock_irqsave(&pagesets.lock, flags); - pcp = this_cpu_ptr(zone->per_cpu_pageset); pcp_list = &pcp->lists[order_to_pindex(ac.migratetype, 0)]; - while (nr_populated < nr_pages) { /* Skip existing pages */ @@ -5305,9 +5473,11 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid, page = __rmqueue_pcplist(zone, 0, ac.migratetype, alloc_flags, pcp, pcp_list); if (unlikely(!page)) { - /* Try and get at least one page */ - if (!nr_populated) + /* Try and allocate at least one page */ + if (!nr_account) { + pcp_spin_unlock_irqrestore(pcp, flags); goto failed_irq; + } break; } nr_account++; @@ -5320,7 +5490,8 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid, nr_populated++; } - local_unlock_irqrestore(&pagesets.lock, flags); + pcp_spin_unlock_irqrestore(pcp, flags); + pcp_trylock_finish(UP_flags); __count_zid_vm_events(PGALLOC, zone_idx(zone), nr_account); zone_statistics(ac.preferred_zoneref->zone, zone, nr_account); @@ -5329,7 +5500,7 @@ out: return nr_populated; failed_irq: - local_unlock_irqrestore(&pagesets.lock, flags); + pcp_trylock_finish(UP_flags); failed: page = __alloc_pages(gfp, 0, preferred_nid, nodemask); @@ -5360,10 +5531,8 @@ struct page *__alloc_pages(gfp_t gfp, unsigned int order, int preferred_nid, * There are several places where we assume that the order value is sane * so bail out early if the request is out of bound. */ - if (unlikely(order >= MAX_ORDER)) { - WARN_ON_ONCE(!(gfp & __GFP_NOWARN)); + if (WARN_ON_ONCE_GFP(order >= MAX_ORDER, gfp)) return NULL; - } gfp &= gfp_allowed_mask; /* @@ -5409,6 +5578,7 @@ out: } trace_mm_page_alloc(page, order, alloc_gfp, ac.migratetype); + kmsan_alloc_page(page, order, alloc_gfp); return page; } @@ -5580,6 +5750,18 @@ refill: /* reset page count bias and offset to start of new frag */ nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1; offset = size - fragsz; + if (unlikely(offset < 0)) { + /* + * The caller is trying to allocate a fragment + * with fragsz > PAGE_SIZE but the cache isn't big + * enough to satisfy the request, this may + * happen in low memory conditions. + * We don't release the cache page because + * it could make memory pressure worse + * so we simply return NULL here. + */ + return NULL; + } } nc->pagecnt_bias--; @@ -5606,14 +5788,18 @@ static void *make_alloc_exact(unsigned long addr, unsigned int order, size_t size) { if (addr) { - unsigned long alloc_end = addr + (PAGE_SIZE << order); - unsigned long used = addr + PAGE_ALIGN(size); + unsigned long nr = DIV_ROUND_UP(size, PAGE_SIZE); + struct page *page = virt_to_page((void *)addr); + struct page *last = page + nr; - split_page(virt_to_page((void *)addr), order); - while (used < alloc_end) { - free_page(used); - used += PAGE_SIZE; - } + split_page_owner(page, 1 << order); + split_page_memcg(page, 1 << order); + while (page < --last) + set_page_refcounted(last); + + last = page + (1UL << order); + for (page += nr; page < last; page++) + __free_pages_ok(page, 0, FPI_TO_TAIL); } return (void *)addr; } @@ -5762,14 +5948,14 @@ long si_mem_available(void) /* * Estimate the amount of memory available for userspace allocations, - * without causing swapping. + * without causing swapping or OOM. */ available = global_zone_page_state(NR_FREE_PAGES) - totalreserve_pages; /* * Not all the page cache can be freed, otherwise the system will - * start swapping. Assume at least half of the page cache, or the - * low watermark worth of cache, needs to stay. + * start swapping or thrashing. Assume at least half of the page + * cache, or the low watermark worth of cache, needs to stay. */ pagecache = pages[LRU_ACTIVE_FILE] + pages[LRU_INACTIVE_FILE]; pagecache -= min(pagecache / 2, wmark_low); @@ -5885,6 +6071,15 @@ static void show_migration_types(unsigned char type) printk(KERN_CONT "(%s) ", tmp); } +static bool node_has_managed_zones(pg_data_t *pgdat, int max_zone_idx) +{ + int zone_idx; + for (zone_idx = 0; zone_idx <= max_zone_idx; zone_idx++) + if (zone_managed_pages(pgdat->node_zones + zone_idx)) + return true; + return false; +} + /* * Show free area list (used inside shift_scroll-lock stuff) * We also calculate the percentage fragmentation. We do this by counting the @@ -5894,14 +6089,16 @@ static void show_migration_types(unsigned char type) * SHOW_MEM_FILTER_NODES: suppress nodes that are not allowed by current's * cpuset. */ -void show_free_areas(unsigned int filter, nodemask_t *nodemask) +void __show_free_areas(unsigned int filter, nodemask_t *nodemask, int max_zone_idx) { unsigned long free_pcp = 0; - int cpu; + int cpu, nid; struct zone *zone; pg_data_t *pgdat; for_each_populated_zone(zone) { + if (zone_idx(zone) > max_zone_idx) + continue; if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask)) continue; @@ -5913,7 +6110,8 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) " active_file:%lu inactive_file:%lu isolated_file:%lu\n" " unevictable:%lu dirty:%lu writeback:%lu\n" " slab_reclaimable:%lu slab_unreclaimable:%lu\n" - " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n" + " mapped:%lu shmem:%lu pagetables:%lu\n" + " sec_pagetables:%lu bounce:%lu\n" " kernel_misc_reclaimable:%lu\n" " free:%lu free_pcp:%lu free_cma:%lu\n", global_node_page_state(NR_ACTIVE_ANON), @@ -5930,6 +6128,7 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) global_node_page_state(NR_FILE_MAPPED), global_node_page_state(NR_SHMEM), global_node_page_state(NR_PAGETABLE), + global_node_page_state(NR_SECONDARY_PAGETABLE), global_zone_page_state(NR_BOUNCE), global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE), global_zone_page_state(NR_FREE_PAGES), @@ -5939,6 +6138,8 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) for_each_online_pgdat(pgdat) { if (show_mem_node_skip(filter, pgdat->node_id, nodemask)) continue; + if (!node_has_managed_zones(pgdat, max_zone_idx)) + continue; printk("Node %d" " active_anon:%lukB" @@ -5963,6 +6164,7 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) " shadow_call_stack:%lukB" #endif " pagetables:%lukB" + " sec_pagetables:%lukB" " all_unreclaimable? %s" "\n", pgdat->node_id, @@ -5988,6 +6190,7 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) node_page_state(pgdat, NR_KERNEL_SCS_KB), #endif K(node_page_state(pgdat, NR_PAGETABLE)), + K(node_page_state(pgdat, NR_SECONDARY_PAGETABLE)), pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES ? "yes" : "no"); } @@ -5995,6 +6198,8 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) for_each_populated_zone(zone) { int i; + if (zone_idx(zone) > max_zone_idx) + continue; if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask)) continue; @@ -6056,6 +6261,8 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) unsigned long nr[MAX_ORDER], flags, total = 0; unsigned char types[MAX_ORDER]; + if (zone_idx(zone) > max_zone_idx) + continue; if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask)) continue; show_node(zone); @@ -6085,7 +6292,11 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) printk(KERN_CONT "= %lukB\n", K(total)); } - hugetlb_show_meminfo(); + for_each_online_node(nid) { + if (show_mem_node_skip(filter, nid, nodemask)) + continue; + hugetlb_show_meminfo_node(nid); + } printk("%ld total pagecache pages\n", global_node_page_state(NR_FILE_PAGES)); @@ -6112,7 +6323,7 @@ static int build_zonerefs_node(pg_data_t *pgdat, struct zoneref *zonerefs) do { zone_type--; zone = pgdat->node_zones + zone_type; - if (managed_zone(zone)) { + if (populated_zone(zone)) { zoneref_set_zone(zone, &zonerefs[nr_zones++]); check_highest_zone(zone_type); } @@ -6152,7 +6363,6 @@ int numa_zonelist_order_handler(struct ctl_table *table, int write, } -#define MAX_NODE_LOAD (nr_online_nodes) static int node_load[MAX_NUMNODES]; /** @@ -6199,7 +6409,7 @@ int find_next_best_node(int node, nodemask_t *used_node_mask) val += PENALTY_FOR_NODE_WITH_CPUS; /* Slight preference for less loaded node */ - val *= (MAX_NODE_LOAD*MAX_NUMNODES); + val *= MAX_NUMNODES; val += node_load[n]; if (val < min_val) { @@ -6265,13 +6475,12 @@ static void build_thisnode_zonelists(pg_data_t *pgdat) static void build_zonelists(pg_data_t *pgdat) { static int node_order[MAX_NUMNODES]; - int node, load, nr_nodes = 0; + int node, nr_nodes = 0; nodemask_t used_mask = NODE_MASK_NONE; int local_node, prev_node; /* NUMA-aware ordering of nodes */ local_node = pgdat->node_id; - load = nr_online_nodes; prev_node = local_node; memset(node_order, 0, sizeof(node_order)); @@ -6283,11 +6492,10 @@ static void build_zonelists(pg_data_t *pgdat) */ if (node_distance(local_node, node) != node_distance(local_node, prev_node)) - node_load[node] += load; + node_load[node] += 1; node_order[nr_nodes++] = node; prev_node = node; - load--; } build_zonelists_in_node_order(pgdat, node_order, nr_nodes); @@ -6387,9 +6595,8 @@ static void __build_all_zonelists(void *data) int nid; int __maybe_unused cpu; pg_data_t *self = data; - static DEFINE_SPINLOCK(lock); - spin_lock(&lock); + write_seqlock(&zonelist_update_seq); #ifdef CONFIG_NUMA memset(node_load, 0, sizeof(node_load)); @@ -6402,7 +6609,11 @@ static void __build_all_zonelists(void *data) if (self && !node_online(self->node_id)) { build_zonelists(self); } else { - for_each_online_node(nid) { + /* + * All possible nodes have pgdat preallocated + * in free_area_init + */ + for_each_node(nid) { pg_data_t *pgdat = NODE_DATA(nid); build_zonelists(pgdat); @@ -6422,7 +6633,7 @@ static void __build_all_zonelists(void *data) #endif } - spin_unlock(&lock); + write_sequnlock(&zonelist_update_seq); } static noinline void __init @@ -6573,7 +6784,7 @@ void __meminit memmap_init_range(unsigned long size, int nid, unsigned long zone * such that unmovable allocations won't be scattered all * over the place during system boot. */ - if (IS_ALIGNED(pfn, pageblock_nr_pages)) { + if (pageblock_aligned(pfn)) { set_pageblock_migratetype(page, migratetype); cond_resched(); } @@ -6616,10 +6827,33 @@ static void __ref __init_zone_device_page(struct page *page, unsigned long pfn, * Please note that MEMINIT_HOTPLUG path doesn't clear memmap * because this is done early in section_activate() */ - if (IS_ALIGNED(pfn, pageblock_nr_pages)) { + if (pageblock_aligned(pfn)) { set_pageblock_migratetype(page, MIGRATE_MOVABLE); cond_resched(); } + + /* + * ZONE_DEVICE pages are released directly to the driver page allocator + * which will set the page count to 1 when allocating the page. + */ + if (pgmap->type == MEMORY_DEVICE_PRIVATE || + pgmap->type == MEMORY_DEVICE_COHERENT) + set_page_count(page, 0); +} + +/* + * With compound page geometry and when struct pages are stored in ram most + * tail pages are reused. Consequently, the amount of unique struct pages to + * initialize is a lot smaller that the total amount of struct pages being + * mapped. This is a paired / mild layering violation with explicit knowledge + * of how the sparse_vmemmap internals handle compound pages in the lack + * of an altmap. See vmemmap_populate_compound_pages(). + */ +static inline unsigned long compound_nr_pages(struct vmem_altmap *altmap, + unsigned long nr_pages) +{ + return is_power_of_2(sizeof(struct page)) && + !altmap ? 2 * (PAGE_SIZE / sizeof(struct page)) : nr_pages; } static void __ref memmap_init_compound(struct page *head, @@ -6664,7 +6898,7 @@ void __ref memmap_init_zone_device(struct zone *zone, unsigned long start = jiffies; int nid = pgdat->node_id; - if (WARN_ON_ONCE(!pgmap || zone_idx(zone) != ZONE_DEVICE)) + if (WARN_ON_ONCE(!pgmap || zone_idx != ZONE_DEVICE)) return; /* @@ -6686,7 +6920,7 @@ void __ref memmap_init_zone_device(struct zone *zone, continue; memmap_init_compound(page, pfn, zone_idx, nid, pgmap, - pfns_per_compound); + compound_nr_pages(altmap, pfns_per_compound)); } pr_info("%s initialised %lu pages in %ums\n", __func__, @@ -6733,9 +6967,8 @@ static void __init init_unavailable_range(unsigned long spfn, u64 pgcnt = 0; for (pfn = spfn; pfn < epfn; pfn++) { - if (!pfn_valid(ALIGN_DOWN(pfn, pageblock_nr_pages))) { - pfn = ALIGN_DOWN(pfn, pageblock_nr_pages) - + pageblock_nr_pages - 1; + if (!pfn_valid(pageblock_start_pfn(pfn))) { + pfn = pageblock_end_pfn(pfn) - 1; continue; } __init_single_page(pfn_to_page(pfn), pfn, zone, node); @@ -6840,7 +7073,7 @@ static int zone_batchsize(struct zone *zone) * size is striking a balance between allocation latency * and zone lock contention. */ - batch = min(zone_managed_pages(zone) >> 10, (1024 * 1024) / PAGE_SIZE); + batch = min(zone_managed_pages(zone) >> 10, SZ_1M / PAGE_SIZE); batch /= 4; /* We effectively *= 4 below */ if (batch < 1) batch = 1; @@ -6955,6 +7188,7 @@ static void per_cpu_pages_init(struct per_cpu_pages *pcp, struct per_cpu_zonesta memset(pcp, 0, sizeof(*pcp)); memset(pzstats, 0, sizeof(*pzstats)); + spin_lock_init(&pcp->lock); for (pindex = 0; pindex < NR_PCP_LISTS; pindex++) INIT_LIST_HEAD(&pcp->lists[pindex]); @@ -7024,6 +7258,17 @@ void __meminit setup_zone_pageset(struct zone *zone) } /* + * The zone indicated has a new number of managed_pages; batch sizes and percpu + * page high values need to be recalculated. + */ +static void zone_pcp_update(struct zone *zone, int cpu_online) +{ + mutex_lock(&pcp_batch_high_lock); + zone_set_pageset_high_and_batch(zone, cpu_online); + mutex_unlock(&pcp_batch_high_lock); +} + +/* * Allocate per cpu pagesets and initialize them. * Before this call only boot pagesets were available. */ @@ -7389,16 +7634,15 @@ static inline void setup_usemap(struct zone *zone) {} /* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */ void __init set_pageblock_order(void) { - unsigned int order; + unsigned int order = MAX_ORDER - 1; /* Check that pageblock_nr_pages has not already been setup */ if (pageblock_order) return; - if (HPAGE_SHIFT > PAGE_SHIFT) + /* Don't let pageblocks exceed the maximum allocation granularity. */ + if (HPAGE_SHIFT > PAGE_SHIFT && HUGETLB_PAGE_ORDER < order) order = HUGETLB_PAGE_ORDER; - else - order = MAX_ORDER - 1; /* * Assume the largest contiguous order of interest is a huge page. @@ -7468,6 +7712,7 @@ static void __meminit pgdat_init_internals(struct pglist_data *pgdat) int i; pgdat_resize_init(pgdat); + pgdat_kswapd_lock_init(pgdat); pgdat_init_split_queue(pgdat); pgdat_init_kcompactd(pgdat); @@ -7502,12 +7747,33 @@ static void __meminit zone_init_internals(struct zone *zone, enum zone_type idx, * NOTE: this function is only called during memory hotplug */ #ifdef CONFIG_MEMORY_HOTPLUG -void __ref free_area_init_core_hotplug(int nid) +void __ref free_area_init_core_hotplug(struct pglist_data *pgdat) { + int nid = pgdat->node_id; enum zone_type z; - pg_data_t *pgdat = NODE_DATA(nid); + int cpu; pgdat_init_internals(pgdat); + + if (pgdat->per_cpu_nodestats == &boot_nodestats) + pgdat->per_cpu_nodestats = alloc_percpu(struct per_cpu_nodestat); + + /* + * Reset the nr_zones, order and highest_zoneidx before reuse. + * Note that kswapd will init kswapd_highest_zoneidx properly + * when it starts in the near future. + */ + pgdat->nr_zones = 0; + pgdat->kswapd_order = 0; + pgdat->kswapd_highest_zoneidx = 0; + pgdat->node_start_pfn = 0; + for_each_online_cpu(cpu) { + struct per_cpu_nodestat *p; + + p = per_cpu_ptr(pgdat->per_cpu_nodestats, cpu); + memset(p, 0, sizeof(*p)); + } + for (z = 0; z < MAX_NR_ZONES; z++) zone_init_internals(&pgdat->node_zones[z], z, nid, 0); } @@ -7657,9 +7923,14 @@ static void __init free_area_init_node(int nid) pgdat->node_start_pfn = start_pfn; pgdat->per_cpu_nodestats = NULL; - pr_info("Initmem setup node %d [mem %#018Lx-%#018Lx]\n", nid, - (u64)start_pfn << PAGE_SHIFT, - end_pfn ? ((u64)end_pfn << PAGE_SHIFT) - 1 : 0); + if (start_pfn != end_pfn) { + pr_info("Initmem setup node %d [mem %#018Lx-%#018Lx]\n", nid, + (u64)start_pfn << PAGE_SHIFT, + end_pfn ? ((u64)end_pfn << PAGE_SHIFT) - 1 : 0); + } else { + pr_info("Initmem setup node %d as memoryless\n", nid); + } + calculate_node_totalpages(pgdat, start_pfn, end_pfn); alloc_node_mem_map(pgdat); @@ -7668,7 +7939,7 @@ static void __init free_area_init_node(int nid) free_area_init_core(pgdat); } -void __init free_area_init_memoryless_node(int nid) +static void __init free_area_init_memoryless_node(int nid) { free_area_init_node(nid); } @@ -7736,17 +8007,6 @@ unsigned long __init node_map_pfn_alignment(void) return ~accl_mask + 1; } -/** - * find_min_pfn_with_active_regions - Find the minimum PFN registered - * - * Return: the minimum PFN based on information provided via - * memblock_set_node(). - */ -unsigned long __init find_min_pfn_with_active_regions(void) -{ - return PHYS_PFN(memblock_start_of_DRAM()); -} - /* * early_calculate_totalpages() * Sum pages in active regions for movable zone. @@ -7822,7 +8082,7 @@ static void __init find_zone_movable_pfns_for_nodes(void) usable_startpfn = memblock_region_memory_base_pfn(r); - if (usable_startpfn < 0x100000) { + if (usable_startpfn < PHYS_PFN(SZ_4G)) { mem_below_4gb_not_mirrored = true; continue; } @@ -7972,10 +8232,17 @@ restart: out2: /* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */ - for (nid = 0; nid < MAX_NUMNODES; nid++) + for (nid = 0; nid < MAX_NUMNODES; nid++) { + unsigned long start_pfn, end_pfn; + zone_movable_pfn[nid] = roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES); + get_pfn_range_for_nid(nid, &start_pfn, &end_pfn); + if (zone_movable_pfn[nid] >= end_pfn) + zone_movable_pfn[nid] = 0; + } + out: /* restore the node_state */ node_states[N_MEMORY] = saved_node_state; @@ -8032,7 +8299,7 @@ void __init free_area_init(unsigned long *max_zone_pfn) memset(arch_zone_highest_possible_pfn, 0, sizeof(arch_zone_highest_possible_pfn)); - start_pfn = find_min_pfn_with_active_regions(); + start_pfn = PHYS_PFN(memblock_start_of_DRAM()); descending = arch_has_descending_max_zone_pfns(); for (i = 0; i < MAX_NR_ZONES; i++) { @@ -8096,8 +8363,36 @@ void __init free_area_init(unsigned long *max_zone_pfn) /* Initialise every node */ mminit_verify_pageflags_layout(); setup_nr_node_ids(); - for_each_online_node(nid) { - pg_data_t *pgdat = NODE_DATA(nid); + for_each_node(nid) { + pg_data_t *pgdat; + + if (!node_online(nid)) { + pr_info("Initializing node %d as memoryless\n", nid); + + /* Allocator not initialized yet */ + pgdat = arch_alloc_nodedata(nid); + if (!pgdat) { + pr_err("Cannot allocate %zuB for node %d.\n", + sizeof(*pgdat), nid); + continue; + } + arch_refresh_nodedata(nid, pgdat); + free_area_init_memoryless_node(nid); + + /* + * We do not want to confuse userspace by sysfs + * files/directories for node without any memory + * attached to it, so this node is not marked as + * N_MEMORY and not marked online so that no sysfs + * hierarchy will be created via register_one_node for + * it. The pgdat will get fully initialized by + * hotadd_init_pgdat() when memory is hotplugged into + * this node. + */ + continue; + } + + pgdat = NODE_DATA(nid); free_area_init_node(nid); /* Any memory on that node */ @@ -8253,8 +8548,8 @@ void __init mem_init_print_info(void) #endif ")\n", K(nr_free_pages()), K(physpages), - codesize >> 10, datasize >> 10, rosize >> 10, - (init_data_size + init_code_size) >> 10, bss_size >> 10, + codesize / SZ_1K, datasize / SZ_1K, rosize / SZ_1K, + (init_data_size + init_code_size) / SZ_1K, bss_size / SZ_1K, K(physpages - totalram_pages() - totalcma_pages), K(totalcma_pages) #ifdef CONFIG_HIGHMEM @@ -8284,6 +8579,7 @@ static int page_alloc_cpu_dead(unsigned int cpu) struct zone *zone; lru_add_drain_cpu(cpu); + mlock_page_drain_remote(cpu); drain_pages(cpu); /* @@ -8474,7 +8770,8 @@ static void __setup_per_zone_wmarks(void) zone->watermark_boost = 0; zone->_watermark[WMARK_LOW] = min_wmark_pages(zone) + tmp; - zone->_watermark[WMARK_HIGH] = min_wmark_pages(zone) + tmp * 2; + zone->_watermark[WMARK_HIGH] = low_wmark_pages(zone) + tmp; + zone->_watermark[WMARK_PROMO] = high_wmark_pages(zone) + tmp; spin_unlock_irqrestore(&zone->lock, flags); } @@ -8765,7 +9062,7 @@ void *__init alloc_large_system_hash(const char *tablename, { unsigned long long max = high_limit; unsigned long log2qty, size; - void *table = NULL; + void *table; gfp_t gfp_flags; bool virt; bool huge; @@ -8777,8 +9074,8 @@ void *__init alloc_large_system_hash(const char *tablename, numentries -= arch_reserved_kernel_pages(); /* It isn't necessary when PAGE_SIZE >= 1MB */ - if (PAGE_SHIFT < 20) - numentries = round_up(numentries, (1<<20)/PAGE_SIZE); + if (PAGE_SIZE < SZ_1M) + numentries = round_up(numentries, SZ_1M / PAGE_SIZE); #if __BITS_PER_LONG > 32 if (!high_limit) { @@ -8834,7 +9131,7 @@ void *__init alloc_large_system_hash(const char *tablename, table = memblock_alloc_raw(size, SMP_CACHE_BYTES); } else if (get_order(size) >= MAX_ORDER || hashdist) { - table = __vmalloc(size, gfp_flags); + table = vmalloc_huge(size, gfp_flags); virt = true; if (table) huge = is_vm_area_hugepages(table); @@ -8864,138 +9161,7 @@ void *__init alloc_large_system_hash(const char *tablename, return table; } -/* - * This function checks whether pageblock includes unmovable pages or not. - * - * PageLRU check without isolation or lru_lock could race so that - * MIGRATE_MOVABLE block might include unmovable pages. And __PageMovable - * check without lock_page also may miss some movable non-lru pages at - * race condition. So you can't expect this function should be exact. - * - * Returns a page without holding a reference. If the caller wants to - * dereference that page (e.g., dumping), it has to make sure that it - * cannot get removed (e.g., via memory unplug) concurrently. - * - */ -struct page *has_unmovable_pages(struct zone *zone, struct page *page, - int migratetype, int flags) -{ - unsigned long iter = 0; - unsigned long pfn = page_to_pfn(page); - unsigned long offset = pfn % pageblock_nr_pages; - - if (is_migrate_cma_page(page)) { - /* - * CMA allocations (alloc_contig_range) really need to mark - * isolate CMA pageblocks even when they are not movable in fact - * so consider them movable here. - */ - if (is_migrate_cma(migratetype)) - return NULL; - - return page; - } - - for (; iter < pageblock_nr_pages - offset; iter++) { - page = pfn_to_page(pfn + iter); - - /* - * Both, bootmem allocations and memory holes are marked - * PG_reserved and are unmovable. We can even have unmovable - * allocations inside ZONE_MOVABLE, for example when - * specifying "movablecore". - */ - if (PageReserved(page)) - return page; - - /* - * If the zone is movable and we have ruled out all reserved - * pages then it should be reasonably safe to assume the rest - * is movable. - */ - if (zone_idx(zone) == ZONE_MOVABLE) - continue; - - /* - * Hugepages are not in LRU lists, but they're movable. - * THPs are on the LRU, but need to be counted as #small pages. - * We need not scan over tail pages because we don't - * handle each tail page individually in migration. - */ - if (PageHuge(page) || PageTransCompound(page)) { - struct page *head = compound_head(page); - unsigned int skip_pages; - - if (PageHuge(page)) { - if (!hugepage_migration_supported(page_hstate(head))) - return page; - } else if (!PageLRU(head) && !__PageMovable(head)) { - return page; - } - - skip_pages = compound_nr(head) - (page - head); - iter += skip_pages - 1; - continue; - } - - /* - * We can't use page_count without pin a page - * because another CPU can free compound page. - * This check already skips compound tails of THP - * because their page->_refcount is zero at all time. - */ - if (!page_ref_count(page)) { - if (PageBuddy(page)) - iter += (1 << buddy_order(page)) - 1; - continue; - } - - /* - * The HWPoisoned page may be not in buddy system, and - * page_count() is not 0. - */ - if ((flags & MEMORY_OFFLINE) && PageHWPoison(page)) - continue; - - /* - * We treat all PageOffline() pages as movable when offlining - * to give drivers a chance to decrement their reference count - * in MEM_GOING_OFFLINE in order to indicate that these pages - * can be offlined as there are no direct references anymore. - * For actually unmovable PageOffline() where the driver does - * not support this, we will fail later when trying to actually - * move these pages that still have a reference count > 0. - * (false negatives in this function only) - */ - if ((flags & MEMORY_OFFLINE) && PageOffline(page)) - continue; - - if (__PageMovable(page) || PageLRU(page)) - continue; - - /* - * If there are RECLAIMABLE pages, we need to check - * it. But now, memory offline itself doesn't call - * shrink_node_slabs() and it still to be fixed. - */ - return page; - } - return NULL; -} - #ifdef CONFIG_CONTIG_ALLOC -static unsigned long pfn_max_align_down(unsigned long pfn) -{ - return pfn & ~(max_t(unsigned long, MAX_ORDER_NR_PAGES, - pageblock_nr_pages) - 1); -} - -static unsigned long pfn_max_align_up(unsigned long pfn) -{ - return ALIGN(pfn, max_t(unsigned long, MAX_ORDER_NR_PAGES, - pageblock_nr_pages)); -} - #if defined(CONFIG_DYNAMIC_DEBUG) || \ (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE)) /* Usage: See admin-guide/dynamic-debug-howto.rst */ @@ -9018,7 +9184,7 @@ static inline void alloc_contig_dump_pages(struct list_head *page_list) #endif /* [start, end) must belong to a single zone. */ -static int __alloc_contig_migrate_range(struct compact_control *cc, +int __alloc_contig_migrate_range(struct compact_control *cc, unsigned long start, unsigned long end) { /* This function is based on compact_zone() from compaction.c. */ @@ -9068,7 +9234,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc, lru_cache_enable(); if (ret < 0) { - if (ret == -EBUSY) + if (!(cc->gfp_mask & __GFP_NOWARN) && ret == -EBUSY) alloc_contig_dump_pages(&cc->migratepages); putback_movable_pages(&cc->migratepages); return ret; @@ -9086,8 +9252,8 @@ static int __alloc_contig_migrate_range(struct compact_control *cc, * be either of the two. * @gfp_mask: GFP mask to use during compaction * - * The PFN range does not have to be pageblock or MAX_ORDER_NR_PAGES - * aligned. The PFN range must belong to a single zone. + * The PFN range does not have to be pageblock aligned. The PFN range must + * belong to a single zone. * * The first thing this routine does is attempt to MIGRATE_ISOLATE all * pageblocks in the range. Once isolated, the pageblocks should not @@ -9101,7 +9267,7 @@ int alloc_contig_range(unsigned long start, unsigned long end, unsigned migratetype, gfp_t gfp_mask) { unsigned long outer_start, outer_end; - unsigned int order; + int order; int ret = 0; struct compact_control cc = { @@ -9120,14 +9286,11 @@ int alloc_contig_range(unsigned long start, unsigned long end, * What we do here is we mark all pageblocks in range as * MIGRATE_ISOLATE. Because pageblock and max order pages may * have different sizes, and due to the way page allocator - * work, we align the range to biggest of the two pages so - * that page allocator won't try to merge buddies from - * different pageblocks and change MIGRATE_ISOLATE to some - * other migration type. + * work, start_isolate_page_range() has special handlings for this. * * Once the pageblocks are marked as MIGRATE_ISOLATE, we * migrate the pages from an unaligned range (ie. pages that - * we are interested in). This will put all the pages in + * we are interested in). This will put all the pages in * range back to page allocator as MIGRATE_ISOLATE. * * When this is done, we take the pages in range from page @@ -9140,10 +9303,9 @@ int alloc_contig_range(unsigned long start, unsigned long end, * put back to page allocator so that buddy can use them. */ - ret = start_isolate_page_range(pfn_max_align_down(start), - pfn_max_align_up(end), migratetype, 0); + ret = start_isolate_page_range(start, end, migratetype, 0, gfp_mask); if (ret) - return ret; + goto done; drain_all_pages(cc.zone); @@ -9163,7 +9325,7 @@ int alloc_contig_range(unsigned long start, unsigned long end, ret = 0; /* - * Pages from [start, end) are within a MAX_ORDER_NR_PAGES + * Pages from [start, end) are within a pageblock_nr_pages * aligned blocks that are marked as MIGRATE_ISOLATE. What's * more, all pages in [start, end) are free in page allocator. * What we are going to do is to allocate all pages from @@ -9222,8 +9384,7 @@ int alloc_contig_range(unsigned long start, unsigned long end, free_contig_range(end, outer_end - end); done: - undo_isolate_page_range(pfn_max_align_down(start), - pfn_max_align_up(end), migratetype); + undo_isolate_page_range(start, end, migratetype); return ret; } EXPORT_SYMBOL(alloc_contig_range); @@ -9339,17 +9500,6 @@ void free_contig_range(unsigned long pfn, unsigned long nr_pages) EXPORT_SYMBOL(free_contig_range); /* - * The zone indicated has a new number of managed_pages; batch sizes and percpu - * page high values need to be recalculated. - */ -void zone_pcp_update(struct zone *zone, int cpu_online) -{ - mutex_lock(&pcp_batch_high_lock); - zone_set_pageset_high_and_batch(zone, cpu_online); - mutex_unlock(&pcp_batch_high_lock); -} - -/* * Effectively disable pcplists for the zone by setting the high limit to 0 * and draining all cpus. A concurrent page freeing on another CPU that's about * to put the page on pcplist will either finish before the drain and the page @@ -9381,9 +9531,11 @@ void zone_pcp_reset(struct zone *zone) drain_zonestat(zone, pzstats); } free_percpu(zone->per_cpu_pageset); - free_percpu(zone->per_cpu_zonestats); zone->per_cpu_pageset = &boot_pageset; - zone->per_cpu_zonestats = &boot_zonestats; + if (zone->per_cpu_zonestats != &boot_zonestats) { + free_percpu(zone->per_cpu_zonestats); + zone->per_cpu_zonestats = &boot_zonestats; + } } } @@ -9452,6 +9604,7 @@ bool is_free_buddy_page(struct page *page) return order < MAX_ORDER; } +EXPORT_SYMBOL(is_free_buddy_page); #ifdef CONFIG_MEMORY_FAILURE /* @@ -9541,7 +9694,6 @@ bool put_page_back_buddy(struct page *page) ClearPageHWPoisonTakenOff(page); __free_one_page(page, pfn, zone, 0, migratetype, FPI_NONE); if (TestClearPageHWPoison(page)) { - num_poisoned_pages_dec(); ret = true; } } |