aboutsummaryrefslogtreecommitdiff
path: root/mm/hugetlb.c
diff options
context:
space:
mode:
Diffstat (limited to 'mm/hugetlb.c')
-rw-r--r--mm/hugetlb.c1249
1 files changed, 1082 insertions, 167 deletions
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 5ba5a0da6d57..a1baa198519a 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -30,6 +30,7 @@
#include <linux/numa.h>
#include <linux/llist.h>
#include <linux/cma.h>
+#include <linux/migrate.h>
#include <asm/page.h>
#include <asm/pgalloc.h>
@@ -41,6 +42,7 @@
#include <linux/node.h>
#include <linux/page_owner.h>
#include "internal.h"
+#include "hugetlb_vmemmap.h"
int hugetlb_max_hstate __read_mostly;
unsigned int default_hstate_idx;
@@ -48,6 +50,17 @@ struct hstate hstates[HUGE_MAX_HSTATE];
#ifdef CONFIG_CMA
static struct cma *hugetlb_cma[MAX_NUMNODES];
+static unsigned long hugetlb_cma_size_in_node[MAX_NUMNODES] __initdata;
+static bool hugetlb_cma_page(struct page *page, unsigned int order)
+{
+ return cma_pages_valid(hugetlb_cma[page_to_nid(page)], page,
+ 1 << order);
+}
+#else
+static bool hugetlb_cma_page(struct page *page, unsigned int order)
+{
+ return false;
+}
#endif
static unsigned long hugetlb_cma_size __initdata;
@@ -64,6 +77,7 @@ static struct hstate * __initdata parsed_hstate;
static unsigned long __initdata default_hstate_max_huge_pages;
static bool __initdata parsed_valid_hugepagesz = true;
static bool __initdata parsed_default_hugepagesz;
+static unsigned int default_hugepages_in_node[MAX_NUMNODES] __initdata;
/*
* Protects updates to hugepage_freelists, hugepage_activelist, nr_huge_pages,
@@ -319,8 +333,7 @@ static bool has_same_uncharge_info(struct file_region *rg,
struct file_region *org)
{
#ifdef CONFIG_CGROUP_HUGETLB
- return rg && org &&
- rg->reservation_counter == org->reservation_counter &&
+ return rg->reservation_counter == org->reservation_counter &&
rg->css == org->css;
#else
@@ -433,7 +446,6 @@ static long add_reservation_in_range(struct resv_map *resv, long f, long t,
add += hugetlb_resv_map_add(resv, rg, last_accounted_offset,
t, h, h_cg, regions_needed);
- VM_BUG_ON(add < 0);
return add;
}
@@ -1002,6 +1014,37 @@ void reset_vma_resv_huge_pages(struct vm_area_struct *vma)
vma->vm_private_data = (void *)0;
}
+/*
+ * Reset and decrement one ref on hugepage private reservation.
+ * Called with mm->mmap_sem writer semaphore held.
+ * This function should be only used by move_vma() and operate on
+ * same sized vma. It should never come here with last ref on the
+ * reservation.
+ */
+void clear_vma_resv_huge_pages(struct vm_area_struct *vma)
+{
+ /*
+ * Clear the old hugetlb private page reservation.
+ * It has already been transferred to new_vma.
+ *
+ * During a mremap() operation of a hugetlb vma we call move_vma()
+ * which copies vma into new_vma and unmaps vma. After the copy
+ * operation both new_vma and vma share a reference to the resv_map
+ * struct, and at that point vma is about to be unmapped. We don't
+ * want to return the reservation to the pool at unmap of vma because
+ * the reservation still lives on in new_vma, so simply decrement the
+ * ref here and remove the resv_map reference from this vma.
+ */
+ struct resv_map *reservations = vma_resv_map(vma);
+
+ if (reservations && is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
+ resv_map_put_hugetlb_cgroup_uncharge_info(reservations);
+ kref_put(&reservations->refs, resv_map_release);
+ }
+
+ reset_vma_resv_huge_pages(vma);
+}
+
/* Returns true if the VMA has associated reserve pages */
static bool vma_has_reserves(struct vm_area_struct *vma, long chg)
{
@@ -1070,6 +1113,8 @@ static void enqueue_huge_page(struct hstate *h, struct page *page)
int nid = page_to_nid(page);
lockdep_assert_held(&hugetlb_lock);
+ VM_BUG_ON_PAGE(page_count(page), page);
+
list_move(&page->lru, &h->hugepage_freelists[nid]);
h->free_huge_pages++;
h->free_huge_pages_node[nid]++;
@@ -1141,7 +1186,7 @@ static struct page *dequeue_huge_page_vma(struct hstate *h,
unsigned long address, int avoid_reserve,
long chg)
{
- struct page *page;
+ struct page *page = NULL;
struct mempolicy *mpol;
gfp_t gfp_mask;
nodemask_t *nodemask;
@@ -1162,7 +1207,17 @@ static struct page *dequeue_huge_page_vma(struct hstate *h,
gfp_mask = htlb_alloc_mask(h);
nid = huge_node(vma, address, gfp_mask, &mpol, &nodemask);
- page = dequeue_huge_page_nodemask(h, gfp_mask, nid, nodemask);
+
+ if (mpol_is_preferred_many(mpol)) {
+ page = dequeue_huge_page_nodemask(h, gfp_mask, nid, nodemask);
+
+ /* Fallback to all nodes if page==NULL */
+ nodemask = NULL;
+ }
+
+ if (!page)
+ page = dequeue_huge_page_nodemask(h, gfp_mask, nid, nodemask);
+
if (page && !avoid_reserve && vma_has_reserves(vma, chg)) {
SetHPageRestoreReserve(page);
h->resv_huge_pages--;
@@ -1246,9 +1301,9 @@ static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed)
((node = hstate_next_node_to_free(hs, mask)) || 1); \
nr_nodes--)
-#ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE
-static void destroy_compound_gigantic_page(struct page *page,
- unsigned int order)
+/* used to demote non-gigantic_huge pages as well */
+static void __destroy_compound_gigantic_page(struct page *page,
+ unsigned int order, bool demote)
{
int i;
int nr_pages = 1 << order;
@@ -1258,8 +1313,10 @@ static void destroy_compound_gigantic_page(struct page *page,
atomic_set(compound_pincount_ptr(page), 0);
for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) {
+ p->mapping = NULL;
clear_compound_head(p);
- set_page_refcounted(p);
+ if (!demote)
+ set_page_refcounted(p);
}
set_compound_order(page, 0);
@@ -1267,6 +1324,19 @@ static void destroy_compound_gigantic_page(struct page *page,
__ClearPageHead(page);
}
+static void destroy_compound_hugetlb_page_for_demote(struct page *page,
+ unsigned int order)
+{
+ __destroy_compound_gigantic_page(page, order, true);
+}
+
+#ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE
+static void destroy_compound_gigantic_page(struct page *page,
+ unsigned int order)
+{
+ __destroy_compound_gigantic_page(page, order, false);
+}
+
static void free_gigantic_page(struct page *page, unsigned int order)
{
/*
@@ -1318,8 +1388,6 @@ static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask,
return alloc_contig_pages(nr_pages, gfp_mask, nid, nodemask);
}
-static void prep_new_huge_page(struct hstate *h, struct page *page, int nid);
-static void prep_compound_gigantic_page(struct page *page, unsigned int order);
#else /* !CONFIG_CONTIG_ALLOC */
static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask,
int nid, nodemask_t *nodemask)
@@ -1341,12 +1409,15 @@ static inline void destroy_compound_gigantic_page(struct page *page,
/*
* Remove hugetlb page from lists, and update dtor so that page appears
- * as just a compound page. A reference is held on the page.
+ * as just a compound page.
+ *
+ * A reference is held on the page, except in the case of demote.
*
* Must be called with hugetlb lock held.
*/
-static void remove_hugetlb_page(struct hstate *h, struct page *page,
- bool adjust_surplus)
+static void __remove_hugetlb_page(struct hstate *h, struct page *page,
+ bool adjust_surplus,
+ bool demote)
{
int nid = page_to_nid(page);
@@ -1368,14 +1439,92 @@ static void remove_hugetlb_page(struct hstate *h, struct page *page,
h->surplus_huge_pages_node[nid]--;
}
- set_page_refcounted(page);
- set_compound_page_dtor(page, NULL_COMPOUND_DTOR);
+ /*
+ * Very subtle
+ *
+ * For non-gigantic pages set the destructor to the normal compound
+ * page dtor. This is needed in case someone takes an additional
+ * temporary ref to the page, and freeing is delayed until they drop
+ * their reference.
+ *
+ * For gigantic pages set the destructor to the null dtor. This
+ * destructor will never be called. Before freeing the gigantic
+ * page destroy_compound_gigantic_page will turn the compound page
+ * into a simple group of pages. After this the destructor does not
+ * apply.
+ *
+ * This handles the case where more than one ref is held when and
+ * after update_and_free_page is called.
+ *
+ * In the case of demote we do not ref count the page as it will soon
+ * be turned into a page of smaller size.
+ */
+ if (!demote)
+ set_page_refcounted(page);
+ if (hstate_is_gigantic(h))
+ set_compound_page_dtor(page, NULL_COMPOUND_DTOR);
+ else
+ set_compound_page_dtor(page, COMPOUND_PAGE_DTOR);
h->nr_huge_pages--;
h->nr_huge_pages_node[nid]--;
}
-static void update_and_free_page(struct hstate *h, struct page *page)
+static void remove_hugetlb_page(struct hstate *h, struct page *page,
+ bool adjust_surplus)
+{
+ __remove_hugetlb_page(h, page, adjust_surplus, false);
+}
+
+static void remove_hugetlb_page_for_demote(struct hstate *h, struct page *page,
+ bool adjust_surplus)
+{
+ __remove_hugetlb_page(h, page, adjust_surplus, true);
+}
+
+static void add_hugetlb_page(struct hstate *h, struct page *page,
+ bool adjust_surplus)
+{
+ int zeroed;
+ int nid = page_to_nid(page);
+
+ VM_BUG_ON_PAGE(!HPageVmemmapOptimized(page), page);
+
+ lockdep_assert_held(&hugetlb_lock);
+
+ INIT_LIST_HEAD(&page->lru);
+ h->nr_huge_pages++;
+ h->nr_huge_pages_node[nid]++;
+
+ if (adjust_surplus) {
+ h->surplus_huge_pages++;
+ h->surplus_huge_pages_node[nid]++;
+ }
+
+ set_compound_page_dtor(page, HUGETLB_PAGE_DTOR);
+ set_page_private(page, 0);
+ SetHPageVmemmapOptimized(page);
+
+ /*
+ * This page is about to be managed by the hugetlb allocator and
+ * should have no users. Drop our reference, and check for others
+ * just in case.
+ */
+ zeroed = put_page_testzero(page);
+ if (!zeroed)
+ /*
+ * It is VERY unlikely soneone else has taken a ref on
+ * the page. In this case, we simply return as the
+ * hugetlb destructor (free_huge_page) will be called
+ * when this other ref is dropped.
+ */
+ return;
+
+ arch_clear_hugepage_flags(page);
+ enqueue_huge_page(h, page);
+}
+
+static void __update_and_free_page(struct hstate *h, struct page *page)
{
int i;
struct page *subpage = page;
@@ -1383,6 +1532,18 @@ static void update_and_free_page(struct hstate *h, struct page *page)
if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
return;
+ if (alloc_huge_page_vmemmap(h, page)) {
+ spin_lock_irq(&hugetlb_lock);
+ /*
+ * If we cannot allocate vmemmap pages, just refuse to free the
+ * page and put the page back on the hugetlb free list and treat
+ * as a surplus page.
+ */
+ add_hugetlb_page(h, page, true);
+ spin_unlock_irq(&hugetlb_lock);
+ return;
+ }
+
for (i = 0; i < pages_per_huge_page(h);
i++, subpage = mem_map_next(subpage, page, i)) {
subpage->flags &= ~(1 << PG_locked | 1 << PG_error |
@@ -1390,7 +1551,13 @@ static void update_and_free_page(struct hstate *h, struct page *page)
1 << PG_active | 1 << PG_private |
1 << PG_writeback);
}
- if (hstate_is_gigantic(h)) {
+
+ /*
+ * Non-gigantic pages demoted from CMA allocated gigantic pages
+ * need to be given back to CMA in free_gigantic_page.
+ */
+ if (hstate_is_gigantic(h) ||
+ hugetlb_cma_page(page, huge_page_order(h))) {
destroy_compound_gigantic_page(page, huge_page_order(h));
free_gigantic_page(page, huge_page_order(h));
} else {
@@ -1398,12 +1565,79 @@ static void update_and_free_page(struct hstate *h, struct page *page)
}
}
+/*
+ * As update_and_free_page() can be called under any context, so we cannot
+ * use GFP_KERNEL to allocate vmemmap pages. However, we can defer the
+ * actual freeing in a workqueue to prevent from using GFP_ATOMIC to allocate
+ * the vmemmap pages.
+ *
+ * free_hpage_workfn() locklessly retrieves the linked list of pages to be
+ * freed and frees them one-by-one. As the page->mapping pointer is going
+ * to be cleared in free_hpage_workfn() anyway, it is reused as the llist_node
+ * structure of a lockless linked list of huge pages to be freed.
+ */
+static LLIST_HEAD(hpage_freelist);
+
+static void free_hpage_workfn(struct work_struct *work)
+{
+ struct llist_node *node;
+
+ node = llist_del_all(&hpage_freelist);
+
+ while (node) {
+ struct page *page;
+ struct hstate *h;
+
+ page = container_of((struct address_space **)node,
+ struct page, mapping);
+ node = node->next;
+ page->mapping = NULL;
+ /*
+ * The VM_BUG_ON_PAGE(!PageHuge(page), page) in page_hstate()
+ * is going to trigger because a previous call to
+ * remove_hugetlb_page() will set_compound_page_dtor(page,
+ * NULL_COMPOUND_DTOR), so do not use page_hstate() directly.
+ */
+ h = size_to_hstate(page_size(page));
+
+ __update_and_free_page(h, page);
+
+ cond_resched();
+ }
+}
+static DECLARE_WORK(free_hpage_work, free_hpage_workfn);
+
+static inline void flush_free_hpage_work(struct hstate *h)
+{
+ if (free_vmemmap_pages_per_hpage(h))
+ flush_work(&free_hpage_work);
+}
+
+static void update_and_free_page(struct hstate *h, struct page *page,
+ bool atomic)
+{
+ if (!HPageVmemmapOptimized(page) || !atomic) {
+ __update_and_free_page(h, page);
+ return;
+ }
+
+ /*
+ * Defer freeing to avoid using GFP_ATOMIC to allocate vmemmap pages.
+ *
+ * Only call schedule_work() if hpage_freelist is previously
+ * empty. Otherwise, schedule_work() had been called but the workfn
+ * hasn't retrieved the list yet.
+ */
+ if (llist_add((struct llist_node *)&page->mapping, &hpage_freelist))
+ schedule_work(&free_hpage_work);
+}
+
static void update_and_free_pages_bulk(struct hstate *h, struct list_head *list)
{
struct page *page, *t_page;
list_for_each_entry_safe(page, t_page, list, lru) {
- update_and_free_page(h, page);
+ update_and_free_page(h, page, false);
cond_resched();
}
}
@@ -1470,12 +1704,12 @@ void free_huge_page(struct page *page)
if (HPageTemporary(page)) {
remove_hugetlb_page(h, page, false);
spin_unlock_irqrestore(&hugetlb_lock, flags);
- update_and_free_page(h, page);
+ update_and_free_page(h, page, true);
} else if (h->surplus_huge_pages_node[nid]) {
/* remove the page from active list */
remove_hugetlb_page(h, page, true);
spin_unlock_irqrestore(&hugetlb_lock, flags);
- update_and_free_page(h, page);
+ update_and_free_page(h, page, true);
} else {
arch_clear_hugepage_flags(page);
enqueue_huge_page(h, page);
@@ -1493,8 +1727,9 @@ static void __prep_account_new_huge_page(struct hstate *h, int nid)
h->nr_huge_pages_node[nid]++;
}
-static void __prep_new_huge_page(struct page *page)
+static void __prep_new_huge_page(struct hstate *h, struct page *page)
{
+ free_huge_page_vmemmap(h, page);
INIT_LIST_HEAD(&page->lru);
set_compound_page_dtor(page, HUGETLB_PAGE_DTOR);
hugetlb_set_page_subpool(page, NULL);
@@ -1504,15 +1739,16 @@ static void __prep_new_huge_page(struct page *page)
static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
{
- __prep_new_huge_page(page);
+ __prep_new_huge_page(h, page);
spin_lock_irq(&hugetlb_lock);
__prep_account_new_huge_page(h, nid);
spin_unlock_irq(&hugetlb_lock);
}
-static void prep_compound_gigantic_page(struct page *page, unsigned int order)
+static bool __prep_compound_gigantic_page(struct page *page, unsigned int order,
+ bool demote)
{
- int i;
+ int i, j;
int nr_pages = 1 << order;
struct page *p = page + 1;
@@ -1534,11 +1770,62 @@ static void prep_compound_gigantic_page(struct page *page, unsigned int order)
* after get_user_pages().
*/
__ClearPageReserved(p);
- set_page_count(p, 0);
+ /*
+ * Subtle and very unlikely
+ *
+ * Gigantic 'page allocators' such as memblock or cma will
+ * return a set of pages with each page ref counted. We need
+ * to turn this set of pages into a compound page with tail
+ * page ref counts set to zero. Code such as speculative page
+ * cache adding could take a ref on a 'to be' tail page.
+ * We need to respect any increased ref count, and only set
+ * the ref count to zero if count is currently 1. If count
+ * is not 1, we return an error. An error return indicates
+ * the set of pages can not be converted to a gigantic page.
+ * The caller who allocated the pages should then discard the
+ * pages using the appropriate free interface.
+ *
+ * In the case of demote, the ref count will be zero.
+ */
+ if (!demote) {
+ if (!page_ref_freeze(p, 1)) {
+ pr_warn("HugeTLB page can not be used due to unexpected inflated ref count\n");
+ goto out_error;
+ }
+ } else {
+ VM_BUG_ON_PAGE(page_count(p), p);
+ }
set_compound_head(p, page);
}
atomic_set(compound_mapcount_ptr(page), -1);
atomic_set(compound_pincount_ptr(page), 0);
+ return true;
+
+out_error:
+ /* undo tail page modifications made above */
+ p = page + 1;
+ for (j = 1; j < i; j++, p = mem_map_next(p, page, j)) {
+ clear_compound_head(p);
+ set_page_refcounted(p);
+ }
+ /* need to clear PG_reserved on remaining tail pages */
+ for (; j < nr_pages; j++, p = mem_map_next(p, page, j))
+ __ClearPageReserved(p);
+ set_compound_order(page, 0);
+ page[1].compound_nr = 0;
+ __ClearPageHead(page);
+ return false;
+}
+
+static bool prep_compound_gigantic_page(struct page *page, unsigned int order)
+{
+ return __prep_compound_gigantic_page(page, order, false);
+}
+
+static bool prep_compound_gigantic_page_for_demote(struct page *page,
+ unsigned int order)
+{
+ return __prep_compound_gigantic_page(page, order, true);
}
/*
@@ -1658,7 +1945,9 @@ static struct page *alloc_fresh_huge_page(struct hstate *h,
nodemask_t *node_alloc_noretry)
{
struct page *page;
+ bool retry = false;
+retry:
if (hstate_is_gigantic(h))
page = alloc_gigantic_page(h, gfp_mask, nid, nmask);
else
@@ -1667,8 +1956,20 @@ static struct page *alloc_fresh_huge_page(struct hstate *h,
if (!page)
return NULL;
- if (hstate_is_gigantic(h))
- prep_compound_gigantic_page(page, huge_page_order(h));
+ if (hstate_is_gigantic(h)) {
+ if (!prep_compound_gigantic_page(page, huge_page_order(h))) {
+ /*
+ * Rare failure to convert pages to compound page.
+ * Free pages and try again - ONCE!
+ */
+ free_gigantic_page(page, huge_page_order(h));
+ if (!retry) {
+ retry = true;
+ goto retry;
+ }
+ return NULL;
+ }
+ }
prep_new_huge_page(h, page, page_to_nid(page));
return page;
@@ -1737,10 +2038,14 @@ static struct page *remove_pool_huge_page(struct hstate *h,
* nothing for in-use hugepages and non-hugepages.
* This function returns values like below:
*
- * -EBUSY: failed to dissolved free hugepages or the hugepage is in-use
- * (allocated or reserved.)
- * 0: successfully dissolved free hugepages or the page is not a
- * hugepage (considered as already dissolved)
+ * -ENOMEM: failed to allocate vmemmap pages to free the freed hugepages
+ * when the system is under memory pressure and the feature of
+ * freeing unused vmemmap pages associated with each hugetlb page
+ * is enabled.
+ * -EBUSY: failed to dissolved free hugepages or the hugepage is in-use
+ * (allocated or reserved.)
+ * 0: successfully dissolved free hugepages or the page is not a
+ * hugepage (considered as already dissolved)
*/
int dissolve_free_huge_page(struct page *page)
{
@@ -1782,19 +2087,38 @@ retry:
goto retry;
}
- /*
- * Move PageHWPoison flag from head page to the raw error page,
- * which makes any subpages rather than the error page reusable.
- */
- if (PageHWPoison(head) && page != head) {
- SetPageHWPoison(page);
- ClearPageHWPoison(head);
- }
remove_hugetlb_page(h, head, false);
h->max_huge_pages--;
spin_unlock_irq(&hugetlb_lock);
- update_and_free_page(h, head);
- return 0;
+
+ /*
+ * Normally update_and_free_page will allocate required vmemmmap
+ * before freeing the page. update_and_free_page will fail to
+ * free the page if it can not allocate required vmemmap. We
+ * need to adjust max_huge_pages if the page is not freed.
+ * Attempt to allocate vmemmmap here so that we can take
+ * appropriate action on failure.
+ */
+ rc = alloc_huge_page_vmemmap(h, head);
+ if (!rc) {
+ /*
+ * Move PageHWPoison flag from head page to the raw
+ * error page, which makes any subpages rather than
+ * the error page reusable.
+ */
+ if (PageHWPoison(head) && page != head) {
+ SetPageHWPoison(page);
+ ClearPageHWPoison(head);
+ }
+ update_and_free_page(h, head, false);
+ } else {
+ spin_lock_irq(&hugetlb_lock);
+ add_hugetlb_page(h, head, false);
+ h->max_huge_pages++;
+ spin_unlock_irq(&hugetlb_lock);
+ }
+
+ return rc;
}
out:
spin_unlock_irq(&hugetlb_lock);
@@ -1832,9 +2156,10 @@ int dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn)
* Allocates a fresh surplus page from the page allocator.
*/
static struct page *alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask,
- int nid, nodemask_t *nmask)
+ int nid, nodemask_t *nmask, bool zero_ref)
{
struct page *page = NULL;
+ bool retry = false;
if (hstate_is_gigantic(h))
return NULL;
@@ -1844,6 +2169,7 @@ static struct page *alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask,
goto out_unlock;
spin_unlock_irq(&hugetlb_lock);
+retry:
page = alloc_fresh_huge_page(h, gfp_mask, nid, nmask, NULL);
if (!page)
return NULL;
@@ -1861,11 +2187,35 @@ static struct page *alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask,
spin_unlock_irq(&hugetlb_lock);
put_page(page);
return NULL;
- } else {
- h->surplus_huge_pages++;
- h->surplus_huge_pages_node[page_to_nid(page)]++;
}
+ if (zero_ref) {
+ /*
+ * Caller requires a page with zero ref count.
+ * We will drop ref count here. If someone else is holding
+ * a ref, the page will be freed when they drop it. Abuse
+ * temporary page flag to accomplish this.
+ */
+ SetHPageTemporary(page);
+ if (!put_page_testzero(page)) {
+ /*
+ * Unexpected inflated ref count on freshly allocated
+ * huge. Retry once.
+ */
+ pr_info("HugeTLB unexpected inflated ref count on freshly allocated page\n");
+ spin_unlock_irq(&hugetlb_lock);
+ if (retry)
+ return NULL;
+
+ retry = true;
+ goto retry;
+ }
+ ClearHPageTemporary(page);
+ }
+
+ h->surplus_huge_pages++;
+ h->surplus_huge_pages_node[page_to_nid(page)]++;
+
out_unlock:
spin_unlock_irq(&hugetlb_lock);
@@ -1900,16 +2250,26 @@ static
struct page *alloc_buddy_huge_page_with_mpol(struct hstate *h,
struct vm_area_struct *vma, unsigned long addr)
{
- struct page *page;
+ struct page *page = NULL;
struct mempolicy *mpol;
gfp_t gfp_mask = htlb_alloc_mask(h);
int nid;
nodemask_t *nodemask;
nid = huge_node(vma, addr, gfp_mask, &mpol, &nodemask);
- page = alloc_surplus_huge_page(h, gfp_mask, nid, nodemask);
- mpol_cond_put(mpol);
+ if (mpol_is_preferred_many(mpol)) {
+ gfp_t gfp = gfp_mask | __GFP_NOWARN;
+
+ gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL);
+ page = alloc_surplus_huge_page(h, gfp, nid, nodemask, false);
+
+ /* Fallback to all nodes if page==NULL */
+ nodemask = NULL;
+ }
+ if (!page)
+ page = alloc_surplus_huge_page(h, gfp_mask, nid, nodemask, false);
+ mpol_cond_put(mpol);
return page;
}
@@ -1979,7 +2339,7 @@ retry:
spin_unlock_irq(&hugetlb_lock);
for (i = 0; i < needed; i++) {
page = alloc_surplus_huge_page(h, htlb_alloc_mask(h),
- NUMA_NO_NODE, NULL);
+ NUMA_NO_NODE, NULL, true);
if (!page) {
alloc_ok = false;
break;
@@ -2020,24 +2380,20 @@ retry:
/* Free the needed pages to the hugetlb pool */
list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
- int zeroed;
-
if ((--needed) < 0)
break;
- /*
- * This page is now managed by the hugetlb allocator and has
- * no users -- drop the buddy allocator's reference.
- */
- zeroed = put_page_testzero(page);
- VM_BUG_ON_PAGE(!zeroed, page);
+ /* Add the page to the hugetlb allocator */
enqueue_huge_page(h, page);
}
free:
spin_unlock_irq(&hugetlb_lock);
- /* Free unnecessary surplus pages to the buddy allocator */
+ /*
+ * Free unnecessary surplus pages to the buddy allocator.
+ * Pages have no ref count, call free_huge_page directly.
+ */
list_for_each_entry_safe(page, tmp, &surplus_list, lru)
- put_page(page);
+ free_huge_page(page);
spin_lock_irq(&hugetlb_lock);
return ret;
@@ -2288,7 +2644,7 @@ void restore_reserve_on_error(struct hstate *h, struct vm_area_struct *vma,
if (!rc) {
/*
* This indicates there is an entry in the reserve map
- * added by alloc_huge_page. We know it was added
+ * not added by alloc_huge_page. We know it was added
* before the alloc_huge_page call, otherwise
* HPageRestoreReserve would be set on the page.
* Remove the entry so that a subsequent allocation
@@ -2346,19 +2702,42 @@ static int alloc_and_dissolve_huge_page(struct hstate *h, struct page *old_page,
{
gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE;
int nid = page_to_nid(old_page);
+ bool alloc_retry = false;
struct page *new_page;
int ret = 0;
/*
* Before dissolving the page, we need to allocate a new one for the
- * pool to remain stable. Using alloc_buddy_huge_page() allows us to
- * not having to deal with prep_new_huge_page() and avoids dealing of any
- * counters. This simplifies and let us do the whole thing under the
- * lock.
+ * pool to remain stable. Here, we allocate the page and 'prep' it
+ * by doing everything but actually updating counters and adding to
+ * the pool. This simplifies and let us do most of the processing
+ * under the lock.
*/
+alloc_retry:
new_page = alloc_buddy_huge_page(h, gfp_mask, nid, NULL, NULL);
if (!new_page)
return -ENOMEM;
+ /*
+ * If all goes well, this page will be directly added to the free
+ * list in the pool. For this the ref count needs to be zero.
+ * Attempt to drop now, and retry once if needed. It is VERY
+ * unlikely there is another ref on the page.
+ *
+ * If someone else has a reference to the page, it will be freed
+ * when they drop their ref. Abuse temporary page flag to accomplish
+ * this. Retry once if there is an inflated ref count.
+ */
+ SetHPageTemporary(new_page);
+ if (!put_page_testzero(new_page)) {
+ if (alloc_retry)
+ return -EBUSY;
+
+ alloc_retry = true;
+ goto alloc_retry;
+ }
+ ClearHPageTemporary(new_page);
+
+ __prep_new_huge_page(h, new_page);
retry:
spin_lock_irq(&hugetlb_lock);
@@ -2397,30 +2776,26 @@ retry:
remove_hugetlb_page(h, old_page, false);
/*
- * new_page needs to be initialized with the standard hugetlb
- * state. This is normally done by prep_new_huge_page() but
- * that takes hugetlb_lock which is already held so we need to
- * open code it here.
- * Reference count trick is needed because allocator gives us
- * referenced page but the pool requires pages with 0 refcount.
+ * Ref count on new page is already zero as it was dropped
+ * earlier. It can be directly added to the pool free list.
*/
- __prep_new_huge_page(new_page);
__prep_account_new_huge_page(h, nid);
- page_ref_dec(new_page);
enqueue_huge_page(h, new_page);
/*
* Pages have been replaced, we can safely free the old one.
*/
spin_unlock_irq(&hugetlb_lock);
- update_and_free_page(h, old_page);
+ update_and_free_page(h, old_page, false);
}
return ret;
free_new:
spin_unlock_irq(&hugetlb_lock);
- __free_pages(new_page, huge_page_order(h));
+ /* Page has a zero ref count, but needs a ref to be freed */
+ set_page_refcounted(new_page);
+ update_and_free_page(h, new_page, false);
return ret;
}
@@ -2591,33 +2966,39 @@ out_subpool_put:
return ERR_PTR(-ENOSPC);
}
-int alloc_bootmem_huge_page(struct hstate *h)
+int alloc_bootmem_huge_page(struct hstate *h, int nid)
__attribute__ ((weak, alias("__alloc_bootmem_huge_page")));
-int __alloc_bootmem_huge_page(struct hstate *h)
+int __alloc_bootmem_huge_page(struct hstate *h, int nid)
{
- struct huge_bootmem_page *m;
+ struct huge_bootmem_page *m = NULL; /* initialize for clang */
int nr_nodes, node;
+ if (nid != NUMA_NO_NODE && nid >= nr_online_nodes)
+ return 0;
+ /* do node specific alloc */
+ if (nid != NUMA_NO_NODE) {
+ m = memblock_alloc_try_nid_raw(huge_page_size(h), huge_page_size(h),
+ 0, MEMBLOCK_ALLOC_ACCESSIBLE, nid);
+ if (!m)
+ return 0;
+ goto found;
+ }
+ /* allocate from next node when distributing huge pages */
for_each_node_mask_to_alloc(h, nr_nodes, node, &node_states[N_MEMORY]) {
- void *addr;
-
- addr = memblock_alloc_try_nid_raw(
+ m = memblock_alloc_try_nid_raw(
huge_page_size(h), huge_page_size(h),
0, MEMBLOCK_ALLOC_ACCESSIBLE, node);
- if (addr) {
- /*
- * Use the beginning of the huge page to store the
- * huge_bootmem_page struct (until gather_bootmem
- * puts them into the mem_map).
- */
- m = addr;
- goto found;
- }
+ /*
+ * Use the beginning of the huge page to store the
+ * huge_bootmem_page struct (until gather_bootmem
+ * puts them into the mem_map).
+ */
+ if (!m)
+ return 0;
+ goto found;
}
- return 0;
found:
- BUG_ON(!IS_ALIGNED(virt_to_phys(m), huge_page_size(h)));
/* Put them into a private list first because mem_map is not up yet */
INIT_LIST_HEAD(&m->list);
list_add(&m->list, &huge_boot_pages);
@@ -2625,16 +3006,10 @@ found:
return 1;
}
-static void __init prep_compound_huge_page(struct page *page,
- unsigned int order)
-{
- if (unlikely(order > (MAX_ORDER - 1)))
- prep_compound_gigantic_page(page, order);
- else
- prep_compound_page(page, order);
-}
-
-/* Put bootmem huge pages into the standard lists after mem_map is up */
+/*
+ * Put bootmem huge pages into the standard lists after mem_map is up.
+ * Note: This only applies to gigantic (order > MAX_ORDER) pages.
+ */
static void __init gather_bootmem_prealloc(void)
{
struct huge_bootmem_page *m;
@@ -2643,29 +3018,81 @@ static void __init gather_bootmem_prealloc(void)
struct page *page = virt_to_page(m);
struct hstate *h = m->hstate;
+ VM_BUG_ON(!hstate_is_gigantic(h));
WARN_ON(page_count(page) != 1);
- prep_compound_huge_page(page, huge_page_order(h));
- WARN_ON(PageReserved(page));
- prep_new_huge_page(h, page, page_to_nid(page));
- put_page(page); /* free it into the hugepage allocator */
+ if (prep_compound_gigantic_page(page, huge_page_order(h))) {
+ WARN_ON(PageReserved(page));
+ prep_new_huge_page(h, page, page_to_nid(page));
+ put_page(page); /* add to the hugepage allocator */
+ } else {
+ /* VERY unlikely inflated ref count on a tail page */
+ free_gigantic_page(page, huge_page_order(h));
+ }
/*
- * If we had gigantic hugepages allocated at boot time, we need
- * to restore the 'stolen' pages to totalram_pages in order to
- * fix confusing memory reports from free(1) and another
- * side-effects, like CommitLimit going negative.
+ * We need to restore the 'stolen' pages to totalram_pages
+ * in order to fix confusing memory reports from free(1) and
+ * other side-effects, like CommitLimit going negative.
*/
- if (hstate_is_gigantic(h))
- adjust_managed_page_count(page, pages_per_huge_page(h));
+ adjust_managed_page_count(page, pages_per_huge_page(h));
+ cond_resched();
+ }
+}
+static void __init hugetlb_hstate_alloc_pages_onenode(struct hstate *h, int nid)
+{
+ unsigned long i;
+ char buf[32];
+
+ for (i = 0; i < h->max_huge_pages_node[nid]; ++i) {
+ if (hstate_is_gigantic(h)) {
+ if (!alloc_bootmem_huge_page(h, nid))
+ break;
+ } else {
+ struct page *page;
+ gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE;
+
+ page = alloc_fresh_huge_page(h, gfp_mask, nid,
+ &node_states[N_MEMORY], NULL);
+ if (!page)
+ break;
+ put_page(page); /* free it into the hugepage allocator */
+ }
cond_resched();
}
+ if (i == h->max_huge_pages_node[nid])
+ return;
+
+ string_get_size(huge_page_size(h), 1, STRING_UNITS_2, buf, 32);
+ pr_warn("HugeTLB: allocating %u of page size %s failed node%d. Only allocated %lu hugepages.\n",
+ h->max_huge_pages_node[nid], buf, nid, i);
+ h->max_huge_pages -= (h->max_huge_pages_node[nid] - i);
+ h->max_huge_pages_node[nid] = i;
}
static void __init hugetlb_hstate_alloc_pages(struct hstate *h)
{
unsigned long i;
nodemask_t *node_alloc_noretry;
+ bool node_specific_alloc = false;
+
+ /* skip gigantic hugepages allocation if hugetlb_cma enabled */
+ if (hstate_is_gigantic(h) && hugetlb_cma_size) {
+ pr_warn_once("HugeTLB: hugetlb_cma is enabled, skip boot time allocation\n");
+ return;
+ }
+ /* do node specific alloc */
+ for (i = 0; i < nr_online_nodes; i++) {
+ if (h->max_huge_pages_node[i] > 0) {
+ hugetlb_hstate_alloc_pages_onenode(h, i);
+ node_specific_alloc = true;
+ }
+ }
+
+ if (node_specific_alloc)
+ return;
+
+ /* below will do all node balanced alloc */
if (!hstate_is_gigantic(h)) {
/*
* Bit mask controlling how hard we retry per-node allocations.
@@ -2686,11 +3113,7 @@ static void __init hugetlb_hstate_alloc_pages(struct hstate *h)
for (i = 0; i < h->max_huge_pages; ++i) {
if (hstate_is_gigantic(h)) {
- if (hugetlb_cma_size) {
- pr_warn_once("HugeTLB: hugetlb_cma is enabled, skip boot time allocation\n");
- goto free;
- }
- if (!alloc_bootmem_huge_page(h))
+ if (!alloc_bootmem_huge_page(h, NUMA_NO_NODE))
break;
} else if (!alloc_pool_huge_page(h,
&node_states[N_MEMORY],
@@ -2706,13 +3129,12 @@ static void __init hugetlb_hstate_alloc_pages(struct hstate *h)
h->max_huge_pages, buf, i);
h->max_huge_pages = i;
}
-free:
kfree(node_alloc_noretry);
}
static void __init hugetlb_init_hstates(void)
{
- struct hstate *h;
+ struct hstate *h, *h2;
for_each_hstate(h) {
if (minimum_order > huge_page_order(h))
@@ -2721,6 +3143,26 @@ static void __init hugetlb_init_hstates(void)
/* oversize hugepages were init'ed in early boot */
if (!hstate_is_gigantic(h))
hugetlb_hstate_alloc_pages(h);
+
+ /*
+ * Set demote order for each hstate. Note that
+ * h->demote_order is initially 0.
+ * - We can not demote gigantic pages if runtime freeing
+ * is not supported, so skip this.
+ * - If CMA allocation is possible, we can not demote
+ * HUGETLB_PAGE_ORDER or smaller size pages.
+ */
+ if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
+ continue;
+ if (hugetlb_cma_size && h->order <= HUGETLB_PAGE_ORDER)
+ continue;
+ for_each_hstate(h2) {
+ if (h2 == h)
+ continue;
+ if (h2->order < h->order &&
+ h2->order > h->demote_order)
+ h->demote_order = h2->order;
+ }
}
VM_BUG_ON(minimum_order == UINT_MAX);
}
@@ -2834,6 +3276,7 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid,
* pages in hstate via the proc/sysfs interfaces.
*/
mutex_lock(&h->resize_lock);
+ flush_free_hpage_work(h);
spin_lock_irq(&hugetlb_lock);
/*
@@ -2943,6 +3386,7 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid,
/* free the pages after dropping lock */
spin_unlock_irq(&hugetlb_lock);
update_and_free_pages_bulk(h, &page_list);
+ flush_free_hpage_work(h);
spin_lock_irq(&hugetlb_lock);
while (count < persistent_huge_pages(h)) {
@@ -2959,9 +3403,100 @@ out:
return 0;
}
+static int demote_free_huge_page(struct hstate *h, struct page *page)
+{
+ int i, nid = page_to_nid(page);
+ struct hstate *target_hstate;
+ int rc = 0;
+
+ target_hstate = size_to_hstate(PAGE_SIZE << h->demote_order);
+
+ remove_hugetlb_page_for_demote(h, page, false);
+ spin_unlock_irq(&hugetlb_lock);
+
+ rc = alloc_huge_page_vmemmap(h, page);
+ if (rc) {
+ /* Allocation of vmemmmap failed, we can not demote page */
+ spin_lock_irq(&hugetlb_lock);
+ set_page_refcounted(page);
+ add_hugetlb_page(h, page, false);
+ return rc;
+ }
+
+ /*
+ * Use destroy_compound_hugetlb_page_for_demote for all huge page
+ * sizes as it will not ref count pages.
+ */
+ destroy_compound_hugetlb_page_for_demote(page, huge_page_order(h));
+
+ /*
+ * Taking target hstate mutex synchronizes with set_max_huge_pages.
+ * Without the mutex, pages added to target hstate could be marked
+ * as surplus.
+ *
+ * Note that we already hold h->resize_lock. To prevent deadlock,
+ * use the convention of always taking larger size hstate mutex first.
+ */
+ mutex_lock(&target_hstate->resize_lock);
+ for (i = 0; i < pages_per_huge_page(h);
+ i += pages_per_huge_page(target_hstate)) {
+ if (hstate_is_gigantic(target_hstate))
+ prep_compound_gigantic_page_for_demote(page + i,
+ target_hstate->order);
+ else
+ prep_compound_page(page + i, target_hstate->order);
+ set_page_private(page + i, 0);
+ set_page_refcounted(page + i);
+ prep_new_huge_page(target_hstate, page + i, nid);
+ put_page(page + i);
+ }
+ mutex_unlock(&target_hstate->resize_lock);
+
+ spin_lock_irq(&hugetlb_lock);
+
+ /*
+ * Not absolutely necessary, but for consistency update max_huge_pages
+ * based on pool changes for the demoted page.
+ */
+ h->max_huge_pages--;
+ target_hstate->max_huge_pages += pages_per_huge_page(h);
+
+ return rc;
+}
+
+static int demote_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed)
+ __must_hold(&hugetlb_lock)
+{
+ int nr_nodes, node;
+ struct page *page;
+ int rc = 0;
+
+ lockdep_assert_held(&hugetlb_lock);
+
+ /* We should never get here if no demote order */
+ if (!h->demote_order) {
+ pr_warn("HugeTLB: NULL demote order passed to demote_pool_huge_page.\n");
+ return -EINVAL; /* internal error */
+ }
+
+ for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) {
+ if (!list_empty(&h->hugepage_freelists[node])) {
+ page = list_entry(h->hugepage_freelists[node].next,
+ struct page, lru);
+ rc = demote_free_huge_page(h, page);
+ break;
+ }
+ }
+
+ return rc;
+}
+
#define HSTATE_ATTR_RO(_name) \
static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
+#define HSTATE_ATTR_WO(_name) \
+ static struct kobj_attribute _name##_attr = __ATTR_WO(_name)
+
#define HSTATE_ATTR(_name) \
static struct kobj_attribute _name##_attr = \
__ATTR(_name, 0644, _name##_show, _name##_store)
@@ -3157,6 +3692,103 @@ static ssize_t surplus_hugepages_show(struct kobject *kobj,
}
HSTATE_ATTR_RO(surplus_hugepages);
+static ssize_t demote_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t len)
+{
+ unsigned long nr_demote;
+ unsigned long nr_available;
+ nodemask_t nodes_allowed, *n_mask;
+ struct hstate *h;
+ int err = 0;
+ int nid;
+
+ err = kstrtoul(buf, 10, &nr_demote);
+ if (err)
+ return err;
+ h = kobj_to_hstate(kobj, &nid);
+
+ if (nid != NUMA_NO_NODE) {
+ init_nodemask_of_node(&nodes_allowed, nid);
+ n_mask = &nodes_allowed;
+ } else {
+ n_mask = &node_states[N_MEMORY];
+ }
+
+ /* Synchronize with other sysfs operations modifying huge pages */
+ mutex_lock(&h->resize_lock);
+ spin_lock_irq(&hugetlb_lock);
+
+ while (nr_demote) {
+ /*
+ * Check for available pages to demote each time thorough the
+ * loop as demote_pool_huge_page will drop hugetlb_lock.
+ */
+ if (nid != NUMA_NO_NODE)
+ nr_available = h->free_huge_pages_node[nid];
+ else
+ nr_available = h->free_huge_pages;
+ nr_available -= h->resv_huge_pages;
+ if (!nr_available)
+ break;
+
+ err = demote_pool_huge_page(h, n_mask);
+ if (err)
+ break;
+
+ nr_demote--;
+ }
+
+ spin_unlock_irq(&hugetlb_lock);
+ mutex_unlock(&h->resize_lock);
+
+ if (err)
+ return err;
+ return len;
+}
+HSTATE_ATTR_WO(demote);
+
+static ssize_t demote_size_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ int nid;
+ struct hstate *h = kobj_to_hstate(kobj, &nid);
+ unsigned long demote_size = (PAGE_SIZE << h->demote_order) / SZ_1K;
+
+ return sysfs_emit(buf, "%lukB\n", demote_size);
+}
+
+static ssize_t demote_size_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct hstate *h, *demote_hstate;
+ unsigned long demote_size;
+ unsigned int demote_order;
+ int nid;
+
+ demote_size = (unsigned long)memparse(buf, NULL);
+
+ demote_hstate = size_to_hstate(demote_size);
+ if (!demote_hstate)
+ return -EINVAL;
+ demote_order = demote_hstate->order;
+ if (demote_order < HUGETLB_PAGE_ORDER)
+ return -EINVAL;
+
+ /* demote order must be smaller than hstate order */
+ h = kobj_to_hstate(kobj, &nid);
+ if (demote_order >= h->order)
+ return -EINVAL;
+
+ /* resize_lock synchronizes access to demote size and writes */
+ mutex_lock(&h->resize_lock);
+ h->demote_order = demote_order;
+ mutex_unlock(&h->resize_lock);
+
+ return count;
+}
+HSTATE_ATTR(demote_size);
+
static struct attribute *hstate_attrs[] = {
&nr_hugepages_attr.attr,
&nr_overcommit_hugepages_attr.attr,
@@ -3173,6 +3805,16 @@ static const struct attribute_group hstate_attr_group = {
.attrs = hstate_attrs,
};
+static struct attribute *hstate_demote_attrs[] = {
+ &demote_size_attr.attr,
+ &demote_attr.attr,
+ NULL,
+};
+
+static const struct attribute_group hstate_demote_attr_group = {
+ .attrs = hstate_demote_attrs,
+};
+
static int hugetlb_sysfs_add_hstate(struct hstate *h, struct kobject *parent,
struct kobject **hstate_kobjs,
const struct attribute_group *hstate_attr_group)
@@ -3190,6 +3832,12 @@ static int hugetlb_sysfs_add_hstate(struct hstate *h, struct kobject *parent,
hstate_kobjs[hi] = NULL;
}
+ if (h->demote_order) {
+ if (sysfs_create_group(hstate_kobjs[hi],
+ &hstate_demote_attr_group))
+ pr_warn("HugeTLB unable to create demote interfaces for %s\n", h->name);
+ }
+
return retval;
}
@@ -3395,6 +4043,10 @@ static int __init hugetlb_init(void)
}
default_hstate.max_huge_pages =
default_hstate_max_huge_pages;
+
+ for (i = 0; i < nr_online_nodes; i++)
+ default_hstate.max_huge_pages_node[i] =
+ default_hugepages_in_node[i];
}
}
@@ -3450,10 +4102,15 @@ void __init hugetlb_add_hstate(unsigned int order)
h->next_nid_to_free = first_memory_node;
snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB",
huge_page_size(h)/1024);
+ hugetlb_vmemmap_init(h);
parsed_hstate = h;
}
+bool __init __weak hugetlb_node_alloc_supported(void)
+{
+ return true;
+}
/*
* hugepages command line processing
* hugepages normally follows a valid hugepagsz or default_hugepagsz
@@ -3465,6 +4122,10 @@ static int __init hugepages_setup(char *s)
{
unsigned long *mhp;
static unsigned long *last_mhp;
+ int node = NUMA_NO_NODE;
+ int count;
+ unsigned long tmp;
+ char *p = s;
if (!parsed_valid_hugepagesz) {
pr_warn("HugeTLB: hugepages=%s does not follow a valid hugepagesz, ignoring\n", s);
@@ -3488,8 +4149,40 @@ static int __init hugepages_setup(char *s)
return 0;
}
- if (sscanf(s, "%lu", mhp) <= 0)
- *mhp = 0;
+ while (*p) {
+ count = 0;
+ if (sscanf(p, "%lu%n", &tmp, &count) != 1)
+ goto invalid;
+ /* Parameter is node format */
+ if (p[count] == ':') {
+ if (!hugetlb_node_alloc_supported()) {
+ pr_warn("HugeTLB: architecture can't support node specific alloc, ignoring!\n");
+ return 0;
+ }
+ node = tmp;
+ p += count + 1;
+ if (node < 0 || node >= nr_online_nodes)
+ goto invalid;
+ /* Parse hugepages */
+ if (sscanf(p, "%lu%n", &tmp, &count) != 1)
+ goto invalid;
+ if (!hugetlb_max_hstate)
+ default_hugepages_in_node[node] = tmp;
+ else
+ parsed_hstate->max_huge_pages_node[node] = tmp;
+ *mhp += tmp;
+ /* Go to parse next node*/
+ if (p[count] == ',')
+ p += count + 1;
+ else
+ break;
+ } else {
+ if (p != s)
+ goto invalid;
+ *mhp = tmp;
+ break;
+ }
+ }
/*
* Global state is always initialized later in hugetlb_init.
@@ -3502,6 +4195,10 @@ static int __init hugepages_setup(char *s)
last_mhp = mhp;
return 1;
+
+invalid:
+ pr_warn("HugeTLB: Invalid hugepages parameter %s\n", p);
+ return 0;
}
__setup("hugepages=", hugepages_setup);
@@ -3563,6 +4260,7 @@ __setup("hugepagesz=", hugepagesz_setup);
static int __init default_hugepagesz_setup(char *s)
{
unsigned long size;
+ int i;
parsed_valid_hugepagesz = false;
if (parsed_default_hugepagesz) {
@@ -3591,6 +4289,9 @@ static int __init default_hugepagesz_setup(char *s)
*/
if (default_hstate_max_huge_pages) {
default_hstate.max_huge_pages = default_hstate_max_huge_pages;
+ for (i = 0; i < nr_online_nodes; i++)
+ default_hstate.max_huge_pages_node[i] =
+ default_hugepages_in_node[i];
if (hstate_is_gigantic(&default_hstate))
hugetlb_hstate_alloc_pages(&default_hstate);
default_hstate_max_huge_pages = 0;
@@ -3849,8 +4550,10 @@ static void hugetlb_vm_op_open(struct vm_area_struct *vma)
* after this open call completes. It is therefore safe to take a
* new reference here without additional locking.
*/
- if (resv && is_vma_resv_set(vma, HPAGE_RESV_OWNER))
+ if (resv && is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
+ resv_map_dup_hugetlb_cgroup_uncharge_info(resv);
kref_get(&resv->refs);
+ }
}
static void hugetlb_vm_op_close(struct vm_area_struct *vma)
@@ -3924,6 +4627,7 @@ static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page,
int writable)
{
pte_t entry;
+ unsigned int shift = huge_page_shift(hstate_vma(vma));
if (writable) {
entry = huge_pte_mkwrite(huge_pte_mkdirty(mk_huge_pte(page,
@@ -3934,7 +4638,7 @@ static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page,
}
entry = pte_mkyoung(entry);
entry = pte_mkhuge(entry);
- entry = arch_make_huge_pte(entry, vma, page, writable);
+ entry = arch_make_huge_pte(entry, shift, vma->vm_flags);
return entry;
}
@@ -4057,12 +4761,13 @@ again:
is_hugetlb_entry_hwpoisoned(entry))) {
swp_entry_t swp_entry = pte_to_swp_entry(entry);
- if (is_write_migration_entry(swp_entry) && cow) {
+ if (is_writable_migration_entry(swp_entry) && cow) {
/*
* COW mappings require pages in both
* parent and child to be set to read.
*/
- make_migration_entry_read(&swp_entry);
+ swp_entry = make_readable_migration_entry(
+ swp_offset(swp_entry));
entry = swp_entry_to_pte(swp_entry);
set_huge_swap_pte_at(src, addr, src_pte,
entry, sz);
@@ -4145,9 +4850,85 @@ again:
return ret;
}
-void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
- unsigned long start, unsigned long end,
- struct page *ref_page)
+static void move_huge_pte(struct vm_area_struct *vma, unsigned long old_addr,
+ unsigned long new_addr, pte_t *src_pte)
+{
+ struct hstate *h = hstate_vma(vma);
+ struct mm_struct *mm = vma->vm_mm;
+ pte_t *dst_pte, pte;
+ spinlock_t *src_ptl, *dst_ptl;
+
+ dst_pte = huge_pte_offset(mm, new_addr, huge_page_size(h));
+ dst_ptl = huge_pte_lock(h, mm, dst_pte);
+ src_ptl = huge_pte_lockptr(h, mm, src_pte);
+
+ /*
+ * We don't have to worry about the ordering of src and dst ptlocks
+ * because exclusive mmap_sem (or the i_mmap_lock) prevents deadlock.
+ */
+ if (src_ptl != dst_ptl)
+ spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
+
+ pte = huge_ptep_get_and_clear(mm, old_addr, src_pte);
+ set_huge_pte_at(mm, new_addr, dst_pte, pte);
+
+ if (src_ptl != dst_ptl)
+ spin_unlock(src_ptl);
+ spin_unlock(dst_ptl);
+}
+
+int move_hugetlb_page_tables(struct vm_area_struct *vma,
+ struct vm_area_struct *new_vma,
+ unsigned long old_addr, unsigned long new_addr,
+ unsigned long len)
+{
+ struct hstate *h = hstate_vma(vma);
+ struct address_space *mapping = vma->vm_file->f_mapping;
+ unsigned long sz = huge_page_size(h);
+ struct mm_struct *mm = vma->vm_mm;
+ unsigned long old_end = old_addr + len;
+ unsigned long old_addr_copy;
+ pte_t *src_pte, *dst_pte;
+ struct mmu_notifier_range range;
+
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, old_addr,
+ old_end);
+ adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end);
+ mmu_notifier_invalidate_range_start(&range);
+ /* Prevent race with file truncation */
+ i_mmap_lock_write(mapping);
+ for (; old_addr < old_end; old_addr += sz, new_addr += sz) {
+ src_pte = huge_pte_offset(mm, old_addr, sz);
+ if (!src_pte)
+ continue;
+ if (huge_pte_none(huge_ptep_get(src_pte)))
+ continue;
+
+ /* old_addr arg to huge_pmd_unshare() is a pointer and so the
+ * arg may be modified. Pass a copy instead to preserve the
+ * value in old_addr.
+ */
+ old_addr_copy = old_addr;
+
+ if (huge_pmd_unshare(mm, vma, &old_addr_copy, src_pte))
+ continue;
+
+ dst_pte = huge_pte_alloc(mm, new_vma, new_addr, sz);
+ if (!dst_pte)
+ break;
+
+ move_huge_pte(vma, old_addr, new_addr, src_pte);
+ }
+ flush_tlb_range(vma, old_end - len, old_end);
+ mmu_notifier_invalidate_range_end(&range);
+ i_mmap_unlock_write(mapping);
+
+ return len + old_addr - old_end;
+}
+
+static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
+ unsigned long start, unsigned long end,
+ struct page *ref_page)
{
struct mm_struct *mm = vma->vm_mm;
unsigned long address;
@@ -4158,6 +4939,7 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
struct hstate *h = hstate_vma(vma);
unsigned long sz = huge_page_size(h);
struct mmu_notifier_range range;
+ bool force_flush = false;
WARN_ON(!is_vm_hugetlb_page(vma));
BUG_ON(start & ~huge_page_mask(h));
@@ -4186,10 +4968,8 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
ptl = huge_pte_lock(h, mm, ptep);
if (huge_pmd_unshare(mm, vma, &address, ptep)) {
spin_unlock(ptl);
- /*
- * We just unmapped a page of PMDs by clearing a PUD.
- * The caller's TLB flush range should cover this area.
- */
+ tlb_flush_pmd_range(tlb, address & PUD_MASK, PUD_SIZE);
+ force_flush = true;
continue;
}
@@ -4246,6 +5026,22 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
}
mmu_notifier_invalidate_range_end(&range);
tlb_end_vma(tlb, vma);
+
+ /*
+ * If we unshared PMDs, the TLB flush was not recorded in mmu_gather. We
+ * could defer the flush until now, since by holding i_mmap_rwsem we
+ * guaranteed that the last refernece would not be dropped. But we must
+ * do the flushing before we return, as otherwise i_mmap_rwsem will be
+ * dropped and the last reference to the shared PMDs page might be
+ * dropped as well.
+ *
+ * In theory we could defer the freeing of the PMD pages as well, but
+ * huge_pmd_unshare() relies on the exact page_count for the PMD page to
+ * detect sharing, so we cannot defer the release of the page either.
+ * Instead, do flush now.
+ */
+ if (force_flush)
+ tlb_flush_mmu_tlbonly(tlb);
}
void __unmap_hugepage_range_final(struct mmu_gather *tlb,
@@ -4335,7 +5131,7 @@ static void unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
/*
* Hugetlb_cow() should be called with page lock of the original hugepage held.
- * Called with hugetlb_instantiation_mutex held and pte_page locked so we
+ * Called with hugetlb_fault_mutex_table held and pte_page locked so we
* cannot race with other handlers or page migration.
* Keep the pte_same checks anyway to make transition from the mutex easier.
*/
@@ -4474,7 +5270,9 @@ retry_avoidcopy:
spin_unlock(ptl);
mmu_notifier_invalidate_range_end(&range);
out_release_all:
- restore_reserve_on_error(h, vma, haddr, new_page);
+ /* No restore in case of successful pagetable update (Break COW) */
+ if (new_page != old_page)
+ restore_reserve_on_error(h, vma, haddr, new_page);
put_page(new_page);
out_release_old:
put_page(old_page);
@@ -4590,7 +5388,7 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
pte_t new_pte;
spinlock_t *ptl;
unsigned long haddr = address & huge_page_mask(h);
- bool new_page = false;
+ bool new_page, new_pagecache_page = false;
/*
* Currently, we are forced to kill the process in the event the
@@ -4613,6 +5411,7 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
goto out;
retry:
+ new_page = false;
page = find_lock_page(mapping, idx);
if (!page) {
/* Check for page in userfault range */
@@ -4656,6 +5455,7 @@ retry:
goto retry;
goto out;
}
+ new_pagecache_page = true;
} else {
lock_page(page);
if (unlikely(anon_vma_prepare(vma))) {
@@ -4740,7 +5540,9 @@ backout:
spin_unlock(ptl);
backout_unlocked:
unlock_page(page);
- restore_reserve_on_error(h, vma, haddr, page);
+ /* restore reserve for newly allocated pages not in page cache */
+ if (new_page && !new_pagecache_page)
+ restore_reserve_on_error(h, vma, haddr, page);
put_page(page);
goto out;
}
@@ -4939,25 +5741,24 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
struct page **pagep)
{
bool is_continue = (mode == MCOPY_ATOMIC_CONTINUE);
- struct address_space *mapping;
- pgoff_t idx;
+ struct hstate *h = hstate_vma(dst_vma);
+ struct address_space *mapping = dst_vma->vm_file->f_mapping;
+ pgoff_t idx = vma_hugecache_offset(h, dst_vma, dst_addr);
unsigned long size;
int vm_shared = dst_vma->vm_flags & VM_SHARED;
- struct hstate *h = hstate_vma(dst_vma);
pte_t _dst_pte;
spinlock_t *ptl;
- int ret;
+ int ret = -ENOMEM;
struct page *page;
int writable;
-
- mapping = dst_vma->vm_file->f_mapping;
- idx = vma_hugecache_offset(h, dst_vma, dst_addr);
+ bool page_in_pagecache = false;
if (is_continue) {
ret = -EFAULT;
page = find_lock_page(mapping, idx);
if (!page)
goto out;
+ page_in_pagecache = true;
} else if (!*pagep) {
/* If a page already exists, then it's UFFDIO_COPY for
* a non-missing case. Return -EEXIST.
@@ -4981,12 +5782,44 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
/* fallback to copy_from_user outside mmap_lock */
if (unlikely(ret)) {
ret = -ENOENT;
+ /* Free the allocated page which may have
+ * consumed a reservation.
+ */
+ restore_reserve_on_error(h, dst_vma, dst_addr, page);
+ put_page(page);
+
+ /* Allocate a temporary page to hold the copied
+ * contents.
+ */
+ page = alloc_huge_page_vma(h, dst_vma, dst_addr);
+ if (!page) {
+ ret = -ENOMEM;
+ goto out;
+ }
*pagep = page;
- /* don't free the page */
+ /* Set the outparam pagep and return to the caller to
+ * copy the contents outside the lock. Don't free the
+ * page.
+ */
goto out;
}
} else {
- page = *pagep;
+ if (vm_shared &&
+ hugetlbfs_pagecache_present(h, dst_vma, dst_addr)) {
+ put_page(*pagep);
+ ret = -EEXIST;
+ *pagep = NULL;
+ goto out;
+ }
+
+ page = alloc_huge_page(dst_vma, dst_addr, 0);
+ if (IS_ERR(page)) {
+ ret = -ENOMEM;
+ *pagep = NULL;
+ goto out;
+ }
+ folio_copy(page_folio(page), page_folio(*pagep));
+ put_page(*pagep);
*pagep = NULL;
}
@@ -5013,6 +5846,7 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
ret = huge_add_to_page_cache(page, mapping, idx);
if (ret)
goto out_release_nounlock;
+ page_in_pagecache = true;
}
ptl = huge_pte_lockptr(h, dst_mm, dst_pte);
@@ -5076,7 +5910,8 @@ out_release_unlock:
if (vm_shared || is_continue)
unlock_page(page);
out_release_nounlock:
- restore_reserve_on_error(h, dst_vma, dst_addr, page);
+ if (!page_in_pagecache)
+ restore_reserve_on_error(h, dst_vma, dst_addr, page);
put_page(page);
goto out;
}
@@ -5225,8 +6060,9 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
continue;
}
- refs = min3(pages_per_huge_page(h) - pfn_offset,
- (vma->vm_end - vaddr) >> PAGE_SHIFT, remainder);
+ /* vaddr may not be aligned to PAGE_SIZE */
+ refs = min3(pages_per_huge_page(h) - pfn_offset, remainder,
+ (vma->vm_end - ALIGN_DOWN(vaddr, PAGE_SIZE)) >> PAGE_SHIFT);
if (pages || vmas)
record_subpages_vmas(mem_map_offset(page, pfn_offset),
@@ -5318,10 +6154,11 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
if (unlikely(is_hugetlb_entry_migration(pte))) {
swp_entry_t entry = pte_to_swp_entry(pte);
- if (is_write_migration_entry(entry)) {
+ if (is_writable_migration_entry(entry)) {
pte_t newpte;
- make_migration_entry_read(&entry);
+ entry = make_readable_migration_entry(
+ swp_offset(entry));
newpte = swp_entry_to_pte(entry);
set_huge_swap_pte_at(mm, address, ptep,
newpte, huge_page_size(h));
@@ -5332,10 +6169,11 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
}
if (!huge_pte_none(pte)) {
pte_t old_pte;
+ unsigned int shift = huge_page_shift(hstate_vma(vma));
old_pte = huge_ptep_modify_prot_start(vma, address, ptep);
pte = pte_mkhuge(huge_pte_modify(old_pte, newprot));
- pte = arch_make_huge_pte(pte, vma, NULL, 0);
+ pte = arch_make_huge_pte(pte, shift, vma->vm_flags);
huge_ptep_modify_prot_commit(vma, address, ptep, old_pte, pte);
pages++;
}
@@ -5643,12 +6481,6 @@ void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
* sharing is possible. For hugetlbfs, this prevents removal of any page
* table entries associated with the address space. This is important as we
* are setting up sharing based on existing page table entries (mappings).
- *
- * NOTE: This routine is only called from huge_pte_alloc. Some callers of
- * huge_pte_alloc know that sharing is not possible and do not take
- * i_mmap_rwsem as a performance optimization. This is handled by the
- * if !vma_shareable check at the beginning of the routine. i_mmap_rwsem is
- * only required for subsequent processing.
*/
pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long addr, pud_t *pud)
@@ -5938,6 +6770,8 @@ int get_hwpoison_huge_page(struct page *page, bool *hugetlb)
*hugetlb = true;
if (HPageFreed(page) || HPageMigratable(page))
ret = get_page_unless_zero(page);
+ else
+ ret = -EBUSY;
}
spin_unlock_irq(&hugetlb_lock);
return ret;
@@ -6047,7 +6881,38 @@ static bool cma_reserve_called __initdata;
static int __init cmdline_parse_hugetlb_cma(char *p)
{
- hugetlb_cma_size = memparse(p, &p);
+ int nid, count = 0;
+ unsigned long tmp;
+ char *s = p;
+
+ while (*s) {
+ if (sscanf(s, "%lu%n", &tmp, &count) != 1)
+ break;
+
+ if (s[count] == ':') {
+ nid = tmp;
+ if (nid < 0 || nid >= MAX_NUMNODES)
+ break;
+
+ s += count + 1;
+ tmp = memparse(s, &s);
+ hugetlb_cma_size_in_node[nid] = tmp;
+ hugetlb_cma_size += tmp;
+
+ /*
+ * Skip the separator if have one, otherwise
+ * break the parsing.
+ */
+ if (*s == ',')
+ s++;
+ else
+ break;
+ } else {
+ hugetlb_cma_size = memparse(p, &p);
+ break;
+ }
+ }
+
return 0;
}
@@ -6056,6 +6921,7 @@ early_param("hugetlb_cma", cmdline_parse_hugetlb_cma);
void __init hugetlb_cma_reserve(int order)
{
unsigned long size, reserved, per_node;
+ bool node_specific_cma_alloc = false;
int nid;
cma_reserve_called = true;
@@ -6063,30 +6929,72 @@ void __init hugetlb_cma_reserve(int order)
if (!hugetlb_cma_size)
return;
+ for (nid = 0; nid < MAX_NUMNODES; nid++) {
+ if (hugetlb_cma_size_in_node[nid] == 0)
+ continue;
+
+ if (!node_state(nid, N_ONLINE)) {
+ pr_warn("hugetlb_cma: invalid node %d specified\n", nid);
+ hugetlb_cma_size -= hugetlb_cma_size_in_node[nid];
+ hugetlb_cma_size_in_node[nid] = 0;
+ continue;
+ }
+
+ if (hugetlb_cma_size_in_node[nid] < (PAGE_SIZE << order)) {
+ pr_warn("hugetlb_cma: cma area of node %d should be at least %lu MiB\n",
+ nid, (PAGE_SIZE << order) / SZ_1M);
+ hugetlb_cma_size -= hugetlb_cma_size_in_node[nid];
+ hugetlb_cma_size_in_node[nid] = 0;
+ } else {
+ node_specific_cma_alloc = true;
+ }
+ }
+
+ /* Validate the CMA size again in case some invalid nodes specified. */
+ if (!hugetlb_cma_size)
+ return;
+
if (hugetlb_cma_size < (PAGE_SIZE << order)) {
pr_warn("hugetlb_cma: cma area should be at least %lu MiB\n",
(PAGE_SIZE << order) / SZ_1M);
+ hugetlb_cma_size = 0;
return;
}
- /*
- * If 3 GB area is requested on a machine with 4 numa nodes,
- * let's allocate 1 GB on first three nodes and ignore the last one.
- */
- per_node = DIV_ROUND_UP(hugetlb_cma_size, nr_online_nodes);
- pr_info("hugetlb_cma: reserve %lu MiB, up to %lu MiB per node\n",
- hugetlb_cma_size / SZ_1M, per_node / SZ_1M);
+ if (!node_specific_cma_alloc) {
+ /*
+ * If 3 GB area is requested on a machine with 4 numa nodes,
+ * let's allocate 1 GB on first three nodes and ignore the last one.
+ */
+ per_node = DIV_ROUND_UP(hugetlb_cma_size, nr_online_nodes);
+ pr_info("hugetlb_cma: reserve %lu MiB, up to %lu MiB per node\n",
+ hugetlb_cma_size / SZ_1M, per_node / SZ_1M);
+ }
reserved = 0;
for_each_node_state(nid, N_ONLINE) {
int res;
char name[CMA_MAX_NAME];
- size = min(per_node, hugetlb_cma_size - reserved);
+ if (node_specific_cma_alloc) {
+ if (hugetlb_cma_size_in_node[nid] == 0)
+ continue;
+
+ size = hugetlb_cma_size_in_node[nid];
+ } else {
+ size = min(per_node, hugetlb_cma_size - reserved);
+ }
+
size = round_up(size, PAGE_SIZE << order);
snprintf(name, sizeof(name), "hugetlb%d", nid);
- res = cma_declare_contiguous_nid(0, size, 0, PAGE_SIZE << order,
+ /*
+ * Note that 'order per bit' is based on smallest size that
+ * may be returned to CMA allocator in the case of
+ * huge page demotion.
+ */
+ res = cma_declare_contiguous_nid(0, size, 0,
+ PAGE_SIZE << HUGETLB_PAGE_ORDER,
0, false, name,
&hugetlb_cma[nid], nid);
if (res) {
@@ -6102,6 +7010,13 @@ void __init hugetlb_cma_reserve(int order)
if (reserved >= hugetlb_cma_size)
break;
}
+
+ if (!reserved)
+ /*
+ * hugetlb_cma_size is used to determine if allocations from
+ * cma are possible. Set to zero if no cma regions are set up.
+ */
+ hugetlb_cma_size = 0;
}
void __init hugetlb_cma_check(void)