aboutsummaryrefslogtreecommitdiff
path: root/include/linux/mm.h
diff options
context:
space:
mode:
Diffstat (limited to 'include/linux/mm.h')
-rw-r--r--include/linux/mm.h615
1 files changed, 418 insertions, 197 deletions
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 34f9dba17c1a..da5219b48d52 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -319,11 +319,13 @@ extern unsigned int kobjsize(const void *objp);
#define VM_HIGH_ARCH_BIT_2 34 /* bit only usable on 64-bit architectures */
#define VM_HIGH_ARCH_BIT_3 35 /* bit only usable on 64-bit architectures */
#define VM_HIGH_ARCH_BIT_4 36 /* bit only usable on 64-bit architectures */
+#define VM_HIGH_ARCH_BIT_5 37 /* bit only usable on 64-bit architectures */
#define VM_HIGH_ARCH_0 BIT(VM_HIGH_ARCH_BIT_0)
#define VM_HIGH_ARCH_1 BIT(VM_HIGH_ARCH_BIT_1)
#define VM_HIGH_ARCH_2 BIT(VM_HIGH_ARCH_BIT_2)
#define VM_HIGH_ARCH_3 BIT(VM_HIGH_ARCH_BIT_3)
#define VM_HIGH_ARCH_4 BIT(VM_HIGH_ARCH_BIT_4)
+#define VM_HIGH_ARCH_5 BIT(VM_HIGH_ARCH_BIT_5)
#endif /* CONFIG_ARCH_USES_HIGH_VMA_FLAGS */
#ifdef CONFIG_ARCH_HAS_PKEYS
@@ -339,14 +341,27 @@ extern unsigned int kobjsize(const void *objp);
#endif
#endif /* CONFIG_ARCH_HAS_PKEYS */
+#ifdef CONFIG_X86_USER_SHADOW_STACK
+/*
+ * VM_SHADOW_STACK should not be set with VM_SHARED because of lack of
+ * support core mm.
+ *
+ * These VMAs will get a single end guard page. This helps userspace protect
+ * itself from attacks. A single page is enough for current shadow stack archs
+ * (x86). See the comments near alloc_shstk() in arch/x86/kernel/shstk.c
+ * for more details on the guard size.
+ */
+# define VM_SHADOW_STACK VM_HIGH_ARCH_5
+#else
+# define VM_SHADOW_STACK VM_NONE
+#endif
+
#if defined(CONFIG_X86)
# define VM_PAT VM_ARCH_1 /* PAT reserves whole VMA at once (x86) */
#elif defined(CONFIG_PPC)
# define VM_SAO VM_ARCH_1 /* Strong Access Ordering (powerpc) */
#elif defined(CONFIG_PARISC)
# define VM_GROWSUP VM_ARCH_1
-#elif defined(CONFIG_IA64)
-# define VM_GROWSUP VM_ARCH_1
#elif defined(CONFIG_SPARC64)
# define VM_SPARC_ADI VM_ARCH_1 /* Uses ADI tag for access control */
# define VM_ARCH_CLEAR VM_SPARC_ADI
@@ -370,7 +385,7 @@ extern unsigned int kobjsize(const void *objp);
#endif
#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR
-# define VM_UFFD_MINOR_BIT 37
+# define VM_UFFD_MINOR_BIT 38
# define VM_UFFD_MINOR BIT(VM_UFFD_MINOR_BIT) /* UFFD minor faults */
#else /* !CONFIG_HAVE_ARCH_USERFAULTFD_MINOR */
# define VM_UFFD_MINOR VM_NONE
@@ -397,6 +412,8 @@ extern unsigned int kobjsize(const void *objp);
#define VM_STACK_DEFAULT_FLAGS VM_DATA_DEFAULT_FLAGS
#endif
+#define VM_STARTGAP_FLAGS (VM_GROWSDOWN | VM_SHADOW_STACK)
+
#ifdef CONFIG_STACK_GROWSUP
#define VM_STACK VM_GROWSUP
#define VM_STACK_EARLY VM_GROWSDOWN
@@ -532,13 +549,6 @@ struct vm_fault {
*/
};
-/* page entry size for vm->huge_fault() */
-enum page_entry_size {
- PE_SIZE_PTE = 0,
- PE_SIZE_PMD,
- PE_SIZE_PUD,
-};
-
/*
* These are the virtual MM functions - opening of an area, closing and
* unmapping it (needed to keep files on disk up-to-date etc), pointer
@@ -562,8 +572,7 @@ struct vm_operations_struct {
int (*mprotect)(struct vm_area_struct *vma, unsigned long start,
unsigned long end, unsigned long newflags);
vm_fault_t (*fault)(struct vm_fault *vmf);
- vm_fault_t (*huge_fault)(struct vm_fault *vmf,
- enum page_entry_size pe_size);
+ vm_fault_t (*huge_fault)(struct vm_fault *vmf, unsigned int order);
vm_fault_t (*map_pages)(struct vm_fault *vmf,
pgoff_t start_pgoff, pgoff_t end_pgoff);
unsigned long (*pagesize)(struct vm_area_struct * area);
@@ -608,7 +617,7 @@ struct vm_operations_struct {
* policy.
*/
struct mempolicy *(*get_policy)(struct vm_area_struct *vma,
- unsigned long addr);
+ unsigned long addr, pgoff_t *ilx);
#endif
/*
* Called by vm_normal_page() for special PTEs to find the
@@ -679,6 +688,7 @@ static inline void vma_end_read(struct vm_area_struct *vma)
rcu_read_unlock();
}
+/* WARNING! Can only be used if mmap_lock is expected to be write-locked */
static bool __is_vma_write_locked(struct vm_area_struct *vma, int *mm_lock_seq)
{
mmap_assert_write_locked(vma->vm_mm);
@@ -691,6 +701,11 @@ static bool __is_vma_write_locked(struct vm_area_struct *vma, int *mm_lock_seq)
return (vma->vm_lock_seq == *mm_lock_seq);
}
+/*
+ * Begin writing to a VMA.
+ * Exclude concurrent readers under the per-VMA lock until the currently
+ * write-locked mmap_lock is dropped or downgraded.
+ */
static inline void vma_start_write(struct vm_area_struct *vma)
{
int mm_lock_seq;
@@ -709,26 +724,17 @@ static inline void vma_start_write(struct vm_area_struct *vma)
up_write(&vma->vm_lock->lock);
}
-static inline bool vma_try_start_write(struct vm_area_struct *vma)
+static inline void vma_assert_write_locked(struct vm_area_struct *vma)
{
int mm_lock_seq;
- if (__is_vma_write_locked(vma, &mm_lock_seq))
- return true;
-
- if (!down_write_trylock(&vma->vm_lock->lock))
- return false;
-
- WRITE_ONCE(vma->vm_lock_seq, mm_lock_seq);
- up_write(&vma->vm_lock->lock);
- return true;
+ VM_BUG_ON_VMA(!__is_vma_write_locked(vma, &mm_lock_seq), vma);
}
-static inline void vma_assert_write_locked(struct vm_area_struct *vma)
+static inline void vma_assert_locked(struct vm_area_struct *vma)
{
- int mm_lock_seq;
-
- VM_BUG_ON_VMA(!__is_vma_write_locked(vma, &mm_lock_seq), vma);
+ if (!rwsem_is_locked(&vma->vm_lock->lock))
+ vma_assert_write_locked(vma);
}
static inline void vma_mark_detached(struct vm_area_struct *vma, bool detached)
@@ -739,6 +745,22 @@ static inline void vma_mark_detached(struct vm_area_struct *vma, bool detached)
vma->detached = detached;
}
+static inline void release_fault_lock(struct vm_fault *vmf)
+{
+ if (vmf->flags & FAULT_FLAG_VMA_LOCK)
+ vma_end_read(vmf->vma);
+ else
+ mmap_read_unlock(vmf->vma->vm_mm);
+}
+
+static inline void assert_fault_locked(struct vm_fault *vmf)
+{
+ if (vmf->flags & FAULT_FLAG_VMA_LOCK)
+ vma_assert_locked(vmf->vma);
+ else
+ mmap_assert_locked(vmf->vma->vm_mm);
+}
+
struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm,
unsigned long address);
@@ -748,25 +770,40 @@ static inline bool vma_start_read(struct vm_area_struct *vma)
{ return false; }
static inline void vma_end_read(struct vm_area_struct *vma) {}
static inline void vma_start_write(struct vm_area_struct *vma) {}
-static inline bool vma_try_start_write(struct vm_area_struct *vma)
- { return true; }
-static inline void vma_assert_write_locked(struct vm_area_struct *vma) {}
+static inline void vma_assert_write_locked(struct vm_area_struct *vma)
+ { mmap_assert_write_locked(vma->vm_mm); }
static inline void vma_mark_detached(struct vm_area_struct *vma,
bool detached) {}
+static inline struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm,
+ unsigned long address)
+{
+ return NULL;
+}
+
+static inline void release_fault_lock(struct vm_fault *vmf)
+{
+ mmap_read_unlock(vmf->vma->vm_mm);
+}
+
+static inline void assert_fault_locked(struct vm_fault *vmf)
+{
+ mmap_assert_locked(vmf->vma->vm_mm);
+}
+
#endif /* CONFIG_PER_VMA_LOCK */
+extern const struct vm_operations_struct vma_dummy_vm_ops;
+
/*
* WARNING: vma_init does not initialize vma->vm_lock.
* Use vm_area_alloc()/vm_area_free() if vma needs locking.
*/
static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm)
{
- static const struct vm_operations_struct dummy_vm_ops = {};
-
memset(vma, 0, sizeof(*vma));
vma->vm_mm = mm;
- vma->vm_ops = &dummy_vm_ops;
+ vma->vm_ops = &vma_dummy_vm_ops;
INIT_LIST_HEAD(&vma->anon_vma_chain);
vma_mark_detached(vma, false);
vma_numab_state_init(vma);
@@ -779,18 +816,22 @@ static inline void vm_flags_init(struct vm_area_struct *vma,
ACCESS_PRIVATE(vma, __vm_flags) = flags;
}
-/* Use when VMA is part of the VMA tree and modifications need coordination */
+/*
+ * Use when VMA is part of the VMA tree and modifications need coordination
+ * Note: vm_flags_reset and vm_flags_reset_once do not lock the vma and
+ * it should be locked explicitly beforehand.
+ */
static inline void vm_flags_reset(struct vm_area_struct *vma,
vm_flags_t flags)
{
- vma_start_write(vma);
+ vma_assert_write_locked(vma);
vm_flags_init(vma, flags);
}
static inline void vm_flags_reset_once(struct vm_area_struct *vma,
vm_flags_t flags)
{
- vma_start_write(vma);
+ vma_assert_write_locked(vma);
WRITE_ONCE(ACCESS_PRIVATE(vma, __vm_flags), flags);
}
@@ -839,6 +880,31 @@ static inline bool vma_is_anonymous(struct vm_area_struct *vma)
return !vma->vm_ops;
}
+/*
+ * Indicate if the VMA is a heap for the given task; for
+ * /proc/PID/maps that is the heap of the main task.
+ */
+static inline bool vma_is_initial_heap(const struct vm_area_struct *vma)
+{
+ return vma->vm_start < vma->vm_mm->brk &&
+ vma->vm_end > vma->vm_mm->start_brk;
+}
+
+/*
+ * Indicate if the VMA is a stack for the given task; for
+ * /proc/PID/maps that is the stack of the main task.
+ */
+static inline bool vma_is_initial_stack(const struct vm_area_struct *vma)
+{
+ /*
+ * We make no effort to guess what a given thread considers to be
+ * its "stack". It's not even well-defined for programs written
+ * languages like Go.
+ */
+ return vma->vm_start <= vma->vm_mm->start_stack &&
+ vma->vm_end >= vma->vm_mm->start_stack;
+}
+
static inline bool vma_is_temporary_stack(struct vm_area_struct *vma)
{
int maybe_stack = vma->vm_flags & (VM_GROWSDOWN | VM_GROWSUP);
@@ -869,6 +935,17 @@ static inline bool vma_is_accessible(struct vm_area_struct *vma)
return vma->vm_flags & VM_ACCESS_FLAGS;
}
+static inline bool is_shared_maywrite(vm_flags_t vm_flags)
+{
+ return (vm_flags & (VM_SHARED | VM_MAYWRITE)) ==
+ (VM_SHARED | VM_MAYWRITE);
+}
+
+static inline bool vma_is_shared_maywrite(struct vm_area_struct *vma)
+{
+ return is_shared_maywrite(vma->vm_flags);
+}
+
static inline
struct vm_area_struct *vma_find(struct vma_iterator *vmi, unsigned long max)
{
@@ -976,7 +1053,7 @@ struct inode;
* compound_order() can be called without holding a reference, which means
* that niceties like page_folio() don't work. These callers should be
* prepared to handle wild return values. For example, PG_head may be
- * set before _folio_order is initialised, or this may be a tail page.
+ * set before the order is initialised, or this may be a tail page.
* See compaction.c for some good examples.
*/
static inline unsigned int compound_order(struct page *page)
@@ -985,7 +1062,7 @@ static inline unsigned int compound_order(struct page *page)
if (!test_bit(PG_head, &folio->flags))
return 0;
- return folio->_folio_order;
+ return folio->_flags_1 & 0xff;
}
/**
@@ -1001,7 +1078,7 @@ static inline unsigned int folio_order(struct folio *folio)
{
if (!folio_test_large(folio))
return 0;
- return folio->_folio_order;
+ return folio->_flags_1 & 0xff;
}
#include <linux/huge_mm.h>
@@ -1072,11 +1149,6 @@ unsigned long vmalloc_to_pfn(const void *addr);
* On nommu, vmalloc/vfree wrap through kmalloc/kfree directly, so there
* is no special casing required.
*/
-
-#ifndef is_ioremap_addr
-#define is_ioremap_addr(x) is_vmalloc_addr(x)
-#endif
-
#ifdef CONFIG_MMU
extern bool is_vmalloc_addr(const void *x);
extern int is_vmalloc_or_module_addr(const void *x);
@@ -1220,33 +1292,6 @@ void folio_copy(struct folio *dst, struct folio *src);
unsigned long nr_free_buffer_pages(void);
-/*
- * Compound pages have a destructor function. Provide a
- * prototype for that function and accessor functions.
- * These are _only_ valid on the head of a compound page.
- */
-typedef void compound_page_dtor(struct page *);
-
-/* Keep the enum in sync with compound_page_dtors array in mm/page_alloc.c */
-enum compound_dtor_id {
- NULL_COMPOUND_DTOR,
- COMPOUND_PAGE_DTOR,
-#ifdef CONFIG_HUGETLB_PAGE
- HUGETLB_PAGE_DTOR,
-#endif
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- TRANSHUGE_PAGE_DTOR,
-#endif
- NR_COMPOUND_DTORS,
-};
-
-static inline void folio_set_compound_dtor(struct folio *folio,
- enum compound_dtor_id compound_dtor)
-{
- VM_BUG_ON_FOLIO(compound_dtor >= NR_COMPOUND_DTORS, folio);
- folio->_folio_dtor = compound_dtor;
-}
-
void destroy_large_folio(struct folio *folio);
/* Returns the number of bytes in this potentially compound page. */
@@ -1282,8 +1327,6 @@ static inline unsigned long thp_size(struct page *page)
return PAGE_SIZE << thp_order(page);
}
-void free_compound_page(struct page *page);
-
#ifdef CONFIG_MMU
/*
* Do pte_mkwrite, but only if the vma says VM_WRITE. We do this when
@@ -1294,15 +1337,15 @@ void free_compound_page(struct page *page);
static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
{
if (likely(vma->vm_flags & VM_WRITE))
- pte = pte_mkwrite(pte);
+ pte = pte_mkwrite(pte, vma);
return pte;
}
vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page);
-void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr);
+void set_pte_range(struct vm_fault *vmf, struct folio *folio,
+ struct page *page, unsigned int nr, unsigned long addr);
vm_fault_t finish_fault(struct vm_fault *vmf);
-vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf);
#endif
/*
@@ -1651,26 +1694,26 @@ static inline bool __cpupid_match_pid(pid_t task_pid, int cpupid)
#define cpupid_match_pid(task, cpupid) __cpupid_match_pid(task->pid, cpupid)
#ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS
-static inline int page_cpupid_xchg_last(struct page *page, int cpupid)
+static inline int folio_xchg_last_cpupid(struct folio *folio, int cpupid)
{
- return xchg(&page->_last_cpupid, cpupid & LAST_CPUPID_MASK);
+ return xchg(&folio->_last_cpupid, cpupid & LAST_CPUPID_MASK);
}
-static inline int page_cpupid_last(struct page *page)
+static inline int folio_last_cpupid(struct folio *folio)
{
- return page->_last_cpupid;
+ return folio->_last_cpupid;
}
static inline void page_cpupid_reset_last(struct page *page)
{
page->_last_cpupid = -1 & LAST_CPUPID_MASK;
}
#else
-static inline int page_cpupid_last(struct page *page)
+static inline int folio_last_cpupid(struct folio *folio)
{
- return (page->flags >> LAST_CPUPID_PGSHIFT) & LAST_CPUPID_MASK;
+ return (folio->flags >> LAST_CPUPID_PGSHIFT) & LAST_CPUPID_MASK;
}
-extern int page_cpupid_xchg_last(struct page *page, int cpupid);
+int folio_xchg_last_cpupid(struct folio *folio, int cpupid);
static inline void page_cpupid_reset_last(struct page *page)
{
@@ -1678,11 +1721,12 @@ static inline void page_cpupid_reset_last(struct page *page)
}
#endif /* LAST_CPUPID_NOT_IN_PAGE_FLAGS */
-static inline int xchg_page_access_time(struct page *page, int time)
+static inline int folio_xchg_access_time(struct folio *folio, int time)
{
int last_time;
- last_time = page_cpupid_xchg_last(page, time >> PAGE_ACCESS_TIME_BUCKETS);
+ last_time = folio_xchg_last_cpupid(folio,
+ time >> PAGE_ACCESS_TIME_BUCKETS);
return last_time << PAGE_ACCESS_TIME_BUCKETS;
}
@@ -1691,24 +1735,24 @@ static inline void vma_set_access_pid_bit(struct vm_area_struct *vma)
unsigned int pid_bit;
pid_bit = hash_32(current->pid, ilog2(BITS_PER_LONG));
- if (vma->numab_state && !test_bit(pid_bit, &vma->numab_state->access_pids[1])) {
- __set_bit(pid_bit, &vma->numab_state->access_pids[1]);
+ if (vma->numab_state && !test_bit(pid_bit, &vma->numab_state->pids_active[1])) {
+ __set_bit(pid_bit, &vma->numab_state->pids_active[1]);
}
}
#else /* !CONFIG_NUMA_BALANCING */
-static inline int page_cpupid_xchg_last(struct page *page, int cpupid)
+static inline int folio_xchg_last_cpupid(struct folio *folio, int cpupid)
{
- return page_to_nid(page); /* XXX */
+ return folio_nid(folio); /* XXX */
}
-static inline int xchg_page_access_time(struct page *page, int time)
+static inline int folio_xchg_access_time(struct folio *folio, int time)
{
return 0;
}
-static inline int page_cpupid_last(struct page *page)
+static inline int folio_last_cpupid(struct folio *folio)
{
- return page_to_nid(page); /* XXX */
+ return folio_nid(folio); /* XXX */
}
static inline int cpupid_to_nid(int cpupid)
@@ -2006,7 +2050,7 @@ static inline long folio_nr_pages(struct folio *folio)
#ifdef CONFIG_64BIT
return folio->_folio_nr_pages;
#else
- return 1L << folio->_folio_order;
+ return 1L << (folio->_flags_1 & 0xff);
#endif
}
@@ -2024,7 +2068,7 @@ static inline unsigned long compound_nr(struct page *page)
#ifdef CONFIG_64BIT
return folio->_folio_nr_pages;
#else
- return 1L << folio->_folio_order;
+ return 1L << (folio->_flags_1 & 0xff);
#endif
}
@@ -2170,7 +2214,6 @@ static inline void *folio_address(const struct folio *folio)
return page_address(&folio->page);
}
-extern void *page_rmapping(struct page *page);
extern pgoff_t __page_file_index(struct page *page);
/*
@@ -2238,18 +2281,6 @@ extern void pagefault_out_of_memory(void);
#define offset_in_folio(folio, p) ((unsigned long)(p) & (folio_size(folio) - 1))
/*
- * Flags passed to show_mem() and show_free_areas() to suppress output in
- * various contexts.
- */
-#define SHOW_MEM_FILTER_NODES (0x0001u) /* disallowed nodes */
-
-extern void __show_free_areas(unsigned int flags, nodemask_t *nodemask, int max_zone_idx);
-static void __maybe_unused show_free_areas(unsigned int flags, nodemask_t *nodemask)
-{
- __show_free_areas(flags, nodemask, MAX_NR_ZONES - 1);
-}
-
-/*
* Parameter block passed down to zap_pte_range in exceptional cases.
*/
struct zap_details {
@@ -2305,6 +2336,8 @@ struct folio *vm_normal_folio(struct vm_area_struct *vma, unsigned long addr,
pte_t pte);
struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
pte_t pte);
+struct folio *vm_normal_folio_pmd(struct vm_area_struct *vma,
+ unsigned long addr, pmd_t pmd);
struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr,
pmd_t pmd);
@@ -2317,9 +2350,9 @@ static inline void zap_vma_pages(struct vm_area_struct *vma)
zap_page_range_single(vma, vma->vm_start,
vma->vm_end - vma->vm_start, NULL);
}
-void unmap_vmas(struct mmu_gather *tlb, struct maple_tree *mt,
+void unmap_vmas(struct mmu_gather *tlb, struct ma_state *mas,
struct vm_area_struct *start_vma, unsigned long start,
- unsigned long end, bool mm_wr_locked);
+ unsigned long end, unsigned long tree_end, bool mm_wr_locked);
struct mmu_notifier_range;
@@ -2391,8 +2424,6 @@ extern int access_process_vm(struct task_struct *tsk, unsigned long addr,
void *buf, int len, unsigned int gup_flags);
extern int access_remote_vm(struct mm_struct *mm, unsigned long addr,
void *buf, int len, unsigned int gup_flags);
-extern int __access_remote_vm(struct mm_struct *mm, unsigned long addr,
- void *buf, int len, unsigned int gup_flags);
long get_user_pages_remote(struct mm_struct *mm,
unsigned long start, unsigned long nr_pages,
@@ -2403,6 +2434,9 @@ long pin_user_pages_remote(struct mm_struct *mm,
unsigned int gup_flags, struct page **pages,
int *locked);
+/*
+ * Retrieves a single page alongside its VMA. Does not support FOLL_NOWAIT.
+ */
static inline struct page *get_user_page_vma_remote(struct mm_struct *mm,
unsigned long addr,
int gup_flags,
@@ -2410,12 +2444,15 @@ static inline struct page *get_user_page_vma_remote(struct mm_struct *mm,
{
struct page *page;
struct vm_area_struct *vma;
- int got = get_user_pages_remote(mm, addr, 1, gup_flags, &page, NULL);
+ int got;
+
+ if (WARN_ON_ONCE(unlikely(gup_flags & FOLL_NOWAIT)))
+ return ERR_PTR(-EINVAL);
+
+ got = get_user_pages_remote(mm, addr, 1, gup_flags, &page, NULL);
if (got < 0)
return ERR_PTR(got);
- if (got == 0)
- return NULL;
vma = vma_lookup(mm, addr);
if (WARN_ON_ONCE(!vma)) {
@@ -2458,7 +2495,7 @@ int get_cmdline(struct task_struct *task, char *buffer, int buflen);
extern unsigned long move_page_tables(struct vm_area_struct *vma,
unsigned long old_addr, struct vm_area_struct *new_vma,
unsigned long new_addr, unsigned long len,
- bool need_rmap_locks);
+ bool need_rmap_locks, bool for_stack);
/*
* Flags used by change_protection(). For now we make it a bitmap so
@@ -2606,14 +2643,6 @@ static inline void setmax_mm_hiwater_rss(unsigned long *maxrss,
*maxrss = hiwater_rss;
}
-#if defined(SPLIT_RSS_COUNTING)
-void sync_mm_rss(struct mm_struct *mm);
-#else
-static inline void sync_mm_rss(struct mm_struct *mm)
-{
-}
-#endif
-
#ifndef CONFIG_ARCH_HAS_PTE_SPECIAL
static inline int pte_special(pte_t pte)
{
@@ -2766,42 +2795,93 @@ static inline pmd_t *pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long a
}
#endif /* CONFIG_MMU */
+static inline struct ptdesc *virt_to_ptdesc(const void *x)
+{
+ return page_ptdesc(virt_to_page(x));
+}
+
+static inline void *ptdesc_to_virt(const struct ptdesc *pt)
+{
+ return page_to_virt(ptdesc_page(pt));
+}
+
+static inline void *ptdesc_address(const struct ptdesc *pt)
+{
+ return folio_address(ptdesc_folio(pt));
+}
+
+static inline bool pagetable_is_reserved(struct ptdesc *pt)
+{
+ return folio_test_reserved(ptdesc_folio(pt));
+}
+
+/**
+ * pagetable_alloc - Allocate pagetables
+ * @gfp: GFP flags
+ * @order: desired pagetable order
+ *
+ * pagetable_alloc allocates memory for page tables as well as a page table
+ * descriptor to describe that memory.
+ *
+ * Return: The ptdesc describing the allocated page tables.
+ */
+static inline struct ptdesc *pagetable_alloc(gfp_t gfp, unsigned int order)
+{
+ struct page *page = alloc_pages(gfp | __GFP_COMP, order);
+
+ return page_ptdesc(page);
+}
+
+/**
+ * pagetable_free - Free pagetables
+ * @pt: The page table descriptor
+ *
+ * pagetable_free frees the memory of all page tables described by a page
+ * table descriptor and the memory for the descriptor itself.
+ */
+static inline void pagetable_free(struct ptdesc *pt)
+{
+ struct page *page = ptdesc_page(pt);
+
+ __free_pages(page, compound_order(page));
+}
+
#if USE_SPLIT_PTE_PTLOCKS
#if ALLOC_SPLIT_PTLOCKS
void __init ptlock_cache_init(void);
-extern bool ptlock_alloc(struct page *page);
-extern void ptlock_free(struct page *page);
+bool ptlock_alloc(struct ptdesc *ptdesc);
+void ptlock_free(struct ptdesc *ptdesc);
-static inline spinlock_t *ptlock_ptr(struct page *page)
+static inline spinlock_t *ptlock_ptr(struct ptdesc *ptdesc)
{
- return page->ptl;
+ return ptdesc->ptl;
}
#else /* ALLOC_SPLIT_PTLOCKS */
static inline void ptlock_cache_init(void)
{
}
-static inline bool ptlock_alloc(struct page *page)
+static inline bool ptlock_alloc(struct ptdesc *ptdesc)
{
return true;
}
-static inline void ptlock_free(struct page *page)
+static inline void ptlock_free(struct ptdesc *ptdesc)
{
}
-static inline spinlock_t *ptlock_ptr(struct page *page)
+static inline spinlock_t *ptlock_ptr(struct ptdesc *ptdesc)
{
- return &page->ptl;
+ return &ptdesc->ptl;
}
#endif /* ALLOC_SPLIT_PTLOCKS */
static inline spinlock_t *pte_lockptr(struct mm_struct *mm, pmd_t *pmd)
{
- return ptlock_ptr(pmd_page(*pmd));
+ return ptlock_ptr(page_ptdesc(pmd_page(*pmd)));
}
-static inline bool ptlock_init(struct page *page)
+static inline bool ptlock_init(struct ptdesc *ptdesc)
{
/*
* prep_new_page() initialize page->private (and therefore page->ptl)
@@ -2810,10 +2890,10 @@ static inline bool ptlock_init(struct page *page)
* It can happen if arch try to use slab for page table allocation:
* slab code uses page->slab_cache, which share storage with page->ptl.
*/
- VM_BUG_ON_PAGE(*(unsigned long *)&page->ptl, page);
- if (!ptlock_alloc(page))
+ VM_BUG_ON_PAGE(*(unsigned long *)&ptdesc->ptl, ptdesc_page(ptdesc));
+ if (!ptlock_alloc(ptdesc))
return false;
- spin_lock_init(ptlock_ptr(page));
+ spin_lock_init(ptlock_ptr(ptdesc));
return true;
}
@@ -2826,24 +2906,28 @@ static inline spinlock_t *pte_lockptr(struct mm_struct *mm, pmd_t *pmd)
return &mm->page_table_lock;
}
static inline void ptlock_cache_init(void) {}
-static inline bool ptlock_init(struct page *page) { return true; }
-static inline void ptlock_free(struct page *page) {}
+static inline bool ptlock_init(struct ptdesc *ptdesc) { return true; }
+static inline void ptlock_free(struct ptdesc *ptdesc) {}
#endif /* USE_SPLIT_PTE_PTLOCKS */
-static inline bool pgtable_pte_page_ctor(struct page *page)
+static inline bool pagetable_pte_ctor(struct ptdesc *ptdesc)
{
- if (!ptlock_init(page))
+ struct folio *folio = ptdesc_folio(ptdesc);
+
+ if (!ptlock_init(ptdesc))
return false;
- __SetPageTable(page);
- inc_lruvec_page_state(page, NR_PAGETABLE);
+ __folio_set_pgtable(folio);
+ lruvec_stat_add_folio(folio, NR_PAGETABLE);
return true;
}
-static inline void pgtable_pte_page_dtor(struct page *page)
+static inline void pagetable_pte_dtor(struct ptdesc *ptdesc)
{
- ptlock_free(page);
- __ClearPageTable(page);
- dec_lruvec_page_state(page, NR_PAGETABLE);
+ struct folio *folio = ptdesc_folio(ptdesc);
+
+ ptlock_free(ptdesc);
+ __folio_clear_pgtable(folio);
+ lruvec_stat_sub_folio(folio, NR_PAGETABLE);
}
pte_t *__pte_offset_map(pmd_t *pmd, unsigned long addr, pmd_t *pmdvalp);
@@ -2892,28 +2976,33 @@ static inline struct page *pmd_pgtable_page(pmd_t *pmd)
return virt_to_page((void *)((unsigned long) pmd & mask));
}
+static inline struct ptdesc *pmd_ptdesc(pmd_t *pmd)
+{
+ return page_ptdesc(pmd_pgtable_page(pmd));
+}
+
static inline spinlock_t *pmd_lockptr(struct mm_struct *mm, pmd_t *pmd)
{
- return ptlock_ptr(pmd_pgtable_page(pmd));
+ return ptlock_ptr(pmd_ptdesc(pmd));
}
-static inline bool pmd_ptlock_init(struct page *page)
+static inline bool pmd_ptlock_init(struct ptdesc *ptdesc)
{
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- page->pmd_huge_pte = NULL;
+ ptdesc->pmd_huge_pte = NULL;
#endif
- return ptlock_init(page);
+ return ptlock_init(ptdesc);
}
-static inline void pmd_ptlock_free(struct page *page)
+static inline void pmd_ptlock_free(struct ptdesc *ptdesc)
{
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- VM_BUG_ON_PAGE(page->pmd_huge_pte, page);
+ VM_BUG_ON_PAGE(ptdesc->pmd_huge_pte, ptdesc_page(ptdesc));
#endif
- ptlock_free(page);
+ ptlock_free(ptdesc);
}
-#define pmd_huge_pte(mm, pmd) (pmd_pgtable_page(pmd)->pmd_huge_pte)
+#define pmd_huge_pte(mm, pmd) (pmd_ptdesc(pmd)->pmd_huge_pte)
#else
@@ -2922,8 +3011,8 @@ static inline spinlock_t *pmd_lockptr(struct mm_struct *mm, pmd_t *pmd)
return &mm->page_table_lock;
}
-static inline bool pmd_ptlock_init(struct page *page) { return true; }
-static inline void pmd_ptlock_free(struct page *page) {}
+static inline bool pmd_ptlock_init(struct ptdesc *ptdesc) { return true; }
+static inline void pmd_ptlock_free(struct ptdesc *ptdesc) {}
#define pmd_huge_pte(mm, pmd) ((mm)->pmd_huge_pte)
@@ -2936,20 +3025,24 @@ static inline spinlock_t *pmd_lock(struct mm_struct *mm, pmd_t *pmd)
return ptl;
}
-static inline bool pgtable_pmd_page_ctor(struct page *page)
+static inline bool pagetable_pmd_ctor(struct ptdesc *ptdesc)
{
- if (!pmd_ptlock_init(page))
+ struct folio *folio = ptdesc_folio(ptdesc);
+
+ if (!pmd_ptlock_init(ptdesc))
return false;
- __SetPageTable(page);
- inc_lruvec_page_state(page, NR_PAGETABLE);
+ __folio_set_pgtable(folio);
+ lruvec_stat_add_folio(folio, NR_PAGETABLE);
return true;
}
-static inline void pgtable_pmd_page_dtor(struct page *page)
+static inline void pagetable_pmd_dtor(struct ptdesc *ptdesc)
{
- pmd_ptlock_free(page);
- __ClearPageTable(page);
- dec_lruvec_page_state(page, NR_PAGETABLE);
+ struct folio *folio = ptdesc_folio(ptdesc);
+
+ pmd_ptlock_free(ptdesc);
+ __folio_clear_pgtable(folio);
+ lruvec_stat_sub_folio(folio, NR_PAGETABLE);
}
/*
@@ -2971,6 +3064,22 @@ static inline spinlock_t *pud_lock(struct mm_struct *mm, pud_t *pud)
return ptl;
}
+static inline void pagetable_pud_ctor(struct ptdesc *ptdesc)
+{
+ struct folio *folio = ptdesc_folio(ptdesc);
+
+ __folio_set_pgtable(folio);
+ lruvec_stat_add_folio(folio, NR_PAGETABLE);
+}
+
+static inline void pagetable_pud_dtor(struct ptdesc *ptdesc)
+{
+ struct folio *folio = ptdesc_folio(ptdesc);
+
+ __folio_clear_pgtable(folio);
+ lruvec_stat_sub_folio(folio, NR_PAGETABLE);
+}
+
extern void __init pagecache_init(void);
extern void free_initmem(void);
@@ -3004,6 +3113,11 @@ static inline void mark_page_reserved(struct page *page)
adjust_managed_page_count(page, -1);
}
+static inline void free_reserved_ptdesc(struct ptdesc *pt)
+{
+ free_reserved_page(ptdesc_page(pt));
+}
+
/*
* Default method to free all the __init memory into the buddy system.
* The freed pages will be poisoned with pattern "poison" if it's within
@@ -3069,9 +3183,9 @@ extern void mem_init(void);
extern void __init mmap_init(void);
extern void __show_mem(unsigned int flags, nodemask_t *nodemask, int max_zone_idx);
-static inline void show_mem(unsigned int flags, nodemask_t *nodemask)
+static inline void show_mem(void)
{
- __show_mem(flags, nodemask, MAX_NR_ZONES - 1);
+ __show_mem(0, NULL, MAX_NR_ZONES - 1);
}
extern long si_mem_available(void);
extern void si_meminfo(struct sysinfo * val);
@@ -3130,22 +3244,73 @@ extern int vma_expand(struct vma_iterator *vmi, struct vm_area_struct *vma,
struct vm_area_struct *next);
extern int vma_shrink(struct vma_iterator *vmi, struct vm_area_struct *vma,
unsigned long start, unsigned long end, pgoff_t pgoff);
-extern struct vm_area_struct *vma_merge(struct vma_iterator *vmi,
- struct mm_struct *, struct vm_area_struct *prev, unsigned long addr,
- unsigned long end, unsigned long vm_flags, struct anon_vma *,
- struct file *, pgoff_t, struct mempolicy *, struct vm_userfaultfd_ctx,
- struct anon_vma_name *);
extern struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *);
-extern int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *,
- unsigned long addr, int new_below);
-extern int split_vma(struct vma_iterator *vmi, struct vm_area_struct *,
- unsigned long addr, int new_below);
extern int insert_vm_struct(struct mm_struct *, struct vm_area_struct *);
extern void unlink_file_vma(struct vm_area_struct *);
extern struct vm_area_struct *copy_vma(struct vm_area_struct **,
unsigned long addr, unsigned long len, pgoff_t pgoff,
bool *need_rmap_locks);
extern void exit_mmap(struct mm_struct *);
+struct vm_area_struct *vma_modify(struct vma_iterator *vmi,
+ struct vm_area_struct *prev,
+ struct vm_area_struct *vma,
+ unsigned long start, unsigned long end,
+ unsigned long vm_flags,
+ struct mempolicy *policy,
+ struct vm_userfaultfd_ctx uffd_ctx,
+ struct anon_vma_name *anon_name);
+
+/* We are about to modify the VMA's flags. */
+static inline struct vm_area_struct
+*vma_modify_flags(struct vma_iterator *vmi,
+ struct vm_area_struct *prev,
+ struct vm_area_struct *vma,
+ unsigned long start, unsigned long end,
+ unsigned long new_flags)
+{
+ return vma_modify(vmi, prev, vma, start, end, new_flags,
+ vma_policy(vma), vma->vm_userfaultfd_ctx,
+ anon_vma_name(vma));
+}
+
+/* We are about to modify the VMA's flags and/or anon_name. */
+static inline struct vm_area_struct
+*vma_modify_flags_name(struct vma_iterator *vmi,
+ struct vm_area_struct *prev,
+ struct vm_area_struct *vma,
+ unsigned long start,
+ unsigned long end,
+ unsigned long new_flags,
+ struct anon_vma_name *new_name)
+{
+ return vma_modify(vmi, prev, vma, start, end, new_flags,
+ vma_policy(vma), vma->vm_userfaultfd_ctx, new_name);
+}
+
+/* We are about to modify the VMA's memory policy. */
+static inline struct vm_area_struct
+*vma_modify_policy(struct vma_iterator *vmi,
+ struct vm_area_struct *prev,
+ struct vm_area_struct *vma,
+ unsigned long start, unsigned long end,
+ struct mempolicy *new_pol)
+{
+ return vma_modify(vmi, prev, vma, start, end, vma->vm_flags,
+ new_pol, vma->vm_userfaultfd_ctx, anon_vma_name(vma));
+}
+
+/* We are about to modify the VMA's flags and/or uffd context. */
+static inline struct vm_area_struct
+*vma_modify_flags_uffd(struct vma_iterator *vmi,
+ struct vm_area_struct *prev,
+ struct vm_area_struct *vma,
+ unsigned long start, unsigned long end,
+ unsigned long new_flags,
+ struct vm_userfaultfd_ctx new_ctx)
+{
+ return vma_modify(vmi, prev, vma, start, end, new_flags,
+ vma_policy(vma), new_ctx, anon_vma_name(vma));
+}
static inline int check_data_rlimit(unsigned long rlim,
unsigned long new,
@@ -3193,7 +3358,8 @@ extern unsigned long mmap_region(struct file *file, unsigned long addr,
struct list_head *uf);
extern unsigned long do_mmap(struct file *file, unsigned long addr,
unsigned long len, unsigned long prot, unsigned long flags,
- unsigned long pgoff, unsigned long *populate, struct list_head *uf);
+ vm_flags_t vm_flags, unsigned long pgoff, unsigned long *populate,
+ struct list_head *uf);
extern int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm,
unsigned long start, size_t len, struct list_head *uf,
bool unlock);
@@ -3216,8 +3382,7 @@ static inline void mm_populate(unsigned long addr, unsigned long len)
static inline void mm_populate(unsigned long addr, unsigned long len) {}
#endif
-/* These take the mm semaphore themselves */
-extern int __must_check vm_brk(unsigned long, unsigned long);
+/* This takes the mm semaphore itself */
extern int __must_check vm_brk_flags(unsigned long, unsigned long, unsigned long);
extern int vm_munmap(unsigned long, size_t);
extern unsigned long __must_check vm_mmap(struct file *, unsigned long,
@@ -3281,15 +3446,26 @@ struct vm_area_struct *vma_lookup(struct mm_struct *mm, unsigned long addr)
return mtree_load(&mm->mm_mt, addr);
}
+static inline unsigned long stack_guard_start_gap(struct vm_area_struct *vma)
+{
+ if (vma->vm_flags & VM_GROWSDOWN)
+ return stack_guard_gap;
+
+ /* See reasoning around the VM_SHADOW_STACK definition */
+ if (vma->vm_flags & VM_SHADOW_STACK)
+ return PAGE_SIZE;
+
+ return 0;
+}
+
static inline unsigned long vm_start_gap(struct vm_area_struct *vma)
{
+ unsigned long gap = stack_guard_start_gap(vma);
unsigned long vm_start = vma->vm_start;
- if (vma->vm_flags & VM_GROWSDOWN) {
- vm_start -= stack_guard_gap;
- if (vm_start > vma->vm_start)
- vm_start = 0;
- }
+ vm_start -= gap;
+ if (vm_start > vma->vm_start)
+ vm_start = 0;
return vm_start;
}
@@ -3403,6 +3579,24 @@ static inline vm_fault_t vmf_error(int err)
return VM_FAULT_SIGBUS;
}
+/*
+ * Convert errno to return value for ->page_mkwrite() calls.
+ *
+ * This should eventually be merged with vmf_error() above, but will need a
+ * careful audit of all vmf_error() callers.
+ */
+static inline vm_fault_t vmf_fs_error(int err)
+{
+ if (err == 0)
+ return VM_FAULT_LOCKED;
+ if (err == -EFAULT || err == -EAGAIN)
+ return VM_FAULT_NOPAGE;
+ if (err == -ENOMEM)
+ return VM_FAULT_OOM;
+ /* -ENOSPC, -EDQUOT, -EIO ... */
+ return VM_FAULT_SIGBUS;
+}
+
struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
unsigned int foll_flags);
@@ -3509,8 +3703,8 @@ static inline bool debug_pagealloc_enabled(void)
}
/*
- * For use in fast paths after init_debug_pagealloc() has run, or when a
- * false negative result is not harmful when called too early.
+ * For use in fast paths after mem_debugging_and_hardening_init() has run,
+ * or when a false negative result is not harmful when called too early.
*/
static inline bool debug_pagealloc_enabled_static(void)
{
@@ -3665,13 +3859,32 @@ void vmemmap_free(unsigned long start, unsigned long end,
struct vmem_altmap *altmap);
#endif
-#ifdef CONFIG_ARCH_WANT_OPTIMIZE_VMEMMAP
-static inline bool vmemmap_can_optimize(struct vmem_altmap *altmap,
- struct dev_pagemap *pgmap)
+#define VMEMMAP_RESERVE_NR 2
+#ifdef CONFIG_ARCH_WANT_OPTIMIZE_DAX_VMEMMAP
+static inline bool __vmemmap_can_optimize(struct vmem_altmap *altmap,
+ struct dev_pagemap *pgmap)
{
- return is_power_of_2(sizeof(struct page)) &&
- pgmap && (pgmap_vmemmap_nr(pgmap) > 1) && !altmap;
+ unsigned long nr_pages;
+ unsigned long nr_vmemmap_pages;
+
+ if (!pgmap || !is_power_of_2(sizeof(struct page)))
+ return false;
+
+ nr_pages = pgmap_vmemmap_nr(pgmap);
+ nr_vmemmap_pages = ((nr_pages * sizeof(struct page)) >> PAGE_SHIFT);
+ /*
+ * For vmemmap optimization with DAX we need minimum 2 vmemmap
+ * pages. See layout diagram in Documentation/mm/vmemmap_dedup.rst
+ */
+ return !altmap && (nr_vmemmap_pages > VMEMMAP_RESERVE_NR);
}
+/*
+ * If we don't have an architecture override, use the generic rule
+ */
+#ifndef vmemmap_can_optimize
+#define vmemmap_can_optimize __vmemmap_can_optimize
+#endif
+
#else
static inline bool vmemmap_can_optimize(struct vmem_altmap *altmap,
struct dev_pagemap *pgmap)
@@ -3860,25 +4073,26 @@ static inline void mem_dump_obj(void *object) {}
#endif
/**
- * seal_check_future_write - Check for F_SEAL_FUTURE_WRITE flag and handle it
+ * seal_check_write - Check for F_SEAL_WRITE or F_SEAL_FUTURE_WRITE flags and
+ * handle them.
* @seals: the seals to check
* @vma: the vma to operate on
*
- * Check whether F_SEAL_FUTURE_WRITE is set; if so, do proper check/handling on
- * the vma flags. Return 0 if check pass, or <0 for errors.
+ * Check whether F_SEAL_WRITE or F_SEAL_FUTURE_WRITE are set; if so, do proper
+ * check/handling on the vma flags. Return 0 if check pass, or <0 for errors.
*/
-static inline int seal_check_future_write(int seals, struct vm_area_struct *vma)
+static inline int seal_check_write(int seals, struct vm_area_struct *vma)
{
- if (seals & F_SEAL_FUTURE_WRITE) {
+ if (seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE)) {
/*
* New PROT_WRITE and MAP_SHARED mmaps are not allowed when
- * "future write" seal active.
+ * write seals are active.
*/
if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_WRITE))
return -EPERM;
/*
- * Since an F_SEAL_FUTURE_WRITE sealed memfd can be mapped as
+ * Since an F_SEAL_[FUTURE_]WRITE sealed memfd can be mapped as
* MAP_SHARED and read-only, take care to not allow mprotect to
* revert protections on such mappings. Do this only for shared
* mappings. For private mappings, don't need to mask
@@ -3922,4 +4136,11 @@ static inline void accept_memory(phys_addr_t start, phys_addr_t end)
#endif
+static inline bool pfn_is_unaccepted_memory(unsigned long pfn)
+{
+ phys_addr_t paddr = pfn << PAGE_SHIFT;
+
+ return range_contains_unaccepted_memory(paddr, paddr + PAGE_SIZE);
+}
+
#endif /* _LINUX_MM_H */