aboutsummaryrefslogtreecommitdiff
path: root/fs/userfaultfd.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/userfaultfd.c')
-rw-r--r--fs/userfaultfd.c238
1 files changed, 139 insertions, 99 deletions
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 7cecd49e078b..e8af40b05549 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -49,7 +49,7 @@ static struct ctl_table vm_userfaultfd_table[] = {
};
#endif
-static struct kmem_cache *userfaultfd_ctx_cachep __read_mostly;
+static struct kmem_cache *userfaultfd_ctx_cachep __ro_after_init;
/*
* Start with fault_pending_wqh and fault_wqh so they're more likely
@@ -123,6 +123,11 @@ static bool userfaultfd_is_initialized(struct userfaultfd_ctx *ctx)
return ctx->features & UFFD_FEATURE_INITIALIZED;
}
+static bool userfaultfd_wp_async_ctx(struct userfaultfd_ctx *ctx)
+{
+ return ctx && (ctx->features & UFFD_FEATURE_WP_ASYNC);
+}
+
/*
* Whether WP_UNPOPULATED is enabled on the uffd context. It is only
* meaningful when userfaultfd_wp()==true on the vma and when it's
@@ -277,17 +282,16 @@ static inline struct uffd_msg userfault_msg(unsigned long address,
* hugepmd ranges.
*/
static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx,
- struct vm_area_struct *vma,
- unsigned long address,
- unsigned long flags,
- unsigned long reason)
+ struct vm_fault *vmf,
+ unsigned long reason)
{
+ struct vm_area_struct *vma = vmf->vma;
pte_t *ptep, pte;
bool ret = true;
- mmap_assert_locked(ctx->mm);
+ assert_fault_locked(vmf);
- ptep = hugetlb_walk(vma, address, vma_mmu_pagesize(vma));
+ ptep = hugetlb_walk(vma, vmf->address, vma_mmu_pagesize(vma));
if (!ptep)
goto out;
@@ -308,10 +312,8 @@ out:
}
#else
static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx,
- struct vm_area_struct *vma,
- unsigned long address,
- unsigned long flags,
- unsigned long reason)
+ struct vm_fault *vmf,
+ unsigned long reason)
{
return false; /* should never get here */
}
@@ -325,11 +327,11 @@ static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx,
* threads.
*/
static inline bool userfaultfd_must_wait(struct userfaultfd_ctx *ctx,
- unsigned long address,
- unsigned long flags,
+ struct vm_fault *vmf,
unsigned long reason)
{
struct mm_struct *mm = ctx->mm;
+ unsigned long address = vmf->address;
pgd_t *pgd;
p4d_t *p4d;
pud_t *pud;
@@ -338,7 +340,7 @@ static inline bool userfaultfd_must_wait(struct userfaultfd_ctx *ctx,
pte_t ptent;
bool ret = true;
- mmap_assert_locked(mm);
+ assert_fault_locked(vmf);
pgd = pgd_offset(mm, address);
if (!pgd_present(*pgd))
@@ -427,20 +429,16 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason)
*
* We also don't do userfault handling during
* coredumping. hugetlbfs has the special
- * follow_hugetlb_page() to skip missing pages in the
+ * hugetlb_follow_page_mask() to skip missing pages in the
* FOLL_DUMP case, anon memory also checks for FOLL_DUMP with
* the no_page_table() helper in follow_page_mask(), but the
* shmem_vm_ops->fault method is invoked even during
- * coredumping without mmap_lock and it ends up here.
+ * coredumping and it ends up here.
*/
if (current->flags & (PF_EXITING|PF_DUMPCORE))
goto out;
- /*
- * Coredumping runs without mmap_lock so we can only check that
- * the mmap_lock is held, if PF_DUMPCORE was not set.
- */
- mmap_assert_locked(mm);
+ assert_fault_locked(vmf);
ctx = vma->vm_userfaultfd_ctx.ctx;
if (!ctx)
@@ -556,15 +554,12 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason)
spin_unlock_irq(&ctx->fault_pending_wqh.lock);
if (!is_vm_hugetlb_page(vma))
- must_wait = userfaultfd_must_wait(ctx, vmf->address, vmf->flags,
- reason);
+ must_wait = userfaultfd_must_wait(ctx, vmf, reason);
else
- must_wait = userfaultfd_huge_must_wait(ctx, vma,
- vmf->address,
- vmf->flags, reason);
+ must_wait = userfaultfd_huge_must_wait(ctx, vmf, reason);
if (is_vm_hugetlb_page(vma))
hugetlb_vma_unlock_read(vma);
- mmap_read_unlock(mm);
+ release_fault_lock(vmf);
if (likely(must_wait && !READ_ONCE(ctx->released))) {
wake_up_poll(&ctx->fd_wqh, EPOLLIN);
@@ -667,6 +662,7 @@ static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx,
mmap_write_lock(mm);
for_each_vma(vmi, vma) {
if (vma->vm_userfaultfd_ctx.ctx == release_new_ctx) {
+ vma_start_write(vma);
vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
userfaultfd_set_vm_flags(vma,
vma->vm_flags & ~__VM_UFFD_FLAGS);
@@ -702,6 +698,7 @@ int dup_userfaultfd(struct vm_area_struct *vma, struct list_head *fcs)
octx = vma->vm_userfaultfd_ctx.ctx;
if (!octx || !(octx->features & UFFD_FEATURE_EVENT_FORK)) {
+ vma_start_write(vma);
vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
userfaultfd_set_vm_flags(vma, vma->vm_flags & ~__VM_UFFD_FLAGS);
return 0;
@@ -783,6 +780,7 @@ void mremap_userfaultfd_prep(struct vm_area_struct *vma,
atomic_inc(&ctx->mmap_changing);
} else {
/* Drop uffd context if remap feature not enabled */
+ vma_start_write(vma);
vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
userfaultfd_set_vm_flags(vma, vma->vm_flags & ~__VM_UFFD_FLAGS);
}
@@ -929,19 +927,15 @@ static int userfaultfd_release(struct inode *inode, struct file *file)
continue;
}
new_flags = vma->vm_flags & ~__VM_UFFD_FLAGS;
- prev = vma_merge(&vmi, mm, prev, vma->vm_start, vma->vm_end,
- new_flags, vma->anon_vma,
- vma->vm_file, vma->vm_pgoff,
- vma_policy(vma),
- NULL_VM_UFFD_CTX, anon_vma_name(vma));
- if (prev) {
- vma = prev;
- } else {
- prev = vma;
- }
+ vma = vma_modify_flags_uffd(&vmi, prev, vma, vma->vm_start,
+ vma->vm_end, new_flags,
+ NULL_VM_UFFD_CTX);
+ vma_start_write(vma);
userfaultfd_set_vm_flags(vma, new_flags);
vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
+
+ prev = vma;
}
mmap_write_unlock(mm);
mmput(mm);
@@ -1289,13 +1283,11 @@ static __always_inline void wake_userfault(struct userfaultfd_ctx *ctx,
__wake_userfault(ctx, range);
}
-static __always_inline int validate_range(struct mm_struct *mm,
- __u64 start, __u64 len)
+static __always_inline int validate_unaligned_range(
+ struct mm_struct *mm, __u64 start, __u64 len)
{
__u64 task_size = mm->task_size;
- if (start & ~PAGE_MASK)
- return -EINVAL;
if (len & ~PAGE_MASK)
return -EINVAL;
if (!len)
@@ -1306,9 +1298,20 @@ static __always_inline int validate_range(struct mm_struct *mm,
return -EINVAL;
if (len > task_size - start)
return -EINVAL;
+ if (start + len <= start)
+ return -EINVAL;
return 0;
}
+static __always_inline int validate_range(struct mm_struct *mm,
+ __u64 start, __u64 len)
+{
+ if (start & ~PAGE_MASK)
+ return -EINVAL;
+
+ return validate_unaligned_range(mm, start, len);
+}
+
static int userfaultfd_register(struct userfaultfd_ctx *ctx,
unsigned long arg)
{
@@ -1322,7 +1325,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
bool basic_ioctls;
unsigned long start, end, vma_end;
struct vma_iterator vmi;
- pgoff_t pgoff;
+ bool wp_async = userfaultfd_wp_async_ctx(ctx);
user_uffdio_register = (struct uffdio_register __user *) arg;
@@ -1396,7 +1399,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
/* check not compatible vmas */
ret = -EINVAL;
- if (!vma_can_userfault(cur, vm_flags))
+ if (!vma_can_userfault(cur, vm_flags, wp_async))
goto out_unlock;
/*
@@ -1457,7 +1460,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
for_each_vma_range(vmi, vma, end) {
cond_resched();
- BUG_ON(!vma_can_userfault(vma, vm_flags));
+ BUG_ON(!vma_can_userfault(vma, vm_flags, wp_async));
BUG_ON(vma->vm_userfaultfd_ctx.ctx &&
vma->vm_userfaultfd_ctx.ctx != ctx);
WARN_ON(!(vma->vm_flags & VM_MAYWRITE));
@@ -1475,33 +1478,20 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
vma_end = min(end, vma->vm_end);
new_flags = (vma->vm_flags & ~__VM_UFFD_FLAGS) | vm_flags;
- pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
- prev = vma_merge(&vmi, mm, prev, start, vma_end, new_flags,
- vma->anon_vma, vma->vm_file, pgoff,
- vma_policy(vma),
- ((struct vm_userfaultfd_ctx){ ctx }),
- anon_vma_name(vma));
- if (prev) {
- /* vma_merge() invalidated the mas */
- vma = prev;
- goto next;
- }
- if (vma->vm_start < start) {
- ret = split_vma(&vmi, vma, start, 1);
- if (ret)
- break;
- }
- if (vma->vm_end > end) {
- ret = split_vma(&vmi, vma, end, 0);
- if (ret)
- break;
+ vma = vma_modify_flags_uffd(&vmi, prev, vma, start, vma_end,
+ new_flags,
+ (struct vm_userfaultfd_ctx){ctx});
+ if (IS_ERR(vma)) {
+ ret = PTR_ERR(vma);
+ break;
}
- next:
+
/*
* In the vma_merge() successful mprotect-like case 8:
* the next vma was merged into the current one and
* the current one has not been updated yet.
*/
+ vma_start_write(vma);
userfaultfd_set_vm_flags(vma, new_flags);
vma->vm_userfaultfd_ctx.ctx = ctx;
@@ -1557,7 +1547,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
unsigned long start, end, vma_end;
const void __user *buf = (void __user *)arg;
struct vma_iterator vmi;
- pgoff_t pgoff;
+ bool wp_async = userfaultfd_wp_async_ctx(ctx);
ret = -EFAULT;
if (copy_from_user(&uffdio_unregister, buf, sizeof(uffdio_unregister)))
@@ -1611,7 +1601,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
* provides for more strict behavior to notice
* unregistration errors.
*/
- if (!vma_can_userfault(cur, cur->vm_flags))
+ if (!vma_can_userfault(cur, cur->vm_flags, wp_async))
goto out_unlock;
found = true;
@@ -1627,7 +1617,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
for_each_vma_range(vmi, vma, end) {
cond_resched();
- BUG_ON(!vma_can_userfault(vma, vma->vm_flags));
+ BUG_ON(!vma_can_userfault(vma, vma->vm_flags, wp_async));
/*
* Nothing to do: this vma is already registered into this
@@ -1660,31 +1650,19 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
uffd_wp_range(vma, start, vma_end - start, false);
new_flags = vma->vm_flags & ~__VM_UFFD_FLAGS;
- pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
- prev = vma_merge(&vmi, mm, prev, start, vma_end, new_flags,
- vma->anon_vma, vma->vm_file, pgoff,
- vma_policy(vma),
- NULL_VM_UFFD_CTX, anon_vma_name(vma));
- if (prev) {
- vma = prev;
- goto next;
- }
- if (vma->vm_start < start) {
- ret = split_vma(&vmi, vma, start, 1);
- if (ret)
- break;
- }
- if (vma->vm_end > end) {
- ret = split_vma(&vmi, vma, end, 0);
- if (ret)
- break;
+ vma = vma_modify_flags_uffd(&vmi, prev, vma, start, vma_end,
+ new_flags, NULL_VM_UFFD_CTX);
+ if (IS_ERR(vma)) {
+ ret = PTR_ERR(vma);
+ break;
}
- next:
+
/*
* In the vma_merge() successful mprotect-like case 8:
* the next vma was merged into the current one and
* the current one has not been updated yet.
*/
+ vma_start_write(vma);
userfaultfd_set_vm_flags(vma, new_flags);
vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
@@ -1757,17 +1735,15 @@ static int userfaultfd_copy(struct userfaultfd_ctx *ctx,
sizeof(uffdio_copy)-sizeof(__s64)))
goto out;
+ ret = validate_unaligned_range(ctx->mm, uffdio_copy.src,
+ uffdio_copy.len);
+ if (ret)
+ goto out;
ret = validate_range(ctx->mm, uffdio_copy.dst, uffdio_copy.len);
if (ret)
goto out;
- /*
- * double check for wraparound just in case. copy_from_user()
- * will later check uffdio_copy.src + uffdio_copy.len to fit
- * in the userland range.
- */
+
ret = -EINVAL;
- if (uffdio_copy.src + uffdio_copy.len <= uffdio_copy.src)
- goto out;
if (uffdio_copy.mode & ~(UFFDIO_COPY_MODE_DONTWAKE|UFFDIO_COPY_MODE_WP))
goto out;
if (uffdio_copy.mode & UFFDIO_COPY_MODE_WP)
@@ -1927,11 +1903,6 @@ static int userfaultfd_continue(struct userfaultfd_ctx *ctx, unsigned long arg)
goto out;
ret = -EINVAL;
- /* double check for wraparound just in case. */
- if (uffdio_continue.range.start + uffdio_continue.range.len <=
- uffdio_continue.range.start) {
- goto out;
- }
if (uffdio_continue.mode & ~(UFFDIO_CONTINUE_MODE_DONTWAKE |
UFFDIO_CONTINUE_MODE_WP))
goto out;
@@ -1965,6 +1936,66 @@ out:
return ret;
}
+static inline int userfaultfd_poison(struct userfaultfd_ctx *ctx, unsigned long arg)
+{
+ __s64 ret;
+ struct uffdio_poison uffdio_poison;
+ struct uffdio_poison __user *user_uffdio_poison;
+ struct userfaultfd_wake_range range;
+
+ user_uffdio_poison = (struct uffdio_poison __user *)arg;
+
+ ret = -EAGAIN;
+ if (atomic_read(&ctx->mmap_changing))
+ goto out;
+
+ ret = -EFAULT;
+ if (copy_from_user(&uffdio_poison, user_uffdio_poison,
+ /* don't copy the output fields */
+ sizeof(uffdio_poison) - (sizeof(__s64))))
+ goto out;
+
+ ret = validate_range(ctx->mm, uffdio_poison.range.start,
+ uffdio_poison.range.len);
+ if (ret)
+ goto out;
+
+ ret = -EINVAL;
+ if (uffdio_poison.mode & ~UFFDIO_POISON_MODE_DONTWAKE)
+ goto out;
+
+ if (mmget_not_zero(ctx->mm)) {
+ ret = mfill_atomic_poison(ctx->mm, uffdio_poison.range.start,
+ uffdio_poison.range.len,
+ &ctx->mmap_changing, 0);
+ mmput(ctx->mm);
+ } else {
+ return -ESRCH;
+ }
+
+ if (unlikely(put_user(ret, &user_uffdio_poison->updated)))
+ return -EFAULT;
+ if (ret < 0)
+ goto out;
+
+ /* len == 0 would wake all */
+ BUG_ON(!ret);
+ range.len = ret;
+ if (!(uffdio_poison.mode & UFFDIO_POISON_MODE_DONTWAKE)) {
+ range.start = uffdio_poison.range.start;
+ wake_userfault(ctx, &range);
+ }
+ ret = range.len == uffdio_poison.range.len ? 0 : -EAGAIN;
+
+out:
+ return ret;
+}
+
+bool userfaultfd_wp_async(struct vm_area_struct *vma)
+{
+ return userfaultfd_wp_async_ctx(vma->vm_userfaultfd_ctx.ctx);
+}
+
static inline unsigned int uffd_ctx_features(__u64 user_features)
{
/*
@@ -1998,6 +2029,11 @@ static int userfaultfd_api(struct userfaultfd_ctx *ctx,
ret = -EPERM;
if ((features & UFFD_FEATURE_EVENT_FORK) && !capable(CAP_SYS_PTRACE))
goto err_out;
+
+ /* WP_ASYNC relies on WP_UNPOPULATED, choose it unconditionally */
+ if (features & UFFD_FEATURE_WP_ASYNC)
+ features |= UFFD_FEATURE_WP_UNPOPULATED;
+
/* report all available features and ioctls to userland */
uffdio_api.features = UFFD_API_FEATURES;
#ifndef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR
@@ -2010,6 +2046,7 @@ static int userfaultfd_api(struct userfaultfd_ctx *ctx,
#ifndef CONFIG_PTE_MARKER_UFFD_WP
uffdio_api.features &= ~UFFD_FEATURE_WP_HUGETLBFS_SHMEM;
uffdio_api.features &= ~UFFD_FEATURE_WP_UNPOPULATED;
+ uffdio_api.features &= ~UFFD_FEATURE_WP_ASYNC;
#endif
uffdio_api.ioctls = UFFD_API_IOCTLS;
ret = -EFAULT;
@@ -2066,6 +2103,9 @@ static long userfaultfd_ioctl(struct file *file, unsigned cmd,
case UFFDIO_CONTINUE:
ret = userfaultfd_continue(ctx, arg);
break;
+ case UFFDIO_POISON:
+ ret = userfaultfd_poison(ctx, arg);
+ break;
}
return ret;
}