diff options
Diffstat (limited to 'kernel')
55 files changed, 1017 insertions, 690 deletions
diff --git a/kernel/audit_fsnotify.c b/kernel/audit_fsnotify.c index 6432a37ac1c9..c565fbf66ac8 100644 --- a/kernel/audit_fsnotify.c +++ b/kernel/audit_fsnotify.c @@ -102,6 +102,7 @@ struct audit_fsnotify_mark *audit_alloc_mark(struct audit_krule *krule, char *pa ret = fsnotify_add_inode_mark(&audit_mark->mark, inode, 0); if (ret < 0) { + audit_mark->path = NULL; fsnotify_put_mark(&audit_mark->mark); audit_mark = ERR_PTR(ret); } diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 280b4720c7a0..9f8c05228d6d 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1940,6 +1940,7 @@ void __audit_uring_exit(int success, long code) goto out; } + audit_return_fixup(ctx, success, code); if (ctx->context == AUDIT_CTX_SYSCALL) { /* * NOTE: See the note in __audit_uring_entry() about the case @@ -1981,7 +1982,6 @@ void __audit_uring_exit(int success, long code) audit_filter_inodes(current, ctx); if (ctx->current_state != AUDIT_STATE_RECORD) goto out; - audit_return_fixup(ctx, success, code); audit_log_exit(); out: @@ -2065,13 +2065,13 @@ void __audit_syscall_exit(int success, long return_code) if (!list_empty(&context->killed_trees)) audit_kill_trees(context); + audit_return_fixup(context, success, return_code); /* run through both filters to ensure we set the filterkey properly */ audit_filter_syscall(current, context); audit_filter_inodes(current, context); if (context->current_state != AUDIT_STATE_RECORD) goto out; - audit_return_fixup(context, success, return_code); audit_log_exit(); out: diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c index fa71d58b7ded..761998fda762 100644 --- a/kernel/bpf/bpf_lsm.c +++ b/kernel/bpf/bpf_lsm.c @@ -335,6 +335,7 @@ BTF_ID(func, bpf_lsm_task_getsecid_obj) BTF_ID(func, bpf_lsm_task_prctl) BTF_ID(func, bpf_lsm_task_setscheduler) BTF_ID(func, bpf_lsm_task_to_inode) +BTF_ID(func, bpf_lsm_userns_create) BTF_SET_END(sleepable_lsm_hooks) bool bpf_lsm_is_sleepable_hook(u32 btf_id) diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 59b7eb60d5b4..4a400cd63731 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -921,8 +921,10 @@ static void purge_effective_progs(struct cgroup *cgrp, struct bpf_prog *prog, pos++; } } + + /* no link or prog match, skip the cgroup of this layer */ + continue; found: - BUG_ON(!cg); progs = rcu_dereference_protected( desc->bpf.effective[atype], lockdep_is_held(&cgroup_mutex)); diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index c1e10d088dbb..3d9eb3ae334c 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -971,7 +971,7 @@ pure_initcall(bpf_jit_charge_init); int bpf_jit_charge_modmem(u32 size) { - if (atomic_long_add_return(size, &bpf_jit_current) > bpf_jit_limit) { + if (atomic_long_add_return(size, &bpf_jit_current) > READ_ONCE(bpf_jit_limit)) { if (!bpf_capable()) { atomic_long_sub(size, &bpf_jit_current); return -EPERM; diff --git a/kernel/bpf/reuseport_array.c b/kernel/bpf/reuseport_array.c index 85fa9dbfa8bf..82c61612f382 100644 --- a/kernel/bpf/reuseport_array.c +++ b/kernel/bpf/reuseport_array.c @@ -24,7 +24,7 @@ void bpf_sk_reuseport_detach(struct sock *sk) struct sock __rcu **socks; write_lock_bh(&sk->sk_callback_lock); - socks = __rcu_dereference_sk_user_data_with_flags(sk, SK_USER_DATA_BPF); + socks = __locked_read_sk_user_data_with_flags(sk, SK_USER_DATA_BPF); if (socks) { WRITE_ONCE(sk->sk_user_data, NULL); /* diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index a4d40d98428a..27760627370d 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -5197,7 +5197,7 @@ syscall_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_sys_bpf: - return &bpf_sys_bpf_proto; + return !perfmon_capable() ? NULL : &bpf_sys_bpf_proto; case BPF_FUNC_btf_find_by_name_kind: return &bpf_btf_find_by_name_kind_proto; case BPF_FUNC_sys_close: diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 096fdac70165..3eadb14e090b 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -6066,6 +6066,9 @@ skip_type_check: return -EACCES; } meta->mem_size = reg->var_off.value; + err = mark_chain_precision(env, regno); + if (err) + return err; break; case ARG_PTR_TO_INT: case ARG_PTR_TO_LONG: @@ -7030,8 +7033,7 @@ record_func_key(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta, struct bpf_insn_aux_data *aux = &env->insn_aux_data[insn_idx]; struct bpf_reg_state *regs = cur_regs(env), *reg; struct bpf_map *map = meta->map_ptr; - struct tnum range; - u64 val; + u64 val, max; int err; if (func_id != BPF_FUNC_tail_call) @@ -7041,10 +7043,11 @@ record_func_key(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta, return -EINVAL; } - range = tnum_range(0, map->max_entries - 1); reg = ®s[BPF_REG_3]; + val = reg->var_off.value; + max = map->max_entries; - if (!register_is_const(reg) || !tnum_in(range, reg->var_off)) { + if (!(register_is_const(reg) && val < max)) { bpf_map_key_store(aux, BPF_MAP_KEY_POISON); return 0; } @@ -7052,8 +7055,6 @@ record_func_key(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta, err = mark_chain_precision(env, BPF_REG_3); if (err) return err; - - val = reg->var_off.value; if (bpf_map_key_unseen(aux)) bpf_map_key_store(aux, val); else if (!bpf_map_key_poisoned(aux) && diff --git a/kernel/cfi.c b/kernel/cfi.c index 2046276ee234..08caad776717 100644 --- a/kernel/cfi.c +++ b/kernel/cfi.c @@ -1,339 +1,101 @@ // SPDX-License-Identifier: GPL-2.0 /* - * Clang Control Flow Integrity (CFI) error and slowpath handling. + * Clang Control Flow Integrity (CFI) error handling. * - * Copyright (C) 2021 Google LLC + * Copyright (C) 2022 Google LLC */ -#include <linux/hardirq.h> -#include <linux/kallsyms.h> -#include <linux/module.h> -#include <linux/mutex.h> -#include <linux/printk.h> -#include <linux/ratelimit.h> -#include <linux/rcupdate.h> -#include <linux/vmalloc.h> -#include <asm/cacheflush.h> -#include <asm/set_memory.h> +#include <linux/cfi.h> -/* Compiler-defined handler names */ -#ifdef CONFIG_CFI_PERMISSIVE -#define cfi_failure_handler __ubsan_handle_cfi_check_fail -#else -#define cfi_failure_handler __ubsan_handle_cfi_check_fail_abort -#endif - -static inline void handle_cfi_failure(void *ptr) +enum bug_trap_type report_cfi_failure(struct pt_regs *regs, unsigned long addr, + unsigned long *target, u32 type) { - if (IS_ENABLED(CONFIG_CFI_PERMISSIVE)) - WARN_RATELIMIT(1, "CFI failure (target: %pS):\n", ptr); + if (target) + pr_err("CFI failure at %pS (target: %pS; expected type: 0x%08x)\n", + (void *)addr, (void *)*target, type); else - panic("CFI failure (target: %pS)\n", ptr); -} - -#ifdef CONFIG_MODULES -#ifdef CONFIG_CFI_CLANG_SHADOW -/* - * Index type. A 16-bit index can address at most (2^16)-2 pages (taking - * into account SHADOW_INVALID), i.e. ~256M with 4k pages. - */ -typedef u16 shadow_t; -#define SHADOW_INVALID ((shadow_t)~0UL) - -struct cfi_shadow { - /* Page index for the beginning of the shadow */ - unsigned long base; - /* An array of __cfi_check locations (as indices to the shadow) */ - shadow_t shadow[1]; -} __packed; - -/* - * The shadow covers ~128M from the beginning of the module region. If - * the region is larger, we fall back to __module_address for the rest. - */ -#define __SHADOW_RANGE (_UL(SZ_128M) >> PAGE_SHIFT) - -/* The in-memory size of struct cfi_shadow, always at least one page */ -#define __SHADOW_PAGES ((__SHADOW_RANGE * sizeof(shadow_t)) >> PAGE_SHIFT) -#define SHADOW_PAGES max(1UL, __SHADOW_PAGES) -#define SHADOW_SIZE (SHADOW_PAGES << PAGE_SHIFT) - -/* The actual size of the shadow array, minus metadata */ -#define SHADOW_ARR_SIZE (SHADOW_SIZE - offsetof(struct cfi_shadow, shadow)) -#define SHADOW_ARR_SLOTS (SHADOW_ARR_SIZE / sizeof(shadow_t)) - -static DEFINE_MUTEX(shadow_update_lock); -static struct cfi_shadow __rcu *cfi_shadow __read_mostly; + pr_err("CFI failure at %pS (no target information)\n", + (void *)addr); -/* Returns the index in the shadow for the given address */ -static inline int ptr_to_shadow(const struct cfi_shadow *s, unsigned long ptr) -{ - unsigned long index; - unsigned long page = ptr >> PAGE_SHIFT; - - if (unlikely(page < s->base)) - return -1; /* Outside of module area */ - - index = page - s->base; - - if (index >= SHADOW_ARR_SLOTS) - return -1; /* Cannot be addressed with shadow */ - - return (int)index; -} - -/* Returns the page address for an index in the shadow */ -static inline unsigned long shadow_to_ptr(const struct cfi_shadow *s, - int index) -{ - if (unlikely(index < 0 || index >= SHADOW_ARR_SLOTS)) - return 0; - - return (s->base + index) << PAGE_SHIFT; -} - -/* Returns the __cfi_check function address for the given shadow location */ -static inline unsigned long shadow_to_check_fn(const struct cfi_shadow *s, - int index) -{ - if (unlikely(index < 0 || index >= SHADOW_ARR_SLOTS)) - return 0; - - if (unlikely(s->shadow[index] == SHADOW_INVALID)) - return 0; - - /* __cfi_check is always page aligned */ - return (s->base + s->shadow[index]) << PAGE_SHIFT; -} - -static void prepare_next_shadow(const struct cfi_shadow __rcu *prev, - struct cfi_shadow *next) -{ - int i, index, check; - - /* Mark everything invalid */ - memset(next->shadow, 0xFF, SHADOW_ARR_SIZE); - - if (!prev) - return; /* No previous shadow */ - - /* If the base address didn't change, an update is not needed */ - if (prev->base == next->base) { - memcpy(next->shadow, prev->shadow, SHADOW_ARR_SIZE); - return; - } - - /* Convert the previous shadow to the new address range */ - for (i = 0; i < SHADOW_ARR_SLOTS; ++i) { - if (prev->shadow[i] == SHADOW_INVALID) - continue; - - index = ptr_to_shadow(next, shadow_to_ptr(prev, i)); - if (index < 0) - continue; - - check = ptr_to_shadow(next, - shadow_to_check_fn(prev, prev->shadow[i])); - if (check < 0) - continue; - - next->shadow[index] = (shadow_t)check; - } -} - -static void add_module_to_shadow(struct cfi_shadow *s, struct module *mod, - unsigned long min_addr, unsigned long max_addr) -{ - int check_index; - unsigned long check = (unsigned long)mod->cfi_check; - unsigned long ptr; - - if (unlikely(!PAGE_ALIGNED(check))) { - pr_warn("cfi: not using shadow for module %s\n", mod->name); - return; + if (IS_ENABLED(CONFIG_CFI_PERMISSIVE)) { + __warn(NULL, 0, (void *)addr, 0, regs, NULL); + return BUG_TRAP_TYPE_WARN; } - check_index = ptr_to_shadow(s, check); - if (check_index < 0) - return; /* Module not addressable with shadow */ - - /* For each page, store the check function index in the shadow */ - for (ptr = min_addr; ptr <= max_addr; ptr += PAGE_SIZE) { - int index = ptr_to_shadow(s, ptr); - - if (index >= 0) { - /* Each page must only contain one module */ - WARN_ON_ONCE(s->shadow[index] != SHADOW_INVALID); - s->shadow[index] = (shadow_t)check_index; - } - } + return BUG_TRAP_TYPE_BUG; } -static void remove_module_from_shadow(struct cfi_shadow *s, struct module *mod, - unsigned long min_addr, unsigned long max_addr) +#ifdef CONFIG_ARCH_USES_CFI_TRAPS +static inline unsigned long trap_address(s32 *p) { - unsigned long ptr; - - for (ptr = min_addr; ptr <= max_addr; ptr += PAGE_SIZE) { - int index = ptr_to_shadow(s, ptr); - - if (index >= 0) - s->shadow[index] = SHADOW_INVALID; - } + return (unsigned long)((long)p + (long)*p); } -typedef void (*update_shadow_fn)(struct cfi_shadow *, struct module *, - unsigned long min_addr, unsigned long max_addr); - -static void update_shadow(struct module *mod, unsigned long base_addr, - update_shadow_fn fn) +static bool is_trap(unsigned long addr, s32 *start, s32 *end) { - struct cfi_shadow *prev; - struct cfi_shadow *next; - unsigned long min_addr, max_addr; - - next = vmalloc(SHADOW_SIZE); - - mutex_lock(&shadow_update_lock); - prev = rcu_dereference_protected(cfi_shadow, - mutex_is_locked(&shadow_update_lock)); - - if (next) { - next->base = base_addr >> PAGE_SHIFT; - prepare_next_shadow(prev, next); + s32 *p; - min_addr = (unsigned long)mod->core_layout.base; - max_addr = min_addr + mod->core_layout.text_size; - fn(next, mod, min_addr & PAGE_MASK, max_addr & PAGE_MASK); - - set_memory_ro((unsigned long)next, SHADOW_PAGES); - } - - rcu_assign_pointer(cfi_shadow, next); - mutex_unlock(&shadow_update_lock); - synchronize_rcu(); - - if (prev) { - set_memory_rw((unsigned long)prev, SHADOW_PAGES); - vfree(prev); + for (p = start; p < end; ++p) { + if (trap_address(p) == addr) + return true; } -} -void cfi_module_add(struct module *mod, unsigned long base_addr) -{ - update_shadow(mod, base_addr, add_module_to_shadow); + return false; } -void cfi_module_remove(struct module *mod, unsigned long base_addr) -{ - update_shadow(mod, base_addr, remove_module_from_shadow); -} - -static inline cfi_check_fn ptr_to_check_fn(const struct cfi_shadow __rcu *s, - unsigned long ptr) +#ifdef CONFIG_MODULES +/* Populates `kcfi_trap(_end)?` fields in `struct module`. */ +void module_cfi_finalize(const Elf_Ehdr *hdr, const Elf_Shdr *sechdrs, + struct module *mod) { - int index; - - if (unlikely(!s)) - return NULL; /* No shadow available */ - - index = ptr_to_shadow(s, ptr); - if (index < 0) - return NULL; /* Cannot be addressed with shadow */ + char *secstrings; + unsigned int i; - return (cfi_check_fn)shadow_to_check_fn(s, index); -} - -static inline cfi_check_fn find_shadow_check_fn(unsigned long ptr) -{ - cfi_check_fn fn; + mod->kcfi_traps = NULL; + mod->kcfi_traps_end = NULL; - rcu_read_lock_sched_notrace(); - fn = ptr_to_check_fn(rcu_dereference_sched(cfi_shadow), ptr); - rcu_read_unlock_sched_notrace(); + secstrings = (char *)hdr + sechdrs[hdr->e_shstrndx].sh_offset; - return fn; -} - -#else /* !CONFIG_CFI_CLANG_SHADOW */ + for (i = 1; i < hdr->e_shnum; i++) { + if (strcmp(secstrings + sechdrs[i].sh_name, "__kcfi_traps")) + continue; -static inline cfi_check_fn find_shadow_check_fn(unsigned long ptr) -{ - return NULL; + mod->kcfi_traps = (s32 *)sechdrs[i].sh_addr; + mod->kcfi_traps_end = (s32 *)(sechdrs[i].sh_addr + sechdrs[i].sh_size); + break; + } } -#endif /* CONFIG_CFI_CLANG_SHADOW */ - -static inline cfi_check_fn find_module_check_fn(unsigned long ptr) +static bool is_module_cfi_trap(unsigned long addr) { - cfi_check_fn fn = NULL; struct module *mod; + bool found = false; rcu_read_lock_sched_notrace(); - mod = __module_address(ptr); - if (mod) - fn = mod->cfi_check; - rcu_read_unlock_sched_notrace(); - - return fn; -} - -static inline cfi_check_fn find_check_fn(unsigned long ptr) -{ - cfi_check_fn fn = NULL; - unsigned long flags; - bool rcu_idle; - - if (is_kernel_text(ptr)) - return __cfi_check; - /* - * Indirect call checks can happen when RCU is not watching. Both - * the shadow and __module_address use RCU, so we need to wake it - * up if necessary. - */ - rcu_idle = !rcu_is_watching(); - if (rcu_idle) { - local_irq_save(flags); - ct_irq_enter(); - } - - if (IS_ENABLED(CONFIG_CFI_CLANG_SHADOW)) - fn = find_shadow_check_fn(ptr); - if (!fn) - fn = find_module_check_fn(ptr); + mod = __module_address(addr); + if (mod) + found = is_trap(addr, mod->kcfi_traps, mod->kcfi_traps_end); - if (rcu_idle) { - ct_irq_exit(); - local_irq_restore(flags); - } + rcu_read_unlock_sched_notrace(); - return fn; + return found; } - -void __cfi_slowpath_diag(uint64_t id, void *ptr, void *diag) +#else /* CONFIG_MODULES */ +static inline bool is_module_cfi_trap(unsigned long addr) { - cfi_check_fn fn = find_check_fn((unsigned long)ptr); - - if (likely(fn)) - fn(id, ptr, diag); - else /* Don't allow unchecked modules */ - handle_cfi_failure(ptr); + return false; } -EXPORT_SYMBOL(__cfi_slowpath_diag); +#endif /* CONFIG_MODULES */ -#else /* !CONFIG_MODULES */ +extern s32 __start___kcfi_traps[]; +extern s32 __stop___kcfi_traps[]; -void __cfi_slowpath_diag(uint64_t id, void *ptr, void *diag) +bool is_cfi_trap(unsigned long addr) { - handle_cfi_failure(ptr); /* No modules */ -} -EXPORT_SYMBOL(__cfi_slowpath_diag); + if (is_trap(addr, __start___kcfi_traps, __stop___kcfi_traps)) + return true; -#endif /* CONFIG_MODULES */ - -void cfi_failure_handler(void *data, void *ptr, void *vtable) -{ - handle_cfi_failure(ptr); + return is_module_cfi_trap(addr); } -EXPORT_SYMBOL(cfi_failure_handler); +#endif /* CONFIG_ARCH_USES_CFI_TRAPS */ diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index 2ade21b54dc4..ff6a8099eb2a 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -59,6 +59,7 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) int retval = 0; mutex_lock(&cgroup_mutex); + cpus_read_lock(); percpu_down_write(&cgroup_threadgroup_rwsem); for_each_root(root) { struct cgroup *from_cgrp; @@ -72,6 +73,7 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) break; } percpu_up_write(&cgroup_threadgroup_rwsem); + cpus_read_unlock(); mutex_unlock(&cgroup_mutex); return retval; diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index ffaccd6373f1..5f2090d051ac 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -1820,6 +1820,7 @@ int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask) if (ss->css_rstat_flush) { list_del_rcu(&css->rstat_css_node); + synchronize_rcu(); list_add_rcu(&css->rstat_css_node, &dcgrp->rstat_css_list); } @@ -2370,6 +2371,47 @@ int task_cgroup_path(struct task_struct *task, char *buf, size_t buflen) EXPORT_SYMBOL_GPL(task_cgroup_path); /** + * cgroup_attach_lock - Lock for ->attach() + * @lock_threadgroup: whether to down_write cgroup_threadgroup_rwsem + * + * cgroup migration sometimes needs to stabilize threadgroups against forks and + * exits by write-locking cgroup_threadgroup_rwsem. However, some ->attach() + * implementations (e.g. cpuset), also need to disable CPU hotplug. + * Unfortunately, letting ->attach() operations acquire cpus_read_lock() can + * lead to deadlocks. + * + * Bringing up a CPU may involve creating and destroying tasks which requires + * read-locking threadgroup_rwsem, so threadgroup_rwsem nests inside + * cpus_read_lock(). If we call an ->attach() which acquires the cpus lock while + * write-locking threadgroup_rwsem, the locking order is reversed and we end up + * waiting for an on-going CPU hotplug operation which in turn is waiting for + * the threadgroup_rwsem to be released to create new tasks. For more details: + * + * http://lkml.kernel.org/r/20220711174629.uehfmqegcwn2lqzu@wubuntu + * + * Resolve the situation by always acquiring cpus_read_lock() before optionally + * write-locking cgroup_threadgroup_rwsem. This allows ->attach() to assume that + * CPU hotplug is disabled on entry. + */ +static void cgroup_attach_lock(bool lock_threadgroup) +{ + cpus_read_lock(); + if (lock_threadgroup) + percpu_down_write(&cgroup_threadgroup_rwsem); +} + +/** + * cgroup_attach_unlock - Undo cgroup_attach_lock() + * @lock_threadgroup: whether to up_write cgroup_threadgroup_rwsem + */ +static void cgroup_attach_unlock(bool lock_threadgroup) +{ + if (lock_threadgroup) + percpu_up_write(&cgroup_threadgroup_rwsem); + cpus_read_unlock(); +} + +/** * cgroup_migrate_add_task - add a migration target task to a migration context * @task: target task * @mgctx: target migration context @@ -2841,8 +2883,7 @@ int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader, } struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup, - bool *locked) - __acquires(&cgroup_threadgroup_rwsem) + bool *threadgroup_locked) { struct task_struct *tsk; pid_t pid; @@ -2859,12 +2900,8 @@ struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup, * Therefore, we can skip the global lock. */ lockdep_assert_held(&cgroup_mutex); - if (pid || threadgroup) { - percpu_down_write(&cgroup_threadgroup_rwsem); - *locked = true; - } else { - *locked = false; - } + *threadgroup_locked = pid || threadgroup; + cgroup_attach_lock(*threadgroup_locked); rcu_read_lock(); if (pid) { @@ -2895,17 +2932,14 @@ struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup, goto out_unlock_rcu; out_unlock_threadgroup: - if (*locked) { - percpu_up_write(&cgroup_threadgroup_rwsem); - *locked = false; - } + cgroup_attach_unlock(*threadgroup_locked); + *threadgroup_locked = false; out_unlock_rcu: rcu_read_unlock(); return tsk; } -void cgroup_procs_write_finish(struct task_struct *task, bool locked) - __releases(&cgroup_threadgroup_rwsem) +void cgroup_procs_write_finish(struct task_struct *task, bool threadgroup_locked) { struct cgroup_subsys *ss; int ssid; @@ -2913,8 +2947,8 @@ void cgroup_procs_write_finish(struct task_struct *task, bool locked) /* release reference from cgroup_procs_write_start() */ put_task_struct(task); - if (locked) - percpu_up_write(&cgroup_threadgroup_rwsem); + cgroup_attach_unlock(threadgroup_locked); + for_each_subsys(ss, ssid) if (ss->post_attach) ss->post_attach(); @@ -3000,8 +3034,7 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp) * write-locking can be skipped safely. */ has_tasks = !list_empty(&mgctx.preloaded_src_csets); - if (has_tasks) - percpu_down_write(&cgroup_threadgroup_rwsem); + cgroup_attach_lock(has_tasks); /* NULL dst indicates self on default hierarchy */ ret = cgroup_migrate_prepare_dst(&mgctx); @@ -3022,8 +3055,7 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp) ret = cgroup_migrate_execute(&mgctx); out_finish: cgroup_migrate_finish(&mgctx); - if (has_tasks) - percpu_up_write(&cgroup_threadgroup_rwsem); + cgroup_attach_unlock(has_tasks); return ret; } @@ -3698,7 +3730,7 @@ static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf, } psi = cgroup_ino(cgrp) == 1 ? &psi_system : cgrp->psi; - new = psi_trigger_create(psi, buf, nbytes, res); + new = psi_trigger_create(psi, buf, res); if (IS_ERR(new)) { cgroup_put(cgrp); return PTR_ERR(new); @@ -4971,13 +5003,13 @@ static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf, struct task_struct *task; const struct cred *saved_cred; ssize_t ret; - bool locked; + bool threadgroup_locked; dst_cgrp = cgroup_kn_lock_live(of->kn, false); if (!dst_cgrp) return -ENODEV; - task = cgroup_procs_write_start(buf, threadgroup, &locked); + task = cgroup_procs_write_start(buf, threadgroup, &threadgroup_locked); ret = PTR_ERR_OR_ZERO(task); if (ret) goto out_unlock; @@ -5003,7 +5035,7 @@ static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf, ret = cgroup_attach_task(dst_cgrp, task, threadgroup); out_finish: - cgroup_procs_write_finish(task, locked); + cgroup_procs_write_finish(task, threadgroup_locked); out_unlock: cgroup_kn_unlock(of->kn); @@ -6017,6 +6049,9 @@ struct cgroup *cgroup_get_from_id(u64 id) if (!kn) goto out; + if (kernfs_type(kn) != KERNFS_DIR) + goto put; + rcu_read_lock(); cgrp = rcu_dereference(*(void __rcu __force **)&kn->priv); @@ -6024,7 +6059,7 @@ struct cgroup *cgroup_get_from_id(u64 id) cgrp = NULL; rcu_read_unlock(); - +put: kernfs_put(kn); out: return cgrp; diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 58aadfda9b8b..1f3a55297f39 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -2289,7 +2289,7 @@ static void cpuset_attach(struct cgroup_taskset *tset) cgroup_taskset_first(tset, &css); cs = css_cs(css); - cpus_read_lock(); + lockdep_assert_cpus_held(); /* see cgroup_attach_lock() */ percpu_down_write(&cpuset_rwsem); guarantee_online_mems(cs, &cpuset_attach_nodemask_to); @@ -2343,7 +2343,6 @@ static void cpuset_attach(struct cgroup_taskset *tset) wake_up(&cpuset_attach_wq); percpu_up_write(&cpuset_rwsem); - cpus_read_unlock(); } /* The various types of files and directories in a cpuset file system */ diff --git a/kernel/configs/rust.config b/kernel/configs/rust.config new file mode 100644 index 000000000000..38a7c5362c9c --- /dev/null +++ b/kernel/configs/rust.config @@ -0,0 +1 @@ +CONFIG_RUST=y diff --git a/kernel/crash_core.c b/kernel/crash_core.c index 07b26df453a9..a0eb4d5cf557 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -494,6 +494,7 @@ static int __init crash_save_vmcoreinfo_init(void) #ifdef CONFIG_KALLSYMS VMCOREINFO_SYMBOL(kallsyms_names); + VMCOREINFO_SYMBOL(kallsyms_num_syms); VMCOREINFO_SYMBOL(kallsyms_token_table); VMCOREINFO_SYMBOL(kallsyms_token_index); #ifdef CONFIG_KALLSYMS_BASE_RELATIVE diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c index 2caafd13f8aa..18c93c2276ca 100644 --- a/kernel/dma/debug.c +++ b/kernel/dma/debug.c @@ -350,11 +350,10 @@ static struct dma_debug_entry *bucket_find_contain(struct hash_bucket **bucket, unsigned long *flags) { - unsigned int max_range = dma_get_max_seg_size(ref->dev); struct dma_debug_entry *entry, index = *ref; - unsigned int range = 0; + int limit = min(HASH_SIZE, (index.dev_addr >> HASH_FN_SHIFT) + 1); - while (range <= max_range) { + for (int i = 0; i < limit; i++) { entry = __hash_bucket_find(*bucket, ref, containing_match); if (entry) @@ -364,7 +363,6 @@ static struct dma_debug_entry *bucket_find_contain(struct hash_bucket **bucket, * Nothing found, go back a hash bucket */ put_hash_bucket(*bucket, *flags); - range += (1 << HASH_FN_SHIFT); index.dev_addr -= (1 << HASH_FN_SHIFT); *bucket = get_hash_bucket(&index, flags); } diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index 49cbf3e33de7..27f272381cf2 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -707,7 +707,7 @@ int dma_mmap_noncontiguous(struct device *dev, struct vm_area_struct *vma, } EXPORT_SYMBOL_GPL(dma_mmap_noncontiguous); -int dma_supported(struct device *dev, u64 mask) +static int dma_supported(struct device *dev, u64 mask) { const struct dma_map_ops *ops = get_dma_ops(dev); @@ -721,7 +721,6 @@ int dma_supported(struct device *dev, u64 mask) return 1; return ops->dma_supported(dev, mask); } -EXPORT_SYMBOL(dma_supported); bool dma_pci_p2pdma_supported(struct device *dev) { diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index c5a9190b218f..0ef6b12f961d 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -326,9 +326,6 @@ void __init swiotlb_init_remap(bool addressing_limit, unsigned int flags, swiotlb_adjust_nareas(num_possible_cpus()); nslabs = default_nslabs; - if (nslabs < IO_TLB_MIN_SLABS) - panic("%s: nslabs = %lu too small\n", __func__, nslabs); - /* * By default allocate the bounce buffer memory from low memory, but * allow to pick a location everywhere for hypervisors with guest @@ -341,8 +338,7 @@ retry: else tlb = memblock_alloc_low(bytes, PAGE_SIZE); if (!tlb) { - pr_warn("%s: Failed to allocate %zu bytes tlb structure\n", - __func__, bytes); + pr_warn("%s: failed to allocate tlb structure\n", __func__); return; } @@ -579,7 +575,10 @@ static void swiotlb_bounce(struct device *dev, phys_addr_t tlb_addr, size_t size } } -#define slot_addr(start, idx) ((start) + ((idx) << IO_TLB_SHIFT)) +static inline phys_addr_t slot_addr(phys_addr_t start, phys_addr_t idx) +{ + return start + (idx << IO_TLB_SHIFT); +} /* * Carefully handle integer overflow which can occur when boundary_mask == ~0UL. @@ -765,7 +764,7 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr, /* * When dir == DMA_FROM_DEVICE we could omit the copy from the orig * to the tlb buffer, if we knew for sure the device will - * overwirte the entire current content. But we don't. Thus + * overwrite the entire current content. But we don't. Thus * unconditional bounce may prevent leaking swiotlb content (i.e. * kernel memory) to user-space. */ diff --git a/kernel/events/core.c b/kernel/events/core.c index 2621fd24ad26..ff4bffc502c6 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6893,9 +6893,16 @@ static void perf_output_read_group(struct perf_output_handle *handle, { struct perf_event *leader = event->group_leader, *sub; u64 read_format = event->attr.read_format; + unsigned long flags; u64 values[6]; int n = 0; + /* + * Disabling interrupts avoids all counter scheduling + * (context switches, timer based rotation and IPIs). + */ + local_irq_save(flags); + values[n++] = 1 + leader->nr_siblings; if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) @@ -6931,6 +6938,8 @@ static void perf_output_read_group(struct perf_output_handle *handle, __output_copy(handle, values, n * sizeof(u64)); } + + local_irq_restore(flags); } #define PERF_FORMAT_TOTAL_TIMES (PERF_FORMAT_TOTAL_TIME_ENABLED|\ diff --git a/kernel/fork.c b/kernel/fork.c index 90c85b17bf69..2b6bd511c6ed 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1225,6 +1225,7 @@ void mmput_async(struct mm_struct *mm) schedule_work(&mm->async_put_work); } } +EXPORT_SYMBOL_GPL(mmput_async); #endif /** @@ -2046,11 +2047,8 @@ static __latent_entropy struct task_struct *copy_process( /* * If the new process will be in a different time namespace * do not allow it to share VM or a thread group with the forking task. - * - * On vfork, the child process enters the target time namespace only - * after exec. */ - if ((clone_flags & (CLONE_VM | CLONE_VFORK)) == CLONE_VM) { + if (clone_flags & (CLONE_THREAD | CLONE_VM)) { if (nsp->time_ns != nsp->time_ns_for_children) return ERR_PTR(-EINVAL); } diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 3e7e2c2ad2f7..60c20f301a6b 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -50,12 +50,20 @@ static unsigned int kallsyms_expand_symbol(unsigned int off, data = &kallsyms_names[off]; len = *data; data++; + off++; + + /* If MSB is 1, it is a "big" symbol, so needs an additional byte. */ + if ((len & 0x80) != 0) { + len = (len & 0x7F) | (*data << 7); + data++; + off++; + } /* * Update the offset to return the offset for the next symbol on * the compressed stream. */ - off += len + 1; + off += len; /* * For every byte on the compressed symbol data, copy the table @@ -108,7 +116,7 @@ static char kallsyms_get_symbol_type(unsigned int off) static unsigned int get_symbol_offset(unsigned long pos) { const u8 *name; - int i; + int i, len; /* * Use the closest marker we have. We have markers every 256 positions, @@ -122,8 +130,18 @@ static unsigned int get_symbol_offset(unsigned long pos) * so we just need to add the len to the current pointer for every * symbol we wish to skip. */ - for (i = 0; i < (pos & 0xFF); i++) - name = name + (*name) + 1; + for (i = 0; i < (pos & 0xFF); i++) { + len = *name; + + /* + * If MSB is 1, it is a "big" symbol, so we need to look into + * the next byte (and skip it, too). + */ + if ((len & 0x80) != 0) + len = ((len & 0x7F) | (name[1] << 7)) + 1; + + name = name + len + 1; + } return name - kallsyms_names; } @@ -159,7 +177,6 @@ static bool cleanup_symbol_name(char *s) * character in an identifier in C. Suffixes observed: * - foo.llvm.[0-9a-f]+ * - foo.[0-9a-f]+ - * - foo.[0-9a-f]+.cfi_jt */ res = strchr(s, '.'); if (res) { @@ -167,22 +184,6 @@ static bool cleanup_symbol_name(char *s) return true; } - if (!IS_ENABLED(CONFIG_CFI_CLANG) || - !IS_ENABLED(CONFIG_LTO_CLANG_THIN) || - CONFIG_CLANG_VERSION >= 130000) - return false; - - /* - * Prior to LLVM 13, the following suffixes were observed when thinLTO - * and CFI are both enabled: - * - foo$[0-9]+ - */ - res = strrchr(s, '$'); - if (res) { - *res = '\0'; - return true; - } - return false; } diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 80697e5e03e4..ca9d834d0b84 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -1562,6 +1562,7 @@ static int check_kprobe_address_safe(struct kprobe *p, /* Ensure it is not in reserved area nor out of text */ if (!(core_kernel_text((unsigned long) p->addr) || is_module_text_address((unsigned long) p->addr)) || + in_gate_area_no_mm((unsigned long) p->addr) || within_kprobe_blacklist((unsigned long) p->addr) || jump_label_text_reserved(p->addr, p->addr) || static_call_text_reserved(p->addr, p->addr) || @@ -1707,11 +1708,12 @@ static struct kprobe *__disable_kprobe(struct kprobe *p) /* Try to disarm and disable this/parent probe */ if (p == orig_p || aggr_kprobe_disabled(orig_p)) { /* - * If 'kprobes_all_disarmed' is set, 'orig_p' - * should have already been disarmed, so - * skip unneed disarming process. + * Don't be lazy here. Even if 'kprobes_all_disarmed' + * is false, 'orig_p' might not have been armed yet. + * Note arm_all_kprobes() __tries__ to arm all kprobes + * on the best effort basis. */ - if (!kprobes_all_disarmed) { + if (!kprobes_all_disarmed && !kprobe_disabled(orig_p)) { ret = disarm_kprobe(orig_p, true); if (ret) { p->flags &= ~KPROBE_FLAG_DISABLED; diff --git a/kernel/kthread.c b/kernel/kthread.c index 3c677918d8f2..28a6b7ab4a0f 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -1050,8 +1050,7 @@ static void __kthread_queue_delayed_work(struct kthread_worker *worker, struct timer_list *timer = &dwork->timer; struct kthread_work *work = &dwork->work; - WARN_ON_FUNCTION_MISMATCH(timer->function, - kthread_delayed_work_timer_fn); + WARN_ON_ONCE(timer->function != kthread_delayed_work_timer_fn); /* * If @delay is 0, queue @dwork->work immediately. This is for diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index bc475e62279d..ec06ce59d728 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -213,7 +213,7 @@ static int klp_resolve_symbols(Elf_Shdr *sechdrs, const char *strtab, * we use the smallest/strictest upper bound possible (56, based on * the current definition of MODULE_NAME_LEN) to prevent overflows. */ - BUILD_BUG_ON(MODULE_NAME_LEN < 56 || KSYM_NAME_LEN != 128); + BUILD_BUG_ON(MODULE_NAME_LEN < 56 || KSYM_NAME_LEN != 512); relas = (Elf_Rela *) relasec->sh_addr; /* For each rela in this klp relocation section */ @@ -227,7 +227,7 @@ static int klp_resolve_symbols(Elf_Shdr *sechdrs, const char *strtab, /* Format: .klp.sym.sym_objname.sym_name,sympos */ cnt = sscanf(strtab + sym->st_name, - ".klp.sym.%55[^.].%127[^,],%lu", + ".klp.sym.%55[^.].%511[^,],%lu", sym_objname, sym_name, &sympos); if (cnt != 3) { pr_err("symbol %s has an incorrectly formatted name\n", diff --git a/kernel/module/main.c b/kernel/module/main.c index 6a477c622544..70c0b2c6fef8 100644 --- a/kernel/module/main.c +++ b/kernel/module/main.c @@ -53,6 +53,7 @@ #include <linux/bsearch.h> #include <linux/dynamic_debug.h> #include <linux/audit.h> +#include <linux/cfi.h> #include <uapi/linux/module.h> #include "internal.h" @@ -1144,8 +1145,6 @@ void __weak module_arch_freeing_init(struct module *mod) { } -static void cfi_cleanup(struct module *mod); - /* Free a module, remove from lists, etc. */ static void free_module(struct module *mod) { @@ -1190,9 +1189,6 @@ static void free_module(struct module *mod) mod->name); mutex_unlock(&module_mutex); - /* Clean up CFI for the module. */ - cfi_cleanup(mod); - /* This may be empty, but that's OK */ module_arch_freeing_init(mod); module_memfree(mod->init_layout.base); @@ -2099,7 +2095,7 @@ static int find_module_sections(struct module *mod, struct load_info *info) sizeof(*mod->static_call_sites), &mod->num_static_call_sites); #endif -#ifdef CONFIG_KUNIT +#if IS_ENABLED(CONFIG_KUNIT) mod->kunit_suites = section_objs(info, ".kunit_test_suites", sizeof(*mod->kunit_suites), &mod->num_kunit_suites); @@ -2602,8 +2598,9 @@ static int complete_formation(struct module *mod, struct load_info *info) if (err < 0) goto out; - /* This relies on module_mutex for list integrity. */ + /* These rely on module_mutex for list integrity. */ module_bug_finalize(info->hdr, info->sechdrs, mod); + module_cfi_finalize(info->hdr, info->sechdrs, mod); if (module_check_misalignment(mod)) goto out_misaligned; @@ -2665,8 +2662,6 @@ static int unknown_module_param_cb(char *param, char *val, const char *modname, return 0; } -static void cfi_init(struct module *mod); - /* * Allocate and load the module: note that size of section 0 is always * zero, and we rely on this for optional sections. @@ -2796,9 +2791,6 @@ static int load_module(struct load_info *info, const char __user *uargs, flush_module_icache(mod); - /* Setup CFI for the module. */ - cfi_init(mod); - /* Now copy in args */ mod->args = strndup_user(uargs, ~0UL >> 1); if (IS_ERR(mod->args)) { @@ -2875,7 +2867,6 @@ static int load_module(struct load_info *info, const char __user *uargs, synchronize_rcu(); kfree(mod->args); free_arch_cleanup: - cfi_cleanup(mod); module_arch_cleanup(mod); free_modinfo: free_modinfo(mod); @@ -2961,41 +2952,6 @@ static inline int within(unsigned long addr, void *start, unsigned long size) return ((void *)addr >= start && (void *)addr < start + size); } -static void cfi_init(struct module *mod) -{ -#ifdef CONFIG_CFI_CLANG - initcall_t *init; -#ifdef CONFIG_MODULE_UNLOAD - exitcall_t *exit; -#endif - - rcu_read_lock_sched(); - mod->cfi_check = (cfi_check_fn) - find_kallsyms_symbol_value(mod, "__cfi_check"); - init = (initcall_t *) - find_kallsyms_symbol_value(mod, "__cfi_jt_init_module"); - /* Fix init/exit functions to point to the CFI jump table */ - if (init) - mod->init = *init; -#ifdef CONFIG_MODULE_UNLOAD - exit = (exitcall_t *) - find_kallsyms_symbol_value(mod, "__cfi_jt_cleanup_module"); - if (exit) - mod->exit = *exit; -#endif - rcu_read_unlock_sched(); - - cfi_module_add(mod, mod_tree.addr_min); -#endif -} - -static void cfi_cleanup(struct module *mod) -{ -#ifdef CONFIG_CFI_CLANG - cfi_module_remove(mod, mod_tree.addr_min); -#endif -} - /* Keep in sync with MODULE_FLAGS_BUF_SIZE !!! */ char *module_flags(struct module *mod, char *buf, bool show_state) { diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index b4cbb406bc28..eec72ca962e2 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -179,8 +179,7 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk) if (IS_ERR(new_ns)) return PTR_ERR(new_ns); - if ((flags & CLONE_VM) == 0) - timens_on_fork(new_ns, tsk); + timens_on_fork(new_ns, tsk); tsk->nsproxy = new_ns; return 0; diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index d8e1b270a065..503c2aa845a4 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -84,10 +84,15 @@ torture_param(int, fwd_progress_holdoff, 60, "Time between forward-progress test torture_param(bool, fwd_progress_need_resched, 1, "Hide cond_resched() behind need_resched()"); torture_param(bool, gp_cond, false, "Use conditional/async GP wait primitives"); torture_param(bool, gp_cond_exp, false, "Use conditional/async expedited GP wait primitives"); +torture_param(bool, gp_cond_full, false, "Use conditional/async full-state GP wait primitives"); +torture_param(bool, gp_cond_exp_full, false, + "Use conditional/async full-stateexpedited GP wait primitives"); torture_param(bool, gp_exp, false, "Use expedited GP wait primitives"); torture_param(bool, gp_normal, false, "Use normal (non-expedited) GP wait primitives"); torture_param(bool, gp_poll, false, "Use polling GP wait primitives"); torture_param(bool, gp_poll_exp, false, "Use polling expedited GP wait primitives"); +torture_param(bool, gp_poll_full, false, "Use polling full-state GP wait primitives"); +torture_param(bool, gp_poll_exp_full, false, "Use polling full-state expedited GP wait primitives"); torture_param(bool, gp_sync, false, "Use synchronous GP wait primitives"); torture_param(int, irqreader, 1, "Allow RCU readers from irq handlers"); torture_param(int, leakpointer, 0, "Leak pointer dereferences from readers"); @@ -194,16 +199,24 @@ static int rcu_torture_writer_state; #define RTWS_DEF_FREE 3 #define RTWS_EXP_SYNC 4 #define RTWS_COND_GET 5 -#define RTWS_COND_GET_EXP 6 -#define RTWS_COND_SYNC 7 -#define RTWS_COND_SYNC_EXP 8 -#define RTWS_POLL_GET 9 -#define RTWS_POLL_GET_EXP 10 -#define RTWS_POLL_WAIT 11 -#define RTWS_POLL_WAIT_EXP 12 -#define RTWS_SYNC 13 -#define RTWS_STUTTER 14 -#define RTWS_STOPPING 15 +#define RTWS_COND_GET_FULL 6 +#define RTWS_COND_GET_EXP 7 +#define RTWS_COND_GET_EXP_FULL 8 +#define RTWS_COND_SYNC 9 +#define RTWS_COND_SYNC_FULL 10 +#define RTWS_COND_SYNC_EXP 11 +#define RTWS_COND_SYNC_EXP_FULL 12 +#define RTWS_POLL_GET 13 +#define RTWS_POLL_GET_FULL 14 +#define RTWS_POLL_GET_EXP 15 +#define RTWS_POLL_GET_EXP_FULL 16 +#define RTWS_POLL_WAIT 17 +#define RTWS_POLL_WAIT_FULL 18 +#define RTWS_POLL_WAIT_EXP 19 +#define RTWS_POLL_WAIT_EXP_FULL 20 +#define RTWS_SYNC 21 +#define RTWS_STUTTER 22 +#define RTWS_STOPPING 23 static const char * const rcu_torture_writer_state_names[] = { "RTWS_FIXED_DELAY", "RTWS_DELAY", @@ -211,13 +224,21 @@ static const char * const rcu_torture_writer_state_names[] = { "RTWS_DEF_FREE", "RTWS_EXP_SYNC", "RTWS_COND_GET", + "RTWS_COND_GET_FULL", "RTWS_COND_GET_EXP", + "RTWS_COND_GET_EXP_FULL", "RTWS_COND_SYNC", + "RTWS_COND_SYNC_FULL", "RTWS_COND_SYNC_EXP", + "RTWS_COND_SYNC_EXP_FULL", "RTWS_POLL_GET", + "RTWS_POLL_GET_FULL", "RTWS_POLL_GET_EXP", + "RTWS_POLL_GET_EXP_FULL", "RTWS_POLL_WAIT", + "RTWS_POLL_WAIT_FULL", "RTWS_POLL_WAIT_EXP", + "RTWS_POLL_WAIT_EXP_FULL", "RTWS_SYNC", "RTWS_STUTTER", "RTWS_STOPPING", @@ -332,13 +353,21 @@ struct rcu_torture_ops { void (*exp_sync)(void); unsigned long (*get_gp_state_exp)(void); unsigned long (*start_gp_poll_exp)(void); + void (*start_gp_poll_exp_full)(struct rcu_gp_oldstate *rgosp); bool (*poll_gp_state_exp)(unsigned long oldstate); void (*cond_sync_exp)(unsigned long oldstate); + void (*cond_sync_exp_full)(struct rcu_gp_oldstate *rgosp); unsigned long (*get_gp_state)(void); + void (*get_gp_state_full)(struct rcu_gp_oldstate *rgosp); unsigned long (*get_gp_completed)(void); + void (*get_gp_completed_full)(struct rcu_gp_oldstate *rgosp); unsigned long (*start_gp_poll)(void); + void (*start_gp_poll_full)(struct rcu_gp_oldstate *rgosp); bool (*poll_gp_state)(unsigned long oldstate); + bool (*poll_gp_state_full)(struct rcu_gp_oldstate *rgosp); + bool (*poll_need_2gp)(bool poll, bool poll_full); void (*cond_sync)(unsigned long oldstate); + void (*cond_sync_full)(struct rcu_gp_oldstate *rgosp); call_rcu_func_t call; void (*cb_barrier)(void); void (*fqs)(void); @@ -489,6 +518,11 @@ static void rcu_sync_torture_init(void) INIT_LIST_HEAD(&rcu_torture_removed); } +static bool rcu_poll_need_2gp(bool poll, bool poll_full) +{ + return poll; +} + static struct rcu_torture_ops rcu_ops = { .ttype = RCU_FLAVOR, .init = rcu_sync_torture_init, @@ -502,12 +536,19 @@ static struct rcu_torture_ops rcu_ops = { .sync = synchronize_rcu, .exp_sync = synchronize_rcu_expedited, .get_gp_state = get_state_synchronize_rcu, + .get_gp_state_full = get_state_synchronize_rcu_full, .get_gp_completed = get_completed_synchronize_rcu, + .get_gp_completed_full = get_completed_synchronize_rcu_full, .start_gp_poll = start_poll_synchronize_rcu, + .start_gp_poll_full = start_poll_synchronize_rcu_full, .poll_gp_state = poll_state_synchronize_rcu, + .poll_gp_state_full = poll_state_synchronize_rcu_full, + .poll_need_2gp = rcu_poll_need_2gp, .cond_sync = cond_synchronize_rcu, + .cond_sync_full = cond_synchronize_rcu_full, .get_gp_state_exp = get_state_synchronize_rcu, .start_gp_poll_exp = start_poll_synchronize_rcu_expedited, + .start_gp_poll_exp_full = start_poll_synchronize_rcu_expedited_full, .poll_gp_state_exp = poll_state_synchronize_rcu, .cond_sync_exp = cond_synchronize_rcu_expedited, .call = call_rcu, @@ -709,6 +750,9 @@ static struct rcu_torture_ops srcud_ops = { .deferred_free = srcu_torture_deferred_free, .sync = srcu_torture_synchronize, .exp_sync = srcu_torture_synchronize_expedited, + .get_gp_state = srcu_torture_get_gp_state, + .start_gp_poll = srcu_torture_start_gp_poll, + .poll_gp_state = srcu_torture_poll_gp_state, .call = srcu_torture_call, .cb_barrier = srcu_torture_barrier, .stats = srcu_torture_stats, @@ -1148,15 +1192,35 @@ static int nsynctypes; */ static void rcu_torture_write_types(void) { - bool gp_cond1 = gp_cond, gp_cond_exp1 = gp_cond_exp, gp_exp1 = gp_exp; - bool gp_poll_exp1 = gp_poll_exp, gp_normal1 = gp_normal, gp_poll1 = gp_poll; - bool gp_sync1 = gp_sync; + bool gp_cond1 = gp_cond, gp_cond_exp1 = gp_cond_exp, gp_cond_full1 = gp_cond_full; + bool gp_cond_exp_full1 = gp_cond_exp_full, gp_exp1 = gp_exp, gp_poll_exp1 = gp_poll_exp; + bool gp_poll_exp_full1 = gp_poll_exp_full, gp_normal1 = gp_normal, gp_poll1 = gp_poll; + bool gp_poll_full1 = gp_poll_full, gp_sync1 = gp_sync; /* Initialize synctype[] array. If none set, take default. */ - if (!gp_cond1 && !gp_cond_exp1 && !gp_exp1 && !gp_poll_exp && - !gp_normal1 && !gp_poll1 && !gp_sync1) - gp_cond1 = gp_cond_exp1 = gp_exp1 = gp_poll_exp1 = - gp_normal1 = gp_poll1 = gp_sync1 = true; + if (!gp_cond1 && + !gp_cond_exp1 && + !gp_cond_full1 && + !gp_cond_exp_full1 && + !gp_exp1 && + !gp_poll_exp1 && + !gp_poll_exp_full1 && + !gp_normal1 && + !gp_poll1 && + !gp_poll_full1 && + !gp_sync1) { + gp_cond1 = true; + gp_cond_exp1 = true; + gp_cond_full1 = true; + gp_cond_exp_full1 = true; + gp_exp1 = true; + gp_poll_exp1 = true; + gp_poll_exp_full1 = true; + gp_normal1 = true; + gp_poll1 = true; + gp_poll_full1 = true; + gp_sync1 = true; + } if (gp_cond1 && cur_ops->get_gp_state && cur_ops->cond_sync) { synctype[nsynctypes++] = RTWS_COND_GET; pr_info("%s: Testing conditional GPs.\n", __func__); @@ -1169,6 +1233,19 @@ static void rcu_torture_write_types(void) } else if (gp_cond_exp && (!cur_ops->get_gp_state_exp || !cur_ops->cond_sync_exp)) { pr_alert("%s: gp_cond_exp without primitives.\n", __func__); } + if (gp_cond_full1 && cur_ops->get_gp_state && cur_ops->cond_sync_full) { + synctype[nsynctypes++] = RTWS_COND_GET_FULL; + pr_info("%s: Testing conditional full-state GPs.\n", __func__); + } else if (gp_cond_full && (!cur_ops->get_gp_state || !cur_ops->cond_sync_full)) { + pr_alert("%s: gp_cond_full without primitives.\n", __func__); + } + if (gp_cond_exp_full1 && cur_ops->get_gp_state_exp && cur_ops->cond_sync_exp_full) { + synctype[nsynctypes++] = RTWS_COND_GET_EXP_FULL; + pr_info("%s: Testing conditional full-state expedited GPs.\n", __func__); + } else if (gp_cond_exp_full && + (!cur_ops->get_gp_state_exp || !cur_ops->cond_sync_exp_full)) { + pr_alert("%s: gp_cond_exp_full without primitives.\n", __func__); + } if (gp_exp1 && cur_ops->exp_sync) { synctype[nsynctypes++] = RTWS_EXP_SYNC; pr_info("%s: Testing expedited GPs.\n", __func__); @@ -1187,12 +1264,25 @@ static void rcu_torture_write_types(void) } else if (gp_poll && (!cur_ops->start_gp_poll || !cur_ops->poll_gp_state)) { pr_alert("%s: gp_poll without primitives.\n", __func__); } + if (gp_poll_full1 && cur_ops->start_gp_poll_full && cur_ops->poll_gp_state_full) { + synctype[nsynctypes++] = RTWS_POLL_GET_FULL; + pr_info("%s: Testing polling full-state GPs.\n", __func__); + } else if (gp_poll_full && (!cur_ops->start_gp_poll_full || !cur_ops->poll_gp_state_full)) { + pr_alert("%s: gp_poll_full without primitives.\n", __func__); + } if (gp_poll_exp1 && cur_ops->start_gp_poll_exp && cur_ops->poll_gp_state_exp) { synctype[nsynctypes++] = RTWS_POLL_GET_EXP; pr_info("%s: Testing polling expedited GPs.\n", __func__); } else if (gp_poll_exp && (!cur_ops->start_gp_poll_exp || !cur_ops->poll_gp_state_exp)) { pr_alert("%s: gp_poll_exp without primitives.\n", __func__); } + if (gp_poll_exp_full1 && cur_ops->start_gp_poll_exp_full && cur_ops->poll_gp_state_full) { + synctype[nsynctypes++] = RTWS_POLL_GET_EXP_FULL; + pr_info("%s: Testing polling full-state expedited GPs.\n", __func__); + } else if (gp_poll_exp_full && + (!cur_ops->start_gp_poll_exp_full || !cur_ops->poll_gp_state_full)) { + pr_alert("%s: gp_poll_exp_full without primitives.\n", __func__); + } if (gp_sync1 && cur_ops->sync) { synctype[nsynctypes++] = RTWS_SYNC; pr_info("%s: Testing normal GPs.\n", __func__); @@ -1202,6 +1292,40 @@ static void rcu_torture_write_types(void) } /* + * Do the specified rcu_torture_writer() synchronous grace period, + * while also testing out the polled APIs. Note well that the single-CPU + * grace-period optimizations must be accounted for. + */ +static void do_rtws_sync(struct torture_random_state *trsp, void (*sync)(void)) +{ + unsigned long cookie; + struct rcu_gp_oldstate cookie_full; + bool dopoll; + bool dopoll_full; + unsigned long r = torture_random(trsp); + + dopoll = cur_ops->get_gp_state && cur_ops->poll_gp_state && !(r & 0x300); + dopoll_full = cur_ops->get_gp_state_full && cur_ops->poll_gp_state_full && !(r & 0xc00); + if (dopoll || dopoll_full) + cpus_read_lock(); + if (dopoll) + cookie = cur_ops->get_gp_state(); + if (dopoll_full) + cur_ops->get_gp_state_full(&cookie_full); + if (cur_ops->poll_need_2gp && cur_ops->poll_need_2gp(dopoll, dopoll_full)) + sync(); + sync(); + WARN_ONCE(dopoll && !cur_ops->poll_gp_state(cookie), + "%s: Cookie check 3 failed %pS() online %*pbl.", + __func__, sync, cpumask_pr_args(cpu_online_mask)); + WARN_ONCE(dopoll_full && !cur_ops->poll_gp_state_full(&cookie_full), + "%s: Cookie check 4 failed %pS() online %*pbl", + __func__, sync, cpumask_pr_args(cpu_online_mask)); + if (dopoll || dopoll_full) + cpus_read_unlock(); +} + +/* * RCU torture writer kthread. Repeatedly substitutes a new structure * for that pointed to by rcu_torture_current, freeing the old structure * after a series of grace periods (the "pipeline"). @@ -1212,8 +1336,10 @@ rcu_torture_writer(void *arg) bool boot_ended; bool can_expedite = !rcu_gp_is_expedited() && !rcu_gp_is_normal(); unsigned long cookie; + struct rcu_gp_oldstate cookie_full; int expediting = 0; unsigned long gp_snap; + struct rcu_gp_oldstate gp_snap_full; int i; int idx; int oldnice = task_nice(current); @@ -1261,11 +1387,12 @@ rcu_torture_writer(void *arg) atomic_inc(&rcu_torture_wcount[i]); WRITE_ONCE(old_rp->rtort_pipe_count, old_rp->rtort_pipe_count + 1); + + // Make sure readers block polled grace periods. if (cur_ops->get_gp_state && cur_ops->poll_gp_state) { idx = cur_ops->readlock(); cookie = cur_ops->get_gp_state(); - WARN_ONCE(rcu_torture_writer_state != RTWS_DEF_FREE && - cur_ops->poll_gp_state(cookie), + WARN_ONCE(cur_ops->poll_gp_state(cookie), "%s: Cookie check 1 failed %s(%d) %lu->%lu\n", __func__, rcu_torture_writer_state_getname(), @@ -1277,6 +1404,21 @@ rcu_torture_writer(void *arg) } cur_ops->readunlock(idx); } + if (cur_ops->get_gp_state_full && cur_ops->poll_gp_state_full) { + idx = cur_ops->readlock(); + cur_ops->get_gp_state_full(&cookie_full); + WARN_ONCE(cur_ops->poll_gp_state_full(&cookie_full), + "%s: Cookie check 5 failed %s(%d) online %*pbl\n", + __func__, + rcu_torture_writer_state_getname(), + rcu_torture_writer_state, + cpumask_pr_args(cpu_online_mask)); + if (cur_ops->get_gp_completed_full) { + cur_ops->get_gp_completed_full(&cookie_full); + WARN_ON_ONCE(!cur_ops->poll_gp_state_full(&cookie_full)); + } + cur_ops->readunlock(idx); + } switch (synctype[torture_random(&rand) % nsynctypes]) { case RTWS_DEF_FREE: rcu_torture_writer_state = RTWS_DEF_FREE; @@ -1284,12 +1426,7 @@ rcu_torture_writer(void *arg) break; case RTWS_EXP_SYNC: rcu_torture_writer_state = RTWS_EXP_SYNC; - if (cur_ops->get_gp_state && cur_ops->poll_gp_state) - cookie = cur_ops->get_gp_state(); - cur_ops->exp_sync(); - cur_ops->exp_sync(); - if (cur_ops->get_gp_state && cur_ops->poll_gp_state) - WARN_ON_ONCE(!cur_ops->poll_gp_state(cookie)); + do_rtws_sync(&rand, cur_ops->exp_sync); rcu_torture_pipe_update(old_rp); break; case RTWS_COND_GET: @@ -1308,6 +1445,22 @@ rcu_torture_writer(void *arg) cur_ops->cond_sync_exp(gp_snap); rcu_torture_pipe_update(old_rp); break; + case RTWS_COND_GET_FULL: + rcu_torture_writer_state = RTWS_COND_GET_FULL; + cur_ops->get_gp_state_full(&gp_snap_full); + torture_hrtimeout_jiffies(torture_random(&rand) % 16, &rand); + rcu_torture_writer_state = RTWS_COND_SYNC_FULL; + cur_ops->cond_sync_full(&gp_snap_full); + rcu_torture_pipe_update(old_rp); + break; + case RTWS_COND_GET_EXP_FULL: + rcu_torture_writer_state = RTWS_COND_GET_EXP_FULL; + cur_ops->get_gp_state_full(&gp_snap_full); + torture_hrtimeout_jiffies(torture_random(&rand) % 16, &rand); + rcu_torture_writer_state = RTWS_COND_SYNC_EXP_FULL; + cur_ops->cond_sync_exp_full(&gp_snap_full); + rcu_torture_pipe_update(old_rp); + break; case RTWS_POLL_GET: rcu_torture_writer_state = RTWS_POLL_GET; gp_snap = cur_ops->start_gp_poll(); @@ -1317,6 +1470,15 @@ rcu_torture_writer(void *arg) &rand); rcu_torture_pipe_update(old_rp); break; + case RTWS_POLL_GET_FULL: + rcu_torture_writer_state = RTWS_POLL_GET_FULL; + cur_ops->start_gp_poll_full(&gp_snap_full); + rcu_torture_writer_state = RTWS_POLL_WAIT_FULL; + while (!cur_ops->poll_gp_state_full(&gp_snap_full)) + torture_hrtimeout_jiffies(torture_random(&rand) % 16, + &rand); + rcu_torture_pipe_update(old_rp); + break; case RTWS_POLL_GET_EXP: rcu_torture_writer_state = RTWS_POLL_GET_EXP; gp_snap = cur_ops->start_gp_poll_exp(); @@ -1326,14 +1488,18 @@ rcu_torture_writer(void *arg) &rand); rcu_torture_pipe_update(old_rp); break; + case RTWS_POLL_GET_EXP_FULL: + rcu_torture_writer_state = RTWS_POLL_GET_EXP_FULL; + cur_ops->start_gp_poll_exp_full(&gp_snap_full); + rcu_torture_writer_state = RTWS_POLL_WAIT_EXP_FULL; + while (!cur_ops->poll_gp_state_full(&gp_snap_full)) + torture_hrtimeout_jiffies(torture_random(&rand) % 16, + &rand); + rcu_torture_pipe_update(old_rp); + break; case RTWS_SYNC: rcu_torture_writer_state = RTWS_SYNC; - if (cur_ops->get_gp_state && cur_ops->poll_gp_state) - cookie = cur_ops->get_gp_state(); - cur_ops->sync(); - cur_ops->sync(); - if (cur_ops->get_gp_state && cur_ops->poll_gp_state) - WARN_ON_ONCE(!cur_ops->poll_gp_state(cookie)); + do_rtws_sync(&rand, cur_ops->sync); rcu_torture_pipe_update(old_rp); break; default: @@ -1400,6 +1566,7 @@ static int rcu_torture_fakewriter(void *arg) { unsigned long gp_snap; + struct rcu_gp_oldstate gp_snap_full; DEFINE_TORTURE_RANDOM(rand); VERBOSE_TOROUT_STRING("rcu_torture_fakewriter task started"); @@ -1438,6 +1605,16 @@ rcu_torture_fakewriter(void *arg) torture_hrtimeout_jiffies(torture_random(&rand) % 16, &rand); cur_ops->cond_sync_exp(gp_snap); break; + case RTWS_COND_GET_FULL: + cur_ops->get_gp_state_full(&gp_snap_full); + torture_hrtimeout_jiffies(torture_random(&rand) % 16, &rand); + cur_ops->cond_sync_full(&gp_snap_full); + break; + case RTWS_COND_GET_EXP_FULL: + cur_ops->get_gp_state_full(&gp_snap_full); + torture_hrtimeout_jiffies(torture_random(&rand) % 16, &rand); + cur_ops->cond_sync_exp_full(&gp_snap_full); + break; case RTWS_POLL_GET: gp_snap = cur_ops->start_gp_poll(); while (!cur_ops->poll_gp_state(gp_snap)) { @@ -1445,6 +1622,13 @@ rcu_torture_fakewriter(void *arg) &rand); } break; + case RTWS_POLL_GET_FULL: + cur_ops->start_gp_poll_full(&gp_snap_full); + while (!cur_ops->poll_gp_state_full(&gp_snap_full)) { + torture_hrtimeout_jiffies(torture_random(&rand) % 16, + &rand); + } + break; case RTWS_POLL_GET_EXP: gp_snap = cur_ops->start_gp_poll_exp(); while (!cur_ops->poll_gp_state_exp(gp_snap)) { @@ -1452,6 +1636,13 @@ rcu_torture_fakewriter(void *arg) &rand); } break; + case RTWS_POLL_GET_EXP_FULL: + cur_ops->start_gp_poll_exp_full(&gp_snap_full); + while (!cur_ops->poll_gp_state_full(&gp_snap_full)) { + torture_hrtimeout_jiffies(torture_random(&rand) % 16, + &rand); + } + break; case RTWS_SYNC: cur_ops->sync(); break; @@ -1715,7 +1906,9 @@ rcutorture_loop_extend(int *readstate, struct torture_random_state *trsp, */ static bool rcu_torture_one_read(struct torture_random_state *trsp, long myid) { + bool checkpolling = !(torture_random(trsp) & 0xfff); unsigned long cookie; + struct rcu_gp_oldstate cookie_full; int i; unsigned long started; unsigned long completed; @@ -1731,8 +1924,12 @@ static bool rcu_torture_one_read(struct torture_random_state *trsp, long myid) WARN_ON_ONCE(!rcu_is_watching()); newstate = rcutorture_extend_mask(readstate, trsp); rcutorture_one_extend(&readstate, newstate, trsp, rtrsp++); - if (cur_ops->get_gp_state && cur_ops->poll_gp_state) - cookie = cur_ops->get_gp_state(); + if (checkpolling) { + if (cur_ops->get_gp_state && cur_ops->poll_gp_state) + cookie = cur_ops->get_gp_state(); + if (cur_ops->get_gp_state_full && cur_ops->poll_gp_state_full) + cur_ops->get_gp_state_full(&cookie_full); + } started = cur_ops->get_gp_seq(); ts = rcu_trace_clock_local(); p = rcu_dereference_check(rcu_torture_current, @@ -1766,13 +1963,22 @@ static bool rcu_torture_one_read(struct torture_random_state *trsp, long myid) } __this_cpu_inc(rcu_torture_batch[completed]); preempt_enable(); - if (cur_ops->get_gp_state && cur_ops->poll_gp_state) - WARN_ONCE(cur_ops->poll_gp_state(cookie), - "%s: Cookie check 2 failed %s(%d) %lu->%lu\n", - __func__, - rcu_torture_writer_state_getname(), - rcu_torture_writer_state, - cookie, cur_ops->get_gp_state()); + if (checkpolling) { + if (cur_ops->get_gp_state && cur_ops->poll_gp_state) + WARN_ONCE(cur_ops->poll_gp_state(cookie), + "%s: Cookie check 2 failed %s(%d) %lu->%lu\n", + __func__, + rcu_torture_writer_state_getname(), + rcu_torture_writer_state, + cookie, cur_ops->get_gp_state()); + if (cur_ops->get_gp_state_full && cur_ops->poll_gp_state_full) + WARN_ONCE(cur_ops->poll_gp_state_full(&cookie_full), + "%s: Cookie check 6 failed %s(%d) online %*pbl\n", + __func__, + rcu_torture_writer_state_getname(), + rcu_torture_writer_state, + cpumask_pr_args(cpu_online_mask)); + } rcutorture_one_extend(&readstate, 0, trsp, rtrsp); WARN_ON_ONCE(readstate); // This next splat is expected behavior if leakpointer, especially @@ -2600,12 +2806,12 @@ static int rcutorture_oom_notify(struct notifier_block *self, for (i = 0; i < fwd_progress; i++) ncbs += rcu_torture_fwd_prog_cbfree(&rfp[i]); pr_info("%s: Freed %lu RCU callbacks.\n", __func__, ncbs); - rcu_barrier(); + cur_ops->cb_barrier(); ncbs = 0; for (i = 0; i < fwd_progress; i++) ncbs += rcu_torture_fwd_prog_cbfree(&rfp[i]); pr_info("%s: Freed %lu RCU callbacks.\n", __func__, ncbs); - rcu_barrier(); + cur_ops->cb_barrier(); ncbs = 0; for (i = 0; i < fwd_progress; i++) ncbs += rcu_torture_fwd_prog_cbfree(&rfp[i]); diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c index 92c002d65482..33adafdad261 100644 --- a/kernel/rcu/srcutiny.c +++ b/kernel/rcu/srcutiny.c @@ -117,7 +117,7 @@ void srcu_drive_gp(struct work_struct *wp) struct srcu_struct *ssp; ssp = container_of(wp, struct srcu_struct, srcu_work); - if (ssp->srcu_gp_running || USHORT_CMP_GE(ssp->srcu_idx, READ_ONCE(ssp->srcu_idx_max))) + if (ssp->srcu_gp_running || ULONG_CMP_GE(ssp->srcu_idx, READ_ONCE(ssp->srcu_idx_max))) return; /* Already running or nothing to do. */ /* Remove recently arrived callbacks and wait for readers. */ @@ -150,17 +150,17 @@ void srcu_drive_gp(struct work_struct *wp) * straighten that out. */ WRITE_ONCE(ssp->srcu_gp_running, false); - if (USHORT_CMP_LT(ssp->srcu_idx, READ_ONCE(ssp->srcu_idx_max))) + if (ULONG_CMP_LT(ssp->srcu_idx, READ_ONCE(ssp->srcu_idx_max))) schedule_work(&ssp->srcu_work); } EXPORT_SYMBOL_GPL(srcu_drive_gp); static void srcu_gp_start_if_needed(struct srcu_struct *ssp) { - unsigned short cookie; + unsigned long cookie; cookie = get_state_synchronize_srcu(ssp); - if (USHORT_CMP_GE(READ_ONCE(ssp->srcu_idx_max), cookie)) + if (ULONG_CMP_GE(READ_ONCE(ssp->srcu_idx_max), cookie)) return; WRITE_ONCE(ssp->srcu_idx_max, cookie); if (!READ_ONCE(ssp->srcu_gp_running)) { @@ -215,7 +215,7 @@ unsigned long get_state_synchronize_srcu(struct srcu_struct *ssp) barrier(); ret = (READ_ONCE(ssp->srcu_idx) + 3) & ~0x1; barrier(); - return ret & USHRT_MAX; + return ret; } EXPORT_SYMBOL_GPL(get_state_synchronize_srcu); @@ -240,10 +240,10 @@ EXPORT_SYMBOL_GPL(start_poll_synchronize_srcu); */ bool poll_state_synchronize_srcu(struct srcu_struct *ssp, unsigned long cookie) { - bool ret = USHORT_CMP_GE(READ_ONCE(ssp->srcu_idx), cookie); + unsigned long cur_s = READ_ONCE(ssp->srcu_idx); barrier(); - return ret; + return ULONG_CMP_GE(cur_s, cookie) || ULONG_CMP_LT(cur_s, cookie - 3); } EXPORT_SYMBOL_GPL(poll_state_synchronize_srcu); diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 83c7e6620d40..f5bf6fb430da 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -560,7 +560,7 @@ static int __noreturn rcu_tasks_kthread(void *arg) static void synchronize_rcu_tasks_generic(struct rcu_tasks *rtp) { /* Complain if the scheduler has not started. */ - RCU_LOCKDEP_WARN(rcu_scheduler_active == RCU_SCHEDULER_INACTIVE, + WARN_ONCE(rcu_scheduler_active == RCU_SCHEDULER_INACTIVE, "synchronize_rcu_tasks called too soon"); // If the grace-period kthread is running, use it. @@ -1500,6 +1500,7 @@ static void rcu_tasks_trace_pregp_step(struct list_head *hop) if (rcu_tasks_trace_pertask_prep(t, true)) trc_add_holdout(t, hop); rcu_read_unlock(); + cond_resched_tasks_rcu_qs(); } // Only after all running tasks have been accounted for is it @@ -1520,6 +1521,7 @@ static void rcu_tasks_trace_pregp_step(struct list_head *hop) raw_spin_lock_irqsave_rcu_node(rtpcp, flags); } raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags); + cond_resched_tasks_rcu_qs(); } // Re-enable CPU hotplug now that the holdout list is populated. @@ -1619,6 +1621,7 @@ static void check_all_holdout_tasks_trace(struct list_head *hop, trc_del_holdout(t); else if (needreport) show_stalled_task_trace(t, firstreport); + cond_resched_tasks_rcu_qs(); } // Re-enable CPU hotplug now that the holdout list scan has completed. diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index f0561ee16b9c..a33a8d4942c3 100644 --- a/kernel/rcu/tiny.c +++ b/kernel/rcu/tiny.c @@ -158,6 +158,10 @@ void synchronize_rcu(void) } EXPORT_SYMBOL_GPL(synchronize_rcu); +static void tiny_rcu_leak_callback(struct rcu_head *rhp) +{ +} + /* * Post an RCU callback to be invoked after the end of an RCU grace * period. But since we have but one CPU, that would be after any @@ -165,9 +169,20 @@ EXPORT_SYMBOL_GPL(synchronize_rcu); */ void call_rcu(struct rcu_head *head, rcu_callback_t func) { + static atomic_t doublefrees; unsigned long flags; - debug_rcu_head_queue(head); + if (debug_rcu_head_queue(head)) { + if (atomic_inc_return(&doublefrees) < 4) { + pr_err("%s(): Double-freed CB %p->%pS()!!! ", __func__, head, head->func); + mem_dump_obj(head); + } + + if (!__is_kvfree_rcu_offset((unsigned long)head->func)) + WRITE_ONCE(head->func, tiny_rcu_leak_callback); + return; + } + head->func = func; head->next = NULL; @@ -184,6 +199,16 @@ void call_rcu(struct rcu_head *head, rcu_callback_t func) EXPORT_SYMBOL_GPL(call_rcu); /* + * Store a grace-period-counter "cookie". For more information, + * see the Tree RCU header comment. + */ +void get_completed_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp) +{ + rgosp->rgos_norm = RCU_GET_STATE_COMPLETED; +} +EXPORT_SYMBOL_GPL(get_completed_synchronize_rcu_full); + +/* * Return a grace-period-counter "cookie". For more information, * see the Tree RCU header comment. */ diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 79aea7df4345..6bb8e72bc815 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -76,6 +76,7 @@ /* Data structures. */ static DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_data, rcu_data) = { + .gpwrap = true, #ifdef CONFIG_RCU_NOCB_CPU .cblist.flags = SEGCBLIST_RCU_CORE, #endif @@ -1755,6 +1756,8 @@ static noinline void rcu_gp_cleanup(void) dump_blkd_tasks(rnp, 10); WARN_ON_ONCE(rnp->qsmask); WRITE_ONCE(rnp->gp_seq, new_gp_seq); + if (!rnp->parent) + smp_mb(); // Order against failing poll_state_synchronize_rcu_full(). rdp = this_cpu_ptr(&rcu_data); if (rnp == rdp->mynode) needgp = __note_gp_changes(rnp, rdp) || needgp; @@ -2341,8 +2344,8 @@ void rcu_sched_clock_irq(int user) rcu_flavor_sched_clock_irq(user); if (rcu_pending(user)) invoke_rcu_core(); - if (user) - rcu_tasks_classic_qs(current, false); + if (user || rcu_is_cpu_rrupt_from_idle()) + rcu_note_voluntary_context_switch(current); lockdep_assert_irqs_disabled(); trace_rcu_utilization(TPS("End scheduler-tick")); @@ -2832,7 +2835,7 @@ EXPORT_SYMBOL_GPL(call_rcu); /* Maximum number of jiffies to wait before draining a batch. */ -#define KFREE_DRAIN_JIFFIES (HZ / 50) +#define KFREE_DRAIN_JIFFIES (5 * HZ) #define KFREE_N_BATCHES 2 #define FREE_N_CHANNELS 2 @@ -3093,6 +3096,21 @@ need_offload_krc(struct kfree_rcu_cpu *krcp) return !!krcp->head; } +static void +schedule_delayed_monitor_work(struct kfree_rcu_cpu *krcp) +{ + long delay, delay_left; + + delay = READ_ONCE(krcp->count) >= KVFREE_BULK_MAX_ENTR ? 1:KFREE_DRAIN_JIFFIES; + if (delayed_work_pending(&krcp->monitor_work)) { + delay_left = krcp->monitor_work.timer.expires - jiffies; + if (delay < delay_left) + mod_delayed_work(system_wq, &krcp->monitor_work, delay); + return; + } + queue_delayed_work(system_wq, &krcp->monitor_work, delay); +} + /* * This function is invoked after the KFREE_DRAIN_JIFFIES timeout. */ @@ -3150,7 +3168,7 @@ static void kfree_rcu_monitor(struct work_struct *work) // work to repeat an attempt. Because previous batches are // still in progress. if (need_offload_krc(krcp)) - schedule_delayed_work(&krcp->monitor_work, KFREE_DRAIN_JIFFIES); + schedule_delayed_monitor_work(krcp); raw_spin_unlock_irqrestore(&krcp->lock, flags); } @@ -3183,15 +3201,16 @@ static void fill_page_cache_func(struct work_struct *work) bnode = (struct kvfree_rcu_bulk_data *) __get_free_page(GFP_KERNEL | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN); - if (bnode) { - raw_spin_lock_irqsave(&krcp->lock, flags); - pushed = put_cached_bnode(krcp, bnode); - raw_spin_unlock_irqrestore(&krcp->lock, flags); + if (!bnode) + break; - if (!pushed) { - free_page((unsigned long) bnode); - break; - } + raw_spin_lock_irqsave(&krcp->lock, flags); + pushed = put_cached_bnode(krcp, bnode); + raw_spin_unlock_irqrestore(&krcp->lock, flags); + + if (!pushed) { + free_page((unsigned long) bnode); + break; } } @@ -3338,7 +3357,7 @@ void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func) // Set timer to drain after KFREE_DRAIN_JIFFIES. if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING) - schedule_delayed_work(&krcp->monitor_work, KFREE_DRAIN_JIFFIES); + schedule_delayed_monitor_work(krcp); unlock_return: krc_this_cpu_unlock(krcp, flags); @@ -3371,7 +3390,7 @@ kfree_rcu_shrink_count(struct shrinker *shrink, struct shrink_control *sc) atomic_set(&krcp->backoff_page_cache_fill, 1); } - return count; + return count == 0 ? SHRINK_EMPTY : count; } static unsigned long @@ -3414,49 +3433,27 @@ void __init kfree_rcu_scheduler_running(void) raw_spin_lock_irqsave(&krcp->lock, flags); if (need_offload_krc(krcp)) - schedule_delayed_work_on(cpu, &krcp->monitor_work, KFREE_DRAIN_JIFFIES); + schedule_delayed_monitor_work(krcp); raw_spin_unlock_irqrestore(&krcp->lock, flags); } } /* * During early boot, any blocking grace-period wait automatically - * implies a grace period. Later on, this is never the case for PREEMPTION. + * implies a grace period. * - * However, because a context switch is a grace period for !PREEMPTION, any - * blocking grace-period wait automatically implies a grace period if - * there is only one CPU online at any point time during execution of - * either synchronize_rcu() or synchronize_rcu_expedited(). It is OK to - * occasionally incorrectly indicate that there are multiple CPUs online - * when there was in fact only one the whole time, as this just adds some - * overhead: RCU still operates correctly. + * Later on, this could in theory be the case for kernels built with + * CONFIG_SMP=y && CONFIG_PREEMPTION=y running on a single CPU, but this + * is not a common case. Furthermore, this optimization would cause + * the rcu_gp_oldstate structure to expand by 50%, so this potential + * grace-period optimization is ignored once the scheduler is running. */ static int rcu_blocking_is_gp(void) { - int ret; - - // Invoking preempt_model_*() too early gets a splat. - if (rcu_scheduler_active == RCU_SCHEDULER_INACTIVE || - preempt_model_full() || preempt_model_rt()) - return rcu_scheduler_active == RCU_SCHEDULER_INACTIVE; + if (rcu_scheduler_active != RCU_SCHEDULER_INACTIVE) + return false; might_sleep(); /* Check for RCU read-side critical section. */ - preempt_disable(); - /* - * If the rcu_state.n_online_cpus counter is equal to one, - * there is only one CPU, and that CPU sees all prior accesses - * made by any CPU that was online at the time of its access. - * Furthermore, if this counter is equal to one, its value cannot - * change until after the preempt_enable() below. - * - * Furthermore, if rcu_state.n_online_cpus is equal to one here, - * all later CPUs (both this one and any that come online later - * on) are guaranteed to see all accesses prior to this point - * in the code, without the need for additional memory barriers. - * Those memory barriers are provided by CPU-hotplug code. - */ - ret = READ_ONCE(rcu_state.n_online_cpus) <= 1; - preempt_enable(); - return ret; + return true; } /** @@ -3499,30 +3496,59 @@ static int rcu_blocking_is_gp(void) */ void synchronize_rcu(void) { + unsigned long flags; + struct rcu_node *rnp; + RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) || lock_is_held(&rcu_lock_map) || lock_is_held(&rcu_sched_lock_map), "Illegal synchronize_rcu() in RCU read-side critical section"); - if (rcu_blocking_is_gp()) { - // Note well that this code runs with !PREEMPT && !SMP. - // In addition, all code that advances grace periods runs at - // process level. Therefore, this normal GP overlaps with - // other normal GPs only by being fully nested within them, - // which allows reuse of ->gp_seq_polled_snap. - rcu_poll_gp_seq_start_unlocked(&rcu_state.gp_seq_polled_snap); - rcu_poll_gp_seq_end_unlocked(&rcu_state.gp_seq_polled_snap); - if (rcu_init_invoked()) - cond_resched_tasks_rcu_qs(); - return; // Context allows vacuous grace periods. + if (!rcu_blocking_is_gp()) { + if (rcu_gp_is_expedited()) + synchronize_rcu_expedited(); + else + wait_rcu_gp(call_rcu); + return; } - if (rcu_gp_is_expedited()) - synchronize_rcu_expedited(); - else - wait_rcu_gp(call_rcu); + + // Context allows vacuous grace periods. + // Note well that this code runs with !PREEMPT && !SMP. + // In addition, all code that advances grace periods runs at + // process level. Therefore, this normal GP overlaps with other + // normal GPs only by being fully nested within them, which allows + // reuse of ->gp_seq_polled_snap. + rcu_poll_gp_seq_start_unlocked(&rcu_state.gp_seq_polled_snap); + rcu_poll_gp_seq_end_unlocked(&rcu_state.gp_seq_polled_snap); + + // Update the normal grace-period counters to record + // this grace period, but only those used by the boot CPU. + // The rcu_scheduler_starting() will take care of the rest of + // these counters. + local_irq_save(flags); + WARN_ON_ONCE(num_online_cpus() > 1); + rcu_state.gp_seq += (1 << RCU_SEQ_CTR_SHIFT); + for (rnp = this_cpu_ptr(&rcu_data)->mynode; rnp; rnp = rnp->parent) + rnp->gp_seq_needed = rnp->gp_seq = rcu_state.gp_seq; + local_irq_restore(flags); } EXPORT_SYMBOL_GPL(synchronize_rcu); /** + * get_completed_synchronize_rcu_full - Return a full pre-completed polled state cookie + * @rgosp: Place to put state cookie + * + * Stores into @rgosp a value that will always be treated by functions + * like poll_state_synchronize_rcu_full() as a cookie whose grace period + * has already completed. + */ +void get_completed_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp) +{ + rgosp->rgos_norm = RCU_GET_STATE_COMPLETED; + rgosp->rgos_exp = RCU_GET_STATE_COMPLETED; +} +EXPORT_SYMBOL_GPL(get_completed_synchronize_rcu_full); + +/** * get_state_synchronize_rcu - Snapshot current RCU state * * Returns a cookie that is used by a later call to cond_synchronize_rcu() @@ -3541,21 +3567,42 @@ unsigned long get_state_synchronize_rcu(void) EXPORT_SYMBOL_GPL(get_state_synchronize_rcu); /** - * start_poll_synchronize_rcu - Snapshot and start RCU grace period + * get_state_synchronize_rcu_full - Snapshot RCU state, both normal and expedited + * @rgosp: location to place combined normal/expedited grace-period state * - * Returns a cookie that is used by a later call to cond_synchronize_rcu() - * or poll_state_synchronize_rcu() to determine whether or not a full - * grace period has elapsed in the meantime. If the needed grace period - * is not already slated to start, notifies RCU core of the need for that - * grace period. + * Places the normal and expedited grace-period states in @rgosp. This + * state value can be passed to a later call to cond_synchronize_rcu_full() + * or poll_state_synchronize_rcu_full() to determine whether or not a + * grace period (whether normal or expedited) has elapsed in the meantime. + * The rcu_gp_oldstate structure takes up twice the memory of an unsigned + * long, but is guaranteed to see all grace periods. In contrast, the + * combined state occupies less memory, but can sometimes fail to take + * grace periods into account. * - * Interrupts must be enabled for the case where it is necessary to awaken - * the grace-period kthread. + * This does not guarantee that the needed grace period will actually + * start. */ -unsigned long start_poll_synchronize_rcu(void) +void get_state_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp) +{ + struct rcu_node *rnp = rcu_get_root(); + + /* + * Any prior manipulation of RCU-protected data must happen + * before the loads from ->gp_seq and ->expedited_sequence. + */ + smp_mb(); /* ^^^ */ + rgosp->rgos_norm = rcu_seq_snap(&rnp->gp_seq); + rgosp->rgos_exp = rcu_seq_snap(&rcu_state.expedited_sequence); +} +EXPORT_SYMBOL_GPL(get_state_synchronize_rcu_full); + +/* + * Helper function for start_poll_synchronize_rcu() and + * start_poll_synchronize_rcu_full(). + */ +static void start_poll_synchronize_rcu_common(void) { unsigned long flags; - unsigned long gp_seq = get_state_synchronize_rcu(); bool needwake; struct rcu_data *rdp; struct rcu_node *rnp; @@ -3575,17 +3622,57 @@ unsigned long start_poll_synchronize_rcu(void) raw_spin_unlock_irqrestore_rcu_node(rnp, flags); if (needwake) rcu_gp_kthread_wake(); +} + +/** + * start_poll_synchronize_rcu - Snapshot and start RCU grace period + * + * Returns a cookie that is used by a later call to cond_synchronize_rcu() + * or poll_state_synchronize_rcu() to determine whether or not a full + * grace period has elapsed in the meantime. If the needed grace period + * is not already slated to start, notifies RCU core of the need for that + * grace period. + * + * Interrupts must be enabled for the case where it is necessary to awaken + * the grace-period kthread. + */ +unsigned long start_poll_synchronize_rcu(void) +{ + unsigned long gp_seq = get_state_synchronize_rcu(); + + start_poll_synchronize_rcu_common(); return gp_seq; } EXPORT_SYMBOL_GPL(start_poll_synchronize_rcu); /** - * poll_state_synchronize_rcu - Conditionally wait for an RCU grace period + * start_poll_synchronize_rcu_full - Take a full snapshot and start RCU grace period + * @rgosp: value from get_state_synchronize_rcu_full() or start_poll_synchronize_rcu_full() * + * Places the normal and expedited grace-period states in *@rgos. This + * state value can be passed to a later call to cond_synchronize_rcu_full() + * or poll_state_synchronize_rcu_full() to determine whether or not a + * grace period (whether normal or expedited) has elapsed in the meantime. + * If the needed grace period is not already slated to start, notifies + * RCU core of the need for that grace period. + * + * Interrupts must be enabled for the case where it is necessary to awaken + * the grace-period kthread. + */ +void start_poll_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp) +{ + get_state_synchronize_rcu_full(rgosp); + + start_poll_synchronize_rcu_common(); +} +EXPORT_SYMBOL_GPL(start_poll_synchronize_rcu_full); + +/** + * poll_state_synchronize_rcu - Has the specified RCU grace period completed? * @oldstate: value from get_state_synchronize_rcu() or start_poll_synchronize_rcu() * * If a full RCU grace period has elapsed since the earlier call from - * which oldstate was obtained, return @true, otherwise return @false. + * which @oldstate was obtained, return @true, otherwise return @false. * If @false is returned, it is the caller's responsibility to invoke this * function later on until it does return @true. Alternatively, the caller * can explicitly wait for a grace period, for example, by passing @oldstate @@ -3594,10 +3681,11 @@ EXPORT_SYMBOL_GPL(start_poll_synchronize_rcu); * Yes, this function does not take counter wrap into account. * But counter wrap is harmless. If the counter wraps, we have waited for * more than a billion grace periods (and way more on a 64-bit system!). - * Those needing to keep oldstate values for very long time periods - * (many hours even on 32-bit systems) should check them occasionally - * and either refresh them or set a flag indicating that the grace period - * has completed. + * Those needing to keep old state values for very long time periods + * (many hours even on 32-bit systems) should check them occasionally and + * either refresh them or set a flag indicating that the grace period has + * completed. Alternatively, they can use get_completed_synchronize_rcu() + * to get a guaranteed-completed grace-period state. * * This function provides the same memory-ordering guarantees that * would be provided by a synchronize_rcu() that was invoked at the call @@ -3616,8 +3704,56 @@ bool poll_state_synchronize_rcu(unsigned long oldstate) EXPORT_SYMBOL_GPL(poll_state_synchronize_rcu); /** - * cond_synchronize_rcu - Conditionally wait for an RCU grace period + * poll_state_synchronize_rcu_full - Has the specified RCU grace period completed? + * @rgosp: value from get_state_synchronize_rcu_full() or start_poll_synchronize_rcu_full() * + * If a full RCU grace period has elapsed since the earlier call from + * which *rgosp was obtained, return @true, otherwise return @false. + * If @false is returned, it is the caller's responsibility to invoke this + * function later on until it does return @true. Alternatively, the caller + * can explicitly wait for a grace period, for example, by passing @rgosp + * to cond_synchronize_rcu() or by directly invoking synchronize_rcu(). + * + * Yes, this function does not take counter wrap into account. + * But counter wrap is harmless. If the counter wraps, we have waited + * for more than a billion grace periods (and way more on a 64-bit + * system!). Those needing to keep rcu_gp_oldstate values for very + * long time periods (many hours even on 32-bit systems) should check + * them occasionally and either refresh them or set a flag indicating + * that the grace period has completed. Alternatively, they can use + * get_completed_synchronize_rcu_full() to get a guaranteed-completed + * grace-period state. + * + * This function provides the same memory-ordering guarantees that would + * be provided by a synchronize_rcu() that was invoked at the call to + * the function that provided @rgosp, and that returned at the end of this + * function. And this guarantee requires that the root rcu_node structure's + * ->gp_seq field be checked instead of that of the rcu_state structure. + * The problem is that the just-ending grace-period's callbacks can be + * invoked between the time that the root rcu_node structure's ->gp_seq + * field is updated and the time that the rcu_state structure's ->gp_seq + * field is updated. Therefore, if a single synchronize_rcu() is to + * cause a subsequent poll_state_synchronize_rcu_full() to return @true, + * then the root rcu_node structure is the one that needs to be polled. + */ +bool poll_state_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp) +{ + struct rcu_node *rnp = rcu_get_root(); + + smp_mb(); // Order against root rcu_node structure grace-period cleanup. + if (rgosp->rgos_norm == RCU_GET_STATE_COMPLETED || + rcu_seq_done_exact(&rnp->gp_seq, rgosp->rgos_norm) || + rgosp->rgos_exp == RCU_GET_STATE_COMPLETED || + rcu_seq_done_exact(&rcu_state.expedited_sequence, rgosp->rgos_exp)) { + smp_mb(); /* Ensure GP ends before subsequent accesses. */ + return true; + } + return false; +} +EXPORT_SYMBOL_GPL(poll_state_synchronize_rcu_full); + +/** + * cond_synchronize_rcu - Conditionally wait for an RCU grace period * @oldstate: value from get_state_synchronize_rcu(), start_poll_synchronize_rcu(), or start_poll_synchronize_rcu_expedited() * * If a full RCU grace period has elapsed since the earlier call to @@ -3641,6 +3777,33 @@ void cond_synchronize_rcu(unsigned long oldstate) } EXPORT_SYMBOL_GPL(cond_synchronize_rcu); +/** + * cond_synchronize_rcu_full - Conditionally wait for an RCU grace period + * @rgosp: value from get_state_synchronize_rcu_full(), start_poll_synchronize_rcu_full(), or start_poll_synchronize_rcu_expedited_full() + * + * If a full RCU grace period has elapsed since the call to + * get_state_synchronize_rcu_full(), start_poll_synchronize_rcu_full(), + * or start_poll_synchronize_rcu_expedited_full() from which @rgosp was + * obtained, just return. Otherwise, invoke synchronize_rcu() to wait + * for a full grace period. + * + * Yes, this function does not take counter wrap into account. + * But counter wrap is harmless. If the counter wraps, we have waited for + * more than 2 billion grace periods (and way more on a 64-bit system!), + * so waiting for a couple of additional grace periods should be just fine. + * + * This function provides the same memory-ordering guarantees that + * would be provided by a synchronize_rcu() that was invoked at the call + * to the function that provided @rgosp and that returned at the end of + * this function. + */ +void cond_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp) +{ + if (!poll_state_synchronize_rcu_full(rgosp)) + synchronize_rcu(); +} +EXPORT_SYMBOL_GPL(cond_synchronize_rcu_full); + /* * Check to see if there is any immediate RCU-related work to be done by * the current CPU, returning 1 if so and zero otherwise. The checks are @@ -4312,9 +4475,20 @@ early_initcall(rcu_spawn_gp_kthread); */ void rcu_scheduler_starting(void) { + unsigned long flags; + struct rcu_node *rnp; + WARN_ON(num_online_cpus() != 1); WARN_ON(nr_context_switches() > 0); rcu_test_sync_prims(); + + // Fix up the ->gp_seq counters. + local_irq_save(flags); + rcu_for_each_node_breadth_first(rnp) + rnp->gp_seq_needed = rnp->gp_seq = rcu_state.gp_seq; + local_irq_restore(flags); + + // Switch out of early boot mode. rcu_scheduler_active = RCU_SCHEDULER_INIT; rcu_test_sync_prims(); } diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index be667583a554..18e9b4cd78ef 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -828,11 +828,13 @@ static void rcu_exp_handler(void *unused) { struct rcu_data *rdp = this_cpu_ptr(&rcu_data); struct rcu_node *rnp = rdp->mynode; + bool preempt_bh_enabled = !(preempt_count() & (PREEMPT_MASK | SOFTIRQ_MASK)); if (!(READ_ONCE(rnp->expmask) & rdp->grpmask) || __this_cpu_read(rcu_data.cpu_no_qs.b.exp)) return; - if (rcu_is_cpu_rrupt_from_idle()) { + if (rcu_is_cpu_rrupt_from_idle() || + (IS_ENABLED(CONFIG_PREEMPT_COUNT) && preempt_bh_enabled)) { rcu_report_exp_rdp(this_cpu_ptr(&rcu_data)); return; } @@ -906,6 +908,7 @@ static int rcu_print_task_exp_stall(struct rcu_node *rnp) void synchronize_rcu_expedited(void) { bool boottime = (rcu_scheduler_active == RCU_SCHEDULER_INIT); + unsigned long flags; struct rcu_exp_work rew; struct rcu_node *rnp; unsigned long s; @@ -924,8 +927,11 @@ void synchronize_rcu_expedited(void) // them, which allows reuse of ->gp_seq_polled_exp_snap. rcu_poll_gp_seq_start_unlocked(&rcu_state.gp_seq_polled_exp_snap); rcu_poll_gp_seq_end_unlocked(&rcu_state.gp_seq_polled_exp_snap); - if (rcu_init_invoked()) - cond_resched(); + + local_irq_save(flags); + WARN_ON_ONCE(num_online_cpus() > 1); + rcu_state.expedited_sequence += (1 << RCU_SEQ_CTR_SHIFT); + local_irq_restore(flags); return; // Context allows vacuous grace periods. } @@ -1028,6 +1034,24 @@ unsigned long start_poll_synchronize_rcu_expedited(void) EXPORT_SYMBOL_GPL(start_poll_synchronize_rcu_expedited); /** + * start_poll_synchronize_rcu_expedited_full - Take a full snapshot and start expedited grace period + * @rgosp: Place to put snapshot of grace-period state + * + * Places the normal and expedited grace-period states in rgosp. This + * state value can be passed to a later call to cond_synchronize_rcu_full() + * or poll_state_synchronize_rcu_full() to determine whether or not a + * grace period (whether normal or expedited) has elapsed in the meantime. + * If the needed expedited grace period is not already slated to start, + * initiates that grace period. + */ +void start_poll_synchronize_rcu_expedited_full(struct rcu_gp_oldstate *rgosp) +{ + get_state_synchronize_rcu_full(rgosp); + (void)start_poll_synchronize_rcu_expedited(); +} +EXPORT_SYMBOL_GPL(start_poll_synchronize_rcu_expedited_full); + +/** * cond_synchronize_rcu_expedited - Conditionally wait for an expedited RCU grace period * * @oldstate: value from get_state_synchronize_rcu(), start_poll_synchronize_rcu(), or start_poll_synchronize_rcu_expedited() @@ -1053,3 +1077,30 @@ void cond_synchronize_rcu_expedited(unsigned long oldstate) synchronize_rcu_expedited(); } EXPORT_SYMBOL_GPL(cond_synchronize_rcu_expedited); + +/** + * cond_synchronize_rcu_expedited_full - Conditionally wait for an expedited RCU grace period + * @rgosp: value from get_state_synchronize_rcu_full(), start_poll_synchronize_rcu_full(), or start_poll_synchronize_rcu_expedited_full() + * + * If a full RCU grace period has elapsed since the call to + * get_state_synchronize_rcu_full(), start_poll_synchronize_rcu_full(), + * or start_poll_synchronize_rcu_expedited_full() from which @rgosp was + * obtained, just return. Otherwise, invoke synchronize_rcu_expedited() + * to wait for a full grace period. + * + * Yes, this function does not take counter wrap into account. + * But counter wrap is harmless. If the counter wraps, we have waited for + * more than 2 billion grace periods (and way more on a 64-bit system!), + * so waiting for a couple of additional grace periods should be just fine. + * + * This function provides the same memory-ordering guarantees that + * would be provided by a synchronize_rcu() that was invoked at the call + * to the function that provided @rgosp and that returned at the end of + * this function. + */ +void cond_synchronize_rcu_expedited_full(struct rcu_gp_oldstate *rgosp) +{ + if (!poll_state_synchronize_rcu_full(rgosp)) + synchronize_rcu_expedited(); +} +EXPORT_SYMBOL_GPL(cond_synchronize_rcu_expedited_full); diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h index a8f574d8850d..0a5f0ef41484 100644 --- a/kernel/rcu/tree_nocb.h +++ b/kernel/rcu/tree_nocb.h @@ -1111,7 +1111,7 @@ int rcu_nocb_cpu_deoffload(int cpu) if (!ret) cpumask_clear_cpu(cpu, rcu_nocb_mask); } else { - pr_info("NOCB: Can't CB-deoffload an offline CPU\n"); + pr_info("NOCB: Cannot CB-deoffload offline CPU %d\n", rdp->cpu); ret = -EINVAL; } } @@ -1196,7 +1196,7 @@ int rcu_nocb_cpu_offload(int cpu) if (!ret) cpumask_set_cpu(cpu, rcu_nocb_mask); } else { - pr_info("NOCB: Can't CB-offload an offline CPU\n"); + pr_info("NOCB: Cannot CB-offload offline CPU %d\n", rdp->cpu); ret = -EINVAL; } } @@ -1452,8 +1452,8 @@ static void show_rcu_nocb_gp_state(struct rcu_data *rdp) (long)rdp->nocb_gp_seq, rnp->grplo, rnp->grphi, READ_ONCE(rdp->nocb_gp_loops), rdp->nocb_gp_kthread ? task_state_to_char(rdp->nocb_gp_kthread) : '.', - rdp->nocb_cb_kthread ? (int)task_cpu(rdp->nocb_gp_kthread) : -1, - show_rcu_should_be_on_cpu(rdp->nocb_cb_kthread)); + rdp->nocb_gp_kthread ? (int)task_cpu(rdp->nocb_gp_kthread) : -1, + show_rcu_should_be_on_cpu(rdp->nocb_gp_kthread)); } /* Dump out nocb kthread state for the specified rcu_data structure. */ @@ -1497,7 +1497,7 @@ static void show_rcu_nocb_state(struct rcu_data *rdp) ".B"[!!rcu_cblist_n_cbs(&rdp->nocb_bypass)], rcu_segcblist_n_cbs(&rdp->cblist), rdp->nocb_cb_kthread ? task_state_to_char(rdp->nocb_cb_kthread) : '.', - rdp->nocb_cb_kthread ? (int)task_cpu(rdp->nocb_gp_kthread) : -1, + rdp->nocb_cb_kthread ? (int)task_cpu(rdp->nocb_cb_kthread) : -1, show_rcu_should_be_on_cpu(rdp->nocb_cb_kthread)); /* It is OK for GP kthreads to have GP state. */ diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 438ecae6bd7e..e3142ee35fc6 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -641,7 +641,8 @@ static void rcu_read_unlock_special(struct task_struct *t) expboost = (t->rcu_blocked_node && READ_ONCE(t->rcu_blocked_node->exp_tasks)) || (rdp->grpmask & READ_ONCE(rnp->expmask)) || - IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD) || + (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD) && + ((rdp->grpmask & READ_ONCE(rnp->qsmask)) || t->rcu_blocked_node)) || (IS_ENABLED(CONFIG_RCU_BOOST) && irqs_were_disabled && t->rcu_blocked_node); // Need to defer quiescent state until everything is enabled. @@ -718,9 +719,6 @@ static void rcu_flavor_sched_clock_irq(int user) struct task_struct *t = current; lockdep_assert_irqs_disabled(); - if (user || rcu_is_cpu_rrupt_from_idle()) { - rcu_note_voluntary_context_switch(current); - } if (rcu_preempt_depth() > 0 || (preempt_count() & (PREEMPT_MASK | SOFTIRQ_MASK))) { /* No QS, force context switch if deferred. */ @@ -824,6 +822,7 @@ void rcu_read_unlock_strict(void) if (irqs_disabled() || preempt_count() || !rcu_state.gp_kthread) return; rdp = this_cpu_ptr(&rcu_data); + rdp->cpu_no_qs.b.norm = false; rcu_report_qs_rdp(rdp); udelay(rcu_unlock_delay); } @@ -869,7 +868,7 @@ void rcu_all_qs(void) if (!raw_cpu_read(rcu_data.rcu_urgent_qs)) return; - preempt_disable(); + preempt_disable(); // For CONFIG_PREEMPT_COUNT=y kernels /* Load rcu_urgent_qs before other flags. */ if (!smp_load_acquire(this_cpu_ptr(&rcu_data.rcu_urgent_qs))) { preempt_enable(); @@ -931,10 +930,13 @@ static notrace bool rcu_preempt_need_deferred_qs(struct task_struct *t) return false; } -// Except that we do need to respond to a request by an expedited grace -// period for a quiescent state from this CPU. Note that requests from -// tasks are handled when removing the task from the blocked-tasks list -// below. +// Except that we do need to respond to a request by an expedited +// grace period for a quiescent state from this CPU. Note that in +// non-preemptible kernels, there can be no context switches within RCU +// read-side critical sections, which in turn means that the leaf rcu_node +// structure's blocked-tasks list is always empty. is therefore no need to +// actually check it. Instead, a quiescent state from this CPU suffices, +// and this function is only called from such a quiescent state. notrace void rcu_preempt_deferred_qs(struct task_struct *t) { struct rcu_data *rdp = this_cpu_ptr(&rcu_data); @@ -972,7 +974,6 @@ static void rcu_flavor_sched_clock_irq(int user) * neither access nor modify, at least not while the * corresponding CPU is online. */ - rcu_qs(); } } @@ -1238,8 +1239,11 @@ static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu) cpu != outgoingcpu) cpumask_set_cpu(cpu, cm); cpumask_and(cm, cm, housekeeping_cpumask(HK_TYPE_RCU)); - if (cpumask_empty(cm)) + if (cpumask_empty(cm)) { cpumask_copy(cm, housekeeping_cpumask(HK_TYPE_RCU)); + if (outgoingcpu >= 0) + cpumask_clear_cpu(outgoingcpu, cm); + } set_cpus_allowed_ptr(t, cm); mutex_unlock(&rnp->boost_kthread_mutex); free_cpumask_var(cm); diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index c3fbbcc09327..5653560573e2 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -368,7 +368,7 @@ static void rcu_dump_cpu_stacks(void) if (rnp->qsmask & leaf_node_cpu_bit(rnp, cpu)) { if (cpu_is_offline(cpu)) pr_err("Offline CPU %d blocking current GP.\n", cpu); - else if (!trigger_single_cpu_backtrace(cpu)) + else dump_cpu_task(cpu); } raw_spin_unlock_irqrestore_rcu_node(rnp, flags); @@ -511,8 +511,7 @@ static void rcu_check_gp_kthread_starvation(void) pr_err("RCU GP kthread last ran on offline CPU %d.\n", cpu); } else { pr_err("Stack dump where RCU GP kthread last ran:\n"); - if (!trigger_single_cpu_backtrace(cpu)) - dump_cpu_task(cpu); + dump_cpu_task(cpu); } } wake_up_process(gpk); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index ee28253c9ac0..60fdc0faf1c9 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -73,6 +73,7 @@ #include <uapi/linux/sched/types.h> +#include <asm/irq_regs.h> #include <asm/switch_to.h> #include <asm/tlb.h> @@ -11183,6 +11184,19 @@ struct cgroup_subsys cpu_cgrp_subsys = { void dump_cpu_task(int cpu) { + if (cpu == smp_processor_id() && in_hardirq()) { + struct pt_regs *regs; + + regs = get_irq_regs(); + if (regs) { + show_regs(regs); + return; + } + } + + if (trigger_single_cpu_backtrace(cpu)) + return; + pr_info("Task dump for CPU %d:\n", cpu); sched_show_task(cpu_curr(cpu)); } diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 1207c78f85c1..9161d1136d01 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -25,6 +25,9 @@ struct sugov_policy { unsigned int next_freq; unsigned int cached_raw_freq; + /* max CPU capacity, which is equal for all CPUs in freq. domain */ + unsigned long max; + /* The next fields are only needed if fast switch cannot be used: */ struct irq_work irq_work; struct kthread_work work; @@ -48,7 +51,6 @@ struct sugov_cpu { unsigned long util; unsigned long bw_dl; - unsigned long max; /* The field below is for single-CPU policies only: */ #ifdef CONFIG_NO_HZ_COMMON @@ -158,7 +160,6 @@ static void sugov_get_util(struct sugov_cpu *sg_cpu) { struct rq *rq = cpu_rq(sg_cpu->cpu); - sg_cpu->max = arch_scale_cpu_capacity(sg_cpu->cpu); sg_cpu->bw_dl = cpu_bw_dl(rq); sg_cpu->util = effective_cpu_util(sg_cpu->cpu, cpu_util_cfs(sg_cpu->cpu), FREQUENCY_UTIL, NULL); @@ -253,6 +254,7 @@ static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, u64 time, */ static void sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time) { + struct sugov_policy *sg_policy = sg_cpu->sg_policy; unsigned long boost; /* No boost currently required */ @@ -280,7 +282,8 @@ static void sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time) * sg_cpu->util is already in capacity scale; convert iowait_boost * into the same scale so we can compare. */ - boost = (sg_cpu->iowait_boost * sg_cpu->max) >> SCHED_CAPACITY_SHIFT; + boost = sg_cpu->iowait_boost * sg_policy->max; + boost >>= SCHED_CAPACITY_SHIFT; boost = uclamp_rq_util_with(cpu_rq(sg_cpu->cpu), boost, NULL); if (sg_cpu->util < boost) sg_cpu->util = boost; @@ -337,7 +340,7 @@ static void sugov_update_single_freq(struct update_util_data *hook, u64 time, if (!sugov_update_single_common(sg_cpu, time, flags)) return; - next_f = get_next_freq(sg_policy, sg_cpu->util, sg_cpu->max); + next_f = get_next_freq(sg_policy, sg_cpu->util, sg_policy->max); /* * Do not reduce the frequency if the CPU has not been idle * recently, as the reduction is likely to be premature then. @@ -373,6 +376,7 @@ static void sugov_update_single_perf(struct update_util_data *hook, u64 time, unsigned int flags) { struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util); + struct sugov_policy *sg_policy = sg_cpu->sg_policy; unsigned long prev_util = sg_cpu->util; /* @@ -399,7 +403,8 @@ static void sugov_update_single_perf(struct update_util_data *hook, u64 time, sg_cpu->util = prev_util; cpufreq_driver_adjust_perf(sg_cpu->cpu, map_util_perf(sg_cpu->bw_dl), - map_util_perf(sg_cpu->util), sg_cpu->max); + map_util_perf(sg_cpu->util), + sg_policy->max); sg_cpu->sg_policy->last_freq_update_time = time; } @@ -408,25 +413,19 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time) { struct sugov_policy *sg_policy = sg_cpu->sg_policy; struct cpufreq_policy *policy = sg_policy->policy; - unsigned long util = 0, max = 1; + unsigned long util = 0; unsigned int j; for_each_cpu(j, policy->cpus) { struct sugov_cpu *j_sg_cpu = &per_cpu(sugov_cpu, j); - unsigned long j_util, j_max; sugov_get_util(j_sg_cpu); sugov_iowait_apply(j_sg_cpu, time); - j_util = j_sg_cpu->util; - j_max = j_sg_cpu->max; - if (j_util * max > j_max * util) { - util = j_util; - max = j_max; - } + util = max(j_sg_cpu->util, util); } - return get_next_freq(sg_policy, util, max); + return get_next_freq(sg_policy, util, sg_policy->max); } static void @@ -752,7 +751,7 @@ static int sugov_start(struct cpufreq_policy *policy) { struct sugov_policy *sg_policy = policy->governor_data; void (*uu)(struct update_util_data *data, u64 time, unsigned int flags); - unsigned int cpu; + unsigned int cpu = cpumask_first(policy->cpus); sg_policy->freq_update_delay_ns = sg_policy->tunables->rate_limit_us * NSEC_PER_USEC; sg_policy->last_freq_update_time = 0; @@ -760,6 +759,7 @@ static int sugov_start(struct cpufreq_policy *policy) sg_policy->work_in_progress = false; sg_policy->limits_changed = false; sg_policy->cached_raw_freq = 0; + sg_policy->max = arch_scale_cpu_capacity(cpu); sg_policy->need_freq_update = cpufreq_driver_test_flags(CPUFREQ_NEED_UPDATE_LIMITS); diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index bb3d63bdf4ae..667876da8382 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -416,7 +416,7 @@ void update_sched_domain_debugfs(void) char buf[32]; snprintf(buf, sizeof(buf), "cpu%d", cpu); - debugfs_remove(debugfs_lookup(buf, sd_dentry)); + debugfs_lookup_and_remove(buf, sd_dentry); d_cpu = debugfs_create_dir(buf, sd_dentry); i = 0; diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index ec66b40bdd40..ecb4b4ff4ce0 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -190,12 +190,8 @@ static void group_init(struct psi_group *group) /* Init trigger-related members */ mutex_init(&group->trigger_lock); INIT_LIST_HEAD(&group->triggers); - memset(group->nr_triggers, 0, sizeof(group->nr_triggers)); - group->poll_states = 0; group->poll_min_period = U32_MAX; - memset(group->polling_total, 0, sizeof(group->polling_total)); group->polling_next_update = ULLONG_MAX; - group->polling_until = 0; init_waitqueue_head(&group->poll_wait); timer_setup(&group->poll_timer, poll_timer_fn, 0); rcu_assign_pointer(group->poll_task, NULL); @@ -957,7 +953,7 @@ int psi_cgroup_alloc(struct cgroup *cgroup) if (static_branch_likely(&psi_disabled)) return 0; - cgroup->psi = kmalloc(sizeof(struct psi_group), GFP_KERNEL); + cgroup->psi = kzalloc(sizeof(struct psi_group), GFP_KERNEL); if (!cgroup->psi) return -ENOMEM; @@ -1091,7 +1087,7 @@ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res) } struct psi_trigger *psi_trigger_create(struct psi_group *group, - char *buf, size_t nbytes, enum psi_res res) + char *buf, enum psi_res res) { struct psi_trigger *t; enum psi_states state; @@ -1320,7 +1316,7 @@ static ssize_t psi_write(struct file *file, const char __user *user_buf, return -EBUSY; } - new = psi_trigger_create(&psi_system, buf, nbytes, res); + new = psi_trigger_create(&psi_system, buf, res); if (IS_ERR(new)) { mutex_unlock(&seq->lock); return PTR_ERR(new); diff --git a/kernel/sched/wait_bit.c b/kernel/sched/wait_bit.c index d4788f810b55..0b1cd985dc27 100644 --- a/kernel/sched/wait_bit.c +++ b/kernel/sched/wait_bit.c @@ -47,7 +47,7 @@ __wait_on_bit(struct wait_queue_head *wq_head, struct wait_bit_queue_entry *wbq_ prepare_to_wait(wq_head, &wbq_entry->wq_entry, mode); if (test_bit(wbq_entry->key.bit_nr, wbq_entry->key.flags)) ret = (*action)(&wbq_entry->key, mode); - } while (test_bit(wbq_entry->key.bit_nr, wbq_entry->key.flags) && !ret); + } while (test_bit_acquire(wbq_entry->key.bit_nr, wbq_entry->key.flags) && !ret); finish_wait(wq_head, &wbq_entry->wq_entry); diff --git a/kernel/smp.c b/kernel/smp.c index 650810a6f29b..e8cdc025a046 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -370,8 +370,7 @@ static bool csd_lock_wait_toolong(struct __call_single_data *csd, u64 ts0, u64 * if (cpu >= 0) { if (static_branch_unlikely(&csdlock_debug_extended)) csd_lock_print_extended(csd, cpu); - if (!trigger_single_cpu_backtrace(cpu)) - dump_cpu_task(cpu); + dump_cpu_task(cpu); if (!cpu_cur_csd) { pr_alert("csd: Re-sending CSD lock (#%d) IPI from CPU#%02d to CPU#%02d\n", *bug_id, raw_smp_processor_id(), cpu); arch_send_call_function_single_ipi(cpu); diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index a492f159624f..860b2dcf3ac4 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -277,6 +277,7 @@ COND_SYSCALL(landlock_restrict_self); /* mm/fadvise.c */ COND_SYSCALL(fadvise64_64); +COND_SYSCALL_COMPAT(fadvise64_64); /* mm/, CONFIG_MMU only */ COND_SYSCALL(swapon); diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index bc921a3f7ea8..439e2ab6905e 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1861,8 +1861,6 @@ static void ftrace_hash_rec_enable_modify(struct ftrace_ops *ops, ftrace_hash_rec_update_modify(ops, filter_hash, 1); } -static bool ops_references_ip(struct ftrace_ops *ops, unsigned long ip); - /* * Try to update IPMODIFY flag on each ftrace_rec. Return 0 if it is OK * or no-needed to update, -EBUSY if it detects a conflict of the flag @@ -2974,6 +2972,16 @@ int ftrace_startup(struct ftrace_ops *ops, int command) ftrace_startup_enable(command); + /* + * If ftrace is in an undefined state, we just remove ops from list + * to prevent the NULL pointer, instead of totally rolling it back and + * free trampoline, because those actions could cause further damage. + */ + if (unlikely(ftrace_disabled)) { + __unregister_ftrace_function(ops); + return -ENODEV; + } + ops->flags &= ~FTRACE_OPS_FL_ADDING; return 0; @@ -3108,49 +3116,6 @@ static inline int ops_traces_mod(struct ftrace_ops *ops) ftrace_hash_empty(ops->func_hash->notrace_hash); } -/* - * Check if the current ops references the given ip. - * - * If the ops traces all functions, then it was already accounted for. - * If the ops does not trace the current record function, skip it. - * If the ops ignores the function via notrace filter, skip it. - */ -static bool -ops_references_ip(struct ftrace_ops *ops, unsigned long ip) -{ - /* If ops isn't enabled, ignore it */ - if (!(ops->flags & FTRACE_OPS_FL_ENABLED)) - return false; - - /* If ops traces all then it includes this function */ - if (ops_traces_mod(ops)) - return true; - - /* The function must be in the filter */ - if (!ftrace_hash_empty(ops->func_hash->filter_hash) && - !__ftrace_lookup_ip(ops->func_hash->filter_hash, ip)) - return false; - - /* If in notrace hash, we ignore it too */ - if (ftrace_lookup_ip(ops->func_hash->notrace_hash, ip)) - return false; - - return true; -} - -/* - * Check if the current ops references the record. - * - * If the ops traces all functions, then it was already accounted for. - * If the ops does not trace the current record function, skip it. - * If the ops ignores the function via notrace filter, skip it. - */ -static bool -ops_references_rec(struct ftrace_ops *ops, struct dyn_ftrace *rec) -{ - return ops_references_ip(ops, rec->ip); -} - static int ftrace_update_code(struct module *mod, struct ftrace_page *new_pgs) { bool init_nop = ftrace_need_init_nop(); @@ -6812,6 +6777,38 @@ static int ftrace_get_trampoline_kallsym(unsigned int symnum, return -ERANGE; } +#if defined(CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS) || defined(CONFIG_MODULES) +/* + * Check if the current ops references the given ip. + * + * If the ops traces all functions, then it was already accounted for. + * If the ops does not trace the current record function, skip it. + * If the ops ignores the function via notrace filter, skip it. + */ +static bool +ops_references_ip(struct ftrace_ops *ops, unsigned long ip) +{ + /* If ops isn't enabled, ignore it */ + if (!(ops->flags & FTRACE_OPS_FL_ENABLED)) + return false; + + /* If ops traces all then it includes this function */ + if (ops_traces_mod(ops)) + return true; + + /* The function must be in the filter */ + if (!ftrace_hash_empty(ops->func_hash->filter_hash) && + !__ftrace_lookup_ip(ops->func_hash->filter_hash, ip)) + return false; + + /* If in notrace hash, we ignore it too */ + if (ftrace_lookup_ip(ops->func_hash->notrace_hash, ip)) + return false; + + return true; +} +#endif + #ifdef CONFIG_MODULES #define next_to_ftrace_page(p) container_of(p, struct ftrace_page, next) @@ -6824,7 +6821,7 @@ static int referenced_filters(struct dyn_ftrace *rec) int cnt = 0; for (ops = ftrace_ops_list; ops != &ftrace_list_end; ops = ops->next) { - if (ops_references_rec(ops, rec)) { + if (ops_references_ip(ops, rec->ip)) { if (WARN_ON_ONCE(ops->flags & FTRACE_OPS_FL_DIRECT)) continue; if (WARN_ON_ONCE(ops->flags & FTRACE_OPS_FL_IPMODIFY)) diff --git a/kernel/trace/rv/monitors/wip/wip.h b/kernel/trace/rv/monitors/wip/wip.h index c1c47e2305ef..dacc37b62a2c 100644 --- a/kernel/trace/rv/monitors/wip/wip.h +++ b/kernel/trace/rv/monitors/wip/wip.h @@ -27,7 +27,7 @@ struct automaton_wip { bool final_states[state_max_wip]; }; -struct automaton_wip automaton_wip = { +static struct automaton_wip automaton_wip = { .state_names = { "preemptive", "non_preemptive" diff --git a/kernel/trace/rv/monitors/wwnr/wwnr.h b/kernel/trace/rv/monitors/wwnr/wwnr.h index d1afe55cdd4c..118e576b91b4 100644 --- a/kernel/trace/rv/monitors/wwnr/wwnr.h +++ b/kernel/trace/rv/monitors/wwnr/wwnr.h @@ -27,7 +27,7 @@ struct automaton_wwnr { bool final_states[state_max_wwnr]; }; -struct automaton_wwnr automaton_wwnr = { +static struct automaton_wwnr automaton_wwnr = { .state_names = { "not_running", "running" diff --git a/kernel/trace/rv/reactor_panic.c b/kernel/trace/rv/reactor_panic.c index b698d05dd069..d65f6c25a87c 100644 --- a/kernel/trace/rv/reactor_panic.c +++ b/kernel/trace/rv/reactor_panic.c @@ -24,13 +24,13 @@ static struct rv_reactor rv_panic = { .react = rv_panic_reaction }; -static int register_react_panic(void) +static int __init register_react_panic(void) { rv_register_reactor(&rv_panic); return 0; } -static void unregister_react_panic(void) +static void __exit unregister_react_panic(void) { rv_unregister_reactor(&rv_panic); } diff --git a/kernel/trace/rv/reactor_printk.c b/kernel/trace/rv/reactor_printk.c index 31899f953af4..4b6b7106a477 100644 --- a/kernel/trace/rv/reactor_printk.c +++ b/kernel/trace/rv/reactor_printk.c @@ -23,13 +23,13 @@ static struct rv_reactor rv_printk = { .react = rv_printk_reaction }; -static int register_react_printk(void) +static int __init register_react_printk(void) { rv_register_reactor(&rv_printk); return 0; } -static void unregister_react_printk(void) +static void __exit unregister_react_printk(void) { rv_unregister_reactor(&rv_printk); } diff --git a/kernel/trace/trace_eprobe.c b/kernel/trace/trace_eprobe.c index 4a0e9d927443..1783e3478912 100644 --- a/kernel/trace/trace_eprobe.c +++ b/kernel/trace/trace_eprobe.c @@ -227,6 +227,7 @@ static int trace_eprobe_tp_arg_update(struct trace_eprobe *ep, int i) struct probe_arg *parg = &ep->tp.args[i]; struct ftrace_event_field *field; struct list_head *head; + int ret = -ENOENT; head = trace_get_fields(ep->event); list_for_each_entry(field, head, link) { @@ -236,9 +237,20 @@ static int trace_eprobe_tp_arg_update(struct trace_eprobe *ep, int i) return 0; } } + + /* + * Argument not found on event. But allow for comm and COMM + * to be used to get the current->comm. + */ + if (strcmp(parg->code->data, "COMM") == 0 || + strcmp(parg->code->data, "comm") == 0) { + parg->code->op = FETCH_OP_COMM; + ret = 0; + } + kfree(parg->code->data); parg->code->data = NULL; - return -ENOENT; + return ret; } static int eprobe_event_define_fields(struct trace_event_call *event_call) @@ -311,6 +323,27 @@ static unsigned long get_event_field(struct fetch_insn *code, void *rec) addr = rec + field->offset; + if (is_string_field(field)) { + switch (field->filter_type) { + case FILTER_DYN_STRING: + val = (unsigned long)(rec + (*(unsigned int *)addr & 0xffff)); + break; + case FILTER_RDYN_STRING: + val = (unsigned long)(addr + (*(unsigned int *)addr & 0xffff)); + break; + case FILTER_STATIC_STRING: + val = (unsigned long)addr; + break; + case FILTER_PTR_STRING: + val = (unsigned long)(*(char *)addr); + break; + default: + WARN_ON_ONCE(1); + return 0; + } + return val; + } + switch (field->size) { case 1: if (field->is_signed) @@ -342,16 +375,38 @@ static unsigned long get_event_field(struct fetch_insn *code, void *rec) static int get_eprobe_size(struct trace_probe *tp, void *rec) { + struct fetch_insn *code; struct probe_arg *arg; int i, len, ret = 0; for (i = 0; i < tp->nr_args; i++) { arg = tp->args + i; - if (unlikely(arg->dynamic)) { + if (arg->dynamic) { unsigned long val; - val = get_event_field(arg->code, rec); - len = process_fetch_insn_bottom(arg->code + 1, val, NULL, NULL); + code = arg->code; + retry: + switch (code->op) { + case FETCH_OP_TP_ARG: + val = get_event_field(code, rec); + break; + case FETCH_OP_IMM: + val = code->immediate; + break; + case FETCH_OP_COMM: + val = (unsigned long)current->comm; + break; + case FETCH_OP_DATA: + val = (unsigned long)code->data; + break; + case FETCH_NOP_SYMBOL: /* Ignore a place holder */ + code++; + goto retry; + default: + continue; + } + code++; + len = process_fetch_insn_bottom(code, val, NULL, NULL); if (len > 0) ret += len; } @@ -369,8 +424,28 @@ process_fetch_insn(struct fetch_insn *code, void *rec, void *dest, { unsigned long val; - val = get_event_field(code, rec); - return process_fetch_insn_bottom(code + 1, val, dest, base); + retry: + switch (code->op) { + case FETCH_OP_TP_ARG: + val = get_event_field(code, rec); + break; + case FETCH_OP_IMM: + val = code->immediate; + break; + case FETCH_OP_COMM: + val = (unsigned long)current->comm; + break; + case FETCH_OP_DATA: + val = (unsigned long)code->data; + break; + case FETCH_NOP_SYMBOL: /* Ignore a place holder */ + code++; + goto retry; + default: + return -EILSEQ; + } + code++; + return process_fetch_insn_bottom(code, val, dest, base); } NOKPROBE_SYMBOL(process_fetch_insn) @@ -845,6 +920,10 @@ static int trace_eprobe_tp_update_arg(struct trace_eprobe *ep, const char *argv[ trace_probe_log_err(0, BAD_ATTACH_ARG); } + /* Handle symbols "@" */ + if (!ret) + ret = traceprobe_update_arg(&ep->tp.args[i]); + return ret; } @@ -883,7 +962,7 @@ static int __trace_eprobe_create(int argc, const char *argv[]) trace_probe_log_set_index(1); sys_event = argv[1]; ret = traceprobe_parse_event_name(&sys_event, &sys_name, buf2, 0); - if (!sys_event || !sys_name) { + if (ret || !sys_event || !sys_name) { trace_probe_log_err(0, NO_EVENT_INFO); goto parse_error; } diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index a114549720d6..61e3a2620fa3 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -157,7 +157,7 @@ static void perf_trace_event_unreg(struct perf_event *p_event) int i; if (--tp_event->perf_refcount > 0) - goto out; + return; tp_event->class->reg(tp_event, TRACE_REG_PERF_UNREGISTER, NULL); @@ -176,8 +176,6 @@ static void perf_trace_event_unreg(struct perf_event *p_event) perf_trace_buf[i] = NULL; } } -out: - trace_event_put_ref(tp_event); } static int perf_trace_event_open(struct perf_event *p_event) @@ -241,6 +239,7 @@ void perf_trace_destroy(struct perf_event *p_event) mutex_lock(&event_mutex); perf_trace_event_close(p_event); perf_trace_event_unreg(p_event); + trace_event_put_ref(p_event->tp_event); mutex_unlock(&event_mutex); } @@ -292,6 +291,7 @@ void perf_kprobe_destroy(struct perf_event *p_event) mutex_lock(&event_mutex); perf_trace_event_close(p_event); perf_trace_event_unreg(p_event); + trace_event_put_ref(p_event->tp_event); mutex_unlock(&event_mutex); destroy_local_trace_kprobe(p_event->tp_event); @@ -347,6 +347,7 @@ void perf_uprobe_destroy(struct perf_event *p_event) mutex_lock(&event_mutex); perf_trace_event_close(p_event); perf_trace_event_unreg(p_event); + trace_event_put_ref(p_event->tp_event); mutex_unlock(&event_mutex); destroy_local_trace_uprobe(p_event->tp_event); } diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 181f08186d32..0356cae0cf74 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -176,6 +176,7 @@ static int trace_define_generic_fields(void) __generic_field(int, CPU, FILTER_CPU); __generic_field(int, cpu, FILTER_CPU); + __generic_field(int, common_cpu, FILTER_CPU); __generic_field(char *, COMM, FILTER_COMM); __generic_field(char *, comm, FILTER_COMM); diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index cb866c3141af..918730d74932 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -142,7 +142,8 @@ static bool check_user_trigger(struct trace_event_file *file) { struct event_trigger_data *data; - list_for_each_entry_rcu(data, &file->triggers, list) { + list_for_each_entry_rcu(data, &file->triggers, list, + lockdep_is_held(&event_mutex)) { if (data->flags & EVENT_TRIGGER_FL_PROBE) continue; return true; diff --git a/kernel/trace/trace_preemptirq.c b/kernel/trace/trace_preemptirq.c index 95b58bd757ce..1e130da1b742 100644 --- a/kernel/trace/trace_preemptirq.c +++ b/kernel/trace/trace_preemptirq.c @@ -95,14 +95,14 @@ __visible void trace_hardirqs_on_caller(unsigned long caller_addr) } lockdep_hardirqs_on_prepare(); - lockdep_hardirqs_on(CALLER_ADDR0); + lockdep_hardirqs_on(caller_addr); } EXPORT_SYMBOL(trace_hardirqs_on_caller); NOKPROBE_SYMBOL(trace_hardirqs_on_caller); __visible void trace_hardirqs_off_caller(unsigned long caller_addr) { - lockdep_hardirqs_off(CALLER_ADDR0); + lockdep_hardirqs_off(caller_addr); if (!this_cpu_read(tracing_irq_cpu)) { this_cpu_write(tracing_irq_cpu, 1); diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index 850a88abd33b..36dff277de46 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -283,7 +283,14 @@ static int parse_probe_vars(char *arg, const struct fetch_type *t, int ret = 0; int len; - if (strcmp(arg, "retval") == 0) { + if (flags & TPARG_FL_TPOINT) { + if (code->data) + return -EFAULT; + code->data = kstrdup(arg, GFP_KERNEL); + if (!code->data) + return -ENOMEM; + code->op = FETCH_OP_TP_ARG; + } else if (strcmp(arg, "retval") == 0) { if (flags & TPARG_FL_RETURN) { code->op = FETCH_OP_RETVAL; } else { @@ -307,7 +314,7 @@ static int parse_probe_vars(char *arg, const struct fetch_type *t, } } else goto inval_var; - } else if (strcmp(arg, "comm") == 0) { + } else if (strcmp(arg, "comm") == 0 || strcmp(arg, "COMM") == 0) { code->op = FETCH_OP_COMM; #ifdef CONFIG_HAVE_FUNCTION_ARG_ACCESS_API } else if (((flags & TPARG_FL_MASK) == @@ -323,13 +330,6 @@ static int parse_probe_vars(char *arg, const struct fetch_type *t, code->op = FETCH_OP_ARG; code->param = (unsigned int)param - 1; #endif - } else if (flags & TPARG_FL_TPOINT) { - if (code->data) - return -EFAULT; - code->data = kstrdup(arg, GFP_KERNEL); - if (!code->data) - return -ENOMEM; - code->op = FETCH_OP_TP_ARG; } else goto inval_var; @@ -384,6 +384,11 @@ parse_probe_arg(char *arg, const struct fetch_type *type, break; case '%': /* named register */ + if (flags & TPARG_FL_TPOINT) { + /* eprobes do not handle registers */ + trace_probe_log_err(offs, BAD_VAR); + break; + } ret = regs_query_register_offset(arg + 1); if (ret >= 0) { code->op = FETCH_OP_REG; @@ -617,9 +622,11 @@ static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size, /* * Since $comm and immediate string can not be dereferenced, - * we can find those by strcmp. + * we can find those by strcmp. But ignore for eprobes. */ - if (strcmp(arg, "$comm") == 0 || strncmp(arg, "\\\"", 2) == 0) { + if (!(flags & TPARG_FL_TPOINT) && + (strcmp(arg, "$comm") == 0 || strcmp(arg, "$COMM") == 0 || + strncmp(arg, "\\\"", 2) == 0)) { /* The type of $comm must be "string", and not an array. */ if (parg->count || (t && strcmp(t, "string"))) goto out; diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index 64ea283f2f86..ef42c1a11920 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -571,7 +571,8 @@ static void for_each_tracepoint_range( bool trace_module_has_bad_taint(struct module *mod) { return mod->taints & ~((1 << TAINT_OOT_MODULE) | (1 << TAINT_CRAP) | - (1 << TAINT_UNSIGNED_MODULE)); + (1 << TAINT_UNSIGNED_MODULE) | + (1 << TAINT_TEST)); } static BLOCKING_NOTIFIER_HEAD(tracepoint_notify_list); @@ -647,7 +648,7 @@ static int tracepoint_module_coming(struct module *mod) /* * We skip modules that taint the kernel, especially those with different * module headers (for forced load), to make sure we don't cause a crash. - * Staging, out-of-tree, and unsigned GPL modules are fine. + * Staging, out-of-tree, unsigned GPL, and test modules are fine. */ if (trace_module_has_bad_taint(mod)) return 0; diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 5481ba44a8d6..3f464bbda0e9 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -9,6 +9,7 @@ #include <linux/highuid.h> #include <linux/cred.h> #include <linux/securebits.h> +#include <linux/security.h> #include <linux/keyctl.h> #include <linux/key-type.h> #include <keys/user-type.h> @@ -113,6 +114,10 @@ int create_user_ns(struct cred *new) !kgid_has_mapping(parent_ns, group)) goto fail_dec; + ret = security_create_user_ns(new); + if (ret < 0) + goto fail_dec; + ret = -ENOMEM; ns = kmem_cache_zalloc(user_ns_cachep, GFP_KERNEL); if (!ns) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index aeea9731ef80..7cd5f5e7e0a1 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1651,7 +1651,7 @@ static void __queue_delayed_work(int cpu, struct workqueue_struct *wq, struct work_struct *work = &dwork->work; WARN_ON_ONCE(!wq); - WARN_ON_FUNCTION_MISMATCH(timer->function, delayed_work_timer_fn); + WARN_ON_ONCE(timer->function != delayed_work_timer_fn); WARN_ON_ONCE(timer_pending(timer)); WARN_ON_ONCE(!list_empty(&work->entry)); @@ -3066,10 +3066,8 @@ static bool __flush_work(struct work_struct *work, bool from_cancel) if (WARN_ON(!work->func)) return false; - if (!from_cancel) { - lock_map_acquire(&work->lockdep_map); - lock_map_release(&work->lockdep_map); - } + lock_map_acquire(&work->lockdep_map); + lock_map_release(&work->lockdep_map); if (start_flush_work(work, &barr, from_cancel)) { wait_for_completion(&barr.done); |