aboutsummaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/audit_fsnotify.c1
-rw-r--r--kernel/auditsc.c4
-rw-r--r--kernel/bpf/bpf_lsm.c1
-rw-r--r--kernel/bpf/cgroup.c4
-rw-r--r--kernel/bpf/core.c2
-rw-r--r--kernel/bpf/reuseport_array.c2
-rw-r--r--kernel/bpf/syscall.c2
-rw-r--r--kernel/bpf/verifier.c13
-rw-r--r--kernel/cfi.c352
-rw-r--r--kernel/cgroup/cgroup-v1.c2
-rw-r--r--kernel/cgroup/cgroup.c85
-rw-r--r--kernel/cgroup/cpuset.c3
-rw-r--r--kernel/configs/rust.config1
-rw-r--r--kernel/crash_core.c1
-rw-r--r--kernel/dma/debug.c6
-rw-r--r--kernel/dma/mapping.c3
-rw-r--r--kernel/dma/swiotlb.c13
-rw-r--r--kernel/events/core.c9
-rw-r--r--kernel/fork.c6
-rw-r--r--kernel/kallsyms.c43
-rw-r--r--kernel/kprobes.c10
-rw-r--r--kernel/kthread.c3
-rw-r--r--kernel/livepatch/core.c4
-rw-r--r--kernel/module/main.c52
-rw-r--r--kernel/nsproxy.c3
-rw-r--r--kernel/rcu/rcutorture.c290
-rw-r--r--kernel/rcu/srcutiny.c14
-rw-r--r--kernel/rcu/tasks.h5
-rw-r--r--kernel/rcu/tiny.c27
-rw-r--r--kernel/rcu/tree.c330
-rw-r--r--kernel/rcu/tree_exp.h57
-rw-r--r--kernel/rcu/tree_nocb.h10
-rw-r--r--kernel/rcu/tree_plugin.h26
-rw-r--r--kernel/rcu/tree_stall.h5
-rw-r--r--kernel/sched/core.c14
-rw-r--r--kernel/sched/cpufreq_schedutil.c30
-rw-r--r--kernel/sched/debug.c2
-rw-r--r--kernel/sched/psi.c10
-rw-r--r--kernel/sched/wait_bit.c2
-rw-r--r--kernel/smp.c3
-rw-r--r--kernel/sys_ni.c1
-rw-r--r--kernel/trace/ftrace.c89
-rw-r--r--kernel/trace/rv/monitors/wip/wip.h2
-rw-r--r--kernel/trace/rv/monitors/wwnr/wwnr.h2
-rw-r--r--kernel/trace/rv/reactor_panic.c4
-rw-r--r--kernel/trace/rv/reactor_printk.c4
-rw-r--r--kernel/trace/trace_eprobe.c93
-rw-r--r--kernel/trace/trace_event_perf.c7
-rw-r--r--kernel/trace/trace_events.c1
-rw-r--r--kernel/trace/trace_events_trigger.c3
-rw-r--r--kernel/trace/trace_preemptirq.c4
-rw-r--r--kernel/trace/trace_probe.c29
-rw-r--r--kernel/tracepoint.c5
-rw-r--r--kernel/user_namespace.c5
-rw-r--r--kernel/workqueue.c8
55 files changed, 1017 insertions, 690 deletions
diff --git a/kernel/audit_fsnotify.c b/kernel/audit_fsnotify.c
index 6432a37ac1c9..c565fbf66ac8 100644
--- a/kernel/audit_fsnotify.c
+++ b/kernel/audit_fsnotify.c
@@ -102,6 +102,7 @@ struct audit_fsnotify_mark *audit_alloc_mark(struct audit_krule *krule, char *pa
ret = fsnotify_add_inode_mark(&audit_mark->mark, inode, 0);
if (ret < 0) {
+ audit_mark->path = NULL;
fsnotify_put_mark(&audit_mark->mark);
audit_mark = ERR_PTR(ret);
}
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 280b4720c7a0..9f8c05228d6d 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -1940,6 +1940,7 @@ void __audit_uring_exit(int success, long code)
goto out;
}
+ audit_return_fixup(ctx, success, code);
if (ctx->context == AUDIT_CTX_SYSCALL) {
/*
* NOTE: See the note in __audit_uring_entry() about the case
@@ -1981,7 +1982,6 @@ void __audit_uring_exit(int success, long code)
audit_filter_inodes(current, ctx);
if (ctx->current_state != AUDIT_STATE_RECORD)
goto out;
- audit_return_fixup(ctx, success, code);
audit_log_exit();
out:
@@ -2065,13 +2065,13 @@ void __audit_syscall_exit(int success, long return_code)
if (!list_empty(&context->killed_trees))
audit_kill_trees(context);
+ audit_return_fixup(context, success, return_code);
/* run through both filters to ensure we set the filterkey properly */
audit_filter_syscall(current, context);
audit_filter_inodes(current, context);
if (context->current_state != AUDIT_STATE_RECORD)
goto out;
- audit_return_fixup(context, success, return_code);
audit_log_exit();
out:
diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c
index fa71d58b7ded..761998fda762 100644
--- a/kernel/bpf/bpf_lsm.c
+++ b/kernel/bpf/bpf_lsm.c
@@ -335,6 +335,7 @@ BTF_ID(func, bpf_lsm_task_getsecid_obj)
BTF_ID(func, bpf_lsm_task_prctl)
BTF_ID(func, bpf_lsm_task_setscheduler)
BTF_ID(func, bpf_lsm_task_to_inode)
+BTF_ID(func, bpf_lsm_userns_create)
BTF_SET_END(sleepable_lsm_hooks)
bool bpf_lsm_is_sleepable_hook(u32 btf_id)
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index 59b7eb60d5b4..4a400cd63731 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -921,8 +921,10 @@ static void purge_effective_progs(struct cgroup *cgrp, struct bpf_prog *prog,
pos++;
}
}
+
+ /* no link or prog match, skip the cgroup of this layer */
+ continue;
found:
- BUG_ON(!cg);
progs = rcu_dereference_protected(
desc->bpf.effective[atype],
lockdep_is_held(&cgroup_mutex));
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index c1e10d088dbb..3d9eb3ae334c 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -971,7 +971,7 @@ pure_initcall(bpf_jit_charge_init);
int bpf_jit_charge_modmem(u32 size)
{
- if (atomic_long_add_return(size, &bpf_jit_current) > bpf_jit_limit) {
+ if (atomic_long_add_return(size, &bpf_jit_current) > READ_ONCE(bpf_jit_limit)) {
if (!bpf_capable()) {
atomic_long_sub(size, &bpf_jit_current);
return -EPERM;
diff --git a/kernel/bpf/reuseport_array.c b/kernel/bpf/reuseport_array.c
index 85fa9dbfa8bf..82c61612f382 100644
--- a/kernel/bpf/reuseport_array.c
+++ b/kernel/bpf/reuseport_array.c
@@ -24,7 +24,7 @@ void bpf_sk_reuseport_detach(struct sock *sk)
struct sock __rcu **socks;
write_lock_bh(&sk->sk_callback_lock);
- socks = __rcu_dereference_sk_user_data_with_flags(sk, SK_USER_DATA_BPF);
+ socks = __locked_read_sk_user_data_with_flags(sk, SK_USER_DATA_BPF);
if (socks) {
WRITE_ONCE(sk->sk_user_data, NULL);
/*
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index a4d40d98428a..27760627370d 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -5197,7 +5197,7 @@ syscall_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
switch (func_id) {
case BPF_FUNC_sys_bpf:
- return &bpf_sys_bpf_proto;
+ return !perfmon_capable() ? NULL : &bpf_sys_bpf_proto;
case BPF_FUNC_btf_find_by_name_kind:
return &bpf_btf_find_by_name_kind_proto;
case BPF_FUNC_sys_close:
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 096fdac70165..3eadb14e090b 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -6066,6 +6066,9 @@ skip_type_check:
return -EACCES;
}
meta->mem_size = reg->var_off.value;
+ err = mark_chain_precision(env, regno);
+ if (err)
+ return err;
break;
case ARG_PTR_TO_INT:
case ARG_PTR_TO_LONG:
@@ -7030,8 +7033,7 @@ record_func_key(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta,
struct bpf_insn_aux_data *aux = &env->insn_aux_data[insn_idx];
struct bpf_reg_state *regs = cur_regs(env), *reg;
struct bpf_map *map = meta->map_ptr;
- struct tnum range;
- u64 val;
+ u64 val, max;
int err;
if (func_id != BPF_FUNC_tail_call)
@@ -7041,10 +7043,11 @@ record_func_key(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta,
return -EINVAL;
}
- range = tnum_range(0, map->max_entries - 1);
reg = &regs[BPF_REG_3];
+ val = reg->var_off.value;
+ max = map->max_entries;
- if (!register_is_const(reg) || !tnum_in(range, reg->var_off)) {
+ if (!(register_is_const(reg) && val < max)) {
bpf_map_key_store(aux, BPF_MAP_KEY_POISON);
return 0;
}
@@ -7052,8 +7055,6 @@ record_func_key(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta,
err = mark_chain_precision(env, BPF_REG_3);
if (err)
return err;
-
- val = reg->var_off.value;
if (bpf_map_key_unseen(aux))
bpf_map_key_store(aux, val);
else if (!bpf_map_key_poisoned(aux) &&
diff --git a/kernel/cfi.c b/kernel/cfi.c
index 2046276ee234..08caad776717 100644
--- a/kernel/cfi.c
+++ b/kernel/cfi.c
@@ -1,339 +1,101 @@
// SPDX-License-Identifier: GPL-2.0
/*
- * Clang Control Flow Integrity (CFI) error and slowpath handling.
+ * Clang Control Flow Integrity (CFI) error handling.
*
- * Copyright (C) 2021 Google LLC
+ * Copyright (C) 2022 Google LLC
*/
-#include <linux/hardirq.h>
-#include <linux/kallsyms.h>
-#include <linux/module.h>
-#include <linux/mutex.h>
-#include <linux/printk.h>
-#include <linux/ratelimit.h>
-#include <linux/rcupdate.h>
-#include <linux/vmalloc.h>
-#include <asm/cacheflush.h>
-#include <asm/set_memory.h>
+#include <linux/cfi.h>
-/* Compiler-defined handler names */
-#ifdef CONFIG_CFI_PERMISSIVE
-#define cfi_failure_handler __ubsan_handle_cfi_check_fail
-#else
-#define cfi_failure_handler __ubsan_handle_cfi_check_fail_abort
-#endif
-
-static inline void handle_cfi_failure(void *ptr)
+enum bug_trap_type report_cfi_failure(struct pt_regs *regs, unsigned long addr,
+ unsigned long *target, u32 type)
{
- if (IS_ENABLED(CONFIG_CFI_PERMISSIVE))
- WARN_RATELIMIT(1, "CFI failure (target: %pS):\n", ptr);
+ if (target)
+ pr_err("CFI failure at %pS (target: %pS; expected type: 0x%08x)\n",
+ (void *)addr, (void *)*target, type);
else
- panic("CFI failure (target: %pS)\n", ptr);
-}
-
-#ifdef CONFIG_MODULES
-#ifdef CONFIG_CFI_CLANG_SHADOW
-/*
- * Index type. A 16-bit index can address at most (2^16)-2 pages (taking
- * into account SHADOW_INVALID), i.e. ~256M with 4k pages.
- */
-typedef u16 shadow_t;
-#define SHADOW_INVALID ((shadow_t)~0UL)
-
-struct cfi_shadow {
- /* Page index for the beginning of the shadow */
- unsigned long base;
- /* An array of __cfi_check locations (as indices to the shadow) */
- shadow_t shadow[1];
-} __packed;
-
-/*
- * The shadow covers ~128M from the beginning of the module region. If
- * the region is larger, we fall back to __module_address for the rest.
- */
-#define __SHADOW_RANGE (_UL(SZ_128M) >> PAGE_SHIFT)
-
-/* The in-memory size of struct cfi_shadow, always at least one page */
-#define __SHADOW_PAGES ((__SHADOW_RANGE * sizeof(shadow_t)) >> PAGE_SHIFT)
-#define SHADOW_PAGES max(1UL, __SHADOW_PAGES)
-#define SHADOW_SIZE (SHADOW_PAGES << PAGE_SHIFT)
-
-/* The actual size of the shadow array, minus metadata */
-#define SHADOW_ARR_SIZE (SHADOW_SIZE - offsetof(struct cfi_shadow, shadow))
-#define SHADOW_ARR_SLOTS (SHADOW_ARR_SIZE / sizeof(shadow_t))
-
-static DEFINE_MUTEX(shadow_update_lock);
-static struct cfi_shadow __rcu *cfi_shadow __read_mostly;
+ pr_err("CFI failure at %pS (no target information)\n",
+ (void *)addr);
-/* Returns the index in the shadow for the given address */
-static inline int ptr_to_shadow(const struct cfi_shadow *s, unsigned long ptr)
-{
- unsigned long index;
- unsigned long page = ptr >> PAGE_SHIFT;
-
- if (unlikely(page < s->base))
- return -1; /* Outside of module area */
-
- index = page - s->base;
-
- if (index >= SHADOW_ARR_SLOTS)
- return -1; /* Cannot be addressed with shadow */
-
- return (int)index;
-}
-
-/* Returns the page address for an index in the shadow */
-static inline unsigned long shadow_to_ptr(const struct cfi_shadow *s,
- int index)
-{
- if (unlikely(index < 0 || index >= SHADOW_ARR_SLOTS))
- return 0;
-
- return (s->base + index) << PAGE_SHIFT;
-}
-
-/* Returns the __cfi_check function address for the given shadow location */
-static inline unsigned long shadow_to_check_fn(const struct cfi_shadow *s,
- int index)
-{
- if (unlikely(index < 0 || index >= SHADOW_ARR_SLOTS))
- return 0;
-
- if (unlikely(s->shadow[index] == SHADOW_INVALID))
- return 0;
-
- /* __cfi_check is always page aligned */
- return (s->base + s->shadow[index]) << PAGE_SHIFT;
-}
-
-static void prepare_next_shadow(const struct cfi_shadow __rcu *prev,
- struct cfi_shadow *next)
-{
- int i, index, check;
-
- /* Mark everything invalid */
- memset(next->shadow, 0xFF, SHADOW_ARR_SIZE);
-
- if (!prev)
- return; /* No previous shadow */
-
- /* If the base address didn't change, an update is not needed */
- if (prev->base == next->base) {
- memcpy(next->shadow, prev->shadow, SHADOW_ARR_SIZE);
- return;
- }
-
- /* Convert the previous shadow to the new address range */
- for (i = 0; i < SHADOW_ARR_SLOTS; ++i) {
- if (prev->shadow[i] == SHADOW_INVALID)
- continue;
-
- index = ptr_to_shadow(next, shadow_to_ptr(prev, i));
- if (index < 0)
- continue;
-
- check = ptr_to_shadow(next,
- shadow_to_check_fn(prev, prev->shadow[i]));
- if (check < 0)
- continue;
-
- next->shadow[index] = (shadow_t)check;
- }
-}
-
-static void add_module_to_shadow(struct cfi_shadow *s, struct module *mod,
- unsigned long min_addr, unsigned long max_addr)
-{
- int check_index;
- unsigned long check = (unsigned long)mod->cfi_check;
- unsigned long ptr;
-
- if (unlikely(!PAGE_ALIGNED(check))) {
- pr_warn("cfi: not using shadow for module %s\n", mod->name);
- return;
+ if (IS_ENABLED(CONFIG_CFI_PERMISSIVE)) {
+ __warn(NULL, 0, (void *)addr, 0, regs, NULL);
+ return BUG_TRAP_TYPE_WARN;
}
- check_index = ptr_to_shadow(s, check);
- if (check_index < 0)
- return; /* Module not addressable with shadow */
-
- /* For each page, store the check function index in the shadow */
- for (ptr = min_addr; ptr <= max_addr; ptr += PAGE_SIZE) {
- int index = ptr_to_shadow(s, ptr);
-
- if (index >= 0) {
- /* Each page must only contain one module */
- WARN_ON_ONCE(s->shadow[index] != SHADOW_INVALID);
- s->shadow[index] = (shadow_t)check_index;
- }
- }
+ return BUG_TRAP_TYPE_BUG;
}
-static void remove_module_from_shadow(struct cfi_shadow *s, struct module *mod,
- unsigned long min_addr, unsigned long max_addr)
+#ifdef CONFIG_ARCH_USES_CFI_TRAPS
+static inline unsigned long trap_address(s32 *p)
{
- unsigned long ptr;
-
- for (ptr = min_addr; ptr <= max_addr; ptr += PAGE_SIZE) {
- int index = ptr_to_shadow(s, ptr);
-
- if (index >= 0)
- s->shadow[index] = SHADOW_INVALID;
- }
+ return (unsigned long)((long)p + (long)*p);
}
-typedef void (*update_shadow_fn)(struct cfi_shadow *, struct module *,
- unsigned long min_addr, unsigned long max_addr);
-
-static void update_shadow(struct module *mod, unsigned long base_addr,
- update_shadow_fn fn)
+static bool is_trap(unsigned long addr, s32 *start, s32 *end)
{
- struct cfi_shadow *prev;
- struct cfi_shadow *next;
- unsigned long min_addr, max_addr;
-
- next = vmalloc(SHADOW_SIZE);
-
- mutex_lock(&shadow_update_lock);
- prev = rcu_dereference_protected(cfi_shadow,
- mutex_is_locked(&shadow_update_lock));
-
- if (next) {
- next->base = base_addr >> PAGE_SHIFT;
- prepare_next_shadow(prev, next);
+ s32 *p;
- min_addr = (unsigned long)mod->core_layout.base;
- max_addr = min_addr + mod->core_layout.text_size;
- fn(next, mod, min_addr & PAGE_MASK, max_addr & PAGE_MASK);
-
- set_memory_ro((unsigned long)next, SHADOW_PAGES);
- }
-
- rcu_assign_pointer(cfi_shadow, next);
- mutex_unlock(&shadow_update_lock);
- synchronize_rcu();
-
- if (prev) {
- set_memory_rw((unsigned long)prev, SHADOW_PAGES);
- vfree(prev);
+ for (p = start; p < end; ++p) {
+ if (trap_address(p) == addr)
+ return true;
}
-}
-void cfi_module_add(struct module *mod, unsigned long base_addr)
-{
- update_shadow(mod, base_addr, add_module_to_shadow);
+ return false;
}
-void cfi_module_remove(struct module *mod, unsigned long base_addr)
-{
- update_shadow(mod, base_addr, remove_module_from_shadow);
-}
-
-static inline cfi_check_fn ptr_to_check_fn(const struct cfi_shadow __rcu *s,
- unsigned long ptr)
+#ifdef CONFIG_MODULES
+/* Populates `kcfi_trap(_end)?` fields in `struct module`. */
+void module_cfi_finalize(const Elf_Ehdr *hdr, const Elf_Shdr *sechdrs,
+ struct module *mod)
{
- int index;
-
- if (unlikely(!s))
- return NULL; /* No shadow available */
-
- index = ptr_to_shadow(s, ptr);
- if (index < 0)
- return NULL; /* Cannot be addressed with shadow */
+ char *secstrings;
+ unsigned int i;
- return (cfi_check_fn)shadow_to_check_fn(s, index);
-}
-
-static inline cfi_check_fn find_shadow_check_fn(unsigned long ptr)
-{
- cfi_check_fn fn;
+ mod->kcfi_traps = NULL;
+ mod->kcfi_traps_end = NULL;
- rcu_read_lock_sched_notrace();
- fn = ptr_to_check_fn(rcu_dereference_sched(cfi_shadow), ptr);
- rcu_read_unlock_sched_notrace();
+ secstrings = (char *)hdr + sechdrs[hdr->e_shstrndx].sh_offset;
- return fn;
-}
-
-#else /* !CONFIG_CFI_CLANG_SHADOW */
+ for (i = 1; i < hdr->e_shnum; i++) {
+ if (strcmp(secstrings + sechdrs[i].sh_name, "__kcfi_traps"))
+ continue;
-static inline cfi_check_fn find_shadow_check_fn(unsigned long ptr)
-{
- return NULL;
+ mod->kcfi_traps = (s32 *)sechdrs[i].sh_addr;
+ mod->kcfi_traps_end = (s32 *)(sechdrs[i].sh_addr + sechdrs[i].sh_size);
+ break;
+ }
}
-#endif /* CONFIG_CFI_CLANG_SHADOW */
-
-static inline cfi_check_fn find_module_check_fn(unsigned long ptr)
+static bool is_module_cfi_trap(unsigned long addr)
{
- cfi_check_fn fn = NULL;
struct module *mod;
+ bool found = false;
rcu_read_lock_sched_notrace();
- mod = __module_address(ptr);
- if (mod)
- fn = mod->cfi_check;
- rcu_read_unlock_sched_notrace();
-
- return fn;
-}
-
-static inline cfi_check_fn find_check_fn(unsigned long ptr)
-{
- cfi_check_fn fn = NULL;
- unsigned long flags;
- bool rcu_idle;
-
- if (is_kernel_text(ptr))
- return __cfi_check;
- /*
- * Indirect call checks can happen when RCU is not watching. Both
- * the shadow and __module_address use RCU, so we need to wake it
- * up if necessary.
- */
- rcu_idle = !rcu_is_watching();
- if (rcu_idle) {
- local_irq_save(flags);
- ct_irq_enter();
- }
-
- if (IS_ENABLED(CONFIG_CFI_CLANG_SHADOW))
- fn = find_shadow_check_fn(ptr);
- if (!fn)
- fn = find_module_check_fn(ptr);
+ mod = __module_address(addr);
+ if (mod)
+ found = is_trap(addr, mod->kcfi_traps, mod->kcfi_traps_end);
- if (rcu_idle) {
- ct_irq_exit();
- local_irq_restore(flags);
- }
+ rcu_read_unlock_sched_notrace();
- return fn;
+ return found;
}
-
-void __cfi_slowpath_diag(uint64_t id, void *ptr, void *diag)
+#else /* CONFIG_MODULES */
+static inline bool is_module_cfi_trap(unsigned long addr)
{
- cfi_check_fn fn = find_check_fn((unsigned long)ptr);
-
- if (likely(fn))
- fn(id, ptr, diag);
- else /* Don't allow unchecked modules */
- handle_cfi_failure(ptr);
+ return false;
}
-EXPORT_SYMBOL(__cfi_slowpath_diag);
+#endif /* CONFIG_MODULES */
-#else /* !CONFIG_MODULES */
+extern s32 __start___kcfi_traps[];
+extern s32 __stop___kcfi_traps[];
-void __cfi_slowpath_diag(uint64_t id, void *ptr, void *diag)
+bool is_cfi_trap(unsigned long addr)
{
- handle_cfi_failure(ptr); /* No modules */
-}
-EXPORT_SYMBOL(__cfi_slowpath_diag);
+ if (is_trap(addr, __start___kcfi_traps, __stop___kcfi_traps))
+ return true;
-#endif /* CONFIG_MODULES */
-
-void cfi_failure_handler(void *data, void *ptr, void *vtable)
-{
- handle_cfi_failure(ptr);
+ return is_module_cfi_trap(addr);
}
-EXPORT_SYMBOL(cfi_failure_handler);
+#endif /* CONFIG_ARCH_USES_CFI_TRAPS */
diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
index 2ade21b54dc4..ff6a8099eb2a 100644
--- a/kernel/cgroup/cgroup-v1.c
+++ b/kernel/cgroup/cgroup-v1.c
@@ -59,6 +59,7 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
int retval = 0;
mutex_lock(&cgroup_mutex);
+ cpus_read_lock();
percpu_down_write(&cgroup_threadgroup_rwsem);
for_each_root(root) {
struct cgroup *from_cgrp;
@@ -72,6 +73,7 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
break;
}
percpu_up_write(&cgroup_threadgroup_rwsem);
+ cpus_read_unlock();
mutex_unlock(&cgroup_mutex);
return retval;
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index ffaccd6373f1..5f2090d051ac 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -1820,6 +1820,7 @@ int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask)
if (ss->css_rstat_flush) {
list_del_rcu(&css->rstat_css_node);
+ synchronize_rcu();
list_add_rcu(&css->rstat_css_node,
&dcgrp->rstat_css_list);
}
@@ -2370,6 +2371,47 @@ int task_cgroup_path(struct task_struct *task, char *buf, size_t buflen)
EXPORT_SYMBOL_GPL(task_cgroup_path);
/**
+ * cgroup_attach_lock - Lock for ->attach()
+ * @lock_threadgroup: whether to down_write cgroup_threadgroup_rwsem
+ *
+ * cgroup migration sometimes needs to stabilize threadgroups against forks and
+ * exits by write-locking cgroup_threadgroup_rwsem. However, some ->attach()
+ * implementations (e.g. cpuset), also need to disable CPU hotplug.
+ * Unfortunately, letting ->attach() operations acquire cpus_read_lock() can
+ * lead to deadlocks.
+ *
+ * Bringing up a CPU may involve creating and destroying tasks which requires
+ * read-locking threadgroup_rwsem, so threadgroup_rwsem nests inside
+ * cpus_read_lock(). If we call an ->attach() which acquires the cpus lock while
+ * write-locking threadgroup_rwsem, the locking order is reversed and we end up
+ * waiting for an on-going CPU hotplug operation which in turn is waiting for
+ * the threadgroup_rwsem to be released to create new tasks. For more details:
+ *
+ * http://lkml.kernel.org/r/20220711174629.uehfmqegcwn2lqzu@wubuntu
+ *
+ * Resolve the situation by always acquiring cpus_read_lock() before optionally
+ * write-locking cgroup_threadgroup_rwsem. This allows ->attach() to assume that
+ * CPU hotplug is disabled on entry.
+ */
+static void cgroup_attach_lock(bool lock_threadgroup)
+{
+ cpus_read_lock();
+ if (lock_threadgroup)
+ percpu_down_write(&cgroup_threadgroup_rwsem);
+}
+
+/**
+ * cgroup_attach_unlock - Undo cgroup_attach_lock()
+ * @lock_threadgroup: whether to up_write cgroup_threadgroup_rwsem
+ */
+static void cgroup_attach_unlock(bool lock_threadgroup)
+{
+ if (lock_threadgroup)
+ percpu_up_write(&cgroup_threadgroup_rwsem);
+ cpus_read_unlock();
+}
+
+/**
* cgroup_migrate_add_task - add a migration target task to a migration context
* @task: target task
* @mgctx: target migration context
@@ -2841,8 +2883,7 @@ int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader,
}
struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup,
- bool *locked)
- __acquires(&cgroup_threadgroup_rwsem)
+ bool *threadgroup_locked)
{
struct task_struct *tsk;
pid_t pid;
@@ -2859,12 +2900,8 @@ struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup,
* Therefore, we can skip the global lock.
*/
lockdep_assert_held(&cgroup_mutex);
- if (pid || threadgroup) {
- percpu_down_write(&cgroup_threadgroup_rwsem);
- *locked = true;
- } else {
- *locked = false;
- }
+ *threadgroup_locked = pid || threadgroup;
+ cgroup_attach_lock(*threadgroup_locked);
rcu_read_lock();
if (pid) {
@@ -2895,17 +2932,14 @@ struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup,
goto out_unlock_rcu;
out_unlock_threadgroup:
- if (*locked) {
- percpu_up_write(&cgroup_threadgroup_rwsem);
- *locked = false;
- }
+ cgroup_attach_unlock(*threadgroup_locked);
+ *threadgroup_locked = false;
out_unlock_rcu:
rcu_read_unlock();
return tsk;
}
-void cgroup_procs_write_finish(struct task_struct *task, bool locked)
- __releases(&cgroup_threadgroup_rwsem)
+void cgroup_procs_write_finish(struct task_struct *task, bool threadgroup_locked)
{
struct cgroup_subsys *ss;
int ssid;
@@ -2913,8 +2947,8 @@ void cgroup_procs_write_finish(struct task_struct *task, bool locked)
/* release reference from cgroup_procs_write_start() */
put_task_struct(task);
- if (locked)
- percpu_up_write(&cgroup_threadgroup_rwsem);
+ cgroup_attach_unlock(threadgroup_locked);
+
for_each_subsys(ss, ssid)
if (ss->post_attach)
ss->post_attach();
@@ -3000,8 +3034,7 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)
* write-locking can be skipped safely.
*/
has_tasks = !list_empty(&mgctx.preloaded_src_csets);
- if (has_tasks)
- percpu_down_write(&cgroup_threadgroup_rwsem);
+ cgroup_attach_lock(has_tasks);
/* NULL dst indicates self on default hierarchy */
ret = cgroup_migrate_prepare_dst(&mgctx);
@@ -3022,8 +3055,7 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)
ret = cgroup_migrate_execute(&mgctx);
out_finish:
cgroup_migrate_finish(&mgctx);
- if (has_tasks)
- percpu_up_write(&cgroup_threadgroup_rwsem);
+ cgroup_attach_unlock(has_tasks);
return ret;
}
@@ -3698,7 +3730,7 @@ static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf,
}
psi = cgroup_ino(cgrp) == 1 ? &psi_system : cgrp->psi;
- new = psi_trigger_create(psi, buf, nbytes, res);
+ new = psi_trigger_create(psi, buf, res);
if (IS_ERR(new)) {
cgroup_put(cgrp);
return PTR_ERR(new);
@@ -4971,13 +5003,13 @@ static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
struct task_struct *task;
const struct cred *saved_cred;
ssize_t ret;
- bool locked;
+ bool threadgroup_locked;
dst_cgrp = cgroup_kn_lock_live(of->kn, false);
if (!dst_cgrp)
return -ENODEV;
- task = cgroup_procs_write_start(buf, threadgroup, &locked);
+ task = cgroup_procs_write_start(buf, threadgroup, &threadgroup_locked);
ret = PTR_ERR_OR_ZERO(task);
if (ret)
goto out_unlock;
@@ -5003,7 +5035,7 @@ static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
ret = cgroup_attach_task(dst_cgrp, task, threadgroup);
out_finish:
- cgroup_procs_write_finish(task, locked);
+ cgroup_procs_write_finish(task, threadgroup_locked);
out_unlock:
cgroup_kn_unlock(of->kn);
@@ -6017,6 +6049,9 @@ struct cgroup *cgroup_get_from_id(u64 id)
if (!kn)
goto out;
+ if (kernfs_type(kn) != KERNFS_DIR)
+ goto put;
+
rcu_read_lock();
cgrp = rcu_dereference(*(void __rcu __force **)&kn->priv);
@@ -6024,7 +6059,7 @@ struct cgroup *cgroup_get_from_id(u64 id)
cgrp = NULL;
rcu_read_unlock();
-
+put:
kernfs_put(kn);
out:
return cgrp;
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 58aadfda9b8b..1f3a55297f39 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -2289,7 +2289,7 @@ static void cpuset_attach(struct cgroup_taskset *tset)
cgroup_taskset_first(tset, &css);
cs = css_cs(css);
- cpus_read_lock();
+ lockdep_assert_cpus_held(); /* see cgroup_attach_lock() */
percpu_down_write(&cpuset_rwsem);
guarantee_online_mems(cs, &cpuset_attach_nodemask_to);
@@ -2343,7 +2343,6 @@ static void cpuset_attach(struct cgroup_taskset *tset)
wake_up(&cpuset_attach_wq);
percpu_up_write(&cpuset_rwsem);
- cpus_read_unlock();
}
/* The various types of files and directories in a cpuset file system */
diff --git a/kernel/configs/rust.config b/kernel/configs/rust.config
new file mode 100644
index 000000000000..38a7c5362c9c
--- /dev/null
+++ b/kernel/configs/rust.config
@@ -0,0 +1 @@
+CONFIG_RUST=y
diff --git a/kernel/crash_core.c b/kernel/crash_core.c
index 07b26df453a9..a0eb4d5cf557 100644
--- a/kernel/crash_core.c
+++ b/kernel/crash_core.c
@@ -494,6 +494,7 @@ static int __init crash_save_vmcoreinfo_init(void)
#ifdef CONFIG_KALLSYMS
VMCOREINFO_SYMBOL(kallsyms_names);
+ VMCOREINFO_SYMBOL(kallsyms_num_syms);
VMCOREINFO_SYMBOL(kallsyms_token_table);
VMCOREINFO_SYMBOL(kallsyms_token_index);
#ifdef CONFIG_KALLSYMS_BASE_RELATIVE
diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c
index 2caafd13f8aa..18c93c2276ca 100644
--- a/kernel/dma/debug.c
+++ b/kernel/dma/debug.c
@@ -350,11 +350,10 @@ static struct dma_debug_entry *bucket_find_contain(struct hash_bucket **bucket,
unsigned long *flags)
{
- unsigned int max_range = dma_get_max_seg_size(ref->dev);
struct dma_debug_entry *entry, index = *ref;
- unsigned int range = 0;
+ int limit = min(HASH_SIZE, (index.dev_addr >> HASH_FN_SHIFT) + 1);
- while (range <= max_range) {
+ for (int i = 0; i < limit; i++) {
entry = __hash_bucket_find(*bucket, ref, containing_match);
if (entry)
@@ -364,7 +363,6 @@ static struct dma_debug_entry *bucket_find_contain(struct hash_bucket **bucket,
* Nothing found, go back a hash bucket
*/
put_hash_bucket(*bucket, *flags);
- range += (1 << HASH_FN_SHIFT);
index.dev_addr -= (1 << HASH_FN_SHIFT);
*bucket = get_hash_bucket(&index, flags);
}
diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c
index 49cbf3e33de7..27f272381cf2 100644
--- a/kernel/dma/mapping.c
+++ b/kernel/dma/mapping.c
@@ -707,7 +707,7 @@ int dma_mmap_noncontiguous(struct device *dev, struct vm_area_struct *vma,
}
EXPORT_SYMBOL_GPL(dma_mmap_noncontiguous);
-int dma_supported(struct device *dev, u64 mask)
+static int dma_supported(struct device *dev, u64 mask)
{
const struct dma_map_ops *ops = get_dma_ops(dev);
@@ -721,7 +721,6 @@ int dma_supported(struct device *dev, u64 mask)
return 1;
return ops->dma_supported(dev, mask);
}
-EXPORT_SYMBOL(dma_supported);
bool dma_pci_p2pdma_supported(struct device *dev)
{
diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index c5a9190b218f..0ef6b12f961d 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -326,9 +326,6 @@ void __init swiotlb_init_remap(bool addressing_limit, unsigned int flags,
swiotlb_adjust_nareas(num_possible_cpus());
nslabs = default_nslabs;
- if (nslabs < IO_TLB_MIN_SLABS)
- panic("%s: nslabs = %lu too small\n", __func__, nslabs);
-
/*
* By default allocate the bounce buffer memory from low memory, but
* allow to pick a location everywhere for hypervisors with guest
@@ -341,8 +338,7 @@ retry:
else
tlb = memblock_alloc_low(bytes, PAGE_SIZE);
if (!tlb) {
- pr_warn("%s: Failed to allocate %zu bytes tlb structure\n",
- __func__, bytes);
+ pr_warn("%s: failed to allocate tlb structure\n", __func__);
return;
}
@@ -579,7 +575,10 @@ static void swiotlb_bounce(struct device *dev, phys_addr_t tlb_addr, size_t size
}
}
-#define slot_addr(start, idx) ((start) + ((idx) << IO_TLB_SHIFT))
+static inline phys_addr_t slot_addr(phys_addr_t start, phys_addr_t idx)
+{
+ return start + (idx << IO_TLB_SHIFT);
+}
/*
* Carefully handle integer overflow which can occur when boundary_mask == ~0UL.
@@ -765,7 +764,7 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr,
/*
* When dir == DMA_FROM_DEVICE we could omit the copy from the orig
* to the tlb buffer, if we knew for sure the device will
- * overwirte the entire current content. But we don't. Thus
+ * overwrite the entire current content. But we don't. Thus
* unconditional bounce may prevent leaking swiotlb content (i.e.
* kernel memory) to user-space.
*/
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 2621fd24ad26..ff4bffc502c6 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -6893,9 +6893,16 @@ static void perf_output_read_group(struct perf_output_handle *handle,
{
struct perf_event *leader = event->group_leader, *sub;
u64 read_format = event->attr.read_format;
+ unsigned long flags;
u64 values[6];
int n = 0;
+ /*
+ * Disabling interrupts avoids all counter scheduling
+ * (context switches, timer based rotation and IPIs).
+ */
+ local_irq_save(flags);
+
values[n++] = 1 + leader->nr_siblings;
if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
@@ -6931,6 +6938,8 @@ static void perf_output_read_group(struct perf_output_handle *handle,
__output_copy(handle, values, n * sizeof(u64));
}
+
+ local_irq_restore(flags);
}
#define PERF_FORMAT_TOTAL_TIMES (PERF_FORMAT_TOTAL_TIME_ENABLED|\
diff --git a/kernel/fork.c b/kernel/fork.c
index 90c85b17bf69..2b6bd511c6ed 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1225,6 +1225,7 @@ void mmput_async(struct mm_struct *mm)
schedule_work(&mm->async_put_work);
}
}
+EXPORT_SYMBOL_GPL(mmput_async);
#endif
/**
@@ -2046,11 +2047,8 @@ static __latent_entropy struct task_struct *copy_process(
/*
* If the new process will be in a different time namespace
* do not allow it to share VM or a thread group with the forking task.
- *
- * On vfork, the child process enters the target time namespace only
- * after exec.
*/
- if ((clone_flags & (CLONE_VM | CLONE_VFORK)) == CLONE_VM) {
+ if (clone_flags & (CLONE_THREAD | CLONE_VM)) {
if (nsp->time_ns != nsp->time_ns_for_children)
return ERR_PTR(-EINVAL);
}
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 3e7e2c2ad2f7..60c20f301a6b 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -50,12 +50,20 @@ static unsigned int kallsyms_expand_symbol(unsigned int off,
data = &kallsyms_names[off];
len = *data;
data++;
+ off++;
+
+ /* If MSB is 1, it is a "big" symbol, so needs an additional byte. */
+ if ((len & 0x80) != 0) {
+ len = (len & 0x7F) | (*data << 7);
+ data++;
+ off++;
+ }
/*
* Update the offset to return the offset for the next symbol on
* the compressed stream.
*/
- off += len + 1;
+ off += len;
/*
* For every byte on the compressed symbol data, copy the table
@@ -108,7 +116,7 @@ static char kallsyms_get_symbol_type(unsigned int off)
static unsigned int get_symbol_offset(unsigned long pos)
{
const u8 *name;
- int i;
+ int i, len;
/*
* Use the closest marker we have. We have markers every 256 positions,
@@ -122,8 +130,18 @@ static unsigned int get_symbol_offset(unsigned long pos)
* so we just need to add the len to the current pointer for every
* symbol we wish to skip.
*/
- for (i = 0; i < (pos & 0xFF); i++)
- name = name + (*name) + 1;
+ for (i = 0; i < (pos & 0xFF); i++) {
+ len = *name;
+
+ /*
+ * If MSB is 1, it is a "big" symbol, so we need to look into
+ * the next byte (and skip it, too).
+ */
+ if ((len & 0x80) != 0)
+ len = ((len & 0x7F) | (name[1] << 7)) + 1;
+
+ name = name + len + 1;
+ }
return name - kallsyms_names;
}
@@ -159,7 +177,6 @@ static bool cleanup_symbol_name(char *s)
* character in an identifier in C. Suffixes observed:
* - foo.llvm.[0-9a-f]+
* - foo.[0-9a-f]+
- * - foo.[0-9a-f]+.cfi_jt
*/
res = strchr(s, '.');
if (res) {
@@ -167,22 +184,6 @@ static bool cleanup_symbol_name(char *s)
return true;
}
- if (!IS_ENABLED(CONFIG_CFI_CLANG) ||
- !IS_ENABLED(CONFIG_LTO_CLANG_THIN) ||
- CONFIG_CLANG_VERSION >= 130000)
- return false;
-
- /*
- * Prior to LLVM 13, the following suffixes were observed when thinLTO
- * and CFI are both enabled:
- * - foo$[0-9]+
- */
- res = strrchr(s, '$');
- if (res) {
- *res = '\0';
- return true;
- }
-
return false;
}
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 80697e5e03e4..ca9d834d0b84 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -1562,6 +1562,7 @@ static int check_kprobe_address_safe(struct kprobe *p,
/* Ensure it is not in reserved area nor out of text */
if (!(core_kernel_text((unsigned long) p->addr) ||
is_module_text_address((unsigned long) p->addr)) ||
+ in_gate_area_no_mm((unsigned long) p->addr) ||
within_kprobe_blacklist((unsigned long) p->addr) ||
jump_label_text_reserved(p->addr, p->addr) ||
static_call_text_reserved(p->addr, p->addr) ||
@@ -1707,11 +1708,12 @@ static struct kprobe *__disable_kprobe(struct kprobe *p)
/* Try to disarm and disable this/parent probe */
if (p == orig_p || aggr_kprobe_disabled(orig_p)) {
/*
- * If 'kprobes_all_disarmed' is set, 'orig_p'
- * should have already been disarmed, so
- * skip unneed disarming process.
+ * Don't be lazy here. Even if 'kprobes_all_disarmed'
+ * is false, 'orig_p' might not have been armed yet.
+ * Note arm_all_kprobes() __tries__ to arm all kprobes
+ * on the best effort basis.
*/
- if (!kprobes_all_disarmed) {
+ if (!kprobes_all_disarmed && !kprobe_disabled(orig_p)) {
ret = disarm_kprobe(orig_p, true);
if (ret) {
p->flags &= ~KPROBE_FLAG_DISABLED;
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 3c677918d8f2..28a6b7ab4a0f 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -1050,8 +1050,7 @@ static void __kthread_queue_delayed_work(struct kthread_worker *worker,
struct timer_list *timer = &dwork->timer;
struct kthread_work *work = &dwork->work;
- WARN_ON_FUNCTION_MISMATCH(timer->function,
- kthread_delayed_work_timer_fn);
+ WARN_ON_ONCE(timer->function != kthread_delayed_work_timer_fn);
/*
* If @delay is 0, queue @dwork->work immediately. This is for
diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c
index bc475e62279d..ec06ce59d728 100644
--- a/kernel/livepatch/core.c
+++ b/kernel/livepatch/core.c
@@ -213,7 +213,7 @@ static int klp_resolve_symbols(Elf_Shdr *sechdrs, const char *strtab,
* we use the smallest/strictest upper bound possible (56, based on
* the current definition of MODULE_NAME_LEN) to prevent overflows.
*/
- BUILD_BUG_ON(MODULE_NAME_LEN < 56 || KSYM_NAME_LEN != 128);
+ BUILD_BUG_ON(MODULE_NAME_LEN < 56 || KSYM_NAME_LEN != 512);
relas = (Elf_Rela *) relasec->sh_addr;
/* For each rela in this klp relocation section */
@@ -227,7 +227,7 @@ static int klp_resolve_symbols(Elf_Shdr *sechdrs, const char *strtab,
/* Format: .klp.sym.sym_objname.sym_name,sympos */
cnt = sscanf(strtab + sym->st_name,
- ".klp.sym.%55[^.].%127[^,],%lu",
+ ".klp.sym.%55[^.].%511[^,],%lu",
sym_objname, sym_name, &sympos);
if (cnt != 3) {
pr_err("symbol %s has an incorrectly formatted name\n",
diff --git a/kernel/module/main.c b/kernel/module/main.c
index 6a477c622544..70c0b2c6fef8 100644
--- a/kernel/module/main.c
+++ b/kernel/module/main.c
@@ -53,6 +53,7 @@
#include <linux/bsearch.h>
#include <linux/dynamic_debug.h>
#include <linux/audit.h>
+#include <linux/cfi.h>
#include <uapi/linux/module.h>
#include "internal.h"
@@ -1144,8 +1145,6 @@ void __weak module_arch_freeing_init(struct module *mod)
{
}
-static void cfi_cleanup(struct module *mod);
-
/* Free a module, remove from lists, etc. */
static void free_module(struct module *mod)
{
@@ -1190,9 +1189,6 @@ static void free_module(struct module *mod)
mod->name);
mutex_unlock(&module_mutex);
- /* Clean up CFI for the module. */
- cfi_cleanup(mod);
-
/* This may be empty, but that's OK */
module_arch_freeing_init(mod);
module_memfree(mod->init_layout.base);
@@ -2099,7 +2095,7 @@ static int find_module_sections(struct module *mod, struct load_info *info)
sizeof(*mod->static_call_sites),
&mod->num_static_call_sites);
#endif
-#ifdef CONFIG_KUNIT
+#if IS_ENABLED(CONFIG_KUNIT)
mod->kunit_suites = section_objs(info, ".kunit_test_suites",
sizeof(*mod->kunit_suites),
&mod->num_kunit_suites);
@@ -2602,8 +2598,9 @@ static int complete_formation(struct module *mod, struct load_info *info)
if (err < 0)
goto out;
- /* This relies on module_mutex for list integrity. */
+ /* These rely on module_mutex for list integrity. */
module_bug_finalize(info->hdr, info->sechdrs, mod);
+ module_cfi_finalize(info->hdr, info->sechdrs, mod);
if (module_check_misalignment(mod))
goto out_misaligned;
@@ -2665,8 +2662,6 @@ static int unknown_module_param_cb(char *param, char *val, const char *modname,
return 0;
}
-static void cfi_init(struct module *mod);
-
/*
* Allocate and load the module: note that size of section 0 is always
* zero, and we rely on this for optional sections.
@@ -2796,9 +2791,6 @@ static int load_module(struct load_info *info, const char __user *uargs,
flush_module_icache(mod);
- /* Setup CFI for the module. */
- cfi_init(mod);
-
/* Now copy in args */
mod->args = strndup_user(uargs, ~0UL >> 1);
if (IS_ERR(mod->args)) {
@@ -2875,7 +2867,6 @@ static int load_module(struct load_info *info, const char __user *uargs,
synchronize_rcu();
kfree(mod->args);
free_arch_cleanup:
- cfi_cleanup(mod);
module_arch_cleanup(mod);
free_modinfo:
free_modinfo(mod);
@@ -2961,41 +2952,6 @@ static inline int within(unsigned long addr, void *start, unsigned long size)
return ((void *)addr >= start && (void *)addr < start + size);
}
-static void cfi_init(struct module *mod)
-{
-#ifdef CONFIG_CFI_CLANG
- initcall_t *init;
-#ifdef CONFIG_MODULE_UNLOAD
- exitcall_t *exit;
-#endif
-
- rcu_read_lock_sched();
- mod->cfi_check = (cfi_check_fn)
- find_kallsyms_symbol_value(mod, "__cfi_check");
- init = (initcall_t *)
- find_kallsyms_symbol_value(mod, "__cfi_jt_init_module");
- /* Fix init/exit functions to point to the CFI jump table */
- if (init)
- mod->init = *init;
-#ifdef CONFIG_MODULE_UNLOAD
- exit = (exitcall_t *)
- find_kallsyms_symbol_value(mod, "__cfi_jt_cleanup_module");
- if (exit)
- mod->exit = *exit;
-#endif
- rcu_read_unlock_sched();
-
- cfi_module_add(mod, mod_tree.addr_min);
-#endif
-}
-
-static void cfi_cleanup(struct module *mod)
-{
-#ifdef CONFIG_CFI_CLANG
- cfi_module_remove(mod, mod_tree.addr_min);
-#endif
-}
-
/* Keep in sync with MODULE_FLAGS_BUF_SIZE !!! */
char *module_flags(struct module *mod, char *buf, bool show_state)
{
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index b4cbb406bc28..eec72ca962e2 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -179,8 +179,7 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk)
if (IS_ERR(new_ns))
return PTR_ERR(new_ns);
- if ((flags & CLONE_VM) == 0)
- timens_on_fork(new_ns, tsk);
+ timens_on_fork(new_ns, tsk);
tsk->nsproxy = new_ns;
return 0;
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index d8e1b270a065..503c2aa845a4 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -84,10 +84,15 @@ torture_param(int, fwd_progress_holdoff, 60, "Time between forward-progress test
torture_param(bool, fwd_progress_need_resched, 1, "Hide cond_resched() behind need_resched()");
torture_param(bool, gp_cond, false, "Use conditional/async GP wait primitives");
torture_param(bool, gp_cond_exp, false, "Use conditional/async expedited GP wait primitives");
+torture_param(bool, gp_cond_full, false, "Use conditional/async full-state GP wait primitives");
+torture_param(bool, gp_cond_exp_full, false,
+ "Use conditional/async full-stateexpedited GP wait primitives");
torture_param(bool, gp_exp, false, "Use expedited GP wait primitives");
torture_param(bool, gp_normal, false, "Use normal (non-expedited) GP wait primitives");
torture_param(bool, gp_poll, false, "Use polling GP wait primitives");
torture_param(bool, gp_poll_exp, false, "Use polling expedited GP wait primitives");
+torture_param(bool, gp_poll_full, false, "Use polling full-state GP wait primitives");
+torture_param(bool, gp_poll_exp_full, false, "Use polling full-state expedited GP wait primitives");
torture_param(bool, gp_sync, false, "Use synchronous GP wait primitives");
torture_param(int, irqreader, 1, "Allow RCU readers from irq handlers");
torture_param(int, leakpointer, 0, "Leak pointer dereferences from readers");
@@ -194,16 +199,24 @@ static int rcu_torture_writer_state;
#define RTWS_DEF_FREE 3
#define RTWS_EXP_SYNC 4
#define RTWS_COND_GET 5
-#define RTWS_COND_GET_EXP 6
-#define RTWS_COND_SYNC 7
-#define RTWS_COND_SYNC_EXP 8
-#define RTWS_POLL_GET 9
-#define RTWS_POLL_GET_EXP 10
-#define RTWS_POLL_WAIT 11
-#define RTWS_POLL_WAIT_EXP 12
-#define RTWS_SYNC 13
-#define RTWS_STUTTER 14
-#define RTWS_STOPPING 15
+#define RTWS_COND_GET_FULL 6
+#define RTWS_COND_GET_EXP 7
+#define RTWS_COND_GET_EXP_FULL 8
+#define RTWS_COND_SYNC 9
+#define RTWS_COND_SYNC_FULL 10
+#define RTWS_COND_SYNC_EXP 11
+#define RTWS_COND_SYNC_EXP_FULL 12
+#define RTWS_POLL_GET 13
+#define RTWS_POLL_GET_FULL 14
+#define RTWS_POLL_GET_EXP 15
+#define RTWS_POLL_GET_EXP_FULL 16
+#define RTWS_POLL_WAIT 17
+#define RTWS_POLL_WAIT_FULL 18
+#define RTWS_POLL_WAIT_EXP 19
+#define RTWS_POLL_WAIT_EXP_FULL 20
+#define RTWS_SYNC 21
+#define RTWS_STUTTER 22
+#define RTWS_STOPPING 23
static const char * const rcu_torture_writer_state_names[] = {
"RTWS_FIXED_DELAY",
"RTWS_DELAY",
@@ -211,13 +224,21 @@ static const char * const rcu_torture_writer_state_names[] = {
"RTWS_DEF_FREE",
"RTWS_EXP_SYNC",
"RTWS_COND_GET",
+ "RTWS_COND_GET_FULL",
"RTWS_COND_GET_EXP",
+ "RTWS_COND_GET_EXP_FULL",
"RTWS_COND_SYNC",
+ "RTWS_COND_SYNC_FULL",
"RTWS_COND_SYNC_EXP",
+ "RTWS_COND_SYNC_EXP_FULL",
"RTWS_POLL_GET",
+ "RTWS_POLL_GET_FULL",
"RTWS_POLL_GET_EXP",
+ "RTWS_POLL_GET_EXP_FULL",
"RTWS_POLL_WAIT",
+ "RTWS_POLL_WAIT_FULL",
"RTWS_POLL_WAIT_EXP",
+ "RTWS_POLL_WAIT_EXP_FULL",
"RTWS_SYNC",
"RTWS_STUTTER",
"RTWS_STOPPING",
@@ -332,13 +353,21 @@ struct rcu_torture_ops {
void (*exp_sync)(void);
unsigned long (*get_gp_state_exp)(void);
unsigned long (*start_gp_poll_exp)(void);
+ void (*start_gp_poll_exp_full)(struct rcu_gp_oldstate *rgosp);
bool (*poll_gp_state_exp)(unsigned long oldstate);
void (*cond_sync_exp)(unsigned long oldstate);
+ void (*cond_sync_exp_full)(struct rcu_gp_oldstate *rgosp);
unsigned long (*get_gp_state)(void);
+ void (*get_gp_state_full)(struct rcu_gp_oldstate *rgosp);
unsigned long (*get_gp_completed)(void);
+ void (*get_gp_completed_full)(struct rcu_gp_oldstate *rgosp);
unsigned long (*start_gp_poll)(void);
+ void (*start_gp_poll_full)(struct rcu_gp_oldstate *rgosp);
bool (*poll_gp_state)(unsigned long oldstate);
+ bool (*poll_gp_state_full)(struct rcu_gp_oldstate *rgosp);
+ bool (*poll_need_2gp)(bool poll, bool poll_full);
void (*cond_sync)(unsigned long oldstate);
+ void (*cond_sync_full)(struct rcu_gp_oldstate *rgosp);
call_rcu_func_t call;
void (*cb_barrier)(void);
void (*fqs)(void);
@@ -489,6 +518,11 @@ static void rcu_sync_torture_init(void)
INIT_LIST_HEAD(&rcu_torture_removed);
}
+static bool rcu_poll_need_2gp(bool poll, bool poll_full)
+{
+ return poll;
+}
+
static struct rcu_torture_ops rcu_ops = {
.ttype = RCU_FLAVOR,
.init = rcu_sync_torture_init,
@@ -502,12 +536,19 @@ static struct rcu_torture_ops rcu_ops = {
.sync = synchronize_rcu,
.exp_sync = synchronize_rcu_expedited,
.get_gp_state = get_state_synchronize_rcu,
+ .get_gp_state_full = get_state_synchronize_rcu_full,
.get_gp_completed = get_completed_synchronize_rcu,
+ .get_gp_completed_full = get_completed_synchronize_rcu_full,
.start_gp_poll = start_poll_synchronize_rcu,
+ .start_gp_poll_full = start_poll_synchronize_rcu_full,
.poll_gp_state = poll_state_synchronize_rcu,
+ .poll_gp_state_full = poll_state_synchronize_rcu_full,
+ .poll_need_2gp = rcu_poll_need_2gp,
.cond_sync = cond_synchronize_rcu,
+ .cond_sync_full = cond_synchronize_rcu_full,
.get_gp_state_exp = get_state_synchronize_rcu,
.start_gp_poll_exp = start_poll_synchronize_rcu_expedited,
+ .start_gp_poll_exp_full = start_poll_synchronize_rcu_expedited_full,
.poll_gp_state_exp = poll_state_synchronize_rcu,
.cond_sync_exp = cond_synchronize_rcu_expedited,
.call = call_rcu,
@@ -709,6 +750,9 @@ static struct rcu_torture_ops srcud_ops = {
.deferred_free = srcu_torture_deferred_free,
.sync = srcu_torture_synchronize,
.exp_sync = srcu_torture_synchronize_expedited,
+ .get_gp_state = srcu_torture_get_gp_state,
+ .start_gp_poll = srcu_torture_start_gp_poll,
+ .poll_gp_state = srcu_torture_poll_gp_state,
.call = srcu_torture_call,
.cb_barrier = srcu_torture_barrier,
.stats = srcu_torture_stats,
@@ -1148,15 +1192,35 @@ static int nsynctypes;
*/
static void rcu_torture_write_types(void)
{
- bool gp_cond1 = gp_cond, gp_cond_exp1 = gp_cond_exp, gp_exp1 = gp_exp;
- bool gp_poll_exp1 = gp_poll_exp, gp_normal1 = gp_normal, gp_poll1 = gp_poll;
- bool gp_sync1 = gp_sync;
+ bool gp_cond1 = gp_cond, gp_cond_exp1 = gp_cond_exp, gp_cond_full1 = gp_cond_full;
+ bool gp_cond_exp_full1 = gp_cond_exp_full, gp_exp1 = gp_exp, gp_poll_exp1 = gp_poll_exp;
+ bool gp_poll_exp_full1 = gp_poll_exp_full, gp_normal1 = gp_normal, gp_poll1 = gp_poll;
+ bool gp_poll_full1 = gp_poll_full, gp_sync1 = gp_sync;
/* Initialize synctype[] array. If none set, take default. */
- if (!gp_cond1 && !gp_cond_exp1 && !gp_exp1 && !gp_poll_exp &&
- !gp_normal1 && !gp_poll1 && !gp_sync1)
- gp_cond1 = gp_cond_exp1 = gp_exp1 = gp_poll_exp1 =
- gp_normal1 = gp_poll1 = gp_sync1 = true;
+ if (!gp_cond1 &&
+ !gp_cond_exp1 &&
+ !gp_cond_full1 &&
+ !gp_cond_exp_full1 &&
+ !gp_exp1 &&
+ !gp_poll_exp1 &&
+ !gp_poll_exp_full1 &&
+ !gp_normal1 &&
+ !gp_poll1 &&
+ !gp_poll_full1 &&
+ !gp_sync1) {
+ gp_cond1 = true;
+ gp_cond_exp1 = true;
+ gp_cond_full1 = true;
+ gp_cond_exp_full1 = true;
+ gp_exp1 = true;
+ gp_poll_exp1 = true;
+ gp_poll_exp_full1 = true;
+ gp_normal1 = true;
+ gp_poll1 = true;
+ gp_poll_full1 = true;
+ gp_sync1 = true;
+ }
if (gp_cond1 && cur_ops->get_gp_state && cur_ops->cond_sync) {
synctype[nsynctypes++] = RTWS_COND_GET;
pr_info("%s: Testing conditional GPs.\n", __func__);
@@ -1169,6 +1233,19 @@ static void rcu_torture_write_types(void)
} else if (gp_cond_exp && (!cur_ops->get_gp_state_exp || !cur_ops->cond_sync_exp)) {
pr_alert("%s: gp_cond_exp without primitives.\n", __func__);
}
+ if (gp_cond_full1 && cur_ops->get_gp_state && cur_ops->cond_sync_full) {
+ synctype[nsynctypes++] = RTWS_COND_GET_FULL;
+ pr_info("%s: Testing conditional full-state GPs.\n", __func__);
+ } else if (gp_cond_full && (!cur_ops->get_gp_state || !cur_ops->cond_sync_full)) {
+ pr_alert("%s: gp_cond_full without primitives.\n", __func__);
+ }
+ if (gp_cond_exp_full1 && cur_ops->get_gp_state_exp && cur_ops->cond_sync_exp_full) {
+ synctype[nsynctypes++] = RTWS_COND_GET_EXP_FULL;
+ pr_info("%s: Testing conditional full-state expedited GPs.\n", __func__);
+ } else if (gp_cond_exp_full &&
+ (!cur_ops->get_gp_state_exp || !cur_ops->cond_sync_exp_full)) {
+ pr_alert("%s: gp_cond_exp_full without primitives.\n", __func__);
+ }
if (gp_exp1 && cur_ops->exp_sync) {
synctype[nsynctypes++] = RTWS_EXP_SYNC;
pr_info("%s: Testing expedited GPs.\n", __func__);
@@ -1187,12 +1264,25 @@ static void rcu_torture_write_types(void)
} else if (gp_poll && (!cur_ops->start_gp_poll || !cur_ops->poll_gp_state)) {
pr_alert("%s: gp_poll without primitives.\n", __func__);
}
+ if (gp_poll_full1 && cur_ops->start_gp_poll_full && cur_ops->poll_gp_state_full) {
+ synctype[nsynctypes++] = RTWS_POLL_GET_FULL;
+ pr_info("%s: Testing polling full-state GPs.\n", __func__);
+ } else if (gp_poll_full && (!cur_ops->start_gp_poll_full || !cur_ops->poll_gp_state_full)) {
+ pr_alert("%s: gp_poll_full without primitives.\n", __func__);
+ }
if (gp_poll_exp1 && cur_ops->start_gp_poll_exp && cur_ops->poll_gp_state_exp) {
synctype[nsynctypes++] = RTWS_POLL_GET_EXP;
pr_info("%s: Testing polling expedited GPs.\n", __func__);
} else if (gp_poll_exp && (!cur_ops->start_gp_poll_exp || !cur_ops->poll_gp_state_exp)) {
pr_alert("%s: gp_poll_exp without primitives.\n", __func__);
}
+ if (gp_poll_exp_full1 && cur_ops->start_gp_poll_exp_full && cur_ops->poll_gp_state_full) {
+ synctype[nsynctypes++] = RTWS_POLL_GET_EXP_FULL;
+ pr_info("%s: Testing polling full-state expedited GPs.\n", __func__);
+ } else if (gp_poll_exp_full &&
+ (!cur_ops->start_gp_poll_exp_full || !cur_ops->poll_gp_state_full)) {
+ pr_alert("%s: gp_poll_exp_full without primitives.\n", __func__);
+ }
if (gp_sync1 && cur_ops->sync) {
synctype[nsynctypes++] = RTWS_SYNC;
pr_info("%s: Testing normal GPs.\n", __func__);
@@ -1202,6 +1292,40 @@ static void rcu_torture_write_types(void)
}
/*
+ * Do the specified rcu_torture_writer() synchronous grace period,
+ * while also testing out the polled APIs. Note well that the single-CPU
+ * grace-period optimizations must be accounted for.
+ */
+static void do_rtws_sync(struct torture_random_state *trsp, void (*sync)(void))
+{
+ unsigned long cookie;
+ struct rcu_gp_oldstate cookie_full;
+ bool dopoll;
+ bool dopoll_full;
+ unsigned long r = torture_random(trsp);
+
+ dopoll = cur_ops->get_gp_state && cur_ops->poll_gp_state && !(r & 0x300);
+ dopoll_full = cur_ops->get_gp_state_full && cur_ops->poll_gp_state_full && !(r & 0xc00);
+ if (dopoll || dopoll_full)
+ cpus_read_lock();
+ if (dopoll)
+ cookie = cur_ops->get_gp_state();
+ if (dopoll_full)
+ cur_ops->get_gp_state_full(&cookie_full);
+ if (cur_ops->poll_need_2gp && cur_ops->poll_need_2gp(dopoll, dopoll_full))
+ sync();
+ sync();
+ WARN_ONCE(dopoll && !cur_ops->poll_gp_state(cookie),
+ "%s: Cookie check 3 failed %pS() online %*pbl.",
+ __func__, sync, cpumask_pr_args(cpu_online_mask));
+ WARN_ONCE(dopoll_full && !cur_ops->poll_gp_state_full(&cookie_full),
+ "%s: Cookie check 4 failed %pS() online %*pbl",
+ __func__, sync, cpumask_pr_args(cpu_online_mask));
+ if (dopoll || dopoll_full)
+ cpus_read_unlock();
+}
+
+/*
* RCU torture writer kthread. Repeatedly substitutes a new structure
* for that pointed to by rcu_torture_current, freeing the old structure
* after a series of grace periods (the "pipeline").
@@ -1212,8 +1336,10 @@ rcu_torture_writer(void *arg)
bool boot_ended;
bool can_expedite = !rcu_gp_is_expedited() && !rcu_gp_is_normal();
unsigned long cookie;
+ struct rcu_gp_oldstate cookie_full;
int expediting = 0;
unsigned long gp_snap;
+ struct rcu_gp_oldstate gp_snap_full;
int i;
int idx;
int oldnice = task_nice(current);
@@ -1261,11 +1387,12 @@ rcu_torture_writer(void *arg)
atomic_inc(&rcu_torture_wcount[i]);
WRITE_ONCE(old_rp->rtort_pipe_count,
old_rp->rtort_pipe_count + 1);
+
+ // Make sure readers block polled grace periods.
if (cur_ops->get_gp_state && cur_ops->poll_gp_state) {
idx = cur_ops->readlock();
cookie = cur_ops->get_gp_state();
- WARN_ONCE(rcu_torture_writer_state != RTWS_DEF_FREE &&
- cur_ops->poll_gp_state(cookie),
+ WARN_ONCE(cur_ops->poll_gp_state(cookie),
"%s: Cookie check 1 failed %s(%d) %lu->%lu\n",
__func__,
rcu_torture_writer_state_getname(),
@@ -1277,6 +1404,21 @@ rcu_torture_writer(void *arg)
}
cur_ops->readunlock(idx);
}
+ if (cur_ops->get_gp_state_full && cur_ops->poll_gp_state_full) {
+ idx = cur_ops->readlock();
+ cur_ops->get_gp_state_full(&cookie_full);
+ WARN_ONCE(cur_ops->poll_gp_state_full(&cookie_full),
+ "%s: Cookie check 5 failed %s(%d) online %*pbl\n",
+ __func__,
+ rcu_torture_writer_state_getname(),
+ rcu_torture_writer_state,
+ cpumask_pr_args(cpu_online_mask));
+ if (cur_ops->get_gp_completed_full) {
+ cur_ops->get_gp_completed_full(&cookie_full);
+ WARN_ON_ONCE(!cur_ops->poll_gp_state_full(&cookie_full));
+ }
+ cur_ops->readunlock(idx);
+ }
switch (synctype[torture_random(&rand) % nsynctypes]) {
case RTWS_DEF_FREE:
rcu_torture_writer_state = RTWS_DEF_FREE;
@@ -1284,12 +1426,7 @@ rcu_torture_writer(void *arg)
break;
case RTWS_EXP_SYNC:
rcu_torture_writer_state = RTWS_EXP_SYNC;
- if (cur_ops->get_gp_state && cur_ops->poll_gp_state)
- cookie = cur_ops->get_gp_state();
- cur_ops->exp_sync();
- cur_ops->exp_sync();
- if (cur_ops->get_gp_state && cur_ops->poll_gp_state)
- WARN_ON_ONCE(!cur_ops->poll_gp_state(cookie));
+ do_rtws_sync(&rand, cur_ops->exp_sync);
rcu_torture_pipe_update(old_rp);
break;
case RTWS_COND_GET:
@@ -1308,6 +1445,22 @@ rcu_torture_writer(void *arg)
cur_ops->cond_sync_exp(gp_snap);
rcu_torture_pipe_update(old_rp);
break;
+ case RTWS_COND_GET_FULL:
+ rcu_torture_writer_state = RTWS_COND_GET_FULL;
+ cur_ops->get_gp_state_full(&gp_snap_full);
+ torture_hrtimeout_jiffies(torture_random(&rand) % 16, &rand);
+ rcu_torture_writer_state = RTWS_COND_SYNC_FULL;
+ cur_ops->cond_sync_full(&gp_snap_full);
+ rcu_torture_pipe_update(old_rp);
+ break;
+ case RTWS_COND_GET_EXP_FULL:
+ rcu_torture_writer_state = RTWS_COND_GET_EXP_FULL;
+ cur_ops->get_gp_state_full(&gp_snap_full);
+ torture_hrtimeout_jiffies(torture_random(&rand) % 16, &rand);
+ rcu_torture_writer_state = RTWS_COND_SYNC_EXP_FULL;
+ cur_ops->cond_sync_exp_full(&gp_snap_full);
+ rcu_torture_pipe_update(old_rp);
+ break;
case RTWS_POLL_GET:
rcu_torture_writer_state = RTWS_POLL_GET;
gp_snap = cur_ops->start_gp_poll();
@@ -1317,6 +1470,15 @@ rcu_torture_writer(void *arg)
&rand);
rcu_torture_pipe_update(old_rp);
break;
+ case RTWS_POLL_GET_FULL:
+ rcu_torture_writer_state = RTWS_POLL_GET_FULL;
+ cur_ops->start_gp_poll_full(&gp_snap_full);
+ rcu_torture_writer_state = RTWS_POLL_WAIT_FULL;
+ while (!cur_ops->poll_gp_state_full(&gp_snap_full))
+ torture_hrtimeout_jiffies(torture_random(&rand) % 16,
+ &rand);
+ rcu_torture_pipe_update(old_rp);
+ break;
case RTWS_POLL_GET_EXP:
rcu_torture_writer_state = RTWS_POLL_GET_EXP;
gp_snap = cur_ops->start_gp_poll_exp();
@@ -1326,14 +1488,18 @@ rcu_torture_writer(void *arg)
&rand);
rcu_torture_pipe_update(old_rp);
break;
+ case RTWS_POLL_GET_EXP_FULL:
+ rcu_torture_writer_state = RTWS_POLL_GET_EXP_FULL;
+ cur_ops->start_gp_poll_exp_full(&gp_snap_full);
+ rcu_torture_writer_state = RTWS_POLL_WAIT_EXP_FULL;
+ while (!cur_ops->poll_gp_state_full(&gp_snap_full))
+ torture_hrtimeout_jiffies(torture_random(&rand) % 16,
+ &rand);
+ rcu_torture_pipe_update(old_rp);
+ break;
case RTWS_SYNC:
rcu_torture_writer_state = RTWS_SYNC;
- if (cur_ops->get_gp_state && cur_ops->poll_gp_state)
- cookie = cur_ops->get_gp_state();
- cur_ops->sync();
- cur_ops->sync();
- if (cur_ops->get_gp_state && cur_ops->poll_gp_state)
- WARN_ON_ONCE(!cur_ops->poll_gp_state(cookie));
+ do_rtws_sync(&rand, cur_ops->sync);
rcu_torture_pipe_update(old_rp);
break;
default:
@@ -1400,6 +1566,7 @@ static int
rcu_torture_fakewriter(void *arg)
{
unsigned long gp_snap;
+ struct rcu_gp_oldstate gp_snap_full;
DEFINE_TORTURE_RANDOM(rand);
VERBOSE_TOROUT_STRING("rcu_torture_fakewriter task started");
@@ -1438,6 +1605,16 @@ rcu_torture_fakewriter(void *arg)
torture_hrtimeout_jiffies(torture_random(&rand) % 16, &rand);
cur_ops->cond_sync_exp(gp_snap);
break;
+ case RTWS_COND_GET_FULL:
+ cur_ops->get_gp_state_full(&gp_snap_full);
+ torture_hrtimeout_jiffies(torture_random(&rand) % 16, &rand);
+ cur_ops->cond_sync_full(&gp_snap_full);
+ break;
+ case RTWS_COND_GET_EXP_FULL:
+ cur_ops->get_gp_state_full(&gp_snap_full);
+ torture_hrtimeout_jiffies(torture_random(&rand) % 16, &rand);
+ cur_ops->cond_sync_exp_full(&gp_snap_full);
+ break;
case RTWS_POLL_GET:
gp_snap = cur_ops->start_gp_poll();
while (!cur_ops->poll_gp_state(gp_snap)) {
@@ -1445,6 +1622,13 @@ rcu_torture_fakewriter(void *arg)
&rand);
}
break;
+ case RTWS_POLL_GET_FULL:
+ cur_ops->start_gp_poll_full(&gp_snap_full);
+ while (!cur_ops->poll_gp_state_full(&gp_snap_full)) {
+ torture_hrtimeout_jiffies(torture_random(&rand) % 16,
+ &rand);
+ }
+ break;
case RTWS_POLL_GET_EXP:
gp_snap = cur_ops->start_gp_poll_exp();
while (!cur_ops->poll_gp_state_exp(gp_snap)) {
@@ -1452,6 +1636,13 @@ rcu_torture_fakewriter(void *arg)
&rand);
}
break;
+ case RTWS_POLL_GET_EXP_FULL:
+ cur_ops->start_gp_poll_exp_full(&gp_snap_full);
+ while (!cur_ops->poll_gp_state_full(&gp_snap_full)) {
+ torture_hrtimeout_jiffies(torture_random(&rand) % 16,
+ &rand);
+ }
+ break;
case RTWS_SYNC:
cur_ops->sync();
break;
@@ -1715,7 +1906,9 @@ rcutorture_loop_extend(int *readstate, struct torture_random_state *trsp,
*/
static bool rcu_torture_one_read(struct torture_random_state *trsp, long myid)
{
+ bool checkpolling = !(torture_random(trsp) & 0xfff);
unsigned long cookie;
+ struct rcu_gp_oldstate cookie_full;
int i;
unsigned long started;
unsigned long completed;
@@ -1731,8 +1924,12 @@ static bool rcu_torture_one_read(struct torture_random_state *trsp, long myid)
WARN_ON_ONCE(!rcu_is_watching());
newstate = rcutorture_extend_mask(readstate, trsp);
rcutorture_one_extend(&readstate, newstate, trsp, rtrsp++);
- if (cur_ops->get_gp_state && cur_ops->poll_gp_state)
- cookie = cur_ops->get_gp_state();
+ if (checkpolling) {
+ if (cur_ops->get_gp_state && cur_ops->poll_gp_state)
+ cookie = cur_ops->get_gp_state();
+ if (cur_ops->get_gp_state_full && cur_ops->poll_gp_state_full)
+ cur_ops->get_gp_state_full(&cookie_full);
+ }
started = cur_ops->get_gp_seq();
ts = rcu_trace_clock_local();
p = rcu_dereference_check(rcu_torture_current,
@@ -1766,13 +1963,22 @@ static bool rcu_torture_one_read(struct torture_random_state *trsp, long myid)
}
__this_cpu_inc(rcu_torture_batch[completed]);
preempt_enable();
- if (cur_ops->get_gp_state && cur_ops->poll_gp_state)
- WARN_ONCE(cur_ops->poll_gp_state(cookie),
- "%s: Cookie check 2 failed %s(%d) %lu->%lu\n",
- __func__,
- rcu_torture_writer_state_getname(),
- rcu_torture_writer_state,
- cookie, cur_ops->get_gp_state());
+ if (checkpolling) {
+ if (cur_ops->get_gp_state && cur_ops->poll_gp_state)
+ WARN_ONCE(cur_ops->poll_gp_state(cookie),
+ "%s: Cookie check 2 failed %s(%d) %lu->%lu\n",
+ __func__,
+ rcu_torture_writer_state_getname(),
+ rcu_torture_writer_state,
+ cookie, cur_ops->get_gp_state());
+ if (cur_ops->get_gp_state_full && cur_ops->poll_gp_state_full)
+ WARN_ONCE(cur_ops->poll_gp_state_full(&cookie_full),
+ "%s: Cookie check 6 failed %s(%d) online %*pbl\n",
+ __func__,
+ rcu_torture_writer_state_getname(),
+ rcu_torture_writer_state,
+ cpumask_pr_args(cpu_online_mask));
+ }
rcutorture_one_extend(&readstate, 0, trsp, rtrsp);
WARN_ON_ONCE(readstate);
// This next splat is expected behavior if leakpointer, especially
@@ -2600,12 +2806,12 @@ static int rcutorture_oom_notify(struct notifier_block *self,
for (i = 0; i < fwd_progress; i++)
ncbs += rcu_torture_fwd_prog_cbfree(&rfp[i]);
pr_info("%s: Freed %lu RCU callbacks.\n", __func__, ncbs);
- rcu_barrier();
+ cur_ops->cb_barrier();
ncbs = 0;
for (i = 0; i < fwd_progress; i++)
ncbs += rcu_torture_fwd_prog_cbfree(&rfp[i]);
pr_info("%s: Freed %lu RCU callbacks.\n", __func__, ncbs);
- rcu_barrier();
+ cur_ops->cb_barrier();
ncbs = 0;
for (i = 0; i < fwd_progress; i++)
ncbs += rcu_torture_fwd_prog_cbfree(&rfp[i]);
diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c
index 92c002d65482..33adafdad261 100644
--- a/kernel/rcu/srcutiny.c
+++ b/kernel/rcu/srcutiny.c
@@ -117,7 +117,7 @@ void srcu_drive_gp(struct work_struct *wp)
struct srcu_struct *ssp;
ssp = container_of(wp, struct srcu_struct, srcu_work);
- if (ssp->srcu_gp_running || USHORT_CMP_GE(ssp->srcu_idx, READ_ONCE(ssp->srcu_idx_max)))
+ if (ssp->srcu_gp_running || ULONG_CMP_GE(ssp->srcu_idx, READ_ONCE(ssp->srcu_idx_max)))
return; /* Already running or nothing to do. */
/* Remove recently arrived callbacks and wait for readers. */
@@ -150,17 +150,17 @@ void srcu_drive_gp(struct work_struct *wp)
* straighten that out.
*/
WRITE_ONCE(ssp->srcu_gp_running, false);
- if (USHORT_CMP_LT(ssp->srcu_idx, READ_ONCE(ssp->srcu_idx_max)))
+ if (ULONG_CMP_LT(ssp->srcu_idx, READ_ONCE(ssp->srcu_idx_max)))
schedule_work(&ssp->srcu_work);
}
EXPORT_SYMBOL_GPL(srcu_drive_gp);
static void srcu_gp_start_if_needed(struct srcu_struct *ssp)
{
- unsigned short cookie;
+ unsigned long cookie;
cookie = get_state_synchronize_srcu(ssp);
- if (USHORT_CMP_GE(READ_ONCE(ssp->srcu_idx_max), cookie))
+ if (ULONG_CMP_GE(READ_ONCE(ssp->srcu_idx_max), cookie))
return;
WRITE_ONCE(ssp->srcu_idx_max, cookie);
if (!READ_ONCE(ssp->srcu_gp_running)) {
@@ -215,7 +215,7 @@ unsigned long get_state_synchronize_srcu(struct srcu_struct *ssp)
barrier();
ret = (READ_ONCE(ssp->srcu_idx) + 3) & ~0x1;
barrier();
- return ret & USHRT_MAX;
+ return ret;
}
EXPORT_SYMBOL_GPL(get_state_synchronize_srcu);
@@ -240,10 +240,10 @@ EXPORT_SYMBOL_GPL(start_poll_synchronize_srcu);
*/
bool poll_state_synchronize_srcu(struct srcu_struct *ssp, unsigned long cookie)
{
- bool ret = USHORT_CMP_GE(READ_ONCE(ssp->srcu_idx), cookie);
+ unsigned long cur_s = READ_ONCE(ssp->srcu_idx);
barrier();
- return ret;
+ return ULONG_CMP_GE(cur_s, cookie) || ULONG_CMP_LT(cur_s, cookie - 3);
}
EXPORT_SYMBOL_GPL(poll_state_synchronize_srcu);
diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h
index 83c7e6620d40..f5bf6fb430da 100644
--- a/kernel/rcu/tasks.h
+++ b/kernel/rcu/tasks.h
@@ -560,7 +560,7 @@ static int __noreturn rcu_tasks_kthread(void *arg)
static void synchronize_rcu_tasks_generic(struct rcu_tasks *rtp)
{
/* Complain if the scheduler has not started. */
- RCU_LOCKDEP_WARN(rcu_scheduler_active == RCU_SCHEDULER_INACTIVE,
+ WARN_ONCE(rcu_scheduler_active == RCU_SCHEDULER_INACTIVE,
"synchronize_rcu_tasks called too soon");
// If the grace-period kthread is running, use it.
@@ -1500,6 +1500,7 @@ static void rcu_tasks_trace_pregp_step(struct list_head *hop)
if (rcu_tasks_trace_pertask_prep(t, true))
trc_add_holdout(t, hop);
rcu_read_unlock();
+ cond_resched_tasks_rcu_qs();
}
// Only after all running tasks have been accounted for is it
@@ -1520,6 +1521,7 @@ static void rcu_tasks_trace_pregp_step(struct list_head *hop)
raw_spin_lock_irqsave_rcu_node(rtpcp, flags);
}
raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags);
+ cond_resched_tasks_rcu_qs();
}
// Re-enable CPU hotplug now that the holdout list is populated.
@@ -1619,6 +1621,7 @@ static void check_all_holdout_tasks_trace(struct list_head *hop,
trc_del_holdout(t);
else if (needreport)
show_stalled_task_trace(t, firstreport);
+ cond_resched_tasks_rcu_qs();
}
// Re-enable CPU hotplug now that the holdout list scan has completed.
diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c
index f0561ee16b9c..a33a8d4942c3 100644
--- a/kernel/rcu/tiny.c
+++ b/kernel/rcu/tiny.c
@@ -158,6 +158,10 @@ void synchronize_rcu(void)
}
EXPORT_SYMBOL_GPL(synchronize_rcu);
+static void tiny_rcu_leak_callback(struct rcu_head *rhp)
+{
+}
+
/*
* Post an RCU callback to be invoked after the end of an RCU grace
* period. But since we have but one CPU, that would be after any
@@ -165,9 +169,20 @@ EXPORT_SYMBOL_GPL(synchronize_rcu);
*/
void call_rcu(struct rcu_head *head, rcu_callback_t func)
{
+ static atomic_t doublefrees;
unsigned long flags;
- debug_rcu_head_queue(head);
+ if (debug_rcu_head_queue(head)) {
+ if (atomic_inc_return(&doublefrees) < 4) {
+ pr_err("%s(): Double-freed CB %p->%pS()!!! ", __func__, head, head->func);
+ mem_dump_obj(head);
+ }
+
+ if (!__is_kvfree_rcu_offset((unsigned long)head->func))
+ WRITE_ONCE(head->func, tiny_rcu_leak_callback);
+ return;
+ }
+
head->func = func;
head->next = NULL;
@@ -184,6 +199,16 @@ void call_rcu(struct rcu_head *head, rcu_callback_t func)
EXPORT_SYMBOL_GPL(call_rcu);
/*
+ * Store a grace-period-counter "cookie". For more information,
+ * see the Tree RCU header comment.
+ */
+void get_completed_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp)
+{
+ rgosp->rgos_norm = RCU_GET_STATE_COMPLETED;
+}
+EXPORT_SYMBOL_GPL(get_completed_synchronize_rcu_full);
+
+/*
* Return a grace-period-counter "cookie". For more information,
* see the Tree RCU header comment.
*/
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 79aea7df4345..6bb8e72bc815 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -76,6 +76,7 @@
/* Data structures. */
static DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_data, rcu_data) = {
+ .gpwrap = true,
#ifdef CONFIG_RCU_NOCB_CPU
.cblist.flags = SEGCBLIST_RCU_CORE,
#endif
@@ -1755,6 +1756,8 @@ static noinline void rcu_gp_cleanup(void)
dump_blkd_tasks(rnp, 10);
WARN_ON_ONCE(rnp->qsmask);
WRITE_ONCE(rnp->gp_seq, new_gp_seq);
+ if (!rnp->parent)
+ smp_mb(); // Order against failing poll_state_synchronize_rcu_full().
rdp = this_cpu_ptr(&rcu_data);
if (rnp == rdp->mynode)
needgp = __note_gp_changes(rnp, rdp) || needgp;
@@ -2341,8 +2344,8 @@ void rcu_sched_clock_irq(int user)
rcu_flavor_sched_clock_irq(user);
if (rcu_pending(user))
invoke_rcu_core();
- if (user)
- rcu_tasks_classic_qs(current, false);
+ if (user || rcu_is_cpu_rrupt_from_idle())
+ rcu_note_voluntary_context_switch(current);
lockdep_assert_irqs_disabled();
trace_rcu_utilization(TPS("End scheduler-tick"));
@@ -2832,7 +2835,7 @@ EXPORT_SYMBOL_GPL(call_rcu);
/* Maximum number of jiffies to wait before draining a batch. */
-#define KFREE_DRAIN_JIFFIES (HZ / 50)
+#define KFREE_DRAIN_JIFFIES (5 * HZ)
#define KFREE_N_BATCHES 2
#define FREE_N_CHANNELS 2
@@ -3093,6 +3096,21 @@ need_offload_krc(struct kfree_rcu_cpu *krcp)
return !!krcp->head;
}
+static void
+schedule_delayed_monitor_work(struct kfree_rcu_cpu *krcp)
+{
+ long delay, delay_left;
+
+ delay = READ_ONCE(krcp->count) >= KVFREE_BULK_MAX_ENTR ? 1:KFREE_DRAIN_JIFFIES;
+ if (delayed_work_pending(&krcp->monitor_work)) {
+ delay_left = krcp->monitor_work.timer.expires - jiffies;
+ if (delay < delay_left)
+ mod_delayed_work(system_wq, &krcp->monitor_work, delay);
+ return;
+ }
+ queue_delayed_work(system_wq, &krcp->monitor_work, delay);
+}
+
/*
* This function is invoked after the KFREE_DRAIN_JIFFIES timeout.
*/
@@ -3150,7 +3168,7 @@ static void kfree_rcu_monitor(struct work_struct *work)
// work to repeat an attempt. Because previous batches are
// still in progress.
if (need_offload_krc(krcp))
- schedule_delayed_work(&krcp->monitor_work, KFREE_DRAIN_JIFFIES);
+ schedule_delayed_monitor_work(krcp);
raw_spin_unlock_irqrestore(&krcp->lock, flags);
}
@@ -3183,15 +3201,16 @@ static void fill_page_cache_func(struct work_struct *work)
bnode = (struct kvfree_rcu_bulk_data *)
__get_free_page(GFP_KERNEL | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN);
- if (bnode) {
- raw_spin_lock_irqsave(&krcp->lock, flags);
- pushed = put_cached_bnode(krcp, bnode);
- raw_spin_unlock_irqrestore(&krcp->lock, flags);
+ if (!bnode)
+ break;
- if (!pushed) {
- free_page((unsigned long) bnode);
- break;
- }
+ raw_spin_lock_irqsave(&krcp->lock, flags);
+ pushed = put_cached_bnode(krcp, bnode);
+ raw_spin_unlock_irqrestore(&krcp->lock, flags);
+
+ if (!pushed) {
+ free_page((unsigned long) bnode);
+ break;
}
}
@@ -3338,7 +3357,7 @@ void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
// Set timer to drain after KFREE_DRAIN_JIFFIES.
if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING)
- schedule_delayed_work(&krcp->monitor_work, KFREE_DRAIN_JIFFIES);
+ schedule_delayed_monitor_work(krcp);
unlock_return:
krc_this_cpu_unlock(krcp, flags);
@@ -3371,7 +3390,7 @@ kfree_rcu_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
atomic_set(&krcp->backoff_page_cache_fill, 1);
}
- return count;
+ return count == 0 ? SHRINK_EMPTY : count;
}
static unsigned long
@@ -3414,49 +3433,27 @@ void __init kfree_rcu_scheduler_running(void)
raw_spin_lock_irqsave(&krcp->lock, flags);
if (need_offload_krc(krcp))
- schedule_delayed_work_on(cpu, &krcp->monitor_work, KFREE_DRAIN_JIFFIES);
+ schedule_delayed_monitor_work(krcp);
raw_spin_unlock_irqrestore(&krcp->lock, flags);
}
}
/*
* During early boot, any blocking grace-period wait automatically
- * implies a grace period. Later on, this is never the case for PREEMPTION.
+ * implies a grace period.
*
- * However, because a context switch is a grace period for !PREEMPTION, any
- * blocking grace-period wait automatically implies a grace period if
- * there is only one CPU online at any point time during execution of
- * either synchronize_rcu() or synchronize_rcu_expedited(). It is OK to
- * occasionally incorrectly indicate that there are multiple CPUs online
- * when there was in fact only one the whole time, as this just adds some
- * overhead: RCU still operates correctly.
+ * Later on, this could in theory be the case for kernels built with
+ * CONFIG_SMP=y && CONFIG_PREEMPTION=y running on a single CPU, but this
+ * is not a common case. Furthermore, this optimization would cause
+ * the rcu_gp_oldstate structure to expand by 50%, so this potential
+ * grace-period optimization is ignored once the scheduler is running.
*/
static int rcu_blocking_is_gp(void)
{
- int ret;
-
- // Invoking preempt_model_*() too early gets a splat.
- if (rcu_scheduler_active == RCU_SCHEDULER_INACTIVE ||
- preempt_model_full() || preempt_model_rt())
- return rcu_scheduler_active == RCU_SCHEDULER_INACTIVE;
+ if (rcu_scheduler_active != RCU_SCHEDULER_INACTIVE)
+ return false;
might_sleep(); /* Check for RCU read-side critical section. */
- preempt_disable();
- /*
- * If the rcu_state.n_online_cpus counter is equal to one,
- * there is only one CPU, and that CPU sees all prior accesses
- * made by any CPU that was online at the time of its access.
- * Furthermore, if this counter is equal to one, its value cannot
- * change until after the preempt_enable() below.
- *
- * Furthermore, if rcu_state.n_online_cpus is equal to one here,
- * all later CPUs (both this one and any that come online later
- * on) are guaranteed to see all accesses prior to this point
- * in the code, without the need for additional memory barriers.
- * Those memory barriers are provided by CPU-hotplug code.
- */
- ret = READ_ONCE(rcu_state.n_online_cpus) <= 1;
- preempt_enable();
- return ret;
+ return true;
}
/**
@@ -3499,30 +3496,59 @@ static int rcu_blocking_is_gp(void)
*/
void synchronize_rcu(void)
{
+ unsigned long flags;
+ struct rcu_node *rnp;
+
RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) ||
lock_is_held(&rcu_lock_map) ||
lock_is_held(&rcu_sched_lock_map),
"Illegal synchronize_rcu() in RCU read-side critical section");
- if (rcu_blocking_is_gp()) {
- // Note well that this code runs with !PREEMPT && !SMP.
- // In addition, all code that advances grace periods runs at
- // process level. Therefore, this normal GP overlaps with
- // other normal GPs only by being fully nested within them,
- // which allows reuse of ->gp_seq_polled_snap.
- rcu_poll_gp_seq_start_unlocked(&rcu_state.gp_seq_polled_snap);
- rcu_poll_gp_seq_end_unlocked(&rcu_state.gp_seq_polled_snap);
- if (rcu_init_invoked())
- cond_resched_tasks_rcu_qs();
- return; // Context allows vacuous grace periods.
+ if (!rcu_blocking_is_gp()) {
+ if (rcu_gp_is_expedited())
+ synchronize_rcu_expedited();
+ else
+ wait_rcu_gp(call_rcu);
+ return;
}
- if (rcu_gp_is_expedited())
- synchronize_rcu_expedited();
- else
- wait_rcu_gp(call_rcu);
+
+ // Context allows vacuous grace periods.
+ // Note well that this code runs with !PREEMPT && !SMP.
+ // In addition, all code that advances grace periods runs at
+ // process level. Therefore, this normal GP overlaps with other
+ // normal GPs only by being fully nested within them, which allows
+ // reuse of ->gp_seq_polled_snap.
+ rcu_poll_gp_seq_start_unlocked(&rcu_state.gp_seq_polled_snap);
+ rcu_poll_gp_seq_end_unlocked(&rcu_state.gp_seq_polled_snap);
+
+ // Update the normal grace-period counters to record
+ // this grace period, but only those used by the boot CPU.
+ // The rcu_scheduler_starting() will take care of the rest of
+ // these counters.
+ local_irq_save(flags);
+ WARN_ON_ONCE(num_online_cpus() > 1);
+ rcu_state.gp_seq += (1 << RCU_SEQ_CTR_SHIFT);
+ for (rnp = this_cpu_ptr(&rcu_data)->mynode; rnp; rnp = rnp->parent)
+ rnp->gp_seq_needed = rnp->gp_seq = rcu_state.gp_seq;
+ local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(synchronize_rcu);
/**
+ * get_completed_synchronize_rcu_full - Return a full pre-completed polled state cookie
+ * @rgosp: Place to put state cookie
+ *
+ * Stores into @rgosp a value that will always be treated by functions
+ * like poll_state_synchronize_rcu_full() as a cookie whose grace period
+ * has already completed.
+ */
+void get_completed_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp)
+{
+ rgosp->rgos_norm = RCU_GET_STATE_COMPLETED;
+ rgosp->rgos_exp = RCU_GET_STATE_COMPLETED;
+}
+EXPORT_SYMBOL_GPL(get_completed_synchronize_rcu_full);
+
+/**
* get_state_synchronize_rcu - Snapshot current RCU state
*
* Returns a cookie that is used by a later call to cond_synchronize_rcu()
@@ -3541,21 +3567,42 @@ unsigned long get_state_synchronize_rcu(void)
EXPORT_SYMBOL_GPL(get_state_synchronize_rcu);
/**
- * start_poll_synchronize_rcu - Snapshot and start RCU grace period
+ * get_state_synchronize_rcu_full - Snapshot RCU state, both normal and expedited
+ * @rgosp: location to place combined normal/expedited grace-period state
*
- * Returns a cookie that is used by a later call to cond_synchronize_rcu()
- * or poll_state_synchronize_rcu() to determine whether or not a full
- * grace period has elapsed in the meantime. If the needed grace period
- * is not already slated to start, notifies RCU core of the need for that
- * grace period.
+ * Places the normal and expedited grace-period states in @rgosp. This
+ * state value can be passed to a later call to cond_synchronize_rcu_full()
+ * or poll_state_synchronize_rcu_full() to determine whether or not a
+ * grace period (whether normal or expedited) has elapsed in the meantime.
+ * The rcu_gp_oldstate structure takes up twice the memory of an unsigned
+ * long, but is guaranteed to see all grace periods. In contrast, the
+ * combined state occupies less memory, but can sometimes fail to take
+ * grace periods into account.
*
- * Interrupts must be enabled for the case where it is necessary to awaken
- * the grace-period kthread.
+ * This does not guarantee that the needed grace period will actually
+ * start.
*/
-unsigned long start_poll_synchronize_rcu(void)
+void get_state_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp)
+{
+ struct rcu_node *rnp = rcu_get_root();
+
+ /*
+ * Any prior manipulation of RCU-protected data must happen
+ * before the loads from ->gp_seq and ->expedited_sequence.
+ */
+ smp_mb(); /* ^^^ */
+ rgosp->rgos_norm = rcu_seq_snap(&rnp->gp_seq);
+ rgosp->rgos_exp = rcu_seq_snap(&rcu_state.expedited_sequence);
+}
+EXPORT_SYMBOL_GPL(get_state_synchronize_rcu_full);
+
+/*
+ * Helper function for start_poll_synchronize_rcu() and
+ * start_poll_synchronize_rcu_full().
+ */
+static void start_poll_synchronize_rcu_common(void)
{
unsigned long flags;
- unsigned long gp_seq = get_state_synchronize_rcu();
bool needwake;
struct rcu_data *rdp;
struct rcu_node *rnp;
@@ -3575,17 +3622,57 @@ unsigned long start_poll_synchronize_rcu(void)
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
if (needwake)
rcu_gp_kthread_wake();
+}
+
+/**
+ * start_poll_synchronize_rcu - Snapshot and start RCU grace period
+ *
+ * Returns a cookie that is used by a later call to cond_synchronize_rcu()
+ * or poll_state_synchronize_rcu() to determine whether or not a full
+ * grace period has elapsed in the meantime. If the needed grace period
+ * is not already slated to start, notifies RCU core of the need for that
+ * grace period.
+ *
+ * Interrupts must be enabled for the case where it is necessary to awaken
+ * the grace-period kthread.
+ */
+unsigned long start_poll_synchronize_rcu(void)
+{
+ unsigned long gp_seq = get_state_synchronize_rcu();
+
+ start_poll_synchronize_rcu_common();
return gp_seq;
}
EXPORT_SYMBOL_GPL(start_poll_synchronize_rcu);
/**
- * poll_state_synchronize_rcu - Conditionally wait for an RCU grace period
+ * start_poll_synchronize_rcu_full - Take a full snapshot and start RCU grace period
+ * @rgosp: value from get_state_synchronize_rcu_full() or start_poll_synchronize_rcu_full()
*
+ * Places the normal and expedited grace-period states in *@rgos. This
+ * state value can be passed to a later call to cond_synchronize_rcu_full()
+ * or poll_state_synchronize_rcu_full() to determine whether or not a
+ * grace period (whether normal or expedited) has elapsed in the meantime.
+ * If the needed grace period is not already slated to start, notifies
+ * RCU core of the need for that grace period.
+ *
+ * Interrupts must be enabled for the case where it is necessary to awaken
+ * the grace-period kthread.
+ */
+void start_poll_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp)
+{
+ get_state_synchronize_rcu_full(rgosp);
+
+ start_poll_synchronize_rcu_common();
+}
+EXPORT_SYMBOL_GPL(start_poll_synchronize_rcu_full);
+
+/**
+ * poll_state_synchronize_rcu - Has the specified RCU grace period completed?
* @oldstate: value from get_state_synchronize_rcu() or start_poll_synchronize_rcu()
*
* If a full RCU grace period has elapsed since the earlier call from
- * which oldstate was obtained, return @true, otherwise return @false.
+ * which @oldstate was obtained, return @true, otherwise return @false.
* If @false is returned, it is the caller's responsibility to invoke this
* function later on until it does return @true. Alternatively, the caller
* can explicitly wait for a grace period, for example, by passing @oldstate
@@ -3594,10 +3681,11 @@ EXPORT_SYMBOL_GPL(start_poll_synchronize_rcu);
* Yes, this function does not take counter wrap into account.
* But counter wrap is harmless. If the counter wraps, we have waited for
* more than a billion grace periods (and way more on a 64-bit system!).
- * Those needing to keep oldstate values for very long time periods
- * (many hours even on 32-bit systems) should check them occasionally
- * and either refresh them or set a flag indicating that the grace period
- * has completed.
+ * Those needing to keep old state values for very long time periods
+ * (many hours even on 32-bit systems) should check them occasionally and
+ * either refresh them or set a flag indicating that the grace period has
+ * completed. Alternatively, they can use get_completed_synchronize_rcu()
+ * to get a guaranteed-completed grace-period state.
*
* This function provides the same memory-ordering guarantees that
* would be provided by a synchronize_rcu() that was invoked at the call
@@ -3616,8 +3704,56 @@ bool poll_state_synchronize_rcu(unsigned long oldstate)
EXPORT_SYMBOL_GPL(poll_state_synchronize_rcu);
/**
- * cond_synchronize_rcu - Conditionally wait for an RCU grace period
+ * poll_state_synchronize_rcu_full - Has the specified RCU grace period completed?
+ * @rgosp: value from get_state_synchronize_rcu_full() or start_poll_synchronize_rcu_full()
*
+ * If a full RCU grace period has elapsed since the earlier call from
+ * which *rgosp was obtained, return @true, otherwise return @false.
+ * If @false is returned, it is the caller's responsibility to invoke this
+ * function later on until it does return @true. Alternatively, the caller
+ * can explicitly wait for a grace period, for example, by passing @rgosp
+ * to cond_synchronize_rcu() or by directly invoking synchronize_rcu().
+ *
+ * Yes, this function does not take counter wrap into account.
+ * But counter wrap is harmless. If the counter wraps, we have waited
+ * for more than a billion grace periods (and way more on a 64-bit
+ * system!). Those needing to keep rcu_gp_oldstate values for very
+ * long time periods (many hours even on 32-bit systems) should check
+ * them occasionally and either refresh them or set a flag indicating
+ * that the grace period has completed. Alternatively, they can use
+ * get_completed_synchronize_rcu_full() to get a guaranteed-completed
+ * grace-period state.
+ *
+ * This function provides the same memory-ordering guarantees that would
+ * be provided by a synchronize_rcu() that was invoked at the call to
+ * the function that provided @rgosp, and that returned at the end of this
+ * function. And this guarantee requires that the root rcu_node structure's
+ * ->gp_seq field be checked instead of that of the rcu_state structure.
+ * The problem is that the just-ending grace-period's callbacks can be
+ * invoked between the time that the root rcu_node structure's ->gp_seq
+ * field is updated and the time that the rcu_state structure's ->gp_seq
+ * field is updated. Therefore, if a single synchronize_rcu() is to
+ * cause a subsequent poll_state_synchronize_rcu_full() to return @true,
+ * then the root rcu_node structure is the one that needs to be polled.
+ */
+bool poll_state_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp)
+{
+ struct rcu_node *rnp = rcu_get_root();
+
+ smp_mb(); // Order against root rcu_node structure grace-period cleanup.
+ if (rgosp->rgos_norm == RCU_GET_STATE_COMPLETED ||
+ rcu_seq_done_exact(&rnp->gp_seq, rgosp->rgos_norm) ||
+ rgosp->rgos_exp == RCU_GET_STATE_COMPLETED ||
+ rcu_seq_done_exact(&rcu_state.expedited_sequence, rgosp->rgos_exp)) {
+ smp_mb(); /* Ensure GP ends before subsequent accesses. */
+ return true;
+ }
+ return false;
+}
+EXPORT_SYMBOL_GPL(poll_state_synchronize_rcu_full);
+
+/**
+ * cond_synchronize_rcu - Conditionally wait for an RCU grace period
* @oldstate: value from get_state_synchronize_rcu(), start_poll_synchronize_rcu(), or start_poll_synchronize_rcu_expedited()
*
* If a full RCU grace period has elapsed since the earlier call to
@@ -3641,6 +3777,33 @@ void cond_synchronize_rcu(unsigned long oldstate)
}
EXPORT_SYMBOL_GPL(cond_synchronize_rcu);
+/**
+ * cond_synchronize_rcu_full - Conditionally wait for an RCU grace period
+ * @rgosp: value from get_state_synchronize_rcu_full(), start_poll_synchronize_rcu_full(), or start_poll_synchronize_rcu_expedited_full()
+ *
+ * If a full RCU grace period has elapsed since the call to
+ * get_state_synchronize_rcu_full(), start_poll_synchronize_rcu_full(),
+ * or start_poll_synchronize_rcu_expedited_full() from which @rgosp was
+ * obtained, just return. Otherwise, invoke synchronize_rcu() to wait
+ * for a full grace period.
+ *
+ * Yes, this function does not take counter wrap into account.
+ * But counter wrap is harmless. If the counter wraps, we have waited for
+ * more than 2 billion grace periods (and way more on a 64-bit system!),
+ * so waiting for a couple of additional grace periods should be just fine.
+ *
+ * This function provides the same memory-ordering guarantees that
+ * would be provided by a synchronize_rcu() that was invoked at the call
+ * to the function that provided @rgosp and that returned at the end of
+ * this function.
+ */
+void cond_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp)
+{
+ if (!poll_state_synchronize_rcu_full(rgosp))
+ synchronize_rcu();
+}
+EXPORT_SYMBOL_GPL(cond_synchronize_rcu_full);
+
/*
* Check to see if there is any immediate RCU-related work to be done by
* the current CPU, returning 1 if so and zero otherwise. The checks are
@@ -4312,9 +4475,20 @@ early_initcall(rcu_spawn_gp_kthread);
*/
void rcu_scheduler_starting(void)
{
+ unsigned long flags;
+ struct rcu_node *rnp;
+
WARN_ON(num_online_cpus() != 1);
WARN_ON(nr_context_switches() > 0);
rcu_test_sync_prims();
+
+ // Fix up the ->gp_seq counters.
+ local_irq_save(flags);
+ rcu_for_each_node_breadth_first(rnp)
+ rnp->gp_seq_needed = rnp->gp_seq = rcu_state.gp_seq;
+ local_irq_restore(flags);
+
+ // Switch out of early boot mode.
rcu_scheduler_active = RCU_SCHEDULER_INIT;
rcu_test_sync_prims();
}
diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
index be667583a554..18e9b4cd78ef 100644
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -828,11 +828,13 @@ static void rcu_exp_handler(void *unused)
{
struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
struct rcu_node *rnp = rdp->mynode;
+ bool preempt_bh_enabled = !(preempt_count() & (PREEMPT_MASK | SOFTIRQ_MASK));
if (!(READ_ONCE(rnp->expmask) & rdp->grpmask) ||
__this_cpu_read(rcu_data.cpu_no_qs.b.exp))
return;
- if (rcu_is_cpu_rrupt_from_idle()) {
+ if (rcu_is_cpu_rrupt_from_idle() ||
+ (IS_ENABLED(CONFIG_PREEMPT_COUNT) && preempt_bh_enabled)) {
rcu_report_exp_rdp(this_cpu_ptr(&rcu_data));
return;
}
@@ -906,6 +908,7 @@ static int rcu_print_task_exp_stall(struct rcu_node *rnp)
void synchronize_rcu_expedited(void)
{
bool boottime = (rcu_scheduler_active == RCU_SCHEDULER_INIT);
+ unsigned long flags;
struct rcu_exp_work rew;
struct rcu_node *rnp;
unsigned long s;
@@ -924,8 +927,11 @@ void synchronize_rcu_expedited(void)
// them, which allows reuse of ->gp_seq_polled_exp_snap.
rcu_poll_gp_seq_start_unlocked(&rcu_state.gp_seq_polled_exp_snap);
rcu_poll_gp_seq_end_unlocked(&rcu_state.gp_seq_polled_exp_snap);
- if (rcu_init_invoked())
- cond_resched();
+
+ local_irq_save(flags);
+ WARN_ON_ONCE(num_online_cpus() > 1);
+ rcu_state.expedited_sequence += (1 << RCU_SEQ_CTR_SHIFT);
+ local_irq_restore(flags);
return; // Context allows vacuous grace periods.
}
@@ -1028,6 +1034,24 @@ unsigned long start_poll_synchronize_rcu_expedited(void)
EXPORT_SYMBOL_GPL(start_poll_synchronize_rcu_expedited);
/**
+ * start_poll_synchronize_rcu_expedited_full - Take a full snapshot and start expedited grace period
+ * @rgosp: Place to put snapshot of grace-period state
+ *
+ * Places the normal and expedited grace-period states in rgosp. This
+ * state value can be passed to a later call to cond_synchronize_rcu_full()
+ * or poll_state_synchronize_rcu_full() to determine whether or not a
+ * grace period (whether normal or expedited) has elapsed in the meantime.
+ * If the needed expedited grace period is not already slated to start,
+ * initiates that grace period.
+ */
+void start_poll_synchronize_rcu_expedited_full(struct rcu_gp_oldstate *rgosp)
+{
+ get_state_synchronize_rcu_full(rgosp);
+ (void)start_poll_synchronize_rcu_expedited();
+}
+EXPORT_SYMBOL_GPL(start_poll_synchronize_rcu_expedited_full);
+
+/**
* cond_synchronize_rcu_expedited - Conditionally wait for an expedited RCU grace period
*
* @oldstate: value from get_state_synchronize_rcu(), start_poll_synchronize_rcu(), or start_poll_synchronize_rcu_expedited()
@@ -1053,3 +1077,30 @@ void cond_synchronize_rcu_expedited(unsigned long oldstate)
synchronize_rcu_expedited();
}
EXPORT_SYMBOL_GPL(cond_synchronize_rcu_expedited);
+
+/**
+ * cond_synchronize_rcu_expedited_full - Conditionally wait for an expedited RCU grace period
+ * @rgosp: value from get_state_synchronize_rcu_full(), start_poll_synchronize_rcu_full(), or start_poll_synchronize_rcu_expedited_full()
+ *
+ * If a full RCU grace period has elapsed since the call to
+ * get_state_synchronize_rcu_full(), start_poll_synchronize_rcu_full(),
+ * or start_poll_synchronize_rcu_expedited_full() from which @rgosp was
+ * obtained, just return. Otherwise, invoke synchronize_rcu_expedited()
+ * to wait for a full grace period.
+ *
+ * Yes, this function does not take counter wrap into account.
+ * But counter wrap is harmless. If the counter wraps, we have waited for
+ * more than 2 billion grace periods (and way more on a 64-bit system!),
+ * so waiting for a couple of additional grace periods should be just fine.
+ *
+ * This function provides the same memory-ordering guarantees that
+ * would be provided by a synchronize_rcu() that was invoked at the call
+ * to the function that provided @rgosp and that returned at the end of
+ * this function.
+ */
+void cond_synchronize_rcu_expedited_full(struct rcu_gp_oldstate *rgosp)
+{
+ if (!poll_state_synchronize_rcu_full(rgosp))
+ synchronize_rcu_expedited();
+}
+EXPORT_SYMBOL_GPL(cond_synchronize_rcu_expedited_full);
diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h
index a8f574d8850d..0a5f0ef41484 100644
--- a/kernel/rcu/tree_nocb.h
+++ b/kernel/rcu/tree_nocb.h
@@ -1111,7 +1111,7 @@ int rcu_nocb_cpu_deoffload(int cpu)
if (!ret)
cpumask_clear_cpu(cpu, rcu_nocb_mask);
} else {
- pr_info("NOCB: Can't CB-deoffload an offline CPU\n");
+ pr_info("NOCB: Cannot CB-deoffload offline CPU %d\n", rdp->cpu);
ret = -EINVAL;
}
}
@@ -1196,7 +1196,7 @@ int rcu_nocb_cpu_offload(int cpu)
if (!ret)
cpumask_set_cpu(cpu, rcu_nocb_mask);
} else {
- pr_info("NOCB: Can't CB-offload an offline CPU\n");
+ pr_info("NOCB: Cannot CB-offload offline CPU %d\n", rdp->cpu);
ret = -EINVAL;
}
}
@@ -1452,8 +1452,8 @@ static void show_rcu_nocb_gp_state(struct rcu_data *rdp)
(long)rdp->nocb_gp_seq,
rnp->grplo, rnp->grphi, READ_ONCE(rdp->nocb_gp_loops),
rdp->nocb_gp_kthread ? task_state_to_char(rdp->nocb_gp_kthread) : '.',
- rdp->nocb_cb_kthread ? (int)task_cpu(rdp->nocb_gp_kthread) : -1,
- show_rcu_should_be_on_cpu(rdp->nocb_cb_kthread));
+ rdp->nocb_gp_kthread ? (int)task_cpu(rdp->nocb_gp_kthread) : -1,
+ show_rcu_should_be_on_cpu(rdp->nocb_gp_kthread));
}
/* Dump out nocb kthread state for the specified rcu_data structure. */
@@ -1497,7 +1497,7 @@ static void show_rcu_nocb_state(struct rcu_data *rdp)
".B"[!!rcu_cblist_n_cbs(&rdp->nocb_bypass)],
rcu_segcblist_n_cbs(&rdp->cblist),
rdp->nocb_cb_kthread ? task_state_to_char(rdp->nocb_cb_kthread) : '.',
- rdp->nocb_cb_kthread ? (int)task_cpu(rdp->nocb_gp_kthread) : -1,
+ rdp->nocb_cb_kthread ? (int)task_cpu(rdp->nocb_cb_kthread) : -1,
show_rcu_should_be_on_cpu(rdp->nocb_cb_kthread));
/* It is OK for GP kthreads to have GP state. */
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 438ecae6bd7e..e3142ee35fc6 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -641,7 +641,8 @@ static void rcu_read_unlock_special(struct task_struct *t)
expboost = (t->rcu_blocked_node && READ_ONCE(t->rcu_blocked_node->exp_tasks)) ||
(rdp->grpmask & READ_ONCE(rnp->expmask)) ||
- IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD) ||
+ (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD) &&
+ ((rdp->grpmask & READ_ONCE(rnp->qsmask)) || t->rcu_blocked_node)) ||
(IS_ENABLED(CONFIG_RCU_BOOST) && irqs_were_disabled &&
t->rcu_blocked_node);
// Need to defer quiescent state until everything is enabled.
@@ -718,9 +719,6 @@ static void rcu_flavor_sched_clock_irq(int user)
struct task_struct *t = current;
lockdep_assert_irqs_disabled();
- if (user || rcu_is_cpu_rrupt_from_idle()) {
- rcu_note_voluntary_context_switch(current);
- }
if (rcu_preempt_depth() > 0 ||
(preempt_count() & (PREEMPT_MASK | SOFTIRQ_MASK))) {
/* No QS, force context switch if deferred. */
@@ -824,6 +822,7 @@ void rcu_read_unlock_strict(void)
if (irqs_disabled() || preempt_count() || !rcu_state.gp_kthread)
return;
rdp = this_cpu_ptr(&rcu_data);
+ rdp->cpu_no_qs.b.norm = false;
rcu_report_qs_rdp(rdp);
udelay(rcu_unlock_delay);
}
@@ -869,7 +868,7 @@ void rcu_all_qs(void)
if (!raw_cpu_read(rcu_data.rcu_urgent_qs))
return;
- preempt_disable();
+ preempt_disable(); // For CONFIG_PREEMPT_COUNT=y kernels
/* Load rcu_urgent_qs before other flags. */
if (!smp_load_acquire(this_cpu_ptr(&rcu_data.rcu_urgent_qs))) {
preempt_enable();
@@ -931,10 +930,13 @@ static notrace bool rcu_preempt_need_deferred_qs(struct task_struct *t)
return false;
}
-// Except that we do need to respond to a request by an expedited grace
-// period for a quiescent state from this CPU. Note that requests from
-// tasks are handled when removing the task from the blocked-tasks list
-// below.
+// Except that we do need to respond to a request by an expedited
+// grace period for a quiescent state from this CPU. Note that in
+// non-preemptible kernels, there can be no context switches within RCU
+// read-side critical sections, which in turn means that the leaf rcu_node
+// structure's blocked-tasks list is always empty. is therefore no need to
+// actually check it. Instead, a quiescent state from this CPU suffices,
+// and this function is only called from such a quiescent state.
notrace void rcu_preempt_deferred_qs(struct task_struct *t)
{
struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
@@ -972,7 +974,6 @@ static void rcu_flavor_sched_clock_irq(int user)
* neither access nor modify, at least not while the
* corresponding CPU is online.
*/
-
rcu_qs();
}
}
@@ -1238,8 +1239,11 @@ static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
cpu != outgoingcpu)
cpumask_set_cpu(cpu, cm);
cpumask_and(cm, cm, housekeeping_cpumask(HK_TYPE_RCU));
- if (cpumask_empty(cm))
+ if (cpumask_empty(cm)) {
cpumask_copy(cm, housekeeping_cpumask(HK_TYPE_RCU));
+ if (outgoingcpu >= 0)
+ cpumask_clear_cpu(outgoingcpu, cm);
+ }
set_cpus_allowed_ptr(t, cm);
mutex_unlock(&rnp->boost_kthread_mutex);
free_cpumask_var(cm);
diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h
index c3fbbcc09327..5653560573e2 100644
--- a/kernel/rcu/tree_stall.h
+++ b/kernel/rcu/tree_stall.h
@@ -368,7 +368,7 @@ static void rcu_dump_cpu_stacks(void)
if (rnp->qsmask & leaf_node_cpu_bit(rnp, cpu)) {
if (cpu_is_offline(cpu))
pr_err("Offline CPU %d blocking current GP.\n", cpu);
- else if (!trigger_single_cpu_backtrace(cpu))
+ else
dump_cpu_task(cpu);
}
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
@@ -511,8 +511,7 @@ static void rcu_check_gp_kthread_starvation(void)
pr_err("RCU GP kthread last ran on offline CPU %d.\n", cpu);
} else {
pr_err("Stack dump where RCU GP kthread last ran:\n");
- if (!trigger_single_cpu_backtrace(cpu))
- dump_cpu_task(cpu);
+ dump_cpu_task(cpu);
}
}
wake_up_process(gpk);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index ee28253c9ac0..60fdc0faf1c9 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -73,6 +73,7 @@
#include <uapi/linux/sched/types.h>
+#include <asm/irq_regs.h>
#include <asm/switch_to.h>
#include <asm/tlb.h>
@@ -11183,6 +11184,19 @@ struct cgroup_subsys cpu_cgrp_subsys = {
void dump_cpu_task(int cpu)
{
+ if (cpu == smp_processor_id() && in_hardirq()) {
+ struct pt_regs *regs;
+
+ regs = get_irq_regs();
+ if (regs) {
+ show_regs(regs);
+ return;
+ }
+ }
+
+ if (trigger_single_cpu_backtrace(cpu))
+ return;
+
pr_info("Task dump for CPU %d:\n", cpu);
sched_show_task(cpu_curr(cpu));
}
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 1207c78f85c1..9161d1136d01 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -25,6 +25,9 @@ struct sugov_policy {
unsigned int next_freq;
unsigned int cached_raw_freq;
+ /* max CPU capacity, which is equal for all CPUs in freq. domain */
+ unsigned long max;
+
/* The next fields are only needed if fast switch cannot be used: */
struct irq_work irq_work;
struct kthread_work work;
@@ -48,7 +51,6 @@ struct sugov_cpu {
unsigned long util;
unsigned long bw_dl;
- unsigned long max;
/* The field below is for single-CPU policies only: */
#ifdef CONFIG_NO_HZ_COMMON
@@ -158,7 +160,6 @@ static void sugov_get_util(struct sugov_cpu *sg_cpu)
{
struct rq *rq = cpu_rq(sg_cpu->cpu);
- sg_cpu->max = arch_scale_cpu_capacity(sg_cpu->cpu);
sg_cpu->bw_dl = cpu_bw_dl(rq);
sg_cpu->util = effective_cpu_util(sg_cpu->cpu, cpu_util_cfs(sg_cpu->cpu),
FREQUENCY_UTIL, NULL);
@@ -253,6 +254,7 @@ static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, u64 time,
*/
static void sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time)
{
+ struct sugov_policy *sg_policy = sg_cpu->sg_policy;
unsigned long boost;
/* No boost currently required */
@@ -280,7 +282,8 @@ static void sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time)
* sg_cpu->util is already in capacity scale; convert iowait_boost
* into the same scale so we can compare.
*/
- boost = (sg_cpu->iowait_boost * sg_cpu->max) >> SCHED_CAPACITY_SHIFT;
+ boost = sg_cpu->iowait_boost * sg_policy->max;
+ boost >>= SCHED_CAPACITY_SHIFT;
boost = uclamp_rq_util_with(cpu_rq(sg_cpu->cpu), boost, NULL);
if (sg_cpu->util < boost)
sg_cpu->util = boost;
@@ -337,7 +340,7 @@ static void sugov_update_single_freq(struct update_util_data *hook, u64 time,
if (!sugov_update_single_common(sg_cpu, time, flags))
return;
- next_f = get_next_freq(sg_policy, sg_cpu->util, sg_cpu->max);
+ next_f = get_next_freq(sg_policy, sg_cpu->util, sg_policy->max);
/*
* Do not reduce the frequency if the CPU has not been idle
* recently, as the reduction is likely to be premature then.
@@ -373,6 +376,7 @@ static void sugov_update_single_perf(struct update_util_data *hook, u64 time,
unsigned int flags)
{
struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util);
+ struct sugov_policy *sg_policy = sg_cpu->sg_policy;
unsigned long prev_util = sg_cpu->util;
/*
@@ -399,7 +403,8 @@ static void sugov_update_single_perf(struct update_util_data *hook, u64 time,
sg_cpu->util = prev_util;
cpufreq_driver_adjust_perf(sg_cpu->cpu, map_util_perf(sg_cpu->bw_dl),
- map_util_perf(sg_cpu->util), sg_cpu->max);
+ map_util_perf(sg_cpu->util),
+ sg_policy->max);
sg_cpu->sg_policy->last_freq_update_time = time;
}
@@ -408,25 +413,19 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time)
{
struct sugov_policy *sg_policy = sg_cpu->sg_policy;
struct cpufreq_policy *policy = sg_policy->policy;
- unsigned long util = 0, max = 1;
+ unsigned long util = 0;
unsigned int j;
for_each_cpu(j, policy->cpus) {
struct sugov_cpu *j_sg_cpu = &per_cpu(sugov_cpu, j);
- unsigned long j_util, j_max;
sugov_get_util(j_sg_cpu);
sugov_iowait_apply(j_sg_cpu, time);
- j_util = j_sg_cpu->util;
- j_max = j_sg_cpu->max;
- if (j_util * max > j_max * util) {
- util = j_util;
- max = j_max;
- }
+ util = max(j_sg_cpu->util, util);
}
- return get_next_freq(sg_policy, util, max);
+ return get_next_freq(sg_policy, util, sg_policy->max);
}
static void
@@ -752,7 +751,7 @@ static int sugov_start(struct cpufreq_policy *policy)
{
struct sugov_policy *sg_policy = policy->governor_data;
void (*uu)(struct update_util_data *data, u64 time, unsigned int flags);
- unsigned int cpu;
+ unsigned int cpu = cpumask_first(policy->cpus);
sg_policy->freq_update_delay_ns = sg_policy->tunables->rate_limit_us * NSEC_PER_USEC;
sg_policy->last_freq_update_time = 0;
@@ -760,6 +759,7 @@ static int sugov_start(struct cpufreq_policy *policy)
sg_policy->work_in_progress = false;
sg_policy->limits_changed = false;
sg_policy->cached_raw_freq = 0;
+ sg_policy->max = arch_scale_cpu_capacity(cpu);
sg_policy->need_freq_update = cpufreq_driver_test_flags(CPUFREQ_NEED_UPDATE_LIMITS);
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index bb3d63bdf4ae..667876da8382 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -416,7 +416,7 @@ void update_sched_domain_debugfs(void)
char buf[32];
snprintf(buf, sizeof(buf), "cpu%d", cpu);
- debugfs_remove(debugfs_lookup(buf, sd_dentry));
+ debugfs_lookup_and_remove(buf, sd_dentry);
d_cpu = debugfs_create_dir(buf, sd_dentry);
i = 0;
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index ec66b40bdd40..ecb4b4ff4ce0 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -190,12 +190,8 @@ static void group_init(struct psi_group *group)
/* Init trigger-related members */
mutex_init(&group->trigger_lock);
INIT_LIST_HEAD(&group->triggers);
- memset(group->nr_triggers, 0, sizeof(group->nr_triggers));
- group->poll_states = 0;
group->poll_min_period = U32_MAX;
- memset(group->polling_total, 0, sizeof(group->polling_total));
group->polling_next_update = ULLONG_MAX;
- group->polling_until = 0;
init_waitqueue_head(&group->poll_wait);
timer_setup(&group->poll_timer, poll_timer_fn, 0);
rcu_assign_pointer(group->poll_task, NULL);
@@ -957,7 +953,7 @@ int psi_cgroup_alloc(struct cgroup *cgroup)
if (static_branch_likely(&psi_disabled))
return 0;
- cgroup->psi = kmalloc(sizeof(struct psi_group), GFP_KERNEL);
+ cgroup->psi = kzalloc(sizeof(struct psi_group), GFP_KERNEL);
if (!cgroup->psi)
return -ENOMEM;
@@ -1091,7 +1087,7 @@ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res)
}
struct psi_trigger *psi_trigger_create(struct psi_group *group,
- char *buf, size_t nbytes, enum psi_res res)
+ char *buf, enum psi_res res)
{
struct psi_trigger *t;
enum psi_states state;
@@ -1320,7 +1316,7 @@ static ssize_t psi_write(struct file *file, const char __user *user_buf,
return -EBUSY;
}
- new = psi_trigger_create(&psi_system, buf, nbytes, res);
+ new = psi_trigger_create(&psi_system, buf, res);
if (IS_ERR(new)) {
mutex_unlock(&seq->lock);
return PTR_ERR(new);
diff --git a/kernel/sched/wait_bit.c b/kernel/sched/wait_bit.c
index d4788f810b55..0b1cd985dc27 100644
--- a/kernel/sched/wait_bit.c
+++ b/kernel/sched/wait_bit.c
@@ -47,7 +47,7 @@ __wait_on_bit(struct wait_queue_head *wq_head, struct wait_bit_queue_entry *wbq_
prepare_to_wait(wq_head, &wbq_entry->wq_entry, mode);
if (test_bit(wbq_entry->key.bit_nr, wbq_entry->key.flags))
ret = (*action)(&wbq_entry->key, mode);
- } while (test_bit(wbq_entry->key.bit_nr, wbq_entry->key.flags) && !ret);
+ } while (test_bit_acquire(wbq_entry->key.bit_nr, wbq_entry->key.flags) && !ret);
finish_wait(wq_head, &wbq_entry->wq_entry);
diff --git a/kernel/smp.c b/kernel/smp.c
index 650810a6f29b..e8cdc025a046 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -370,8 +370,7 @@ static bool csd_lock_wait_toolong(struct __call_single_data *csd, u64 ts0, u64 *
if (cpu >= 0) {
if (static_branch_unlikely(&csdlock_debug_extended))
csd_lock_print_extended(csd, cpu);
- if (!trigger_single_cpu_backtrace(cpu))
- dump_cpu_task(cpu);
+ dump_cpu_task(cpu);
if (!cpu_cur_csd) {
pr_alert("csd: Re-sending CSD lock (#%d) IPI from CPU#%02d to CPU#%02d\n", *bug_id, raw_smp_processor_id(), cpu);
arch_send_call_function_single_ipi(cpu);
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index a492f159624f..860b2dcf3ac4 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -277,6 +277,7 @@ COND_SYSCALL(landlock_restrict_self);
/* mm/fadvise.c */
COND_SYSCALL(fadvise64_64);
+COND_SYSCALL_COMPAT(fadvise64_64);
/* mm/, CONFIG_MMU only */
COND_SYSCALL(swapon);
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index bc921a3f7ea8..439e2ab6905e 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -1861,8 +1861,6 @@ static void ftrace_hash_rec_enable_modify(struct ftrace_ops *ops,
ftrace_hash_rec_update_modify(ops, filter_hash, 1);
}
-static bool ops_references_ip(struct ftrace_ops *ops, unsigned long ip);
-
/*
* Try to update IPMODIFY flag on each ftrace_rec. Return 0 if it is OK
* or no-needed to update, -EBUSY if it detects a conflict of the flag
@@ -2974,6 +2972,16 @@ int ftrace_startup(struct ftrace_ops *ops, int command)
ftrace_startup_enable(command);
+ /*
+ * If ftrace is in an undefined state, we just remove ops from list
+ * to prevent the NULL pointer, instead of totally rolling it back and
+ * free trampoline, because those actions could cause further damage.
+ */
+ if (unlikely(ftrace_disabled)) {
+ __unregister_ftrace_function(ops);
+ return -ENODEV;
+ }
+
ops->flags &= ~FTRACE_OPS_FL_ADDING;
return 0;
@@ -3108,49 +3116,6 @@ static inline int ops_traces_mod(struct ftrace_ops *ops)
ftrace_hash_empty(ops->func_hash->notrace_hash);
}
-/*
- * Check if the current ops references the given ip.
- *
- * If the ops traces all functions, then it was already accounted for.
- * If the ops does not trace the current record function, skip it.
- * If the ops ignores the function via notrace filter, skip it.
- */
-static bool
-ops_references_ip(struct ftrace_ops *ops, unsigned long ip)
-{
- /* If ops isn't enabled, ignore it */
- if (!(ops->flags & FTRACE_OPS_FL_ENABLED))
- return false;
-
- /* If ops traces all then it includes this function */
- if (ops_traces_mod(ops))
- return true;
-
- /* The function must be in the filter */
- if (!ftrace_hash_empty(ops->func_hash->filter_hash) &&
- !__ftrace_lookup_ip(ops->func_hash->filter_hash, ip))
- return false;
-
- /* If in notrace hash, we ignore it too */
- if (ftrace_lookup_ip(ops->func_hash->notrace_hash, ip))
- return false;
-
- return true;
-}
-
-/*
- * Check if the current ops references the record.
- *
- * If the ops traces all functions, then it was already accounted for.
- * If the ops does not trace the current record function, skip it.
- * If the ops ignores the function via notrace filter, skip it.
- */
-static bool
-ops_references_rec(struct ftrace_ops *ops, struct dyn_ftrace *rec)
-{
- return ops_references_ip(ops, rec->ip);
-}
-
static int ftrace_update_code(struct module *mod, struct ftrace_page *new_pgs)
{
bool init_nop = ftrace_need_init_nop();
@@ -6812,6 +6777,38 @@ static int ftrace_get_trampoline_kallsym(unsigned int symnum,
return -ERANGE;
}
+#if defined(CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS) || defined(CONFIG_MODULES)
+/*
+ * Check if the current ops references the given ip.
+ *
+ * If the ops traces all functions, then it was already accounted for.
+ * If the ops does not trace the current record function, skip it.
+ * If the ops ignores the function via notrace filter, skip it.
+ */
+static bool
+ops_references_ip(struct ftrace_ops *ops, unsigned long ip)
+{
+ /* If ops isn't enabled, ignore it */
+ if (!(ops->flags & FTRACE_OPS_FL_ENABLED))
+ return false;
+
+ /* If ops traces all then it includes this function */
+ if (ops_traces_mod(ops))
+ return true;
+
+ /* The function must be in the filter */
+ if (!ftrace_hash_empty(ops->func_hash->filter_hash) &&
+ !__ftrace_lookup_ip(ops->func_hash->filter_hash, ip))
+ return false;
+
+ /* If in notrace hash, we ignore it too */
+ if (ftrace_lookup_ip(ops->func_hash->notrace_hash, ip))
+ return false;
+
+ return true;
+}
+#endif
+
#ifdef CONFIG_MODULES
#define next_to_ftrace_page(p) container_of(p, struct ftrace_page, next)
@@ -6824,7 +6821,7 @@ static int referenced_filters(struct dyn_ftrace *rec)
int cnt = 0;
for (ops = ftrace_ops_list; ops != &ftrace_list_end; ops = ops->next) {
- if (ops_references_rec(ops, rec)) {
+ if (ops_references_ip(ops, rec->ip)) {
if (WARN_ON_ONCE(ops->flags & FTRACE_OPS_FL_DIRECT))
continue;
if (WARN_ON_ONCE(ops->flags & FTRACE_OPS_FL_IPMODIFY))
diff --git a/kernel/trace/rv/monitors/wip/wip.h b/kernel/trace/rv/monitors/wip/wip.h
index c1c47e2305ef..dacc37b62a2c 100644
--- a/kernel/trace/rv/monitors/wip/wip.h
+++ b/kernel/trace/rv/monitors/wip/wip.h
@@ -27,7 +27,7 @@ struct automaton_wip {
bool final_states[state_max_wip];
};
-struct automaton_wip automaton_wip = {
+static struct automaton_wip automaton_wip = {
.state_names = {
"preemptive",
"non_preemptive"
diff --git a/kernel/trace/rv/monitors/wwnr/wwnr.h b/kernel/trace/rv/monitors/wwnr/wwnr.h
index d1afe55cdd4c..118e576b91b4 100644
--- a/kernel/trace/rv/monitors/wwnr/wwnr.h
+++ b/kernel/trace/rv/monitors/wwnr/wwnr.h
@@ -27,7 +27,7 @@ struct automaton_wwnr {
bool final_states[state_max_wwnr];
};
-struct automaton_wwnr automaton_wwnr = {
+static struct automaton_wwnr automaton_wwnr = {
.state_names = {
"not_running",
"running"
diff --git a/kernel/trace/rv/reactor_panic.c b/kernel/trace/rv/reactor_panic.c
index b698d05dd069..d65f6c25a87c 100644
--- a/kernel/trace/rv/reactor_panic.c
+++ b/kernel/trace/rv/reactor_panic.c
@@ -24,13 +24,13 @@ static struct rv_reactor rv_panic = {
.react = rv_panic_reaction
};
-static int register_react_panic(void)
+static int __init register_react_panic(void)
{
rv_register_reactor(&rv_panic);
return 0;
}
-static void unregister_react_panic(void)
+static void __exit unregister_react_panic(void)
{
rv_unregister_reactor(&rv_panic);
}
diff --git a/kernel/trace/rv/reactor_printk.c b/kernel/trace/rv/reactor_printk.c
index 31899f953af4..4b6b7106a477 100644
--- a/kernel/trace/rv/reactor_printk.c
+++ b/kernel/trace/rv/reactor_printk.c
@@ -23,13 +23,13 @@ static struct rv_reactor rv_printk = {
.react = rv_printk_reaction
};
-static int register_react_printk(void)
+static int __init register_react_printk(void)
{
rv_register_reactor(&rv_printk);
return 0;
}
-static void unregister_react_printk(void)
+static void __exit unregister_react_printk(void)
{
rv_unregister_reactor(&rv_printk);
}
diff --git a/kernel/trace/trace_eprobe.c b/kernel/trace/trace_eprobe.c
index 4a0e9d927443..1783e3478912 100644
--- a/kernel/trace/trace_eprobe.c
+++ b/kernel/trace/trace_eprobe.c
@@ -227,6 +227,7 @@ static int trace_eprobe_tp_arg_update(struct trace_eprobe *ep, int i)
struct probe_arg *parg = &ep->tp.args[i];
struct ftrace_event_field *field;
struct list_head *head;
+ int ret = -ENOENT;
head = trace_get_fields(ep->event);
list_for_each_entry(field, head, link) {
@@ -236,9 +237,20 @@ static int trace_eprobe_tp_arg_update(struct trace_eprobe *ep, int i)
return 0;
}
}
+
+ /*
+ * Argument not found on event. But allow for comm and COMM
+ * to be used to get the current->comm.
+ */
+ if (strcmp(parg->code->data, "COMM") == 0 ||
+ strcmp(parg->code->data, "comm") == 0) {
+ parg->code->op = FETCH_OP_COMM;
+ ret = 0;
+ }
+
kfree(parg->code->data);
parg->code->data = NULL;
- return -ENOENT;
+ return ret;
}
static int eprobe_event_define_fields(struct trace_event_call *event_call)
@@ -311,6 +323,27 @@ static unsigned long get_event_field(struct fetch_insn *code, void *rec)
addr = rec + field->offset;
+ if (is_string_field(field)) {
+ switch (field->filter_type) {
+ case FILTER_DYN_STRING:
+ val = (unsigned long)(rec + (*(unsigned int *)addr & 0xffff));
+ break;
+ case FILTER_RDYN_STRING:
+ val = (unsigned long)(addr + (*(unsigned int *)addr & 0xffff));
+ break;
+ case FILTER_STATIC_STRING:
+ val = (unsigned long)addr;
+ break;
+ case FILTER_PTR_STRING:
+ val = (unsigned long)(*(char *)addr);
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ return 0;
+ }
+ return val;
+ }
+
switch (field->size) {
case 1:
if (field->is_signed)
@@ -342,16 +375,38 @@ static unsigned long get_event_field(struct fetch_insn *code, void *rec)
static int get_eprobe_size(struct trace_probe *tp, void *rec)
{
+ struct fetch_insn *code;
struct probe_arg *arg;
int i, len, ret = 0;
for (i = 0; i < tp->nr_args; i++) {
arg = tp->args + i;
- if (unlikely(arg->dynamic)) {
+ if (arg->dynamic) {
unsigned long val;
- val = get_event_field(arg->code, rec);
- len = process_fetch_insn_bottom(arg->code + 1, val, NULL, NULL);
+ code = arg->code;
+ retry:
+ switch (code->op) {
+ case FETCH_OP_TP_ARG:
+ val = get_event_field(code, rec);
+ break;
+ case FETCH_OP_IMM:
+ val = code->immediate;
+ break;
+ case FETCH_OP_COMM:
+ val = (unsigned long)current->comm;
+ break;
+ case FETCH_OP_DATA:
+ val = (unsigned long)code->data;
+ break;
+ case FETCH_NOP_SYMBOL: /* Ignore a place holder */
+ code++;
+ goto retry;
+ default:
+ continue;
+ }
+ code++;
+ len = process_fetch_insn_bottom(code, val, NULL, NULL);
if (len > 0)
ret += len;
}
@@ -369,8 +424,28 @@ process_fetch_insn(struct fetch_insn *code, void *rec, void *dest,
{
unsigned long val;
- val = get_event_field(code, rec);
- return process_fetch_insn_bottom(code + 1, val, dest, base);
+ retry:
+ switch (code->op) {
+ case FETCH_OP_TP_ARG:
+ val = get_event_field(code, rec);
+ break;
+ case FETCH_OP_IMM:
+ val = code->immediate;
+ break;
+ case FETCH_OP_COMM:
+ val = (unsigned long)current->comm;
+ break;
+ case FETCH_OP_DATA:
+ val = (unsigned long)code->data;
+ break;
+ case FETCH_NOP_SYMBOL: /* Ignore a place holder */
+ code++;
+ goto retry;
+ default:
+ return -EILSEQ;
+ }
+ code++;
+ return process_fetch_insn_bottom(code, val, dest, base);
}
NOKPROBE_SYMBOL(process_fetch_insn)
@@ -845,6 +920,10 @@ static int trace_eprobe_tp_update_arg(struct trace_eprobe *ep, const char *argv[
trace_probe_log_err(0, BAD_ATTACH_ARG);
}
+ /* Handle symbols "@" */
+ if (!ret)
+ ret = traceprobe_update_arg(&ep->tp.args[i]);
+
return ret;
}
@@ -883,7 +962,7 @@ static int __trace_eprobe_create(int argc, const char *argv[])
trace_probe_log_set_index(1);
sys_event = argv[1];
ret = traceprobe_parse_event_name(&sys_event, &sys_name, buf2, 0);
- if (!sys_event || !sys_name) {
+ if (ret || !sys_event || !sys_name) {
trace_probe_log_err(0, NO_EVENT_INFO);
goto parse_error;
}
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index a114549720d6..61e3a2620fa3 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -157,7 +157,7 @@ static void perf_trace_event_unreg(struct perf_event *p_event)
int i;
if (--tp_event->perf_refcount > 0)
- goto out;
+ return;
tp_event->class->reg(tp_event, TRACE_REG_PERF_UNREGISTER, NULL);
@@ -176,8 +176,6 @@ static void perf_trace_event_unreg(struct perf_event *p_event)
perf_trace_buf[i] = NULL;
}
}
-out:
- trace_event_put_ref(tp_event);
}
static int perf_trace_event_open(struct perf_event *p_event)
@@ -241,6 +239,7 @@ void perf_trace_destroy(struct perf_event *p_event)
mutex_lock(&event_mutex);
perf_trace_event_close(p_event);
perf_trace_event_unreg(p_event);
+ trace_event_put_ref(p_event->tp_event);
mutex_unlock(&event_mutex);
}
@@ -292,6 +291,7 @@ void perf_kprobe_destroy(struct perf_event *p_event)
mutex_lock(&event_mutex);
perf_trace_event_close(p_event);
perf_trace_event_unreg(p_event);
+ trace_event_put_ref(p_event->tp_event);
mutex_unlock(&event_mutex);
destroy_local_trace_kprobe(p_event->tp_event);
@@ -347,6 +347,7 @@ void perf_uprobe_destroy(struct perf_event *p_event)
mutex_lock(&event_mutex);
perf_trace_event_close(p_event);
perf_trace_event_unreg(p_event);
+ trace_event_put_ref(p_event->tp_event);
mutex_unlock(&event_mutex);
destroy_local_trace_uprobe(p_event->tp_event);
}
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 181f08186d32..0356cae0cf74 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -176,6 +176,7 @@ static int trace_define_generic_fields(void)
__generic_field(int, CPU, FILTER_CPU);
__generic_field(int, cpu, FILTER_CPU);
+ __generic_field(int, common_cpu, FILTER_CPU);
__generic_field(char *, COMM, FILTER_COMM);
__generic_field(char *, comm, FILTER_COMM);
diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c
index cb866c3141af..918730d74932 100644
--- a/kernel/trace/trace_events_trigger.c
+++ b/kernel/trace/trace_events_trigger.c
@@ -142,7 +142,8 @@ static bool check_user_trigger(struct trace_event_file *file)
{
struct event_trigger_data *data;
- list_for_each_entry_rcu(data, &file->triggers, list) {
+ list_for_each_entry_rcu(data, &file->triggers, list,
+ lockdep_is_held(&event_mutex)) {
if (data->flags & EVENT_TRIGGER_FL_PROBE)
continue;
return true;
diff --git a/kernel/trace/trace_preemptirq.c b/kernel/trace/trace_preemptirq.c
index 95b58bd757ce..1e130da1b742 100644
--- a/kernel/trace/trace_preemptirq.c
+++ b/kernel/trace/trace_preemptirq.c
@@ -95,14 +95,14 @@ __visible void trace_hardirqs_on_caller(unsigned long caller_addr)
}
lockdep_hardirqs_on_prepare();
- lockdep_hardirqs_on(CALLER_ADDR0);
+ lockdep_hardirqs_on(caller_addr);
}
EXPORT_SYMBOL(trace_hardirqs_on_caller);
NOKPROBE_SYMBOL(trace_hardirqs_on_caller);
__visible void trace_hardirqs_off_caller(unsigned long caller_addr)
{
- lockdep_hardirqs_off(CALLER_ADDR0);
+ lockdep_hardirqs_off(caller_addr);
if (!this_cpu_read(tracing_irq_cpu)) {
this_cpu_write(tracing_irq_cpu, 1);
diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
index 850a88abd33b..36dff277de46 100644
--- a/kernel/trace/trace_probe.c
+++ b/kernel/trace/trace_probe.c
@@ -283,7 +283,14 @@ static int parse_probe_vars(char *arg, const struct fetch_type *t,
int ret = 0;
int len;
- if (strcmp(arg, "retval") == 0) {
+ if (flags & TPARG_FL_TPOINT) {
+ if (code->data)
+ return -EFAULT;
+ code->data = kstrdup(arg, GFP_KERNEL);
+ if (!code->data)
+ return -ENOMEM;
+ code->op = FETCH_OP_TP_ARG;
+ } else if (strcmp(arg, "retval") == 0) {
if (flags & TPARG_FL_RETURN) {
code->op = FETCH_OP_RETVAL;
} else {
@@ -307,7 +314,7 @@ static int parse_probe_vars(char *arg, const struct fetch_type *t,
}
} else
goto inval_var;
- } else if (strcmp(arg, "comm") == 0) {
+ } else if (strcmp(arg, "comm") == 0 || strcmp(arg, "COMM") == 0) {
code->op = FETCH_OP_COMM;
#ifdef CONFIG_HAVE_FUNCTION_ARG_ACCESS_API
} else if (((flags & TPARG_FL_MASK) ==
@@ -323,13 +330,6 @@ static int parse_probe_vars(char *arg, const struct fetch_type *t,
code->op = FETCH_OP_ARG;
code->param = (unsigned int)param - 1;
#endif
- } else if (flags & TPARG_FL_TPOINT) {
- if (code->data)
- return -EFAULT;
- code->data = kstrdup(arg, GFP_KERNEL);
- if (!code->data)
- return -ENOMEM;
- code->op = FETCH_OP_TP_ARG;
} else
goto inval_var;
@@ -384,6 +384,11 @@ parse_probe_arg(char *arg, const struct fetch_type *type,
break;
case '%': /* named register */
+ if (flags & TPARG_FL_TPOINT) {
+ /* eprobes do not handle registers */
+ trace_probe_log_err(offs, BAD_VAR);
+ break;
+ }
ret = regs_query_register_offset(arg + 1);
if (ret >= 0) {
code->op = FETCH_OP_REG;
@@ -617,9 +622,11 @@ static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size,
/*
* Since $comm and immediate string can not be dereferenced,
- * we can find those by strcmp.
+ * we can find those by strcmp. But ignore for eprobes.
*/
- if (strcmp(arg, "$comm") == 0 || strncmp(arg, "\\\"", 2) == 0) {
+ if (!(flags & TPARG_FL_TPOINT) &&
+ (strcmp(arg, "$comm") == 0 || strcmp(arg, "$COMM") == 0 ||
+ strncmp(arg, "\\\"", 2) == 0)) {
/* The type of $comm must be "string", and not an array. */
if (parg->count || (t && strcmp(t, "string")))
goto out;
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index 64ea283f2f86..ef42c1a11920 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -571,7 +571,8 @@ static void for_each_tracepoint_range(
bool trace_module_has_bad_taint(struct module *mod)
{
return mod->taints & ~((1 << TAINT_OOT_MODULE) | (1 << TAINT_CRAP) |
- (1 << TAINT_UNSIGNED_MODULE));
+ (1 << TAINT_UNSIGNED_MODULE) |
+ (1 << TAINT_TEST));
}
static BLOCKING_NOTIFIER_HEAD(tracepoint_notify_list);
@@ -647,7 +648,7 @@ static int tracepoint_module_coming(struct module *mod)
/*
* We skip modules that taint the kernel, especially those with different
* module headers (for forced load), to make sure we don't cause a crash.
- * Staging, out-of-tree, and unsigned GPL modules are fine.
+ * Staging, out-of-tree, unsigned GPL, and test modules are fine.
*/
if (trace_module_has_bad_taint(mod))
return 0;
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 5481ba44a8d6..3f464bbda0e9 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -9,6 +9,7 @@
#include <linux/highuid.h>
#include <linux/cred.h>
#include <linux/securebits.h>
+#include <linux/security.h>
#include <linux/keyctl.h>
#include <linux/key-type.h>
#include <keys/user-type.h>
@@ -113,6 +114,10 @@ int create_user_ns(struct cred *new)
!kgid_has_mapping(parent_ns, group))
goto fail_dec;
+ ret = security_create_user_ns(new);
+ if (ret < 0)
+ goto fail_dec;
+
ret = -ENOMEM;
ns = kmem_cache_zalloc(user_ns_cachep, GFP_KERNEL);
if (!ns)
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index aeea9731ef80..7cd5f5e7e0a1 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -1651,7 +1651,7 @@ static void __queue_delayed_work(int cpu, struct workqueue_struct *wq,
struct work_struct *work = &dwork->work;
WARN_ON_ONCE(!wq);
- WARN_ON_FUNCTION_MISMATCH(timer->function, delayed_work_timer_fn);
+ WARN_ON_ONCE(timer->function != delayed_work_timer_fn);
WARN_ON_ONCE(timer_pending(timer));
WARN_ON_ONCE(!list_empty(&work->entry));
@@ -3066,10 +3066,8 @@ static bool __flush_work(struct work_struct *work, bool from_cancel)
if (WARN_ON(!work->func))
return false;
- if (!from_cancel) {
- lock_map_acquire(&work->lockdep_map);
- lock_map_release(&work->lockdep_map);
- }
+ lock_map_acquire(&work->lockdep_map);
+ lock_map_release(&work->lockdep_map);
if (start_flush_work(work, &barr, from_cancel)) {
wait_for_completion(&barr.done);