aboutsummaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Kconfig.kexec2
-rw-r--r--kernel/audit_tree.c4
-rw-r--r--kernel/cgroup/cgroup.c15
-rw-r--r--kernel/crash_core.c184
-rw-r--r--kernel/debug/debug_core.c3
-rw-r--r--kernel/debug/kdb/kdb_main.c7
-rw-r--r--kernel/events/uprobes.c4
-rw-r--r--kernel/exit.c5
-rw-r--r--kernel/fork.c26
-rw-r--r--kernel/gcov/fs.c2
-rw-r--r--kernel/irq/manage.c15
-rw-r--r--kernel/kcsan/kcsan_test.c9
-rw-r--r--kernel/kcsan/selftest.c9
-rw-r--r--kernel/kexec_core.c17
-rw-r--r--kernel/kthread.c19
-rw-r--r--kernel/livepatch/core.c2
-rw-r--r--kernel/module/Kconfig25
-rw-r--r--kernel/module/decompress.c8
-rw-r--r--kernel/module/stats.c2
-rw-r--r--kernel/module/sysfs.c2
-rw-r--r--kernel/padata.c8
-rw-r--r--kernel/panic.c22
-rw-r--r--kernel/pid_namespace.c6
-rw-r--r--kernel/printk/Makefile2
-rw-r--r--kernel/printk/internal.h31
-rw-r--r--kernel/printk/nbcon.c1029
-rw-r--r--kernel/printk/printk.c162
-rw-r--r--kernel/ptrace.c2
-rw-r--r--kernel/rcu/rcu.h2
-rw-r--r--kernel/rcu/tasks.h32
-rw-r--r--kernel/rcu/tree.c64
-rw-r--r--kernel/rcu/tree_nocb.h19
-rw-r--r--kernel/reboot.c4
-rw-r--r--kernel/resource.c51
-rw-r--r--kernel/sched/core.c2
-rw-r--r--kernel/sched/fair.c12
-rw-r--r--kernel/sched/wait.c60
-rw-r--r--kernel/signal.c24
-rw-r--r--kernel/smpboot.c3
-rw-r--r--kernel/sys.c73
-rw-r--r--kernel/taskstats.c5
-rw-r--r--kernel/trace/ring_buffer.c8
-rw-r--r--kernel/trace/trace.c90
-rw-r--r--kernel/trace/trace.h16
-rw-r--r--kernel/trace/trace_events.c362
-rw-r--r--kernel/trace/trace_events_filter.c3
-rw-r--r--kernel/trace/trace_events_hist.c11
-rw-r--r--kernel/trace/trace_events_synth.c2
-rw-r--r--kernel/trace/trace_events_user.c36
-rw-r--r--kernel/trace/trace_seq.c6
-rw-r--r--kernel/user_namespace.c2
-rw-r--r--kernel/watchdog.c7
-rw-r--r--kernel/workqueue.c16
53 files changed, 1991 insertions, 541 deletions
diff --git a/kernel/Kconfig.kexec b/kernel/Kconfig.kexec
index 9bfe68fe9676..7aff28ded2f4 100644
--- a/kernel/Kconfig.kexec
+++ b/kernel/Kconfig.kexec
@@ -110,7 +110,7 @@ config CRASH_DUMP
For more details see Documentation/admin-guide/kdump/kdump.rst
For s390, this option also enables zfcpdump.
- See also <file:Documentation/s390/zfcpdump.rst>
+ See also <file:Documentation/arch/s390/zfcpdump.rst>
config CRASH_HOTPLUG
bool "Update the crash elfcorehdr on system configuration changes"
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 85a5b306733b..1b07e6f12a07 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -87,8 +87,8 @@ static struct task_struct *prune_thread;
* that makes a difference. Some.
*/
-static struct fsnotify_group *audit_tree_group;
-static struct kmem_cache *audit_tree_mark_cachep __read_mostly;
+static struct fsnotify_group *audit_tree_group __ro_after_init;
+static struct kmem_cache *audit_tree_mark_cachep __ro_after_init;
static struct audit_tree *alloc_tree(const char *s)
{
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 484adb375b15..1d5b9de3b1b9 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -1908,6 +1908,7 @@ enum cgroup2_param {
Opt_favordynmods,
Opt_memory_localevents,
Opt_memory_recursiveprot,
+ Opt_memory_hugetlb_accounting,
nr__cgroup2_params
};
@@ -1916,6 +1917,7 @@ static const struct fs_parameter_spec cgroup2_fs_parameters[] = {
fsparam_flag("favordynmods", Opt_favordynmods),
fsparam_flag("memory_localevents", Opt_memory_localevents),
fsparam_flag("memory_recursiveprot", Opt_memory_recursiveprot),
+ fsparam_flag("memory_hugetlb_accounting", Opt_memory_hugetlb_accounting),
{}
};
@@ -1942,6 +1944,9 @@ static int cgroup2_parse_param(struct fs_context *fc, struct fs_parameter *param
case Opt_memory_recursiveprot:
ctx->flags |= CGRP_ROOT_MEMORY_RECURSIVE_PROT;
return 0;
+ case Opt_memory_hugetlb_accounting:
+ ctx->flags |= CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING;
+ return 0;
}
return -EINVAL;
}
@@ -1966,6 +1971,11 @@ static void apply_cgroup_root_flags(unsigned int root_flags)
cgrp_dfl_root.flags |= CGRP_ROOT_MEMORY_RECURSIVE_PROT;
else
cgrp_dfl_root.flags &= ~CGRP_ROOT_MEMORY_RECURSIVE_PROT;
+
+ if (root_flags & CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING)
+ cgrp_dfl_root.flags |= CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING;
+ else
+ cgrp_dfl_root.flags &= ~CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING;
}
}
@@ -1979,6 +1989,8 @@ static int cgroup_show_options(struct seq_file *seq, struct kernfs_root *kf_root
seq_puts(seq, ",memory_localevents");
if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_RECURSIVE_PROT)
seq_puts(seq, ",memory_recursiveprot");
+ if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING)
+ seq_puts(seq, ",memory_hugetlb_accounting");
return 0;
}
@@ -7068,7 +7080,8 @@ static ssize_t features_show(struct kobject *kobj, struct kobj_attribute *attr,
"nsdelegate\n"
"favordynmods\n"
"memory_localevents\n"
- "memory_recursiveprot\n");
+ "memory_recursiveprot\n"
+ "memory_hugetlb_accounting\n");
}
static struct kobj_attribute cgroup_features_attr = __ATTR_RO(features);
diff --git a/kernel/crash_core.c b/kernel/crash_core.c
index 2f675ef045d4..efe87d501c8c 100644
--- a/kernel/crash_core.c
+++ b/kernel/crash_core.c
@@ -5,7 +5,6 @@
*/
#include <linux/buildid.h>
-#include <linux/crash_core.h>
#include <linux/init.h>
#include <linux/utsname.h>
#include <linux/vmalloc.h>
@@ -13,6 +12,9 @@
#include <linux/kexec.h>
#include <linux/memory.h>
#include <linux/cpuhotplug.h>
+#include <linux/memblock.h>
+#include <linux/kexec.h>
+#include <linux/kmemleak.h>
#include <asm/page.h>
#include <asm/sections.h>
@@ -33,6 +35,22 @@ u32 *vmcoreinfo_note;
/* trusted vmcoreinfo, e.g. we can make a copy in the crash memory */
static unsigned char *vmcoreinfo_data_safecopy;
+/* Location of the reserved area for the crash kernel */
+struct resource crashk_res = {
+ .name = "Crash kernel",
+ .start = 0,
+ .end = 0,
+ .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM,
+ .desc = IORES_DESC_CRASH_KERNEL
+};
+struct resource crashk_low_res = {
+ .name = "Crash kernel",
+ .start = 0,
+ .end = 0,
+ .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM,
+ .desc = IORES_DESC_CRASH_KERNEL
+};
+
/*
* parsing the "crashkernel" commandline
*
@@ -248,11 +266,11 @@ static int __init __parse_crashkernel(char *cmdline,
unsigned long long system_ram,
unsigned long long *crash_size,
unsigned long long *crash_base,
- const char *name,
const char *suffix)
{
char *first_colon, *first_space;
char *ck_cmdline;
+ char *name = "crashkernel=";
BUG_ON(!crash_size || !crash_base);
*crash_size = 0;
@@ -283,32 +301,53 @@ static int __init __parse_crashkernel(char *cmdline,
/*
* That function is the entry point for command line parsing and should be
* called from the arch-specific code.
+ *
+ * If crashkernel=,high|low is supported on architecture, non-NULL values
+ * should be passed to parameters 'low_size' and 'high'.
*/
int __init parse_crashkernel(char *cmdline,
unsigned long long system_ram,
unsigned long long *crash_size,
- unsigned long long *crash_base)
+ unsigned long long *crash_base,
+ unsigned long long *low_size,
+ bool *high)
{
- return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
- "crashkernel=", NULL);
-}
+ int ret;
-int __init parse_crashkernel_high(char *cmdline,
- unsigned long long system_ram,
- unsigned long long *crash_size,
- unsigned long long *crash_base)
-{
- return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
- "crashkernel=", suffix_tbl[SUFFIX_HIGH]);
-}
+ /* crashkernel=X[@offset] */
+ ret = __parse_crashkernel(cmdline, system_ram, crash_size,
+ crash_base, NULL);
+#ifdef CONFIG_ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION
+ /*
+ * If non-NULL 'high' passed in and no normal crashkernel
+ * setting detected, try parsing crashkernel=,high|low.
+ */
+ if (high && ret == -ENOENT) {
+ ret = __parse_crashkernel(cmdline, 0, crash_size,
+ crash_base, suffix_tbl[SUFFIX_HIGH]);
+ if (ret || !*crash_size)
+ return -EINVAL;
-int __init parse_crashkernel_low(char *cmdline,
- unsigned long long system_ram,
- unsigned long long *crash_size,
- unsigned long long *crash_base)
-{
- return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
- "crashkernel=", suffix_tbl[SUFFIX_LOW]);
+ /*
+ * crashkernel=Y,low can be specified or not, but invalid value
+ * is not allowed.
+ */
+ ret = __parse_crashkernel(cmdline, 0, low_size,
+ crash_base, suffix_tbl[SUFFIX_LOW]);
+ if (ret == -ENOENT) {
+ *low_size = DEFAULT_CRASH_KERNEL_LOW_SIZE;
+ ret = 0;
+ } else if (ret) {
+ return ret;
+ }
+
+ *high = true;
+ }
+#endif
+ if (!*crash_size)
+ ret = -EINVAL;
+
+ return ret;
}
/*
@@ -321,6 +360,109 @@ static int __init parse_crashkernel_dummy(char *arg)
}
early_param("crashkernel", parse_crashkernel_dummy);
+#ifdef CONFIG_ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION
+static int __init reserve_crashkernel_low(unsigned long long low_size)
+{
+#ifdef CONFIG_64BIT
+ unsigned long long low_base;
+
+ low_base = memblock_phys_alloc_range(low_size, CRASH_ALIGN, 0, CRASH_ADDR_LOW_MAX);
+ if (!low_base) {
+ pr_err("cannot allocate crashkernel low memory (size:0x%llx).\n", low_size);
+ return -ENOMEM;
+ }
+
+ pr_info("crashkernel low memory reserved: 0x%08llx - 0x%08llx (%lld MB)\n",
+ low_base, low_base + low_size, low_size >> 20);
+
+ crashk_low_res.start = low_base;
+ crashk_low_res.end = low_base + low_size - 1;
+ insert_resource(&iomem_resource, &crashk_low_res);
+#endif
+ return 0;
+}
+
+void __init reserve_crashkernel_generic(char *cmdline,
+ unsigned long long crash_size,
+ unsigned long long crash_base,
+ unsigned long long crash_low_size,
+ bool high)
+{
+ unsigned long long search_end = CRASH_ADDR_LOW_MAX, search_base = 0;
+ bool fixed_base = false;
+
+ /* User specifies base address explicitly. */
+ if (crash_base) {
+ fixed_base = true;
+ search_base = crash_base;
+ search_end = crash_base + crash_size;
+ } else if (high) {
+ search_base = CRASH_ADDR_LOW_MAX;
+ search_end = CRASH_ADDR_HIGH_MAX;
+ }
+
+retry:
+ crash_base = memblock_phys_alloc_range(crash_size, CRASH_ALIGN,
+ search_base, search_end);
+ if (!crash_base) {
+ /*
+ * For crashkernel=size[KMG]@offset[KMG], print out failure
+ * message if can't reserve the specified region.
+ */
+ if (fixed_base) {
+ pr_warn("crashkernel reservation failed - memory is in use.\n");
+ return;
+ }
+
+ /*
+ * For crashkernel=size[KMG], if the first attempt was for
+ * low memory, fall back to high memory, the minimum required
+ * low memory will be reserved later.
+ */
+ if (!high && search_end == CRASH_ADDR_LOW_MAX) {
+ search_end = CRASH_ADDR_HIGH_MAX;
+ search_base = CRASH_ADDR_LOW_MAX;
+ crash_low_size = DEFAULT_CRASH_KERNEL_LOW_SIZE;
+ goto retry;
+ }
+
+ /*
+ * For crashkernel=size[KMG],high, if the first attempt was
+ * for high memory, fall back to low memory.
+ */
+ if (high && search_end == CRASH_ADDR_HIGH_MAX) {
+ search_end = CRASH_ADDR_LOW_MAX;
+ search_base = 0;
+ goto retry;
+ }
+ pr_warn("cannot allocate crashkernel (size:0x%llx)\n",
+ crash_size);
+ return;
+ }
+
+ if ((crash_base > CRASH_ADDR_LOW_MAX) &&
+ crash_low_size && reserve_crashkernel_low(crash_low_size)) {
+ memblock_phys_free(crash_base, crash_size);
+ return;
+ }
+
+ pr_info("crashkernel reserved: 0x%016llx - 0x%016llx (%lld MB)\n",
+ crash_base, crash_base + crash_size, crash_size >> 20);
+
+ /*
+ * The crashkernel memory will be removed from the kernel linear
+ * map. Inform kmemleak so that it won't try to access it.
+ */
+ kmemleak_ignore_phys(crash_base);
+ if (crashk_low_res.end)
+ kmemleak_ignore_phys(crashk_low_res.start);
+
+ crashk_res.start = crash_base;
+ crashk_res.end = crash_base + crash_size - 1;
+ insert_resource(&iomem_resource, &crashk_res);
+}
+#endif
+
int crash_prepare_elf64_headers(struct crash_mem *mem, int need_kernel_map,
void **addr, unsigned long *sz)
{
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
index 621037a0aa87..ce1bb2301c06 100644
--- a/kernel/debug/debug_core.c
+++ b/kernel/debug/debug_core.c
@@ -1006,6 +1006,9 @@ void kgdb_panic(const char *msg)
if (panic_timeout)
return;
+ debug_locks_off();
+ console_flush_on_panic(CONSOLE_FLUSH_PENDING);
+
if (dbg_kdb_mode)
kdb_printf("PANIC: %s\n", msg);
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index 438b868cbfa9..6b213c8252d6 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -272,11 +272,10 @@ char *kdbgetenv(const char *match)
* kdballocenv - This function is used to allocate bytes for
* environment entries.
* Parameters:
- * match A character string representing a numeric value
- * Outputs:
- * *value the unsigned long representation of the env variable 'match'
+ * bytes The number of bytes to allocate in the static buffer.
* Returns:
- * Zero on success, a kdb diagnostic on failure.
+ * A pointer to the allocated space in the buffer on success.
+ * NULL if bytes > size available in the envbuffer.
* Remarks:
* We use a static environment buffer (envbuffer) to hold the values
* of dynamically generated environment variables (see kdb_set). Buffer
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 3048589e2e85..435aac1d8c27 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -474,8 +474,8 @@ retry:
gup_flags |= FOLL_SPLIT_PMD;
/* Read the page with vaddr into memory */
old_page = get_user_page_vma_remote(mm, vaddr, gup_flags, &vma);
- if (IS_ERR_OR_NULL(old_page))
- return old_page ? PTR_ERR(old_page) : 0;
+ if (IS_ERR(old_page))
+ return PTR_ERR(old_page);
ret = verify_opcode(old_page, vaddr, &opcode);
if (ret <= 0)
diff --git a/kernel/exit.c b/kernel/exit.c
index 2b4a232f2f68..ee9f43bed49a 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -135,7 +135,6 @@ static void __unhash_process(struct task_struct *p, bool group_dead)
list_del_init(&p->sibling);
__this_cpu_dec(process_counts);
}
- list_del_rcu(&p->thread_group);
list_del_rcu(&p->thread_node);
}
@@ -541,7 +540,6 @@ static void exit_mm(void)
exit_mm_release(current, mm);
if (!mm)
return;
- sync_mm_rss(mm);
mmap_read_lock(mm);
mmgrab_lazy_tlb(mm);
BUG_ON(mm != current->active_mm);
@@ -831,9 +829,6 @@ void __noreturn do_exit(long code)
io_uring_files_cancel();
exit_signals(tsk); /* sets PF_EXITING */
- /* sync mm's RSS info before statistics gathering */
- if (tsk->mm)
- sync_mm_rss(tsk->mm);
acct_update_integrals(tsk);
group_dead = atomic_dec_and_test(&tsk->signal->live);
if (group_dead) {
diff --git a/kernel/fork.c b/kernel/fork.c
index 204bc1c33533..10917c3e1f03 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -733,7 +733,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
get_file(file);
i_mmap_lock_write(mapping);
- if (tmp->vm_flags & VM_SHARED)
+ if (vma_is_shared_maywrite(tmp))
mapping_allow_writable(mapping);
flush_dcache_mmap_lock(mapping);
/* insert tmp into the share list, just after mpnt */
@@ -1288,7 +1288,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
hugetlb_count_init(mm);
if (current->mm) {
- mm->flags = current->mm->flags & MMF_INIT_MASK;
+ mm->flags = mmf_init_flags(current->mm->flags);
mm->def_flags = current->mm->def_flags & VM_INIT_DEF_MASK;
} else {
mm->flags = default_dump_filter;
@@ -1393,6 +1393,8 @@ EXPORT_SYMBOL_GPL(mmput_async);
/**
* set_mm_exe_file - change a reference to the mm's executable file
+ * @mm: The mm to change.
+ * @new_exe_file: The new file to use.
*
* This changes mm's executable file (shown as symlink /proc/[pid]/exe).
*
@@ -1432,6 +1434,8 @@ int set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
/**
* replace_mm_exe_file - replace a reference to the mm's executable file
+ * @mm: The mm to change.
+ * @new_exe_file: The new file to use.
*
* This changes mm's executable file (shown as symlink /proc/[pid]/exe).
*
@@ -1483,6 +1487,7 @@ int replace_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
/**
* get_mm_exe_file - acquire a reference to the mm's executable file
+ * @mm: The mm of interest.
*
* Returns %NULL if mm has no associated executable file.
* User must release file via fput().
@@ -1499,6 +1504,7 @@ struct file *get_mm_exe_file(struct mm_struct *mm)
/**
* get_task_exe_file - acquire a reference to the task's executable file
+ * @task: The task.
*
* Returns %NULL if task's mm (if any) has no associated executable file or
* this is a kernel thread with borrowed mm (see the comment above get_task_mm).
@@ -1521,6 +1527,7 @@ struct file *get_task_exe_file(struct task_struct *task)
/**
* get_task_mm - acquire a reference to the task's mm
+ * @task: The task.
*
* Returns %NULL if the task has no mm. Checks PF_KTHREAD (meaning
* this kernel workthread has transiently adopted a user mm with use_mm,
@@ -2100,11 +2107,11 @@ const struct file_operations pidfd_fops = {
* __pidfd_prepare - allocate a new pidfd_file and reserve a pidfd
* @pid: the struct pid for which to create a pidfd
* @flags: flags of the new @pidfd
- * @pidfd: the pidfd to return
+ * @ret: Where to return the file for the pidfd.
*
* Allocate a new file that stashes @pid and reserve a new pidfd number in the
* caller's file descriptor table. The pidfd is reserved but not installed yet.
-
+ *
* The helper doesn't perform checks on @pid which makes it useful for pidfds
* created via CLONE_PIDFD where @pid has no task attached when the pidfd and
* pidfd file are prepared.
@@ -2151,7 +2158,7 @@ static int __pidfd_prepare(struct pid *pid, unsigned int flags, struct file **re
* pidfd_prepare - allocate a new pidfd_file and reserve a pidfd
* @pid: the struct pid for which to create a pidfd
* @flags: flags of the new @pidfd
- * @pidfd: the pidfd to return
+ * @ret: Where to return the pidfd.
*
* Allocate a new file that stashes @pid and reserve a new pidfd number in the
* caller's file descriptor table. The pidfd is reserved but not installed yet.
@@ -2404,10 +2411,6 @@ __latent_entropy struct task_struct *copy_process(
p->io_uring = NULL;
#endif
-#if defined(SPLIT_RSS_COUNTING)
- memset(&p->rss_stat, 0, sizeof(p->rss_stat));
-#endif
-
p->default_timer_slack_ns = current->timer_slack_ns;
#ifdef CONFIG_PSI
@@ -2574,7 +2577,6 @@ __latent_entropy struct task_struct *copy_process(
p->dirty_paused_when = 0;
p->pdeath_signal = 0;
- INIT_LIST_HEAD(&p->thread_group);
p->task_works = NULL;
clear_posix_cputimers_work(p);
@@ -2702,8 +2704,6 @@ __latent_entropy struct task_struct *copy_process(
atomic_inc(&current->signal->live);
refcount_inc(&current->signal->sigcnt);
task_join_group_stop(p);
- list_add_tail_rcu(&p->thread_group,
- &p->group_leader->thread_group);
list_add_tail_rcu(&p->thread_node,
&p->signal->thread_head);
}
@@ -3179,7 +3179,7 @@ static bool clone3_args_valid(struct kernel_clone_args *kargs)
}
/**
- * clone3 - create a new process with specific properties
+ * sys_clone3 - create a new process with specific properties
* @uargs: argument structure
* @size: size of @uargs
*
diff --git a/kernel/gcov/fs.c b/kernel/gcov/fs.c
index 5c3086cad8f9..01520689b57c 100644
--- a/kernel/gcov/fs.c
+++ b/kernel/gcov/fs.c
@@ -99,7 +99,7 @@ struct gcov_iterator {
struct gcov_info *info;
size_t size;
loff_t pos;
- char buffer[];
+ char buffer[] __counted_by(size);
};
/**
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index d309ba84e08a..1782f90cd8c6 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -1852,15 +1852,13 @@ out_thread:
struct task_struct *t = new->thread;
new->thread = NULL;
- kthread_stop(t);
- put_task_struct(t);
+ kthread_stop_put(t);
}
if (new->secondary && new->secondary->thread) {
struct task_struct *t = new->secondary->thread;
new->secondary->thread = NULL;
- kthread_stop(t);
- put_task_struct(t);
+ kthread_stop_put(t);
}
out_mput:
module_put(desc->owner);
@@ -1971,12 +1969,9 @@ static struct irqaction *__free_irq(struct irq_desc *desc, void *dev_id)
* the same bit to a newly requested action.
*/
if (action->thread) {
- kthread_stop(action->thread);
- put_task_struct(action->thread);
- if (action->secondary && action->secondary->thread) {
- kthread_stop(action->secondary->thread);
- put_task_struct(action->secondary->thread);
- }
+ kthread_stop_put(action->thread);
+ if (action->secondary && action->secondary->thread)
+ kthread_stop_put(action->secondary->thread);
}
/* Last action releases resources */
diff --git a/kernel/kcsan/kcsan_test.c b/kernel/kcsan/kcsan_test.c
index 0ddbdab5903d..015586217875 100644
--- a/kernel/kcsan/kcsan_test.c
+++ b/kernel/kcsan/kcsan_test.c
@@ -699,12 +699,9 @@ static void test_barrier_nothreads(struct kunit *test)
KCSAN_EXPECT_RW_BARRIER(spin_unlock(&test_spinlock), true);
KCSAN_EXPECT_RW_BARRIER(mutex_lock(&test_mutex), false);
KCSAN_EXPECT_RW_BARRIER(mutex_unlock(&test_mutex), true);
-
-#ifdef clear_bit_unlock_is_negative_byte
- KCSAN_EXPECT_READ_BARRIER(clear_bit_unlock_is_negative_byte(0, &test_var), true);
- KCSAN_EXPECT_WRITE_BARRIER(clear_bit_unlock_is_negative_byte(0, &test_var), true);
- KCSAN_EXPECT_RW_BARRIER(clear_bit_unlock_is_negative_byte(0, &test_var), true);
-#endif
+ KCSAN_EXPECT_READ_BARRIER(xor_unlock_is_negative_byte(1, &test_var), true);
+ KCSAN_EXPECT_WRITE_BARRIER(xor_unlock_is_negative_byte(1, &test_var), true);
+ KCSAN_EXPECT_RW_BARRIER(xor_unlock_is_negative_byte(1, &test_var), true);
kcsan_nestable_atomic_end();
}
diff --git a/kernel/kcsan/selftest.c b/kernel/kcsan/selftest.c
index 8679322450f2..84a1200271af 100644
--- a/kernel/kcsan/selftest.c
+++ b/kernel/kcsan/selftest.c
@@ -227,12 +227,9 @@ static bool __init test_barrier(void)
KCSAN_CHECK_RW_BARRIER(arch_spin_unlock(&arch_spinlock));
spin_lock(&test_spinlock);
KCSAN_CHECK_RW_BARRIER(spin_unlock(&test_spinlock));
-
-#ifdef clear_bit_unlock_is_negative_byte
- KCSAN_CHECK_RW_BARRIER(clear_bit_unlock_is_negative_byte(0, &test_var));
- KCSAN_CHECK_READ_BARRIER(clear_bit_unlock_is_negative_byte(0, &test_var));
- KCSAN_CHECK_WRITE_BARRIER(clear_bit_unlock_is_negative_byte(0, &test_var));
-#endif
+ KCSAN_CHECK_RW_BARRIER(xor_unlock_is_negative_byte(1, &test_var));
+ KCSAN_CHECK_READ_BARRIER(xor_unlock_is_negative_byte(1, &test_var));
+ KCSAN_CHECK_WRITE_BARRIER(xor_unlock_is_negative_byte(1, &test_var));
kcsan_nestable_atomic_end();
return ret;
diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index 9dc728982d79..be5642a4ec49 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -52,23 +52,6 @@ atomic_t __kexec_lock = ATOMIC_INIT(0);
/* Flag to indicate we are going to kexec a new kernel */
bool kexec_in_progress = false;
-
-/* Location of the reserved area for the crash kernel */
-struct resource crashk_res = {
- .name = "Crash kernel",
- .start = 0,
- .end = 0,
- .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM,
- .desc = IORES_DESC_CRASH_KERNEL
-};
-struct resource crashk_low_res = {
- .name = "Crash kernel",
- .start = 0,
- .end = 0,
- .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM,
- .desc = IORES_DESC_CRASH_KERNEL
-};
-
int kexec_should_crash(struct task_struct *p)
{
/*
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 1eea53050bab..c5e40830c1f2 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -715,6 +715,24 @@ int kthread_stop(struct task_struct *k)
}
EXPORT_SYMBOL(kthread_stop);
+/**
+ * kthread_stop_put - stop a thread and put its task struct
+ * @k: thread created by kthread_create().
+ *
+ * Stops a thread created by kthread_create() and put its task_struct.
+ * Only use when holding an extra task struct reference obtained by
+ * calling get_task_struct().
+ */
+int kthread_stop_put(struct task_struct *k)
+{
+ int ret;
+
+ ret = kthread_stop(k);
+ put_task_struct(k);
+ return ret;
+}
+EXPORT_SYMBOL(kthread_stop_put);
+
int kthreadd(void *unused)
{
struct task_struct *tsk = current;
@@ -1469,7 +1487,6 @@ void kthread_unuse_mm(struct mm_struct *mm)
* clearing tsk->mm.
*/
smp_mb__after_spinlock();
- sync_mm_rss(mm);
local_irq_disable();
tsk->mm = NULL;
membarrier_update_current_mm(NULL);
diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c
index 61328328c474..ecbc9b6aba3a 100644
--- a/kernel/livepatch/core.c
+++ b/kernel/livepatch/core.c
@@ -243,7 +243,7 @@ static int klp_resolve_symbols(Elf_Shdr *sechdrs, const char *strtab,
* symbols are exported and normal relas can be used instead.
*/
if (!sec_vmlinux && sym_vmlinux) {
- pr_err("invalid access to vmlinux symbol '%s' from module-specific livepatch relocation section",
+ pr_err("invalid access to vmlinux symbol '%s' from module-specific livepatch relocation section\n",
sym_name);
return -EINVAL;
}
diff --git a/kernel/module/Kconfig b/kernel/module/Kconfig
index 33a2e991f608..0ea1b2970a23 100644
--- a/kernel/module/Kconfig
+++ b/kernel/module/Kconfig
@@ -236,14 +236,6 @@ choice
possible to load a signed module containing the algorithm to check
the signature on that module.
-config MODULE_SIG_SHA1
- bool "Sign modules with SHA-1"
- select CRYPTO_SHA1
-
-config MODULE_SIG_SHA224
- bool "Sign modules with SHA-224"
- select CRYPTO_SHA256
-
config MODULE_SIG_SHA256
bool "Sign modules with SHA-256"
select CRYPTO_SHA256
@@ -256,16 +248,29 @@ config MODULE_SIG_SHA512
bool "Sign modules with SHA-512"
select CRYPTO_SHA512
+config MODULE_SIG_SHA3_256
+ bool "Sign modules with SHA3-256"
+ select CRYPTO_SHA3
+
+config MODULE_SIG_SHA3_384
+ bool "Sign modules with SHA3-384"
+ select CRYPTO_SHA3
+
+config MODULE_SIG_SHA3_512
+ bool "Sign modules with SHA3-512"
+ select CRYPTO_SHA3
+
endchoice
config MODULE_SIG_HASH
string
depends on MODULE_SIG || IMA_APPRAISE_MODSIG
- default "sha1" if MODULE_SIG_SHA1
- default "sha224" if MODULE_SIG_SHA224
default "sha256" if MODULE_SIG_SHA256
default "sha384" if MODULE_SIG_SHA384
default "sha512" if MODULE_SIG_SHA512
+ default "sha3-256" if MODULE_SIG_SHA3_256
+ default "sha3-384" if MODULE_SIG_SHA3_384
+ default "sha3-512" if MODULE_SIG_SHA3_512
choice
prompt "Module compression mode"
diff --git a/kernel/module/decompress.c b/kernel/module/decompress.c
index 87440f714c0c..474e68f0f063 100644
--- a/kernel/module/decompress.c
+++ b/kernel/module/decompress.c
@@ -100,7 +100,7 @@ static ssize_t module_gzip_decompress(struct load_info *info,
s.next_in = buf + gzip_hdr_len;
s.avail_in = size - gzip_hdr_len;
- s.workspace = kmalloc(zlib_inflate_workspacesize(), GFP_KERNEL);
+ s.workspace = kvmalloc(zlib_inflate_workspacesize(), GFP_KERNEL);
if (!s.workspace)
return -ENOMEM;
@@ -138,7 +138,7 @@ static ssize_t module_gzip_decompress(struct load_info *info,
out_inflate_end:
zlib_inflateEnd(&s);
out:
- kfree(s.workspace);
+ kvfree(s.workspace);
return retval;
}
#elif defined(CONFIG_MODULE_COMPRESS_XZ)
@@ -241,7 +241,7 @@ static ssize_t module_zstd_decompress(struct load_info *info,
}
wksp_size = zstd_dstream_workspace_bound(header.windowSize);
- wksp = vmalloc(wksp_size);
+ wksp = kvmalloc(wksp_size, GFP_KERNEL);
if (!wksp) {
retval = -ENOMEM;
goto out;
@@ -284,7 +284,7 @@ static ssize_t module_zstd_decompress(struct load_info *info,
retval = new_size;
out:
- vfree(wksp);
+ kvfree(wksp);
return retval;
}
#else
diff --git a/kernel/module/stats.c b/kernel/module/stats.c
index 6ab2c94d6bc3..3ba0e98b3c91 100644
--- a/kernel/module/stats.c
+++ b/kernel/module/stats.c
@@ -126,7 +126,7 @@ static LIST_HEAD(dup_failed_modules);
* These typically should not happen unless your system is under memory
* pressure.
* * invalid_becoming_bytes: total number of bytes allocated and freed used
- * used to read the kernel module userspace wants us to read before we
+ * to read the kernel module userspace wants us to read before we
* promote it to be processed to be added to our @modules linked list. These
* failures can happen if we had a check in between a successful kernel_read_file_from_fd()
* call and right before we allocate the our private memory for the module
diff --git a/kernel/module/sysfs.c b/kernel/module/sysfs.c
index c921bf044050..d964167c6658 100644
--- a/kernel/module/sysfs.c
+++ b/kernel/module/sysfs.c
@@ -143,7 +143,7 @@ static void remove_sect_attrs(struct module *mod)
struct module_notes_attrs {
struct kobject *dir;
unsigned int notes;
- struct bin_attribute attrs[];
+ struct bin_attribute attrs[] __counted_by(notes);
};
static ssize_t module_notes_read(struct file *filp, struct kobject *kobj,
diff --git a/kernel/padata.c b/kernel/padata.c
index 222d60195de6..179fb1518070 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -202,7 +202,7 @@ int padata_do_parallel(struct padata_shell *ps,
*cb_cpu = cpu;
}
- err = -EBUSY;
+ err = -EBUSY;
if ((pinst->flags & PADATA_RESET))
goto out;
@@ -1102,12 +1102,16 @@ EXPORT_SYMBOL(padata_alloc_shell);
*/
void padata_free_shell(struct padata_shell *ps)
{
+ struct parallel_data *pd;
+
if (!ps)
return;
mutex_lock(&ps->pinst->lock);
list_del(&ps->list);
- padata_free_pd(rcu_dereference_protected(ps->pd, 1));
+ pd = rcu_dereference_protected(ps->pd, 1);
+ if (refcount_dec_and_test(&pd->refcnt))
+ padata_free_pd(pd);
mutex_unlock(&ps->pinst->lock);
kfree(ps);
diff --git a/kernel/panic.c b/kernel/panic.c
index ffa037fa777d..2807639aab51 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -192,14 +192,15 @@ atomic_t panic_cpu = ATOMIC_INIT(PANIC_CPU_INVALID);
*/
void nmi_panic(struct pt_regs *regs, const char *msg)
{
- int old_cpu, cpu;
+ int old_cpu, this_cpu;
- cpu = raw_smp_processor_id();
- old_cpu = atomic_cmpxchg(&panic_cpu, PANIC_CPU_INVALID, cpu);
+ old_cpu = PANIC_CPU_INVALID;
+ this_cpu = raw_smp_processor_id();
- if (old_cpu == PANIC_CPU_INVALID)
+ /* atomic_try_cmpxchg updates old_cpu on failure */
+ if (atomic_try_cmpxchg(&panic_cpu, &old_cpu, this_cpu))
panic("%s", msg);
- else if (old_cpu != cpu)
+ else if (old_cpu != this_cpu)
nmi_panic_self_stop(regs);
}
EXPORT_SYMBOL(nmi_panic);
@@ -311,15 +312,18 @@ void panic(const char *fmt, ...)
* stop themself or will wait until they are stopped by the 1st CPU
* with smp_send_stop().
*
- * `old_cpu == PANIC_CPU_INVALID' means this is the 1st CPU which
- * comes here, so go ahead.
+ * cmpxchg success means this is the 1st CPU which comes here,
+ * so go ahead.
* `old_cpu == this_cpu' means we came from nmi_panic() which sets
* panic_cpu to this CPU. In this case, this is also the 1st CPU.
*/
+ old_cpu = PANIC_CPU_INVALID;
this_cpu = raw_smp_processor_id();
- old_cpu = atomic_cmpxchg(&panic_cpu, PANIC_CPU_INVALID, this_cpu);
- if (old_cpu != PANIC_CPU_INVALID && old_cpu != this_cpu)
+ /* atomic_try_cmpxchg updates old_cpu on failure */
+ if (atomic_try_cmpxchg(&panic_cpu, &old_cpu, this_cpu)) {
+ /* go ahead */
+ } else if (old_cpu != this_cpu)
panic_smp_self_stop();
console_verbose();
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index 619972c78774..3028b2218aa4 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -286,12 +286,6 @@ static int pid_ns_ctl_handler(struct ctl_table *table, int write,
if (write && !checkpoint_restore_ns_capable(pid_ns->user_ns))
return -EPERM;
- /*
- * Writing directly to ns' last_pid field is OK, since this field
- * is volatile in a living namespace anyway and a code writing to
- * it should synchronize its usage with external means.
- */
-
next = idr_get_cursor(&pid_ns->idr) - 1;
tmp.data = &next;
diff --git a/kernel/printk/Makefile b/kernel/printk/Makefile
index f5b388e810b9..39a2b61c7232 100644
--- a/kernel/printk/Makefile
+++ b/kernel/printk/Makefile
@@ -1,6 +1,6 @@
# SPDX-License-Identifier: GPL-2.0-only
obj-y = printk.o
-obj-$(CONFIG_PRINTK) += printk_safe.o
+obj-$(CONFIG_PRINTK) += printk_safe.o nbcon.o
obj-$(CONFIG_A11Y_BRAILLE_CONSOLE) += braille.o
obj-$(CONFIG_PRINTK_INDEX) += index.o
diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h
index 7d4979d5c3ce..6c2afee5ef62 100644
--- a/kernel/printk/internal.h
+++ b/kernel/printk/internal.h
@@ -3,6 +3,8 @@
* internal.h - printk internal definitions
*/
#include <linux/percpu.h>
+#include <linux/console.h>
+#include "printk_ringbuffer.h"
#if defined(CONFIG_PRINTK) && defined(CONFIG_SYSCTL)
void __init printk_sysctl_init(void);
@@ -12,6 +14,12 @@ int devkmsg_sysctl_set_loglvl(struct ctl_table *table, int write,
#define printk_sysctl_init() do { } while (0)
#endif
+#define con_printk(lvl, con, fmt, ...) \
+ printk(lvl pr_fmt("%s%sconsole [%s%d] " fmt), \
+ (con->flags & CON_NBCON) ? "" : "legacy ", \
+ (con->flags & CON_BOOT) ? "boot" : "", \
+ con->name, con->index, ##__VA_ARGS__)
+
#ifdef CONFIG_PRINTK
#ifdef CONFIG_PRINTK_CALLER
@@ -35,6 +43,8 @@ enum printk_info_flags {
LOG_CONT = 8, /* text is a fragment of a continuation line */
};
+extern struct printk_ringbuffer *prb;
+
__printf(4, 0)
int vprintk_store(int facility, int level,
const struct dev_printk_info *dev_info,
@@ -61,6 +71,13 @@ void defer_console_output(void);
u16 printk_parse_prefix(const char *text, int *level,
enum printk_info_flags *flags);
+
+u64 nbcon_seq_read(struct console *con);
+void nbcon_seq_force(struct console *con, u64 seq);
+bool nbcon_alloc(struct console *con);
+void nbcon_init(struct console *con);
+void nbcon_free(struct console *con);
+
#else
#define PRINTK_PREFIX_MAX 0
@@ -76,8 +93,16 @@ u16 printk_parse_prefix(const char *text, int *level,
#define printk_safe_exit_irqrestore(flags) local_irq_restore(flags)
static inline bool printk_percpu_data_ready(void) { return false; }
+static inline u64 nbcon_seq_read(struct console *con) { return 0; }
+static inline void nbcon_seq_force(struct console *con, u64 seq) { }
+static inline bool nbcon_alloc(struct console *con) { return false; }
+static inline void nbcon_init(struct console *con) { }
+static inline void nbcon_free(struct console *con) { }
+
#endif /* CONFIG_PRINTK */
+extern struct printk_buffers printk_shared_pbufs;
+
/**
* struct printk_buffers - Buffers to read/format/output printk messages.
* @outbuf: After formatting, contains text to output.
@@ -105,3 +130,9 @@ struct printk_message {
};
bool other_cpu_in_panic(void);
+bool printk_get_next_message(struct printk_message *pmsg, u64 seq,
+ bool is_extended, bool may_supress);
+
+#ifdef CONFIG_PRINTK
+void console_prepend_dropped(struct printk_message *pmsg, unsigned long dropped);
+#endif
diff --git a/kernel/printk/nbcon.c b/kernel/printk/nbcon.c
new file mode 100644
index 000000000000..b96077152f49
--- /dev/null
+++ b/kernel/printk/nbcon.c
@@ -0,0 +1,1029 @@
+// SPDX-License-Identifier: GPL-2.0-only
+// Copyright (C) 2022 Linutronix GmbH, John Ogness
+// Copyright (C) 2022 Intel, Thomas Gleixner
+
+#include <linux/kernel.h>
+#include <linux/console.h>
+#include <linux/delay.h>
+#include <linux/slab.h>
+#include "internal.h"
+/*
+ * Printk console printing implementation for consoles which does not depend
+ * on the legacy style console_lock mechanism.
+ *
+ * The state of the console is maintained in the "nbcon_state" atomic
+ * variable.
+ *
+ * The console is locked when:
+ *
+ * - The 'prio' field contains the priority of the context that owns the
+ * console. Only higher priority contexts are allowed to take over the
+ * lock. A value of 0 (NBCON_PRIO_NONE) means the console is not locked.
+ *
+ * - The 'cpu' field denotes on which CPU the console is locked. It is used
+ * to prevent busy waiting on the same CPU. Also it informs the lock owner
+ * that it has lost the lock in a more complex scenario when the lock was
+ * taken over by a higher priority context, released, and taken on another
+ * CPU with the same priority as the interrupted owner.
+ *
+ * The acquire mechanism uses a few more fields:
+ *
+ * - The 'req_prio' field is used by the handover approach to make the
+ * current owner aware that there is a context with a higher priority
+ * waiting for the friendly handover.
+ *
+ * - The 'unsafe' field allows to take over the console in a safe way in the
+ * middle of emitting a message. The field is set only when accessing some
+ * shared resources or when the console device is manipulated. It can be
+ * cleared, for example, after emitting one character when the console
+ * device is in a consistent state.
+ *
+ * - The 'unsafe_takeover' field is set when a hostile takeover took the
+ * console in an unsafe state. The console will stay in the unsafe state
+ * until re-initialized.
+ *
+ * The acquire mechanism uses three approaches:
+ *
+ * 1) Direct acquire when the console is not owned or is owned by a lower
+ * priority context and is in a safe state.
+ *
+ * 2) Friendly handover mechanism uses a request/grant handshake. It is used
+ * when the current owner has lower priority and the console is in an
+ * unsafe state.
+ *
+ * The requesting context:
+ *
+ * a) Sets its priority into the 'req_prio' field.
+ *
+ * b) Waits (with a timeout) for the owning context to unlock the
+ * console.
+ *
+ * c) Takes the lock and clears the 'req_prio' field.
+ *
+ * The owning context:
+ *
+ * a) Observes the 'req_prio' field set on exit from the unsafe
+ * console state.
+ *
+ * b) Gives up console ownership by clearing the 'prio' field.
+ *
+ * 3) Unsafe hostile takeover allows to take over the lock even when the
+ * console is an unsafe state. It is used only in panic() by the final
+ * attempt to flush consoles in a try and hope mode.
+ *
+ * Note that separate record buffers are used in panic(). As a result,
+ * the messages can be read and formatted without any risk even after
+ * using the hostile takeover in unsafe state.
+ *
+ * The release function simply clears the 'prio' field.
+ *
+ * All operations on @console::nbcon_state are atomic cmpxchg based to
+ * handle concurrency.
+ *
+ * The acquire/release functions implement only minimal policies:
+ *
+ * - Preference for higher priority contexts.
+ * - Protection of the panic CPU.
+ *
+ * All other policy decisions must be made at the call sites:
+ *
+ * - What is marked as an unsafe section.
+ * - Whether to spin-wait if there is already an owner and the console is
+ * in an unsafe state.
+ * - Whether to attempt an unsafe hostile takeover.
+ *
+ * The design allows to implement the well known:
+ *
+ * acquire()
+ * output_one_printk_record()
+ * release()
+ *
+ * The output of one printk record might be interrupted with a higher priority
+ * context. The new owner is supposed to reprint the entire interrupted record
+ * from scratch.
+ */
+
+/**
+ * nbcon_state_set - Helper function to set the console state
+ * @con: Console to update
+ * @new: The new state to write
+ *
+ * Only to be used when the console is not yet or no longer visible in the
+ * system. Otherwise use nbcon_state_try_cmpxchg().
+ */
+static inline void nbcon_state_set(struct console *con, struct nbcon_state *new)
+{
+ atomic_set(&ACCESS_PRIVATE(con, nbcon_state), new->atom);
+}
+
+/**
+ * nbcon_state_read - Helper function to read the console state
+ * @con: Console to read
+ * @state: The state to store the result
+ */
+static inline void nbcon_state_read(struct console *con, struct nbcon_state *state)
+{
+ state->atom = atomic_read(&ACCESS_PRIVATE(con, nbcon_state));
+}
+
+/**
+ * nbcon_state_try_cmpxchg() - Helper function for atomic_try_cmpxchg() on console state
+ * @con: Console to update
+ * @cur: Old/expected state
+ * @new: New state
+ *
+ * Return: True on success. False on fail and @cur is updated.
+ */
+static inline bool nbcon_state_try_cmpxchg(struct console *con, struct nbcon_state *cur,
+ struct nbcon_state *new)
+{
+ return atomic_try_cmpxchg(&ACCESS_PRIVATE(con, nbcon_state), &cur->atom, new->atom);
+}
+
+#ifdef CONFIG_64BIT
+
+#define __seq_to_nbcon_seq(seq) (seq)
+#define __nbcon_seq_to_seq(seq) (seq)
+
+#else /* CONFIG_64BIT */
+
+#define __seq_to_nbcon_seq(seq) ((u32)seq)
+
+static inline u64 __nbcon_seq_to_seq(u32 nbcon_seq)
+{
+ u64 seq;
+ u64 rb_next_seq;
+
+ /*
+ * The provided sequence is only the lower 32 bits of the ringbuffer
+ * sequence. It needs to be expanded to 64bit. Get the next sequence
+ * number from the ringbuffer and fold it.
+ *
+ * Having a 32bit representation in the console is sufficient.
+ * If a console ever gets more than 2^31 records behind
+ * the ringbuffer then this is the least of the problems.
+ *
+ * Also the access to the ring buffer is always safe.
+ */
+ rb_next_seq = prb_next_seq(prb);
+ seq = rb_next_seq - ((u32)rb_next_seq - nbcon_seq);
+
+ return seq;
+}
+
+#endif /* CONFIG_64BIT */
+
+/**
+ * nbcon_seq_read - Read the current console sequence
+ * @con: Console to read the sequence of
+ *
+ * Return: Sequence number of the next record to print on @con.
+ */
+u64 nbcon_seq_read(struct console *con)
+{
+ unsigned long nbcon_seq = atomic_long_read(&ACCESS_PRIVATE(con, nbcon_seq));
+
+ return __nbcon_seq_to_seq(nbcon_seq);
+}
+
+/**
+ * nbcon_seq_force - Force console sequence to a specific value
+ * @con: Console to work on
+ * @seq: Sequence number value to set
+ *
+ * Only to be used during init (before registration) or in extreme situations
+ * (such as panic with CONSOLE_REPLAY_ALL).
+ */
+void nbcon_seq_force(struct console *con, u64 seq)
+{
+ /*
+ * If the specified record no longer exists, the oldest available record
+ * is chosen. This is especially important on 32bit systems because only
+ * the lower 32 bits of the sequence number are stored. The upper 32 bits
+ * are derived from the sequence numbers available in the ringbuffer.
+ */
+ u64 valid_seq = max_t(u64, seq, prb_first_valid_seq(prb));
+
+ atomic_long_set(&ACCESS_PRIVATE(con, nbcon_seq), __seq_to_nbcon_seq(valid_seq));
+
+ /* Clear con->seq since nbcon consoles use con->nbcon_seq instead. */
+ con->seq = 0;
+}
+
+/**
+ * nbcon_seq_try_update - Try to update the console sequence number
+ * @ctxt: Pointer to an acquire context that contains
+ * all information about the acquire mode
+ * @new_seq: The new sequence number to set
+ *
+ * @ctxt->seq is updated to the new value of @con::nbcon_seq (expanded to
+ * the 64bit value). This could be a different value than @new_seq if
+ * nbcon_seq_force() was used or the current context no longer owns the
+ * console. In the later case, it will stop printing anyway.
+ */
+static void nbcon_seq_try_update(struct nbcon_context *ctxt, u64 new_seq)
+{
+ unsigned long nbcon_seq = __seq_to_nbcon_seq(ctxt->seq);
+ struct console *con = ctxt->console;
+
+ if (atomic_long_try_cmpxchg(&ACCESS_PRIVATE(con, nbcon_seq), &nbcon_seq,
+ __seq_to_nbcon_seq(new_seq))) {
+ ctxt->seq = new_seq;
+ } else {
+ ctxt->seq = nbcon_seq_read(con);
+ }
+}
+
+/**
+ * nbcon_context_try_acquire_direct - Try to acquire directly
+ * @ctxt: The context of the caller
+ * @cur: The current console state
+ *
+ * Acquire the console when it is released. Also acquire the console when
+ * the current owner has a lower priority and the console is in a safe state.
+ *
+ * Return: 0 on success. Otherwise, an error code on failure. Also @cur
+ * is updated to the latest state when failed to modify it.
+ *
+ * Errors:
+ *
+ * -EPERM: A panic is in progress and this is not the panic CPU.
+ * Or the current owner or waiter has the same or higher
+ * priority. No acquire method can be successful in
+ * this case.
+ *
+ * -EBUSY: The current owner has a lower priority but the console
+ * in an unsafe state. The caller should try using
+ * the handover acquire method.
+ */
+static int nbcon_context_try_acquire_direct(struct nbcon_context *ctxt,
+ struct nbcon_state *cur)
+{
+ unsigned int cpu = smp_processor_id();
+ struct console *con = ctxt->console;
+ struct nbcon_state new;
+
+ do {
+ if (other_cpu_in_panic())
+ return -EPERM;
+
+ if (ctxt->prio <= cur->prio || ctxt->prio <= cur->req_prio)
+ return -EPERM;
+
+ if (cur->unsafe)
+ return -EBUSY;
+
+ /*
+ * The console should never be safe for a direct acquire
+ * if an unsafe hostile takeover has ever happened.
+ */
+ WARN_ON_ONCE(cur->unsafe_takeover);
+
+ new.atom = cur->atom;
+ new.prio = ctxt->prio;
+ new.req_prio = NBCON_PRIO_NONE;
+ new.unsafe = cur->unsafe_takeover;
+ new.cpu = cpu;
+
+ } while (!nbcon_state_try_cmpxchg(con, cur, &new));
+
+ return 0;
+}
+
+static bool nbcon_waiter_matches(struct nbcon_state *cur, int expected_prio)
+{
+ /*
+ * The request context is well defined by the @req_prio because:
+ *
+ * - Only a context with a higher priority can take over the request.
+ * - There are only three priorities.
+ * - Only one CPU is allowed to request PANIC priority.
+ * - Lower priorities are ignored during panic() until reboot.
+ *
+ * As a result, the following scenario is *not* possible:
+ *
+ * 1. Another context with a higher priority directly takes ownership.
+ * 2. The higher priority context releases the ownership.
+ * 3. A lower priority context takes the ownership.
+ * 4. Another context with the same priority as this context
+ * creates a request and starts waiting.
+ */
+
+ return (cur->req_prio == expected_prio);
+}
+
+/**
+ * nbcon_context_try_acquire_requested - Try to acquire after having
+ * requested a handover
+ * @ctxt: The context of the caller
+ * @cur: The current console state
+ *
+ * This is a helper function for nbcon_context_try_acquire_handover().
+ * It is called when the console is in an unsafe state. The current
+ * owner will release the console on exit from the unsafe region.
+ *
+ * Return: 0 on success and @cur is updated to the new console state.
+ * Otherwise an error code on failure.
+ *
+ * Errors:
+ *
+ * -EPERM: A panic is in progress and this is not the panic CPU
+ * or this context is no longer the waiter.
+ *
+ * -EBUSY: The console is still locked. The caller should
+ * continue waiting.
+ *
+ * Note: The caller must still remove the request when an error has occurred
+ * except when this context is no longer the waiter.
+ */
+static int nbcon_context_try_acquire_requested(struct nbcon_context *ctxt,
+ struct nbcon_state *cur)
+{
+ unsigned int cpu = smp_processor_id();
+ struct console *con = ctxt->console;
+ struct nbcon_state new;
+
+ /* Note that the caller must still remove the request! */
+ if (other_cpu_in_panic())
+ return -EPERM;
+
+ /*
+ * Note that the waiter will also change if there was an unsafe
+ * hostile takeover.
+ */
+ if (!nbcon_waiter_matches(cur, ctxt->prio))
+ return -EPERM;
+
+ /* If still locked, caller should continue waiting. */
+ if (cur->prio != NBCON_PRIO_NONE)
+ return -EBUSY;
+
+ /*
+ * The previous owner should have never released ownership
+ * in an unsafe region.
+ */
+ WARN_ON_ONCE(cur->unsafe);
+
+ new.atom = cur->atom;
+ new.prio = ctxt->prio;
+ new.req_prio = NBCON_PRIO_NONE;
+ new.unsafe = cur->unsafe_takeover;
+ new.cpu = cpu;
+
+ if (!nbcon_state_try_cmpxchg(con, cur, &new)) {
+ /*
+ * The acquire could fail only when it has been taken
+ * over by a higher priority context.
+ */
+ WARN_ON_ONCE(nbcon_waiter_matches(cur, ctxt->prio));
+ return -EPERM;
+ }
+
+ /* Handover success. This context now owns the console. */
+ return 0;
+}
+
+/**
+ * nbcon_context_try_acquire_handover - Try to acquire via handover
+ * @ctxt: The context of the caller
+ * @cur: The current console state
+ *
+ * The function must be called only when the context has higher priority
+ * than the current owner and the console is in an unsafe state.
+ * It is the case when nbcon_context_try_acquire_direct() returns -EBUSY.
+ *
+ * The function sets "req_prio" field to make the current owner aware of
+ * the request. Then it waits until the current owner releases the console,
+ * or an even higher context takes over the request, or timeout expires.
+ *
+ * The current owner checks the "req_prio" field on exit from the unsafe
+ * region and releases the console. It does not touch the "req_prio" field
+ * so that the console stays reserved for the waiter.
+ *
+ * Return: 0 on success. Otherwise, an error code on failure. Also @cur
+ * is updated to the latest state when failed to modify it.
+ *
+ * Errors:
+ *
+ * -EPERM: A panic is in progress and this is not the panic CPU.
+ * Or a higher priority context has taken over the
+ * console or the handover request.
+ *
+ * -EBUSY: The current owner is on the same CPU so that the hand
+ * shake could not work. Or the current owner is not
+ * willing to wait (zero timeout). Or the console does
+ * not enter the safe state before timeout passed. The
+ * caller might still use the unsafe hostile takeover
+ * when allowed.
+ *
+ * -EAGAIN: @cur has changed when creating the handover request.
+ * The caller should retry with direct acquire.
+ */
+static int nbcon_context_try_acquire_handover(struct nbcon_context *ctxt,
+ struct nbcon_state *cur)
+{
+ unsigned int cpu = smp_processor_id();
+ struct console *con = ctxt->console;
+ struct nbcon_state new;
+ int timeout;
+ int request_err = -EBUSY;
+
+ /*
+ * Check that the handover is called when the direct acquire failed
+ * with -EBUSY.
+ */
+ WARN_ON_ONCE(ctxt->prio <= cur->prio || ctxt->prio <= cur->req_prio);
+ WARN_ON_ONCE(!cur->unsafe);
+
+ /* Handover is not possible on the same CPU. */
+ if (cur->cpu == cpu)
+ return -EBUSY;
+
+ /*
+ * Console stays unsafe after an unsafe takeover until re-initialized.
+ * Waiting is not going to help in this case.
+ */
+ if (cur->unsafe_takeover)
+ return -EBUSY;
+
+ /* Is the caller willing to wait? */
+ if (ctxt->spinwait_max_us == 0)
+ return -EBUSY;
+
+ /*
+ * Setup a request for the handover. The caller should try to acquire
+ * the console directly when the current state has been modified.
+ */
+ new.atom = cur->atom;
+ new.req_prio = ctxt->prio;
+ if (!nbcon_state_try_cmpxchg(con, cur, &new))
+ return -EAGAIN;
+
+ cur->atom = new.atom;
+
+ /* Wait until there is no owner and then acquire the console. */
+ for (timeout = ctxt->spinwait_max_us; timeout >= 0; timeout--) {
+ /* On successful acquire, this request is cleared. */
+ request_err = nbcon_context_try_acquire_requested(ctxt, cur);
+ if (!request_err)
+ return 0;
+
+ /*
+ * If the acquire should be aborted, it must be ensured
+ * that the request is removed before returning to caller.
+ */
+ if (request_err == -EPERM)
+ break;
+
+ udelay(1);
+
+ /* Re-read the state because some time has passed. */
+ nbcon_state_read(con, cur);
+ }
+
+ /* Timed out or aborted. Carefully remove handover request. */
+ do {
+ /*
+ * No need to remove request if there is a new waiter. This
+ * can only happen if a higher priority context has taken over
+ * the console or the handover request.
+ */
+ if (!nbcon_waiter_matches(cur, ctxt->prio))
+ return -EPERM;
+
+ /* Unset request for handover. */
+ new.atom = cur->atom;
+ new.req_prio = NBCON_PRIO_NONE;
+ if (nbcon_state_try_cmpxchg(con, cur, &new)) {
+ /*
+ * Request successfully unset. Report failure of
+ * acquiring via handover.
+ */
+ cur->atom = new.atom;
+ return request_err;
+ }
+
+ /*
+ * Unable to remove request. Try to acquire in case
+ * the owner has released the lock.
+ */
+ } while (nbcon_context_try_acquire_requested(ctxt, cur));
+
+ /* Lucky timing. The acquire succeeded while removing the request. */
+ return 0;
+}
+
+/**
+ * nbcon_context_try_acquire_hostile - Acquire via unsafe hostile takeover
+ * @ctxt: The context of the caller
+ * @cur: The current console state
+ *
+ * Acquire the console even in the unsafe state.
+ *
+ * It can be permitted by setting the 'allow_unsafe_takeover' field only
+ * by the final attempt to flush messages in panic().
+ *
+ * Return: 0 on success. -EPERM when not allowed by the context.
+ */
+static int nbcon_context_try_acquire_hostile(struct nbcon_context *ctxt,
+ struct nbcon_state *cur)
+{
+ unsigned int cpu = smp_processor_id();
+ struct console *con = ctxt->console;
+ struct nbcon_state new;
+
+ if (!ctxt->allow_unsafe_takeover)
+ return -EPERM;
+
+ /* Ensure caller is allowed to perform unsafe hostile takeovers. */
+ if (WARN_ON_ONCE(ctxt->prio != NBCON_PRIO_PANIC))
+ return -EPERM;
+
+ /*
+ * Check that try_acquire_direct() and try_acquire_handover() returned
+ * -EBUSY in the right situation.
+ */
+ WARN_ON_ONCE(ctxt->prio <= cur->prio || ctxt->prio <= cur->req_prio);
+ WARN_ON_ONCE(cur->unsafe != true);
+
+ do {
+ new.atom = cur->atom;
+ new.cpu = cpu;
+ new.prio = ctxt->prio;
+ new.unsafe |= cur->unsafe_takeover;
+ new.unsafe_takeover |= cur->unsafe;
+
+ } while (!nbcon_state_try_cmpxchg(con, cur, &new));
+
+ return 0;
+}
+
+static struct printk_buffers panic_nbcon_pbufs;
+
+/**
+ * nbcon_context_try_acquire - Try to acquire nbcon console
+ * @ctxt: The context of the caller
+ *
+ * Return: True if the console was acquired. False otherwise.
+ *
+ * If the caller allowed an unsafe hostile takeover, on success the
+ * caller should check the current console state to see if it is
+ * in an unsafe state. Otherwise, on success the caller may assume
+ * the console is not in an unsafe state.
+ */
+__maybe_unused
+static bool nbcon_context_try_acquire(struct nbcon_context *ctxt)
+{
+ unsigned int cpu = smp_processor_id();
+ struct console *con = ctxt->console;
+ struct nbcon_state cur;
+ int err;
+
+ nbcon_state_read(con, &cur);
+try_again:
+ err = nbcon_context_try_acquire_direct(ctxt, &cur);
+ if (err != -EBUSY)
+ goto out;
+
+ err = nbcon_context_try_acquire_handover(ctxt, &cur);
+ if (err == -EAGAIN)
+ goto try_again;
+ if (err != -EBUSY)
+ goto out;
+
+ err = nbcon_context_try_acquire_hostile(ctxt, &cur);
+out:
+ if (err)
+ return false;
+
+ /* Acquire succeeded. */
+
+ /* Assign the appropriate buffer for this context. */
+ if (atomic_read(&panic_cpu) == cpu)
+ ctxt->pbufs = &panic_nbcon_pbufs;
+ else
+ ctxt->pbufs = con->pbufs;
+
+ /* Set the record sequence for this context to print. */
+ ctxt->seq = nbcon_seq_read(ctxt->console);
+
+ return true;
+}
+
+static bool nbcon_owner_matches(struct nbcon_state *cur, int expected_cpu,
+ int expected_prio)
+{
+ /*
+ * Since consoles can only be acquired by higher priorities,
+ * owning contexts are uniquely identified by @prio. However,
+ * since contexts can unexpectedly lose ownership, it is
+ * possible that later another owner appears with the same
+ * priority. For this reason @cpu is also needed.
+ */
+
+ if (cur->prio != expected_prio)
+ return false;
+
+ if (cur->cpu != expected_cpu)
+ return false;
+
+ return true;
+}
+
+/**
+ * nbcon_context_release - Release the console
+ * @ctxt: The nbcon context from nbcon_context_try_acquire()
+ */
+static void nbcon_context_release(struct nbcon_context *ctxt)
+{
+ unsigned int cpu = smp_processor_id();
+ struct console *con = ctxt->console;
+ struct nbcon_state cur;
+ struct nbcon_state new;
+
+ nbcon_state_read(con, &cur);
+
+ do {
+ if (!nbcon_owner_matches(&cur, cpu, ctxt->prio))
+ break;
+
+ new.atom = cur.atom;
+ new.prio = NBCON_PRIO_NONE;
+
+ /*
+ * If @unsafe_takeover is set, it is kept set so that
+ * the state remains permanently unsafe.
+ */
+ new.unsafe |= cur.unsafe_takeover;
+
+ } while (!nbcon_state_try_cmpxchg(con, &cur, &new));
+
+ ctxt->pbufs = NULL;
+}
+
+/**
+ * nbcon_context_can_proceed - Check whether ownership can proceed
+ * @ctxt: The nbcon context from nbcon_context_try_acquire()
+ * @cur: The current console state
+ *
+ * Return: True if this context still owns the console. False if
+ * ownership was handed over or taken.
+ *
+ * Must be invoked when entering the unsafe state to make sure that it still
+ * owns the lock. Also must be invoked when exiting the unsafe context
+ * to eventually free the lock for a higher priority context which asked
+ * for the friendly handover.
+ *
+ * It can be called inside an unsafe section when the console is just
+ * temporary in safe state instead of exiting and entering the unsafe
+ * state.
+ *
+ * Also it can be called in the safe context before doing an expensive
+ * safe operation. It does not make sense to do the operation when
+ * a higher priority context took the lock.
+ *
+ * When this function returns false then the calling context no longer owns
+ * the console and is no longer allowed to go forward. In this case it must
+ * back out immediately and carefully. The buffer content is also no longer
+ * trusted since it no longer belongs to the calling context.
+ */
+static bool nbcon_context_can_proceed(struct nbcon_context *ctxt, struct nbcon_state *cur)
+{
+ unsigned int cpu = smp_processor_id();
+
+ /* Make sure this context still owns the console. */
+ if (!nbcon_owner_matches(cur, cpu, ctxt->prio))
+ return false;
+
+ /* The console owner can proceed if there is no waiter. */
+ if (cur->req_prio == NBCON_PRIO_NONE)
+ return true;
+
+ /*
+ * A console owner within an unsafe region is always allowed to
+ * proceed, even if there are waiters. It can perform a handover
+ * when exiting the unsafe region. Otherwise the waiter will
+ * need to perform an unsafe hostile takeover.
+ */
+ if (cur->unsafe)
+ return true;
+
+ /* Waiters always have higher priorities than owners. */
+ WARN_ON_ONCE(cur->req_prio <= cur->prio);
+
+ /*
+ * Having a safe point for take over and eventually a few
+ * duplicated characters or a full line is way better than a
+ * hostile takeover. Post processing can take care of the garbage.
+ * Release and hand over.
+ */
+ nbcon_context_release(ctxt);
+
+ /*
+ * It is not clear whether the waiter really took over ownership. The
+ * outermost callsite must make the final decision whether console
+ * ownership is needed for it to proceed. If yes, it must reacquire
+ * ownership (possibly hostile) before carefully proceeding.
+ *
+ * The calling context no longer owns the console so go back all the
+ * way instead of trying to implement reacquire heuristics in tons of
+ * places.
+ */
+ return false;
+}
+
+/**
+ * nbcon_can_proceed - Check whether ownership can proceed
+ * @wctxt: The write context that was handed to the write function
+ *
+ * Return: True if this context still owns the console. False if
+ * ownership was handed over or taken.
+ *
+ * It is used in nbcon_enter_unsafe() to make sure that it still owns the
+ * lock. Also it is used in nbcon_exit_unsafe() to eventually free the lock
+ * for a higher priority context which asked for the friendly handover.
+ *
+ * It can be called inside an unsafe section when the console is just
+ * temporary in safe state instead of exiting and entering the unsafe state.
+ *
+ * Also it can be called in the safe context before doing an expensive safe
+ * operation. It does not make sense to do the operation when a higher
+ * priority context took the lock.
+ *
+ * When this function returns false then the calling context no longer owns
+ * the console and is no longer allowed to go forward. In this case it must
+ * back out immediately and carefully. The buffer content is also no longer
+ * trusted since it no longer belongs to the calling context.
+ */
+bool nbcon_can_proceed(struct nbcon_write_context *wctxt)
+{
+ struct nbcon_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt);
+ struct console *con = ctxt->console;
+ struct nbcon_state cur;
+
+ nbcon_state_read(con, &cur);
+
+ return nbcon_context_can_proceed(ctxt, &cur);
+}
+EXPORT_SYMBOL_GPL(nbcon_can_proceed);
+
+#define nbcon_context_enter_unsafe(c) __nbcon_context_update_unsafe(c, true)
+#define nbcon_context_exit_unsafe(c) __nbcon_context_update_unsafe(c, false)
+
+/**
+ * __nbcon_context_update_unsafe - Update the unsafe bit in @con->nbcon_state
+ * @ctxt: The nbcon context from nbcon_context_try_acquire()
+ * @unsafe: The new value for the unsafe bit
+ *
+ * Return: True if the unsafe state was updated and this context still
+ * owns the console. Otherwise false if ownership was handed
+ * over or taken.
+ *
+ * This function allows console owners to modify the unsafe status of the
+ * console.
+ *
+ * When this function returns false then the calling context no longer owns
+ * the console and is no longer allowed to go forward. In this case it must
+ * back out immediately and carefully. The buffer content is also no longer
+ * trusted since it no longer belongs to the calling context.
+ *
+ * Internal helper to avoid duplicated code.
+ */
+static bool __nbcon_context_update_unsafe(struct nbcon_context *ctxt, bool unsafe)
+{
+ struct console *con = ctxt->console;
+ struct nbcon_state cur;
+ struct nbcon_state new;
+
+ nbcon_state_read(con, &cur);
+
+ do {
+ /*
+ * The unsafe bit must not be cleared if an
+ * unsafe hostile takeover has occurred.
+ */
+ if (!unsafe && cur.unsafe_takeover)
+ goto out;
+
+ if (!nbcon_context_can_proceed(ctxt, &cur))
+ return false;
+
+ new.atom = cur.atom;
+ new.unsafe = unsafe;
+ } while (!nbcon_state_try_cmpxchg(con, &cur, &new));
+
+ cur.atom = new.atom;
+out:
+ return nbcon_context_can_proceed(ctxt, &cur);
+}
+
+/**
+ * nbcon_enter_unsafe - Enter an unsafe region in the driver
+ * @wctxt: The write context that was handed to the write function
+ *
+ * Return: True if this context still owns the console. False if
+ * ownership was handed over or taken.
+ *
+ * When this function returns false then the calling context no longer owns
+ * the console and is no longer allowed to go forward. In this case it must
+ * back out immediately and carefully. The buffer content is also no longer
+ * trusted since it no longer belongs to the calling context.
+ */
+bool nbcon_enter_unsafe(struct nbcon_write_context *wctxt)
+{
+ struct nbcon_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt);
+
+ return nbcon_context_enter_unsafe(ctxt);
+}
+EXPORT_SYMBOL_GPL(nbcon_enter_unsafe);
+
+/**
+ * nbcon_exit_unsafe - Exit an unsafe region in the driver
+ * @wctxt: The write context that was handed to the write function
+ *
+ * Return: True if this context still owns the console. False if
+ * ownership was handed over or taken.
+ *
+ * When this function returns false then the calling context no longer owns
+ * the console and is no longer allowed to go forward. In this case it must
+ * back out immediately and carefully. The buffer content is also no longer
+ * trusted since it no longer belongs to the calling context.
+ */
+bool nbcon_exit_unsafe(struct nbcon_write_context *wctxt)
+{
+ struct nbcon_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt);
+
+ return nbcon_context_exit_unsafe(ctxt);
+}
+EXPORT_SYMBOL_GPL(nbcon_exit_unsafe);
+
+/**
+ * nbcon_emit_next_record - Emit a record in the acquired context
+ * @wctxt: The write context that will be handed to the write function
+ *
+ * Return: True if this context still owns the console. False if
+ * ownership was handed over or taken.
+ *
+ * When this function returns false then the calling context no longer owns
+ * the console and is no longer allowed to go forward. In this case it must
+ * back out immediately and carefully. The buffer content is also no longer
+ * trusted since it no longer belongs to the calling context. If the caller
+ * wants to do more it must reacquire the console first.
+ *
+ * When true is returned, @wctxt->ctxt.backlog indicates whether there are
+ * still records pending in the ringbuffer,
+ */
+__maybe_unused
+static bool nbcon_emit_next_record(struct nbcon_write_context *wctxt)
+{
+ struct nbcon_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt);
+ struct console *con = ctxt->console;
+ bool is_extended = console_srcu_read_flags(con) & CON_EXTENDED;
+ struct printk_message pmsg = {
+ .pbufs = ctxt->pbufs,
+ };
+ unsigned long con_dropped;
+ struct nbcon_state cur;
+ unsigned long dropped;
+ bool done;
+
+ /*
+ * The printk buffers are filled within an unsafe section. This
+ * prevents NBCON_PRIO_NORMAL and NBCON_PRIO_EMERGENCY from
+ * clobbering each other.
+ */
+
+ if (!nbcon_context_enter_unsafe(ctxt))
+ return false;
+
+ ctxt->backlog = printk_get_next_message(&pmsg, ctxt->seq, is_extended, true);
+ if (!ctxt->backlog)
+ return nbcon_context_exit_unsafe(ctxt);
+
+ /*
+ * @con->dropped is not protected in case of an unsafe hostile
+ * takeover. In that situation the update can be racy so
+ * annotate it accordingly.
+ */
+ con_dropped = data_race(READ_ONCE(con->dropped));
+
+ dropped = con_dropped + pmsg.dropped;
+ if (dropped && !is_extended)
+ console_prepend_dropped(&pmsg, dropped);
+
+ if (!nbcon_context_exit_unsafe(ctxt))
+ return false;
+
+ /* For skipped records just update seq/dropped in @con. */
+ if (pmsg.outbuf_len == 0)
+ goto update_con;
+
+ /* Initialize the write context for driver callbacks. */
+ wctxt->outbuf = &pmsg.pbufs->outbuf[0];
+ wctxt->len = pmsg.outbuf_len;
+ nbcon_state_read(con, &cur);
+ wctxt->unsafe_takeover = cur.unsafe_takeover;
+
+ if (con->write_atomic) {
+ done = con->write_atomic(con, wctxt);
+ } else {
+ nbcon_context_release(ctxt);
+ WARN_ON_ONCE(1);
+ done = false;
+ }
+
+ /* If not done, the emit was aborted. */
+ if (!done)
+ return false;
+
+ /*
+ * Since any dropped message was successfully output, reset the
+ * dropped count for the console.
+ */
+ dropped = 0;
+update_con:
+ /*
+ * The dropped count and the sequence number are updated within an
+ * unsafe section. This limits update races to the panic context and
+ * allows the panic context to win.
+ */
+
+ if (!nbcon_context_enter_unsafe(ctxt))
+ return false;
+
+ if (dropped != con_dropped) {
+ /* Counterpart to the READ_ONCE() above. */
+ WRITE_ONCE(con->dropped, dropped);
+ }
+
+ nbcon_seq_try_update(ctxt, pmsg.seq + 1);
+
+ return nbcon_context_exit_unsafe(ctxt);
+}
+
+/**
+ * nbcon_alloc - Allocate buffers needed by the nbcon console
+ * @con: Console to allocate buffers for
+ *
+ * Return: True on success. False otherwise and the console cannot
+ * be used.
+ *
+ * This is not part of nbcon_init() because buffer allocation must
+ * be performed earlier in the console registration process.
+ */
+bool nbcon_alloc(struct console *con)
+{
+ if (con->flags & CON_BOOT) {
+ /*
+ * Boot console printing is synchronized with legacy console
+ * printing, so boot consoles can share the same global printk
+ * buffers.
+ */
+ con->pbufs = &printk_shared_pbufs;
+ } else {
+ con->pbufs = kmalloc(sizeof(*con->pbufs), GFP_KERNEL);
+ if (!con->pbufs) {
+ con_printk(KERN_ERR, con, "failed to allocate printing buffer\n");
+ return false;
+ }
+ }
+
+ return true;
+}
+
+/**
+ * nbcon_init - Initialize the nbcon console specific data
+ * @con: Console to initialize
+ *
+ * nbcon_alloc() *must* be called and succeed before this function
+ * is called.
+ *
+ * This function expects that the legacy @con->seq has been set.
+ */
+void nbcon_init(struct console *con)
+{
+ struct nbcon_state state = { };
+
+ /* nbcon_alloc() must have been called and successful! */
+ BUG_ON(!con->pbufs);
+
+ nbcon_seq_force(con, con->seq);
+ nbcon_state_set(con, &state);
+}
+
+/**
+ * nbcon_free - Free and cleanup the nbcon console specific data
+ * @con: Console to free/cleanup nbcon data
+ */
+void nbcon_free(struct console *con)
+{
+ struct nbcon_state state = { };
+
+ nbcon_state_set(con, &state);
+
+ /* Boot consoles share global printk buffers. */
+ if (!(con->flags & CON_BOOT))
+ kfree(con->pbufs);
+
+ con->pbufs = NULL;
+}
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 0b3af1529778..f2444b581e16 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -102,12 +102,6 @@ DEFINE_STATIC_SRCU(console_srcu);
*/
int __read_mostly suppress_printk;
-/*
- * During panic, heavy printk by other CPUs can delay the
- * panic and risk deadlock on console resources.
- */
-static int __read_mostly suppress_panic_printk;
-
#ifdef CONFIG_LOCKDEP
static struct lockdep_map console_lock_dep_map = {
.name = "console_lock"
@@ -445,6 +439,12 @@ static int console_msg_format = MSG_FORMAT_DEFAULT;
static DEFINE_MUTEX(syslog_lock);
#ifdef CONFIG_PRINTK
+/*
+ * During panic, heavy printk by other CPUs can delay the
+ * panic and risk deadlock on console resources.
+ */
+static int __read_mostly suppress_panic_printk;
+
DECLARE_WAIT_QUEUE_HEAD(log_wait);
/* All 3 protected by @syslog_lock. */
/* the next printk record to read by syslog(READ) or /proc/kmsg */
@@ -494,7 +494,7 @@ _DEFINE_PRINTKRB(printk_rb_static, CONFIG_LOG_BUF_SHIFT - PRB_AVGBITS,
static struct printk_ringbuffer printk_rb_dynamic;
-static struct printk_ringbuffer *prb = &printk_rb_static;
+struct printk_ringbuffer *prb = &printk_rb_static;
/*
* We cannot access per-CPU data (e.g. per-CPU flush irq_work) before
@@ -698,9 +698,6 @@ out:
return len;
}
-static bool printk_get_next_message(struct printk_message *pmsg, u64 seq,
- bool is_extended, bool may_supress);
-
/* /dev/kmsg - userspace message inject/listen interface */
struct devkmsg_user {
atomic64_t seq;
@@ -1669,7 +1666,6 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
prb_rec_init_rd(&r, &info, text, PRINTK_MESSAGE_MAX);
- len = 0;
prb_for_each_record(seq, prb, seq, &r) {
int textlen;
@@ -2349,22 +2345,6 @@ static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progre
static u64 syslog_seq;
-static size_t record_print_text(const struct printk_record *r,
- bool syslog, bool time)
-{
- return 0;
-}
-static ssize_t info_print_ext_header(char *buf, size_t size,
- struct printk_info *info)
-{
- return 0;
-}
-static ssize_t msg_print_ext_body(char *buf, size_t size,
- char *text, size_t text_len,
- struct dev_printk_info *dev_info) { return 0; }
-static void console_lock_spinning_enable(void) { }
-static int console_lock_spinning_disable_and_check(int cookie) { return 0; }
-static bool suppress_message_printing(int level) { return false; }
static bool pr_flush(int timeout_ms, bool reset_on_progress) { return true; }
static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progress) { return true; }
@@ -2404,13 +2384,21 @@ static void set_user_specified(struct console_cmdline *c, bool user_specified)
console_set_on_cmdline = 1;
}
-static int __add_preferred_console(char *name, int idx, char *options,
+static int __add_preferred_console(const char *name, const short idx, char *options,
char *brl_options, bool user_specified)
{
struct console_cmdline *c;
int i;
/*
+ * We use a signed short index for struct console for device drivers to
+ * indicate a not yet assigned index or port. However, a negative index
+ * value is not valid for preferred console.
+ */
+ if (idx < 0)
+ return -EINVAL;
+
+ /*
* See if this tty is not yet registered, and
* if we have a slot free.
*/
@@ -2513,7 +2501,7 @@ __setup("console=", console_setup);
* commonly to provide a default console (ie from PROM variables) when
* the user has not supplied one.
*/
-int add_preferred_console(char *name, int idx, char *options)
+int add_preferred_console(const char *name, const short idx, char *options)
{
return __add_preferred_console(name, idx, options, NULL, false);
}
@@ -2718,6 +2706,8 @@ static void __console_unlock(void)
up_console_sem();
}
+#ifdef CONFIG_PRINTK
+
/*
* Prepend the message in @pmsg->pbufs->outbuf with a "dropped message". This
* is achieved by shifting the existing message over and inserting the dropped
@@ -2732,8 +2722,7 @@ static void __console_unlock(void)
*
* If @pmsg->pbufs->outbuf is modified, @pmsg->outbuf_len is updated.
*/
-#ifdef CONFIG_PRINTK
-static void console_prepend_dropped(struct printk_message *pmsg, unsigned long dropped)
+void console_prepend_dropped(struct printk_message *pmsg, unsigned long dropped)
{
struct printk_buffers *pbufs = pmsg->pbufs;
const size_t scratchbuf_sz = sizeof(pbufs->scratchbuf);
@@ -2764,9 +2753,6 @@ static void console_prepend_dropped(struct printk_message *pmsg, unsigned long d
memcpy(outbuf, scratchbuf, len);
pmsg->outbuf_len += len;
}
-#else
-#define console_prepend_dropped(pmsg, dropped)
-#endif /* CONFIG_PRINTK */
/*
* Read and format the specified record (or a later record if the specified
@@ -2787,8 +2773,8 @@ static void console_prepend_dropped(struct printk_message *pmsg, unsigned long d
* of @pmsg are valid. (See the documentation of struct printk_message
* for information about the @pmsg fields.)
*/
-static bool printk_get_next_message(struct printk_message *pmsg, u64 seq,
- bool is_extended, bool may_suppress)
+bool printk_get_next_message(struct printk_message *pmsg, u64 seq,
+ bool is_extended, bool may_suppress)
{
static int panic_console_dropped;
@@ -2847,6 +2833,13 @@ out:
}
/*
+ * Used as the printk buffers for non-panic, serialized console printing.
+ * This is for legacy (!CON_NBCON) as well as all boot (CON_BOOT) consoles.
+ * Its usage requires the console_lock held.
+ */
+struct printk_buffers printk_shared_pbufs;
+
+/*
* Print one record for the given console. The record printed is whatever
* record is the next available record for the given console.
*
@@ -2863,12 +2856,10 @@ out:
*/
static bool console_emit_next_record(struct console *con, bool *handover, int cookie)
{
- static struct printk_buffers pbufs;
-
bool is_extended = console_srcu_read_flags(con) & CON_EXTENDED;
- char *outbuf = &pbufs.outbuf[0];
+ char *outbuf = &printk_shared_pbufs.outbuf[0];
struct printk_message pmsg = {
- .pbufs = &pbufs,
+ .pbufs = &printk_shared_pbufs,
};
unsigned long flags;
@@ -2919,6 +2910,16 @@ skip:
return true;
}
+#else
+
+static bool console_emit_next_record(struct console *con, bool *handover, int cookie)
+{
+ *handover = false;
+ return false;
+}
+
+#endif /* CONFIG_PRINTK */
+
/*
* Print out all remaining records to all consoles.
*
@@ -3163,6 +3164,7 @@ void console_flush_on_panic(enum con_flush_mode mode)
if (mode == CONSOLE_REPLAY_ALL) {
struct console *c;
+ short flags;
int cookie;
u64 seq;
@@ -3170,11 +3172,17 @@ void console_flush_on_panic(enum con_flush_mode mode)
cookie = console_srcu_read_lock();
for_each_console_srcu(c) {
- /*
- * This is an unsynchronized assignment, but the
- * kernel is in "hope and pray" mode anyway.
- */
- c->seq = seq;
+ flags = console_srcu_read_flags(c);
+
+ if (flags & CON_NBCON) {
+ nbcon_seq_force(c, seq);
+ } else {
+ /*
+ * This is an unsynchronized assignment. On
+ * panic legacy consoles are only best effort.
+ */
+ c->seq = seq;
+ }
}
console_srcu_read_unlock(cookie);
}
@@ -3326,11 +3334,6 @@ static void try_enable_default_console(struct console *newcon)
newcon->flags |= CON_CONSDEV;
}
-#define con_printk(lvl, con, fmt, ...) \
- printk(lvl pr_fmt("%sconsole [%s%d] " fmt), \
- (con->flags & CON_BOOT) ? "boot" : "", \
- con->name, con->index, ##__VA_ARGS__)
-
static void console_init_seq(struct console *newcon, bool bootcon_registered)
{
struct console *con;
@@ -3444,6 +3447,15 @@ void register_console(struct console *newcon)
goto unlock;
}
+ if (newcon->flags & CON_NBCON) {
+ /*
+ * Ensure the nbcon console buffers can be allocated
+ * before modifying any global data.
+ */
+ if (!nbcon_alloc(newcon))
+ goto unlock;
+ }
+
/*
* See if we want to enable this console driver by default.
*
@@ -3471,8 +3483,11 @@ void register_console(struct console *newcon)
err = try_enable_preferred_console(newcon, false);
/* printk() messages are not printed to the Braille console. */
- if (err || newcon->flags & CON_BRL)
+ if (err || newcon->flags & CON_BRL) {
+ if (newcon->flags & CON_NBCON)
+ nbcon_free(newcon);
goto unlock;
+ }
/*
* If we have a bootconsole, and are switching to a real console,
@@ -3488,6 +3503,9 @@ void register_console(struct console *newcon)
newcon->dropped = 0;
console_init_seq(newcon, bootcon_registered);
+ if (newcon->flags & CON_NBCON)
+ nbcon_init(newcon);
+
/*
* Put this console in the list - keep the
* preferred driver at the head of the list.
@@ -3579,6 +3597,9 @@ static int unregister_console_locked(struct console *console)
*/
synchronize_srcu(&console_srcu);
+ if (console->flags & CON_NBCON)
+ nbcon_free(console);
+
console_sysfs_notify();
if (console->exit)
@@ -3728,10 +3749,12 @@ late_initcall(printk_late_init);
/* If @con is specified, only wait for that console. Otherwise wait for all. */
static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progress)
{
- int remaining = timeout_ms;
+ unsigned long timeout_jiffies = msecs_to_jiffies(timeout_ms);
+ unsigned long remaining_jiffies = timeout_jiffies;
struct console *c;
u64 last_diff = 0;
u64 printk_seq;
+ short flags;
int cookie;
u64 diff;
u64 seq;
@@ -3745,6 +3768,9 @@ static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progre
console_unlock();
for (;;) {
+ unsigned long begin_jiffies;
+ unsigned long slept_jiffies;
+
diff = 0;
/*
@@ -3759,6 +3785,9 @@ static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progre
for_each_console_srcu(c) {
if (con && con != c)
continue;
+
+ flags = console_srcu_read_flags(c);
+
/*
* If consoles are not usable, it cannot be expected
* that they make forward progress, so only increment
@@ -3766,31 +3795,33 @@ static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progre
*/
if (!console_is_usable(c))
continue;
- printk_seq = c->seq;
+
+ if (flags & CON_NBCON) {
+ printk_seq = nbcon_seq_read(c);
+ } else {
+ printk_seq = c->seq;
+ }
+
if (printk_seq < seq)
diff += seq - printk_seq;
}
console_srcu_read_unlock(cookie);
if (diff != last_diff && reset_on_progress)
- remaining = timeout_ms;
+ remaining_jiffies = timeout_jiffies;
console_unlock();
/* Note: @diff is 0 if there are no usable consoles. */
- if (diff == 0 || remaining == 0)
+ if (diff == 0 || remaining_jiffies == 0)
break;
- if (remaining < 0) {
- /* no timeout limit */
- msleep(100);
- } else if (remaining < 100) {
- msleep(remaining);
- remaining = 0;
- } else {
- msleep(100);
- remaining -= 100;
- }
+ /* msleep(1) might sleep much longer. Check time by jiffies. */
+ begin_jiffies = jiffies;
+ msleep(1);
+ slept_jiffies = jiffies - begin_jiffies;
+
+ remaining_jiffies -= min(slept_jiffies, remaining_jiffies);
last_diff = diff;
}
@@ -4194,7 +4225,6 @@ bool kmsg_dump_get_buffer(struct kmsg_dump_iter *iter, bool syslog,
prb_rec_init_rd(&r, &info, buf, size);
- len = 0;
prb_for_each_record(seq, prb, seq, &r) {
if (r.info->seq >= iter->next_seq)
break;
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 443057bee87c..d8b5e13a2229 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -59,7 +59,7 @@ int ptrace_access_vm(struct task_struct *tsk, unsigned long addr,
return 0;
}
- ret = __access_remote_vm(mm, addr, buf, len, gup_flags);
+ ret = access_remote_vm(mm, addr, buf, len, gup_flags);
mmput(mm);
return ret;
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index 0d866eaa4cc8..b531c33e9545 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -500,6 +500,7 @@ static inline void rcu_expedite_gp(void) { }
static inline void rcu_unexpedite_gp(void) { }
static inline void rcu_async_hurry(void) { }
static inline void rcu_async_relax(void) { }
+static inline bool rcu_cpu_online(int cpu) { return true; }
#else /* #ifdef CONFIG_TINY_RCU */
bool rcu_gp_is_normal(void); /* Internal RCU use. */
bool rcu_gp_is_expedited(void); /* Internal RCU use. */
@@ -509,6 +510,7 @@ void rcu_unexpedite_gp(void);
void rcu_async_hurry(void);
void rcu_async_relax(void);
void rcupdate_announce_bootup_oddness(void);
+bool rcu_cpu_online(int cpu);
#ifdef CONFIG_TASKS_RCU_GENERIC
void show_rcu_tasks_gp_kthreads(void);
#else /* #ifdef CONFIG_TASKS_RCU_GENERIC */
diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h
index 1fa631168594..f54d5782eca0 100644
--- a/kernel/rcu/tasks.h
+++ b/kernel/rcu/tasks.h
@@ -895,10 +895,36 @@ static void rcu_tasks_pregp_step(struct list_head *hop)
synchronize_rcu();
}
+/* Check for quiescent states since the pregp's synchronize_rcu() */
+static bool rcu_tasks_is_holdout(struct task_struct *t)
+{
+ int cpu;
+
+ /* Has the task been seen voluntarily sleeping? */
+ if (!READ_ONCE(t->on_rq))
+ return false;
+
+ /*
+ * Idle tasks (or idle injection) within the idle loop are RCU-tasks
+ * quiescent states. But CPU boot code performed by the idle task
+ * isn't a quiescent state.
+ */
+ if (is_idle_task(t))
+ return false;
+
+ cpu = task_cpu(t);
+
+ /* Idle tasks on offline CPUs are RCU-tasks quiescent states. */
+ if (t == idle_task(cpu) && !rcu_cpu_online(cpu))
+ return false;
+
+ return true;
+}
+
/* Per-task initial processing. */
static void rcu_tasks_pertask(struct task_struct *t, struct list_head *hop)
{
- if (t != current && READ_ONCE(t->on_rq) && !is_idle_task(t)) {
+ if (t != current && rcu_tasks_is_holdout(t)) {
get_task_struct(t);
t->rcu_tasks_nvcsw = READ_ONCE(t->nvcsw);
WRITE_ONCE(t->rcu_tasks_holdout, true);
@@ -947,7 +973,7 @@ static void check_holdout_task(struct task_struct *t,
if (!READ_ONCE(t->rcu_tasks_holdout) ||
t->rcu_tasks_nvcsw != READ_ONCE(t->nvcsw) ||
- !READ_ONCE(t->on_rq) ||
+ !rcu_tasks_is_holdout(t) ||
(IS_ENABLED(CONFIG_NO_HZ_FULL) &&
!is_idle_task(t) && t->rcu_tasks_idle_cpu >= 0)) {
WRITE_ONCE(t->rcu_tasks_holdout, false);
@@ -1525,7 +1551,7 @@ static int trc_inspect_reader(struct task_struct *t, void *bhp_in)
} else {
// The task is not running, so C-language access is safe.
nesting = t->trc_reader_nesting;
- WARN_ON_ONCE(ofl && task_curr(t) && !is_idle_task(t));
+ WARN_ON_ONCE(ofl && task_curr(t) && (t != idle_task(task_cpu(t))));
if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB) && ofl)
n_heavy_reader_ofl_updates++;
}
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 700524726079..3ac3c846105f 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -755,14 +755,19 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp)
}
/*
- * Return true if the specified CPU has passed through a quiescent
- * state by virtue of being in or having passed through an dynticks
- * idle state since the last call to dyntick_save_progress_counter()
- * for this same CPU, or by virtue of having been offline.
+ * Returns positive if the specified CPU has passed through a quiescent state
+ * by virtue of being in or having passed through an dynticks idle state since
+ * the last call to dyntick_save_progress_counter() for this same CPU, or by
+ * virtue of having been offline.
+ *
+ * Returns negative if the specified CPU needs a force resched.
+ *
+ * Returns zero otherwise.
*/
static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
{
unsigned long jtsq;
+ int ret = 0;
struct rcu_node *rnp = rdp->mynode;
/*
@@ -848,8 +853,8 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
(time_after(jiffies, READ_ONCE(rdp->last_fqs_resched) + jtsq * 3) ||
rcu_state.cbovld)) {
WRITE_ONCE(rdp->rcu_urgent_qs, true);
- resched_cpu(rdp->cpu);
WRITE_ONCE(rdp->last_fqs_resched, jiffies);
+ ret = -1;
}
/*
@@ -862,8 +867,8 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
if (time_after(jiffies, rcu_state.jiffies_resched)) {
if (time_after(jiffies,
READ_ONCE(rdp->last_fqs_resched) + jtsq)) {
- resched_cpu(rdp->cpu);
WRITE_ONCE(rdp->last_fqs_resched, jiffies);
+ ret = -1;
}
if (IS_ENABLED(CONFIG_IRQ_WORK) &&
!rdp->rcu_iw_pending && rdp->rcu_iw_gp_seq != rnp->gp_seq &&
@@ -892,7 +897,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
}
}
- return 0;
+ return ret;
}
/* Trace-event wrapper function for trace_rcu_future_grace_period. */
@@ -2271,15 +2276,15 @@ static void force_qs_rnp(int (*f)(struct rcu_data *rdp))
{
int cpu;
unsigned long flags;
- unsigned long mask;
- struct rcu_data *rdp;
struct rcu_node *rnp;
rcu_state.cbovld = rcu_state.cbovldnext;
rcu_state.cbovldnext = false;
rcu_for_each_leaf_node(rnp) {
+ unsigned long mask = 0;
+ unsigned long rsmask = 0;
+
cond_resched_tasks_rcu_qs();
- mask = 0;
raw_spin_lock_irqsave_rcu_node(rnp, flags);
rcu_state.cbovldnext |= !!rnp->cbovldmask;
if (rnp->qsmask == 0) {
@@ -2297,11 +2302,17 @@ static void force_qs_rnp(int (*f)(struct rcu_data *rdp))
continue;
}
for_each_leaf_node_cpu_mask(rnp, cpu, rnp->qsmask) {
+ struct rcu_data *rdp;
+ int ret;
+
rdp = per_cpu_ptr(&rcu_data, cpu);
- if (f(rdp)) {
+ ret = f(rdp);
+ if (ret > 0) {
mask |= rdp->grpmask;
rcu_disable_urgency_upon_qs(rdp);
}
+ if (ret < 0)
+ rsmask |= rdp->grpmask;
}
if (mask != 0) {
/* Idle/offline CPUs, report (releases rnp->lock). */
@@ -2310,6 +2321,9 @@ static void force_qs_rnp(int (*f)(struct rcu_data *rdp))
/* Nothing to do here, so just drop the lock. */
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
}
+
+ for_each_leaf_node_cpu_mask(rnp, cpu, rsmask)
+ resched_cpu(cpu);
}
}
@@ -3471,13 +3485,6 @@ kfree_rcu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
return freed == 0 ? SHRINK_STOP : freed;
}
-static struct shrinker kfree_rcu_shrinker = {
- .count_objects = kfree_rcu_shrink_count,
- .scan_objects = kfree_rcu_shrink_scan,
- .batch = 0,
- .seeks = DEFAULT_SEEKS,
-};
-
void __init kfree_rcu_scheduler_running(void)
{
int cpu;
@@ -4202,6 +4209,13 @@ static bool rcu_rdp_cpu_online(struct rcu_data *rdp)
return !!(rdp->grpmask & rcu_rnp_online_cpus(rdp->mynode));
}
+bool rcu_cpu_online(int cpu)
+{
+ struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
+
+ return rcu_rdp_cpu_online(rdp);
+}
+
#if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU)
/*
@@ -5029,6 +5043,7 @@ static void __init kfree_rcu_batch_init(void)
{
int cpu;
int i, j;
+ struct shrinker *kfree_rcu_shrinker;
/* Clamp it to [0:100] seconds interval. */
if (rcu_delay_page_cache_fill_msec < 0 ||
@@ -5060,8 +5075,17 @@ static void __init kfree_rcu_batch_init(void)
INIT_DELAYED_WORK(&krcp->page_cache_work, fill_page_cache_func);
krcp->initialized = true;
}
- if (register_shrinker(&kfree_rcu_shrinker, "rcu-kfree"))
- pr_err("Failed to register kfree_rcu() shrinker!\n");
+
+ kfree_rcu_shrinker = shrinker_alloc(0, "rcu-kfree");
+ if (!kfree_rcu_shrinker) {
+ pr_err("Failed to allocate kfree_rcu() shrinker!\n");
+ return;
+ }
+
+ kfree_rcu_shrinker->count_objects = kfree_rcu_shrink_count;
+ kfree_rcu_shrinker->scan_objects = kfree_rcu_shrink_scan;
+
+ shrinker_register(kfree_rcu_shrinker);
}
void __init rcu_init(void)
diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h
index 5598212d1f27..4efbf7333d4e 100644
--- a/kernel/rcu/tree_nocb.h
+++ b/kernel/rcu/tree_nocb.h
@@ -1396,13 +1396,6 @@ lazy_rcu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
return count ? count : SHRINK_STOP;
}
-
-static struct shrinker lazy_rcu_shrinker = {
- .count_objects = lazy_rcu_shrink_count,
- .scan_objects = lazy_rcu_shrink_scan,
- .batch = 0,
- .seeks = DEFAULT_SEEKS,
-};
#endif // #ifdef CONFIG_RCU_LAZY
void __init rcu_init_nohz(void)
@@ -1410,6 +1403,7 @@ void __init rcu_init_nohz(void)
int cpu;
struct rcu_data *rdp;
const struct cpumask *cpumask = NULL;
+ struct shrinker * __maybe_unused lazy_rcu_shrinker;
#if defined(CONFIG_NO_HZ_FULL)
if (tick_nohz_full_running && !cpumask_empty(tick_nohz_full_mask))
@@ -1436,8 +1430,15 @@ void __init rcu_init_nohz(void)
return;
#ifdef CONFIG_RCU_LAZY
- if (register_shrinker(&lazy_rcu_shrinker, "rcu-lazy"))
- pr_err("Failed to register lazy_rcu shrinker!\n");
+ lazy_rcu_shrinker = shrinker_alloc(0, "rcu-lazy");
+ if (!lazy_rcu_shrinker) {
+ pr_err("Failed to allocate lazy_rcu shrinker!\n");
+ } else {
+ lazy_rcu_shrinker->count_objects = lazy_rcu_shrink_count;
+ lazy_rcu_shrinker->scan_objects = lazy_rcu_shrink_scan;
+
+ shrinker_register(lazy_rcu_shrinker);
+ }
#endif // #ifdef CONFIG_RCU_LAZY
if (!cpumask_subset(rcu_nocb_mask, cpu_possible_mask)) {
diff --git a/kernel/reboot.c b/kernel/reboot.c
index 3bba88c7ffc6..395a0ea3c7a8 100644
--- a/kernel/reboot.c
+++ b/kernel/reboot.c
@@ -55,6 +55,7 @@ struct sys_off_handler {
enum sys_off_mode mode;
bool blocking;
void *list;
+ struct device *dev;
};
/*
@@ -74,6 +75,7 @@ void __weak (*pm_power_off)(void);
void emergency_restart(void)
{
kmsg_dump(KMSG_DUMP_EMERG);
+ system_state = SYSTEM_RESTART;
machine_emergency_restart();
}
EXPORT_SYMBOL_GPL(emergency_restart);
@@ -323,6 +325,7 @@ static int sys_off_notify(struct notifier_block *nb,
data.cb_data = handler->cb_data;
data.mode = mode;
data.cmd = cmd;
+ data.dev = handler->dev;
return handler->sys_off_cb(&data);
}
@@ -510,6 +513,7 @@ int devm_register_sys_off_handler(struct device *dev,
handler = register_sys_off_handler(mode, priority, callback, cb_data);
if (IS_ERR(handler))
return PTR_ERR(handler);
+ handler->dev = dev;
return devm_add_action_or_reset(dev, devm_unregister_sys_off_handler,
handler);
diff --git a/kernel/resource.c b/kernel/resource.c
index b1763b2fd7ef..866ef3663a0b 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -56,33 +56,17 @@ struct resource_constraint {
static DEFINE_RWLOCK(resource_lock);
-static struct resource *next_resource(struct resource *p)
+static struct resource *next_resource(struct resource *p, bool skip_children)
{
- if (p->child)
+ if (!skip_children && p->child)
return p->child;
while (!p->sibling && p->parent)
p = p->parent;
return p->sibling;
}
-static struct resource *next_resource_skip_children(struct resource *p)
-{
- while (!p->sibling && p->parent)
- p = p->parent;
- return p->sibling;
-}
-
#define for_each_resource(_root, _p, _skip_children) \
- for ((_p) = (_root)->child; (_p); \
- (_p) = (_skip_children) ? next_resource_skip_children(_p) : \
- next_resource(_p))
-
-static void *r_next(struct seq_file *m, void *v, loff_t *pos)
-{
- struct resource *p = v;
- (*pos)++;
- return (void *)next_resource(p);
-}
+ for ((_p) = (_root)->child; (_p); (_p) = next_resource(_p, _skip_children))
#ifdef CONFIG_PROC_FS
@@ -91,14 +75,28 @@ enum { MAX_IORES_LEVEL = 5 };
static void *r_start(struct seq_file *m, loff_t *pos)
__acquires(resource_lock)
{
- struct resource *p = pde_data(file_inode(m->file));
- loff_t l = 0;
+ struct resource *root = pde_data(file_inode(m->file));
+ struct resource *p;
+ loff_t l = *pos;
+
read_lock(&resource_lock);
- for (p = p->child; p && l < *pos; p = r_next(m, p, &l))
- ;
+ for_each_resource(root, p, false) {
+ if (l-- == 0)
+ break;
+ }
+
return p;
}
+static void *r_next(struct seq_file *m, void *v, loff_t *pos)
+{
+ struct resource *p = v;
+
+ (*pos)++;
+
+ return (void *)next_resource(p, false);
+}
+
static void r_stop(struct seq_file *m, void *v)
__releases(resource_lock)
{
@@ -336,7 +334,7 @@ static int find_next_iomem_res(resource_size_t start, resource_size_t end,
read_lock(&resource_lock);
- for (p = iomem_resource.child; p; p = next_resource(p)) {
+ for_each_resource(&iomem_resource, p, false) {
/* If we passed the resource we are looking for, stop */
if (p->start > end) {
p = NULL;
@@ -1641,13 +1639,12 @@ __setup("reserve=", reserve_setup);
*/
int iomem_map_sanity_check(resource_size_t addr, unsigned long size)
{
- struct resource *p = &iomem_resource;
resource_size_t end = addr + size - 1;
+ struct resource *p;
int err = 0;
- loff_t l;
read_lock(&resource_lock);
- for (p = p->child; p ; p = r_next(NULL, p, &l)) {
+ for_each_resource(&iomem_resource, p, false) {
/*
* We can probably skip the resources without
* IORESOURCE_IO attribute?
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 3d7e2d702699..a708d225c28e 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -9869,7 +9869,7 @@ struct task_group root_task_group;
LIST_HEAD(task_groups);
/* Cacheline aligned slab cache for task_group */
-static struct kmem_cache *task_group_cache __read_mostly;
+static struct kmem_cache *task_group_cache __ro_after_init;
#endif
void __init sched_init(void)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 8767988242ee..2048138ce54b 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1759,12 +1759,12 @@ static bool pgdat_free_space_enough(struct pglist_data *pgdat)
* The smaller the hint page fault latency, the higher the possibility
* for the page to be hot.
*/
-static int numa_hint_fault_latency(struct page *page)
+static int numa_hint_fault_latency(struct folio *folio)
{
int last_time, time;
time = jiffies_to_msecs(jiffies);
- last_time = xchg_page_access_time(page, time);
+ last_time = folio_xchg_access_time(folio, time);
return (time - last_time) & PAGE_ACCESS_TIME_MASK;
}
@@ -1821,7 +1821,7 @@ static void numa_promotion_adjust_threshold(struct pglist_data *pgdat,
}
}
-bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
+bool should_numa_migrate_memory(struct task_struct *p, struct folio *folio,
int src_nid, int dst_cpu)
{
struct numa_group *ng = deref_curr_numa_group(p);
@@ -1851,16 +1851,16 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
numa_promotion_adjust_threshold(pgdat, rate_limit, def_th);
th = pgdat->nbp_threshold ? : def_th;
- latency = numa_hint_fault_latency(page);
+ latency = numa_hint_fault_latency(folio);
if (latency >= th)
return false;
return !numa_promotion_rate_limit(pgdat, rate_limit,
- thp_nr_pages(page));
+ folio_nr_pages(folio));
}
this_cpupid = cpu_pid_to_cpupid(dst_cpu, current->pid);
- last_cpupid = page_cpupid_xchg_last(page, this_cpupid);
+ last_cpupid = folio_xchg_last_cpupid(folio, this_cpupid);
if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING) &&
!node_is_toptier(src_nid) && !cpupid_valid(last_cpupid))
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index 802d98cf2de3..51e38f5f4701 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -58,13 +58,6 @@ void remove_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry
EXPORT_SYMBOL(remove_wait_queue);
/*
- * Scan threshold to break wait queue walk.
- * This allows a waker to take a break from holding the
- * wait queue lock during the wait queue walk.
- */
-#define WAITQUEUE_WALK_BREAK_CNT 64
-
-/*
* The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just
* wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve
* number) then we wake that number of exclusive tasks, and potentially all
@@ -78,21 +71,13 @@ EXPORT_SYMBOL(remove_wait_queue);
* zero in this (rare) case, and we handle it by continuing to scan the queue.
*/
static int __wake_up_common(struct wait_queue_head *wq_head, unsigned int mode,
- int nr_exclusive, int wake_flags, void *key,
- wait_queue_entry_t *bookmark)
+ int nr_exclusive, int wake_flags, void *key)
{
wait_queue_entry_t *curr, *next;
- int cnt = 0;
lockdep_assert_held(&wq_head->lock);
- if (bookmark && (bookmark->flags & WQ_FLAG_BOOKMARK)) {
- curr = list_next_entry(bookmark, entry);
-
- list_del(&bookmark->entry);
- bookmark->flags = 0;
- } else
- curr = list_first_entry(&wq_head->head, wait_queue_entry_t, entry);
+ curr = list_first_entry(&wq_head->head, wait_queue_entry_t, entry);
if (&curr->entry == &wq_head->head)
return nr_exclusive;
@@ -101,21 +86,11 @@ static int __wake_up_common(struct wait_queue_head *wq_head, unsigned int mode,
unsigned flags = curr->flags;
int ret;
- if (flags & WQ_FLAG_BOOKMARK)
- continue;
-
ret = curr->func(curr, mode, wake_flags, key);
if (ret < 0)
break;
if (ret && (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
break;
-
- if (bookmark && (++cnt > WAITQUEUE_WALK_BREAK_CNT) &&
- (&next->entry != &wq_head->head)) {
- bookmark->flags = WQ_FLAG_BOOKMARK;
- list_add_tail(&bookmark->entry, &next->entry);
- break;
- }
}
return nr_exclusive;
@@ -125,20 +100,12 @@ static int __wake_up_common_lock(struct wait_queue_head *wq_head, unsigned int m
int nr_exclusive, int wake_flags, void *key)
{
unsigned long flags;
- wait_queue_entry_t bookmark;
- int remaining = nr_exclusive;
+ int remaining;
- bookmark.flags = 0;
- bookmark.private = NULL;
- bookmark.func = NULL;
- INIT_LIST_HEAD(&bookmark.entry);
-
- do {
- spin_lock_irqsave(&wq_head->lock, flags);
- remaining = __wake_up_common(wq_head, mode, remaining,
- wake_flags, key, &bookmark);
- spin_unlock_irqrestore(&wq_head->lock, flags);
- } while (bookmark.flags & WQ_FLAG_BOOKMARK);
+ spin_lock_irqsave(&wq_head->lock, flags);
+ remaining = __wake_up_common(wq_head, mode, nr_exclusive, wake_flags,
+ key);
+ spin_unlock_irqrestore(&wq_head->lock, flags);
return nr_exclusive - remaining;
}
@@ -171,23 +138,16 @@ void __wake_up_on_current_cpu(struct wait_queue_head *wq_head, unsigned int mode
*/
void __wake_up_locked(struct wait_queue_head *wq_head, unsigned int mode, int nr)
{
- __wake_up_common(wq_head, mode, nr, 0, NULL, NULL);
+ __wake_up_common(wq_head, mode, nr, 0, NULL);
}
EXPORT_SYMBOL_GPL(__wake_up_locked);
void __wake_up_locked_key(struct wait_queue_head *wq_head, unsigned int mode, void *key)
{
- __wake_up_common(wq_head, mode, 1, 0, key, NULL);
+ __wake_up_common(wq_head, mode, 1, 0, key);
}
EXPORT_SYMBOL_GPL(__wake_up_locked_key);
-void __wake_up_locked_key_bookmark(struct wait_queue_head *wq_head,
- unsigned int mode, void *key, wait_queue_entry_t *bookmark)
-{
- __wake_up_common(wq_head, mode, 1, 0, key, bookmark);
-}
-EXPORT_SYMBOL_GPL(__wake_up_locked_key_bookmark);
-
/**
* __wake_up_sync_key - wake up threads blocked on a waitqueue.
* @wq_head: the waitqueue
@@ -233,7 +193,7 @@ EXPORT_SYMBOL_GPL(__wake_up_sync_key);
void __wake_up_locked_sync_key(struct wait_queue_head *wq_head,
unsigned int mode, void *key)
{
- __wake_up_common(wq_head, mode, 1, WF_SYNC, key, NULL);
+ __wake_up_common(wq_head, mode, 1, WF_SYNC, key);
}
EXPORT_SYMBOL_GPL(__wake_up_locked_sync_key);
diff --git a/kernel/signal.c b/kernel/signal.c
index 83fcbaf0e82d..47a7602dfe8d 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -415,7 +415,7 @@ __sigqueue_alloc(int sig, struct task_struct *t, gfp_t gfp_flags,
int override_rlimit, const unsigned int sigqueue_flags)
{
struct sigqueue *q = NULL;
- struct ucounts *ucounts = NULL;
+ struct ucounts *ucounts;
long sigpending;
/*
@@ -1058,12 +1058,11 @@ static void complete_signal(int sig, struct task_struct *p, enum pid_type type)
signal->flags = SIGNAL_GROUP_EXIT;
signal->group_exit_code = sig;
signal->group_stop_count = 0;
- t = p;
- do {
+ __for_each_thread(signal, t) {
task_clear_jobctl_pending(t, JOBCTL_PENDING_MASK);
sigaddset(&t->pending.signal, SIGKILL);
signal_wake_up(t, 1);
- } while_each_thread(p, t);
+ }
return;
}
}
@@ -1471,16 +1470,21 @@ int group_send_sig_info(int sig, struct kernel_siginfo *info,
int __kill_pgrp_info(int sig, struct kernel_siginfo *info, struct pid *pgrp)
{
struct task_struct *p = NULL;
- int retval, success;
+ int ret = -ESRCH;
- success = 0;
- retval = -ESRCH;
do_each_pid_task(pgrp, PIDTYPE_PGID, p) {
int err = group_send_sig_info(sig, info, p, PIDTYPE_PGID);
- success |= !err;
- retval = err;
+ /*
+ * If group_send_sig_info() succeeds at least once ret
+ * becomes 0 and after that the code below has no effect.
+ * Otherwise we return the last err or -ESRCH if this
+ * process group is empty.
+ */
+ if (ret)
+ ret = err;
} while_each_pid_task(pgrp, PIDTYPE_PGID, p);
- return success ? 0 : retval;
+
+ return ret;
}
int kill_pid_info(int sig, struct kernel_siginfo *info, struct pid *pid)
diff --git a/kernel/smpboot.c b/kernel/smpboot.c
index f47d8f375946..1992b62e980b 100644
--- a/kernel/smpboot.c
+++ b/kernel/smpboot.c
@@ -272,8 +272,7 @@ static void smpboot_destroy_threads(struct smp_hotplug_thread *ht)
struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu);
if (tsk) {
- kthread_stop(tsk);
- put_task_struct(tsk);
+ kthread_stop_put(tsk);
*per_cpu_ptr(ht->store, cpu) = NULL;
}
}
diff --git a/kernel/sys.c b/kernel/sys.c
index 2410e3999ebe..420d9cb9cc8e 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1786,6 +1786,7 @@ void getrusage(struct task_struct *p, int who, struct rusage *r)
unsigned long flags;
u64 tgutime, tgstime, utime, stime;
unsigned long maxrss = 0;
+ struct signal_struct *sig = p->signal;
memset((char *)r, 0, sizeof (*r));
utime = stime = 0;
@@ -1793,7 +1794,7 @@ void getrusage(struct task_struct *p, int who, struct rusage *r)
if (who == RUSAGE_THREAD) {
task_cputime_adjusted(current, &utime, &stime);
accumulate_thread_rusage(p, r);
- maxrss = p->signal->maxrss;
+ maxrss = sig->maxrss;
goto out;
}
@@ -1803,15 +1804,15 @@ void getrusage(struct task_struct *p, int who, struct rusage *r)
switch (who) {
case RUSAGE_BOTH:
case RUSAGE_CHILDREN:
- utime = p->signal->cutime;
- stime = p->signal->cstime;
- r->ru_nvcsw = p->signal->cnvcsw;
- r->ru_nivcsw = p->signal->cnivcsw;
- r->ru_minflt = p->signal->cmin_flt;
- r->ru_majflt = p->signal->cmaj_flt;
- r->ru_inblock = p->signal->cinblock;
- r->ru_oublock = p->signal->coublock;
- maxrss = p->signal->cmaxrss;
+ utime = sig->cutime;
+ stime = sig->cstime;
+ r->ru_nvcsw = sig->cnvcsw;
+ r->ru_nivcsw = sig->cnivcsw;
+ r->ru_minflt = sig->cmin_flt;
+ r->ru_majflt = sig->cmaj_flt;
+ r->ru_inblock = sig->cinblock;
+ r->ru_oublock = sig->coublock;
+ maxrss = sig->cmaxrss;
if (who == RUSAGE_CHILDREN)
break;
@@ -1821,18 +1822,16 @@ void getrusage(struct task_struct *p, int who, struct rusage *r)
thread_group_cputime_adjusted(p, &tgutime, &tgstime);
utime += tgutime;
stime += tgstime;
- r->ru_nvcsw += p->signal->nvcsw;
- r->ru_nivcsw += p->signal->nivcsw;
- r->ru_minflt += p->signal->min_flt;
- r->ru_majflt += p->signal->maj_flt;
- r->ru_inblock += p->signal->inblock;
- r->ru_oublock += p->signal->oublock;
- if (maxrss < p->signal->maxrss)
- maxrss = p->signal->maxrss;
- t = p;
- do {
+ r->ru_nvcsw += sig->nvcsw;
+ r->ru_nivcsw += sig->nivcsw;
+ r->ru_minflt += sig->min_flt;
+ r->ru_majflt += sig->maj_flt;
+ r->ru_inblock += sig->inblock;
+ r->ru_oublock += sig->oublock;
+ if (maxrss < sig->maxrss)
+ maxrss = sig->maxrss;
+ __for_each_thread(sig, t)
accumulate_thread_rusage(t, r);
- } while_each_thread(p, t);
break;
default:
@@ -2368,19 +2367,41 @@ static int prctl_set_vma(unsigned long opt, unsigned long start,
}
#endif /* CONFIG_ANON_VMA_NAME */
+static inline unsigned long get_current_mdwe(void)
+{
+ unsigned long ret = 0;
+
+ if (test_bit(MMF_HAS_MDWE, &current->mm->flags))
+ ret |= PR_MDWE_REFUSE_EXEC_GAIN;
+ if (test_bit(MMF_HAS_MDWE_NO_INHERIT, &current->mm->flags))
+ ret |= PR_MDWE_NO_INHERIT;
+
+ return ret;
+}
+
static inline int prctl_set_mdwe(unsigned long bits, unsigned long arg3,
unsigned long arg4, unsigned long arg5)
{
+ unsigned long current_bits;
+
if (arg3 || arg4 || arg5)
return -EINVAL;
- if (bits & ~(PR_MDWE_REFUSE_EXEC_GAIN))
+ if (bits & ~(PR_MDWE_REFUSE_EXEC_GAIN | PR_MDWE_NO_INHERIT))
+ return -EINVAL;
+
+ /* NO_INHERIT only makes sense with REFUSE_EXEC_GAIN */
+ if (bits & PR_MDWE_NO_INHERIT && !(bits & PR_MDWE_REFUSE_EXEC_GAIN))
return -EINVAL;
+ current_bits = get_current_mdwe();
+ if (current_bits && current_bits != bits)
+ return -EPERM; /* Cannot unset the flags */
+
+ if (bits & PR_MDWE_NO_INHERIT)
+ set_bit(MMF_HAS_MDWE_NO_INHERIT, &current->mm->flags);
if (bits & PR_MDWE_REFUSE_EXEC_GAIN)
set_bit(MMF_HAS_MDWE, &current->mm->flags);
- else if (test_bit(MMF_HAS_MDWE, &current->mm->flags))
- return -EPERM; /* Cannot unset the flag */
return 0;
}
@@ -2390,9 +2411,7 @@ static inline int prctl_get_mdwe(unsigned long arg2, unsigned long arg3,
{
if (arg2 || arg3 || arg4 || arg5)
return -EINVAL;
-
- return test_bit(MMF_HAS_MDWE, &current->mm->flags) ?
- PR_MDWE_REFUSE_EXEC_GAIN : 0;
+ return get_current_mdwe();
}
static int prctl_get_auxv(void __user *addr, unsigned long len)
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index 8ce3fa0c19e2..4354ea231fab 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -233,9 +233,8 @@ static int fill_stats_for_tgid(pid_t tgid, struct taskstats *stats)
else
memset(stats, 0, sizeof(*stats));
- tsk = first;
start_time = ktime_get_ns();
- do {
+ for_each_thread(first, tsk) {
if (tsk->exit_state)
continue;
/*
@@ -258,7 +257,7 @@ static int fill_stats_for_tgid(pid_t tgid, struct taskstats *stats)
stats->nvcsw += tsk->nvcsw;
stats->nivcsw += tsk->nivcsw;
- } while_each_thread(first, tsk);
+ }
unlock_task_sighand(first, &flags);
rc = 0;
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 515cafdb18d9..43cc47d7faaf 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -2056,7 +2056,7 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer)
retries = 10;
success = false;
while (retries--) {
- struct list_head *head_page, *prev_page, *r;
+ struct list_head *head_page, *prev_page;
struct list_head *last_page, *first_page;
struct list_head *head_page_with_bit;
struct buffer_page *hpage = rb_set_head_page(cpu_buffer);
@@ -2075,9 +2075,9 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer)
last_page->next = head_page_with_bit;
first_page->prev = prev_page;
- r = cmpxchg(&prev_page->next, head_page_with_bit, first_page);
-
- if (r == head_page_with_bit) {
+ /* caution: head_page_with_bit gets updated on cmpxchg failure */
+ if (try_cmpxchg(&prev_page->next,
+ &head_page_with_bit, first_page)) {
/*
* yay, we replaced the page pointer to our new list,
* now, we just have to update to head page's prev
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index abaaf516fcae..9aebf904ff97 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -54,12 +54,6 @@
#include "trace.h"
#include "trace_output.h"
-/*
- * On boot up, the ring buffer is set to the minimum size, so that
- * we do not waste memory on systems that are not using tracing.
- */
-bool ring_buffer_expanded;
-
#ifdef CONFIG_FTRACE_STARTUP_TEST
/*
* We need to change this state when a selftest is running.
@@ -202,7 +196,7 @@ static int __init set_cmdline_ftrace(char *str)
strscpy(bootup_tracer_buf, str, MAX_TRACER_SIZE);
default_bootup_tracer = bootup_tracer_buf;
/* We are using ftrace early, expand it */
- ring_buffer_expanded = true;
+ trace_set_ring_buffer_expanded(NULL);
return 1;
}
__setup("ftrace=", set_cmdline_ftrace);
@@ -247,7 +241,7 @@ static int __init boot_alloc_snapshot(char *str)
} else {
allocate_snapshot = true;
/* We also need the main ring buffer expanded */
- ring_buffer_expanded = true;
+ trace_set_ring_buffer_expanded(NULL);
}
return 1;
}
@@ -490,6 +484,13 @@ static struct trace_array global_trace = {
.trace_flags = TRACE_DEFAULT_FLAGS,
};
+void trace_set_ring_buffer_expanded(struct trace_array *tr)
+{
+ if (!tr)
+ tr = &global_trace;
+ tr->ring_buffer_expanded = true;
+}
+
LIST_HEAD(ftrace_trace_arrays);
int trace_array_get(struct trace_array *this_tr)
@@ -1730,15 +1731,15 @@ static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt)
{
int len;
- if (trace_seq_used(s) <= s->seq.readpos)
+ if (trace_seq_used(s) <= s->readpos)
return -EBUSY;
- len = trace_seq_used(s) - s->seq.readpos;
+ len = trace_seq_used(s) - s->readpos;
if (cnt > len)
cnt = len;
- memcpy(buf, s->buffer + s->seq.readpos, cnt);
+ memcpy(buf, s->buffer + s->readpos, cnt);
- s->seq.readpos += cnt;
+ s->readpos += cnt;
return cnt;
}
@@ -2012,7 +2013,7 @@ static int run_tracer_selftest(struct tracer *type)
#ifdef CONFIG_TRACER_MAX_TRACE
if (type->use_max_tr) {
/* If we expanded the buffers, make sure the max is expanded too */
- if (ring_buffer_expanded)
+ if (tr->ring_buffer_expanded)
ring_buffer_resize(tr->max_buffer.buffer, trace_buf_size,
RING_BUFFER_ALL_CPUS);
tr->allocated_snapshot = true;
@@ -2038,7 +2039,7 @@ static int run_tracer_selftest(struct tracer *type)
tr->allocated_snapshot = false;
/* Shrink the max buffer again */
- if (ring_buffer_expanded)
+ if (tr->ring_buffer_expanded)
ring_buffer_resize(tr->max_buffer.buffer, 1,
RING_BUFFER_ALL_CPUS);
}
@@ -3403,7 +3404,7 @@ void trace_printk_init_buffers(void)
pr_warn("**********************************************************\n");
/* Expand the buffers to set size */
- tracing_update_buffers();
+ tracing_update_buffers(&global_trace);
buffers_allocated = 1;
@@ -3827,15 +3828,6 @@ static bool trace_safe_str(struct trace_iterator *iter, const char *str,
return false;
}
-static const char *show_buffer(struct trace_seq *s)
-{
- struct seq_buf *seq = &s->seq;
-
- seq_buf_terminate(seq);
-
- return seq->buffer;
-}
-
static DEFINE_STATIC_KEY_FALSE(trace_no_verify);
static int test_can_verify_check(const char *fmt, ...)
@@ -3975,7 +3967,7 @@ void trace_check_vprintf(struct trace_iterator *iter, const char *fmt,
*/
if (WARN_ONCE(!trace_safe_str(iter, str, star, len),
"fmt: '%s' current_buffer: '%s'",
- fmt, show_buffer(&iter->seq))) {
+ fmt, seq_buf_str(&iter->seq.seq))) {
int ret;
/* Try to safely read the string */
@@ -4986,6 +4978,20 @@ int tracing_open_file_tr(struct inode *inode, struct file *filp)
if (ret)
return ret;
+ mutex_lock(&event_mutex);
+
+ /* Fail if the file is marked for removal */
+ if (file->flags & EVENT_FILE_FL_FREED) {
+ trace_array_put(file->tr);
+ ret = -ENODEV;
+ } else {
+ event_file_get(file);
+ }
+
+ mutex_unlock(&event_mutex);
+ if (ret)
+ return ret;
+
filp->private_data = inode->i_private;
return 0;
@@ -4996,6 +5002,7 @@ int tracing_release_file_tr(struct inode *inode, struct file *filp)
struct trace_event_file *file = inode->i_private;
trace_array_put(file->tr);
+ event_file_put(file);
return 0;
}
@@ -6374,7 +6381,7 @@ static int __tracing_resize_ring_buffer(struct trace_array *tr,
* we use the size that was given, and we can forget about
* expanding it later.
*/
- ring_buffer_expanded = true;
+ trace_set_ring_buffer_expanded(tr);
/* May be called before buffers are initialized */
if (!tr->array_buffer.buffer)
@@ -6452,6 +6459,7 @@ out:
/**
* tracing_update_buffers - used by tracing facility to expand ring buffers
+ * @tr: The tracing instance
*
* To save on memory when the tracing is never used on a system with it
* configured in. The ring buffers are set to a minimum size. But once
@@ -6460,13 +6468,13 @@ out:
*
* This function is to be called when a tracer is about to be used.
*/
-int tracing_update_buffers(void)
+int tracing_update_buffers(struct trace_array *tr)
{
int ret = 0;
mutex_lock(&trace_types_lock);
- if (!ring_buffer_expanded)
- ret = __tracing_resize_ring_buffer(&global_trace, trace_buf_size,
+ if (!tr->ring_buffer_expanded)
+ ret = __tracing_resize_ring_buffer(tr, trace_buf_size,
RING_BUFFER_ALL_CPUS);
mutex_unlock(&trace_types_lock);
@@ -6520,7 +6528,7 @@ int tracing_set_tracer(struct trace_array *tr, const char *buf)
mutex_lock(&trace_types_lock);
- if (!ring_buffer_expanded) {
+ if (!tr->ring_buffer_expanded) {
ret = __tracing_resize_ring_buffer(tr, trace_buf_size,
RING_BUFFER_ALL_CPUS);
if (ret < 0)
@@ -7006,7 +7014,7 @@ waitagain:
/* Now copy what we have to the user */
sret = trace_seq_to_user(&iter->seq, ubuf, cnt);
- if (iter->seq.seq.readpos >= trace_seq_used(&iter->seq))
+ if (iter->seq.readpos >= trace_seq_used(&iter->seq))
trace_seq_init(&iter->seq);
/*
@@ -7192,7 +7200,7 @@ tracing_entries_read(struct file *filp, char __user *ubuf,
}
if (buf_size_same) {
- if (!ring_buffer_expanded)
+ if (!tr->ring_buffer_expanded)
r = sprintf(buf, "%lu (expanded: %lu)\n",
size >> 10,
trace_buf_size >> 10);
@@ -7249,10 +7257,10 @@ tracing_total_entries_read(struct file *filp, char __user *ubuf,
mutex_lock(&trace_types_lock);
for_each_tracing_cpu(cpu) {
size += per_cpu_ptr(tr->array_buffer.data, cpu)->entries >> 10;
- if (!ring_buffer_expanded)
+ if (!tr->ring_buffer_expanded)
expanded_size += trace_buf_size >> 10;
}
- if (ring_buffer_expanded)
+ if (tr->ring_buffer_expanded)
r = sprintf(buf, "%lu\n", size);
else
r = sprintf(buf, "%lu (expanded: %lu)\n", size, expanded_size);
@@ -7646,7 +7654,7 @@ tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt,
unsigned long val;
int ret;
- ret = tracing_update_buffers();
+ ret = tracing_update_buffers(tr);
if (ret < 0)
return ret;
@@ -9550,6 +9558,9 @@ static struct trace_array *trace_array_create(const char *name)
if (allocate_trace_buffers(tr, trace_buf_size) < 0)
goto out_free_tr;
+ /* The ring buffer is defaultly expanded */
+ trace_set_ring_buffer_expanded(tr);
+
if (ftrace_allocate_ftrace_ops(tr) < 0)
goto out_free_tr;
@@ -9759,7 +9770,6 @@ static __init void create_trace_instances(struct dentry *d_tracer)
static void
init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer)
{
- struct trace_event_file *file;
int cpu;
trace_create_file("available_tracers", TRACE_MODE_READ, d_tracer,
@@ -9792,11 +9802,7 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer)
trace_create_file("trace_marker", 0220, d_tracer,
tr, &tracing_mark_fops);
- file = __find_event_file(tr, "ftrace", "print");
- if (file && file->ef)
- eventfs_add_file("trigger", TRACE_MODE_WRITE, file->ef,
- file, &event_trigger_fops);
- tr->trace_marker_file = file;
+ tr->trace_marker_file = __find_event_file(tr, "ftrace", "print");
trace_create_file("trace_marker_raw", 0220, d_tracer,
tr, &tracing_mark_raw_fops);
@@ -10444,7 +10450,7 @@ __init static int tracer_alloc_buffers(void)
trace_printk_init_buffers();
/* To save memory, keep the ring buffer size to its minimum */
- if (ring_buffer_expanded)
+ if (global_trace.ring_buffer_expanded)
ring_buf_size = trace_buf_size;
else
ring_buf_size = 1;
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 77debe53f07c..b7f4ea25a194 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -381,7 +381,7 @@ struct trace_array {
struct dentry *dir;
struct dentry *options;
struct dentry *percpu_dir;
- struct dentry *event_dir;
+ struct eventfs_inode *event_dir;
struct trace_options *topts;
struct list_head systems;
struct list_head events;
@@ -410,6 +410,11 @@ struct trace_array {
struct cond_snapshot *cond_snapshot;
#endif
struct trace_func_repeats __percpu *last_func_repeats;
+ /*
+ * On boot up, the ring buffer is set to the minimum size, so that
+ * we do not waste memory on systems that are not using tracing.
+ */
+ bool ring_buffer_expanded;
};
enum {
@@ -761,7 +766,7 @@ extern int DYN_FTRACE_TEST_NAME(void);
#define DYN_FTRACE_TEST_NAME2 trace_selftest_dynamic_test_func2
extern int DYN_FTRACE_TEST_NAME2(void);
-extern bool ring_buffer_expanded;
+extern void trace_set_ring_buffer_expanded(struct trace_array *tr);
extern bool tracing_selftest_disabled;
#ifdef CONFIG_FTRACE_STARTUP_TEST
@@ -1305,7 +1310,7 @@ static inline void trace_branch_disable(void)
#endif /* CONFIG_BRANCH_TRACER */
/* set ring buffers to default size if not already done so */
-int tracing_update_buffers(void);
+int tracing_update_buffers(struct trace_array *tr);
union trace_synth_field {
u8 as_u8;
@@ -1344,7 +1349,7 @@ struct trace_subsystem_dir {
struct list_head list;
struct event_subsystem *subsystem;
struct trace_array *tr;
- struct eventfs_file *ef;
+ struct eventfs_inode *ei;
int ref_count;
int nr_events;
};
@@ -1664,6 +1669,9 @@ extern void event_trigger_unregister(struct event_command *cmd_ops,
char *glob,
struct event_trigger_data *trigger_data);
+extern void event_file_get(struct trace_event_file *file);
+extern void event_file_put(struct trace_event_file *file);
+
/**
* struct event_trigger_ops - callbacks for trace event triggers
*
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index f49d6ddb6342..f29e815ca5b2 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -984,19 +984,41 @@ static void remove_subsystem(struct trace_subsystem_dir *dir)
return;
if (!--dir->nr_events) {
- eventfs_remove(dir->ef);
+ eventfs_remove_dir(dir->ei);
list_del(&dir->list);
__put_system_dir(dir);
}
}
+void event_file_get(struct trace_event_file *file)
+{
+ atomic_inc(&file->ref);
+}
+
+void event_file_put(struct trace_event_file *file)
+{
+ if (WARN_ON_ONCE(!atomic_read(&file->ref))) {
+ if (file->flags & EVENT_FILE_FL_FREED)
+ kmem_cache_free(file_cachep, file);
+ return;
+ }
+
+ if (atomic_dec_and_test(&file->ref)) {
+ /* Count should only go to zero when it is freed */
+ if (WARN_ON_ONCE(!(file->flags & EVENT_FILE_FL_FREED)))
+ return;
+ kmem_cache_free(file_cachep, file);
+ }
+}
+
static void remove_event_file_dir(struct trace_event_file *file)
{
- eventfs_remove(file->ef);
+ eventfs_remove_dir(file->ei);
list_del(&file->list);
remove_subsystem(file->system);
free_event_filter(file->filter);
- kmem_cache_free(file_cachep, file);
+ file->flags |= EVENT_FILE_FL_FREED;
+ event_file_put(file);
}
/*
@@ -1166,7 +1188,7 @@ ftrace_event_write(struct file *file, const char __user *ubuf,
if (!cnt)
return 0;
- ret = tracing_update_buffers();
+ ret = tracing_update_buffers(tr);
if (ret < 0)
return ret;
@@ -1369,7 +1391,7 @@ event_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
flags = file->flags;
mutex_unlock(&event_mutex);
- if (!file)
+ if (!file || flags & EVENT_FILE_FL_FREED)
return -ENODEV;
if (flags & EVENT_FILE_FL_ENABLED &&
@@ -1397,18 +1419,20 @@ event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
if (ret)
return ret;
- ret = tracing_update_buffers();
- if (ret < 0)
- return ret;
-
switch (val) {
case 0:
case 1:
ret = -ENODEV;
mutex_lock(&event_mutex);
file = event_file_data(filp);
- if (likely(file))
+ if (likely(file && !(file->flags & EVENT_FILE_FL_FREED))) {
+ ret = tracing_update_buffers(file->tr);
+ if (ret < 0) {
+ mutex_unlock(&event_mutex);
+ return ret;
+ }
ret = ftrace_event_enable_disable(file, val);
+ }
mutex_unlock(&event_mutex);
break;
@@ -1482,7 +1506,7 @@ system_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
if (ret)
return ret;
- ret = tracing_update_buffers();
+ ret = tracing_update_buffers(dir->tr);
if (ret < 0)
return ret;
@@ -1681,7 +1705,7 @@ event_filter_read(struct file *filp, char __user *ubuf, size_t cnt,
mutex_lock(&event_mutex);
file = event_file_data(filp);
- if (file)
+ if (file && !(file->flags & EVENT_FILE_FL_FREED))
print_event_filter(file, s);
mutex_unlock(&event_mutex);
@@ -1956,7 +1980,7 @@ event_pid_write(struct file *filp, const char __user *ubuf,
if (!cnt)
return 0;
- ret = tracing_update_buffers();
+ ret = tracing_update_buffers(tr);
if (ret < 0)
return ret;
@@ -2280,14 +2304,40 @@ create_new_subsystem(const char *name)
return NULL;
}
-static struct eventfs_file *
+static int system_callback(const char *name, umode_t *mode, void **data,
+ const struct file_operations **fops)
+{
+ if (strcmp(name, "filter") == 0)
+ *fops = &ftrace_subsystem_filter_fops;
+
+ else if (strcmp(name, "enable") == 0)
+ *fops = &ftrace_system_enable_fops;
+
+ else
+ return 0;
+
+ *mode = TRACE_MODE_WRITE;
+ return 1;
+}
+
+static struct eventfs_inode *
event_subsystem_dir(struct trace_array *tr, const char *name,
- struct trace_event_file *file, struct dentry *parent)
+ struct trace_event_file *file, struct eventfs_inode *parent)
{
struct event_subsystem *system, *iter;
struct trace_subsystem_dir *dir;
- struct eventfs_file *ef;
- int res;
+ struct eventfs_inode *ei;
+ int nr_entries;
+ static struct eventfs_entry system_entries[] = {
+ {
+ .name = "filter",
+ .callback = system_callback,
+ },
+ {
+ .name = "enable",
+ .callback = system_callback,
+ }
+ };
/* First see if we did not already create this dir */
list_for_each_entry(dir, &tr->systems, list) {
@@ -2295,7 +2345,7 @@ event_subsystem_dir(struct trace_array *tr, const char *name,
if (strcmp(system->name, name) == 0) {
dir->nr_events++;
file->system = dir;
- return dir->ef;
+ return dir->ei;
}
}
@@ -2319,39 +2369,29 @@ event_subsystem_dir(struct trace_array *tr, const char *name,
} else
__get_system(system);
- ef = eventfs_add_subsystem_dir(name, parent);
- if (IS_ERR(ef)) {
+ /* ftrace only has directories no files */
+ if (strcmp(name, "ftrace") == 0)
+ nr_entries = 0;
+ else
+ nr_entries = ARRAY_SIZE(system_entries);
+
+ ei = eventfs_create_dir(name, parent, system_entries, nr_entries, dir);
+ if (IS_ERR(ei)) {
pr_warn("Failed to create system directory %s\n", name);
__put_system(system);
goto out_free;
}
- dir->ef = ef;
+ dir->ei = ei;
dir->tr = tr;
dir->ref_count = 1;
dir->nr_events = 1;
dir->subsystem = system;
file->system = dir;
- /* the ftrace system is special, do not create enable or filter files */
- if (strcmp(name, "ftrace") != 0) {
-
- res = eventfs_add_file("filter", TRACE_MODE_WRITE,
- dir->ef, dir,
- &ftrace_subsystem_filter_fops);
- if (res) {
- kfree(system->filter);
- system->filter = NULL;
- pr_warn("Could not create tracefs '%s/filter' entry\n", name);
- }
-
- eventfs_add_file("enable", TRACE_MODE_WRITE, dir->ef, dir,
- &ftrace_system_enable_fops);
- }
-
list_add(&dir->list, &tr->systems);
- return dir->ef;
+ return dir->ei;
out_free:
kfree(dir);
@@ -2400,15 +2440,134 @@ event_define_fields(struct trace_event_call *call)
return ret;
}
+static int event_callback(const char *name, umode_t *mode, void **data,
+ const struct file_operations **fops)
+{
+ struct trace_event_file *file = *data;
+ struct trace_event_call *call = file->event_call;
+
+ if (strcmp(name, "format") == 0) {
+ *mode = TRACE_MODE_READ;
+ *fops = &ftrace_event_format_fops;
+ *data = call;
+ return 1;
+ }
+
+ /*
+ * Only event directories that can be enabled should have
+ * triggers or filters, with the exception of the "print"
+ * event that can have a "trigger" file.
+ */
+ if (!(call->flags & TRACE_EVENT_FL_IGNORE_ENABLE)) {
+ if (call->class->reg && strcmp(name, "enable") == 0) {
+ *mode = TRACE_MODE_WRITE;
+ *fops = &ftrace_enable_fops;
+ return 1;
+ }
+
+ if (strcmp(name, "filter") == 0) {
+ *mode = TRACE_MODE_WRITE;
+ *fops = &ftrace_event_filter_fops;
+ return 1;
+ }
+ }
+
+ if (!(call->flags & TRACE_EVENT_FL_IGNORE_ENABLE) ||
+ strcmp(trace_event_name(call), "print") == 0) {
+ if (strcmp(name, "trigger") == 0) {
+ *mode = TRACE_MODE_WRITE;
+ *fops = &event_trigger_fops;
+ return 1;
+ }
+ }
+
+#ifdef CONFIG_PERF_EVENTS
+ if (call->event.type && call->class->reg &&
+ strcmp(name, "id") == 0) {
+ *mode = TRACE_MODE_READ;
+ *data = (void *)(long)call->event.type;
+ *fops = &ftrace_event_id_fops;
+ return 1;
+ }
+#endif
+
+#ifdef CONFIG_HIST_TRIGGERS
+ if (strcmp(name, "hist") == 0) {
+ *mode = TRACE_MODE_READ;
+ *fops = &event_hist_fops;
+ return 1;
+ }
+#endif
+#ifdef CONFIG_HIST_TRIGGERS_DEBUG
+ if (strcmp(name, "hist_debug") == 0) {
+ *mode = TRACE_MODE_READ;
+ *fops = &event_hist_debug_fops;
+ return 1;
+ }
+#endif
+#ifdef CONFIG_TRACE_EVENT_INJECT
+ if (call->event.type && call->class->reg &&
+ strcmp(name, "inject") == 0) {
+ *mode = 0200;
+ *fops = &event_inject_fops;
+ return 1;
+ }
+#endif
+ return 0;
+}
+
static int
-event_create_dir(struct dentry *parent, struct trace_event_file *file)
+event_create_dir(struct eventfs_inode *parent, struct trace_event_file *file)
{
struct trace_event_call *call = file->event_call;
- struct eventfs_file *ef_subsystem = NULL;
struct trace_array *tr = file->tr;
- struct eventfs_file *ef;
+ struct eventfs_inode *e_events;
+ struct eventfs_inode *ei;
const char *name;
+ int nr_entries;
int ret;
+ static struct eventfs_entry event_entries[] = {
+ {
+ .name = "enable",
+ .callback = event_callback,
+ },
+ {
+ .name = "filter",
+ .callback = event_callback,
+ },
+ {
+ .name = "trigger",
+ .callback = event_callback,
+ },
+ {
+ .name = "format",
+ .callback = event_callback,
+ },
+#ifdef CONFIG_PERF_EVENTS
+ {
+ .name = "id",
+ .callback = event_callback,
+ },
+#endif
+#ifdef CONFIG_HIST_TRIGGERS
+ {
+ .name = "hist",
+ .callback = event_callback,
+ },
+#endif
+#ifdef CONFIG_HIST_TRIGGERS_DEBUG
+ {
+ .name = "hist_debug",
+ .callback = event_callback,
+ },
+#endif
+#ifdef CONFIG_TRACE_EVENT_INJECT
+ {
+ .name = "inject",
+ .callback = event_callback,
+ },
+#endif
+ };
/*
* If the trace point header did not define TRACE_SYSTEM
@@ -2418,29 +2577,20 @@ event_create_dir(struct dentry *parent, struct trace_event_file *file)
if (WARN_ON_ONCE(strcmp(call->class->system, TRACE_SYSTEM) == 0))
return -ENODEV;
- ef_subsystem = event_subsystem_dir(tr, call->class->system, file, parent);
- if (!ef_subsystem)
+ e_events = event_subsystem_dir(tr, call->class->system, file, parent);
+ if (!e_events)
return -ENOMEM;
+ nr_entries = ARRAY_SIZE(event_entries);
+
name = trace_event_name(call);
- ef = eventfs_add_dir(name, ef_subsystem);
- if (IS_ERR(ef)) {
+ ei = eventfs_create_dir(name, e_events, event_entries, nr_entries, file);
+ if (IS_ERR(ei)) {
pr_warn("Could not create tracefs '%s' directory\n", name);
return -1;
}
- file->ef = ef;
-
- if (call->class->reg && !(call->flags & TRACE_EVENT_FL_IGNORE_ENABLE))
- eventfs_add_file("enable", TRACE_MODE_WRITE, file->ef, file,
- &ftrace_enable_fops);
-
-#ifdef CONFIG_PERF_EVENTS
- if (call->event.type && call->class->reg)
- eventfs_add_file("id", TRACE_MODE_READ, file->ef,
- (void *)(long)call->event.type,
- &ftrace_event_id_fops);
-#endif
+ file->ei = ei;
ret = event_define_fields(call);
if (ret < 0) {
@@ -2448,35 +2598,6 @@ event_create_dir(struct dentry *parent, struct trace_event_file *file)
return ret;
}
- /*
- * Only event directories that can be enabled should have
- * triggers or filters.
- */
- if (!(call->flags & TRACE_EVENT_FL_IGNORE_ENABLE)) {
- eventfs_add_file("filter", TRACE_MODE_WRITE, file->ef,
- file, &ftrace_event_filter_fops);
-
- eventfs_add_file("trigger", TRACE_MODE_WRITE, file->ef,
- file, &event_trigger_fops);
- }
-
-#ifdef CONFIG_HIST_TRIGGERS
- eventfs_add_file("hist", TRACE_MODE_READ, file->ef, file,
- &event_hist_fops);
-#endif
-#ifdef CONFIG_HIST_TRIGGERS_DEBUG
- eventfs_add_file("hist_debug", TRACE_MODE_READ, file->ef, file,
- &event_hist_debug_fops);
-#endif
- eventfs_add_file("format", TRACE_MODE_READ, file->ef, call,
- &ftrace_event_format_fops);
-
-#ifdef CONFIG_TRACE_EVENT_INJECT
- if (call->event.type && call->class->reg)
- eventfs_add_file("inject", 0200, file->ef, file,
- &event_inject_fops);
-#endif
-
return 0;
}
@@ -2803,6 +2924,7 @@ trace_create_new_event(struct trace_event_call *call,
atomic_set(&file->tm_ref, 0);
INIT_LIST_HEAD(&file->triggers);
list_add(&file->list, &tr->events);
+ event_file_get(file);
return file;
}
@@ -2824,7 +2946,7 @@ static __init int setup_trace_triggers(char *str)
int i;
strscpy(bootup_trigger_buf, str, COMMAND_LINE_SIZE);
- ring_buffer_expanded = true;
+ trace_set_ring_buffer_expanded(NULL);
disable_tracing_selftest("running event triggers");
buf = bootup_trigger_buf;
@@ -3614,37 +3736,72 @@ static char bootup_event_buf[COMMAND_LINE_SIZE] __initdata;
static __init int setup_trace_event(char *str)
{
strscpy(bootup_event_buf, str, COMMAND_LINE_SIZE);
- ring_buffer_expanded = true;
+ trace_set_ring_buffer_expanded(NULL);
disable_tracing_selftest("running event tracing");
return 1;
}
__setup("trace_event=", setup_trace_event);
+static int events_callback(const char *name, umode_t *mode, void **data,
+ const struct file_operations **fops)
+{
+ if (strcmp(name, "enable") == 0) {
+ *mode = TRACE_MODE_WRITE;
+ *fops = &ftrace_tr_enable_fops;
+ return 1;
+ }
+
+ if (strcmp(name, "header_page") == 0)
+ *data = ring_buffer_print_page_header;
+
+ else if (strcmp(name, "header_event") == 0)
+ *data = ring_buffer_print_entry_header;
+
+ else
+ return 0;
+
+ *mode = TRACE_MODE_READ;
+ *fops = &ftrace_show_header_fops;
+ return 1;
+}
+
/* Expects to have event_mutex held when called */
static int
create_event_toplevel_files(struct dentry *parent, struct trace_array *tr)
{
- struct dentry *d_events;
+ struct eventfs_inode *e_events;
struct dentry *entry;
- int error = 0;
+ int nr_entries;
+ static struct eventfs_entry events_entries[] = {
+ {
+ .name = "enable",
+ .callback = events_callback,
+ },
+ {
+ .name = "header_page",
+ .callback = events_callback,
+ },
+ {
+ .name = "header_event",
+ .callback = events_callback,
+ },
+ };
entry = trace_create_file("set_event", TRACE_MODE_WRITE, parent,
tr, &ftrace_set_event_fops);
if (!entry)
return -ENOMEM;
- d_events = eventfs_create_events_dir("events", parent);
- if (IS_ERR(d_events)) {
+ nr_entries = ARRAY_SIZE(events_entries);
+
+ e_events = eventfs_create_events_dir("events", parent, events_entries,
+ nr_entries, tr);
+ if (IS_ERR(e_events)) {
pr_warn("Could not create tracefs 'events' directory\n");
return -ENOMEM;
}
- error = eventfs_add_events_file("enable", TRACE_MODE_WRITE, d_events,
- tr, &ftrace_tr_enable_fops);
- if (error)
- return -ENOMEM;
-
/* There are not as crucial, just warn if they are not created */
trace_create_file("set_event_pid", TRACE_MODE_WRITE, parent,
@@ -3654,16 +3811,7 @@ create_event_toplevel_files(struct dentry *parent, struct trace_array *tr)
TRACE_MODE_WRITE, parent, tr,
&ftrace_set_event_notrace_pid_fops);
- /* ring buffer internal formats */
- eventfs_add_events_file("header_page", TRACE_MODE_READ, d_events,
- ring_buffer_print_page_header,
- &ftrace_show_header_fops);
-
- eventfs_add_events_file("header_event", TRACE_MODE_READ, d_events,
- ring_buffer_print_entry_header,
- &ftrace_show_header_fops);
-
- tr->event_dir = d_events;
+ tr->event_dir = e_events;
return 0;
}
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 33264e510d16..0c611b281a5b 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -2349,6 +2349,9 @@ int apply_event_filter(struct trace_event_file *file, char *filter_string)
struct event_filter *filter = NULL;
int err;
+ if (file->flags & EVENT_FILE_FL_FREED)
+ return -ENODEV;
+
if (!strcmp(strstrip(filter_string), "0")) {
filter_disable(file);
filter = event_filter(file);
diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index d06938ae0717..1abc07fba1b9 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -774,23 +774,16 @@ static void last_cmd_set(struct trace_event_file *file, char *str)
{
const char *system = NULL, *name = NULL;
struct trace_event_call *call;
- int len;
if (!str)
return;
- /* sizeof() contains the nul byte */
- len = sizeof(HIST_PREFIX) + strlen(str);
kfree(last_cmd);
- last_cmd = kzalloc(len, GFP_KERNEL);
+
+ last_cmd = kasprintf(GFP_KERNEL, HIST_PREFIX "%s", str);
if (!last_cmd)
return;
- strcpy(last_cmd, HIST_PREFIX);
- /* Again, sizeof() contains the nul byte */
- len -= sizeof(HIST_PREFIX);
- strncat(last_cmd, str, len);
-
if (file) {
call = file->event_call;
system = call->class->system;
diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c
index 14cb275a0bab..846e02c0fb59 100644
--- a/kernel/trace/trace_events_synth.c
+++ b/kernel/trace/trace_events_synth.c
@@ -452,7 +452,7 @@ static unsigned int trace_string(struct synth_trace_event *entry,
#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
if ((unsigned long)str_val < TASK_SIZE)
- ret = strncpy_from_user_nofault(str_field, str_val, STR_VAR_LEN_MAX);
+ ret = strncpy_from_user_nofault(str_field, (const void __user *)str_val, STR_VAR_LEN_MAX);
else
#endif
ret = strncpy_from_kernel_nofault(str_field, str_val, STR_VAR_LEN_MAX);
diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c
index b87f41187c6a..9365ce407426 100644
--- a/kernel/trace/trace_events_user.c
+++ b/kernel/trace/trace_events_user.c
@@ -50,18 +50,6 @@
#define EVENT_STATUS_OTHER BIT(7)
/*
- * User register flags are not allowed yet, keep them here until we are
- * ready to expose them out to the user ABI.
- */
-enum user_reg_flag {
- /* Event will not delete upon last reference closing */
- USER_EVENT_REG_PERSIST = 1U << 0,
-
- /* This value or above is currently non-ABI */
- USER_EVENT_REG_MAX = 1U << 1,
-};
-
-/*
* Stores the system name, tables, and locks for a group of events. This
* allows isolation for events by various means.
*/
@@ -220,6 +208,17 @@ static u32 user_event_key(char *name)
return jhash(name, strlen(name), 0);
}
+static bool user_event_capable(u16 reg_flags)
+{
+ /* Persistent events require CAP_PERFMON / CAP_SYS_ADMIN */
+ if (reg_flags & USER_EVENT_REG_PERSIST) {
+ if (!perfmon_capable())
+ return false;
+ }
+
+ return true;
+}
+
static struct user_event *user_event_get(struct user_event *user)
{
refcount_inc(&user->refcnt);
@@ -1811,6 +1810,9 @@ static int user_event_free(struct dyn_event *ev)
if (!user_event_last_ref(user))
return -EBUSY;
+ if (!user_event_capable(user->reg_flags))
+ return -EPERM;
+
return destroy_user_event(user);
}
@@ -1926,10 +1928,13 @@ static int user_event_parse(struct user_event_group *group, char *name,
int argc = 0;
char **argv;
- /* User register flags are not ready yet */
- if (reg_flags != 0 || flags != NULL)
+ /* Currently don't support any text based flags */
+ if (flags != NULL)
return -EINVAL;
+ if (!user_event_capable(reg_flags))
+ return -EPERM;
+
/* Prevent dyn_event from racing */
mutex_lock(&event_mutex);
user = find_user_event(group, name, &key);
@@ -2062,6 +2067,9 @@ static int delete_user_event(struct user_event_group *group, char *name)
if (!user_event_last_ref(user))
return -EBUSY;
+ if (!user_event_capable(user->reg_flags))
+ return -EPERM;
+
return destroy_user_event(user);
}
diff --git a/kernel/trace/trace_seq.c b/kernel/trace/trace_seq.c
index bac06ee3b98b..7be97229ddf8 100644
--- a/kernel/trace/trace_seq.c
+++ b/kernel/trace/trace_seq.c
@@ -370,8 +370,12 @@ EXPORT_SYMBOL_GPL(trace_seq_path);
*/
int trace_seq_to_user(struct trace_seq *s, char __user *ubuf, int cnt)
{
+ int ret;
__trace_seq_init(s);
- return seq_buf_to_user(&s->seq, ubuf, cnt);
+ ret = seq_buf_to_user(&s->seq, ubuf, s->readpos, cnt);
+ if (ret > 0)
+ s->readpos += ret;
+ return ret;
}
EXPORT_SYMBOL_GPL(trace_seq_to_user);
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index d52a894ecf57..eabe8bcc7042 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -22,7 +22,7 @@
#include <linux/bsearch.h>
#include <linux/sort.h>
-static struct kmem_cache *user_ns_cachep __read_mostly;
+static struct kmem_cache *user_ns_cachep __ro_after_init;
static DEFINE_MUTEX(userns_state_mutex);
static bool new_idmap_permitted(const struct file *file,
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index d145305d95fe..5cd6d4e26915 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -283,6 +283,13 @@ static DEFINE_PER_CPU(struct hrtimer, watchdog_hrtimer);
static DEFINE_PER_CPU(bool, softlockup_touch_sync);
static unsigned long soft_lockup_nmi_warn;
+static int __init softlockup_panic_setup(char *str)
+{
+ softlockup_panic = simple_strtoul(str, NULL, 0);
+ return 1;
+}
+__setup("softlockup_panic=", softlockup_panic_setup);
+
static int __init nowatchdog_setup(char *str)
{
watchdog_user_enabled = 0;
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 0f682da96e1c..6e578f576a6f 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -418,21 +418,21 @@ static struct workqueue_attrs *ordered_wq_attrs[NR_STD_WORKER_POOLS];
* process context while holding a pool lock. Bounce to a dedicated kthread
* worker to avoid A-A deadlocks.
*/
-static struct kthread_worker *pwq_release_worker;
+static struct kthread_worker *pwq_release_worker __ro_after_init;
-struct workqueue_struct *system_wq __read_mostly;
+struct workqueue_struct *system_wq __ro_after_init;
EXPORT_SYMBOL(system_wq);
-struct workqueue_struct *system_highpri_wq __read_mostly;
+struct workqueue_struct *system_highpri_wq __ro_after_init;
EXPORT_SYMBOL_GPL(system_highpri_wq);
-struct workqueue_struct *system_long_wq __read_mostly;
+struct workqueue_struct *system_long_wq __ro_after_init;
EXPORT_SYMBOL_GPL(system_long_wq);
-struct workqueue_struct *system_unbound_wq __read_mostly;
+struct workqueue_struct *system_unbound_wq __ro_after_init;
EXPORT_SYMBOL_GPL(system_unbound_wq);
-struct workqueue_struct *system_freezable_wq __read_mostly;
+struct workqueue_struct *system_freezable_wq __ro_after_init;
EXPORT_SYMBOL_GPL(system_freezable_wq);
-struct workqueue_struct *system_power_efficient_wq __read_mostly;
+struct workqueue_struct *system_power_efficient_wq __ro_after_init;
EXPORT_SYMBOL_GPL(system_power_efficient_wq);
-struct workqueue_struct *system_freezable_power_efficient_wq __read_mostly;
+struct workqueue_struct *system_freezable_power_efficient_wq __ro_after_init;
EXPORT_SYMBOL_GPL(system_freezable_power_efficient_wq);
static int worker_thread(void *__worker);