diff options
Diffstat (limited to 'kernel')
76 files changed, 2794 insertions, 981 deletions
diff --git a/kernel/Makefile b/kernel/Makefile index 9a20016d4900..b74820d8b264 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -111,6 +111,7 @@ obj-$(CONFIG_CPU_PM) += cpu_pm.o obj-$(CONFIG_BPF) += bpf/ obj-$(CONFIG_KCSAN) += kcsan/ obj-$(CONFIG_SHADOW_CALL_STACK) += scs.o +obj-$(CONFIG_HAVE_STATIC_CALL_INLINE) += static_call.o obj-$(CONFIG_PERF_EVENTS) += events/ diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 78dfff6a501b..7df28a45c66b 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -1622,7 +1622,6 @@ struct bpf_iter_seq_hash_map_info { struct bpf_map *map; struct bpf_htab *htab; void *percpu_value_buf; // non-zero means percpu hash - unsigned long flags; u32 bucket_id; u32 skip_elems; }; @@ -1632,7 +1631,6 @@ bpf_hash_map_seq_find_next(struct bpf_iter_seq_hash_map_info *info, struct htab_elem *prev_elem) { const struct bpf_htab *htab = info->htab; - unsigned long flags = info->flags; u32 skip_elems = info->skip_elems; u32 bucket_id = info->bucket_id; struct hlist_nulls_head *head; @@ -1656,19 +1654,18 @@ bpf_hash_map_seq_find_next(struct bpf_iter_seq_hash_map_info *info, /* not found, unlock and go to the next bucket */ b = &htab->buckets[bucket_id++]; - htab_unlock_bucket(htab, b, flags); + rcu_read_unlock(); skip_elems = 0; } for (i = bucket_id; i < htab->n_buckets; i++) { b = &htab->buckets[i]; - flags = htab_lock_bucket(htab, b); + rcu_read_lock(); count = 0; head = &b->head; hlist_nulls_for_each_entry_rcu(elem, n, head, hash_node) { if (count >= skip_elems) { - info->flags = flags; info->bucket_id = i; info->skip_elems = count; return elem; @@ -1676,7 +1673,7 @@ bpf_hash_map_seq_find_next(struct bpf_iter_seq_hash_map_info *info, count++; } - htab_unlock_bucket(htab, b, flags); + rcu_read_unlock(); skip_elems = 0; } @@ -1754,14 +1751,10 @@ static int bpf_hash_map_seq_show(struct seq_file *seq, void *v) static void bpf_hash_map_seq_stop(struct seq_file *seq, void *v) { - struct bpf_iter_seq_hash_map_info *info = seq->private; - if (!v) (void)__bpf_hash_map_seq_show(seq, NULL); else - htab_unlock_bucket(info->htab, - &info->htab->buckets[info->bucket_id], - info->flags); + rcu_read_unlock(); } static int bpf_iter_init_hash_map(void *priv_data, diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index fb878ba3f22f..18f4969552ac 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -226,10 +226,12 @@ static void *map_seq_next(struct seq_file *m, void *v, loff_t *pos) else prev_key = key; + rcu_read_lock(); if (map->ops->map_get_next_key(map, prev_key, key)) { map_iter(m)->done = true; - return NULL; + key = NULL; } + rcu_read_unlock(); return key; } diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 1bf960aa615c..b999e7ff2583 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2634,7 +2634,7 @@ static int bpf_raw_tp_link_fill_link_info(const struct bpf_link *link, u32 ulen = info->raw_tracepoint.tp_name_len; size_t tp_len = strlen(tp_name); - if (ulen && !ubuf) + if (!ulen ^ !ubuf) return -EINVAL; info->raw_tracepoint.tp_name_len = tp_len + 1; diff --git a/kernel/bpf/sysfs_btf.c b/kernel/bpf/sysfs_btf.c index 3b495773de5a..11b3380887fa 100644 --- a/kernel/bpf/sysfs_btf.c +++ b/kernel/bpf/sysfs_btf.c @@ -30,15 +30,15 @@ static struct kobject *btf_kobj; static int __init btf_vmlinux_init(void) { - if (!__start_BTF) + bin_attr_btf_vmlinux.size = __stop_BTF - __start_BTF; + + if (!__start_BTF || bin_attr_btf_vmlinux.size == 0) return 0; btf_kobj = kobject_create_and_add("btf", kernel_kobj); if (!btf_kobj) return -ENOMEM; - bin_attr_btf_vmlinux.size = __stop_BTF - __start_BTF; - return sysfs_create_bin_file(btf_kobj, &bin_attr_btf_vmlinux); } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 47e74f09fa37..fba52d9ec8fc 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5667,8 +5667,8 @@ static void scalar32_min_max_or(struct bpf_reg_state *dst_reg, bool src_known = tnum_subreg_is_const(src_reg->var_off); bool dst_known = tnum_subreg_is_const(dst_reg->var_off); struct tnum var32_off = tnum_subreg(dst_reg->var_off); - s32 smin_val = src_reg->smin_value; - u32 umin_val = src_reg->umin_value; + s32 smin_val = src_reg->s32_min_value; + u32 umin_val = src_reg->u32_min_value; /* Assuming scalar64_min_max_or will be called so it is safe * to skip updating register for known case. @@ -5691,8 +5691,8 @@ static void scalar32_min_max_or(struct bpf_reg_state *dst_reg, /* ORing two positives gives a positive, so safe to * cast result into s64. */ - dst_reg->s32_min_value = dst_reg->umin_value; - dst_reg->s32_max_value = dst_reg->umax_value; + dst_reg->s32_min_value = dst_reg->u32_min_value; + dst_reg->s32_max_value = dst_reg->u32_max_value; } } diff --git a/kernel/cpu_pm.c b/kernel/cpu_pm.c index 44a259338e33..f7e1d0eccdbc 100644 --- a/kernel/cpu_pm.c +++ b/kernel/cpu_pm.c @@ -15,18 +15,28 @@ static ATOMIC_NOTIFIER_HEAD(cpu_pm_notifier_chain); -static int cpu_pm_notify(enum cpu_pm_event event, int nr_to_call, int *nr_calls) +static int cpu_pm_notify(enum cpu_pm_event event) { int ret; /* - * __atomic_notifier_call_chain has a RCU read critical section, which + * atomic_notifier_call_chain has a RCU read critical section, which * could be disfunctional in cpu idle. Copy RCU_NONIDLE code to let * RCU know this. */ rcu_irq_enter_irqson(); - ret = __atomic_notifier_call_chain(&cpu_pm_notifier_chain, event, NULL, - nr_to_call, nr_calls); + ret = atomic_notifier_call_chain(&cpu_pm_notifier_chain, event, NULL); + rcu_irq_exit_irqson(); + + return notifier_to_errno(ret); +} + +static int cpu_pm_notify_robust(enum cpu_pm_event event_up, enum cpu_pm_event event_down) +{ + int ret; + + rcu_irq_enter_irqson(); + ret = atomic_notifier_call_chain_robust(&cpu_pm_notifier_chain, event_up, event_down, NULL); rcu_irq_exit_irqson(); return notifier_to_errno(ret); @@ -80,18 +90,7 @@ EXPORT_SYMBOL_GPL(cpu_pm_unregister_notifier); */ int cpu_pm_enter(void) { - int nr_calls = 0; - int ret = 0; - - ret = cpu_pm_notify(CPU_PM_ENTER, -1, &nr_calls); - if (ret) - /* - * Inform listeners (nr_calls - 1) about failure of CPU PM - * PM entry who are notified earlier to prepare for it. - */ - cpu_pm_notify(CPU_PM_ENTER_FAILED, nr_calls - 1, NULL); - - return ret; + return cpu_pm_notify_robust(CPU_PM_ENTER, CPU_PM_ENTER_FAILED); } EXPORT_SYMBOL_GPL(cpu_pm_enter); @@ -109,7 +108,7 @@ EXPORT_SYMBOL_GPL(cpu_pm_enter); */ int cpu_pm_exit(void) { - return cpu_pm_notify(CPU_PM_EXIT, -1, NULL); + return cpu_pm_notify(CPU_PM_EXIT); } EXPORT_SYMBOL_GPL(cpu_pm_exit); @@ -131,18 +130,7 @@ EXPORT_SYMBOL_GPL(cpu_pm_exit); */ int cpu_cluster_pm_enter(void) { - int nr_calls = 0; - int ret = 0; - - ret = cpu_pm_notify(CPU_CLUSTER_PM_ENTER, -1, &nr_calls); - if (ret) - /* - * Inform listeners (nr_calls - 1) about failure of CPU cluster - * PM entry who are notified earlier to prepare for it. - */ - cpu_pm_notify(CPU_CLUSTER_PM_ENTER_FAILED, nr_calls - 1, NULL); - - return ret; + return cpu_pm_notify_robust(CPU_CLUSTER_PM_ENTER, CPU_CLUSTER_PM_ENTER_FAILED); } EXPORT_SYMBOL_GPL(cpu_cluster_pm_enter); @@ -163,7 +151,7 @@ EXPORT_SYMBOL_GPL(cpu_cluster_pm_enter); */ int cpu_cluster_pm_exit(void) { - return cpu_pm_notify(CPU_CLUSTER_PM_EXIT, -1, NULL); + return cpu_pm_notify(CPU_CLUSTER_PM_EXIT); } EXPORT_SYMBOL_GPL(cpu_cluster_pm_exit); diff --git a/kernel/entry/common.c b/kernel/entry/common.c index fcae019158ca..145ab11b8318 100644 --- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -60,31 +60,56 @@ static long syscall_trace_enter(struct pt_regs *regs, long syscall, return ret; } + /* Either of the above might have changed the syscall number */ + syscall = syscall_get_nr(current, regs); + if (unlikely(ti_work & _TIF_SYSCALL_TRACEPOINT)) trace_sys_enter(regs, syscall); syscall_enter_audit(regs, syscall); - /* The above might have changed the syscall number */ - return ret ? : syscall_get_nr(current, regs); + return ret ? : syscall; } -noinstr long syscall_enter_from_user_mode(struct pt_regs *regs, long syscall) +static __always_inline long +__syscall_enter_from_user_work(struct pt_regs *regs, long syscall) { unsigned long ti_work; - enter_from_user_mode(regs); - instrumentation_begin(); - - local_irq_enable(); ti_work = READ_ONCE(current_thread_info()->flags); if (ti_work & SYSCALL_ENTER_WORK) syscall = syscall_trace_enter(regs, syscall, ti_work); - instrumentation_end(); return syscall; } +long syscall_enter_from_user_mode_work(struct pt_regs *regs, long syscall) +{ + return __syscall_enter_from_user_work(regs, syscall); +} + +noinstr long syscall_enter_from_user_mode(struct pt_regs *regs, long syscall) +{ + long ret; + + enter_from_user_mode(regs); + + instrumentation_begin(); + local_irq_enable(); + ret = __syscall_enter_from_user_work(regs, syscall); + instrumentation_end(); + + return ret; +} + +noinstr void syscall_enter_from_user_mode_prepare(struct pt_regs *regs) +{ + enter_from_user_mode(regs); + instrumentation_begin(); + local_irq_enable(); + instrumentation_end(); +} + /** * exit_to_user_mode - Fixup state when exiting to user mode * @@ -183,7 +208,7 @@ static inline bool report_single_step(unsigned long ti_work) /* * If TIF_SYSCALL_EMU is set, then the only reason to report is when * TIF_SINGLESTEP is set (i.e. PTRACE_SYSEMU_SINGLESTEP). This syscall - * instruction has been already reported in syscall_enter_from_usermode(). + * instruction has been already reported in syscall_enter_from_user_mode(). */ #define SYSEMU_STEP (_TIF_SINGLESTEP | _TIF_SYSCALL_EMU) diff --git a/kernel/events/core.c b/kernel/events/core.c index 7ed5248f0445..da467e1dd49a 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -99,7 +99,7 @@ static void remote_function(void *data) * retry due to any failures in smp_call_function_single(), such as if the * task_cpu() goes offline concurrently. * - * returns @func return value or -ESRCH when the process isn't running + * returns @func return value or -ESRCH or -ENXIO when the process isn't running */ static int task_function_call(struct task_struct *p, remote_function_f func, void *info) @@ -115,7 +115,8 @@ task_function_call(struct task_struct *p, remote_function_f func, void *info) for (;;) { ret = smp_call_function_single(task_cpu(p), remote_function, &data, 1); - ret = !ret ? data.ret : -EAGAIN; + if (!ret) + ret = data.ret; if (ret != -EAGAIN) break; @@ -382,7 +383,6 @@ static DEFINE_MUTEX(perf_sched_mutex); static atomic_t perf_sched_count; static DEFINE_PER_CPU(atomic_t, perf_cgroup_events); -static DEFINE_PER_CPU(int, perf_sched_cb_usages); static DEFINE_PER_CPU(struct pmu_event_list, pmu_sb_events); static atomic_t nr_mmap_events __read_mostly; @@ -2133,8 +2133,24 @@ static inline struct list_head *get_event_list(struct perf_event *event) return event->attr.pinned ? &ctx->pinned_active : &ctx->flexible_active; } +/* + * Events that have PERF_EV_CAP_SIBLING require being part of a group and + * cannot exist on their own, schedule them out and move them into the ERROR + * state. Also see _perf_event_enable(), it will not be able to recover + * this ERROR state. + */ +static inline void perf_remove_sibling_event(struct perf_event *event) +{ + struct perf_event_context *ctx = event->ctx; + struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); + + event_sched_out(event, cpuctx, ctx); + perf_event_set_state(event, PERF_EVENT_STATE_ERROR); +} + static void perf_group_detach(struct perf_event *event) { + struct perf_event *leader = event->group_leader; struct perf_event *sibling, *tmp; struct perf_event_context *ctx = event->ctx; @@ -2153,7 +2169,7 @@ static void perf_group_detach(struct perf_event *event) /* * If this is a sibling, remove it from its group. */ - if (event->group_leader != event) { + if (leader != event) { list_del_init(&event->sibling_list); event->group_leader->nr_siblings--; goto out; @@ -2166,6 +2182,9 @@ static void perf_group_detach(struct perf_event *event) */ list_for_each_entry_safe(sibling, tmp, &event->sibling_list, sibling_list) { + if (sibling->event_caps & PERF_EV_CAP_SIBLING) + perf_remove_sibling_event(sibling); + sibling->group_leader = sibling; list_del_init(&sibling->sibling_list); @@ -2183,10 +2202,10 @@ static void perf_group_detach(struct perf_event *event) } out: - perf_event__header_size(event->group_leader); - - for_each_sibling_event(tmp, event->group_leader) + for_each_sibling_event(tmp, leader) perf_event__header_size(tmp); + + perf_event__header_size(leader); } static bool is_orphaned_event(struct perf_event *event) @@ -2979,6 +2998,7 @@ static void _perf_event_enable(struct perf_event *event) raw_spin_lock_irq(&ctx->lock); if (event->state >= PERF_EVENT_STATE_INACTIVE || event->state < PERF_EVENT_STATE_ERROR) { +out: raw_spin_unlock_irq(&ctx->lock); return; } @@ -2990,8 +3010,16 @@ static void _perf_event_enable(struct perf_event *event) * has gone back into error state, as distinct from the task having * been scheduled away before the cross-call arrived. */ - if (event->state == PERF_EVENT_STATE_ERROR) + if (event->state == PERF_EVENT_STATE_ERROR) { + /* + * Detached SIBLING events cannot leave ERROR state. + */ + if (event->event_caps & PERF_EV_CAP_SIBLING && + event->group_leader == event) + goto out; + event->state = PERF_EVENT_STATE_OFF; + } raw_spin_unlock_irq(&ctx->lock); event_function_call(event, __perf_event_enable, NULL); @@ -3356,10 +3384,12 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn, struct perf_event_context *parent, *next_parent; struct perf_cpu_context *cpuctx; int do_switch = 1; + struct pmu *pmu; if (likely(!ctx)) return; + pmu = ctx->pmu; cpuctx = __get_cpu_context(ctx); if (!cpuctx->task_ctx) return; @@ -3389,11 +3419,15 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn, raw_spin_lock(&ctx->lock); raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING); if (context_equiv(ctx, next_ctx)) { - struct pmu *pmu = ctx->pmu; WRITE_ONCE(ctx->task, next); WRITE_ONCE(next_ctx->task, task); + perf_pmu_disable(pmu); + + if (cpuctx->sched_cb_usage && pmu->sched_task) + pmu->sched_task(ctx, false); + /* * PMU specific parts of task perf context can require * additional synchronization. As an example of such @@ -3405,6 +3439,8 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn, else swap(ctx->task_ctx_data, next_ctx->task_ctx_data); + perf_pmu_enable(pmu); + /* * RCU_INIT_POINTER here is safe because we've not * modified the ctx and the above modification of @@ -3427,21 +3463,22 @@ unlock: if (do_switch) { raw_spin_lock(&ctx->lock); + perf_pmu_disable(pmu); + + if (cpuctx->sched_cb_usage && pmu->sched_task) + pmu->sched_task(ctx, false); task_ctx_sched_out(cpuctx, ctx, EVENT_ALL); + + perf_pmu_enable(pmu); raw_spin_unlock(&ctx->lock); } } -static DEFINE_PER_CPU(struct list_head, sched_cb_list); - void perf_sched_cb_dec(struct pmu *pmu) { struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); - this_cpu_dec(perf_sched_cb_usages); - - if (!--cpuctx->sched_cb_usage) - list_del(&cpuctx->sched_cb_entry); + --cpuctx->sched_cb_usage; } @@ -3449,10 +3486,7 @@ void perf_sched_cb_inc(struct pmu *pmu) { struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); - if (!cpuctx->sched_cb_usage++) - list_add(&cpuctx->sched_cb_entry, this_cpu_ptr(&sched_cb_list)); - - this_cpu_inc(perf_sched_cb_usages); + cpuctx->sched_cb_usage++; } /* @@ -3463,30 +3497,22 @@ void perf_sched_cb_inc(struct pmu *pmu) * PEBS requires this to provide PID/TID information. This requires we flush * all queued PEBS records before we context switch to a new task. */ -static void perf_pmu_sched_task(struct task_struct *prev, - struct task_struct *next, - bool sched_in) +static void __perf_pmu_sched_task(struct perf_cpu_context *cpuctx, bool sched_in) { - struct perf_cpu_context *cpuctx; struct pmu *pmu; - if (prev == next) - return; - - list_for_each_entry(cpuctx, this_cpu_ptr(&sched_cb_list), sched_cb_entry) { - pmu = cpuctx->ctx.pmu; /* software PMUs will not have sched_task */ + pmu = cpuctx->ctx.pmu; /* software PMUs will not have sched_task */ - if (WARN_ON_ONCE(!pmu->sched_task)) - continue; + if (WARN_ON_ONCE(!pmu->sched_task)) + return; - perf_ctx_lock(cpuctx, cpuctx->task_ctx); - perf_pmu_disable(pmu); + perf_ctx_lock(cpuctx, cpuctx->task_ctx); + perf_pmu_disable(pmu); - pmu->sched_task(cpuctx->task_ctx, sched_in); + pmu->sched_task(cpuctx->task_ctx, sched_in); - perf_pmu_enable(pmu); - perf_ctx_unlock(cpuctx, cpuctx->task_ctx); - } + perf_pmu_enable(pmu); + perf_ctx_unlock(cpuctx, cpuctx->task_ctx); } static void perf_event_switch(struct task_struct *task, @@ -3511,9 +3537,6 @@ void __perf_event_task_sched_out(struct task_struct *task, { int ctxn; - if (__this_cpu_read(perf_sched_cb_usages)) - perf_pmu_sched_task(task, next, false); - if (atomic_read(&nr_switch_events)) perf_event_switch(task, next, false); @@ -3745,10 +3768,14 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx, struct task_struct *task) { struct perf_cpu_context *cpuctx; + struct pmu *pmu = ctx->pmu; cpuctx = __get_cpu_context(ctx); - if (cpuctx->task_ctx == ctx) + if (cpuctx->task_ctx == ctx) { + if (cpuctx->sched_cb_usage) + __perf_pmu_sched_task(cpuctx, true); return; + } perf_ctx_lock(cpuctx, ctx); /* @@ -3758,7 +3785,7 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx, if (!ctx->nr_events) goto unlock; - perf_pmu_disable(ctx->pmu); + perf_pmu_disable(pmu); /* * We want to keep the following priority order: * cpu pinned (that don't need to move), task pinned, @@ -3770,7 +3797,11 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx, if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree)) cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); perf_event_sched_in(cpuctx, ctx, task); - perf_pmu_enable(ctx->pmu); + + if (cpuctx->sched_cb_usage && pmu->sched_task) + pmu->sched_task(cpuctx->task_ctx, true); + + perf_pmu_enable(pmu); unlock: perf_ctx_unlock(cpuctx, ctx); @@ -3813,9 +3844,6 @@ void __perf_event_task_sched_in(struct task_struct *prev, if (atomic_read(&nr_switch_events)) perf_event_switch(task, prev, true); - - if (__this_cpu_read(perf_sched_cb_usages)) - perf_pmu_sched_task(prev, task, true); } static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count) @@ -5868,11 +5896,11 @@ static void perf_pmu_output_stop(struct perf_event *event); static void perf_mmap_close(struct vm_area_struct *vma) { struct perf_event *event = vma->vm_file->private_data; - struct perf_buffer *rb = ring_buffer_get(event); struct user_struct *mmap_user = rb->mmap_user; int mmap_locked = rb->mmap_locked; unsigned long size = perf_data_size(rb); + bool detach_rest = false; if (event->pmu->event_unmapped) event->pmu->event_unmapped(event, vma->vm_mm); @@ -5903,7 +5931,8 @@ static void perf_mmap_close(struct vm_area_struct *vma) mutex_unlock(&event->mmap_mutex); } - atomic_dec(&rb->mmap_count); + if (atomic_dec_and_test(&rb->mmap_count)) + detach_rest = true; if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) goto out_put; @@ -5912,7 +5941,7 @@ static void perf_mmap_close(struct vm_area_struct *vma) mutex_unlock(&event->mmap_mutex); /* If there's still other mmap()s of this buffer, we're done. */ - if (atomic_read(&rb->mmap_count)) + if (!detach_rest) goto out_put; /* @@ -12828,7 +12857,6 @@ static void __init perf_event_init_all_cpus(void) #ifdef CONFIG_CGROUP_PERF INIT_LIST_HEAD(&per_cpu(cgrp_cpuctx_list, cpu)); #endif - INIT_LIST_HEAD(&per_cpu(sched_cb_list, cpu)); } } diff --git a/kernel/fork.c b/kernel/fork.c index 4d32190861bd..a3795aaaab5c 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -95,6 +95,7 @@ #include <linux/stackleak.h> #include <linux/kasan.h> #include <linux/scs.h> +#include <linux/io_uring.h> #include <asm/pgalloc.h> #include <linux/uaccess.h> @@ -589,7 +590,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, mm->map_count++; if (!(tmp->vm_flags & VM_WIPEONFORK)) - retval = copy_page_range(mm, oldmm, mpnt); + retval = copy_page_range(mm, oldmm, mpnt, tmp); if (tmp->vm_ops && tmp->vm_ops->open) tmp->vm_ops->open(tmp); @@ -728,6 +729,7 @@ void __put_task_struct(struct task_struct *tsk) WARN_ON(refcount_read(&tsk->usage)); WARN_ON(tsk == current); + io_uring_free(tsk); cgroup_free(tsk); task_numa_free(tsk, true); security_task_free(tsk); @@ -1011,6 +1013,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, mm_pgtables_bytes_init(mm); mm->map_count = 0; mm->locked_vm = 0; + atomic_set(&mm->has_pinned, 0); atomic64_set(&mm->pinned_vm, 0); memset(&mm->rss_stat, 0, sizeof(mm->rss_stat)); spin_lock_init(&mm->page_table_lock); @@ -1982,6 +1985,10 @@ static __latent_entropy struct task_struct *copy_process( p->vtime.state = VTIME_INACTIVE; #endif +#ifdef CONFIG_IO_URING + p->io_uring = NULL; +#endif + #if defined(SPLIT_RSS_COUNTING) memset(&p->rss_stat, 0, sizeof(p->rss_stat)); #endif @@ -3014,7 +3021,7 @@ int unshare_files(struct files_struct **displaced) } int sysctl_max_threads(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table t; int ret; diff --git a/kernel/gcov/gcc_4_7.c b/kernel/gcov/gcc_4_7.c index 908fdf5098c3..53c67c87f141 100644 --- a/kernel/gcov/gcc_4_7.c +++ b/kernel/gcov/gcc_4_7.c @@ -19,7 +19,9 @@ #include <linux/vmalloc.h> #include "gcov.h" -#if (__GNUC__ >= 7) +#if (__GNUC__ >= 10) +#define GCOV_COUNTERS 8 +#elif (__GNUC__ >= 7) #define GCOV_COUNTERS 9 #elif (__GNUC__ > 5) || (__GNUC__ == 5 && __GNUC_MINOR__ >= 1) #define GCOV_COUNTERS 10 diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 857f5f4c8098..b9b9618e1aca 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -945,6 +945,33 @@ void handle_percpu_devid_irq(struct irq_desc *desc) } /** + * handle_percpu_devid_fasteoi_ipi - Per CPU local IPI handler with per cpu + * dev ids + * @desc: the interrupt description structure for this irq + * + * The biggest difference with the IRQ version is that the interrupt is + * EOIed early, as the IPI could result in a context switch, and we need to + * make sure the IPI can fire again. We also assume that the arch code has + * registered an action. If not, we are positively doomed. + */ +void handle_percpu_devid_fasteoi_ipi(struct irq_desc *desc) +{ + struct irq_chip *chip = irq_desc_get_chip(desc); + struct irqaction *action = desc->action; + unsigned int irq = irq_desc_get_irq(desc); + irqreturn_t res; + + __kstat_incr_irqs_this_cpu(desc); + + if (chip->irq_eoi) + chip->irq_eoi(&desc->irq_data); + + trace_irq_handler_entry(irq, action); + res = action->handler(irq, raw_cpu_ptr(action->percpu_dev_id)); + trace_irq_handler_exit(irq, action, res); +} + +/** * handle_percpu_devid_fasteoi_nmi - Per CPU local NMI handler with per cpu * dev ids * @desc: the interrupt description structure for this irq @@ -1541,18 +1568,17 @@ EXPORT_SYMBOL_GPL(irq_chip_release_resources_parent); */ int irq_chip_compose_msi_msg(struct irq_data *data, struct msi_msg *msg) { - struct irq_data *pos = NULL; + struct irq_data *pos; -#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY - for (; data; data = data->parent_data) -#endif + for (pos = NULL; !pos && data; data = irqd_get_parent_data(data)) { if (data->chip && data->chip->irq_compose_msi_msg) pos = data; + } + if (!pos) return -ENOSYS; pos->chip->irq_compose_msi_msg(pos, msg); - return 0; } diff --git a/kernel/irq/debugfs.c b/kernel/irq/debugfs.c index b95ff5d5f4bd..e4cff358b437 100644 --- a/kernel/irq/debugfs.c +++ b/kernel/irq/debugfs.c @@ -57,6 +57,7 @@ static const struct irq_bit_descr irqchip_flags[] = { BIT_MASK_DESCR(IRQCHIP_EOI_THREADED), BIT_MASK_DESCR(IRQCHIP_SUPPORTS_LEVEL_MSI), BIT_MASK_DESCR(IRQCHIP_SUPPORTS_NMI), + BIT_MASK_DESCR(IRQCHIP_ENABLE_WAKEUP_ON_SUSPEND), }; static void @@ -125,6 +126,8 @@ static const struct irq_bit_descr irqdata_states[] = { BIT_MASK_DESCR(IRQD_DEFAULT_TRIGGER_SET), BIT_MASK_DESCR(IRQD_HANDLE_ENFORCE_IRQCTX), + + BIT_MASK_DESCR(IRQD_IRQ_ENABLED_ON_SUSPEND), }; static const struct irq_bit_descr irqdesc_states[] = { @@ -136,6 +139,7 @@ static const struct irq_bit_descr irqdesc_states[] = { BIT_MASK_DESCR(_IRQ_PER_CPU_DEVID), BIT_MASK_DESCR(_IRQ_IS_POLLED), BIT_MASK_DESCR(_IRQ_DISABLE_UNLAZY), + BIT_MASK_DESCR(_IRQ_HIDDEN), }; static const struct irq_bit_descr irqdesc_istates[] = { diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 7db284b10ac9..54363527feea 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -473,6 +473,15 @@ static inline void irq_domain_deactivate_irq(struct irq_data *data) } #endif +static inline struct irq_data *irqd_get_parent_data(struct irq_data *irqd) +{ +#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY + return irqd->parent_data; +#else + return NULL; +#endif +} + #ifdef CONFIG_GENERIC_IRQ_DEBUGFS #include <linux/debugfs.h> diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 76cd7ebd1178..cf8b374b892d 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -1136,6 +1136,17 @@ static struct irq_data *irq_domain_insert_irq_data(struct irq_domain *domain, return irq_data; } +static void __irq_domain_free_hierarchy(struct irq_data *irq_data) +{ + struct irq_data *tmp; + + while (irq_data) { + tmp = irq_data; + irq_data = irq_data->parent_data; + kfree(tmp); + } +} + static void irq_domain_free_irq_data(unsigned int virq, unsigned int nr_irqs) { struct irq_data *irq_data, *tmp; @@ -1147,12 +1158,83 @@ static void irq_domain_free_irq_data(unsigned int virq, unsigned int nr_irqs) irq_data->parent_data = NULL; irq_data->domain = NULL; - while (tmp) { - irq_data = tmp; - tmp = tmp->parent_data; - kfree(irq_data); + __irq_domain_free_hierarchy(tmp); + } +} + +/** + * irq_domain_disconnect_hierarchy - Mark the first unused level of a hierarchy + * @domain: IRQ domain from which the hierarchy is to be disconnected + * @virq: IRQ number where the hierarchy is to be trimmed + * + * Marks the @virq level belonging to @domain as disconnected. + * Returns -EINVAL if @virq doesn't have a valid irq_data pointing + * to @domain. + * + * Its only use is to be able to trim levels of hierarchy that do not + * have any real meaning for this interrupt, and that the driver marks + * as such from its .alloc() callback. + */ +int irq_domain_disconnect_hierarchy(struct irq_domain *domain, + unsigned int virq) +{ + struct irq_data *irqd; + + irqd = irq_domain_get_irq_data(domain, virq); + if (!irqd) + return -EINVAL; + + irqd->chip = ERR_PTR(-ENOTCONN); + return 0; +} + +static int irq_domain_trim_hierarchy(unsigned int virq) +{ + struct irq_data *tail, *irqd, *irq_data; + + irq_data = irq_get_irq_data(virq); + tail = NULL; + + /* The first entry must have a valid irqchip */ + if (!irq_data->chip || IS_ERR(irq_data->chip)) + return -EINVAL; + + /* + * Validate that the irq_data chain is sane in the presence of + * a hierarchy trimming marker. + */ + for (irqd = irq_data->parent_data; irqd; irq_data = irqd, irqd = irqd->parent_data) { + /* Can't have a valid irqchip after a trim marker */ + if (irqd->chip && tail) + return -EINVAL; + + /* Can't have an empty irqchip before a trim marker */ + if (!irqd->chip && !tail) + return -EINVAL; + + if (IS_ERR(irqd->chip)) { + /* Only -ENOTCONN is a valid trim marker */ + if (PTR_ERR(irqd->chip) != -ENOTCONN) + return -EINVAL; + + tail = irq_data; } } + + /* No trim marker, nothing to do */ + if (!tail) + return 0; + + pr_info("IRQ%d: trimming hierarchy from %s\n", + virq, tail->parent_data->domain->name); + + /* Sever the inner part of the hierarchy... */ + irqd = tail; + tail = tail->parent_data; + irqd->parent_data = NULL; + __irq_domain_free_hierarchy(tail); + + return 0; } static int irq_domain_alloc_irq_data(struct irq_domain *domain, @@ -1362,6 +1444,15 @@ int __irq_domain_alloc_irqs(struct irq_domain *domain, int irq_base, mutex_unlock(&irq_domain_mutex); goto out_free_irq_data; } + + for (i = 0; i < nr_irqs; i++) { + ret = irq_domain_trim_hierarchy(virq + i); + if (ret) { + mutex_unlock(&irq_domain_mutex); + goto out_free_irq_data; + } + } + for (i = 0; i < nr_irqs; i++) irq_domain_insert_irq(virq + i); mutex_unlock(&irq_domain_mutex); diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index eb95f6106a1e..2c0c4d6d0f83 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -187,7 +187,6 @@ static const struct irq_domain_ops msi_domain_ops = { .deactivate = msi_domain_deactivate, }; -#ifdef GENERIC_MSI_DOMAIN_OPS static irq_hw_number_t msi_domain_ops_get_hwirq(struct msi_domain_info *info, msi_alloc_info_t *arg) { @@ -206,11 +205,6 @@ static void msi_domain_ops_set_desc(msi_alloc_info_t *arg, { arg->desc = desc; } -#else -#define msi_domain_ops_get_hwirq NULL -#define msi_domain_ops_prepare NULL -#define msi_domain_ops_set_desc NULL -#endif /* !GENERIC_MSI_DOMAIN_OPS */ static int msi_domain_ops_init(struct irq_domain *domain, struct msi_domain_info *info, @@ -235,11 +229,13 @@ static int msi_domain_ops_check(struct irq_domain *domain, } static struct msi_domain_ops msi_domain_ops_default = { - .get_hwirq = msi_domain_ops_get_hwirq, - .msi_init = msi_domain_ops_init, - .msi_check = msi_domain_ops_check, - .msi_prepare = msi_domain_ops_prepare, - .set_desc = msi_domain_ops_set_desc, + .get_hwirq = msi_domain_ops_get_hwirq, + .msi_init = msi_domain_ops_init, + .msi_check = msi_domain_ops_check, + .msi_prepare = msi_domain_ops_prepare, + .set_desc = msi_domain_ops_set_desc, + .domain_alloc_irqs = __msi_domain_alloc_irqs, + .domain_free_irqs = __msi_domain_free_irqs, }; static void msi_domain_update_dom_ops(struct msi_domain_info *info) @@ -251,6 +247,14 @@ static void msi_domain_update_dom_ops(struct msi_domain_info *info) return; } + if (ops->domain_alloc_irqs == NULL) + ops->domain_alloc_irqs = msi_domain_ops_default.domain_alloc_irqs; + if (ops->domain_free_irqs == NULL) + ops->domain_free_irqs = msi_domain_ops_default.domain_free_irqs; + + if (!(info->flags & MSI_FLAG_USE_DEF_DOM_OPS)) + return; + if (ops->get_hwirq == NULL) ops->get_hwirq = msi_domain_ops_default.get_hwirq; if (ops->msi_init == NULL) @@ -284,8 +288,7 @@ struct irq_domain *msi_create_irq_domain(struct fwnode_handle *fwnode, { struct irq_domain *domain; - if (info->flags & MSI_FLAG_USE_DEF_DOM_OPS) - msi_domain_update_dom_ops(info); + msi_domain_update_dom_ops(info); if (info->flags & MSI_FLAG_USE_DEF_CHIP_OPS) msi_domain_update_chip_ops(info); @@ -370,8 +373,13 @@ static bool msi_check_reservation_mode(struct irq_domain *domain, { struct msi_desc *desc; - if (domain->bus_token != DOMAIN_BUS_PCI_MSI) + switch(domain->bus_token) { + case DOMAIN_BUS_PCI_MSI: + case DOMAIN_BUS_VMD_MSI: + break; + default: return false; + } if (!(info->flags & MSI_FLAG_MUST_REACTIVATE)) return false; @@ -387,17 +395,8 @@ static bool msi_check_reservation_mode(struct irq_domain *domain, return desc->msi_attrib.is_msix || desc->msi_attrib.maskbit; } -/** - * msi_domain_alloc_irqs - Allocate interrupts from a MSI interrupt domain - * @domain: The domain to allocate from - * @dev: Pointer to device struct of the device for which the interrupts - * are allocated - * @nvec: The number of interrupts to allocate - * - * Returns 0 on success or an error code. - */ -int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, - int nvec) +int __msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, + int nvec) { struct msi_domain_info *info = domain->host_data; struct msi_domain_ops *ops = info->ops; @@ -491,12 +490,24 @@ cleanup: } /** - * msi_domain_free_irqs - Free interrupts from a MSI interrupt @domain associated tp @dev - * @domain: The domain to managing the interrupts + * msi_domain_alloc_irqs - Allocate interrupts from a MSI interrupt domain + * @domain: The domain to allocate from * @dev: Pointer to device struct of the device for which the interrupts - * are free + * are allocated + * @nvec: The number of interrupts to allocate + * + * Returns 0 on success or an error code. */ -void msi_domain_free_irqs(struct irq_domain *domain, struct device *dev) +int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, + int nvec) +{ + struct msi_domain_info *info = domain->host_data; + struct msi_domain_ops *ops = info->ops; + + return ops->domain_alloc_irqs(domain, dev, nvec); +} + +void __msi_domain_free_irqs(struct irq_domain *domain, struct device *dev) { struct msi_desc *desc; @@ -514,6 +525,20 @@ void msi_domain_free_irqs(struct irq_domain *domain, struct device *dev) } /** + * __msi_domain_free_irqs - Free interrupts from a MSI interrupt @domain associated tp @dev + * @domain: The domain to managing the interrupts + * @dev: Pointer to device struct of the device for which the interrupts + * are free + */ +void msi_domain_free_irqs(struct irq_domain *domain, struct device *dev) +{ + struct msi_domain_info *info = domain->host_data; + struct msi_domain_ops *ops = info->ops; + + return ops->domain_free_irqs(domain, dev); +} + +/** * msi_get_domain_info - Get the MSI interrupt domain info for @domain * @domain: The interrupt domain to retrieve data from * diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c index c6c7e187ae74..ce0adb22ee96 100644 --- a/kernel/irq/pm.c +++ b/kernel/irq/pm.c @@ -69,12 +69,26 @@ void irq_pm_remove_action(struct irq_desc *desc, struct irqaction *action) static bool suspend_device_irq(struct irq_desc *desc) { + unsigned long chipflags = irq_desc_get_chip(desc)->flags; + struct irq_data *irqd = &desc->irq_data; + if (!desc->action || irq_desc_is_chained(desc) || desc->no_suspend_depth) return false; - if (irqd_is_wakeup_set(&desc->irq_data)) { - irqd_set(&desc->irq_data, IRQD_WAKEUP_ARMED); + if (irqd_is_wakeup_set(irqd)) { + irqd_set(irqd, IRQD_WAKEUP_ARMED); + + if ((chipflags & IRQCHIP_ENABLE_WAKEUP_ON_SUSPEND) && + irqd_irq_disabled(irqd)) { + /* + * Interrupt marked for wakeup is in disabled state. + * Enable interrupt here to unmask/enable in irqchip + * to be able to resume with such interrupts. + */ + __enable_irq(desc); + irqd_set(irqd, IRQD_IRQ_ENABLED_ON_SUSPEND); + } /* * We return true here to force the caller to issue * synchronize_irq(). We need to make sure that the @@ -93,7 +107,7 @@ static bool suspend_device_irq(struct irq_desc *desc) * chip level. The chip implementation indicates that with * IRQCHIP_MASK_ON_SUSPEND. */ - if (irq_desc_get_chip(desc)->flags & IRQCHIP_MASK_ON_SUSPEND) + if (chipflags & IRQCHIP_MASK_ON_SUSPEND) mask_irq(desc); return true; } @@ -137,7 +151,19 @@ EXPORT_SYMBOL_GPL(suspend_device_irqs); static void resume_irq(struct irq_desc *desc) { - irqd_clear(&desc->irq_data, IRQD_WAKEUP_ARMED); + struct irq_data *irqd = &desc->irq_data; + + irqd_clear(irqd, IRQD_WAKEUP_ARMED); + + if (irqd_is_enabled_on_suspend(irqd)) { + /* + * Interrupt marked for wakeup was enabled during suspend + * entry. Disable such interrupts to restore them back to + * original state. + */ + __disable_irq(desc); + irqd_clear(irqd, IRQD_IRQ_ENABLED_ON_SUSPEND); + } if (desc->istate & IRQS_SUSPENDED) goto resume; diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index 32c071d7bc03..72513ed2a5fc 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -485,7 +485,7 @@ int show_interrupts(struct seq_file *p, void *v) rcu_read_lock(); desc = irq_to_desc(i); - if (!desc) + if (!desc || irq_settings_is_hidden(desc)) goto outsparse; if (desc->kstat_irqs) diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c index c48ce19a257f..8ccd32a0cc80 100644 --- a/kernel/irq/resend.c +++ b/kernel/irq/resend.c @@ -86,6 +86,18 @@ static int irq_sw_resend(struct irq_desc *desc) } #endif +static int try_retrigger(struct irq_desc *desc) +{ + if (desc->irq_data.chip->irq_retrigger) + return desc->irq_data.chip->irq_retrigger(&desc->irq_data); + +#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY + return irq_chip_retrigger_hierarchy(&desc->irq_data); +#else + return 0; +#endif +} + /* * IRQ resend * @@ -113,8 +125,7 @@ int check_irq_resend(struct irq_desc *desc, bool inject) desc->istate &= ~IRQS_PENDING; - if (!desc->irq_data.chip->irq_retrigger || - !desc->irq_data.chip->irq_retrigger(&desc->irq_data)) + if (!try_retrigger(desc)) err = irq_sw_resend(desc); /* If the retrigger was successfull, mark it with the REPLAY bit */ diff --git a/kernel/irq/settings.h b/kernel/irq/settings.h index e43795cd2ccf..403378b9947b 100644 --- a/kernel/irq/settings.h +++ b/kernel/irq/settings.h @@ -17,6 +17,7 @@ enum { _IRQ_PER_CPU_DEVID = IRQ_PER_CPU_DEVID, _IRQ_IS_POLLED = IRQ_IS_POLLED, _IRQ_DISABLE_UNLAZY = IRQ_DISABLE_UNLAZY, + _IRQ_HIDDEN = IRQ_HIDDEN, _IRQF_MODIFY_MASK = IRQF_MODIFY_MASK, }; @@ -31,6 +32,7 @@ enum { #define IRQ_PER_CPU_DEVID GOT_YOU_MORON #define IRQ_IS_POLLED GOT_YOU_MORON #define IRQ_DISABLE_UNLAZY GOT_YOU_MORON +#define IRQ_HIDDEN GOT_YOU_MORON #undef IRQF_MODIFY_MASK #define IRQF_MODIFY_MASK GOT_YOU_MORON @@ -167,3 +169,8 @@ static inline void irq_settings_clr_disable_unlazy(struct irq_desc *desc) { desc->status_use_accessors &= ~_IRQ_DISABLE_UNLAZY; } + +static inline bool irq_settings_is_hidden(struct irq_desc *desc) +{ + return desc->status_use_accessors & _IRQ_HIDDEN; +} diff --git a/kernel/jump_label.c b/kernel/jump_label.c index cdb3ffab128b..e661c61b3d6b 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -539,19 +539,25 @@ static void static_key_set_mod(struct static_key *key, static int __jump_label_mod_text_reserved(void *start, void *end) { struct module *mod; + int ret; preempt_disable(); mod = __module_text_address((unsigned long)start); WARN_ON_ONCE(__module_text_address((unsigned long)end) != mod); + if (!try_module_get(mod)) + mod = NULL; preempt_enable(); if (!mod) return 0; - - return __jump_label_text_reserved(mod->jump_entries, + ret = __jump_label_text_reserved(mod->jump_entries, mod->jump_entries + mod->num_jump_entries, start, end); + + module_put(mod); + + return ret; } static void __jump_label_mod_update(struct static_key *key) diff --git a/kernel/kcsan/core.c b/kernel/kcsan/core.c index 9147ff6a12e5..3994a217bde7 100644 --- a/kernel/kcsan/core.c +++ b/kernel/kcsan/core.c @@ -1,5 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 +#define pr_fmt(fmt) "kcsan: " fmt + #include <linux/atomic.h> #include <linux/bug.h> #include <linux/delay.h> @@ -98,6 +100,9 @@ static atomic_long_t watchpoints[CONFIG_KCSAN_NUM_WATCHPOINTS + NUM_SLOTS-1]; */ static DEFINE_PER_CPU(long, kcsan_skip); +/* For kcsan_prandom_u32_max(). */ +static DEFINE_PER_CPU(struct rnd_state, kcsan_rand_state); + static __always_inline atomic_long_t *find_watchpoint(unsigned long addr, size_t size, bool expect_write, @@ -223,7 +228,7 @@ is_atomic(const volatile void *ptr, size_t size, int type, struct kcsan_ctx *ctx if (IS_ENABLED(CONFIG_KCSAN_ASSUME_PLAIN_WRITES_ATOMIC) && (type & KCSAN_ACCESS_WRITE) && size <= sizeof(long) && - IS_ALIGNED((unsigned long)ptr, size)) + !(type & KCSAN_ACCESS_COMPOUND) && IS_ALIGNED((unsigned long)ptr, size)) return true; /* Assume aligned writes up to word size are atomic. */ if (ctx->atomic_next > 0) { @@ -269,11 +274,28 @@ should_watch(const volatile void *ptr, size_t size, int type, struct kcsan_ctx * return true; } +/* + * Returns a pseudo-random number in interval [0, ep_ro). See prandom_u32_max() + * for more details. + * + * The open-coded version here is using only safe primitives for all contexts + * where we can have KCSAN instrumentation. In particular, we cannot use + * prandom_u32() directly, as its tracepoint could cause recursion. + */ +static u32 kcsan_prandom_u32_max(u32 ep_ro) +{ + struct rnd_state *state = &get_cpu_var(kcsan_rand_state); + const u32 res = prandom_u32_state(state); + + put_cpu_var(kcsan_rand_state); + return (u32)(((u64) res * ep_ro) >> 32); +} + static inline void reset_kcsan_skip(void) { long skip_count = kcsan_skip_watch - (IS_ENABLED(CONFIG_KCSAN_SKIP_WATCH_RANDOMIZE) ? - prandom_u32_max(kcsan_skip_watch) : + kcsan_prandom_u32_max(kcsan_skip_watch) : 0); this_cpu_write(kcsan_skip, skip_count); } @@ -283,12 +305,18 @@ static __always_inline bool kcsan_is_enabled(void) return READ_ONCE(kcsan_enabled) && get_ctx()->disable_count == 0; } -static inline unsigned int get_delay(void) +/* Introduce delay depending on context and configuration. */ +static void delay_access(int type) { unsigned int delay = in_task() ? kcsan_udelay_task : kcsan_udelay_interrupt; - return delay - (IS_ENABLED(CONFIG_KCSAN_DELAY_RANDOMIZE) ? - prandom_u32_max(delay) : - 0); + /* For certain access types, skew the random delay to be longer. */ + unsigned int skew_delay_order = + (type & (KCSAN_ACCESS_COMPOUND | KCSAN_ACCESS_ASSERT)) ? 1 : 0; + + delay -= IS_ENABLED(CONFIG_KCSAN_DELAY_RANDOMIZE) ? + kcsan_prandom_u32_max(delay >> skew_delay_order) : + 0; + udelay(delay); } void kcsan_save_irqtrace(struct task_struct *task) @@ -361,13 +389,13 @@ static noinline void kcsan_found_watchpoint(const volatile void *ptr, * already removed the watchpoint, or another thread consumed * the watchpoint before this thread. */ - kcsan_counter_inc(KCSAN_COUNTER_REPORT_RACES); + atomic_long_inc(&kcsan_counters[KCSAN_COUNTER_REPORT_RACES]); } if ((type & KCSAN_ACCESS_ASSERT) != 0) - kcsan_counter_inc(KCSAN_COUNTER_ASSERT_FAILURES); + atomic_long_inc(&kcsan_counters[KCSAN_COUNTER_ASSERT_FAILURES]); else - kcsan_counter_inc(KCSAN_COUNTER_DATA_RACES); + atomic_long_inc(&kcsan_counters[KCSAN_COUNTER_DATA_RACES]); user_access_restore(flags); } @@ -408,7 +436,7 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type) goto out; if (!check_encodable((unsigned long)ptr, size)) { - kcsan_counter_inc(KCSAN_COUNTER_UNENCODABLE_ACCESSES); + atomic_long_inc(&kcsan_counters[KCSAN_COUNTER_UNENCODABLE_ACCESSES]); goto out; } @@ -428,12 +456,12 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type) * with which should_watch() returns true should be tweaked so * that this case happens very rarely. */ - kcsan_counter_inc(KCSAN_COUNTER_NO_CAPACITY); + atomic_long_inc(&kcsan_counters[KCSAN_COUNTER_NO_CAPACITY]); goto out_unlock; } - kcsan_counter_inc(KCSAN_COUNTER_SETUP_WATCHPOINTS); - kcsan_counter_inc(KCSAN_COUNTER_USED_WATCHPOINTS); + atomic_long_inc(&kcsan_counters[KCSAN_COUNTER_SETUP_WATCHPOINTS]); + atomic_long_inc(&kcsan_counters[KCSAN_COUNTER_USED_WATCHPOINTS]); /* * Read the current value, to later check and infer a race if the data @@ -459,7 +487,7 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type) if (IS_ENABLED(CONFIG_KCSAN_DEBUG)) { kcsan_disable_current(); - pr_err("KCSAN: watching %s, size: %zu, addr: %px [slot: %d, encoded: %lx]\n", + pr_err("watching %s, size: %zu, addr: %px [slot: %d, encoded: %lx]\n", is_write ? "write" : "read", size, ptr, watchpoint_slot((unsigned long)ptr), encode_watchpoint((unsigned long)ptr, size, is_write)); @@ -470,7 +498,7 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type) * Delay this thread, to increase probability of observing a racy * conflicting access. */ - udelay(get_delay()); + delay_access(type); /* * Re-read value, and check if it is as expected; if not, we infer a @@ -535,16 +563,16 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type) * increment this counter. */ if (is_assert && value_change == KCSAN_VALUE_CHANGE_TRUE) - kcsan_counter_inc(KCSAN_COUNTER_ASSERT_FAILURES); + atomic_long_inc(&kcsan_counters[KCSAN_COUNTER_ASSERT_FAILURES]); kcsan_report(ptr, size, type, value_change, KCSAN_REPORT_RACE_SIGNAL, watchpoint - watchpoints); } else if (value_change == KCSAN_VALUE_CHANGE_TRUE) { /* Inferring a race, since the value should not have changed. */ - kcsan_counter_inc(KCSAN_COUNTER_RACES_UNKNOWN_ORIGIN); + atomic_long_inc(&kcsan_counters[KCSAN_COUNTER_RACES_UNKNOWN_ORIGIN]); if (is_assert) - kcsan_counter_inc(KCSAN_COUNTER_ASSERT_FAILURES); + atomic_long_inc(&kcsan_counters[KCSAN_COUNTER_ASSERT_FAILURES]); if (IS_ENABLED(CONFIG_KCSAN_REPORT_RACE_UNKNOWN_ORIGIN) || is_assert) kcsan_report(ptr, size, type, KCSAN_VALUE_CHANGE_TRUE, @@ -557,7 +585,7 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type) * reused after this point. */ remove_watchpoint(watchpoint); - kcsan_counter_dec(KCSAN_COUNTER_USED_WATCHPOINTS); + atomic_long_dec(&kcsan_counters[KCSAN_COUNTER_USED_WATCHPOINTS]); out_unlock: if (!kcsan_interrupt_watcher) local_irq_restore(irq_flags); @@ -614,13 +642,16 @@ void __init kcsan_init(void) BUG_ON(!in_task()); kcsan_debugfs_init(); + prandom_seed_full_state(&kcsan_rand_state); /* * We are in the init task, and no other tasks should be running; * WRITE_ONCE without memory barrier is sufficient. */ - if (kcsan_early_enable) + if (kcsan_early_enable) { + pr_info("enabled early\n"); WRITE_ONCE(kcsan_enabled, true); + } } /* === Exported interface =================================================== */ @@ -793,7 +824,17 @@ EXPORT_SYMBOL(__kcsan_check_access); EXPORT_SYMBOL(__tsan_write##size); \ void __tsan_unaligned_write##size(void *ptr) \ __alias(__tsan_write##size); \ - EXPORT_SYMBOL(__tsan_unaligned_write##size) + EXPORT_SYMBOL(__tsan_unaligned_write##size); \ + void __tsan_read_write##size(void *ptr); \ + void __tsan_read_write##size(void *ptr) \ + { \ + check_access(ptr, size, \ + KCSAN_ACCESS_COMPOUND | KCSAN_ACCESS_WRITE); \ + } \ + EXPORT_SYMBOL(__tsan_read_write##size); \ + void __tsan_unaligned_read_write##size(void *ptr) \ + __alias(__tsan_read_write##size); \ + EXPORT_SYMBOL(__tsan_unaligned_read_write##size) DEFINE_TSAN_READ_WRITE(1); DEFINE_TSAN_READ_WRITE(2); @@ -879,3 +920,130 @@ void __tsan_init(void) { } EXPORT_SYMBOL(__tsan_init); + +/* + * Instrumentation for atomic builtins (__atomic_*, __sync_*). + * + * Normal kernel code _should not_ be using them directly, but some + * architectures may implement some or all atomics using the compilers' + * builtins. + * + * Note: If an architecture decides to fully implement atomics using the + * builtins, because they are implicitly instrumented by KCSAN (and KASAN, + * etc.), implementing the ARCH_ATOMIC interface (to get instrumentation via + * atomic-instrumented) is no longer necessary. + * + * TSAN instrumentation replaces atomic accesses with calls to any of the below + * functions, whose job is to also execute the operation itself. + */ + +#define DEFINE_TSAN_ATOMIC_LOAD_STORE(bits) \ + u##bits __tsan_atomic##bits##_load(const u##bits *ptr, int memorder); \ + u##bits __tsan_atomic##bits##_load(const u##bits *ptr, int memorder) \ + { \ + if (!IS_ENABLED(CONFIG_KCSAN_IGNORE_ATOMICS)) { \ + check_access(ptr, bits / BITS_PER_BYTE, KCSAN_ACCESS_ATOMIC); \ + } \ + return __atomic_load_n(ptr, memorder); \ + } \ + EXPORT_SYMBOL(__tsan_atomic##bits##_load); \ + void __tsan_atomic##bits##_store(u##bits *ptr, u##bits v, int memorder); \ + void __tsan_atomic##bits##_store(u##bits *ptr, u##bits v, int memorder) \ + { \ + if (!IS_ENABLED(CONFIG_KCSAN_IGNORE_ATOMICS)) { \ + check_access(ptr, bits / BITS_PER_BYTE, \ + KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ATOMIC); \ + } \ + __atomic_store_n(ptr, v, memorder); \ + } \ + EXPORT_SYMBOL(__tsan_atomic##bits##_store) + +#define DEFINE_TSAN_ATOMIC_RMW(op, bits, suffix) \ + u##bits __tsan_atomic##bits##_##op(u##bits *ptr, u##bits v, int memorder); \ + u##bits __tsan_atomic##bits##_##op(u##bits *ptr, u##bits v, int memorder) \ + { \ + if (!IS_ENABLED(CONFIG_KCSAN_IGNORE_ATOMICS)) { \ + check_access(ptr, bits / BITS_PER_BYTE, \ + KCSAN_ACCESS_COMPOUND | KCSAN_ACCESS_WRITE | \ + KCSAN_ACCESS_ATOMIC); \ + } \ + return __atomic_##op##suffix(ptr, v, memorder); \ + } \ + EXPORT_SYMBOL(__tsan_atomic##bits##_##op) + +/* + * Note: CAS operations are always classified as write, even in case they + * fail. We cannot perform check_access() after a write, as it might lead to + * false positives, in cases such as: + * + * T0: __atomic_compare_exchange_n(&p->flag, &old, 1, ...) + * + * T1: if (__atomic_load_n(&p->flag, ...)) { + * modify *p; + * p->flag = 0; + * } + * + * The only downside is that, if there are 3 threads, with one CAS that + * succeeds, another CAS that fails, and an unmarked racing operation, we may + * point at the wrong CAS as the source of the race. However, if we assume that + * all CAS can succeed in some other execution, the data race is still valid. + */ +#define DEFINE_TSAN_ATOMIC_CMPXCHG(bits, strength, weak) \ + int __tsan_atomic##bits##_compare_exchange_##strength(u##bits *ptr, u##bits *exp, \ + u##bits val, int mo, int fail_mo); \ + int __tsan_atomic##bits##_compare_exchange_##strength(u##bits *ptr, u##bits *exp, \ + u##bits val, int mo, int fail_mo) \ + { \ + if (!IS_ENABLED(CONFIG_KCSAN_IGNORE_ATOMICS)) { \ + check_access(ptr, bits / BITS_PER_BYTE, \ + KCSAN_ACCESS_COMPOUND | KCSAN_ACCESS_WRITE | \ + KCSAN_ACCESS_ATOMIC); \ + } \ + return __atomic_compare_exchange_n(ptr, exp, val, weak, mo, fail_mo); \ + } \ + EXPORT_SYMBOL(__tsan_atomic##bits##_compare_exchange_##strength) + +#define DEFINE_TSAN_ATOMIC_CMPXCHG_VAL(bits) \ + u##bits __tsan_atomic##bits##_compare_exchange_val(u##bits *ptr, u##bits exp, u##bits val, \ + int mo, int fail_mo); \ + u##bits __tsan_atomic##bits##_compare_exchange_val(u##bits *ptr, u##bits exp, u##bits val, \ + int mo, int fail_mo) \ + { \ + if (!IS_ENABLED(CONFIG_KCSAN_IGNORE_ATOMICS)) { \ + check_access(ptr, bits / BITS_PER_BYTE, \ + KCSAN_ACCESS_COMPOUND | KCSAN_ACCESS_WRITE | \ + KCSAN_ACCESS_ATOMIC); \ + } \ + __atomic_compare_exchange_n(ptr, &exp, val, 0, mo, fail_mo); \ + return exp; \ + } \ + EXPORT_SYMBOL(__tsan_atomic##bits##_compare_exchange_val) + +#define DEFINE_TSAN_ATOMIC_OPS(bits) \ + DEFINE_TSAN_ATOMIC_LOAD_STORE(bits); \ + DEFINE_TSAN_ATOMIC_RMW(exchange, bits, _n); \ + DEFINE_TSAN_ATOMIC_RMW(fetch_add, bits, ); \ + DEFINE_TSAN_ATOMIC_RMW(fetch_sub, bits, ); \ + DEFINE_TSAN_ATOMIC_RMW(fetch_and, bits, ); \ + DEFINE_TSAN_ATOMIC_RMW(fetch_or, bits, ); \ + DEFINE_TSAN_ATOMIC_RMW(fetch_xor, bits, ); \ + DEFINE_TSAN_ATOMIC_RMW(fetch_nand, bits, ); \ + DEFINE_TSAN_ATOMIC_CMPXCHG(bits, strong, 0); \ + DEFINE_TSAN_ATOMIC_CMPXCHG(bits, weak, 1); \ + DEFINE_TSAN_ATOMIC_CMPXCHG_VAL(bits) + +DEFINE_TSAN_ATOMIC_OPS(8); +DEFINE_TSAN_ATOMIC_OPS(16); +DEFINE_TSAN_ATOMIC_OPS(32); +DEFINE_TSAN_ATOMIC_OPS(64); + +void __tsan_atomic_thread_fence(int memorder); +void __tsan_atomic_thread_fence(int memorder) +{ + __atomic_thread_fence(memorder); +} +EXPORT_SYMBOL(__tsan_atomic_thread_fence); + +void __tsan_atomic_signal_fence(int memorder); +void __tsan_atomic_signal_fence(int memorder) { } +EXPORT_SYMBOL(__tsan_atomic_signal_fence); diff --git a/kernel/kcsan/debugfs.c b/kernel/kcsan/debugfs.c index 023e49c58d55..3c8093a371b1 100644 --- a/kernel/kcsan/debugfs.c +++ b/kernel/kcsan/debugfs.c @@ -1,5 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 +#define pr_fmt(fmt) "kcsan: " fmt + #include <linux/atomic.h> #include <linux/bsearch.h> #include <linux/bug.h> @@ -15,10 +17,19 @@ #include "kcsan.h" -/* - * Statistics counters. - */ -static atomic_long_t counters[KCSAN_COUNTER_COUNT]; +atomic_long_t kcsan_counters[KCSAN_COUNTER_COUNT]; +static const char *const counter_names[] = { + [KCSAN_COUNTER_USED_WATCHPOINTS] = "used_watchpoints", + [KCSAN_COUNTER_SETUP_WATCHPOINTS] = "setup_watchpoints", + [KCSAN_COUNTER_DATA_RACES] = "data_races", + [KCSAN_COUNTER_ASSERT_FAILURES] = "assert_failures", + [KCSAN_COUNTER_NO_CAPACITY] = "no_capacity", + [KCSAN_COUNTER_REPORT_RACES] = "report_races", + [KCSAN_COUNTER_RACES_UNKNOWN_ORIGIN] = "races_unknown_origin", + [KCSAN_COUNTER_UNENCODABLE_ACCESSES] = "unencodable_accesses", + [KCSAN_COUNTER_ENCODING_FALSE_POSITIVES] = "encoding_false_positives", +}; +static_assert(ARRAY_SIZE(counter_names) == KCSAN_COUNTER_COUNT); /* * Addresses for filtering functions from reporting. This list can be used as a @@ -39,34 +50,6 @@ static struct { }; static DEFINE_SPINLOCK(report_filterlist_lock); -static const char *counter_to_name(enum kcsan_counter_id id) -{ - switch (id) { - case KCSAN_COUNTER_USED_WATCHPOINTS: return "used_watchpoints"; - case KCSAN_COUNTER_SETUP_WATCHPOINTS: return "setup_watchpoints"; - case KCSAN_COUNTER_DATA_RACES: return "data_races"; - case KCSAN_COUNTER_ASSERT_FAILURES: return "assert_failures"; - case KCSAN_COUNTER_NO_CAPACITY: return "no_capacity"; - case KCSAN_COUNTER_REPORT_RACES: return "report_races"; - case KCSAN_COUNTER_RACES_UNKNOWN_ORIGIN: return "races_unknown_origin"; - case KCSAN_COUNTER_UNENCODABLE_ACCESSES: return "unencodable_accesses"; - case KCSAN_COUNTER_ENCODING_FALSE_POSITIVES: return "encoding_false_positives"; - case KCSAN_COUNTER_COUNT: - BUG(); - } - return NULL; -} - -void kcsan_counter_inc(enum kcsan_counter_id id) -{ - atomic_long_inc(&counters[id]); -} - -void kcsan_counter_dec(enum kcsan_counter_id id) -{ - atomic_long_dec(&counters[id]); -} - /* * The microbenchmark allows benchmarking KCSAN core runtime only. To run * multiple threads, pipe 'microbench=<iters>' from multiple tasks into the @@ -86,7 +69,7 @@ static noinline void microbenchmark(unsigned long iters) */ WRITE_ONCE(kcsan_enabled, false); - pr_info("KCSAN: %s begin | iters: %lu\n", __func__, iters); + pr_info("%s begin | iters: %lu\n", __func__, iters); cycles = get_cycles(); while (iters--) { @@ -97,73 +80,13 @@ static noinline void microbenchmark(unsigned long iters) } cycles = get_cycles() - cycles; - pr_info("KCSAN: %s end | cycles: %llu\n", __func__, cycles); + pr_info("%s end | cycles: %llu\n", __func__, cycles); WRITE_ONCE(kcsan_enabled, was_enabled); /* restore context */ current->kcsan_ctx = ctx_save; } -/* - * Simple test to create conflicting accesses. Write 'test=<iters>' to KCSAN's - * debugfs file from multiple tasks to generate real conflicts and show reports. - */ -static long test_dummy; -static long test_flags; -static long test_scoped; -static noinline void test_thread(unsigned long iters) -{ - const long CHANGE_BITS = 0xff00ff00ff00ff00L; - const struct kcsan_ctx ctx_save = current->kcsan_ctx; - cycles_t cycles; - - /* We may have been called from an atomic region; reset context. */ - memset(¤t->kcsan_ctx, 0, sizeof(current->kcsan_ctx)); - - pr_info("KCSAN: %s begin | iters: %lu\n", __func__, iters); - pr_info("test_dummy@%px, test_flags@%px, test_scoped@%px,\n", - &test_dummy, &test_flags, &test_scoped); - - cycles = get_cycles(); - while (iters--) { - /* These all should generate reports. */ - __kcsan_check_read(&test_dummy, sizeof(test_dummy)); - ASSERT_EXCLUSIVE_WRITER(test_dummy); - ASSERT_EXCLUSIVE_ACCESS(test_dummy); - - ASSERT_EXCLUSIVE_BITS(test_flags, ~CHANGE_BITS); /* no report */ - __kcsan_check_read(&test_flags, sizeof(test_flags)); /* no report */ - - ASSERT_EXCLUSIVE_BITS(test_flags, CHANGE_BITS); /* report */ - __kcsan_check_read(&test_flags, sizeof(test_flags)); /* no report */ - - /* not actually instrumented */ - WRITE_ONCE(test_dummy, iters); /* to observe value-change */ - __kcsan_check_write(&test_dummy, sizeof(test_dummy)); - - test_flags ^= CHANGE_BITS; /* generate value-change */ - __kcsan_check_write(&test_flags, sizeof(test_flags)); - - BUG_ON(current->kcsan_ctx.scoped_accesses.prev); - { - /* Should generate reports anywhere in this block. */ - ASSERT_EXCLUSIVE_WRITER_SCOPED(test_scoped); - ASSERT_EXCLUSIVE_ACCESS_SCOPED(test_scoped); - BUG_ON(!current->kcsan_ctx.scoped_accesses.prev); - /* Unrelated accesses. */ - __kcsan_check_access(&cycles, sizeof(cycles), 0); - __kcsan_check_access(&cycles, sizeof(cycles), KCSAN_ACCESS_ATOMIC); - } - BUG_ON(current->kcsan_ctx.scoped_accesses.prev); - } - cycles = get_cycles() - cycles; - - pr_info("KCSAN: %s end | cycles: %llu\n", __func__, cycles); - - /* restore context */ - current->kcsan_ctx = ctx_save; -} - static int cmp_filterlist_addrs(const void *rhs, const void *lhs) { const unsigned long a = *(const unsigned long *)rhs; @@ -220,7 +143,7 @@ static ssize_t insert_report_filterlist(const char *func) ssize_t ret = 0; if (!addr) { - pr_err("KCSAN: could not find function: '%s'\n", func); + pr_err("could not find function: '%s'\n", func); return -ENOENT; } @@ -270,9 +193,10 @@ static int show_info(struct seq_file *file, void *v) /* show stats */ seq_printf(file, "enabled: %i\n", READ_ONCE(kcsan_enabled)); - for (i = 0; i < KCSAN_COUNTER_COUNT; ++i) - seq_printf(file, "%s: %ld\n", counter_to_name(i), - atomic_long_read(&counters[i])); + for (i = 0; i < KCSAN_COUNTER_COUNT; ++i) { + seq_printf(file, "%s: %ld\n", counter_names[i], + atomic_long_read(&kcsan_counters[i])); + } /* show filter functions, and filter type */ spin_lock_irqsave(&report_filterlist_lock, flags); @@ -307,18 +231,12 @@ debugfs_write(struct file *file, const char __user *buf, size_t count, loff_t *o WRITE_ONCE(kcsan_enabled, true); } else if (!strcmp(arg, "off")) { WRITE_ONCE(kcsan_enabled, false); - } else if (!strncmp(arg, "microbench=", sizeof("microbench=") - 1)) { + } else if (str_has_prefix(arg, "microbench=")) { unsigned long iters; - if (kstrtoul(&arg[sizeof("microbench=") - 1], 0, &iters)) + if (kstrtoul(&arg[strlen("microbench=")], 0, &iters)) return -EINVAL; microbenchmark(iters); - } else if (!strncmp(arg, "test=", sizeof("test=") - 1)) { - unsigned long iters; - - if (kstrtoul(&arg[sizeof("test=") - 1], 0, &iters)) - return -EINVAL; - test_thread(iters); } else if (!strcmp(arg, "whitelist")) { set_report_filterlist_whitelist(true); } else if (!strcmp(arg, "blacklist")) { diff --git a/kernel/kcsan/kcsan-test.c b/kernel/kcsan/kcsan-test.c index fed6fcb5768c..ebe7fd245104 100644 --- a/kernel/kcsan/kcsan-test.c +++ b/kernel/kcsan/kcsan-test.c @@ -27,6 +27,12 @@ #include <linux/types.h> #include <trace/events/printk.h> +#ifdef CONFIG_CC_HAS_TSAN_COMPOUND_READ_BEFORE_WRITE +#define __KCSAN_ACCESS_RW(alt) (KCSAN_ACCESS_COMPOUND | KCSAN_ACCESS_WRITE) +#else +#define __KCSAN_ACCESS_RW(alt) (alt) +#endif + /* Points to current test-case memory access "kernels". */ static void (*access_kernels[2])(void); @@ -186,20 +192,21 @@ static bool report_matches(const struct expect_report *r) /* Access 1 & 2 */ for (i = 0; i < 2; ++i) { + const int ty = r->access[i].type; const char *const access_type = - (r->access[i].type & KCSAN_ACCESS_ASSERT) ? - ((r->access[i].type & KCSAN_ACCESS_WRITE) ? - "assert no accesses" : - "assert no writes") : - ((r->access[i].type & KCSAN_ACCESS_WRITE) ? - "write" : - "read"); + (ty & KCSAN_ACCESS_ASSERT) ? + ((ty & KCSAN_ACCESS_WRITE) ? + "assert no accesses" : + "assert no writes") : + ((ty & KCSAN_ACCESS_WRITE) ? + ((ty & KCSAN_ACCESS_COMPOUND) ? + "read-write" : + "write") : + "read"); const char *const access_type_aux = - (r->access[i].type & KCSAN_ACCESS_ATOMIC) ? - " (marked)" : - ((r->access[i].type & KCSAN_ACCESS_SCOPED) ? - " (scoped)" : - ""); + (ty & KCSAN_ACCESS_ATOMIC) ? + " (marked)" : + ((ty & KCSAN_ACCESS_SCOPED) ? " (scoped)" : ""); if (i == 1) { /* Access 2 */ @@ -277,6 +284,12 @@ static noinline void test_kernel_write_atomic(void) WRITE_ONCE(test_var, READ_ONCE_NOCHECK(test_sink) + 1); } +static noinline void test_kernel_atomic_rmw(void) +{ + /* Use builtin, so we can set up the "bad" atomic/non-atomic scenario. */ + __atomic_fetch_add(&test_var, 1, __ATOMIC_RELAXED); +} + __no_kcsan static noinline void test_kernel_write_uninstrumented(void) { test_var++; } @@ -390,6 +403,15 @@ static noinline void test_kernel_seqlock_writer(void) write_sequnlock_irqrestore(&test_seqlock, flags); } +static noinline void test_kernel_atomic_builtins(void) +{ + /* + * Generate concurrent accesses, expecting no reports, ensuring KCSAN + * treats builtin atomics as actually atomic. + */ + __atomic_load_n(&test_var, __ATOMIC_RELAXED); +} + /* ===== Test cases ===== */ /* Simple test with normal data race. */ @@ -430,8 +452,8 @@ static void test_concurrent_races(struct kunit *test) const struct expect_report expect = { .access = { /* NULL will match any address. */ - { test_kernel_rmw_array, NULL, 0, KCSAN_ACCESS_WRITE }, - { test_kernel_rmw_array, NULL, 0, 0 }, + { test_kernel_rmw_array, NULL, 0, __KCSAN_ACCESS_RW(KCSAN_ACCESS_WRITE) }, + { test_kernel_rmw_array, NULL, 0, __KCSAN_ACCESS_RW(0) }, }, }; static const struct expect_report never = { @@ -620,6 +642,29 @@ static void test_read_plain_atomic_write(struct kunit *test) KUNIT_EXPECT_TRUE(test, match_expect); } +/* Test that atomic RMWs generate correct report. */ +__no_kcsan +static void test_read_plain_atomic_rmw(struct kunit *test) +{ + const struct expect_report expect = { + .access = { + { test_kernel_read, &test_var, sizeof(test_var), 0 }, + { test_kernel_atomic_rmw, &test_var, sizeof(test_var), + KCSAN_ACCESS_COMPOUND | KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ATOMIC }, + }, + }; + bool match_expect = false; + + if (IS_ENABLED(CONFIG_KCSAN_IGNORE_ATOMICS)) + return; + + begin_test_checks(test_kernel_read, test_kernel_atomic_rmw); + do { + match_expect = report_matches(&expect); + } while (!end_test_checks(match_expect)); + KUNIT_EXPECT_TRUE(test, match_expect); +} + /* Zero-sized accesses should never cause data race reports. */ __no_kcsan static void test_zero_size_access(struct kunit *test) @@ -853,6 +898,59 @@ static void test_seqlock_noreport(struct kunit *test) } /* + * Test atomic builtins work and required instrumentation functions exist. We + * also test that KCSAN understands they're atomic by racing with them via + * test_kernel_atomic_builtins(), and expect no reports. + * + * The atomic builtins _SHOULD NOT_ be used in normal kernel code! + */ +static void test_atomic_builtins(struct kunit *test) +{ + bool match_never = false; + + begin_test_checks(test_kernel_atomic_builtins, test_kernel_atomic_builtins); + do { + long tmp; + + kcsan_enable_current(); + + __atomic_store_n(&test_var, 42L, __ATOMIC_RELAXED); + KUNIT_EXPECT_EQ(test, 42L, __atomic_load_n(&test_var, __ATOMIC_RELAXED)); + + KUNIT_EXPECT_EQ(test, 42L, __atomic_exchange_n(&test_var, 20, __ATOMIC_RELAXED)); + KUNIT_EXPECT_EQ(test, 20L, test_var); + + tmp = 20L; + KUNIT_EXPECT_TRUE(test, __atomic_compare_exchange_n(&test_var, &tmp, 30L, + 0, __ATOMIC_RELAXED, + __ATOMIC_RELAXED)); + KUNIT_EXPECT_EQ(test, tmp, 20L); + KUNIT_EXPECT_EQ(test, test_var, 30L); + KUNIT_EXPECT_FALSE(test, __atomic_compare_exchange_n(&test_var, &tmp, 40L, + 1, __ATOMIC_RELAXED, + __ATOMIC_RELAXED)); + KUNIT_EXPECT_EQ(test, tmp, 30L); + KUNIT_EXPECT_EQ(test, test_var, 30L); + + KUNIT_EXPECT_EQ(test, 30L, __atomic_fetch_add(&test_var, 1, __ATOMIC_RELAXED)); + KUNIT_EXPECT_EQ(test, 31L, __atomic_fetch_sub(&test_var, 1, __ATOMIC_RELAXED)); + KUNIT_EXPECT_EQ(test, 30L, __atomic_fetch_and(&test_var, 0xf, __ATOMIC_RELAXED)); + KUNIT_EXPECT_EQ(test, 14L, __atomic_fetch_xor(&test_var, 0xf, __ATOMIC_RELAXED)); + KUNIT_EXPECT_EQ(test, 1L, __atomic_fetch_or(&test_var, 0xf0, __ATOMIC_RELAXED)); + KUNIT_EXPECT_EQ(test, 241L, __atomic_fetch_nand(&test_var, 0xf, __ATOMIC_RELAXED)); + KUNIT_EXPECT_EQ(test, -2L, test_var); + + __atomic_thread_fence(__ATOMIC_SEQ_CST); + __atomic_signal_fence(__ATOMIC_SEQ_CST); + + kcsan_disable_current(); + + match_never = report_available(); + } while (!end_test_checks(match_never)); + KUNIT_EXPECT_FALSE(test, match_never); +} + +/* * Each test case is run with different numbers of threads. Until KUnit supports * passing arguments for each test case, we encode #threads in the test case * name (read by get_num_threads()). [The '-' was chosen as a stylistic @@ -880,6 +978,7 @@ static struct kunit_case kcsan_test_cases[] = { KCSAN_KUNIT_CASE(test_write_write_struct_part), KCSAN_KUNIT_CASE(test_read_atomic_write_atomic), KCSAN_KUNIT_CASE(test_read_plain_atomic_write), + KCSAN_KUNIT_CASE(test_read_plain_atomic_rmw), KCSAN_KUNIT_CASE(test_zero_size_access), KCSAN_KUNIT_CASE(test_data_race), KCSAN_KUNIT_CASE(test_assert_exclusive_writer), @@ -891,6 +990,7 @@ static struct kunit_case kcsan_test_cases[] = { KCSAN_KUNIT_CASE(test_assert_exclusive_access_scoped), KCSAN_KUNIT_CASE(test_jiffies_noreport), KCSAN_KUNIT_CASE(test_seqlock_noreport), + KCSAN_KUNIT_CASE(test_atomic_builtins), {}, }; diff --git a/kernel/kcsan/kcsan.h b/kernel/kcsan/kcsan.h index 29480010dc30..8d4bf3431b3c 100644 --- a/kernel/kcsan/kcsan.h +++ b/kernel/kcsan/kcsan.h @@ -8,6 +8,7 @@ #ifndef _KERNEL_KCSAN_KCSAN_H #define _KERNEL_KCSAN_KCSAN_H +#include <linux/atomic.h> #include <linux/kcsan.h> #include <linux/sched.h> @@ -34,6 +35,10 @@ void kcsan_restore_irqtrace(struct task_struct *task); */ void kcsan_debugfs_init(void); +/* + * Statistics counters displayed via debugfs; should only be modified in + * slow-paths. + */ enum kcsan_counter_id { /* * Number of watchpoints currently in use. @@ -86,12 +91,7 @@ enum kcsan_counter_id { KCSAN_COUNTER_COUNT, /* number of counters */ }; - -/* - * Increment/decrement counter with given id; avoid calling these in fast-path. - */ -extern void kcsan_counter_inc(enum kcsan_counter_id id); -extern void kcsan_counter_dec(enum kcsan_counter_id id); +extern atomic_long_t kcsan_counters[KCSAN_COUNTER_COUNT]; /* * Returns true if data races in the function symbol that maps to func_addr diff --git a/kernel/kcsan/report.c b/kernel/kcsan/report.c index 9d07e175de0f..d3bf87e6007c 100644 --- a/kernel/kcsan/report.c +++ b/kernel/kcsan/report.c @@ -228,6 +228,10 @@ static const char *get_access_type(int type) return "write"; case KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ATOMIC: return "write (marked)"; + case KCSAN_ACCESS_COMPOUND | KCSAN_ACCESS_WRITE: + return "read-write"; + case KCSAN_ACCESS_COMPOUND | KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ATOMIC: + return "read-write (marked)"; case KCSAN_ACCESS_SCOPED: return "read (scoped)"; case KCSAN_ACCESS_SCOPED | KCSAN_ACCESS_ATOMIC: @@ -275,8 +279,8 @@ static int get_stack_skipnr(const unsigned long stack_entries[], int num_entries cur = strnstr(buf, "kcsan_", len); if (cur) { - cur += sizeof("kcsan_") - 1; - if (strncmp(cur, "test", sizeof("test") - 1)) + cur += strlen("kcsan_"); + if (!str_has_prefix(cur, "test")) continue; /* KCSAN runtime function. */ /* KCSAN related test. */ } @@ -555,7 +559,7 @@ static bool prepare_report_consumer(unsigned long *flags, * If the actual accesses to not match, this was a false * positive due to watchpoint encoding. */ - kcsan_counter_inc(KCSAN_COUNTER_ENCODING_FALSE_POSITIVES); + atomic_long_inc(&kcsan_counters[KCSAN_COUNTER_ENCODING_FALSE_POSITIVES]); goto discard; } diff --git a/kernel/kcsan/selftest.c b/kernel/kcsan/selftest.c index d26a052d3383..d98bc208d06d 100644 --- a/kernel/kcsan/selftest.c +++ b/kernel/kcsan/selftest.c @@ -1,5 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 +#define pr_fmt(fmt) "kcsan: " fmt + #include <linux/init.h> #include <linux/kernel.h> #include <linux/printk.h> @@ -116,16 +118,16 @@ static int __init kcsan_selftest(void) if (do_test()) \ ++passed; \ else \ - pr_err("KCSAN selftest: " #do_test " failed"); \ + pr_err("selftest: " #do_test " failed"); \ } while (0) RUN_TEST(test_requires); RUN_TEST(test_encode_decode); RUN_TEST(test_matching_access); - pr_info("KCSAN selftest: %d/%d tests passed\n", passed, total); + pr_info("selftest: %d/%d tests passed\n", passed, total); if (passed != total) - panic("KCSAN selftests failed"); + panic("selftests failed"); return 0; } postcore_initcall(kcsan_selftest); diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 287b263c9cb9..789002da7766 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -36,6 +36,7 @@ #include <linux/cpu.h> #include <linux/jump_label.h> #include <linux/perf_event.h> +#include <linux/static_call.h> #include <asm/sections.h> #include <asm/cacheflush.h> @@ -1223,8 +1224,7 @@ void kprobes_inc_nmissed_count(struct kprobe *p) } NOKPROBE_SYMBOL(kprobes_inc_nmissed_count); -void recycle_rp_inst(struct kretprobe_instance *ri, - struct hlist_head *head) +static void recycle_rp_inst(struct kretprobe_instance *ri) { struct kretprobe *rp = ri->rp; @@ -1236,12 +1236,11 @@ void recycle_rp_inst(struct kretprobe_instance *ri, hlist_add_head(&ri->hlist, &rp->free_instances); raw_spin_unlock(&rp->lock); } else - /* Unregistering */ - hlist_add_head(&ri->hlist, head); + kfree_rcu(ri, rcu); } NOKPROBE_SYMBOL(recycle_rp_inst); -void kretprobe_hash_lock(struct task_struct *tsk, +static void kretprobe_hash_lock(struct task_struct *tsk, struct hlist_head **head, unsigned long *flags) __acquires(hlist_lock) { @@ -1263,7 +1262,7 @@ __acquires(hlist_lock) } NOKPROBE_SYMBOL(kretprobe_table_lock); -void kretprobe_hash_unlock(struct task_struct *tsk, +static void kretprobe_hash_unlock(struct task_struct *tsk, unsigned long *flags) __releases(hlist_lock) { @@ -1284,7 +1283,7 @@ __releases(hlist_lock) } NOKPROBE_SYMBOL(kretprobe_table_unlock); -struct kprobe kprobe_busy = { +static struct kprobe kprobe_busy = { .addr = (void *) get_kprobe, }; @@ -1313,7 +1312,7 @@ void kprobe_busy_end(void) void kprobe_flush_task(struct task_struct *tk) { struct kretprobe_instance *ri; - struct hlist_head *head, empty_rp; + struct hlist_head *head; struct hlist_node *tmp; unsigned long hash, flags = 0; @@ -1323,19 +1322,14 @@ void kprobe_flush_task(struct task_struct *tk) kprobe_busy_begin(); - INIT_HLIST_HEAD(&empty_rp); hash = hash_ptr(tk, KPROBE_HASH_BITS); head = &kretprobe_inst_table[hash]; kretprobe_table_lock(hash, &flags); hlist_for_each_entry_safe(ri, tmp, head, hlist) { if (ri->task == tk) - recycle_rp_inst(ri, &empty_rp); + recycle_rp_inst(ri); } kretprobe_table_unlock(hash, &flags); - hlist_for_each_entry_safe(ri, tmp, &empty_rp, hlist) { - hlist_del(&ri->hlist); - kfree(ri); - } kprobe_busy_end(); } @@ -1359,7 +1353,8 @@ static void cleanup_rp_inst(struct kretprobe *rp) struct hlist_node *next; struct hlist_head *head; - /* No race here */ + /* To avoid recursive kretprobe by NMI, set kprobe busy here */ + kprobe_busy_begin(); for (hash = 0; hash < KPROBE_TABLE_SIZE; hash++) { kretprobe_table_lock(hash, &flags); head = &kretprobe_inst_table[hash]; @@ -1369,6 +1364,8 @@ static void cleanup_rp_inst(struct kretprobe *rp) } kretprobe_table_unlock(hash, &flags); } + kprobe_busy_end(); + free_rp_inst(rp); } NOKPROBE_SYMBOL(cleanup_rp_inst); @@ -1634,6 +1631,7 @@ static int check_kprobe_address_safe(struct kprobe *p, if (!kernel_text_address((unsigned long) p->addr) || within_kprobe_blacklist((unsigned long) p->addr) || jump_label_text_reserved(p->addr, p->addr) || + static_call_text_reserved(p->addr, p->addr) || find_bug((unsigned long)p->addr)) { ret = -EINVAL; goto out; @@ -1927,6 +1925,97 @@ unsigned long __weak arch_deref_entry_point(void *entry) } #ifdef CONFIG_KRETPROBES + +unsigned long __kretprobe_trampoline_handler(struct pt_regs *regs, + void *trampoline_address, + void *frame_pointer) +{ + struct kretprobe_instance *ri = NULL, *last = NULL; + struct hlist_head *head; + struct hlist_node *tmp; + unsigned long flags; + kprobe_opcode_t *correct_ret_addr = NULL; + bool skipped = false; + + kretprobe_hash_lock(current, &head, &flags); + + /* + * It is possible to have multiple instances associated with a given + * task either because multiple functions in the call path have + * return probes installed on them, and/or more than one + * return probe was registered for a target function. + * + * We can handle this because: + * - instances are always pushed into the head of the list + * - when multiple return probes are registered for the same + * function, the (chronologically) first instance's ret_addr + * will be the real return address, and all the rest will + * point to kretprobe_trampoline. + */ + hlist_for_each_entry(ri, head, hlist) { + if (ri->task != current) + /* another task is sharing our hash bucket */ + continue; + /* + * Return probes must be pushed on this hash list correct + * order (same as return order) so that it can be popped + * correctly. However, if we find it is pushed it incorrect + * order, this means we find a function which should not be + * probed, because the wrong order entry is pushed on the + * path of processing other kretprobe itself. + */ + if (ri->fp != frame_pointer) { + if (!skipped) + pr_warn("kretprobe is stacked incorrectly. Trying to fixup.\n"); + skipped = true; + continue; + } + + correct_ret_addr = ri->ret_addr; + if (skipped) + pr_warn("%ps must be blacklisted because of incorrect kretprobe order\n", + ri->rp->kp.addr); + + if (correct_ret_addr != trampoline_address) + /* + * This is the real return address. Any other + * instances associated with this task are for + * other calls deeper on the call stack + */ + break; + } + + BUG_ON(!correct_ret_addr || (correct_ret_addr == trampoline_address)); + last = ri; + + hlist_for_each_entry_safe(ri, tmp, head, hlist) { + if (ri->task != current) + /* another task is sharing our hash bucket */ + continue; + if (ri->fp != frame_pointer) + continue; + + if (ri->rp && ri->rp->handler) { + struct kprobe *prev = kprobe_running(); + + __this_cpu_write(current_kprobe, &ri->rp->kp); + ri->ret_addr = correct_ret_addr; + ri->rp->handler(ri, regs); + __this_cpu_write(current_kprobe, prev); + } + + recycle_rp_inst(ri); + + if (ri == last) + break; + } + + kretprobe_hash_unlock(current, &flags); + + return (unsigned long)correct_ret_addr; +} +NOKPROBE_SYMBOL(__kretprobe_trampoline_handler) + /* * This kprobe pre_handler is registered with every kretprobe. When probe * hits it will set up the return probe. @@ -1937,17 +2026,6 @@ static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs) unsigned long hash, flags = 0; struct kretprobe_instance *ri; - /* - * To avoid deadlocks, prohibit return probing in NMI contexts, - * just skip the probe and increase the (inexact) 'nmissed' - * statistical counter, so that the user is informed that - * something happened: - */ - if (unlikely(in_nmi())) { - rp->nmissed++; - return 0; - } - /* TODO: consider to only swap the RA after the last pre_handler fired */ hash = hash_ptr(current, KPROBE_HASH_BITS); raw_spin_lock_irqsave(&rp->lock, flags); @@ -2140,6 +2218,9 @@ static void kill_kprobe(struct kprobe *p) lockdep_assert_held(&kprobe_mutex); + if (WARN_ON_ONCE(kprobe_gone(p))) + return; + p->flags |= KPROBE_FLAG_GONE; if (kprobe_aggrprobe(p)) { /* @@ -2159,9 +2240,10 @@ static void kill_kprobe(struct kprobe *p) /* * The module is going away. We should disarm the kprobe which - * is using ftrace. + * is using ftrace, because ftrace framework is still available at + * MODULE_STATE_GOING notification. */ - if (kprobe_ftrace(p)) + if (kprobe_ftrace(p) && !kprobe_disabled(p) && !kprobes_all_disarmed) disarm_kprobe_ftrace(p); } @@ -2419,7 +2501,10 @@ static int kprobes_module_callback(struct notifier_block *nb, mutex_lock(&kprobe_mutex); for (i = 0; i < KPROBE_TABLE_SIZE; i++) { head = &kprobe_table[i]; - hlist_for_each_entry(p, head, hlist) + hlist_for_each_entry(p, head, hlist) { + if (kprobe_gone(p)) + continue; + if (within_module_init((unsigned long)p->addr, mod) || (checkcore && within_module_core((unsigned long)p->addr, mod))) { @@ -2436,6 +2521,7 @@ static int kprobes_module_callback(struct notifier_block *nb, */ kill_kprobe(p); } + } } if (val == MODULE_STATE_GOING) remove_module_kprobe_blacklist(mod); @@ -2452,6 +2538,28 @@ static struct notifier_block kprobe_module_nb = { extern unsigned long __start_kprobe_blacklist[]; extern unsigned long __stop_kprobe_blacklist[]; +void kprobe_free_init_mem(void) +{ + void *start = (void *)(&__init_begin); + void *end = (void *)(&__init_end); + struct hlist_head *head; + struct kprobe *p; + int i; + + mutex_lock(&kprobe_mutex); + + /* Kill all kprobes on initmem */ + for (i = 0; i < KPROBE_TABLE_SIZE; i++) { + head = &kprobe_table[i]; + hlist_for_each_entry(p, head, hlist) { + if (start <= (void *)p->addr && (void *)p->addr < end) + kill_kprobe(p); + } + } + + mutex_unlock(&kprobe_mutex); +} + static int __init init_kprobes(void) { int i, err = 0; diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 54b74fabf40c..3e99dfef8408 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -76,6 +76,23 @@ module_param(lock_stat, int, 0644); #define lock_stat 0 #endif +DEFINE_PER_CPU(unsigned int, lockdep_recursion); +EXPORT_PER_CPU_SYMBOL_GPL(lockdep_recursion); + +static inline bool lockdep_enabled(void) +{ + if (!debug_locks) + return false; + + if (raw_cpu_read(lockdep_recursion)) + return false; + + if (current->lockdep_recursion) + return false; + + return true; +} + /* * lockdep_lock: protects the lockdep graph, the hashes and the * class/list/hash allocators. @@ -93,7 +110,7 @@ static inline void lockdep_lock(void) arch_spin_lock(&__lock); __owner = current; - current->lockdep_recursion++; + __this_cpu_inc(lockdep_recursion); } static inline void lockdep_unlock(void) @@ -101,7 +118,7 @@ static inline void lockdep_unlock(void) if (debug_locks && DEBUG_LOCKS_WARN_ON(__owner != current)) return; - current->lockdep_recursion--; + __this_cpu_dec(lockdep_recursion); __owner = NULL; arch_spin_unlock(&__lock); } @@ -372,6 +389,21 @@ static struct hlist_head classhash_table[CLASSHASH_SIZE]; static struct hlist_head chainhash_table[CHAINHASH_SIZE]; /* + * the id of held_lock + */ +static inline u16 hlock_id(struct held_lock *hlock) +{ + BUILD_BUG_ON(MAX_LOCKDEP_KEYS_BITS + 2 > 16); + + return (hlock->class_idx | (hlock->read << MAX_LOCKDEP_KEYS_BITS)); +} + +static inline unsigned int chain_hlock_class_idx(u16 hlock_id) +{ + return hlock_id & (MAX_LOCKDEP_KEYS - 1); +} + +/* * The hash key of the lock dependency chains is a hash itself too: * it's a hash of all locks taken up to that lock, including that lock. * It's a 64-bit hash, because it's important for the keys to be @@ -393,10 +425,15 @@ void lockdep_init_task(struct task_struct *task) task->lockdep_recursion = 0; } +static __always_inline void lockdep_recursion_inc(void) +{ + __this_cpu_inc(lockdep_recursion); +} + static __always_inline void lockdep_recursion_finish(void) { - if (WARN_ON_ONCE((--current->lockdep_recursion) & LOCKDEP_RECURSION_MASK)) - current->lockdep_recursion = 0; + if (WARN_ON_ONCE(__this_cpu_dec_return(lockdep_recursion))) + __this_cpu_write(lockdep_recursion, 0); } void lockdep_set_selftest_task(struct task_struct *task) @@ -585,6 +622,8 @@ static const char *usage_str[] = #include "lockdep_states.h" #undef LOCKDEP_STATE [LOCK_USED] = "INITIAL USE", + [LOCK_USED_READ] = "INITIAL READ USE", + /* abused as string storage for verify_lock_unused() */ [LOCK_USAGE_STATES] = "IN-NMI", }; #endif @@ -1320,7 +1359,7 @@ static struct lock_list *alloc_list_entry(void) */ static int add_lock_to_list(struct lock_class *this, struct lock_class *links_to, struct list_head *head, - unsigned long ip, int distance, + unsigned long ip, u16 distance, u8 dep, const struct lock_trace *trace) { struct lock_list *entry; @@ -1334,6 +1373,7 @@ static int add_lock_to_list(struct lock_class *this, entry->class = this; entry->links_to = links_to; + entry->dep = dep; entry->distance = distance; entry->trace = trace; /* @@ -1421,23 +1461,19 @@ static inline unsigned int __cq_get_elem_count(struct circular_queue *cq) return (cq->rear - cq->front) & CQ_MASK; } -static inline void mark_lock_accessed(struct lock_list *lock, - struct lock_list *parent) +static inline void mark_lock_accessed(struct lock_list *lock) { - unsigned long nr; + lock->class->dep_gen_id = lockdep_dependency_gen_id; +} - nr = lock - list_entries; - WARN_ON(nr >= ARRAY_SIZE(list_entries)); /* Out-of-bounds, input fail */ +static inline void visit_lock_entry(struct lock_list *lock, + struct lock_list *parent) +{ lock->parent = parent; - lock->class->dep_gen_id = lockdep_dependency_gen_id; } static inline unsigned long lock_accessed(struct lock_list *lock) { - unsigned long nr; - - nr = lock - list_entries; - WARN_ON(nr >= ARRAY_SIZE(list_entries)); /* Out-of-bounds, input fail */ return lock->class->dep_gen_id == lockdep_dependency_gen_id; } @@ -1471,85 +1507,283 @@ static inline struct list_head *get_dep_list(struct lock_list *lock, int offset) return lock_class + offset; } +/* + * Return values of a bfs search: + * + * BFS_E* indicates an error + * BFS_R* indicates a result (match or not) + * + * BFS_EINVALIDNODE: Find a invalid node in the graph. + * + * BFS_EQUEUEFULL: The queue is full while doing the bfs. + * + * BFS_RMATCH: Find the matched node in the graph, and put that node into + * *@target_entry. + * + * BFS_RNOMATCH: Haven't found the matched node and keep *@target_entry + * _unchanged_. + */ +enum bfs_result { + BFS_EINVALIDNODE = -2, + BFS_EQUEUEFULL = -1, + BFS_RMATCH = 0, + BFS_RNOMATCH = 1, +}; /* - * Forward- or backward-dependency search, used for both circular dependency - * checking and hardirq-unsafe/softirq-unsafe checking. + * bfs_result < 0 means error + */ +static inline bool bfs_error(enum bfs_result res) +{ + return res < 0; +} + +/* + * DEP_*_BIT in lock_list::dep + * + * For dependency @prev -> @next: + * + * SR: @prev is shared reader (->read != 0) and @next is recursive reader + * (->read == 2) + * ER: @prev is exclusive locker (->read == 0) and @next is recursive reader + * SN: @prev is shared reader and @next is non-recursive locker (->read != 2) + * EN: @prev is exclusive locker and @next is non-recursive locker + * + * Note that we define the value of DEP_*_BITs so that: + * bit0 is prev->read == 0 + * bit1 is next->read != 2 */ -static int __bfs(struct lock_list *source_entry, - void *data, - int (*match)(struct lock_list *entry, void *data), - struct lock_list **target_entry, - int offset) +#define DEP_SR_BIT (0 + (0 << 1)) /* 0 */ +#define DEP_ER_BIT (1 + (0 << 1)) /* 1 */ +#define DEP_SN_BIT (0 + (1 << 1)) /* 2 */ +#define DEP_EN_BIT (1 + (1 << 1)) /* 3 */ + +#define DEP_SR_MASK (1U << (DEP_SR_BIT)) +#define DEP_ER_MASK (1U << (DEP_ER_BIT)) +#define DEP_SN_MASK (1U << (DEP_SN_BIT)) +#define DEP_EN_MASK (1U << (DEP_EN_BIT)) + +static inline unsigned int +__calc_dep_bit(struct held_lock *prev, struct held_lock *next) +{ + return (prev->read == 0) + ((next->read != 2) << 1); +} + +static inline u8 calc_dep(struct held_lock *prev, struct held_lock *next) { + return 1U << __calc_dep_bit(prev, next); +} + +/* + * calculate the dep_bit for backwards edges. We care about whether @prev is + * shared and whether @next is recursive. + */ +static inline unsigned int +__calc_dep_bitb(struct held_lock *prev, struct held_lock *next) +{ + return (next->read != 2) + ((prev->read == 0) << 1); +} + +static inline u8 calc_depb(struct held_lock *prev, struct held_lock *next) +{ + return 1U << __calc_dep_bitb(prev, next); +} + +/* + * Initialize a lock_list entry @lock belonging to @class as the root for a BFS + * search. + */ +static inline void __bfs_init_root(struct lock_list *lock, + struct lock_class *class) +{ + lock->class = class; + lock->parent = NULL; + lock->only_xr = 0; +} + +/* + * Initialize a lock_list entry @lock based on a lock acquisition @hlock as the + * root for a BFS search. + * + * ->only_xr of the initial lock node is set to @hlock->read == 2, to make sure + * that <prev> -> @hlock and @hlock -> <whatever __bfs() found> is not -(*R)-> + * and -(S*)->. + */ +static inline void bfs_init_root(struct lock_list *lock, + struct held_lock *hlock) +{ + __bfs_init_root(lock, hlock_class(hlock)); + lock->only_xr = (hlock->read == 2); +} + +/* + * Similar to bfs_init_root() but initialize the root for backwards BFS. + * + * ->only_xr of the initial lock node is set to @hlock->read != 0, to make sure + * that <next> -> @hlock and @hlock -> <whatever backwards BFS found> is not + * -(*S)-> and -(R*)-> (reverse order of -(*R)-> and -(S*)->). + */ +static inline void bfs_init_rootb(struct lock_list *lock, + struct held_lock *hlock) +{ + __bfs_init_root(lock, hlock_class(hlock)); + lock->only_xr = (hlock->read != 0); +} + +static inline struct lock_list *__bfs_next(struct lock_list *lock, int offset) +{ + if (!lock || !lock->parent) + return NULL; + + return list_next_or_null_rcu(get_dep_list(lock->parent, offset), + &lock->entry, struct lock_list, entry); +} + +/* + * Breadth-First Search to find a strong path in the dependency graph. + * + * @source_entry: the source of the path we are searching for. + * @data: data used for the second parameter of @match function + * @match: match function for the search + * @target_entry: pointer to the target of a matched path + * @offset: the offset to struct lock_class to determine whether it is + * locks_after or locks_before + * + * We may have multiple edges (considering different kinds of dependencies, + * e.g. ER and SN) between two nodes in the dependency graph. But + * only the strong dependency path in the graph is relevant to deadlocks. A + * strong dependency path is a dependency path that doesn't have two adjacent + * dependencies as -(*R)-> -(S*)->, please see: + * + * Documentation/locking/lockdep-design.rst + * + * for more explanation of the definition of strong dependency paths + * + * In __bfs(), we only traverse in the strong dependency path: + * + * In lock_list::only_xr, we record whether the previous dependency only + * has -(*R)-> in the search, and if it does (prev only has -(*R)->), we + * filter out any -(S*)-> in the current dependency and after that, the + * ->only_xr is set according to whether we only have -(*R)-> left. + */ +static enum bfs_result __bfs(struct lock_list *source_entry, + void *data, + bool (*match)(struct lock_list *entry, void *data), + struct lock_list **target_entry, + int offset) +{ + struct circular_queue *cq = &lock_cq; + struct lock_list *lock = NULL; struct lock_list *entry; - struct lock_list *lock; struct list_head *head; - struct circular_queue *cq = &lock_cq; - int ret = 1; + unsigned int cq_depth; + bool first; lockdep_assert_locked(); - if (match(source_entry, data)) { - *target_entry = source_entry; - ret = 0; - goto exit; - } - - head = get_dep_list(source_entry, offset); - if (list_empty(head)) - goto exit; - __cq_init(cq); __cq_enqueue(cq, source_entry); - while ((lock = __cq_dequeue(cq))) { + while ((lock = __bfs_next(lock, offset)) || (lock = __cq_dequeue(cq))) { + if (!lock->class) + return BFS_EINVALIDNODE; + + /* + * Step 1: check whether we already finish on this one. + * + * If we have visited all the dependencies from this @lock to + * others (iow, if we have visited all lock_list entries in + * @lock->class->locks_{after,before}) we skip, otherwise go + * and visit all the dependencies in the list and mark this + * list accessed. + */ + if (lock_accessed(lock)) + continue; + else + mark_lock_accessed(lock); + + /* + * Step 2: check whether prev dependency and this form a strong + * dependency path. + */ + if (lock->parent) { /* Parent exists, check prev dependency */ + u8 dep = lock->dep; + bool prev_only_xr = lock->parent->only_xr; + + /* + * Mask out all -(S*)-> if we only have *R in previous + * step, because -(*R)-> -(S*)-> don't make up a strong + * dependency. + */ + if (prev_only_xr) + dep &= ~(DEP_SR_MASK | DEP_SN_MASK); - if (!lock->class) { - ret = -2; - goto exit; + /* If nothing left, we skip */ + if (!dep) + continue; + + /* If there are only -(*R)-> left, set that for the next step */ + lock->only_xr = !(dep & (DEP_SN_MASK | DEP_EN_MASK)); } - head = get_dep_list(lock, offset); + /* + * Step 3: we haven't visited this and there is a strong + * dependency path to this, so check with @match. + */ + if (match(lock, data)) { + *target_entry = lock; + return BFS_RMATCH; + } + /* + * Step 4: if not match, expand the path by adding the + * forward or backwards dependencis in the search + * + */ + first = true; + head = get_dep_list(lock, offset); list_for_each_entry_rcu(entry, head, entry) { - if (!lock_accessed(entry)) { - unsigned int cq_depth; - mark_lock_accessed(entry, lock); - if (match(entry, data)) { - *target_entry = entry; - ret = 0; - goto exit; - } + visit_lock_entry(entry, lock); - if (__cq_enqueue(cq, entry)) { - ret = -1; - goto exit; - } - cq_depth = __cq_get_elem_count(cq); - if (max_bfs_queue_depth < cq_depth) - max_bfs_queue_depth = cq_depth; - } + /* + * Note we only enqueue the first of the list into the + * queue, because we can always find a sibling + * dependency from one (see __bfs_next()), as a result + * the space of queue is saved. + */ + if (!first) + continue; + + first = false; + + if (__cq_enqueue(cq, entry)) + return BFS_EQUEUEFULL; + + cq_depth = __cq_get_elem_count(cq); + if (max_bfs_queue_depth < cq_depth) + max_bfs_queue_depth = cq_depth; } } -exit: - return ret; + + return BFS_RNOMATCH; } -static inline int __bfs_forwards(struct lock_list *src_entry, - void *data, - int (*match)(struct lock_list *entry, void *data), - struct lock_list **target_entry) +static inline enum bfs_result +__bfs_forwards(struct lock_list *src_entry, + void *data, + bool (*match)(struct lock_list *entry, void *data), + struct lock_list **target_entry) { return __bfs(src_entry, data, match, target_entry, offsetof(struct lock_class, locks_after)); } -static inline int __bfs_backwards(struct lock_list *src_entry, - void *data, - int (*match)(struct lock_list *entry, void *data), - struct lock_list **target_entry) +static inline enum bfs_result +__bfs_backwards(struct lock_list *src_entry, + void *data, + bool (*match)(struct lock_list *entry, void *data), + struct lock_list **target_entry) { return __bfs(src_entry, data, match, target_entry, offsetof(struct lock_class, locks_before)); @@ -1659,15 +1893,72 @@ print_circular_bug_header(struct lock_list *entry, unsigned int depth, print_circular_bug_entry(entry, depth); } -static inline int class_equal(struct lock_list *entry, void *data) +/* + * We are about to add A -> B into the dependency graph, and in __bfs() a + * strong dependency path A -> .. -> B is found: hlock_class equals + * entry->class. + * + * If A -> .. -> B can replace A -> B in any __bfs() search (means the former + * is _stronger_ than or equal to the latter), we consider A -> B as redundant. + * For example if A -> .. -> B is -(EN)-> (i.e. A -(E*)-> .. -(*N)-> B), and A + * -> B is -(ER)-> or -(EN)->, then we don't need to add A -> B into the + * dependency graph, as any strong path ..-> A -> B ->.. we can get with + * having dependency A -> B, we could already get a equivalent path ..-> A -> + * .. -> B -> .. with A -> .. -> B. Therefore A -> B is reduntant. + * + * We need to make sure both the start and the end of A -> .. -> B is not + * weaker than A -> B. For the start part, please see the comment in + * check_redundant(). For the end part, we need: + * + * Either + * + * a) A -> B is -(*R)-> (everything is not weaker than that) + * + * or + * + * b) A -> .. -> B is -(*N)-> (nothing is stronger than this) + * + */ +static inline bool hlock_equal(struct lock_list *entry, void *data) +{ + struct held_lock *hlock = (struct held_lock *)data; + + return hlock_class(hlock) == entry->class && /* Found A -> .. -> B */ + (hlock->read == 2 || /* A -> B is -(*R)-> */ + !entry->only_xr); /* A -> .. -> B is -(*N)-> */ +} + +/* + * We are about to add B -> A into the dependency graph, and in __bfs() a + * strong dependency path A -> .. -> B is found: hlock_class equals + * entry->class. + * + * We will have a deadlock case (conflict) if A -> .. -> B -> A is a strong + * dependency cycle, that means: + * + * Either + * + * a) B -> A is -(E*)-> + * + * or + * + * b) A -> .. -> B is -(*N)-> (i.e. A -> .. -(*N)-> B) + * + * as then we don't have -(*R)-> -(S*)-> in the cycle. + */ +static inline bool hlock_conflict(struct lock_list *entry, void *data) { - return entry->class == data; + struct held_lock *hlock = (struct held_lock *)data; + + return hlock_class(hlock) == entry->class && /* Found A -> .. -> B */ + (hlock->read == 0 || /* B -> A is -(E*)-> */ + !entry->only_xr); /* A -> .. -> B is -(*N)-> */ } static noinline void print_circular_bug(struct lock_list *this, - struct lock_list *target, - struct held_lock *check_src, - struct held_lock *check_tgt) + struct lock_list *target, + struct held_lock *check_src, + struct held_lock *check_tgt) { struct task_struct *curr = current; struct lock_list *parent; @@ -1714,10 +2005,10 @@ static noinline void print_bfs_bug(int ret) WARN(1, "lockdep bfs error:%d\n", ret); } -static int noop_count(struct lock_list *entry, void *data) +static bool noop_count(struct lock_list *entry, void *data) { (*(unsigned long *)data)++; - return 0; + return false; } static unsigned long __lockdep_count_forward_deps(struct lock_list *this) @@ -1734,8 +2025,7 @@ unsigned long lockdep_count_forward_deps(struct lock_class *class) unsigned long ret, flags; struct lock_list this; - this.parent = NULL; - this.class = class; + __bfs_init_root(&this, class); raw_local_irq_save(flags); lockdep_lock(); @@ -1761,8 +2051,7 @@ unsigned long lockdep_count_backward_deps(struct lock_class *class) unsigned long ret, flags; struct lock_list this; - this.parent = NULL; - this.class = class; + __bfs_init_root(&this, class); raw_local_irq_save(flags); lockdep_lock(); @@ -1775,18 +2064,18 @@ unsigned long lockdep_count_backward_deps(struct lock_class *class) /* * Check that the dependency graph starting at <src> can lead to - * <target> or not. Print an error and return 0 if it does. + * <target> or not. */ -static noinline int -check_path(struct lock_class *target, struct lock_list *src_entry, +static noinline enum bfs_result +check_path(struct held_lock *target, struct lock_list *src_entry, + bool (*match)(struct lock_list *entry, void *data), struct lock_list **target_entry) { - int ret; + enum bfs_result ret; - ret = __bfs_forwards(src_entry, (void *)target, class_equal, - target_entry); + ret = __bfs_forwards(src_entry, target, match, target_entry); - if (unlikely(ret < 0)) + if (unlikely(bfs_error(ret))) print_bfs_bug(ret); return ret; @@ -1797,24 +2086,23 @@ check_path(struct lock_class *target, struct lock_list *src_entry, * lead to <target>. If it can, there is a circle when adding * <target> -> <src> dependency. * - * Print an error and return 0 if it does. + * Print an error and return BFS_RMATCH if it does. */ -static noinline int +static noinline enum bfs_result check_noncircular(struct held_lock *src, struct held_lock *target, struct lock_trace **const trace) { - int ret; + enum bfs_result ret; struct lock_list *target_entry; - struct lock_list src_entry = { - .class = hlock_class(src), - .parent = NULL, - }; + struct lock_list src_entry; + + bfs_init_root(&src_entry, src); debug_atomic_inc(nr_cyclic_checks); - ret = check_path(hlock_class(target), &src_entry, &target_entry); + ret = check_path(target, &src_entry, hlock_conflict, &target_entry); - if (unlikely(!ret)) { + if (unlikely(ret == BFS_RMATCH)) { if (!*trace) { /* * If save_trace fails here, the printing might @@ -1836,27 +2124,35 @@ check_noncircular(struct held_lock *src, struct held_lock *target, * <target> or not. If it can, <src> -> <target> dependency is already * in the graph. * - * Print an error and return 2 if it does or 1 if it does not. + * Return BFS_RMATCH if it does, or BFS_RMATCH if it does not, return BFS_E* if + * any error appears in the bfs search. */ -static noinline int +static noinline enum bfs_result check_redundant(struct held_lock *src, struct held_lock *target) { - int ret; + enum bfs_result ret; struct lock_list *target_entry; - struct lock_list src_entry = { - .class = hlock_class(src), - .parent = NULL, - }; + struct lock_list src_entry; + + bfs_init_root(&src_entry, src); + /* + * Special setup for check_redundant(). + * + * To report redundant, we need to find a strong dependency path that + * is equal to or stronger than <src> -> <target>. So if <src> is E, + * we need to let __bfs() only search for a path starting at a -(E*)->, + * we achieve this by setting the initial node's ->only_xr to true in + * that case. And if <prev> is S, we set initial ->only_xr to false + * because both -(S*)-> (equal) and -(E*)-> (stronger) are redundant. + */ + src_entry.only_xr = src->read == 0; debug_atomic_inc(nr_redundant_checks); - ret = check_path(hlock_class(target), &src_entry, &target_entry); + ret = check_path(target, &src_entry, hlock_equal, &target_entry); - if (!ret) { + if (ret == BFS_RMATCH) debug_atomic_inc(nr_redundant); - ret = 2; - } else if (ret < 0) - ret = 0; return ret; } @@ -1864,39 +2160,86 @@ check_redundant(struct held_lock *src, struct held_lock *target) #ifdef CONFIG_TRACE_IRQFLAGS -static inline int usage_accumulate(struct lock_list *entry, void *mask) -{ - *(unsigned long *)mask |= entry->class->usage_mask; - - return 0; -} - /* * Forwards and backwards subgraph searching, for the purposes of * proving that two subgraphs can be connected by a new dependency * without creating any illegal irq-safe -> irq-unsafe lock dependency. + * + * A irq safe->unsafe deadlock happens with the following conditions: + * + * 1) We have a strong dependency path A -> ... -> B + * + * 2) and we have ENABLED_IRQ usage of B and USED_IN_IRQ usage of A, therefore + * irq can create a new dependency B -> A (consider the case that a holder + * of B gets interrupted by an irq whose handler will try to acquire A). + * + * 3) the dependency circle A -> ... -> B -> A we get from 1) and 2) is a + * strong circle: + * + * For the usage bits of B: + * a) if A -> B is -(*N)->, then B -> A could be any type, so any + * ENABLED_IRQ usage suffices. + * b) if A -> B is -(*R)->, then B -> A must be -(E*)->, so only + * ENABLED_IRQ_*_READ usage suffices. + * + * For the usage bits of A: + * c) if A -> B is -(E*)->, then B -> A could be any type, so any + * USED_IN_IRQ usage suffices. + * d) if A -> B is -(S*)->, then B -> A must be -(*N)->, so only + * USED_IN_IRQ_*_READ usage suffices. */ -static inline int usage_match(struct lock_list *entry, void *mask) +/* + * There is a strong dependency path in the dependency graph: A -> B, and now + * we need to decide which usage bit of A should be accumulated to detect + * safe->unsafe bugs. + * + * Note that usage_accumulate() is used in backwards search, so ->only_xr + * stands for whether A -> B only has -(S*)-> (in this case ->only_xr is true). + * + * As above, if only_xr is false, which means A -> B has -(E*)-> dependency + * path, any usage of A should be considered. Otherwise, we should only + * consider _READ usage. + */ +static inline bool usage_accumulate(struct lock_list *entry, void *mask) { - return entry->class->usage_mask & *(unsigned long *)mask; + if (!entry->only_xr) + *(unsigned long *)mask |= entry->class->usage_mask; + else /* Mask out _READ usage bits */ + *(unsigned long *)mask |= (entry->class->usage_mask & LOCKF_IRQ); + + return false; +} + +/* + * There is a strong dependency path in the dependency graph: A -> B, and now + * we need to decide which usage bit of B conflicts with the usage bits of A, + * i.e. which usage bit of B may introduce safe->unsafe deadlocks. + * + * As above, if only_xr is false, which means A -> B has -(*N)-> dependency + * path, any usage of B should be considered. Otherwise, we should only + * consider _READ usage. + */ +static inline bool usage_match(struct lock_list *entry, void *mask) +{ + if (!entry->only_xr) + return !!(entry->class->usage_mask & *(unsigned long *)mask); + else /* Mask out _READ usage bits */ + return !!((entry->class->usage_mask & LOCKF_IRQ) & *(unsigned long *)mask); } /* * Find a node in the forwards-direction dependency sub-graph starting * at @root->class that matches @bit. * - * Return 0 if such a node exists in the subgraph, and put that node + * Return BFS_MATCH if such a node exists in the subgraph, and put that node * into *@target_entry. - * - * Return 1 otherwise and keep *@target_entry unchanged. - * Return <0 on error. */ -static int +static enum bfs_result find_usage_forwards(struct lock_list *root, unsigned long usage_mask, struct lock_list **target_entry) { - int result; + enum bfs_result result; debug_atomic_inc(nr_find_usage_forwards_checks); @@ -1908,18 +2251,12 @@ find_usage_forwards(struct lock_list *root, unsigned long usage_mask, /* * Find a node in the backwards-direction dependency sub-graph starting * at @root->class that matches @bit. - * - * Return 0 if such a node exists in the subgraph, and put that node - * into *@target_entry. - * - * Return 1 otherwise and keep *@target_entry unchanged. - * Return <0 on error. */ -static int +static enum bfs_result find_usage_backwards(struct lock_list *root, unsigned long usage_mask, struct lock_list **target_entry) { - int result; + enum bfs_result result; debug_atomic_inc(nr_find_usage_backwards_checks); @@ -1939,7 +2276,7 @@ static void print_lock_class_header(struct lock_class *class, int depth) #endif printk(KERN_CONT " {\n"); - for (bit = 0; bit < LOCK_USAGE_STATES; bit++) { + for (bit = 0; bit < LOCK_TRACE_STATES; bit++) { if (class->usage_mask & (1 << bit)) { int len = depth; @@ -2179,17 +2516,39 @@ static unsigned long invert_dir_mask(unsigned long mask) } /* - * As above, we clear bitnr0 (LOCK_*_READ off) with bitmask ops. First, for all - * bits with bitnr0 set (LOCK_*_READ), add those with bitnr0 cleared (LOCK_*). - * And then mask out all bitnr0. + * Note that a LOCK_ENABLED_IRQ_*_READ usage and a LOCK_USED_IN_IRQ_*_READ + * usage may cause deadlock too, for example: + * + * P1 P2 + * <irq disabled> + * write_lock(l1); <irq enabled> + * read_lock(l2); + * write_lock(l2); + * <in irq> + * read_lock(l1); + * + * , in above case, l1 will be marked as LOCK_USED_IN_IRQ_HARDIRQ_READ and l2 + * will marked as LOCK_ENABLE_IRQ_HARDIRQ_READ, and this is a possible + * deadlock. + * + * In fact, all of the following cases may cause deadlocks: + * + * LOCK_USED_IN_IRQ_* -> LOCK_ENABLED_IRQ_* + * LOCK_USED_IN_IRQ_*_READ -> LOCK_ENABLED_IRQ_* + * LOCK_USED_IN_IRQ_* -> LOCK_ENABLED_IRQ_*_READ + * LOCK_USED_IN_IRQ_*_READ -> LOCK_ENABLED_IRQ_*_READ + * + * As a result, to calculate the "exclusive mask", first we invert the + * direction (USED_IN/ENABLED) of the original mask, and 1) for all bits with + * bitnr0 set (LOCK_*_READ), add those with bitnr0 cleared (LOCK_*). 2) for all + * bits with bitnr0 cleared (LOCK_*_READ), add those with bitnr0 set (LOCK_*). */ static unsigned long exclusive_mask(unsigned long mask) { unsigned long excl = invert_dir_mask(mask); - /* Strip read */ excl |= (excl & LOCKF_IRQ_READ) >> LOCK_USAGE_READ_MASK; - excl &= ~LOCKF_IRQ_READ; + excl |= (excl & LOCKF_IRQ) << LOCK_USAGE_READ_MASK; return excl; } @@ -2206,6 +2565,7 @@ static unsigned long original_mask(unsigned long mask) unsigned long excl = invert_dir_mask(mask); /* Include read in existing usages */ + excl |= (excl & LOCKF_IRQ_READ) >> LOCK_USAGE_READ_MASK; excl |= (excl & LOCKF_IRQ) << LOCK_USAGE_READ_MASK; return excl; @@ -2220,14 +2580,24 @@ static int find_exclusive_match(unsigned long mask, enum lock_usage_bit *bitp, enum lock_usage_bit *excl_bitp) { - int bit, excl; + int bit, excl, excl_read; for_each_set_bit(bit, &mask, LOCK_USED) { + /* + * exclusive_bit() strips the read bit, however, + * LOCK_ENABLED_IRQ_*_READ may cause deadlocks too, so we need + * to search excl | LOCK_USAGE_READ_MASK as well. + */ excl = exclusive_bit(bit); + excl_read = excl | LOCK_USAGE_READ_MASK; if (excl_mask & lock_flag(excl)) { *bitp = bit; *excl_bitp = excl; return 0; + } else if (excl_mask & lock_flag(excl_read)) { + *bitp = bit; + *excl_bitp = excl_read; + return 0; } } return -1; @@ -2247,17 +2617,16 @@ static int check_irq_usage(struct task_struct *curr, struct held_lock *prev, struct lock_list *target_entry1; struct lock_list *target_entry; struct lock_list this, that; - int ret; + enum bfs_result ret; /* * Step 1: gather all hard/soft IRQs usages backward in an * accumulated usage mask. */ - this.parent = NULL; - this.class = hlock_class(prev); + bfs_init_rootb(&this, prev); ret = __bfs_backwards(&this, &usage_mask, usage_accumulate, NULL); - if (ret < 0) { + if (bfs_error(ret)) { print_bfs_bug(ret); return 0; } @@ -2272,16 +2641,15 @@ static int check_irq_usage(struct task_struct *curr, struct held_lock *prev, */ forward_mask = exclusive_mask(usage_mask); - that.parent = NULL; - that.class = hlock_class(next); + bfs_init_root(&that, next); ret = find_usage_forwards(&that, forward_mask, &target_entry1); - if (ret < 0) { + if (bfs_error(ret)) { print_bfs_bug(ret); return 0; } - if (ret == 1) - return ret; + if (ret == BFS_RNOMATCH) + return 1; /* * Step 3: we found a bad match! Now retrieve a lock from the backward @@ -2291,11 +2659,11 @@ static int check_irq_usage(struct task_struct *curr, struct held_lock *prev, backward_mask = original_mask(target_entry1->class->usage_mask); ret = find_usage_backwards(&this, backward_mask, &target_entry); - if (ret < 0) { + if (bfs_error(ret)) { print_bfs_bug(ret); return 0; } - if (DEBUG_LOCKS_WARN_ON(ret == 1)) + if (DEBUG_LOCKS_WARN_ON(ret == BFS_RNOMATCH)) return 1; /* @@ -2459,11 +2827,11 @@ check_deadlock(struct task_struct *curr, struct held_lock *next) */ static int check_prev_add(struct task_struct *curr, struct held_lock *prev, - struct held_lock *next, int distance, + struct held_lock *next, u16 distance, struct lock_trace **const trace) { struct lock_list *entry; - int ret; + enum bfs_result ret; if (!hlock_class(prev)->key || !hlock_class(next)->key) { /* @@ -2494,23 +2862,13 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, * in the graph whose neighbours are to be checked. */ ret = check_noncircular(next, prev, trace); - if (unlikely(ret <= 0)) + if (unlikely(bfs_error(ret) || ret == BFS_RMATCH)) return 0; if (!check_irq_usage(curr, prev, next)) return 0; /* - * For recursive read-locks we do all the dependency checks, - * but we dont store read-triggered dependencies (only - * write-triggered dependencies). This ensures that only the - * write-side dependencies matter, and that if for example a - * write-lock never takes any other locks, then the reads are - * equivalent to a NOP. - */ - if (next->read == 2 || prev->read == 2) - return 1; - /* * Is the <prev> -> <next> dependency already present? * * (this may occur even though this is a new chain: consider @@ -2522,7 +2880,35 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, if (entry->class == hlock_class(next)) { if (distance == 1) entry->distance = 1; - return 1; + entry->dep |= calc_dep(prev, next); + + /* + * Also, update the reverse dependency in @next's + * ->locks_before list. + * + * Here we reuse @entry as the cursor, which is fine + * because we won't go to the next iteration of the + * outer loop: + * + * For normal cases, we return in the inner loop. + * + * If we fail to return, we have inconsistency, i.e. + * <prev>::locks_after contains <next> while + * <next>::locks_before doesn't contain <prev>. In + * that case, we return after the inner and indicate + * something is wrong. + */ + list_for_each_entry(entry, &hlock_class(next)->locks_before, entry) { + if (entry->class == hlock_class(prev)) { + if (distance == 1) + entry->distance = 1; + entry->dep |= calc_depb(prev, next); + return 1; + } + } + + /* <prev> is not found in <next>::locks_before */ + return 0; } } @@ -2531,8 +2917,10 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, * Is the <prev> -> <next> link redundant? */ ret = check_redundant(prev, next); - if (ret != 1) - return ret; + if (bfs_error(ret)) + return 0; + else if (ret == BFS_RMATCH) + return 2; #endif if (!*trace) { @@ -2547,14 +2935,18 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, */ ret = add_lock_to_list(hlock_class(next), hlock_class(prev), &hlock_class(prev)->locks_after, - next->acquire_ip, distance, *trace); + next->acquire_ip, distance, + calc_dep(prev, next), + *trace); if (!ret) return 0; ret = add_lock_to_list(hlock_class(prev), hlock_class(next), &hlock_class(next)->locks_before, - next->acquire_ip, distance, *trace); + next->acquire_ip, distance, + calc_depb(prev, next), + *trace); if (!ret) return 0; @@ -2590,16 +2982,11 @@ check_prevs_add(struct task_struct *curr, struct held_lock *next) goto out_bug; for (;;) { - int distance = curr->lockdep_depth - depth + 1; + u16 distance = curr->lockdep_depth - depth + 1; hlock = curr->held_locks + depth - 1; - /* - * Only non-recursive-read entries get new dependencies - * added: - */ - if (hlock->read != 2 && hlock->check) { - int ret = check_prev_add(curr, hlock, next, distance, - &trace); + if (hlock->check) { + int ret = check_prev_add(curr, hlock, next, distance, &trace); if (!ret) return 0; @@ -2875,7 +3262,10 @@ static inline void free_chain_hlocks(int base, int size) struct lock_class *lock_chain_get_class(struct lock_chain *chain, int i) { - return lock_classes + chain_hlocks[chain->base + i]; + u16 chain_hlock = chain_hlocks[chain->base + i]; + unsigned int class_idx = chain_hlock_class_idx(chain_hlock); + + return lock_classes + class_idx - 1; } /* @@ -2901,12 +3291,12 @@ static inline int get_first_held_lock(struct task_struct *curr, /* * Returns the next chain_key iteration */ -static u64 print_chain_key_iteration(int class_idx, u64 chain_key) +static u64 print_chain_key_iteration(u16 hlock_id, u64 chain_key) { - u64 new_chain_key = iterate_chain_key(chain_key, class_idx); + u64 new_chain_key = iterate_chain_key(chain_key, hlock_id); - printk(" class_idx:%d -> chain_key:%016Lx", - class_idx, + printk(" hlock_id:%d -> chain_key:%016Lx", + (unsigned int)hlock_id, (unsigned long long)new_chain_key); return new_chain_key; } @@ -2923,12 +3313,12 @@ print_chain_keys_held_locks(struct task_struct *curr, struct held_lock *hlock_ne hlock_next->irq_context); for (; i < depth; i++) { hlock = curr->held_locks + i; - chain_key = print_chain_key_iteration(hlock->class_idx, chain_key); + chain_key = print_chain_key_iteration(hlock_id(hlock), chain_key); print_lock(hlock); } - print_chain_key_iteration(hlock_next->class_idx, chain_key); + print_chain_key_iteration(hlock_id(hlock_next), chain_key); print_lock(hlock_next); } @@ -2936,14 +3326,14 @@ static void print_chain_keys_chain(struct lock_chain *chain) { int i; u64 chain_key = INITIAL_CHAIN_KEY; - int class_id; + u16 hlock_id; printk("depth: %u\n", chain->depth); for (i = 0; i < chain->depth; i++) { - class_id = chain_hlocks[chain->base + i]; - chain_key = print_chain_key_iteration(class_id, chain_key); + hlock_id = chain_hlocks[chain->base + i]; + chain_key = print_chain_key_iteration(hlock_id, chain_key); - print_lock_name(lock_classes + class_id); + print_lock_name(lock_classes + chain_hlock_class_idx(hlock_id) - 1); printk("\n"); } } @@ -2992,7 +3382,7 @@ static int check_no_collision(struct task_struct *curr, } for (j = 0; j < chain->depth - 1; j++, i++) { - id = curr->held_locks[i].class_idx; + id = hlock_id(&curr->held_locks[i]); if (DEBUG_LOCKS_WARN_ON(chain_hlocks[chain->base + j] != id)) { print_collision(curr, hlock, chain); @@ -3041,7 +3431,6 @@ static inline int add_chain_cache(struct task_struct *curr, struct held_lock *hlock, u64 chain_key) { - struct lock_class *class = hlock_class(hlock); struct hlist_head *hash_head = chainhashentry(chain_key); struct lock_chain *chain; int i, j; @@ -3084,11 +3473,11 @@ static inline int add_chain_cache(struct task_struct *curr, chain->base = j; for (j = 0; j < chain->depth - 1; j++, i++) { - int lock_id = curr->held_locks[i].class_idx; + int lock_id = hlock_id(curr->held_locks + i); chain_hlocks[chain->base + j] = lock_id; } - chain_hlocks[chain->base + j] = class - lock_classes; + chain_hlocks[chain->base + j] = hlock_id(hlock); hlist_add_head_rcu(&chain->entry, hash_head); debug_atomic_inc(chain_lookup_misses); inc_chains(chain->irq_context); @@ -3275,7 +3664,7 @@ static void check_chain_key(struct task_struct *curr) if (prev_hlock && (prev_hlock->irq_context != hlock->irq_context)) chain_key = INITIAL_CHAIN_KEY; - chain_key = iterate_chain_key(chain_key, hlock->class_idx); + chain_key = iterate_chain_key(chain_key, hlock_id(hlock)); prev_hlock = hlock; } if (chain_key != curr->curr_chain_key) { @@ -3434,24 +3823,32 @@ print_irq_inversion_bug(struct task_struct *curr, */ static int check_usage_forwards(struct task_struct *curr, struct held_lock *this, - enum lock_usage_bit bit, const char *irqclass) + enum lock_usage_bit bit) { - int ret; + enum bfs_result ret; struct lock_list root; struct lock_list *target_entry; + enum lock_usage_bit read_bit = bit + LOCK_USAGE_READ_MASK; + unsigned usage_mask = lock_flag(bit) | lock_flag(read_bit); - root.parent = NULL; - root.class = hlock_class(this); - ret = find_usage_forwards(&root, lock_flag(bit), &target_entry); - if (ret < 0) { + bfs_init_root(&root, this); + ret = find_usage_forwards(&root, usage_mask, &target_entry); + if (bfs_error(ret)) { print_bfs_bug(ret); return 0; } - if (ret == 1) - return ret; + if (ret == BFS_RNOMATCH) + return 1; + + /* Check whether write or read usage is the match */ + if (target_entry->class->usage_mask & lock_flag(bit)) { + print_irq_inversion_bug(curr, &root, target_entry, + this, 1, state_name(bit)); + } else { + print_irq_inversion_bug(curr, &root, target_entry, + this, 1, state_name(read_bit)); + } - print_irq_inversion_bug(curr, &root, target_entry, - this, 1, irqclass); return 0; } @@ -3461,24 +3858,32 @@ check_usage_forwards(struct task_struct *curr, struct held_lock *this, */ static int check_usage_backwards(struct task_struct *curr, struct held_lock *this, - enum lock_usage_bit bit, const char *irqclass) + enum lock_usage_bit bit) { - int ret; + enum bfs_result ret; struct lock_list root; struct lock_list *target_entry; + enum lock_usage_bit read_bit = bit + LOCK_USAGE_READ_MASK; + unsigned usage_mask = lock_flag(bit) | lock_flag(read_bit); - root.parent = NULL; - root.class = hlock_class(this); - ret = find_usage_backwards(&root, lock_flag(bit), &target_entry); - if (ret < 0) { + bfs_init_rootb(&root, this); + ret = find_usage_backwards(&root, usage_mask, &target_entry); + if (bfs_error(ret)) { print_bfs_bug(ret); return 0; } - if (ret == 1) - return ret; + if (ret == BFS_RNOMATCH) + return 1; + + /* Check whether write or read usage is the match */ + if (target_entry->class->usage_mask & lock_flag(bit)) { + print_irq_inversion_bug(curr, &root, target_entry, + this, 0, state_name(bit)); + } else { + print_irq_inversion_bug(curr, &root, target_entry, + this, 0, state_name(read_bit)); + } - print_irq_inversion_bug(curr, &root, target_entry, - this, 0, irqclass); return 0; } @@ -3517,8 +3922,6 @@ static int SOFTIRQ_verbose(struct lock_class *class) return 0; } -#define STRICT_READ_CHECKS 1 - static int (*state_verbose_f[])(struct lock_class *class) = { #define LOCKDEP_STATE(__STATE) \ __STATE##_verbose, @@ -3544,16 +3947,6 @@ mark_lock_irq(struct task_struct *curr, struct held_lock *this, int dir = new_bit & LOCK_USAGE_DIR_MASK; /* - * mark USED_IN has to look forwards -- to ensure no dependency - * has ENABLED state, which would allow recursion deadlocks. - * - * mark ENABLED has to look backwards -- to ensure no dependee - * has USED_IN state, which, again, would allow recursion deadlocks. - */ - check_usage_f usage = dir ? - check_usage_backwards : check_usage_forwards; - - /* * Validate that this particular lock does not have conflicting * usage states. */ @@ -3561,23 +3954,30 @@ mark_lock_irq(struct task_struct *curr, struct held_lock *this, return 0; /* - * Validate that the lock dependencies don't have conflicting usage - * states. + * Check for read in write conflicts */ - if ((!read || STRICT_READ_CHECKS) && - !usage(curr, this, excl_bit, state_name(new_bit & ~LOCK_USAGE_READ_MASK))) + if (!read && !valid_state(curr, this, new_bit, + excl_bit + LOCK_USAGE_READ_MASK)) return 0; + /* - * Check for read in write conflicts + * Validate that the lock dependencies don't have conflicting usage + * states. */ - if (!read) { - if (!valid_state(curr, this, new_bit, excl_bit + LOCK_USAGE_READ_MASK)) + if (dir) { + /* + * mark ENABLED has to look backwards -- to ensure no dependee + * has USED_IN state, which, again, would allow recursion deadlocks. + */ + if (!check_usage_backwards(curr, this, excl_bit)) return 0; - - if (STRICT_READ_CHECKS && - !usage(curr, this, excl_bit + LOCK_USAGE_READ_MASK, - state_name(new_bit + LOCK_USAGE_READ_MASK))) + } else { + /* + * mark USED_IN has to look forwards -- to ensure no dependency + * has ENABLED state, which would allow recursion deadlocks. + */ + if (!check_usage_forwards(curr, this, excl_bit)) return 0; } @@ -3657,7 +4057,7 @@ void lockdep_hardirqs_on_prepare(unsigned long ip) if (unlikely(in_nmi())) return; - if (unlikely(current->lockdep_recursion & LOCKDEP_RECURSION_MASK)) + if (unlikely(__this_cpu_read(lockdep_recursion))) return; if (unlikely(lockdep_hardirqs_enabled())) { @@ -3693,7 +4093,7 @@ void lockdep_hardirqs_on_prepare(unsigned long ip) current->hardirq_chain_key = current->curr_chain_key; - current->lockdep_recursion++; + lockdep_recursion_inc(); __trace_hardirqs_on_caller(); lockdep_recursion_finish(); } @@ -3726,7 +4126,7 @@ void noinstr lockdep_hardirqs_on(unsigned long ip) goto skip_checks; } - if (unlikely(current->lockdep_recursion & LOCKDEP_RECURSION_MASK)) + if (unlikely(__this_cpu_read(lockdep_recursion))) return; if (lockdep_hardirqs_enabled()) { @@ -3779,7 +4179,7 @@ void noinstr lockdep_hardirqs_off(unsigned long ip) if (in_nmi()) { if (!IS_ENABLED(CONFIG_TRACE_IRQFLAGS_NMI)) return; - } else if (current->lockdep_recursion & LOCKDEP_RECURSION_MASK) + } else if (__this_cpu_read(lockdep_recursion)) return; /* @@ -3812,7 +4212,7 @@ void lockdep_softirqs_on(unsigned long ip) { struct irqtrace_events *trace = ¤t->irqtrace; - if (unlikely(!debug_locks || current->lockdep_recursion)) + if (unlikely(!lockdep_enabled())) return; /* @@ -3827,7 +4227,7 @@ void lockdep_softirqs_on(unsigned long ip) return; } - current->lockdep_recursion++; + lockdep_recursion_inc(); /* * We'll do an OFF -> ON transition: */ @@ -3850,7 +4250,7 @@ void lockdep_softirqs_on(unsigned long ip) */ void lockdep_softirqs_off(unsigned long ip) { - if (unlikely(!debug_locks || current->lockdep_recursion)) + if (unlikely(!lockdep_enabled())) return; /* @@ -3969,13 +4369,18 @@ static int separate_irq_context(struct task_struct *curr, static int mark_lock(struct task_struct *curr, struct held_lock *this, enum lock_usage_bit new_bit) { - unsigned int new_mask = 1 << new_bit, ret = 1; + unsigned int new_mask, ret = 1; if (new_bit >= LOCK_USAGE_STATES) { DEBUG_LOCKS_WARN_ON(1); return 0; } + if (new_bit == LOCK_USED && this->read) + new_bit = LOCK_USED_READ; + + new_mask = 1 << new_bit; + /* * If already set then do not dirty the cacheline, * nor do any checks: @@ -3988,26 +4393,32 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this, /* * Make sure we didn't race: */ - if (unlikely(hlock_class(this)->usage_mask & new_mask)) { - graph_unlock(); - return 1; - } + if (unlikely(hlock_class(this)->usage_mask & new_mask)) + goto unlock; hlock_class(this)->usage_mask |= new_mask; - if (!(hlock_class(this)->usage_traces[new_bit] = save_trace())) - return 0; + if (new_bit < LOCK_TRACE_STATES) { + if (!(hlock_class(this)->usage_traces[new_bit] = save_trace())) + return 0; + } switch (new_bit) { + case 0 ... LOCK_USED-1: + ret = mark_lock_irq(curr, this, new_bit); + if (!ret) + return 0; + break; + case LOCK_USED: debug_atomic_dec(nr_unused_locks); break; + default: - ret = mark_lock_irq(curr, this, new_bit); - if (!ret) - return 0; + break; } +unlock: graph_unlock(); /* @@ -4220,11 +4631,11 @@ void lockdep_init_map_waits(struct lockdep_map *lock, const char *name, if (subclass) { unsigned long flags; - if (DEBUG_LOCKS_WARN_ON(current->lockdep_recursion)) + if (DEBUG_LOCKS_WARN_ON(!lockdep_enabled())) return; raw_local_irq_save(flags); - current->lockdep_recursion++; + lockdep_recursion_inc(); register_lock_class(lock, subclass, 1); lockdep_recursion_finish(); raw_local_irq_restore(flags); @@ -4411,7 +4822,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, chain_key = INITIAL_CHAIN_KEY; chain_head = 1; } - chain_key = iterate_chain_key(chain_key, class_idx); + chain_key = iterate_chain_key(chain_key, hlock_id(hlock)); if (nest_lock && !__lock_is_held(nest_lock, -1)) { print_lock_nested_lock_not_held(curr, hlock, ip); @@ -4907,11 +5318,11 @@ void lock_set_class(struct lockdep_map *lock, const char *name, { unsigned long flags; - if (unlikely(current->lockdep_recursion)) + if (unlikely(!lockdep_enabled())) return; raw_local_irq_save(flags); - current->lockdep_recursion++; + lockdep_recursion_inc(); check_flags(flags); if (__lock_set_class(lock, name, key, subclass, ip)) check_chain_key(current); @@ -4924,11 +5335,11 @@ void lock_downgrade(struct lockdep_map *lock, unsigned long ip) { unsigned long flags; - if (unlikely(current->lockdep_recursion)) + if (unlikely(!lockdep_enabled())) return; raw_local_irq_save(flags); - current->lockdep_recursion++; + lockdep_recursion_inc(); check_flags(flags); if (__lock_downgrade(lock, ip)) check_chain_key(current); @@ -4942,12 +5353,20 @@ static void verify_lock_unused(struct lockdep_map *lock, struct held_lock *hlock { #ifdef CONFIG_PROVE_LOCKING struct lock_class *class = look_up_lock_class(lock, subclass); + unsigned long mask = LOCKF_USED; /* if it doesn't have a class (yet), it certainly hasn't been used yet */ if (!class) return; - if (!(class->usage_mask & LOCK_USED)) + /* + * READ locks only conflict with USED, such that if we only ever use + * READ locks, there is no deadlock possible -- RCU. + */ + if (!hlock->read) + mask |= LOCKF_USED_READ; + + if (!(class->usage_mask & mask)) return; hlock->class_idx = class - lock_classes; @@ -4958,7 +5377,7 @@ static void verify_lock_unused(struct lockdep_map *lock, struct held_lock *hlock static bool lockdep_nmi(void) { - if (current->lockdep_recursion & LOCKDEP_RECURSION_MASK) + if (raw_cpu_read(lockdep_recursion)) return false; if (!in_nmi()) @@ -4968,6 +5387,20 @@ static bool lockdep_nmi(void) } /* + * read_lock() is recursive if: + * 1. We force lockdep think this way in selftests or + * 2. The implementation is not queued read/write lock or + * 3. The locker is at an in_interrupt() context. + */ +bool read_lock_is_recursive(void) +{ + return force_read_lock_recursive || + !IS_ENABLED(CONFIG_QUEUED_RWLOCKS) || + in_interrupt(); +} +EXPORT_SYMBOL_GPL(read_lock_is_recursive); + +/* * We are not always called with irqs disabled - do that here, * and also avoid lockdep recursion: */ @@ -4979,7 +5412,10 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass, trace_lock_acquire(lock, subclass, trylock, read, check, nest_lock, ip); - if (unlikely(current->lockdep_recursion)) { + if (!debug_locks) + return; + + if (unlikely(!lockdep_enabled())) { /* XXX allow trylock from NMI ?!? */ if (lockdep_nmi() && !trylock) { struct held_lock hlock; @@ -5002,7 +5438,7 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass, raw_local_irq_save(flags); check_flags(flags); - current->lockdep_recursion++; + lockdep_recursion_inc(); __lock_acquire(lock, subclass, trylock, read, check, irqs_disabled_flags(flags), nest_lock, ip, 0, 0); lockdep_recursion_finish(); @@ -5016,13 +5452,13 @@ void lock_release(struct lockdep_map *lock, unsigned long ip) trace_lock_release(lock, ip); - if (unlikely(current->lockdep_recursion)) + if (unlikely(!lockdep_enabled())) return; raw_local_irq_save(flags); check_flags(flags); - current->lockdep_recursion++; + lockdep_recursion_inc(); if (__lock_release(lock, ip)) check_chain_key(current); lockdep_recursion_finish(); @@ -5035,13 +5471,13 @@ noinstr int lock_is_held_type(const struct lockdep_map *lock, int read) unsigned long flags; int ret = 0; - if (unlikely(current->lockdep_recursion)) + if (unlikely(!lockdep_enabled())) return 1; /* avoid false negative lockdep_assert_held() */ raw_local_irq_save(flags); check_flags(flags); - current->lockdep_recursion++; + lockdep_recursion_inc(); ret = __lock_is_held(lock, read); lockdep_recursion_finish(); raw_local_irq_restore(flags); @@ -5056,13 +5492,13 @@ struct pin_cookie lock_pin_lock(struct lockdep_map *lock) struct pin_cookie cookie = NIL_COOKIE; unsigned long flags; - if (unlikely(current->lockdep_recursion)) + if (unlikely(!lockdep_enabled())) return cookie; raw_local_irq_save(flags); check_flags(flags); - current->lockdep_recursion++; + lockdep_recursion_inc(); cookie = __lock_pin_lock(lock); lockdep_recursion_finish(); raw_local_irq_restore(flags); @@ -5075,13 +5511,13 @@ void lock_repin_lock(struct lockdep_map *lock, struct pin_cookie cookie) { unsigned long flags; - if (unlikely(current->lockdep_recursion)) + if (unlikely(!lockdep_enabled())) return; raw_local_irq_save(flags); check_flags(flags); - current->lockdep_recursion++; + lockdep_recursion_inc(); __lock_repin_lock(lock, cookie); lockdep_recursion_finish(); raw_local_irq_restore(flags); @@ -5092,13 +5528,13 @@ void lock_unpin_lock(struct lockdep_map *lock, struct pin_cookie cookie) { unsigned long flags; - if (unlikely(current->lockdep_recursion)) + if (unlikely(!lockdep_enabled())) return; raw_local_irq_save(flags); check_flags(flags); - current->lockdep_recursion++; + lockdep_recursion_inc(); __lock_unpin_lock(lock, cookie); lockdep_recursion_finish(); raw_local_irq_restore(flags); @@ -5228,15 +5664,12 @@ void lock_contended(struct lockdep_map *lock, unsigned long ip) trace_lock_acquired(lock, ip); - if (unlikely(!lock_stat || !debug_locks)) - return; - - if (unlikely(current->lockdep_recursion)) + if (unlikely(!lock_stat || !lockdep_enabled())) return; raw_local_irq_save(flags); check_flags(flags); - current->lockdep_recursion++; + lockdep_recursion_inc(); __lock_contended(lock, ip); lockdep_recursion_finish(); raw_local_irq_restore(flags); @@ -5249,15 +5682,12 @@ void lock_acquired(struct lockdep_map *lock, unsigned long ip) trace_lock_contended(lock, ip); - if (unlikely(!lock_stat || !debug_locks)) - return; - - if (unlikely(current->lockdep_recursion)) + if (unlikely(!lock_stat || !lockdep_enabled())) return; raw_local_irq_save(flags); check_flags(flags); - current->lockdep_recursion++; + lockdep_recursion_inc(); __lock_acquired(lock, ip); lockdep_recursion_finish(); raw_local_irq_restore(flags); @@ -5296,7 +5726,7 @@ static void remove_class_from_lock_chain(struct pending_free *pf, int i; for (i = chain->base; i < chain->base + chain->depth; i++) { - if (chain_hlocks[i] != class - lock_classes) + if (chain_hlock_class_idx(chain_hlocks[i]) != class - lock_classes) continue; /* * Each lock class occurs at most once in a lock chain so once diff --git a/kernel/locking/lockdep_internals.h b/kernel/locking/lockdep_internals.h index baca699b94e9..de49f9e1c11b 100644 --- a/kernel/locking/lockdep_internals.h +++ b/kernel/locking/lockdep_internals.h @@ -19,9 +19,13 @@ enum lock_usage_bit { #include "lockdep_states.h" #undef LOCKDEP_STATE LOCK_USED, - LOCK_USAGE_STATES + LOCK_USED_READ, + LOCK_USAGE_STATES, }; +/* states after LOCK_USED_READ are not traced and printed */ +static_assert(LOCK_TRACE_STATES == LOCK_USAGE_STATES); + #define LOCK_USAGE_READ_MASK 1 #define LOCK_USAGE_DIR_MASK 2 #define LOCK_USAGE_STATE_MASK (~(LOCK_USAGE_READ_MASK | LOCK_USAGE_DIR_MASK)) @@ -40,6 +44,7 @@ enum { #include "lockdep_states.h" #undef LOCKDEP_STATE __LOCKF(USED) + __LOCKF(USED_READ) }; #define LOCKDEP_STATE(__STATE) LOCKF_ENABLED_##__STATE | @@ -119,7 +124,7 @@ static const unsigned long LOCKF_USED_IN_IRQ_READ = extern struct list_head all_lock_classes; extern struct lock_chain lock_chains[]; -#define LOCK_USAGE_CHARS (1+LOCK_USAGE_STATES/2) +#define LOCK_USAGE_CHARS (2*XXX_LOCK_USAGE_STATES + 1) extern void get_usage_chars(struct lock_class *class, char usage[LOCK_USAGE_CHARS]); diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c index 8bbafe3e5203..70a32a576f3f 100644 --- a/kernel/locking/percpu-rwsem.c +++ b/kernel/locking/percpu-rwsem.c @@ -45,7 +45,7 @@ EXPORT_SYMBOL_GPL(percpu_free_rwsem); static bool __percpu_down_read_trylock(struct percpu_rw_semaphore *sem) { - __this_cpu_inc(*sem->read_count); + this_cpu_inc(*sem->read_count); /* * Due to having preemption disabled the decrement happens on @@ -71,7 +71,7 @@ static bool __percpu_down_read_trylock(struct percpu_rw_semaphore *sem) if (likely(!atomic_read_acquire(&sem->block))) return true; - __this_cpu_dec(*sem->read_count); + this_cpu_dec(*sem->read_count); /* Prod writer to re-evaluate readers_active_check() */ rcuwait_wake_up(&sem->writer); diff --git a/kernel/module.c b/kernel/module.c index 1c5cff34d9f2..c075a18103fb 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -3275,6 +3275,11 @@ static int find_module_sections(struct module *mod, struct load_info *info) sizeof(unsigned long), &mod->num_kprobe_blacklist); #endif +#ifdef CONFIG_HAVE_STATIC_CALL_INLINE + mod->static_call_sites = section_objs(info, ".static_call_sites", + sizeof(*mod->static_call_sites), + &mod->num_static_call_sites); +#endif mod->extable = section_objs(info, "__ex_table", sizeof(*mod->extable), &mod->num_exentries); @@ -3792,9 +3797,13 @@ static int prepare_coming_module(struct module *mod) if (err) return err; - blocking_notifier_call_chain(&module_notify_list, - MODULE_STATE_COMING, mod); - return 0; + err = blocking_notifier_call_chain_robust(&module_notify_list, + MODULE_STATE_COMING, MODULE_STATE_GOING, mod); + err = notifier_to_errno(err); + if (err) + klp_module_going(mod); + + return err; } static int unknown_module_param_cb(char *param, char *val, const char *modname, diff --git a/kernel/notifier.c b/kernel/notifier.c index 84c987dfbe03..1b019cbca594 100644 --- a/kernel/notifier.c +++ b/kernel/notifier.c @@ -94,6 +94,34 @@ static int notifier_call_chain(struct notifier_block **nl, } NOKPROBE_SYMBOL(notifier_call_chain); +/** + * notifier_call_chain_robust - Inform the registered notifiers about an event + * and rollback on error. + * @nl: Pointer to head of the blocking notifier chain + * @val_up: Value passed unmodified to the notifier function + * @val_down: Value passed unmodified to the notifier function when recovering + * from an error on @val_up + * @v Pointer passed unmodified to the notifier function + * + * NOTE: It is important the @nl chain doesn't change between the two + * invocations of notifier_call_chain() such that we visit the + * exact same notifier callbacks; this rules out any RCU usage. + * + * Returns: the return value of the @val_up call. + */ +static int notifier_call_chain_robust(struct notifier_block **nl, + unsigned long val_up, unsigned long val_down, + void *v) +{ + int ret, nr = 0; + + ret = notifier_call_chain(nl, val_up, v, -1, &nr); + if (ret & NOTIFY_STOP_MASK) + notifier_call_chain(nl, val_down, v, nr-1, NULL); + + return ret; +} + /* * Atomic notifier chain routines. Registration and unregistration * use a spinlock, and call_chain is synchronized by RCU (no locks). @@ -144,13 +172,30 @@ int atomic_notifier_chain_unregister(struct atomic_notifier_head *nh, } EXPORT_SYMBOL_GPL(atomic_notifier_chain_unregister); +int atomic_notifier_call_chain_robust(struct atomic_notifier_head *nh, + unsigned long val_up, unsigned long val_down, void *v) +{ + unsigned long flags; + int ret; + + /* + * Musn't use RCU; because then the notifier list can + * change between the up and down traversal. + */ + spin_lock_irqsave(&nh->lock, flags); + ret = notifier_call_chain_robust(&nh->head, val_up, val_down, v); + spin_unlock_irqrestore(&nh->lock, flags); + + return ret; +} +EXPORT_SYMBOL_GPL(atomic_notifier_call_chain_robust); +NOKPROBE_SYMBOL(atomic_notifier_call_chain_robust); + /** - * __atomic_notifier_call_chain - Call functions in an atomic notifier chain + * atomic_notifier_call_chain - Call functions in an atomic notifier chain * @nh: Pointer to head of the atomic notifier chain * @val: Value passed unmodified to notifier function * @v: Pointer passed unmodified to notifier function - * @nr_to_call: See the comment for notifier_call_chain. - * @nr_calls: See the comment for notifier_call_chain. * * Calls each function in a notifier chain in turn. The functions * run in an atomic context, so they must not block. @@ -163,24 +208,16 @@ EXPORT_SYMBOL_GPL(atomic_notifier_chain_unregister); * Otherwise the return value is the return value * of the last notifier function called. */ -int __atomic_notifier_call_chain(struct atomic_notifier_head *nh, - unsigned long val, void *v, - int nr_to_call, int *nr_calls) +int atomic_notifier_call_chain(struct atomic_notifier_head *nh, + unsigned long val, void *v) { int ret; rcu_read_lock(); - ret = notifier_call_chain(&nh->head, val, v, nr_to_call, nr_calls); + ret = notifier_call_chain(&nh->head, val, v, -1, NULL); rcu_read_unlock(); - return ret; -} -EXPORT_SYMBOL_GPL(__atomic_notifier_call_chain); -NOKPROBE_SYMBOL(__atomic_notifier_call_chain); -int atomic_notifier_call_chain(struct atomic_notifier_head *nh, - unsigned long val, void *v) -{ - return __atomic_notifier_call_chain(nh, val, v, -1, NULL); + return ret; } EXPORT_SYMBOL_GPL(atomic_notifier_call_chain); NOKPROBE_SYMBOL(atomic_notifier_call_chain); @@ -250,13 +287,30 @@ int blocking_notifier_chain_unregister(struct blocking_notifier_head *nh, } EXPORT_SYMBOL_GPL(blocking_notifier_chain_unregister); +int blocking_notifier_call_chain_robust(struct blocking_notifier_head *nh, + unsigned long val_up, unsigned long val_down, void *v) +{ + int ret = NOTIFY_DONE; + + /* + * We check the head outside the lock, but if this access is + * racy then it does not matter what the result of the test + * is, we re-check the list after having taken the lock anyway: + */ + if (rcu_access_pointer(nh->head)) { + down_read(&nh->rwsem); + ret = notifier_call_chain_robust(&nh->head, val_up, val_down, v); + up_read(&nh->rwsem); + } + return ret; +} +EXPORT_SYMBOL_GPL(blocking_notifier_call_chain_robust); + /** - * __blocking_notifier_call_chain - Call functions in a blocking notifier chain + * blocking_notifier_call_chain - Call functions in a blocking notifier chain * @nh: Pointer to head of the blocking notifier chain * @val: Value passed unmodified to notifier function * @v: Pointer passed unmodified to notifier function - * @nr_to_call: See comment for notifier_call_chain. - * @nr_calls: See comment for notifier_call_chain. * * Calls each function in a notifier chain in turn. The functions * run in a process context, so they are allowed to block. @@ -268,9 +322,8 @@ EXPORT_SYMBOL_GPL(blocking_notifier_chain_unregister); * Otherwise the return value is the return value * of the last notifier function called. */ -int __blocking_notifier_call_chain(struct blocking_notifier_head *nh, - unsigned long val, void *v, - int nr_to_call, int *nr_calls) +int blocking_notifier_call_chain(struct blocking_notifier_head *nh, + unsigned long val, void *v) { int ret = NOTIFY_DONE; @@ -281,19 +334,11 @@ int __blocking_notifier_call_chain(struct blocking_notifier_head *nh, */ if (rcu_access_pointer(nh->head)) { down_read(&nh->rwsem); - ret = notifier_call_chain(&nh->head, val, v, nr_to_call, - nr_calls); + ret = notifier_call_chain(&nh->head, val, v, -1, NULL); up_read(&nh->rwsem); } return ret; } -EXPORT_SYMBOL_GPL(__blocking_notifier_call_chain); - -int blocking_notifier_call_chain(struct blocking_notifier_head *nh, - unsigned long val, void *v) -{ - return __blocking_notifier_call_chain(nh, val, v, -1, NULL); -} EXPORT_SYMBOL_GPL(blocking_notifier_call_chain); /* @@ -335,13 +380,18 @@ int raw_notifier_chain_unregister(struct raw_notifier_head *nh, } EXPORT_SYMBOL_GPL(raw_notifier_chain_unregister); +int raw_notifier_call_chain_robust(struct raw_notifier_head *nh, + unsigned long val_up, unsigned long val_down, void *v) +{ + return notifier_call_chain_robust(&nh->head, val_up, val_down, v); +} +EXPORT_SYMBOL_GPL(raw_notifier_call_chain_robust); + /** - * __raw_notifier_call_chain - Call functions in a raw notifier chain + * raw_notifier_call_chain - Call functions in a raw notifier chain * @nh: Pointer to head of the raw notifier chain * @val: Value passed unmodified to notifier function * @v: Pointer passed unmodified to notifier function - * @nr_to_call: See comment for notifier_call_chain. - * @nr_calls: See comment for notifier_call_chain * * Calls each function in a notifier chain in turn. The functions * run in an undefined context. @@ -354,18 +404,10 @@ EXPORT_SYMBOL_GPL(raw_notifier_chain_unregister); * Otherwise the return value is the return value * of the last notifier function called. */ -int __raw_notifier_call_chain(struct raw_notifier_head *nh, - unsigned long val, void *v, - int nr_to_call, int *nr_calls) -{ - return notifier_call_chain(&nh->head, val, v, nr_to_call, nr_calls); -} -EXPORT_SYMBOL_GPL(__raw_notifier_call_chain); - int raw_notifier_call_chain(struct raw_notifier_head *nh, unsigned long val, void *v) { - return __raw_notifier_call_chain(nh, val, v, -1, NULL); + return notifier_call_chain(&nh->head, val, v, -1, NULL); } EXPORT_SYMBOL_GPL(raw_notifier_call_chain); @@ -437,12 +479,10 @@ int srcu_notifier_chain_unregister(struct srcu_notifier_head *nh, EXPORT_SYMBOL_GPL(srcu_notifier_chain_unregister); /** - * __srcu_notifier_call_chain - Call functions in an SRCU notifier chain + * srcu_notifier_call_chain - Call functions in an SRCU notifier chain * @nh: Pointer to head of the SRCU notifier chain * @val: Value passed unmodified to notifier function * @v: Pointer passed unmodified to notifier function - * @nr_to_call: See comment for notifier_call_chain. - * @nr_calls: See comment for notifier_call_chain * * Calls each function in a notifier chain in turn. The functions * run in a process context, so they are allowed to block. @@ -454,25 +494,17 @@ EXPORT_SYMBOL_GPL(srcu_notifier_chain_unregister); * Otherwise the return value is the return value * of the last notifier function called. */ -int __srcu_notifier_call_chain(struct srcu_notifier_head *nh, - unsigned long val, void *v, - int nr_to_call, int *nr_calls) +int srcu_notifier_call_chain(struct srcu_notifier_head *nh, + unsigned long val, void *v) { int ret; int idx; idx = srcu_read_lock(&nh->srcu); - ret = notifier_call_chain(&nh->head, val, v, nr_to_call, nr_calls); + ret = notifier_call_chain(&nh->head, val, v, -1, NULL); srcu_read_unlock(&nh->srcu, idx); return ret; } -EXPORT_SYMBOL_GPL(__srcu_notifier_call_chain); - -int srcu_notifier_call_chain(struct srcu_notifier_head *nh, - unsigned long val, void *v) -{ - return __srcu_notifier_call_chain(nh, val, v, -1, NULL); -} EXPORT_SYMBOL_GPL(srcu_notifier_call_chain); /** diff --git a/kernel/padata.c b/kernel/padata.c index 16cb894dc272..d4d3ba6e1728 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -215,12 +215,13 @@ int padata_do_parallel(struct padata_shell *ps, padata->pd = pd; padata->cb_cpu = *cb_cpu; - rcu_read_unlock_bh(); - spin_lock(&padata_works_lock); padata->seq_nr = ++pd->seq_nr; pw = padata_work_alloc(); spin_unlock(&padata_works_lock); + + rcu_read_unlock_bh(); + if (pw) { padata_work_init(pw, padata_parallel_worker, padata, 0); queue_work(pinst->parallel_wq, &pw->pw_work); diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index e7aa57fb2fdc..1dee70815f3c 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -706,8 +706,8 @@ static int load_image_and_restore(void) */ int hibernate(void) { - int error, nr_calls = 0; bool snapshot_test = false; + int error; if (!hibernation_available()) { pm_pr_dbg("Hibernation not available.\n"); @@ -723,11 +723,9 @@ int hibernate(void) pr_info("hibernation entry\n"); pm_prepare_console(); - error = __pm_notifier_call_chain(PM_HIBERNATION_PREPARE, -1, &nr_calls); - if (error) { - nr_calls--; - goto Exit; - } + error = pm_notifier_call_chain_robust(PM_HIBERNATION_PREPARE, PM_POST_HIBERNATION); + if (error) + goto Restore; ksys_sync_helper(); @@ -785,7 +783,8 @@ int hibernate(void) /* Don't bother checking whether freezer_test_done is true */ freezer_test_done = false; Exit: - __pm_notifier_call_chain(PM_POST_HIBERNATION, nr_calls, NULL); + pm_notifier_call_chain(PM_POST_HIBERNATION); + Restore: pm_restore_console(); hibernate_release(); Unlock: @@ -804,7 +803,7 @@ int hibernate(void) */ int hibernate_quiet_exec(int (*func)(void *data), void *data) { - int error, nr_calls = 0; + int error; lock_system_sleep(); @@ -815,11 +814,9 @@ int hibernate_quiet_exec(int (*func)(void *data), void *data) pm_prepare_console(); - error = __pm_notifier_call_chain(PM_HIBERNATION_PREPARE, -1, &nr_calls); - if (error) { - nr_calls--; - goto exit; - } + error = pm_notifier_call_chain_robust(PM_HIBERNATION_PREPARE, PM_POST_HIBERNATION); + if (error) + goto restore; error = freeze_processes(); if (error) @@ -880,8 +877,9 @@ thaw: thaw_processes(); exit: - __pm_notifier_call_chain(PM_POST_HIBERNATION, nr_calls, NULL); + pm_notifier_call_chain(PM_POST_HIBERNATION); +restore: pm_restore_console(); hibernate_release(); @@ -910,7 +908,7 @@ EXPORT_SYMBOL_GPL(hibernate_quiet_exec); */ static int software_resume(void) { - int error, nr_calls = 0; + int error; /* * If the user said "noresume".. bail out early. @@ -997,11 +995,9 @@ static int software_resume(void) pr_info("resume from hibernation\n"); pm_prepare_console(); - error = __pm_notifier_call_chain(PM_RESTORE_PREPARE, -1, &nr_calls); - if (error) { - nr_calls--; - goto Close_Finish; - } + error = pm_notifier_call_chain_robust(PM_RESTORE_PREPARE, PM_POST_RESTORE); + if (error) + goto Restore; pm_pr_dbg("Preparing processes for hibernation restore.\n"); error = freeze_processes(); @@ -1017,7 +1013,8 @@ static int software_resume(void) error = load_image_and_restore(); thaw_processes(); Finish: - __pm_notifier_call_chain(PM_POST_RESTORE, nr_calls, NULL); + pm_notifier_call_chain(PM_POST_RESTORE); + Restore: pm_restore_console(); pr_info("resume failed (%d)\n", error); hibernate_release(); diff --git a/kernel/power/main.c b/kernel/power/main.c index 40f86ec4ab30..0aefd6f57e0a 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -80,18 +80,18 @@ int unregister_pm_notifier(struct notifier_block *nb) } EXPORT_SYMBOL_GPL(unregister_pm_notifier); -int __pm_notifier_call_chain(unsigned long val, int nr_to_call, int *nr_calls) +int pm_notifier_call_chain_robust(unsigned long val_up, unsigned long val_down) { int ret; - ret = __blocking_notifier_call_chain(&pm_chain_head, val, NULL, - nr_to_call, nr_calls); + ret = blocking_notifier_call_chain_robust(&pm_chain_head, val_up, val_down, NULL); return notifier_to_errno(ret); } + int pm_notifier_call_chain(unsigned long val) { - return __pm_notifier_call_chain(val, -1, NULL); + return blocking_notifier_call_chain(&pm_chain_head, val, NULL); } /* If set, devices may be suspended and resumed asynchronously. */ diff --git a/kernel/power/power.h b/kernel/power/power.h index 32fc89ac96c3..24f12d534515 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -210,8 +210,7 @@ static inline void suspend_test_finish(const char *label) {} #ifdef CONFIG_PM_SLEEP /* kernel/power/main.c */ -extern int __pm_notifier_call_chain(unsigned long val, int nr_to_call, - int *nr_calls); +extern int pm_notifier_call_chain_robust(unsigned long val_up, unsigned long val_down); extern int pm_notifier_call_chain(unsigned long val); #endif diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 8b1bb5ee7e5d..32391acc806b 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -342,18 +342,16 @@ static int suspend_test(int level) */ static int suspend_prepare(suspend_state_t state) { - int error, nr_calls = 0; + int error; if (!sleep_state_supported(state)) return -EPERM; pm_prepare_console(); - error = __pm_notifier_call_chain(PM_SUSPEND_PREPARE, -1, &nr_calls); - if (error) { - nr_calls--; - goto Finish; - } + error = pm_notifier_call_chain_robust(PM_SUSPEND_PREPARE, PM_POST_SUSPEND); + if (error) + goto Restore; trace_suspend_resume(TPS("freeze_processes"), 0, true); error = suspend_freeze_processes(); @@ -363,8 +361,8 @@ static int suspend_prepare(suspend_state_t state) suspend_stats.failed_freeze++; dpm_save_failed_step(SUSPEND_FREEZE); - Finish: - __pm_notifier_call_chain(PM_POST_SUSPEND, nr_calls, NULL); + pm_notifier_call_chain(PM_POST_SUSPEND); + Restore: pm_restore_console(); return error; } diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 01e2858b5fe3..71385bedcc3a 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -335,26 +335,23 @@ static int swsusp_swap_check(void) { int res; - res = swap_type_of(swsusp_resume_device, swsusp_resume_block, - &hib_resume_bdev); + if (swsusp_resume_device) + res = swap_type_of(swsusp_resume_device, swsusp_resume_block); + else + res = find_first_swap(&swsusp_resume_device); if (res < 0) return res; - root_swap = res; - res = blkdev_get(hib_resume_bdev, FMODE_WRITE, NULL); - if (res) - return res; + + hib_resume_bdev = blkdev_get_by_dev(swsusp_resume_device, FMODE_WRITE, + NULL); + if (IS_ERR(hib_resume_bdev)) + return PTR_ERR(hib_resume_bdev); res = set_blocksize(hib_resume_bdev, PAGE_SIZE); if (res < 0) blkdev_put(hib_resume_bdev, FMODE_WRITE); - /* - * Update the resume device to the one actually used, - * so the test_resume mode can use it in case it is - * invoked from hibernate() to test the snapshot. - */ - swsusp_resume_device = hib_resume_bdev->bd_dev; return res; } diff --git a/kernel/power/user.c b/kernel/power/user.c index d5eedc2baa2a..740723bb3885 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -35,18 +35,18 @@ static struct snapshot_data { bool ready; bool platform_support; bool free_bitmaps; - struct inode *bd_inode; + dev_t dev; } snapshot_state; -int is_hibernate_resume_dev(const struct inode *bd_inode) +int is_hibernate_resume_dev(dev_t dev) { - return hibernation_available() && snapshot_state.bd_inode == bd_inode; + return hibernation_available() && snapshot_state.dev == dev; } static int snapshot_open(struct inode *inode, struct file *filp) { struct snapshot_data *data; - int error, nr_calls = 0; + int error; if (!hibernation_available()) return -EPERM; @@ -69,13 +69,10 @@ static int snapshot_open(struct inode *inode, struct file *filp) memset(&data->handle, 0, sizeof(struct snapshot_handle)); if ((filp->f_flags & O_ACCMODE) == O_RDONLY) { /* Hibernating. The image device should be accessible. */ - data->swap = swsusp_resume_device ? - swap_type_of(swsusp_resume_device, 0, NULL) : -1; + data->swap = swap_type_of(swsusp_resume_device, 0); data->mode = O_RDONLY; data->free_bitmaps = false; - error = __pm_notifier_call_chain(PM_HIBERNATION_PREPARE, -1, &nr_calls); - if (error) - __pm_notifier_call_chain(PM_POST_HIBERNATION, --nr_calls, NULL); + error = pm_notifier_call_chain_robust(PM_HIBERNATION_PREPARE, PM_POST_HIBERNATION); } else { /* * Resuming. We may need to wait for the image device to @@ -85,15 +82,11 @@ static int snapshot_open(struct inode *inode, struct file *filp) data->swap = -1; data->mode = O_WRONLY; - error = __pm_notifier_call_chain(PM_RESTORE_PREPARE, -1, &nr_calls); + error = pm_notifier_call_chain_robust(PM_RESTORE_PREPARE, PM_POST_RESTORE); if (!error) { error = create_basic_memory_bitmaps(); data->free_bitmaps = !error; - } else - nr_calls--; - - if (error) - __pm_notifier_call_chain(PM_POST_RESTORE, nr_calls, NULL); + } } if (error) hibernate_release(); @@ -101,7 +94,7 @@ static int snapshot_open(struct inode *inode, struct file *filp) data->frozen = false; data->ready = false; data->platform_support = false; - data->bd_inode = NULL; + data->dev = 0; Unlock: unlock_system_sleep(); @@ -117,7 +110,7 @@ static int snapshot_release(struct inode *inode, struct file *filp) swsusp_free(); data = filp->private_data; - data->bd_inode = NULL; + data->dev = 0; free_all_swap_pages(data->swap); if (data->frozen) { pm_restore_gfp_mask(); @@ -210,7 +203,6 @@ struct compat_resume_swap_area { static int snapshot_set_swap_area(struct snapshot_data *data, void __user *argp) { - struct block_device *bdev; sector_t offset; dev_t swdev; @@ -237,16 +229,10 @@ static int snapshot_set_swap_area(struct snapshot_data *data, * User space encodes device types as two-byte values, * so we need to recode them */ - if (!swdev) { - data->swap = -1; - return -EINVAL; - } - data->swap = swap_type_of(swdev, offset, &bdev); + data->swap = swap_type_of(swdev, offset); if (data->swap < 0) - return -ENODEV; - - data->bd_inode = bdev->bd_inode; - bdput(bdev); + return swdev ? -ENODEV : -EINVAL; + data->dev = swdev; return 0; } diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index cf66a3ccd757..e01cba5e4b52 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -167,7 +167,7 @@ static inline unsigned long rcu_seq_diff(unsigned long new, unsigned long old) # define STATE_RCU_HEAD_READY 0 # define STATE_RCU_HEAD_QUEUED 1 -extern struct debug_obj_descr rcuhead_debug_descr; +extern const struct debug_obj_descr rcuhead_debug_descr; static inline int debug_rcu_head_queue(struct rcu_head *head) { diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 835e2df8590a..05d3e1375e4c 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -590,7 +590,7 @@ void exit_tasks_rcu_finish(void) __releases(&tasks_rcu_exit_srcu) } #else /* #ifdef CONFIG_TASKS_RCU */ -static void show_rcu_tasks_classic_gp_kthread(void) { } +static inline void show_rcu_tasks_classic_gp_kthread(void) { } void exit_tasks_rcu_start(void) { } void exit_tasks_rcu_finish(void) { exit_tasks_rcu_finish_trace(current); } #endif /* #else #ifdef CONFIG_TASKS_RCU */ diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 8ce77d9ac716..f78ee759af9c 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -673,6 +673,7 @@ void rcu_idle_enter(void) lockdep_assert_irqs_disabled(); rcu_eqs_enter(false); } +EXPORT_SYMBOL_GPL(rcu_idle_enter); #ifdef CONFIG_NO_HZ_FULL /** @@ -886,6 +887,7 @@ void rcu_idle_exit(void) rcu_eqs_exit(false); local_irq_restore(flags); } +EXPORT_SYMBOL_GPL(rcu_idle_exit); #ifdef CONFIG_NO_HZ_FULL /** diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 2de49b5d8dd2..3e0f4bcb558f 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -469,7 +469,7 @@ void destroy_rcu_head_on_stack(struct rcu_head *head) } EXPORT_SYMBOL_GPL(destroy_rcu_head_on_stack); -struct debug_obj_descr rcuhead_debug_descr = { +const struct debug_obj_descr rcuhead_debug_descr = { .name = "rcu_head", .is_static_object = rcuhead_is_static_object, }; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 2d95dc3f4644..8160ab5263f8 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -36,6 +36,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_rt_tp); EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_dl_tp); EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_irq_tp); EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_se_tp); +EXPORT_TRACEPOINT_SYMBOL_GPL(sched_cpu_capacity_tp); EXPORT_TRACEPOINT_SYMBOL_GPL(sched_overutilized_tp); EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_cfs_tp); EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_se_tp); @@ -940,11 +941,6 @@ static inline unsigned int uclamp_bucket_id(unsigned int clamp_value) return clamp_value / UCLAMP_BUCKET_DELTA; } -static inline unsigned int uclamp_bucket_base_value(unsigned int clamp_value) -{ - return UCLAMP_BUCKET_DELTA * uclamp_bucket_id(clamp_value); -} - static inline unsigned int uclamp_none(enum uclamp_id clamp_id) { if (clamp_id == UCLAMP_MIN) @@ -4551,9 +4547,12 @@ void __noreturn do_task_dead(void) static inline void sched_submit_work(struct task_struct *tsk) { + unsigned int task_flags; + if (!tsk->state) return; + task_flags = tsk->flags; /* * If a worker went to sleep, notify and ask workqueue whether * it wants to wake up a task to maintain concurrency. @@ -4562,9 +4561,9 @@ static inline void sched_submit_work(struct task_struct *tsk) * in the possible wakeup of a kworker and because wq_worker_sleeping() * requires it. */ - if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER)) { + if (task_flags & (PF_WQ_WORKER | PF_IO_WORKER)) { preempt_disable(); - if (tsk->flags & PF_WQ_WORKER) + if (task_flags & PF_WQ_WORKER) wq_worker_sleeping(tsk); else io_wq_worker_sleeping(tsk); diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 3862a28cd05d..6d93f4518734 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1525,14 +1525,38 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) */ if (pi_task && dl_prio(pi_task->normal_prio) && p->dl.dl_boosted) { pi_se = &pi_task->dl; + /* + * Because of delays in the detection of the overrun of a + * thread's runtime, it might be the case that a thread + * goes to sleep in a rt mutex with negative runtime. As + * a consequence, the thread will be throttled. + * + * While waiting for the mutex, this thread can also be + * boosted via PI, resulting in a thread that is throttled + * and boosted at the same time. + * + * In this case, the boost overrides the throttle. + */ + if (p->dl.dl_throttled) { + /* + * The replenish timer needs to be canceled. No + * problem if it fires concurrently: boosted threads + * are ignored in dl_task_timer(). + */ + hrtimer_try_to_cancel(&p->dl.dl_timer); + p->dl.dl_throttled = 0; + } } else if (!dl_prio(p->normal_prio)) { /* - * Special case in which we have a !SCHED_DEADLINE task - * that is going to be deboosted, but exceeds its - * runtime while doing so. No point in replenishing - * it, as it's going to return back to its original - * scheduling class after this. + * Special case in which we have a !SCHED_DEADLINE task that is going + * to be deboosted, but exceeds its runtime while doing so. No point in + * replenishing it, as it's going to return back to its original + * scheduling class after this. If it has been throttled, we need to + * clear the flag, otherwise the task may wake up as throttled after + * being boosted again with no means to replenish the runtime and clear + * the throttle. */ + p->dl.dl_throttled = 0; BUG_ON(!p->dl.dl_boosted || flags != ENQUEUE_REPLENISH); return; } diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 36c54265bb2b..0655524700d2 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -245,6 +245,60 @@ set_table_entry(struct ctl_table *entry, entry->proc_handler = proc_handler; } +static int sd_ctl_doflags(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + unsigned long flags = *(unsigned long *)table->data; + size_t data_size = 0; + size_t len = 0; + char *tmp; + int idx; + + if (write) + return 0; + + for_each_set_bit(idx, &flags, __SD_FLAG_CNT) { + char *name = sd_flag_debug[idx].name; + + /* Name plus whitespace */ + data_size += strlen(name) + 1; + } + + if (*ppos > data_size) { + *lenp = 0; + return 0; + } + + tmp = kcalloc(data_size + 1, sizeof(*tmp), GFP_KERNEL); + if (!tmp) + return -ENOMEM; + + for_each_set_bit(idx, &flags, __SD_FLAG_CNT) { + char *name = sd_flag_debug[idx].name; + + len += snprintf(tmp + len, strlen(name) + 2, "%s ", name); + } + + tmp += *ppos; + len -= *ppos; + + if (len > *lenp) + len = *lenp; + if (len) + memcpy(buffer, tmp, len); + if (len < *lenp) { + ((char *)buffer)[len] = '\n'; + len++; + } + + *lenp = len; + *ppos += len; + + kfree(tmp); + + return 0; +} + static struct ctl_table * sd_alloc_ctl_domain_table(struct sched_domain *sd) { @@ -258,7 +312,7 @@ sd_alloc_ctl_domain_table(struct sched_domain *sd) set_table_entry(&table[2], "busy_factor", &sd->busy_factor, sizeof(int), 0644, proc_dointvec_minmax); set_table_entry(&table[3], "imbalance_pct", &sd->imbalance_pct, sizeof(int), 0644, proc_dointvec_minmax); set_table_entry(&table[4], "cache_nice_tries", &sd->cache_nice_tries, sizeof(int), 0644, proc_dointvec_minmax); - set_table_entry(&table[5], "flags", &sd->flags, sizeof(int), 0444, proc_dointvec_minmax); + set_table_entry(&table[5], "flags", &sd->flags, sizeof(int), 0444, sd_ctl_doflags); set_table_entry(&table[6], "max_newidle_lb_cost", &sd->max_newidle_lb_cost, sizeof(long), 0644, proc_doulongvec_minmax); set_table_entry(&table[7], "name", sd->name, CORENAME_MAX_SIZE, 0444, proc_dostring); /* &table[8] is terminator */ diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 1a68a0536add..aa4c6227cd6d 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -831,7 +831,7 @@ void init_entity_runnable_average(struct sched_entity *se) void post_init_entity_util_avg(struct task_struct *p) { } -static void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) +static void update_tg_load_avg(struct cfs_rq *cfs_rq) { } #endif /* CONFIG_SMP */ @@ -1504,6 +1504,7 @@ enum numa_type { /* Cached statistics for all CPUs within a node */ struct numa_stats { unsigned long load; + unsigned long runnable; unsigned long util; /* Total compute capacity of CPUs on a node */ unsigned long compute_capacity; @@ -1547,19 +1548,22 @@ struct task_numa_env { }; static unsigned long cpu_load(struct rq *rq); +static unsigned long cpu_runnable(struct rq *rq); static unsigned long cpu_util(int cpu); -static inline long adjust_numa_imbalance(int imbalance, int src_nr_running); +static inline long adjust_numa_imbalance(int imbalance, int nr_running); static inline enum numa_type numa_classify(unsigned int imbalance_pct, struct numa_stats *ns) { if ((ns->nr_running > ns->weight) && - ((ns->compute_capacity * 100) < (ns->util * imbalance_pct))) + (((ns->compute_capacity * 100) < (ns->util * imbalance_pct)) || + ((ns->compute_capacity * imbalance_pct) < (ns->runnable * 100)))) return node_overloaded; if ((ns->nr_running < ns->weight) || - ((ns->compute_capacity * 100) > (ns->util * imbalance_pct))) + (((ns->compute_capacity * 100) > (ns->util * imbalance_pct)) && + ((ns->compute_capacity * imbalance_pct) > (ns->runnable * 100)))) return node_has_spare; return node_fully_busy; @@ -1610,6 +1614,7 @@ static void update_numa_stats(struct task_numa_env *env, struct rq *rq = cpu_rq(cpu); ns->load += cpu_load(rq); + ns->runnable += cpu_runnable(rq); ns->util += cpu_util(cpu); ns->nr_running += rq->cfs.h_nr_running; ns->compute_capacity += capacity_of(cpu); @@ -1925,7 +1930,7 @@ static void task_numa_find_cpu(struct task_numa_env *env, src_running = env->src_stats.nr_running - 1; dst_running = env->dst_stats.nr_running + 1; imbalance = max(0, dst_running - src_running); - imbalance = adjust_numa_imbalance(imbalance, src_running); + imbalance = adjust_numa_imbalance(imbalance, dst_running); /* Use idle CPU if there is no imbalance */ if (!imbalance) { @@ -3084,7 +3089,7 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, /* commit outstanding execution time */ if (cfs_rq->curr == se) update_curr(cfs_rq); - account_entity_dequeue(cfs_rq, se); + update_load_sub(&cfs_rq->load, se->load.weight); } dequeue_load_avg(cfs_rq, se); @@ -3100,7 +3105,7 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, enqueue_load_avg(cfs_rq, se); if (se->on_rq) - account_entity_enqueue(cfs_rq, se); + update_load_add(&cfs_rq->load, se->load.weight); } @@ -3288,7 +3293,6 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq, int flags) /** * update_tg_load_avg - update the tg's load avg * @cfs_rq: the cfs_rq whose avg changed - * @force: update regardless of how small the difference * * This function 'ensures': tg->load_avg := \Sum tg->cfs_rq[]->avg.load. * However, because tg->load_avg is a global value there are performance @@ -3300,7 +3304,7 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq, int flags) * * Updating tg's load_avg is necessary before update_cfs_share(). */ -static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) +static inline void update_tg_load_avg(struct cfs_rq *cfs_rq) { long delta = cfs_rq->avg.load_avg - cfs_rq->tg_load_avg_contrib; @@ -3310,7 +3314,7 @@ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) if (cfs_rq->tg == &root_task_group) return; - if (force || abs(delta) > cfs_rq->tg_load_avg_contrib / 64) { + if (abs(delta) > cfs_rq->tg_load_avg_contrib / 64) { atomic_long_add(delta, &cfs_rq->tg->load_avg); cfs_rq->tg_load_avg_contrib = cfs_rq->avg.load_avg; } @@ -3612,7 +3616,7 @@ static inline bool skip_blocked_update(struct sched_entity *se) #else /* CONFIG_FAIR_GROUP_SCHED */ -static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {} +static inline void update_tg_load_avg(struct cfs_rq *cfs_rq) {} static inline int propagate_entity_load_avg(struct sched_entity *se) { @@ -3800,13 +3804,13 @@ static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s * IOW we're enqueueing a task on a new CPU. */ attach_entity_load_avg(cfs_rq, se); - update_tg_load_avg(cfs_rq, 0); + update_tg_load_avg(cfs_rq); } else if (decayed) { cfs_rq_util_change(cfs_rq, 0); if (flags & UPDATE_TG) - update_tg_load_avg(cfs_rq, 0); + update_tg_load_avg(cfs_rq); } } @@ -4461,17 +4465,17 @@ pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr) se = second; } - /* - * Prefer last buddy, try to return the CPU to a preempted task. - */ - if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1) - se = cfs_rq->last; - - /* - * Someone really wants this to run. If it's not unfair, run it. - */ - if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1) + if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1) { + /* + * Someone really wants this to run. If it's not unfair, run it. + */ se = cfs_rq->next; + } else if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1) { + /* + * Prefer last buddy, try to return the CPU to a preempted task. + */ + se = cfs_rq->last; + } clear_buddies(cfs_rq, se); @@ -6075,7 +6079,7 @@ static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int /* * Scan the local SMT mask for idle CPUs. */ -static int select_idle_smt(struct task_struct *p, int target) +static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target) { int cpu; @@ -6083,7 +6087,8 @@ static int select_idle_smt(struct task_struct *p, int target) return -1; for_each_cpu(cpu, cpu_smt_mask(target)) { - if (!cpumask_test_cpu(cpu, p->cpus_ptr)) + if (!cpumask_test_cpu(cpu, p->cpus_ptr) || + !cpumask_test_cpu(cpu, sched_domain_span(sd))) continue; if (available_idle_cpu(cpu) || sched_idle_cpu(cpu)) return cpu; @@ -6099,7 +6104,7 @@ static inline int select_idle_core(struct task_struct *p, struct sched_domain *s return -1; } -static inline int select_idle_smt(struct task_struct *p, int target) +static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target) { return -1; } @@ -6274,7 +6279,7 @@ symmetric: if ((unsigned)i < nr_cpumask_bits) return i; - i = select_idle_smt(p, target); + i = select_idle_smt(p, sd, target); if ((unsigned)i < nr_cpumask_bits) return i; @@ -6594,7 +6599,8 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) util = cpu_util_next(cpu, p, cpu); cpu_cap = capacity_of(cpu); - spare_cap = cpu_cap - util; + spare_cap = cpu_cap; + lsub_positive(&spare_cap, util); /* * Skip CPUs that cannot satisfy the capacity request. @@ -7402,6 +7408,10 @@ static int task_hot(struct task_struct *p, struct lb_env *env) if (unlikely(task_has_idle_policy(p))) return 0; + /* SMT siblings share cache */ + if (env->sd->flags & SD_SHARE_CPUCAPACITY) + return 0; + /* * Buddy candidates are cache hot: */ @@ -7669,8 +7679,8 @@ static int detach_tasks(struct lb_env *env) * scheduler fails to find a good waiting task to * migrate. */ - if (load/2 > env->imbalance && - env->sd->nr_balance_failed <= env->sd->cache_nice_tries) + + if ((load >> env->sd->nr_balance_failed) > env->imbalance) goto next; env->imbalance -= load; @@ -7887,7 +7897,7 @@ static bool __update_blocked_fair(struct rq *rq, bool *done) struct sched_entity *se; if (update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq)) { - update_tg_load_avg(cfs_rq, 0); + update_tg_load_avg(cfs_rq); if (cfs_rq == &rq->cfs) decayed = true; @@ -8098,6 +8108,8 @@ static void update_cpu_capacity(struct sched_domain *sd, int cpu) capacity = 1; cpu_rq(cpu)->cpu_capacity = capacity; + trace_sched_cpu_capacity_tp(cpu_rq(cpu)); + sdg->sgc->capacity = capacity; sdg->sgc->min_capacity = capacity; sdg->sgc->max_capacity = capacity; @@ -8957,7 +8969,7 @@ next_group: } } -static inline long adjust_numa_imbalance(int imbalance, int src_nr_running) +static inline long adjust_numa_imbalance(int imbalance, int nr_running) { unsigned int imbalance_min; @@ -8966,7 +8978,7 @@ static inline long adjust_numa_imbalance(int imbalance, int src_nr_running) * tasks that remain local when the source domain is almost idle. */ imbalance_min = 2; - if (src_nr_running <= imbalance_min) + if (nr_running <= imbalance_min) return 0; return imbalance; @@ -9780,6 +9792,15 @@ get_sd_balance_interval(struct sched_domain *sd, int cpu_busy) /* scale ms to jiffies */ interval = msecs_to_jiffies(interval); + + /* + * Reduce likelihood of busy balancing at higher domains racing with + * balancing at lower domains by preventing their balancing periods + * from being multiples of each other. + */ + if (cpu_busy) + interval -= 1; + interval = clamp(interval, 1UL, max_load_balance_interval); return interval; @@ -10786,7 +10807,7 @@ static void detach_entity_cfs_rq(struct sched_entity *se) /* Catch up with the cfs_rq and remove our load when we leave */ update_load_avg(cfs_rq, se, 0); detach_entity_load_avg(cfs_rq, se); - update_tg_load_avg(cfs_rq, false); + update_tg_load_avg(cfs_rq); propagate_entity_cfs_rq(se); } @@ -10805,7 +10826,7 @@ static void attach_entity_cfs_rq(struct sched_entity *se) /* Synchronize entity with its cfs_rq */ update_load_avg(cfs_rq, se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD); attach_entity_load_avg(cfs_rq, se); - update_tg_load_avg(cfs_rq, false); + update_tg_load_avg(cfs_rq); propagate_entity_cfs_rq(se); } @@ -11302,6 +11323,18 @@ int sched_trace_rq_cpu(struct rq *rq) } EXPORT_SYMBOL_GPL(sched_trace_rq_cpu); +int sched_trace_rq_cpu_capacity(struct rq *rq) +{ + return rq ? +#ifdef CONFIG_SMP + rq->cpu_capacity +#else + SCHED_CAPACITY_SCALE +#endif + : -1; +} +EXPORT_SYMBOL_GPL(sched_trace_rq_cpu_capacity); + const struct cpumask *sched_trace_rd_span(struct root_domain *rd) { #ifdef CONFIG_SMP diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 7481cd96f391..68d369cba9e4 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -77,7 +77,7 @@ SCHED_FEAT(WARN_DOUBLE_CLOCK, false) SCHED_FEAT(RT_PUSH_IPI, true) #endif -SCHED_FEAT(RT_RUNTIME_SHARE, true) +SCHED_FEAT(RT_RUNTIME_SHARE, false) SCHED_FEAT(LB_MIN, false) SCHED_FEAT(ATTACH_AGE_LOAD, true) diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c index 168479a7d61b..e23e74d52db5 100644 --- a/kernel/sched/membarrier.c +++ b/kernel/sched/membarrier.c @@ -18,6 +18,14 @@ #define MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK 0 #endif +#ifdef CONFIG_RSEQ +#define MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ_BITMASK \ + (MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ \ + | MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ_BITMASK) +#else +#define MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ_BITMASK 0 +#endif + #define MEMBARRIER_CMD_BITMASK \ (MEMBARRIER_CMD_GLOBAL | MEMBARRIER_CMD_GLOBAL_EXPEDITED \ | MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED \ @@ -30,6 +38,11 @@ static void ipi_mb(void *info) smp_mb(); /* IPIs should be serializing but paranoid. */ } +static void ipi_rseq(void *info) +{ + rseq_preempt(current); +} + static void ipi_sync_rq_state(void *info) { struct mm_struct *mm = (struct mm_struct *) info; @@ -129,19 +142,27 @@ static int membarrier_global_expedited(void) return 0; } -static int membarrier_private_expedited(int flags) +static int membarrier_private_expedited(int flags, int cpu_id) { - int cpu; cpumask_var_t tmpmask; struct mm_struct *mm = current->mm; + smp_call_func_t ipi_func = ipi_mb; - if (flags & MEMBARRIER_FLAG_SYNC_CORE) { + if (flags == MEMBARRIER_FLAG_SYNC_CORE) { if (!IS_ENABLED(CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE)) return -EINVAL; if (!(atomic_read(&mm->membarrier_state) & MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY)) return -EPERM; + } else if (flags == MEMBARRIER_FLAG_RSEQ) { + if (!IS_ENABLED(CONFIG_RSEQ)) + return -EINVAL; + if (!(atomic_read(&mm->membarrier_state) & + MEMBARRIER_STATE_PRIVATE_EXPEDITED_RSEQ_READY)) + return -EPERM; + ipi_func = ipi_rseq; } else { + WARN_ON_ONCE(flags); if (!(atomic_read(&mm->membarrier_state) & MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY)) return -EPERM; @@ -156,35 +177,59 @@ static int membarrier_private_expedited(int flags) */ smp_mb(); /* system call entry is not a mb. */ - if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL)) + if (cpu_id < 0 && !zalloc_cpumask_var(&tmpmask, GFP_KERNEL)) return -ENOMEM; cpus_read_lock(); - rcu_read_lock(); - for_each_online_cpu(cpu) { + + if (cpu_id >= 0) { struct task_struct *p; - /* - * Skipping the current CPU is OK even through we can be - * migrated at any point. The current CPU, at the point - * where we read raw_smp_processor_id(), is ensured to - * be in program order with respect to the caller - * thread. Therefore, we can skip this CPU from the - * iteration. - */ - if (cpu == raw_smp_processor_id()) - continue; - p = rcu_dereference(cpu_rq(cpu)->curr); - if (p && p->mm == mm) - __cpumask_set_cpu(cpu, tmpmask); + if (cpu_id >= nr_cpu_ids || !cpu_online(cpu_id)) + goto out; + if (cpu_id == raw_smp_processor_id()) + goto out; + rcu_read_lock(); + p = rcu_dereference(cpu_rq(cpu_id)->curr); + if (!p || p->mm != mm) { + rcu_read_unlock(); + goto out; + } + rcu_read_unlock(); + } else { + int cpu; + + rcu_read_lock(); + for_each_online_cpu(cpu) { + struct task_struct *p; + + /* + * Skipping the current CPU is OK even through we can be + * migrated at any point. The current CPU, at the point + * where we read raw_smp_processor_id(), is ensured to + * be in program order with respect to the caller + * thread. Therefore, we can skip this CPU from the + * iteration. + */ + if (cpu == raw_smp_processor_id()) + continue; + p = rcu_dereference(cpu_rq(cpu)->curr); + if (p && p->mm == mm) + __cpumask_set_cpu(cpu, tmpmask); + } + rcu_read_unlock(); } - rcu_read_unlock(); preempt_disable(); - smp_call_function_many(tmpmask, ipi_mb, NULL, 1); + if (cpu_id >= 0) + smp_call_function_single(cpu_id, ipi_func, NULL, 1); + else + smp_call_function_many(tmpmask, ipi_func, NULL, 1); preempt_enable(); - free_cpumask_var(tmpmask); +out: + if (cpu_id < 0) + free_cpumask_var(tmpmask); cpus_read_unlock(); /* @@ -283,11 +328,18 @@ static int membarrier_register_private_expedited(int flags) set_state = MEMBARRIER_STATE_PRIVATE_EXPEDITED, ret; - if (flags & MEMBARRIER_FLAG_SYNC_CORE) { + if (flags == MEMBARRIER_FLAG_SYNC_CORE) { if (!IS_ENABLED(CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE)) return -EINVAL; ready_state = MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY; + } else if (flags == MEMBARRIER_FLAG_RSEQ) { + if (!IS_ENABLED(CONFIG_RSEQ)) + return -EINVAL; + ready_state = + MEMBARRIER_STATE_PRIVATE_EXPEDITED_RSEQ_READY; + } else { + WARN_ON_ONCE(flags); } /* @@ -299,6 +351,8 @@ static int membarrier_register_private_expedited(int flags) return 0; if (flags & MEMBARRIER_FLAG_SYNC_CORE) set_state |= MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE; + if (flags & MEMBARRIER_FLAG_RSEQ) + set_state |= MEMBARRIER_STATE_PRIVATE_EXPEDITED_RSEQ; atomic_or(set_state, &mm->membarrier_state); ret = sync_runqueues_membarrier_state(mm); if (ret) @@ -310,8 +364,15 @@ static int membarrier_register_private_expedited(int flags) /** * sys_membarrier - issue memory barriers on a set of threads - * @cmd: Takes command values defined in enum membarrier_cmd. - * @flags: Currently needs to be 0. For future extensions. + * @cmd: Takes command values defined in enum membarrier_cmd. + * @flags: Currently needs to be 0 for all commands other than + * MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ: in the latter + * case it can be MEMBARRIER_CMD_FLAG_CPU, indicating that @cpu_id + * contains the CPU on which to interrupt (= restart) + * the RSEQ critical section. + * @cpu_id: if @flags == MEMBARRIER_CMD_FLAG_CPU, indicates the cpu on which + * RSEQ CS should be interrupted (@cmd must be + * MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ). * * If this system call is not implemented, -ENOSYS is returned. If the * command specified does not exist, not available on the running @@ -337,10 +398,21 @@ static int membarrier_register_private_expedited(int flags) * smp_mb() X O O * sys_membarrier() O O O */ -SYSCALL_DEFINE2(membarrier, int, cmd, int, flags) +SYSCALL_DEFINE3(membarrier, int, cmd, unsigned int, flags, int, cpu_id) { - if (unlikely(flags)) - return -EINVAL; + switch (cmd) { + case MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ: + if (unlikely(flags && flags != MEMBARRIER_CMD_FLAG_CPU)) + return -EINVAL; + break; + default: + if (unlikely(flags)) + return -EINVAL; + } + + if (!(flags & MEMBARRIER_CMD_FLAG_CPU)) + cpu_id = -1; + switch (cmd) { case MEMBARRIER_CMD_QUERY: { @@ -362,13 +434,17 @@ SYSCALL_DEFINE2(membarrier, int, cmd, int, flags) case MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED: return membarrier_register_global_expedited(); case MEMBARRIER_CMD_PRIVATE_EXPEDITED: - return membarrier_private_expedited(0); + return membarrier_private_expedited(0, cpu_id); case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED: return membarrier_register_private_expedited(0); case MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE: - return membarrier_private_expedited(MEMBARRIER_FLAG_SYNC_CORE); + return membarrier_private_expedited(MEMBARRIER_FLAG_SYNC_CORE, cpu_id); case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE: return membarrier_register_private_expedited(MEMBARRIER_FLAG_SYNC_CORE); + case MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ: + return membarrier_private_expedited(MEMBARRIER_FLAG_RSEQ, cpu_id); + case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ: + return membarrier_register_private_expedited(MEMBARRIER_FLAG_RSEQ); default: return -EINVAL; } diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 1bd7e3af904f..dd7770226086 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -25,10 +25,18 @@ static inline bool sched_debug(void) return sched_debug_enabled; } +#define SD_FLAG(_name, mflags) [__##_name] = { .meta_flags = mflags, .name = #_name }, +const struct sd_flag_debug sd_flag_debug[] = { +#include <linux/sched/sd_flags.h> +}; +#undef SD_FLAG + static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, struct cpumask *groupmask) { struct sched_group *group = sd->groups; + unsigned long flags = sd->flags; + unsigned int idx; cpumask_clear(groupmask); @@ -43,6 +51,21 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, printk(KERN_ERR "ERROR: domain->groups does not contain CPU%d\n", cpu); } + for_each_set_bit(idx, &flags, __SD_FLAG_CNT) { + unsigned int flag = BIT(idx); + unsigned int meta_flags = sd_flag_debug[idx].meta_flags; + + if ((meta_flags & SDF_SHARED_CHILD) && sd->child && + !(sd->child->flags & flag)) + printk(KERN_ERR "ERROR: flag %s set here but not in child\n", + sd_flag_debug[idx].name); + + if ((meta_flags & SDF_SHARED_PARENT) && sd->parent && + !(sd->parent->flags & flag)) + printk(KERN_ERR "ERROR: flag %s set here but not in parent\n", + sd_flag_debug[idx].name); + } + printk(KERN_DEBUG "%*s groups:", level + 1, ""); do { if (!group) { @@ -137,22 +160,22 @@ static inline bool sched_debug(void) } #endif /* CONFIG_SCHED_DEBUG */ +/* Generate a mask of SD flags with the SDF_NEEDS_GROUPS metaflag */ +#define SD_FLAG(name, mflags) (name * !!((mflags) & SDF_NEEDS_GROUPS)) | +static const unsigned int SD_DEGENERATE_GROUPS_MASK = +#include <linux/sched/sd_flags.h> +0; +#undef SD_FLAG + static int sd_degenerate(struct sched_domain *sd) { if (cpumask_weight(sched_domain_span(sd)) == 1) return 1; /* Following flags need at least 2 groups */ - if (sd->flags & (SD_BALANCE_NEWIDLE | - SD_BALANCE_FORK | - SD_BALANCE_EXEC | - SD_SHARE_CPUCAPACITY | - SD_ASYM_CPUCAPACITY | - SD_SHARE_PKG_RESOURCES | - SD_SHARE_POWERDOMAIN)) { - if (sd->groups != sd->groups->next) - return 0; - } + if ((sd->flags & SD_DEGENERATE_GROUPS_MASK) && + (sd->groups != sd->groups->next)) + return 0; /* Following flags don't use groups */ if (sd->flags & (SD_WAKE_AFFINE)) @@ -173,18 +196,9 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) return 0; /* Flags needing groups don't count if only 1 group in parent */ - if (parent->groups == parent->groups->next) { - pflags &= ~(SD_BALANCE_NEWIDLE | - SD_BALANCE_FORK | - SD_BALANCE_EXEC | - SD_ASYM_CPUCAPACITY | - SD_SHARE_CPUCAPACITY | - SD_SHARE_PKG_RESOURCES | - SD_PREFER_SIBLING | - SD_SHARE_POWERDOMAIN); - if (nr_node_ids == 1) - pflags &= ~SD_SERIALIZE; - } + if (parent->groups == parent->groups->next) + pflags &= ~SD_DEGENERATE_GROUPS_MASK; + if (~cflags & pflags) return 0; @@ -1292,7 +1306,6 @@ int __read_mostly node_reclaim_distance = RECLAIM_DISTANCE; * SD_SHARE_CPUCAPACITY - describes SMT topologies * SD_SHARE_PKG_RESOURCES - describes shared caches * SD_NUMA - describes NUMA topologies - * SD_SHARE_POWERDOMAIN - describes shared power domain * * Odd one out, which beside describing the topology has a quirk also * prescribes the desired behaviour that goes along with it: @@ -1303,8 +1316,7 @@ int __read_mostly node_reclaim_distance = RECLAIM_DISTANCE; (SD_SHARE_CPUCAPACITY | \ SD_SHARE_PKG_RESOURCES | \ SD_NUMA | \ - SD_ASYM_PACKING | \ - SD_SHARE_POWERDOMAIN) + SD_ASYM_PACKING) static struct sched_domain * sd_init(struct sched_domain_topology_level *tl, @@ -1336,8 +1348,8 @@ sd_init(struct sched_domain_topology_level *tl, *sd = (struct sched_domain){ .min_interval = sd_weight, .max_interval = 2*sd_weight, - .busy_factor = 32, - .imbalance_pct = 125, + .busy_factor = 16, + .imbalance_pct = 117, .cache_nice_tries = 0, @@ -1989,11 +2001,10 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att /* Set up domains for CPUs specified by the cpu_map: */ for_each_cpu(i, cpu_map) { struct sched_domain_topology_level *tl; + int dflags = 0; sd = NULL; for_each_sd_topology(tl) { - int dflags = 0; - if (tl == tl_asym) { dflags |= SD_ASYM_CPUCAPACITY; has_asym = true; diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 3ee59ce0a323..676d4af62103 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -1109,13 +1109,18 @@ out: } #ifdef CONFIG_SECCOMP_FILTER -static int seccomp_notify_release(struct inode *inode, struct file *file) +static void seccomp_notify_free(struct seccomp_filter *filter) +{ + kfree(filter->notif); + filter->notif = NULL; +} + +static void seccomp_notify_detach(struct seccomp_filter *filter) { - struct seccomp_filter *filter = file->private_data; struct seccomp_knotif *knotif; if (!filter) - return 0; + return; mutex_lock(&filter->notify_lock); @@ -1139,9 +1144,15 @@ static int seccomp_notify_release(struct inode *inode, struct file *file) complete(&knotif->ready); } - kfree(filter->notif); - filter->notif = NULL; + seccomp_notify_free(filter); mutex_unlock(&filter->notify_lock); +} + +static int seccomp_notify_release(struct inode *inode, struct file *file) +{ + struct seccomp_filter *filter = file->private_data; + + seccomp_notify_detach(filter); __put_seccomp_filter(filter); return 0; } @@ -1488,7 +1499,7 @@ static struct file *init_listener(struct seccomp_filter *filter) out_notif: if (IS_ERR(ret)) - kfree(filter->notif); + seccomp_notify_free(filter); out: return ret; } @@ -1581,6 +1592,7 @@ out_put_fd: listener_f->private_data = NULL; fput(listener_f); put_unused_fd(listener); + seccomp_notify_detach(prepared); } else { fd_install(listener, listener_f); ret = listener; diff --git a/kernel/softirq.c b/kernel/softirq.c index bf88d7f62433..09229ad82209 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -481,6 +481,7 @@ void raise_softirq(unsigned int nr) void __raise_softirq_irqoff(unsigned int nr) { + lockdep_assert_irqs_disabled(); trace_softirq_raise(nr); or_softirq_pending(1UL << nr); } diff --git a/kernel/stackleak.c b/kernel/stackleak.c index a8fc9ae1d03d..ce161a8e8d97 100644 --- a/kernel/stackleak.c +++ b/kernel/stackleak.c @@ -20,7 +20,7 @@ static DEFINE_STATIC_KEY_FALSE(stack_erasing_bypass); int stack_erasing_sysctl(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int ret = 0; int state = !static_branch_unlikely(&stack_erasing_bypass); diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c index 946f44a9e86a..9f8117c7cfdd 100644 --- a/kernel/stacktrace.c +++ b/kernel/stacktrace.c @@ -78,8 +78,7 @@ struct stacktrace_cookie { unsigned int len; }; -static bool stack_trace_consume_entry(void *cookie, unsigned long addr, - bool reliable) +static bool stack_trace_consume_entry(void *cookie, unsigned long addr) { struct stacktrace_cookie *c = cookie; @@ -94,12 +93,11 @@ static bool stack_trace_consume_entry(void *cookie, unsigned long addr, return c->len < c->size; } -static bool stack_trace_consume_entry_nosched(void *cookie, unsigned long addr, - bool reliable) +static bool stack_trace_consume_entry_nosched(void *cookie, unsigned long addr) { if (in_sched_functions(addr)) return true; - return stack_trace_consume_entry(cookie, addr, reliable); + return stack_trace_consume_entry(cookie, addr); } /** diff --git a/kernel/static_call.c b/kernel/static_call.c new file mode 100644 index 000000000000..84565c2a41b8 --- /dev/null +++ b/kernel/static_call.c @@ -0,0 +1,482 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/init.h> +#include <linux/static_call.h> +#include <linux/bug.h> +#include <linux/smp.h> +#include <linux/sort.h> +#include <linux/slab.h> +#include <linux/module.h> +#include <linux/cpu.h> +#include <linux/processor.h> +#include <asm/sections.h> + +extern struct static_call_site __start_static_call_sites[], + __stop_static_call_sites[]; + +static bool static_call_initialized; + +/* mutex to protect key modules/sites */ +static DEFINE_MUTEX(static_call_mutex); + +static void static_call_lock(void) +{ + mutex_lock(&static_call_mutex); +} + +static void static_call_unlock(void) +{ + mutex_unlock(&static_call_mutex); +} + +static inline void *static_call_addr(struct static_call_site *site) +{ + return (void *)((long)site->addr + (long)&site->addr); +} + + +static inline struct static_call_key *static_call_key(const struct static_call_site *site) +{ + return (struct static_call_key *) + (((long)site->key + (long)&site->key) & ~STATIC_CALL_SITE_FLAGS); +} + +/* These assume the key is word-aligned. */ +static inline bool static_call_is_init(struct static_call_site *site) +{ + return ((long)site->key + (long)&site->key) & STATIC_CALL_SITE_INIT; +} + +static inline bool static_call_is_tail(struct static_call_site *site) +{ + return ((long)site->key + (long)&site->key) & STATIC_CALL_SITE_TAIL; +} + +static inline void static_call_set_init(struct static_call_site *site) +{ + site->key = ((long)static_call_key(site) | STATIC_CALL_SITE_INIT) - + (long)&site->key; +} + +static int static_call_site_cmp(const void *_a, const void *_b) +{ + const struct static_call_site *a = _a; + const struct static_call_site *b = _b; + const struct static_call_key *key_a = static_call_key(a); + const struct static_call_key *key_b = static_call_key(b); + + if (key_a < key_b) + return -1; + + if (key_a > key_b) + return 1; + + return 0; +} + +static void static_call_site_swap(void *_a, void *_b, int size) +{ + long delta = (unsigned long)_a - (unsigned long)_b; + struct static_call_site *a = _a; + struct static_call_site *b = _b; + struct static_call_site tmp = *a; + + a->addr = b->addr - delta; + a->key = b->key - delta; + + b->addr = tmp.addr + delta; + b->key = tmp.key + delta; +} + +static inline void static_call_sort_entries(struct static_call_site *start, + struct static_call_site *stop) +{ + sort(start, stop - start, sizeof(struct static_call_site), + static_call_site_cmp, static_call_site_swap); +} + +static inline bool static_call_key_has_mods(struct static_call_key *key) +{ + return !(key->type & 1); +} + +static inline struct static_call_mod *static_call_key_next(struct static_call_key *key) +{ + if (!static_call_key_has_mods(key)) + return NULL; + + return key->mods; +} + +static inline struct static_call_site *static_call_key_sites(struct static_call_key *key) +{ + if (static_call_key_has_mods(key)) + return NULL; + + return (struct static_call_site *)(key->type & ~1); +} + +void __static_call_update(struct static_call_key *key, void *tramp, void *func) +{ + struct static_call_site *site, *stop; + struct static_call_mod *site_mod, first; + + cpus_read_lock(); + static_call_lock(); + + if (key->func == func) + goto done; + + key->func = func; + + arch_static_call_transform(NULL, tramp, func, false); + + /* + * If uninitialized, we'll not update the callsites, but they still + * point to the trampoline and we just patched that. + */ + if (WARN_ON_ONCE(!static_call_initialized)) + goto done; + + first = (struct static_call_mod){ + .next = static_call_key_next(key), + .mod = NULL, + .sites = static_call_key_sites(key), + }; + + for (site_mod = &first; site_mod; site_mod = site_mod->next) { + struct module *mod = site_mod->mod; + + if (!site_mod->sites) { + /* + * This can happen if the static call key is defined in + * a module which doesn't use it. + * + * It also happens in the has_mods case, where the + * 'first' entry has no sites associated with it. + */ + continue; + } + + stop = __stop_static_call_sites; + +#ifdef CONFIG_MODULES + if (mod) { + stop = mod->static_call_sites + + mod->num_static_call_sites; + } +#endif + + for (site = site_mod->sites; + site < stop && static_call_key(site) == key; site++) { + void *site_addr = static_call_addr(site); + + if (static_call_is_init(site)) { + /* + * Don't write to call sites which were in + * initmem and have since been freed. + */ + if (!mod && system_state >= SYSTEM_RUNNING) + continue; + if (mod && !within_module_init((unsigned long)site_addr, mod)) + continue; + } + + if (!kernel_text_address((unsigned long)site_addr)) { + WARN_ONCE(1, "can't patch static call site at %pS", + site_addr); + continue; + } + + arch_static_call_transform(site_addr, NULL, func, + static_call_is_tail(site)); + } + } + +done: + static_call_unlock(); + cpus_read_unlock(); +} +EXPORT_SYMBOL_GPL(__static_call_update); + +static int __static_call_init(struct module *mod, + struct static_call_site *start, + struct static_call_site *stop) +{ + struct static_call_site *site; + struct static_call_key *key, *prev_key = NULL; + struct static_call_mod *site_mod; + + if (start == stop) + return 0; + + static_call_sort_entries(start, stop); + + for (site = start; site < stop; site++) { + void *site_addr = static_call_addr(site); + + if ((mod && within_module_init((unsigned long)site_addr, mod)) || + (!mod && init_section_contains(site_addr, 1))) + static_call_set_init(site); + + key = static_call_key(site); + if (key != prev_key) { + prev_key = key; + + /* + * For vmlinux (!mod) avoid the allocation by storing + * the sites pointer in the key itself. Also see + * __static_call_update()'s @first. + * + * This allows architectures (eg. x86) to call + * static_call_init() before memory allocation works. + */ + if (!mod) { + key->sites = site; + key->type |= 1; + goto do_transform; + } + + site_mod = kzalloc(sizeof(*site_mod), GFP_KERNEL); + if (!site_mod) + return -ENOMEM; + + /* + * When the key has a direct sites pointer, extract + * that into an explicit struct static_call_mod, so we + * can have a list of modules. + */ + if (static_call_key_sites(key)) { + site_mod->mod = NULL; + site_mod->next = NULL; + site_mod->sites = static_call_key_sites(key); + + key->mods = site_mod; + + site_mod = kzalloc(sizeof(*site_mod), GFP_KERNEL); + if (!site_mod) + return -ENOMEM; + } + + site_mod->mod = mod; + site_mod->sites = site; + site_mod->next = static_call_key_next(key); + key->mods = site_mod; + } + +do_transform: + arch_static_call_transform(site_addr, NULL, key->func, + static_call_is_tail(site)); + } + + return 0; +} + +static int addr_conflict(struct static_call_site *site, void *start, void *end) +{ + unsigned long addr = (unsigned long)static_call_addr(site); + + if (addr <= (unsigned long)end && + addr + CALL_INSN_SIZE > (unsigned long)start) + return 1; + + return 0; +} + +static int __static_call_text_reserved(struct static_call_site *iter_start, + struct static_call_site *iter_stop, + void *start, void *end) +{ + struct static_call_site *iter = iter_start; + + while (iter < iter_stop) { + if (addr_conflict(iter, start, end)) + return 1; + iter++; + } + + return 0; +} + +#ifdef CONFIG_MODULES + +static int __static_call_mod_text_reserved(void *start, void *end) +{ + struct module *mod; + int ret; + + preempt_disable(); + mod = __module_text_address((unsigned long)start); + WARN_ON_ONCE(__module_text_address((unsigned long)end) != mod); + if (!try_module_get(mod)) + mod = NULL; + preempt_enable(); + + if (!mod) + return 0; + + ret = __static_call_text_reserved(mod->static_call_sites, + mod->static_call_sites + mod->num_static_call_sites, + start, end); + + module_put(mod); + + return ret; +} + +static int static_call_add_module(struct module *mod) +{ + return __static_call_init(mod, mod->static_call_sites, + mod->static_call_sites + mod->num_static_call_sites); +} + +static void static_call_del_module(struct module *mod) +{ + struct static_call_site *start = mod->static_call_sites; + struct static_call_site *stop = mod->static_call_sites + + mod->num_static_call_sites; + struct static_call_key *key, *prev_key = NULL; + struct static_call_mod *site_mod, **prev; + struct static_call_site *site; + + for (site = start; site < stop; site++) { + key = static_call_key(site); + if (key == prev_key) + continue; + + prev_key = key; + + for (prev = &key->mods, site_mod = key->mods; + site_mod && site_mod->mod != mod; + prev = &site_mod->next, site_mod = site_mod->next) + ; + + if (!site_mod) + continue; + + *prev = site_mod->next; + kfree(site_mod); + } +} + +static int static_call_module_notify(struct notifier_block *nb, + unsigned long val, void *data) +{ + struct module *mod = data; + int ret = 0; + + cpus_read_lock(); + static_call_lock(); + + switch (val) { + case MODULE_STATE_COMING: + ret = static_call_add_module(mod); + if (ret) { + WARN(1, "Failed to allocate memory for static calls"); + static_call_del_module(mod); + } + break; + case MODULE_STATE_GOING: + static_call_del_module(mod); + break; + } + + static_call_unlock(); + cpus_read_unlock(); + + return notifier_from_errno(ret); +} + +static struct notifier_block static_call_module_nb = { + .notifier_call = static_call_module_notify, +}; + +#else + +static inline int __static_call_mod_text_reserved(void *start, void *end) +{ + return 0; +} + +#endif /* CONFIG_MODULES */ + +int static_call_text_reserved(void *start, void *end) +{ + int ret = __static_call_text_reserved(__start_static_call_sites, + __stop_static_call_sites, start, end); + + if (ret) + return ret; + + return __static_call_mod_text_reserved(start, end); +} + +int __init static_call_init(void) +{ + int ret; + + if (static_call_initialized) + return 0; + + cpus_read_lock(); + static_call_lock(); + ret = __static_call_init(NULL, __start_static_call_sites, + __stop_static_call_sites); + static_call_unlock(); + cpus_read_unlock(); + + if (ret) { + pr_err("Failed to allocate memory for static_call!\n"); + BUG(); + } + + static_call_initialized = true; + +#ifdef CONFIG_MODULES + register_module_notifier(&static_call_module_nb); +#endif + return 0; +} +early_initcall(static_call_init); + +#ifdef CONFIG_STATIC_CALL_SELFTEST + +static int func_a(int x) +{ + return x+1; +} + +static int func_b(int x) +{ + return x+2; +} + +DEFINE_STATIC_CALL(sc_selftest, func_a); + +static struct static_call_data { + int (*func)(int); + int val; + int expect; +} static_call_data [] __initdata = { + { NULL, 2, 3 }, + { func_b, 2, 4 }, + { func_a, 2, 3 } +}; + +static int __init test_static_call_init(void) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(static_call_data); i++ ) { + struct static_call_data *scd = &static_call_data[i]; + + if (scd->func) + static_call_update(sc_selftest, scd->func); + + WARN_ON(static_call(sc_selftest)(scd->val) != scd->expect); + } + + return 0; +} +early_initcall(test_static_call_init); + +#endif /* CONFIG_STATIC_CALL_SELFTEST */ diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 4d59775ea79c..c925d1e1777e 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -369,7 +369,6 @@ COND_SYSCALL_COMPAT(fanotify_mark); /* x86 */ COND_SYSCALL(vm86old); COND_SYSCALL(modify_ldt); -COND_SYSCALL_COMPAT(quotactl32); COND_SYSCALL(vm86); COND_SYSCALL(kexec_file_load); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 287862f91717..afad085960b8 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -204,8 +204,7 @@ static int max_extfrag_threshold = 1000; #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_SYSCTL) static int bpf_stats_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct static_key *key = (struct static_key *)table->data; static int saved_val; @@ -2913,6 +2912,14 @@ static struct ctl_table vm_table[] = { .proc_handler = percpu_pagelist_fraction_sysctl_handler, .extra1 = SYSCTL_ZERO, }, + { + .procname = "page_lock_unfairness", + .data = &sysctl_page_lock_unfairness, + .maxlen = sizeof(sysctl_page_lock_unfairness), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + }, #ifdef CONFIG_MMU { .procname = "max_map_count", diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index ca223a89530a..f4ace1bf8382 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -908,7 +908,7 @@ static int __init alarmtimer_init(void) /* Initialize alarm bases */ alarm_bases[ALARM_REALTIME].base_clockid = CLOCK_REALTIME; alarm_bases[ALARM_REALTIME].get_ktime = &ktime_get_real; - alarm_bases[ALARM_REALTIME].get_timespec = ktime_get_real_ts64, + alarm_bases[ALARM_REALTIME].get_timespec = ktime_get_real_ts64; alarm_bases[ALARM_BOOTTIME].base_clockid = CLOCK_BOOTTIME; alarm_bases[ALARM_BOOTTIME].get_ktime = &ktime_get_boottime; alarm_bases[ALARM_BOOTTIME].get_timespec = get_boottime_timespec; diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 95b6a708b040..3624b9b5835d 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -342,7 +342,7 @@ EXPORT_SYMBOL_GPL(ktime_add_safe); #ifdef CONFIG_DEBUG_OBJECTS_TIMERS -static struct debug_obj_descr hrtimer_debug_descr; +static const struct debug_obj_descr hrtimer_debug_descr; static void *hrtimer_debug_hint(void *addr) { @@ -401,7 +401,7 @@ static bool hrtimer_fixup_free(void *addr, enum debug_obj_state state) } } -static struct debug_obj_descr hrtimer_debug_descr = { +static const struct debug_obj_descr hrtimer_debug_descr = { .name = "hrtimer", .debug_hint = hrtimer_debug_hint, .fixup_init = hrtimer_fixup_init, diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c index 1c03eec6ca9b..0642013dace4 100644 --- a/kernel/time/sched_clock.c +++ b/kernel/time/sched_clock.c @@ -35,7 +35,7 @@ * into a single 64-byte cache line. */ struct clock_data { - seqcount_t seq; + seqcount_latch_t seq; struct clock_read_data read_data[2]; ktime_t wrap_kt; unsigned long rate; @@ -76,7 +76,7 @@ struct clock_read_data *sched_clock_read_begin(unsigned int *seq) int sched_clock_read_retry(unsigned int seq) { - return read_seqcount_retry(&cd.seq, seq); + return read_seqcount_latch_retry(&cd.seq, seq); } unsigned long long notrace sched_clock(void) @@ -258,7 +258,7 @@ void __init generic_sched_clock_init(void) */ static u64 notrace suspended_sched_clock_read(void) { - unsigned int seq = raw_read_seqcount(&cd.seq); + unsigned int seq = raw_read_seqcount_latch(&cd.seq); return cd.read_data[seq & 1].epoch_cyc; } diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 4c47f388a83f..6858a31364b6 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -54,6 +54,9 @@ static struct { static struct timekeeper shadow_timekeeper; +/* flag for if timekeeping is suspended */ +int __read_mostly timekeeping_suspended; + /** * struct tk_fast - NMI safe timekeeper * @seq: Sequence counter for protecting updates. The lowest bit @@ -64,7 +67,7 @@ static struct timekeeper shadow_timekeeper; * See @update_fast_timekeeper() below. */ struct tk_fast { - seqcount_raw_spinlock_t seq; + seqcount_latch_t seq; struct tk_read_base base[2]; }; @@ -73,28 +76,42 @@ static u64 cycles_at_suspend; static u64 dummy_clock_read(struct clocksource *cs) { - return cycles_at_suspend; + if (timekeeping_suspended) + return cycles_at_suspend; + return local_clock(); } static struct clocksource dummy_clock = { .read = dummy_clock_read, }; +/* + * Boot time initialization which allows local_clock() to be utilized + * during early boot when clocksources are not available. local_clock() + * returns nanoseconds already so no conversion is required, hence mult=1 + * and shift=0. When the first proper clocksource is installed then + * the fast time keepers are updated with the correct values. + */ +#define FAST_TK_INIT \ + { \ + .clock = &dummy_clock, \ + .mask = CLOCKSOURCE_MASK(64), \ + .mult = 1, \ + .shift = 0, \ + } + static struct tk_fast tk_fast_mono ____cacheline_aligned = { - .seq = SEQCNT_RAW_SPINLOCK_ZERO(tk_fast_mono.seq, &timekeeper_lock), - .base[0] = { .clock = &dummy_clock, }, - .base[1] = { .clock = &dummy_clock, }, + .seq = SEQCNT_LATCH_ZERO(tk_fast_mono.seq), + .base[0] = FAST_TK_INIT, + .base[1] = FAST_TK_INIT, }; static struct tk_fast tk_fast_raw ____cacheline_aligned = { - .seq = SEQCNT_RAW_SPINLOCK_ZERO(tk_fast_raw.seq, &timekeeper_lock), - .base[0] = { .clock = &dummy_clock, }, - .base[1] = { .clock = &dummy_clock, }, + .seq = SEQCNT_LATCH_ZERO(tk_fast_raw.seq), + .base[0] = FAST_TK_INIT, + .base[1] = FAST_TK_INIT, }; -/* flag for if timekeeping is suspended */ -int __read_mostly timekeeping_suspended; - static inline void tk_normalize_xtime(struct timekeeper *tk) { while (tk->tkr_mono.xtime_nsec >= ((u64)NSEC_PER_SEC << tk->tkr_mono.shift)) { @@ -467,7 +484,7 @@ static __always_inline u64 __ktime_get_fast_ns(struct tk_fast *tkf) tk_clock_read(tkr), tkr->cycle_last, tkr->mask)); - } while (read_seqcount_retry(&tkf->seq, seq)); + } while (read_seqcount_latch_retry(&tkf->seq, seq)); return now; } @@ -513,29 +530,29 @@ u64 notrace ktime_get_boot_fast_ns(void) } EXPORT_SYMBOL_GPL(ktime_get_boot_fast_ns); - /* * See comment for __ktime_get_fast_ns() vs. timestamp ordering */ -static __always_inline u64 __ktime_get_real_fast_ns(struct tk_fast *tkf) +static __always_inline u64 __ktime_get_real_fast(struct tk_fast *tkf, u64 *mono) { struct tk_read_base *tkr; + u64 basem, baser, delta; unsigned int seq; - u64 now; do { seq = raw_read_seqcount_latch(&tkf->seq); tkr = tkf->base + (seq & 0x01); - now = ktime_to_ns(tkr->base_real); + basem = ktime_to_ns(tkr->base); + baser = ktime_to_ns(tkr->base_real); - now += timekeeping_delta_to_ns(tkr, - clocksource_delta( - tk_clock_read(tkr), - tkr->cycle_last, - tkr->mask)); - } while (read_seqcount_retry(&tkf->seq, seq)); + delta = timekeeping_delta_to_ns(tkr, + clocksource_delta(tk_clock_read(tkr), + tkr->cycle_last, tkr->mask)); + } while (read_seqcount_latch_retry(&tkf->seq, seq)); - return now; + if (mono) + *mono = basem + delta; + return baser + delta; } /** @@ -543,11 +560,65 @@ static __always_inline u64 __ktime_get_real_fast_ns(struct tk_fast *tkf) */ u64 ktime_get_real_fast_ns(void) { - return __ktime_get_real_fast_ns(&tk_fast_mono); + return __ktime_get_real_fast(&tk_fast_mono, NULL); } EXPORT_SYMBOL_GPL(ktime_get_real_fast_ns); /** + * ktime_get_fast_timestamps: - NMI safe timestamps + * @snapshot: Pointer to timestamp storage + * + * Stores clock monotonic, boottime and realtime timestamps. + * + * Boot time is a racy access on 32bit systems if the sleep time injection + * happens late during resume and not in timekeeping_resume(). That could + * be avoided by expanding struct tk_read_base with boot offset for 32bit + * and adding more overhead to the update. As this is a hard to observe + * once per resume event which can be filtered with reasonable effort using + * the accurate mono/real timestamps, it's probably not worth the trouble. + * + * Aside of that it might be possible on 32 and 64 bit to observe the + * following when the sleep time injection happens late: + * + * CPU 0 CPU 1 + * timekeeping_resume() + * ktime_get_fast_timestamps() + * mono, real = __ktime_get_real_fast() + * inject_sleep_time() + * update boot offset + * boot = mono + bootoffset; + * + * That means that boot time already has the sleep time adjustment, but + * real time does not. On the next readout both are in sync again. + * + * Preventing this for 64bit is not really feasible without destroying the + * careful cache layout of the timekeeper because the sequence count and + * struct tk_read_base would then need two cache lines instead of one. + * + * Access to the time keeper clock source is disabled accross the innermost + * steps of suspend/resume. The accessors still work, but the timestamps + * are frozen until time keeping is resumed which happens very early. + * + * For regular suspend/resume there is no observable difference vs. sched + * clock, but it might affect some of the nasty low level debug printks. + * + * OTOH, access to sched clock is not guaranteed accross suspend/resume on + * all systems either so it depends on the hardware in use. + * + * If that turns out to be a real problem then this could be mitigated by + * using sched clock in a similar way as during early boot. But it's not as + * trivial as on early boot because it needs some careful protection + * against the clock monotonic timestamp jumping backwards on resume. + */ +void ktime_get_fast_timestamps(struct ktime_timestamps *snapshot) +{ + struct timekeeper *tk = &tk_core.timekeeper; + + snapshot->real = __ktime_get_real_fast(&tk_fast_mono, &snapshot->mono); + snapshot->boot = snapshot->mono + ktime_to_ns(data_race(tk->offs_boot)); +} + +/** * halt_fast_timekeeper - Prevent fast timekeeper from accessing clocksource. * @tk: Timekeeper to snapshot. * diff --git a/kernel/time/timer.c b/kernel/time/timer.c index a50364df1054..dda05f4b7a1f 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -611,7 +611,7 @@ static void internal_add_timer(struct timer_base *base, struct timer_list *timer #ifdef CONFIG_DEBUG_OBJECTS_TIMERS -static struct debug_obj_descr timer_debug_descr; +static const struct debug_obj_descr timer_debug_descr; static void *timer_debug_hint(void *addr) { @@ -707,7 +707,7 @@ static bool timer_fixup_assert_init(void *addr, enum debug_obj_state state) } } -static struct debug_obj_descr timer_debug_descr = { +static const struct debug_obj_descr timer_debug_descr = { .name = "timer_list", .debug_hint = timer_debug_hint, .is_static_object = timer_is_static_object, @@ -794,6 +794,8 @@ static void do_init_timer(struct timer_list *timer, { timer->entry.pprev = NULL; timer->function = func; + if (WARN_ON_ONCE(flags & ~TIMER_INIT_FLAGS)) + flags &= TIMER_INIT_FLAGS; timer->flags = flags | raw_smp_processor_id(); lockdep_init_map(&timer->lockdep_map, name, key, 0); } diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 4b3a42fc3b24..f1022945e346 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -527,7 +527,7 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, * and scsi-generic block devices we create a temporary new debugfs * directory that will be removed once the trace ends. */ - if (bdev && bdev == bdev->bd_contains) + if (bdev && !bdev_is_partition(bdev)) dir = q->debugfs_dir; else bt->dir = dir = debugfs_create_dir(buts->name, blk_debugfs_root); @@ -793,7 +793,7 @@ static u64 blk_trace_bio_get_cgid(struct request_queue *q, struct bio *bio) return cgroup_id(bio_blkcg(bio)->css.cgroup); } #else -u64 blk_trace_bio_get_cgid(struct request_queue *q, struct bio *bio) +static u64 blk_trace_bio_get_cgid(struct request_queue *q, struct bio *bio) { return 0; } @@ -1827,13 +1827,11 @@ static ssize_t sysfs_blk_trace_attr_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct hd_struct *p = dev_to_part(dev); + struct block_device *bdev = bdget_part(dev_to_part(dev)); struct request_queue *q; - struct block_device *bdev; struct blk_trace *bt; ssize_t ret = -ENXIO; - bdev = bdget(part_devt(p)); if (bdev == NULL) goto out; @@ -1875,7 +1873,6 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev, { struct block_device *bdev; struct request_queue *q; - struct hd_struct *p; struct blk_trace *bt; u64 value; ssize_t ret = -EINVAL; @@ -1895,9 +1892,7 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev, goto out; ret = -ENXIO; - - p = dev_to_part(dev); - bdev = bdget(part_devt(p)); + bdev = bdget_part(dev_to_part(dev)); if (bdev == NULL) goto out; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index a8d4f253ed77..2ecf7892a31b 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -2027,10 +2027,11 @@ static int bpf_event_notify(struct notifier_block *nb, unsigned long op, { struct bpf_trace_module *btm, *tmp; struct module *mod = module; + int ret = 0; if (mod->num_bpf_raw_events == 0 || (op != MODULE_STATE_COMING && op != MODULE_STATE_GOING)) - return 0; + goto out; mutex_lock(&bpf_module_mutex); @@ -2040,6 +2041,8 @@ static int bpf_event_notify(struct notifier_block *nb, unsigned long op, if (btm) { btm->module = module; list_add(&btm->list, &bpf_trace_modules); + } else { + ret = -ENOMEM; } break; case MODULE_STATE_GOING: @@ -2055,7 +2058,8 @@ static int bpf_event_notify(struct notifier_block *nb, unsigned long op, mutex_unlock(&bpf_module_mutex); - return 0; +out: + return notifier_from_errno(ret); } static struct notifier_block bpf_module_nb = { diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 275441254bb5..541453927c82 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2782,6 +2782,7 @@ static void ftrace_remove_trampoline_from_kallsyms(struct ftrace_ops *ops) { lockdep_assert_held(&ftrace_lock); list_del_rcu(&ops->list); + synchronize_rcu(); } /* @@ -2862,6 +2863,8 @@ int ftrace_startup(struct ftrace_ops *ops, int command) __unregister_ftrace_function(ops); ftrace_start_up--; ops->flags &= ~FTRACE_OPS_FL_ENABLED; + if (ops->flags & FTRACE_OPS_FL_DYNAMIC) + ftrace_trampoline_free(ops); return ret; } @@ -6990,16 +6993,14 @@ static void ftrace_ops_assist_func(unsigned long ip, unsigned long parent_ip, { int bit; - if ((op->flags & FTRACE_OPS_FL_RCU) && !rcu_is_watching()) - return; - bit = trace_test_and_set_recursion(TRACE_LIST_START, TRACE_LIST_MAX); if (bit < 0) return; preempt_disable_notrace(); - op->func(ip, parent_ip, op, regs); + if (!(op->flags & FTRACE_OPS_FL_RCU) || rcu_is_watching()) + op->func(ip, parent_ip, op, regs); preempt_enable_notrace(); trace_clear_recursion(bit); @@ -7531,8 +7532,7 @@ static bool is_permanent_ops_registered(void) int ftrace_enable_sysctl(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int ret = -ENODEV; diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index f40d850ebabc..25b72a73608a 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3546,13 +3546,15 @@ struct trace_entry *trace_find_next_entry(struct trace_iterator *iter, if (iter->ent && iter->ent != iter->temp) { if ((!iter->temp || iter->temp_size < iter->ent_size) && !WARN_ON_ONCE(iter->temp == static_temp_buf)) { - kfree(iter->temp); - iter->temp = kmalloc(iter->ent_size, GFP_KERNEL); - if (!iter->temp) + void *temp; + temp = kmalloc(iter->ent_size, GFP_KERNEL); + if (!temp) return NULL; + kfree(iter->temp); + iter->temp = temp; + iter->temp_size = iter->ent_size; } memcpy(iter->temp, iter->ent, iter->ent_size); - iter->temp_size = iter->ent_size; iter->ent = iter->temp; } entry = __find_next_entry(iter, ent_cpu, NULL, ent_ts); @@ -3782,14 +3784,14 @@ unsigned long trace_total_entries(struct trace_array *tr) static void print_lat_help_header(struct seq_file *m) { - seq_puts(m, "# _------=> CPU# \n" - "# / _-----=> irqs-off \n" - "# | / _----=> need-resched \n" - "# || / _---=> hardirq/softirq \n" - "# ||| / _--=> preempt-depth \n" - "# |||| / delay \n" - "# cmd pid ||||| time | caller \n" - "# \\ / ||||| \\ | / \n"); + seq_puts(m, "# _------=> CPU# \n" + "# / _-----=> irqs-off \n" + "# | / _----=> need-resched \n" + "# || / _---=> hardirq/softirq \n" + "# ||| / _--=> preempt-depth \n" + "# |||| / delay \n" + "# cmd pid ||||| time | caller \n" + "# \\ / ||||| \\ | / \n"); } static void print_event_info(struct array_buffer *buf, struct seq_file *m) @@ -3810,26 +3812,26 @@ static void print_func_help_header(struct array_buffer *buf, struct seq_file *m, print_event_info(buf, m); - seq_printf(m, "# TASK-PID %s CPU# TIMESTAMP FUNCTION\n", tgid ? "TGID " : ""); - seq_printf(m, "# | | %s | | |\n", tgid ? " | " : ""); + seq_printf(m, "# TASK-PID %s CPU# TIMESTAMP FUNCTION\n", tgid ? " TGID " : ""); + seq_printf(m, "# | | %s | | |\n", tgid ? " | " : ""); } static void print_func_help_header_irq(struct array_buffer *buf, struct seq_file *m, unsigned int flags) { bool tgid = flags & TRACE_ITER_RECORD_TGID; - const char *space = " "; - int prec = tgid ? 10 : 2; + const char *space = " "; + int prec = tgid ? 12 : 2; print_event_info(buf, m); - seq_printf(m, "# %.*s _-----=> irqs-off\n", prec, space); - seq_printf(m, "# %.*s / _----=> need-resched\n", prec, space); - seq_printf(m, "# %.*s| / _---=> hardirq/softirq\n", prec, space); - seq_printf(m, "# %.*s|| / _--=> preempt-depth\n", prec, space); - seq_printf(m, "# %.*s||| / delay\n", prec, space); - seq_printf(m, "# TASK-PID %.*sCPU# |||| TIMESTAMP FUNCTION\n", prec, " TGID "); - seq_printf(m, "# | | %.*s | |||| | |\n", prec, " | "); + seq_printf(m, "# %.*s _-----=> irqs-off\n", prec, space); + seq_printf(m, "# %.*s / _----=> need-resched\n", prec, space); + seq_printf(m, "# %.*s| / _---=> hardirq/softirq\n", prec, space); + seq_printf(m, "# %.*s|| / _--=> preempt-depth\n", prec, space); + seq_printf(m, "# %.*s||| / delay\n", prec, space); + seq_printf(m, "# TASK-PID %.*s CPU# |||| TIMESTAMP FUNCTION\n", prec, " TGID "); + seq_printf(m, "# | | %.*s | |||| | |\n", prec, " | "); } void @@ -9072,7 +9074,7 @@ static int trace_module_notify(struct notifier_block *self, break; } - return 0; + return NOTIFY_OK; } static struct notifier_block trace_module_nb = { diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index a85effb2373b..beebf2cd364b 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -2646,7 +2646,7 @@ static int trace_module_notify(struct notifier_block *self, mutex_unlock(&trace_types_lock); mutex_unlock(&event_mutex); - return 0; + return NOTIFY_OK; } static struct notifier_block trace_module_nb = { diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 0b933546142e..1b2ef6490229 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -3865,7 +3865,6 @@ static int parse_var_defs(struct hist_trigger_data *hist_data) s = kstrdup(field_str, GFP_KERNEL); if (!s) { - kfree(hist_data->attrs->var_defs.name[n_vars]); ret = -ENOMEM; goto free; } diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index aefb6065b508..19c00ee90945 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -106,9 +106,10 @@ static nokprobe_inline bool trace_kprobe_has_gone(struct trace_kprobe *tk) static nokprobe_inline bool trace_kprobe_within_module(struct trace_kprobe *tk, struct module *mod) { - int len = strlen(mod->name); + int len = strlen(module_name(mod)); const char *name = trace_kprobe_symbol(tk); - return strncmp(mod->name, name, len) == 0 && name[len] == ':'; + + return strncmp(module_name(mod), name, len) == 0 && name[len] == ':'; } static nokprobe_inline bool trace_kprobe_module_exist(struct trace_kprobe *tk) @@ -688,7 +689,7 @@ static int trace_kprobe_module_callback(struct notifier_block *nb, if (ret) pr_warn("Failed to re-register probe %s on %s: %d\n", trace_probe_name(&tk->tp), - mod->name, ret); + module_name(mod), ret); } } mutex_unlock(&event_mutex); diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 4d1893564912..000e9dc224c6 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -497,7 +497,7 @@ lat_print_generic(struct trace_seq *s, struct trace_entry *entry, int cpu) trace_find_cmdline(entry->pid, comm); - trace_seq_printf(s, "%8.8s-%-5d %3d", + trace_seq_printf(s, "%8.8s-%-7d %3d", comm, entry->pid, cpu); return trace_print_lat_fmt(s, entry); @@ -588,15 +588,15 @@ int trace_print_context(struct trace_iterator *iter) trace_find_cmdline(entry->pid, comm); - trace_seq_printf(s, "%16s-%-5d ", comm, entry->pid); + trace_seq_printf(s, "%16s-%-7d ", comm, entry->pid); if (tr->trace_flags & TRACE_ITER_RECORD_TGID) { unsigned int tgid = trace_find_tgid(entry->pid); if (!tgid) - trace_seq_printf(s, "(-----) "); + trace_seq_printf(s, "(-------) "); else - trace_seq_printf(s, "(%5d) ", tgid); + trace_seq_printf(s, "(%7d) ", tgid); } trace_seq_printf(s, "[%03d] ", iter->cpu); @@ -636,7 +636,7 @@ int trace_print_lat_context(struct trace_iterator *iter) trace_find_cmdline(entry->pid, comm); trace_seq_printf( - s, "%16s %5d %3d %d %08x %08lx ", + s, "%16s %7d %3d %d %08x %08lx ", comm, entry->pid, iter->cpu, entry->flags, entry->preempt_count, iter->idx); } else { @@ -917,7 +917,7 @@ static enum print_line_t trace_ctxwake_print(struct trace_iterator *iter, S = task_index_to_char(field->prev_state); trace_find_cmdline(field->next_pid, comm); trace_seq_printf(&iter->seq, - " %5d:%3d:%c %s [%03d] %5d:%3d:%c %s\n", + " %7d:%3d:%c %s [%03d] %7d:%3d:%c %s\n", field->prev_pid, field->prev_prio, S, delim, diff --git a/kernel/trace/trace_preemptirq.c b/kernel/trace/trace_preemptirq.c index f10073e62603..f4938040c228 100644 --- a/kernel/trace/trace_preemptirq.c +++ b/kernel/trace/trace_preemptirq.c @@ -102,14 +102,14 @@ NOKPROBE_SYMBOL(trace_hardirqs_on_caller); __visible void trace_hardirqs_off_caller(unsigned long caller_addr) { + lockdep_hardirqs_off(CALLER_ADDR0); + if (!this_cpu_read(tracing_irq_cpu)) { this_cpu_write(tracing_irq_cpu, 1); tracer_hardirqs_off(CALLER_ADDR0, caller_addr); if (!in_nmi()) trace_irq_disable_rcuidle(CALLER_ADDR0, caller_addr); } - - lockdep_hardirqs_off(CALLER_ADDR0); } EXPORT_SYMBOL(trace_hardirqs_off_caller); NOKPROBE_SYMBOL(trace_hardirqs_off_caller); diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c index d4e31e969206..bb7783b90361 100644 --- a/kernel/trace/trace_printk.c +++ b/kernel/trace/trace_printk.c @@ -96,7 +96,7 @@ static int module_trace_bprintk_format_notify(struct notifier_block *self, if (val == MODULE_STATE_COMING) hold_module_trace_bprintk_format(start, end); } - return 0; + return NOTIFY_OK; } /* @@ -174,7 +174,7 @@ __init static int module_trace_bprintk_format_notify(struct notifier_block *self, unsigned long val, void *data) { - return 0; + return NOTIFY_OK; } static inline const char ** find_next_mod_format(int start_index, void *v, const char **fmt, loff_t *pos) diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index 73956eaff8a9..26efd22f0633 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -221,6 +221,29 @@ static void *func_remove(struct tracepoint_func **funcs, return old; } +static void tracepoint_update_call(struct tracepoint *tp, struct tracepoint_func *tp_funcs, bool sync) +{ + void *func = tp->iterator; + + /* Synthetic events do not have static call sites */ + if (!tp->static_call_key) + return; + + if (!tp_funcs[1].func) { + func = tp_funcs[0].func; + /* + * If going from the iterator back to a single caller, + * we need to synchronize with __DO_TRACE to make sure + * that the data passed to the callback is the one that + * belongs to that callback. + */ + if (sync) + tracepoint_synchronize_unregister(); + } + + __static_call_update(tp->static_call_key, tp->static_call_tramp, func); +} + /* * Add the probe function to a tracepoint. */ @@ -251,8 +274,9 @@ static int tracepoint_add_func(struct tracepoint *tp, * include/linux/tracepoint.h using rcu_dereference_sched(). */ rcu_assign_pointer(tp->funcs, tp_funcs); - if (!static_key_enabled(&tp->key)) - static_key_slow_inc(&tp->key); + tracepoint_update_call(tp, tp_funcs, false); + static_key_enable(&tp->key); + release_probes(old); return 0; } @@ -281,10 +305,13 @@ static int tracepoint_remove_func(struct tracepoint *tp, if (tp->unregfunc && static_key_enabled(&tp->key)) tp->unregfunc(); - if (static_key_enabled(&tp->key)) - static_key_slow_dec(&tp->key); + static_key_disable(&tp->key); + rcu_assign_pointer(tp->funcs, tp_funcs); + } else { + rcu_assign_pointer(tp->funcs, tp_funcs); + tracepoint_update_call(tp, tp_funcs, + tp_funcs[0].func != old[0].func); } - rcu_assign_pointer(tp->funcs, tp_funcs); release_probes(old); return 0; } @@ -521,7 +548,7 @@ static int tracepoint_module_notify(struct notifier_block *self, case MODULE_STATE_UNFORMED: break; } - return ret; + return notifier_from_errno(ret); } static struct notifier_block tracepoint_module_nb = { diff --git a/kernel/umh.c b/kernel/umh.c index fcf3ee803630..3f646613a9d3 100644 --- a/kernel/umh.c +++ b/kernel/umh.c @@ -14,6 +14,7 @@ #include <linux/cred.h> #include <linux/file.h> #include <linux/fdtable.h> +#include <linux/fs_struct.h> #include <linux/workqueue.h> #include <linux/security.h> #include <linux/mount.h> @@ -72,6 +73,14 @@ static int call_usermodehelper_exec_async(void *data) spin_unlock_irq(¤t->sighand->siglock); /* + * Initial kernel threads share ther FS with init, in order to + * get the init root directory. But we've now created a new + * thread that is going to execve a user process and has its own + * 'struct fs_struct'. Reset umask to the default. + */ + current->fs->umask = 0022; + + /* * Our parent (unbound workqueue) runs with elevated scheduling * priority. Avoid propagating that into the userspace child. */ diff --git a/kernel/workqueue.c b/kernel/workqueue.c index c41c3c17b86a..ac088ce6059b 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -427,7 +427,7 @@ static void show_pwq(struct pool_workqueue *pwq); #ifdef CONFIG_DEBUG_OBJECTS_WORK -static struct debug_obj_descr work_debug_descr; +static const struct debug_obj_descr work_debug_descr; static void *work_debug_hint(void *addr) { @@ -477,7 +477,7 @@ static bool work_fixup_free(void *addr, enum debug_obj_state state) } } -static struct debug_obj_descr work_debug_descr = { +static const struct debug_obj_descr work_debug_descr = { .name = "work_struct", .debug_hint = work_debug_hint, .is_static_object = work_is_static_object, |