diff options
Diffstat (limited to 'kernel')
33 files changed, 405 insertions, 166 deletions
diff --git a/kernel/Kconfig.kexec b/kernel/Kconfig.kexec index 6c34e63c88ff..4d111f871951 100644 --- a/kernel/Kconfig.kexec +++ b/kernel/Kconfig.kexec @@ -97,7 +97,7 @@ config KEXEC_JUMP config CRASH_DUMP bool "kernel crash dumps" - default y + default ARCH_DEFAULT_CRASH_DUMP depends on ARCH_SUPPORTS_CRASH_DUMP depends on KEXEC_CORE select VMCORE_INFO diff --git a/kernel/bpf/bpf_inode_storage.c b/kernel/bpf/bpf_inode_storage.c index 29da6d3838f6..e16e79f8cd6d 100644 --- a/kernel/bpf/bpf_inode_storage.c +++ b/kernel/bpf/bpf_inode_storage.c @@ -16,7 +16,6 @@ #include <uapi/linux/btf.h> #include <linux/bpf_lsm.h> #include <linux/btf_ids.h> -#include <linux/fdtable.h> #include <linux/rcupdate_trace.h> DEFINE_BPF_STORAGE_CACHE(inode_cache); diff --git a/kernel/bpf/bpf_task_storage.c b/kernel/bpf/bpf_task_storage.c index adf6dfe0ba68..1eb9852a9f8e 100644 --- a/kernel/bpf/bpf_task_storage.c +++ b/kernel/bpf/bpf_task_storage.c @@ -16,7 +16,6 @@ #include <linux/filter.h> #include <uapi/linux/btf.h> #include <linux/btf_ids.h> -#include <linux/fdtable.h> #include <linux/rcupdate_trace.h> DEFINE_BPF_STORAGE_CACHE(task_cache); diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index e7113d700b87..025d7e2214ae 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -24,6 +24,23 @@ DEFINE_STATIC_KEY_ARRAY_FALSE(cgroup_bpf_enabled_key, MAX_CGROUP_BPF_ATTACH_TYPE); EXPORT_SYMBOL(cgroup_bpf_enabled_key); +/* + * cgroup bpf destruction makes heavy use of work items and there can be a lot + * of concurrent destructions. Use a separate workqueue so that cgroup bpf + * destruction work items don't end up filling up max_active of system_wq + * which may lead to deadlock. + */ +static struct workqueue_struct *cgroup_bpf_destroy_wq; + +static int __init cgroup_bpf_wq_init(void) +{ + cgroup_bpf_destroy_wq = alloc_workqueue("cgroup_bpf_destroy", 0, 1); + if (!cgroup_bpf_destroy_wq) + panic("Failed to alloc workqueue for cgroup bpf destroy.\n"); + return 0; +} +core_initcall(cgroup_bpf_wq_init); + /* __always_inline is necessary to prevent indirect call through run_prog * function pointer. */ @@ -334,7 +351,7 @@ static void cgroup_bpf_release_fn(struct percpu_ref *ref) struct cgroup *cgrp = container_of(ref, struct cgroup, bpf.refcnt); INIT_WORK(&cgrp->bpf.release_work, cgroup_bpf_release); - queue_work(system_wq, &cgrp->bpf.release_work); + queue_work(cgroup_bpf_destroy_wq, &cgrp->bpf.release_work); } /* Get underlying bpf_prog of bpf_prog_list entry, regardless if it's through diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index ca3f0a2e5ed5..3d45ebe8afb4 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -2851,21 +2851,47 @@ struct bpf_iter_bits { __u64 __opaque[2]; } __aligned(8); +#define BITS_ITER_NR_WORDS_MAX 511 + struct bpf_iter_bits_kern { union { - unsigned long *bits; - unsigned long bits_copy; + __u64 *bits; + __u64 bits_copy; }; - u32 nr_bits; + int nr_bits; int bit; } __aligned(8); +/* On 64-bit hosts, unsigned long and u64 have the same size, so passing + * a u64 pointer and an unsigned long pointer to find_next_bit() will + * return the same result, as both point to the same 8-byte area. + * + * For 32-bit little-endian hosts, using a u64 pointer or unsigned long + * pointer also makes no difference. This is because the first iterated + * unsigned long is composed of bits 0-31 of the u64 and the second unsigned + * long is composed of bits 32-63 of the u64. + * + * However, for 32-bit big-endian hosts, this is not the case. The first + * iterated unsigned long will be bits 32-63 of the u64, so swap these two + * ulong values within the u64. + */ +static void swap_ulong_in_u64(u64 *bits, unsigned int nr) +{ +#if (BITS_PER_LONG == 32) && defined(__BIG_ENDIAN) + unsigned int i; + + for (i = 0; i < nr; i++) + bits[i] = (bits[i] >> 32) | ((u64)(u32)bits[i] << 32); +#endif +} + /** * bpf_iter_bits_new() - Initialize a new bits iterator for a given memory area * @it: The new bpf_iter_bits to be created * @unsafe_ptr__ign: A pointer pointing to a memory area to be iterated over * @nr_words: The size of the specified memory area, measured in 8-byte units. - * Due to the limitation of memalloc, it can't be greater than 512. + * The maximum value of @nr_words is @BITS_ITER_NR_WORDS_MAX. This limit may be + * further reduced by the BPF memory allocator implementation. * * This function initializes a new bpf_iter_bits structure for iterating over * a memory area which is specified by the @unsafe_ptr__ign and @nr_words. It @@ -2892,6 +2918,8 @@ bpf_iter_bits_new(struct bpf_iter_bits *it, const u64 *unsafe_ptr__ign, u32 nr_w if (!unsafe_ptr__ign || !nr_words) return -EINVAL; + if (nr_words > BITS_ITER_NR_WORDS_MAX) + return -E2BIG; /* Optimization for u64 mask */ if (nr_bits == 64) { @@ -2899,10 +2927,15 @@ bpf_iter_bits_new(struct bpf_iter_bits *it, const u64 *unsafe_ptr__ign, u32 nr_w if (err) return -EFAULT; + swap_ulong_in_u64(&kit->bits_copy, nr_words); + kit->nr_bits = nr_bits; return 0; } + if (bpf_mem_alloc_check_size(false, nr_bytes)) + return -E2BIG; + /* Fallback to memalloc */ kit->bits = bpf_mem_alloc(&bpf_global_ma, nr_bytes); if (!kit->bits) @@ -2914,6 +2947,8 @@ bpf_iter_bits_new(struct bpf_iter_bits *it, const u64 *unsafe_ptr__ign, u32 nr_w return err; } + swap_ulong_in_u64(kit->bits, nr_words); + kit->nr_bits = nr_bits; return 0; } @@ -2930,17 +2965,16 @@ bpf_iter_bits_new(struct bpf_iter_bits *it, const u64 *unsafe_ptr__ign, u32 nr_w __bpf_kfunc int *bpf_iter_bits_next(struct bpf_iter_bits *it) { struct bpf_iter_bits_kern *kit = (void *)it; - u32 nr_bits = kit->nr_bits; - const unsigned long *bits; - int bit; + int bit = kit->bit, nr_bits = kit->nr_bits; + const void *bits; - if (nr_bits == 0) + if (!nr_bits || bit >= nr_bits) return NULL; bits = nr_bits == 64 ? &kit->bits_copy : kit->bits; - bit = find_next_bit(bits, nr_bits, kit->bit + 1); + bit = find_next_bit(bits, nr_bits, bit + 1); if (bit >= nr_bits) { - kit->nr_bits = 0; + kit->bit = bit; return NULL; } diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index 0218a5132ab5..9b60eda0f727 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -655,7 +655,7 @@ static int trie_get_next_key(struct bpf_map *map, void *_key, void *_next_key) if (!key || key->prefixlen > trie->max_prefixlen) goto find_leftmost; - node_stack = kmalloc_array(trie->max_prefixlen, + node_stack = kmalloc_array(trie->max_prefixlen + 1, sizeof(struct lpm_trie_node *), GFP_ATOMIC | __GFP_NOWARN); if (!node_stack) diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c index b3858a76e0b3..146f5b57cfb1 100644 --- a/kernel/bpf/memalloc.c +++ b/kernel/bpf/memalloc.c @@ -35,6 +35,8 @@ */ #define LLIST_NODE_SZ sizeof(struct llist_node) +#define BPF_MEM_ALLOC_SIZE_MAX 4096 + /* similar to kmalloc, but sizeof == 8 bucket is gone */ static u8 size_index[24] __ro_after_init = { 3, /* 8 */ @@ -65,7 +67,7 @@ static u8 size_index[24] __ro_after_init = { static int bpf_mem_cache_idx(size_t size) { - if (!size || size > 4096) + if (!size || size > BPF_MEM_ALLOC_SIZE_MAX) return -1; if (size <= 192) @@ -1005,3 +1007,13 @@ void notrace *bpf_mem_cache_alloc_flags(struct bpf_mem_alloc *ma, gfp_t flags) return !ret ? NULL : ret + LLIST_NODE_SZ; } + +int bpf_mem_alloc_check_size(bool percpu, size_t size) +{ + /* The size of percpu allocation doesn't have LLIST_NODE_SZ overhead */ + if ((percpu && size > BPF_MEM_ALLOC_SIZE_MAX) || + (!percpu && size > BPF_MEM_ALLOC_SIZE_MAX - LLIST_NODE_SZ)) + return -E2BIG; + + return 0; +} diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c index 5af9e130e500..98d9b4c0daff 100644 --- a/kernel/bpf/task_iter.c +++ b/kernel/bpf/task_iter.c @@ -5,7 +5,6 @@ #include <linux/namei.h> #include <linux/pid_namespace.h> #include <linux/fs.h> -#include <linux/fdtable.h> #include <linux/filter.h> #include <linux/bpf_mem_alloc.h> #include <linux/btf_ids.h> @@ -286,17 +285,14 @@ again: curr_fd = 0; } - rcu_read_lock(); - f = task_lookup_next_fdget_rcu(curr_task, &curr_fd); + f = fget_task_next(curr_task, &curr_fd); if (f) { /* set info->fd */ info->fd = curr_fd; - rcu_read_unlock(); return f; } /* the current task is done, go to the next task */ - rcu_read_unlock(); put_task_struct(curr_task); if (info->common.type == BPF_TASK_ITER_TID) { diff --git a/kernel/bpf/token.c b/kernel/bpf/token.c index dcbec1a0dfb3..26057aa13503 100644 --- a/kernel/bpf/token.c +++ b/kernel/bpf/token.c @@ -1,6 +1,5 @@ #include <linux/bpf.h> #include <linux/vmalloc.h> -#include <linux/fdtable.h> #include <linux/file.h> #include <linux/fs.h> #include <linux/kernel.h> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 587a6c76e564..bb99bada7e2e 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -6804,20 +6804,10 @@ static int check_stack_slot_within_bounds(struct bpf_verifier_env *env, struct bpf_func_state *state, enum bpf_access_type t) { - struct bpf_insn_aux_data *aux = &env->insn_aux_data[env->insn_idx]; - int min_valid_off, max_bpf_stack; - - /* If accessing instruction is a spill/fill from bpf_fastcall pattern, - * add room for all caller saved registers below MAX_BPF_STACK. - * In case if bpf_fastcall rewrite won't happen maximal stack depth - * would be checked by check_max_stack_depth_subprog(). - */ - max_bpf_stack = MAX_BPF_STACK; - if (aux->fastcall_pattern) - max_bpf_stack += CALLER_SAVED_REGS * BPF_REG_SIZE; + int min_valid_off; if (t == BPF_WRITE || env->allow_uninit_stack) - min_valid_off = -max_bpf_stack; + min_valid_off = -MAX_BPF_STACK; else min_valid_off = -state->allocated_stack; @@ -17886,9 +17876,11 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) struct bpf_verifier_state_list *sl, **pprev; struct bpf_verifier_state *cur = env->cur_state, *new, *loop_entry; int i, j, n, err, states_cnt = 0; - bool force_new_state = env->test_state_freq || is_force_checkpoint(env, insn_idx); - bool add_new_state = force_new_state; - bool force_exact; + bool force_new_state, add_new_state, force_exact; + + force_new_state = env->test_state_freq || is_force_checkpoint(env, insn_idx) || + /* Avoid accumulating infinitely long jmp history */ + cur->jmp_history_cnt > 40; /* bpf progs typically have pruning point every 4 instructions * http://vger.kernel.org/bpfconf2019.html#session-1 @@ -17898,6 +17890,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) * In tests that amounts to up to 50% reduction into total verifier * memory consumption and 20% verifier time speedup. */ + add_new_state = force_new_state; if (env->jmps_processed - env->prev_jmps_processed >= 2 && env->insn_processed - env->prev_insn_processed >= 8) add_new_state = true; diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 5886b95c6eae..044c7ba1cc48 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -5789,7 +5789,7 @@ static bool cgroup_check_hierarchy_limits(struct cgroup *parent) { struct cgroup *cgroup; int ret = false; - int level = 1; + int level = 0; lockdep_assert_held(&cgroup_mutex); @@ -5797,7 +5797,7 @@ static bool cgroup_check_hierarchy_limits(struct cgroup *parent) if (cgroup->nr_descendants >= cgroup->max_descendants) goto fail; - if (level > cgroup->max_depth) + if (level >= cgroup->max_depth) goto fail; level++; diff --git a/kernel/events/core.c b/kernel/events/core.c index cdd09769e6c5..df27d08a7232 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -13959,7 +13959,7 @@ static void perf_event_clear_cpumask(unsigned int cpu) } /* migrate */ - list_for_each_entry_rcu(pmu, &pmus, entry, lockdep_is_held(&pmus_srcu)) { + list_for_each_entry(pmu, &pmus, entry) { if (pmu->scope == PERF_PMU_SCOPE_NONE || WARN_ON_ONCE(pmu->scope >= PERF_PMU_MAX_SCOPE)) continue; diff --git a/kernel/exit.c b/kernel/exit.c index 619f0014c33b..1dcddfe537ee 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -25,7 +25,6 @@ #include <linux/acct.h> #include <linux/tsacct_kern.h> #include <linux/file.h> -#include <linux/fdtable.h> #include <linux/freezer.h> #include <linux/binfmts.h> #include <linux/nsproxy.h> diff --git a/kernel/fork.c b/kernel/fork.c index 89ceb4a68af2..22f43721d031 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -105,6 +105,7 @@ #include <linux/rseq.h> #include <uapi/linux/pidfd.h> #include <linux/pidfs.h> +#include <linux/tick.h> #include <asm/pgalloc.h> #include <linux/uaccess.h> @@ -653,11 +654,6 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, mm->exec_vm = oldmm->exec_vm; mm->stack_vm = oldmm->stack_vm; - retval = ksm_fork(mm, oldmm); - if (retval) - goto out; - khugepaged_fork(mm, oldmm); - /* Use __mt_dup() to efficiently build an identical maple tree. */ retval = __mt_dup(&oldmm->mm_mt, &mm->mm_mt, GFP_KERNEL); if (unlikely(retval)) @@ -760,6 +756,8 @@ loop_out: vma_iter_free(&vmi); if (!retval) { mt_set_in_rcu(vmi.mas.tree); + ksm_fork(mm, oldmm); + khugepaged_fork(mm, oldmm); } else if (mpnt) { /* * The entire maple tree has already been duplicated. If the @@ -775,7 +773,10 @@ out: mmap_write_unlock(mm); flush_tlb_mm(oldmm); mmap_write_unlock(oldmm); - dup_userfaultfd_complete(&uf); + if (!retval) + dup_userfaultfd_complete(&uf); + else + dup_userfaultfd_fail(&uf); fail_uprobe_end: uprobe_end_dup_mmap(); return retval; @@ -2292,6 +2293,7 @@ __latent_entropy struct task_struct *copy_process( acct_clear_integrals(p); posix_cputimers_init(&p->posix_cputimers); + tick_dep_init_task(p); p->io_context = NULL; audit_set_context(p, NULL); diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index 3a24d6b5f559..396a067a8a56 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -718,7 +718,7 @@ static int msi_domain_alloc(struct irq_domain *domain, unsigned int virq, ret = ops->msi_init(domain, info, virq + i, hwirq + i, arg); if (ret < 0) { if (ops->msi_free) { - for (i--; i > 0; i--) + for (i--; i >= 0; i--) ops->msi_free(domain, info, virq + i); } irq_domain_free_irqs_top(domain, virq, nr_irqs); diff --git a/kernel/kcmp.c b/kernel/kcmp.c index b0639f21041f..2c596851f8a9 100644 --- a/kernel/kcmp.c +++ b/kernel/kcmp.c @@ -63,9 +63,7 @@ get_file_raw_ptr(struct task_struct *task, unsigned int idx) { struct file *file; - rcu_read_lock(); - file = task_lookup_fdget_rcu(task, idx); - rcu_read_unlock(); + file = fget_task(task, idx); if (file) fput(file); diff --git a/kernel/module/dups.c b/kernel/module/dups.c index 9a92f2f8c9d3..bd2149fbe117 100644 --- a/kernel/module/dups.c +++ b/kernel/module/dups.c @@ -18,7 +18,6 @@ #include <linux/completion.h> #include <linux/cred.h> #include <linux/file.h> -#include <linux/fdtable.h> #include <linux/workqueue.h> #include <linux/security.h> #include <linux/mount.h> diff --git a/kernel/module/kmod.c b/kernel/module/kmod.c index 0800d9891692..25f253812512 100644 --- a/kernel/module/kmod.c +++ b/kernel/module/kmod.c @@ -15,7 +15,6 @@ #include <linux/completion.h> #include <linux/cred.h> #include <linux/file.h> -#include <linux/fdtable.h> #include <linux/workqueue.h> #include <linux/security.h> #include <linux/mount.h> diff --git a/kernel/resource.c b/kernel/resource.c index b730bd28b422..4101016e8b20 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -459,9 +459,7 @@ int walk_system_ram_res_rev(u64 start, u64 end, void *arg, rams_size += 16; } - rams[i].start = res.start; - rams[i++].end = res.end; - + rams[i++] = res; start = res.end + 1; } diff --git a/kernel/sched/core.c b/kernel/sched/core.c index dbfb5717d6af..a1c353a62c56 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4711,7 +4711,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) if (rt_prio(p->prio)) { p->sched_class = &rt_sched_class; #ifdef CONFIG_SCHED_CLASS_EXT - } else if (task_should_scx(p)) { + } else if (task_should_scx(p->policy)) { p->sched_class = &ext_sched_class; #endif } else { @@ -5920,12 +5920,15 @@ static void prev_balance(struct rq *rq, struct task_struct *prev, #ifdef CONFIG_SCHED_CLASS_EXT /* - * SCX requires a balance() call before every pick_next_task() including - * when waking up from SCHED_IDLE. If @start_class is below SCX, start - * from SCX instead. + * SCX requires a balance() call before every pick_task() including when + * waking up from SCHED_IDLE. If @start_class is below SCX, start from + * SCX instead. Also, set a flag to detect missing balance() call. */ - if (scx_enabled() && sched_class_above(&ext_sched_class, start_class)) - start_class = &ext_sched_class; + if (scx_enabled()) { + rq->scx.flags |= SCX_RQ_BAL_PENDING; + if (sched_class_above(&ext_sched_class, start_class)) + start_class = &ext_sched_class; + } #endif /* @@ -7025,7 +7028,7 @@ int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flag } EXPORT_SYMBOL(default_wake_function); -const struct sched_class *__setscheduler_class(struct task_struct *p, int prio) +const struct sched_class *__setscheduler_class(int policy, int prio) { if (dl_prio(prio)) return &dl_sched_class; @@ -7034,7 +7037,7 @@ const struct sched_class *__setscheduler_class(struct task_struct *p, int prio) return &rt_sched_class; #ifdef CONFIG_SCHED_CLASS_EXT - if (task_should_scx(p)) + if (task_should_scx(policy)) return &ext_sched_class; #endif @@ -7142,7 +7145,7 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task) queue_flag &= ~DEQUEUE_MOVE; prev_class = p->sched_class; - next_class = __setscheduler_class(p, prio); + next_class = __setscheduler_class(p->policy, prio); if (prev_class != next_class && p->se.sched_delayed) dequeue_task(rq, p, DEQUEUE_SLEEP | DEQUEUE_DELAYED | DEQUEUE_NOCLOCK); diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 5900b06fd036..751d73d500e5 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -862,7 +862,8 @@ static DEFINE_MUTEX(scx_ops_enable_mutex); DEFINE_STATIC_KEY_FALSE(__scx_ops_enabled); DEFINE_STATIC_PERCPU_RWSEM(scx_fork_rwsem); static atomic_t scx_ops_enable_state_var = ATOMIC_INIT(SCX_OPS_DISABLED); -static atomic_t scx_ops_bypass_depth = ATOMIC_INIT(0); +static int scx_ops_bypass_depth; +static DEFINE_RAW_SPINLOCK(__scx_ops_bypass_lock); static bool scx_ops_init_task_enabled; static bool scx_switching_all; DEFINE_STATIC_KEY_FALSE(__scx_switched_all); @@ -2633,7 +2634,7 @@ static int balance_one(struct rq *rq, struct task_struct *prev) lockdep_assert_rq_held(rq); rq->scx.flags |= SCX_RQ_IN_BALANCE; - rq->scx.flags &= ~SCX_RQ_BAL_KEEP; + rq->scx.flags &= ~(SCX_RQ_BAL_PENDING | SCX_RQ_BAL_KEEP); if (static_branch_unlikely(&scx_ops_cpu_preempt) && unlikely(rq->scx.cpu_released)) { @@ -2644,7 +2645,7 @@ static int balance_one(struct rq *rq, struct task_struct *prev) * emitted in scx_next_task_picked(). */ if (SCX_HAS_OP(cpu_acquire)) - SCX_CALL_OP(0, cpu_acquire, cpu_of(rq), NULL); + SCX_CALL_OP(SCX_KF_REST, cpu_acquire, cpu_of(rq), NULL); rq->scx.cpu_released = false; } @@ -2947,12 +2948,11 @@ static struct task_struct *pick_task_scx(struct rq *rq) { struct task_struct *prev = rq->curr; struct task_struct *p; + bool prev_on_scx = prev->sched_class == &ext_sched_class; + bool keep_prev = rq->scx.flags & SCX_RQ_BAL_KEEP; + bool kick_idle = false; /* - * If balance_scx() is telling us to keep running @prev, replenish slice - * if necessary and keep running @prev. Otherwise, pop the first one - * from the local DSQ. - * * WORKAROUND: * * %SCX_RQ_BAL_KEEP should be set iff $prev is on SCX as it must just @@ -2961,22 +2961,41 @@ static struct task_struct *pick_task_scx(struct rq *rq) * which then ends up calling pick_task_scx() without preceding * balance_scx(). * - * For now, ignore cases where $prev is not on SCX. This isn't great and - * can theoretically lead to stalls. However, for switch_all cases, this - * happens only while a BPF scheduler is being loaded or unloaded, and, - * for partial cases, fair will likely keep triggering this CPU. + * Keep running @prev if possible and avoid stalling from entering idle + * without balancing. * - * Once fair is fixed, restore WARN_ON_ONCE(). + * Once fair is fixed, remove the workaround and trigger WARN_ON_ONCE() + * if pick_task_scx() is called without preceding balance_scx(). */ - if ((rq->scx.flags & SCX_RQ_BAL_KEEP) && - prev->sched_class == &ext_sched_class) { + if (unlikely(rq->scx.flags & SCX_RQ_BAL_PENDING)) { + if (prev_on_scx) { + keep_prev = true; + } else { + keep_prev = false; + kick_idle = true; + } + } else if (unlikely(keep_prev && !prev_on_scx)) { + /* only allowed during transitions */ + WARN_ON_ONCE(scx_ops_enable_state() == SCX_OPS_ENABLED); + keep_prev = false; + } + + /* + * If balance_scx() is telling us to keep running @prev, replenish slice + * if necessary and keep running @prev. Otherwise, pop the first one + * from the local DSQ. + */ + if (keep_prev) { p = prev; if (!p->scx.slice) p->scx.slice = SCX_SLICE_DFL; } else { p = first_local_task(rq); - if (!p) + if (!p) { + if (kick_idle) + scx_bpf_kick_cpu(cpu_of(rq), SCX_KICK_IDLE); return NULL; + } if (unlikely(!p->scx.slice)) { if (!scx_rq_bypassing(rq) && !scx_warned_zero_slice) { @@ -4256,14 +4275,14 @@ static const struct kset_uevent_ops scx_uevent_ops = { * Used by sched_fork() and __setscheduler_prio() to pick the matching * sched_class. dl/rt are already handled. */ -bool task_should_scx(struct task_struct *p) +bool task_should_scx(int policy) { if (!scx_enabled() || unlikely(scx_ops_enable_state() == SCX_OPS_DISABLING)) return false; if (READ_ONCE(scx_switching_all)) return true; - return p->policy == SCHED_EXT; + return policy == SCHED_EXT; } /** @@ -4298,18 +4317,20 @@ bool task_should_scx(struct task_struct *p) */ static void scx_ops_bypass(bool bypass) { - int depth, cpu; + int cpu; + unsigned long flags; + raw_spin_lock_irqsave(&__scx_ops_bypass_lock, flags); if (bypass) { - depth = atomic_inc_return(&scx_ops_bypass_depth); - WARN_ON_ONCE(depth <= 0); - if (depth != 1) - return; + scx_ops_bypass_depth++; + WARN_ON_ONCE(scx_ops_bypass_depth <= 0); + if (scx_ops_bypass_depth != 1) + goto unlock; } else { - depth = atomic_dec_return(&scx_ops_bypass_depth); - WARN_ON_ONCE(depth < 0); - if (depth != 0) - return; + scx_ops_bypass_depth--; + WARN_ON_ONCE(scx_ops_bypass_depth < 0); + if (scx_ops_bypass_depth != 0) + goto unlock; } /* @@ -4326,7 +4347,7 @@ static void scx_ops_bypass(bool bypass) struct rq_flags rf; struct task_struct *p, *n; - rq_lock_irqsave(rq, &rf); + rq_lock(rq, &rf); if (bypass) { WARN_ON_ONCE(rq->scx.flags & SCX_RQ_BYPASSING); @@ -4362,11 +4383,13 @@ static void scx_ops_bypass(bool bypass) sched_enq_and_set_task(&ctx); } - rq_unlock_irqrestore(rq, &rf); + rq_unlock(rq, &rf); /* resched to restore ticks and idle state */ resched_cpu(cpu); } +unlock: + raw_spin_unlock_irqrestore(&__scx_ops_bypass_lock, flags); } static void free_exit_info(struct scx_exit_info *ei) @@ -4489,11 +4512,16 @@ static void scx_ops_disable_workfn(struct kthread_work *work) scx_task_iter_start(&sti); while ((p = scx_task_iter_next_locked(&sti))) { const struct sched_class *old_class = p->sched_class; + const struct sched_class *new_class = + __setscheduler_class(p->policy, p->prio); struct sched_enq_and_set_ctx ctx; + if (old_class != new_class && p->se.sched_delayed) + dequeue_task(task_rq(p), p, DEQUEUE_SLEEP | DEQUEUE_DELAYED); + sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx); - p->sched_class = __setscheduler_class(p, p->prio); + p->sched_class = new_class; check_class_changing(task_rq(p), p, old_class); sched_enq_and_set_task(&ctx); @@ -4969,7 +4997,7 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) if (!cpumask_equal(housekeeping_cpumask(HK_TYPE_DOMAIN), cpu_possible_mask)) { - pr_err("sched_ext: Not compatible with \"isolcpus=\" domain isolation"); + pr_err("sched_ext: Not compatible with \"isolcpus=\" domain isolation\n"); return -EINVAL; } @@ -5199,12 +5227,17 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) scx_task_iter_start(&sti); while ((p = scx_task_iter_next_locked(&sti))) { const struct sched_class *old_class = p->sched_class; + const struct sched_class *new_class = + __setscheduler_class(p->policy, p->prio); struct sched_enq_and_set_ctx ctx; + if (old_class != new_class && p->se.sched_delayed) + dequeue_task(task_rq(p), p, DEQUEUE_SLEEP | DEQUEUE_DELAYED); + sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx); p->scx.slice = SCX_SLICE_DFL; - p->sched_class = __setscheduler_class(p, p->prio); + p->sched_class = new_class; check_class_changing(task_rq(p), p, old_class); sched_enq_and_set_task(&ctx); diff --git a/kernel/sched/ext.h b/kernel/sched/ext.h index 246019519231..b1675bb59fc4 100644 --- a/kernel/sched/ext.h +++ b/kernel/sched/ext.h @@ -18,7 +18,7 @@ bool scx_can_stop_tick(struct rq *rq); void scx_rq_activate(struct rq *rq); void scx_rq_deactivate(struct rq *rq); int scx_check_setscheduler(struct task_struct *p, int policy); -bool task_should_scx(struct task_struct *p); +bool task_should_scx(int policy); void init_sched_ext_class(void); static inline u32 scx_cpuperf_target(s32 cpu) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index c157d4860a3b..2d16c8545c71 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3369,7 +3369,7 @@ retry_pids: vma = vma_next(&vmi); } - do { + for (; vma; vma = vma_next(&vmi)) { if (!vma_migratable(vma) || !vma_policy_mof(vma) || is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_MIXEDMAP)) { trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_UNSUITABLE); @@ -3491,7 +3491,7 @@ retry_pids: */ if (vma_pids_forced) break; - } for_each_vma(vmi, vma); + } /* * If no VMAs are remaining and VMAs were skipped due to the PID @@ -5625,8 +5625,9 @@ pick_next_entity(struct rq *rq, struct cfs_rq *cfs_rq) struct sched_entity *se = pick_eevdf(cfs_rq); if (se->sched_delayed) { dequeue_entities(rq, se, DEQUEUE_SLEEP | DEQUEUE_DELAYED); - SCHED_WARN_ON(se->sched_delayed); - SCHED_WARN_ON(se->on_rq); + /* + * Must not reference @se again, see __block_task(). + */ return NULL; } return se; @@ -7176,7 +7177,11 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags) /* Fix-up what dequeue_task_fair() skipped */ hrtick_update(rq); - /* Fix-up what block_task() skipped. */ + /* + * Fix-up what block_task() skipped. + * + * Must be last, @p might not be valid after this. + */ __block_task(rq, p); } @@ -7193,12 +7198,14 @@ static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (!(p->se.sched_delayed && (task_on_rq_migrating(p) || (flags & DEQUEUE_SAVE)))) util_est_dequeue(&rq->cfs, p); - if (dequeue_entities(rq, &p->se, flags) < 0) { - util_est_update(&rq->cfs, p, DEQUEUE_SLEEP); + util_est_update(&rq->cfs, p, flags & DEQUEUE_SLEEP); + if (dequeue_entities(rq, &p->se, flags) < 0) return false; - } - util_est_update(&rq->cfs, p, flags & DEQUEUE_SLEEP); + /* + * Must not reference @p after dequeue_entities(DEQUEUE_DELAYED). + */ + hrtick_update(rq); return true; } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 081519ffab46..c03b3d7b320e 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -751,8 +751,9 @@ enum scx_rq_flags { */ SCX_RQ_ONLINE = 1 << 0, SCX_RQ_CAN_STOP_TICK = 1 << 1, - SCX_RQ_BAL_KEEP = 1 << 2, /* balance decided to keep current */ - SCX_RQ_BYPASSING = 1 << 3, + SCX_RQ_BAL_PENDING = 1 << 2, /* balance hasn't run yet */ + SCX_RQ_BAL_KEEP = 1 << 3, /* balance decided to keep current */ + SCX_RQ_BYPASSING = 1 << 4, SCX_RQ_IN_WAKEUP = 1 << 16, SCX_RQ_IN_BALANCE = 1 << 17, @@ -2769,8 +2770,6 @@ static inline void sub_nr_running(struct rq *rq, unsigned count) static inline void __block_task(struct rq *rq, struct task_struct *p) { - WRITE_ONCE(p->on_rq, 0); - ASSERT_EXCLUSIVE_WRITER(p->on_rq); if (p->sched_contributes_to_load) rq->nr_uninterruptible++; @@ -2778,6 +2777,38 @@ static inline void __block_task(struct rq *rq, struct task_struct *p) atomic_inc(&rq->nr_iowait); delayacct_blkio_start(); } + + ASSERT_EXCLUSIVE_WRITER(p->on_rq); + + /* + * The moment this write goes through, ttwu() can swoop in and migrate + * this task, rendering our rq->__lock ineffective. + * + * __schedule() try_to_wake_up() + * LOCK rq->__lock LOCK p->pi_lock + * pick_next_task() + * pick_next_task_fair() + * pick_next_entity() + * dequeue_entities() + * __block_task() + * RELEASE p->on_rq = 0 if (p->on_rq && ...) + * break; + * + * ACQUIRE (after ctrl-dep) + * + * cpu = select_task_rq(); + * set_task_cpu(p, cpu); + * ttwu_queue() + * ttwu_do_activate() + * LOCK rq->__lock + * activate_task() + * STORE p->on_rq = 1 + * UNLOCK rq->__lock + * + * Callers must ensure to not reference @p after this -- we no longer + * own it. + */ + smp_store_release(&p->on_rq, 0); } extern void activate_task(struct rq *rq, struct task_struct *p, int flags); @@ -3800,7 +3831,7 @@ static inline int rt_effective_prio(struct task_struct *p, int prio) extern int __sched_setscheduler(struct task_struct *p, const struct sched_attr *attr, bool user, bool pi); extern int __sched_setaffinity(struct task_struct *p, struct affinity_context *ctx); -extern const struct sched_class *__setscheduler_class(struct task_struct *p, int prio); +extern const struct sched_class *__setscheduler_class(int policy, int prio); extern void set_load_weight(struct task_struct *p, bool update_load); extern void enqueue_task(struct rq *rq, struct task_struct *p, int flags); extern bool dequeue_task(struct rq *rq, struct task_struct *p, int flags); diff --git a/kernel/sched/syscalls.c b/kernel/sched/syscalls.c index 0470bcc3d204..f9bed5ec719e 100644 --- a/kernel/sched/syscalls.c +++ b/kernel/sched/syscalls.c @@ -707,7 +707,7 @@ change: } prev_class = p->sched_class; - next_class = __setscheduler_class(p, newprio); + next_class = __setscheduler_class(policy, newprio); if (prev_class != next_class && p->se.sched_delayed) dequeue_task(rq, p, DEQUEUE_SLEEP | DEQUEUE_DELAYED | DEQUEUE_NOCLOCK); @@ -1081,45 +1081,6 @@ SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param) return copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0; } -/* - * Copy the kernel size attribute structure (which might be larger - * than what user-space knows about) to user-space. - * - * Note that all cases are valid: user-space buffer can be larger or - * smaller than the kernel-space buffer. The usual case is that both - * have the same size. - */ -static int -sched_attr_copy_to_user(struct sched_attr __user *uattr, - struct sched_attr *kattr, - unsigned int usize) -{ - unsigned int ksize = sizeof(*kattr); - - if (!access_ok(uattr, usize)) - return -EFAULT; - - /* - * sched_getattr() ABI forwards and backwards compatibility: - * - * If usize == ksize then we just copy everything to user-space and all is good. - * - * If usize < ksize then we only copy as much as user-space has space for, - * this keeps ABI compatibility as well. We skip the rest. - * - * If usize > ksize then user-space is using a newer version of the ABI, - * which part the kernel doesn't know about. Just ignore it - tooling can - * detect the kernel's knowledge of attributes from the attr->size value - * which is set to ksize in this case. - */ - kattr->size = min(usize, ksize); - - if (copy_to_user(uattr, kattr, kattr->size)) - return -EFAULT; - - return 0; -} - /** * sys_sched_getattr - similar to sched_getparam, but with sched_attr * @pid: the pid in question. @@ -1164,7 +1125,8 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, #endif } - return sched_attr_copy_to_user(uattr, &kattr, usize); + kattr.size = min(usize, sizeof(kattr)); + return copy_struct_to_user(uattr, usize, &kattr, sizeof(kattr), NULL); } #ifdef CONFIG_SMP diff --git a/kernel/signal.c b/kernel/signal.c index 4344860ffcac..cbabb2d05e0a 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -419,7 +419,8 @@ __sigqueue_alloc(int sig, struct task_struct *t, gfp_t gfp_flags, */ rcu_read_lock(); ucounts = task_ucounts(t); - sigpending = inc_rlimit_get_ucounts(ucounts, UCOUNT_RLIMIT_SIGPENDING); + sigpending = inc_rlimit_get_ucounts(ucounts, UCOUNT_RLIMIT_SIGPENDING, + override_rlimit); rcu_read_unlock(); if (!sigpending) return NULL; diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 7e6f409bf311..962b2a31f015 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -114,6 +114,23 @@ static struct tk_fast tk_fast_raw ____cacheline_aligned = { .base[1] = FAST_TK_INIT, }; +/* + * Multigrain timestamps require tracking the latest fine-grained timestamp + * that has been issued, and never returning a coarse-grained timestamp that is + * earlier than that value. + * + * mg_floor represents the latest fine-grained time that has been handed out as + * a file timestamp on the system. This is tracked as a monotonic ktime_t, and + * converted to a realtime clock value on an as-needed basis. + * + * Maintaining mg_floor ensures the multigrain interfaces never issue a + * timestamp earlier than one that has been previously issued. + * + * The exception to this rule is when there is a backward realtime clock jump. If + * such an event occurs, a timestamp can appear to be earlier than a previous one. + */ +static __cacheline_aligned_in_smp atomic64_t mg_floor; + static inline void tk_normalize_xtime(struct timekeeper *tk) { while (tk->tkr_mono.xtime_nsec >= ((u64)NSEC_PER_SEC << tk->tkr_mono.shift)) { @@ -2394,6 +2411,94 @@ void ktime_get_coarse_real_ts64(struct timespec64 *ts) } EXPORT_SYMBOL(ktime_get_coarse_real_ts64); +/** + * ktime_get_coarse_real_ts64_mg - return latter of coarse grained time or floor + * @ts: timespec64 to be filled + * + * Fetch the global mg_floor value, convert it to realtime and compare it + * to the current coarse-grained time. Fill @ts with whichever is + * latest. Note that this is a filesystem-specific interface and should be + * avoided outside of that context. + */ +void ktime_get_coarse_real_ts64_mg(struct timespec64 *ts) +{ + struct timekeeper *tk = &tk_core.timekeeper; + u64 floor = atomic64_read(&mg_floor); + ktime_t f_real, offset, coarse; + unsigned int seq; + + do { + seq = read_seqcount_begin(&tk_core.seq); + *ts = tk_xtime(tk); + offset = tk_core.timekeeper.offs_real; + } while (read_seqcount_retry(&tk_core.seq, seq)); + + coarse = timespec64_to_ktime(*ts); + f_real = ktime_add(floor, offset); + if (ktime_after(f_real, coarse)) + *ts = ktime_to_timespec64(f_real); +} + +/** + * ktime_get_real_ts64_mg - attempt to update floor value and return result + * @ts: pointer to the timespec to be set + * + * Get a monotonic fine-grained time value and attempt to swap it into + * mg_floor. If that succeeds then accept the new floor value. If it fails + * then another task raced in during the interim time and updated the + * floor. Since any update to the floor must be later than the previous + * floor, either outcome is acceptable. + * + * Typically this will be called after calling ktime_get_coarse_real_ts64_mg(), + * and determining that the resulting coarse-grained timestamp did not effect + * a change in ctime. Any more recent floor value would effect a change to + * ctime, so there is no need to retry the atomic64_try_cmpxchg() on failure. + * + * @ts will be filled with the latest floor value, regardless of the outcome of + * the cmpxchg. Note that this is a filesystem specific interface and should be + * avoided outside of that context. + */ +void ktime_get_real_ts64_mg(struct timespec64 *ts) +{ + struct timekeeper *tk = &tk_core.timekeeper; + ktime_t old = atomic64_read(&mg_floor); + ktime_t offset, mono; + unsigned int seq; + u64 nsecs; + + do { + seq = read_seqcount_begin(&tk_core.seq); + + ts->tv_sec = tk->xtime_sec; + mono = tk->tkr_mono.base; + nsecs = timekeeping_get_ns(&tk->tkr_mono); + offset = tk_core.timekeeper.offs_real; + } while (read_seqcount_retry(&tk_core.seq, seq)); + + mono = ktime_add_ns(mono, nsecs); + + /* + * Attempt to update the floor with the new time value. As any + * update must be later then the existing floor, and would effect + * a change to ctime from the perspective of the current task, + * accept the resulting floor value regardless of the outcome of + * the swap. + */ + if (atomic64_try_cmpxchg(&mg_floor, &old, mono)) { + ts->tv_nsec = 0; + timespec64_add_ns(ts, nsecs); + timekeeping_inc_mg_floor_swaps(); + } else { + /* + * Another task changed mg_floor since "old" was fetched. + * "old" has been updated with the latest value of "mg_floor". + * That value is newer than the previous floor value, which + * is enough to effect a change to ctime. Accept it. + */ + *ts = ktime_to_timespec64(ktime_add(old, offset)); + } +} + void ktime_get_coarse_ts64(struct timespec64 *ts) { struct timekeeper *tk = &tk_core.timekeeper; diff --git a/kernel/time/timekeeping_debug.c b/kernel/time/timekeeping_debug.c index b73e8850e58d..badeb222eab9 100644 --- a/kernel/time/timekeeping_debug.c +++ b/kernel/time/timekeeping_debug.c @@ -17,6 +17,9 @@ #define NUM_BINS 32 +/* Incremented every time mg_floor is updated */ +DEFINE_PER_CPU(unsigned long, timekeeping_mg_floor_swaps); + static unsigned int sleep_time_bin[NUM_BINS] = {0}; static int tk_debug_sleep_time_show(struct seq_file *s, void *data) @@ -53,3 +56,13 @@ void tk_debug_account_sleep_time(const struct timespec64 *t) (s64)t->tv_sec, t->tv_nsec / NSEC_PER_MSEC); } +unsigned long timekeeping_get_mg_floor_swaps(void) +{ + unsigned long sum = 0; + int cpu; + + for_each_possible_cpu(cpu) + sum += data_race(per_cpu(timekeeping_mg_floor_swaps, cpu)); + + return sum; +} diff --git a/kernel/time/timekeeping_internal.h b/kernel/time/timekeeping_internal.h index 4ca2787d1642..0bbae825bc02 100644 --- a/kernel/time/timekeeping_internal.h +++ b/kernel/time/timekeeping_internal.h @@ -10,9 +10,24 @@ * timekeeping debug functions */ #ifdef CONFIG_DEBUG_FS + +DECLARE_PER_CPU(unsigned long, timekeeping_mg_floor_swaps); + +static inline void timekeeping_inc_mg_floor_swaps(void) +{ + this_cpu_inc(timekeeping_mg_floor_swaps); +} + extern void tk_debug_account_sleep_time(const struct timespec64 *t); + #else + #define tk_debug_account_sleep_time(x) + +static inline void timekeeping_inc_mg_floor_swaps(void) +{ +} + #endif #ifdef CONFIG_CLOCKSOURCE_VALIDATE_LAST_CYCLE diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 3ea4f7bb1837..5807116bcd0b 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2337,12 +2337,9 @@ static struct trace_buffer *alloc_buffer(unsigned long size, unsigned flags, if (!buffer->buffers[cpu]) goto fail_free_buffers; - /* If already mapped, do not hook to CPU hotplug */ - if (!start) { - ret = cpuhp_state_add_instance(CPUHP_TRACE_RB_PREPARE, &buffer->node); - if (ret < 0) - goto fail_free_buffers; - } + ret = cpuhp_state_add_instance(CPUHP_TRACE_RB_PREPARE, &buffer->node); + if (ret < 0) + goto fail_free_buffers; mutex_init(&buffer->mutex); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index a8f52b6527ca..6a891e00aa7f 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2386,6 +2386,25 @@ void tracing_reset_online_cpus(struct array_buffer *buf) ring_buffer_record_enable(buffer); } +static void tracing_reset_all_cpus(struct array_buffer *buf) +{ + struct trace_buffer *buffer = buf->buffer; + + if (!buffer) + return; + + ring_buffer_record_disable(buffer); + + /* Make sure all commits have finished */ + synchronize_rcu(); + + buf->time_start = buffer_ftrace_now(buf, buf->cpu); + + ring_buffer_reset(buffer); + + ring_buffer_record_enable(buffer); +} + /* Must have trace_types_lock held */ void tracing_reset_all_online_cpus_unlocked(void) { @@ -5501,6 +5520,10 @@ static const struct file_operations tracing_iter_fops = { static const char readme_msg[] = "tracing mini-HOWTO:\n\n" + "By default tracefs removes all OTH file permission bits.\n" + "When mounting tracefs an optional group id can be specified\n" + "which adds the group to every directory and file in tracefs:\n\n" + "\t e.g. mount -t tracefs [-o [gid=<gid>]] nodev /sys/kernel/tracing\n\n" "# echo 0 > tracing_on : quick way to disable tracing\n" "# echo 1 > tracing_on : quick way to re-enable tracing\n\n" " Important files:\n" @@ -6141,8 +6164,13 @@ static void update_last_data(struct trace_array *tr) if (!tr->text_delta && !tr->data_delta) return; - /* Clear old data */ - tracing_reset_online_cpus(&tr->array_buffer); + /* + * Need to clear all CPU buffers as there cannot be events + * from the previous boot mixed with events with this boot + * as that will cause a confusing trace. Need to clear all + * CPU buffers, even for those that may currently be offline. + */ + tracing_reset_all_cpus(&tr->array_buffer); /* Using current data now */ tr->text_delta = 0; diff --git a/kernel/ucount.c b/kernel/ucount.c index 8c07714ff27d..696406939be5 100644 --- a/kernel/ucount.c +++ b/kernel/ucount.c @@ -307,7 +307,8 @@ void dec_rlimit_put_ucounts(struct ucounts *ucounts, enum rlimit_type type) do_dec_rlimit_put_ucounts(ucounts, NULL, type); } -long inc_rlimit_get_ucounts(struct ucounts *ucounts, enum rlimit_type type) +long inc_rlimit_get_ucounts(struct ucounts *ucounts, enum rlimit_type type, + bool override_rlimit) { /* Caller must hold a reference to ucounts */ struct ucounts *iter; @@ -317,10 +318,11 @@ long inc_rlimit_get_ucounts(struct ucounts *ucounts, enum rlimit_type type) for (iter = ucounts; iter; iter = iter->ns->ucounts) { long new = atomic_long_add_return(1, &iter->rlimit[type]); if (new < 0 || new > max) - goto unwind; + goto dec_unwind; if (iter == ucounts) ret = new; - max = get_userns_rlimit_max(iter->ns, type); + if (!override_rlimit) + max = get_userns_rlimit_max(iter->ns, type); /* * Grab an extra ucount reference for the caller when * the rlimit count was previously 0. @@ -334,7 +336,6 @@ long inc_rlimit_get_ucounts(struct ucounts *ucounts, enum rlimit_type type) dec_unwind: dec = atomic_long_sub_return(1, &iter->rlimit[type]); WARN_ON_ONCE(dec < 0); -unwind: do_dec_rlimit_put_ucounts(ucounts, iter, type); return 0; } diff --git a/kernel/umh.c b/kernel/umh.c index ff1f13a27d29..be9234270777 100644 --- a/kernel/umh.c +++ b/kernel/umh.c @@ -13,7 +13,6 @@ #include <linux/completion.h> #include <linux/cred.h> #include <linux/file.h> -#include <linux/fdtable.h> #include <linux/fs_struct.h> #include <linux/workqueue.h> #include <linux/security.h> |