Diffstat (limited to 'kernel/sched/core.c')
-rw-r--r-- | kernel/sched/core.c | 639
1 file changed, 415 insertions, 224 deletions
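Several hunks below start measuring how long SMT siblings sit forcibly idle under core scheduling: the shared per-core state gains core_forceidle_count, core_forceidle_start and core_forceidle_occupation, pick_next_task() stamps the start time when schedstats are enabled, and sched_core_account_forceidle() (whose definition is not part of this diff) is called at the next accounting edge before the state is reset. The following is a simplified, standalone userspace model of that bookkeeping; the demo_* names are hypothetical, the time units are arbitrary, and the "charge elapsed time once per forced-idle sibling" policy is a deliberate simplification rather than the kernel's exact split:

/* Simplified model of forced-idle accounting (hypothetical names, not kernel code). */
#include <stdio.h>

struct demo_core {
	unsigned long long forceidle_start;	/* cf. rq->core->core_forceidle_start */
	unsigned int forceidle_count;		/* cf. rq->core->core_forceidle_count */
	unsigned long long forceidle_sum;	/* accumulated forced-idle time */
};

/* A pick left 'idle_cpus' siblings forcibly idle: remember when that began. */
static void begin_forceidle(struct demo_core *c, unsigned long long now, unsigned int idle_cpus)
{
	c->forceidle_count = idle_cpus;
	c->forceidle_start = now;
}

/* Accounting edge: close the interval and charge it before state is reset. */
static void account_forceidle(struct demo_core *c, unsigned long long now)
{
	if (!c->forceidle_count)
		return;
	c->forceidle_sum += (now - c->forceidle_start) * c->forceidle_count;
	c->forceidle_start = now;
}

int main(void)
{
	struct demo_core core = { 0 };

	begin_forceidle(&core, 1000, 2);	/* two siblings forced idle at t=1000 */
	account_forceidle(&core, 4000);		/* edge at t=4000: charges 2 * 3000 */
	printf("forced idle time: %llu\n", core.forceidle_sum);
	return 0;
}

The explicit accounting edge is also why sched_core_dequeue() below reschedules when the last task migrates off a forced-idle core: the interval gets closed and charged, and the core re-examines whether it is still forced idle.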
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 3c9b0fda64ac..d575b4914925 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6,26 +6,90 @@
  *
  * Copyright (C) 1991-2002 Linus Torvalds
  */
-#define CREATE_TRACE_POINTS
-#include <trace/events/sched.h>
-#undef CREATE_TRACE_POINTS
+#include <linux/highmem.h>
+#include <linux/hrtimer_api.h>
+#include <linux/ktime_api.h>
+#include <linux/sched/signal.h>
+#include <linux/syscalls_api.h>
+#include <linux/debug_locks.h>
+#include <linux/prefetch.h>
+#include <linux/capability.h>
+#include <linux/pgtable_api.h>
+#include <linux/wait_bit.h>
+#include <linux/jiffies.h>
+#include <linux/spinlock_api.h>
+#include <linux/cpumask_api.h>
+#include <linux/lockdep_api.h>
+#include <linux/hardirq.h>
+#include <linux/softirq.h>
+#include <linux/refcount_api.h>
+#include <linux/topology.h>
+#include <linux/sched/clock.h>
+#include <linux/sched/cond_resched.h>
+#include <linux/sched/debug.h>
+#include <linux/sched/isolation.h>
+#include <linux/sched/loadavg.h>
+#include <linux/sched/mm.h>
+#include <linux/sched/nohz.h>
+#include <linux/sched/rseq_api.h>
+#include <linux/sched/rt.h>
-#include "sched.h"
-
-#include <linux/nospec.h>
 #include <linux/blkdev.h>
+#include <linux/context_tracking.h>
+#include <linux/cpuset.h>
+#include <linux/delayacct.h>
+#include <linux/init_task.h>
+#include <linux/interrupt.h>
+#include <linux/ioprio.h>
+#include <linux/kallsyms.h>
 #include <linux/kcov.h>
+#include <linux/kprobes.h>
+#include <linux/llist_api.h>
+#include <linux/mmu_context.h>
+#include <linux/mmzone.h>
+#include <linux/mutex_api.h>
+#include <linux/nmi.h>
+#include <linux/nospec.h>
+#include <linux/perf_event_api.h>
+#include <linux/profile.h>
+#include <linux/psi.h>
+#include <linux/rcuwait_api.h>
+#include <linux/sched/wake_q.h>
 #include <linux/scs.h>
+#include <linux/slab.h>
+#include <linux/syscalls.h>
+#include <linux/vtime.h>
+#include <linux/wait_api.h>
+#include <linux/workqueue_api.h>
+
+#ifdef CONFIG_PREEMPT_DYNAMIC
+# ifdef CONFIG_GENERIC_ENTRY
+# include <linux/entry-common.h>
+# endif
+#endif
+
+#include <uapi/linux/sched/types.h>
 #include <asm/switch_to.h>
 #include <asm/tlb.h>
-#include "../workqueue_internal.h"
-#include "../../fs/io-wq.h"
-#include "../smpboot.h"
+#define CREATE_TRACE_POINTS
+#include <linux/sched/rseq_api.h>
+#include <trace/events/sched.h>
+#undef CREATE_TRACE_POINTS
+#include "sched.h"
+#include "stats.h"
+#include "autogroup.h"
+
+#include "autogroup.h"
 #include "pelt.h"
 #include "smp.h"
+#include "stats.h"
+
+#include "../workqueue_internal.h"
+#include "../../fs/io-wq.h"
+#include "../smpboot.h"
 
 /*
  * Export tracepoints that act as a bare tracehook (ie: have no trace event
@@ -36,6 +100,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_rt_tp);
 EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_dl_tp);
 EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_irq_tp);
 EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_se_tp);
+EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_thermal_tp);
 EXPORT_TRACEPOINT_SYMBOL_GPL(sched_cpu_capacity_tp);
 EXPORT_TRACEPOINT_SYMBOL_GPL(sched_overutilized_tp);
 EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_cfs_tp);
@@ -144,7 +209,7 @@ static inline bool __sched_core_less(struct task_struct *a, struct task_struct *
 		return false;
 
 	/* flip prio, so high prio is leftmost */
-	if (prio_less(b, a, task_rq(a)->core->core_forceidle))
+	if (prio_less(b, a, !!task_rq(a)->core->core_forceidle_count))
 		return true;
 
 	return false;
@@ -181,15 +246,23 @@ void sched_core_enqueue(struct rq *rq, struct task_struct *p)
 	rb_add(&p->core_node, &rq->core_tree, rb_sched_core_less);
 }
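The __sched_core_less() and sched_core_enqueue() hunks above keep the per-core task tree ordered by cookie first and then by inverted priority, so tasks sharing a cookie are adjacent and the highest-priority one among them is leftmost. A minimal standalone illustration of that ordering, using plain qsort() instead of the kernel's rb_add() and hypothetical demo_* names:

/* Standalone userspace sketch of the core-tree ordering (not kernel code). */
#include <stdio.h>
#include <stdlib.h>

struct demo_task {
	unsigned long cookie;	/* stand-in for p->core_cookie */
	int prio;		/* lower value = higher priority, as in the kernel */
};

/* Group by cookie; within a cookie, the numerically lower prio
 * (i.e. the higher-priority task) sorts first and ends up leftmost. */
static int demo_less(const void *pa, const void *pb)
{
	const struct demo_task *a = pa, *b = pb;

	if (a->cookie != b->cookie)
		return a->cookie < b->cookie ? -1 : 1;
	return a->prio - b->prio;
}

int main(void)
{
	struct demo_task tasks[] = {
		{ .cookie = 2, .prio = 120 },
		{ .cookie = 1, .prio = 100 },
		{ .cookie = 2, .prio = 90 },
		{ .cookie = 1, .prio = 139 },
	};

	qsort(tasks, 4, sizeof(tasks[0]), demo_less);
	for (int i = 0; i < 4; i++)
		printf("cookie=%lu prio=%d\n", tasks[i].cookie, tasks[i].prio);
	return 0;
}

Keeping same-cookie tasks adjacent is what allows the core-wide pick to locate candidates for the selected cookie without scanning unrelated entries.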
-void sched_core_dequeue(struct rq *rq, struct task_struct *p) +void sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags) { rq->core->core_task_seq++; - if (!sched_core_enqueued(p)) - return; + if (sched_core_enqueued(p)) { + rb_erase(&p->core_node, &rq->core_tree); + RB_CLEAR_NODE(&p->core_node); + } - rb_erase(&p->core_node, &rq->core_tree); - RB_CLEAR_NODE(&p->core_node); + /* + * Migrating the last task off the cpu, with the cpu in forced idle + * state. Reschedule to create an accounting edge for forced idle, + * and re-examine whether the core is still in forced idle state. + */ + if (!(flags & DEQUEUE_SAVE) && rq->nr_running == 1 && + rq->core->core_forceidle_count && rq->curr == rq->idle) + resched_curr(rq); } /* @@ -280,6 +353,8 @@ static void __sched_core_flip(bool enabled) for_each_cpu(t, smt_mask) cpu_rq(t)->core_enabled = enabled; + cpu_rq(cpu)->core->core_forceidle_start = 0; + sched_core_unlock(cpu, &flags); cpumask_andnot(&sched_core_mask, &sched_core_mask, smt_mask); @@ -364,7 +439,8 @@ void sched_core_put(void) #else /* !CONFIG_SCHED_CORE */ static inline void sched_core_enqueue(struct rq *rq, struct task_struct *p) { } -static inline void sched_core_dequeue(struct rq *rq, struct task_struct *p) { } +static inline void +sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags) { } #endif /* CONFIG_SCHED_CORE */ @@ -1013,13 +1089,13 @@ int get_nohz_timer_target(void) struct sched_domain *sd; const struct cpumask *hk_mask; - if (housekeeping_cpu(cpu, HK_FLAG_TIMER)) { + if (housekeeping_cpu(cpu, HK_TYPE_TIMER)) { if (!idle_cpu(cpu)) return cpu; default_cpu = cpu; } - hk_mask = housekeeping_cpumask(HK_FLAG_TIMER); + hk_mask = housekeeping_cpumask(HK_TYPE_TIMER); rcu_read_lock(); for_each_domain(cpu, sd) { @@ -1035,7 +1111,7 @@ int get_nohz_timer_target(void) } if (default_cpu == -1) - default_cpu = housekeeping_any_cpu(HK_FLAG_TIMER); + default_cpu = housekeeping_any_cpu(HK_TYPE_TIMER); cpu = default_cpu; unlock: rcu_read_unlock(); @@ -1918,7 +1994,7 @@ static void __init init_uclamp_rq(struct rq *rq) }; } - rq->uclamp_flags = 0; + rq->uclamp_flags = UCLAMP_FLAG_IDLE; } static void __init init_uclamp(void) @@ -2005,7 +2081,7 @@ static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags) static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags) { if (sched_core_enabled(rq)) - sched_core_dequeue(rq, p); + sched_core_dequeue(rq, p, flags); if (!(flags & DEQUEUE_NOCLOCK)) update_rq_clock(rq); @@ -2173,6 +2249,9 @@ void migrate_enable(void) return; } + if (WARN_ON_ONCE(!p->migration_disabled)) + return; + /* * Ensure stop_task runs either before or after this, and that * __set_cpus_allowed_ptr(SCA_MIGRATE_ENABLE) doesn't schedule(). 
@@ -4265,7 +4344,9 @@ DEFINE_STATIC_KEY_FALSE(sched_numa_balancing); #ifdef CONFIG_NUMA_BALANCING -void set_numabalancing_state(bool enabled) +int sysctl_numa_balancing_mode; + +static void __set_numabalancing_state(bool enabled) { if (enabled) static_branch_enable(&sched_numa_balancing); @@ -4273,13 +4354,22 @@ void set_numabalancing_state(bool enabled) static_branch_disable(&sched_numa_balancing); } +void set_numabalancing_state(bool enabled) +{ + if (enabled) + sysctl_numa_balancing_mode = NUMA_BALANCING_NORMAL; + else + sysctl_numa_balancing_mode = NUMA_BALANCING_DISABLED; + __set_numabalancing_state(enabled); +} + #ifdef CONFIG_PROC_SYSCTL int sysctl_numa_balancing(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table t; int err; - int state = static_branch_likely(&sched_numa_balancing); + int state = sysctl_numa_balancing_mode; if (write && !capable(CAP_SYS_ADMIN)) return -EPERM; @@ -4289,8 +4379,10 @@ int sysctl_numa_balancing(struct ctl_table *table, int write, err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos); if (err < 0) return err; - if (write) - set_numabalancing_state(state); + if (write) { + sysctl_numa_balancing_mode = state; + __set_numabalancing_state(state); + } return err; } #endif @@ -4410,6 +4502,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) init_entity_runnable_average(&p->se); + #ifdef CONFIG_SCHED_INFO if (likely(sched_info_on())) memset(&p->sched_info, 0, sizeof(p->sched_info)); @@ -4425,18 +4518,23 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) return 0; } -void sched_post_fork(struct task_struct *p, struct kernel_clone_args *kargs) +void sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs) { unsigned long flags; -#ifdef CONFIG_CGROUP_SCHED - struct task_group *tg; -#endif + /* + * Because we're not yet on the pid-hash, p->pi_lock isn't strictly + * required yet, but lockdep gets upset if rules are violated. 
+ */ raw_spin_lock_irqsave(&p->pi_lock, flags); #ifdef CONFIG_CGROUP_SCHED - tg = container_of(kargs->cset->subsys[cpu_cgrp_id], - struct task_group, css); - p->sched_task_group = autogroup_task_group(p, tg); + if (1) { + struct task_group *tg; + tg = container_of(kargs->cset->subsys[cpu_cgrp_id], + struct task_group, css); + tg = autogroup_task_group(p, tg); + p->sched_task_group = tg; + } #endif rseq_migrate(p); /* @@ -4447,7 +4545,10 @@ void sched_post_fork(struct task_struct *p, struct kernel_clone_args *kargs) if (p->sched_class->task_fork) p->sched_class->task_fork(p); raw_spin_unlock_irqrestore(&p->pi_lock, flags); +} +void sched_post_fork(struct task_struct *p) +{ uclamp_post_fork(p); } @@ -4811,7 +4912,7 @@ static struct rq *finish_task_switch(struct task_struct *prev) { struct rq *rq = this_rq(); struct mm_struct *mm = rq->prev_mm; - long prev_state; + unsigned int prev_state; /* * The previous task will have left us with a preempt_count of 2 @@ -5244,6 +5345,7 @@ void scheduler_tick(void) if (sched_feat(LATENCY_WARN)) resched_latency = cpu_resched_latency(rq); calc_global_load_tick(rq); + sched_core_tick(rq); rq_unlock(rq, &rf); @@ -5355,7 +5457,7 @@ static void sched_tick_start(int cpu) int os; struct tick_work *twork; - if (housekeeping_cpu(cpu, HK_FLAG_TICK)) + if (housekeeping_cpu(cpu, HK_TYPE_TICK)) return; WARN_ON_ONCE(!tick_work_cpu); @@ -5376,7 +5478,7 @@ static void sched_tick_stop(int cpu) struct tick_work *twork; int os; - if (housekeeping_cpu(cpu, HK_FLAG_TICK)) + if (housekeeping_cpu(cpu, HK_TYPE_TICK)) return; WARN_ON_ONCE(!tick_work_cpu); @@ -5656,6 +5758,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) struct task_struct *next, *p, *max = NULL; const struct cpumask *smt_mask; bool fi_before = false; + bool core_clock_updated = (rq == rq->core); unsigned long cookie; int i, cpu, occ = 0; struct rq *rq_i; @@ -5708,10 +5811,18 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) /* reset state */ rq->core->core_cookie = 0UL; - if (rq->core->core_forceidle) { + if (rq->core->core_forceidle_count) { + if (!core_clock_updated) { + update_rq_clock(rq->core); + core_clock_updated = true; + } + sched_core_account_forceidle(rq); + /* reset after accounting force idle */ + rq->core->core_forceidle_start = 0; + rq->core->core_forceidle_count = 0; + rq->core->core_forceidle_occupation = 0; need_sync = true; fi_before = true; - rq->core->core_forceidle = false; } /* @@ -5753,7 +5864,12 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) for_each_cpu_wrap(i, smt_mask, cpu) { rq_i = cpu_rq(i); - if (i != cpu) + /* + * Current cpu always has its clock updated on entrance to + * pick_next_task(). If the current cpu is not the core, + * the core may also have been updated above. 
+ */ + if (i != cpu && (rq_i != rq->core || !core_clock_updated)) update_rq_clock(rq_i); p = rq_i->core_pick = pick_task(rq_i); @@ -5783,7 +5899,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) if (p == rq_i->idle) { if (rq_i->nr_running) { - rq->core->core_forceidle = true; + rq->core->core_forceidle_count++; if (!fi_before) rq->core->core_forceidle_seq++; } @@ -5792,6 +5908,11 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) } } + if (schedstat_enabled() && rq->core->core_forceidle_count) { + rq->core->core_forceidle_start = rq_clock(rq->core); + rq->core->core_forceidle_occupation = occ; + } + rq->core->core_pick_seq = rq->core->core_task_seq; next = rq->core_pick; rq->core_sched_seq = rq->core->core_pick_seq; @@ -5828,8 +5949,8 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) * 1 0 1 * 1 1 0 */ - if (!(fi_before && rq->core->core_forceidle)) - task_vruntime_update(rq_i, rq_i->core_pick, rq->core->core_forceidle); + if (!(fi_before && rq->core->core_forceidle_count)) + task_vruntime_update(rq_i, rq_i->core_pick, !!rq->core->core_forceidle_count); rq_i->core_pick->core_occupation = occ; @@ -6033,11 +6154,19 @@ static void sched_core_cpu_deactivate(unsigned int cpu) goto unlock; /* copy the shared state to the new leader */ - core_rq->core_task_seq = rq->core_task_seq; - core_rq->core_pick_seq = rq->core_pick_seq; - core_rq->core_cookie = rq->core_cookie; - core_rq->core_forceidle = rq->core_forceidle; - core_rq->core_forceidle_seq = rq->core_forceidle_seq; + core_rq->core_task_seq = rq->core_task_seq; + core_rq->core_pick_seq = rq->core_pick_seq; + core_rq->core_cookie = rq->core_cookie; + core_rq->core_forceidle_count = rq->core_forceidle_count; + core_rq->core_forceidle_seq = rq->core_forceidle_seq; + core_rq->core_forceidle_occupation = rq->core_forceidle_occupation; + + /* + * Accounting edge for forced idle is handled in pick_next_task(). + * Don't need another one here, since the hotplug thread shouldn't + * have a cookie. + */ + core_rq->core_forceidle_start = 0; /* install new leader */ for_each_cpu(t, smt_mask) { @@ -6247,7 +6376,7 @@ static void __sched notrace __schedule(unsigned int sched_mode) migrate_disable_switch(rq, prev); psi_sched_switch(prev, next, !task_on_rq_queued(prev)); - trace_sched_switch(sched_mode & SM_MASK_PREEMPT, prev, next); + trace_sched_switch(sched_mode & SM_MASK_PREEMPT, prev_state, prev, next); /* Also unlocks the rq: */ rq = context_switch(rq, prev, next, &rf); @@ -6302,8 +6431,7 @@ static inline void sched_submit_work(struct task_struct *tsk) * If we are going to sleep and we have plugged IO queued, * make sure to submit it to avoid deadlocks. 
*/ - if (blk_needs_flush_plug(tsk)) - blk_flush_plug(tsk->plug, true); + blk_flush_plug(tsk->plug, true); } static void sched_update_worker(struct task_struct *tsk) @@ -6440,17 +6568,31 @@ asmlinkage __visible void __sched notrace preempt_schedule(void) */ if (likely(!preemptible())) return; - preempt_schedule_common(); } NOKPROBE_SYMBOL(preempt_schedule); EXPORT_SYMBOL(preempt_schedule); #ifdef CONFIG_PREEMPT_DYNAMIC -DEFINE_STATIC_CALL(preempt_schedule, __preempt_schedule_func); +#if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL) +#ifndef preempt_schedule_dynamic_enabled +#define preempt_schedule_dynamic_enabled preempt_schedule +#define preempt_schedule_dynamic_disabled NULL +#endif +DEFINE_STATIC_CALL(preempt_schedule, preempt_schedule_dynamic_enabled); EXPORT_STATIC_CALL_TRAMP(preempt_schedule); +#elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY) +static DEFINE_STATIC_KEY_TRUE(sk_dynamic_preempt_schedule); +void __sched notrace dynamic_preempt_schedule(void) +{ + if (!static_branch_unlikely(&sk_dynamic_preempt_schedule)) + return; + preempt_schedule(); +} +NOKPROBE_SYMBOL(dynamic_preempt_schedule); +EXPORT_SYMBOL(dynamic_preempt_schedule); +#endif #endif - /** * preempt_schedule_notrace - preempt_schedule called by tracing @@ -6505,147 +6647,27 @@ asmlinkage __visible void __sched notrace preempt_schedule_notrace(void) EXPORT_SYMBOL_GPL(preempt_schedule_notrace); #ifdef CONFIG_PREEMPT_DYNAMIC -DEFINE_STATIC_CALL(preempt_schedule_notrace, __preempt_schedule_notrace_func); -EXPORT_STATIC_CALL_TRAMP(preempt_schedule_notrace); +#if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL) +#ifndef preempt_schedule_notrace_dynamic_enabled +#define preempt_schedule_notrace_dynamic_enabled preempt_schedule_notrace +#define preempt_schedule_notrace_dynamic_disabled NULL #endif - -#endif /* CONFIG_PREEMPTION */ - -#ifdef CONFIG_PREEMPT_DYNAMIC - -#include <linux/entry-common.h> - -/* - * SC:cond_resched - * SC:might_resched - * SC:preempt_schedule - * SC:preempt_schedule_notrace - * SC:irqentry_exit_cond_resched - * - * - * NONE: - * cond_resched <- __cond_resched - * might_resched <- RET0 - * preempt_schedule <- NOP - * preempt_schedule_notrace <- NOP - * irqentry_exit_cond_resched <- NOP - * - * VOLUNTARY: - * cond_resched <- __cond_resched - * might_resched <- __cond_resched - * preempt_schedule <- NOP - * preempt_schedule_notrace <- NOP - * irqentry_exit_cond_resched <- NOP - * - * FULL: - * cond_resched <- RET0 - * might_resched <- RET0 - * preempt_schedule <- preempt_schedule - * preempt_schedule_notrace <- preempt_schedule_notrace - * irqentry_exit_cond_resched <- irqentry_exit_cond_resched - */ - -enum { - preempt_dynamic_undefined = -1, - preempt_dynamic_none, - preempt_dynamic_voluntary, - preempt_dynamic_full, -}; - -int preempt_dynamic_mode = preempt_dynamic_undefined; - -int sched_dynamic_mode(const char *str) -{ - if (!strcmp(str, "none")) - return preempt_dynamic_none; - - if (!strcmp(str, "voluntary")) - return preempt_dynamic_voluntary; - - if (!strcmp(str, "full")) - return preempt_dynamic_full; - - return -EINVAL; -} - -void sched_dynamic_update(int mode) -{ - /* - * Avoid {NONE,VOLUNTARY} -> FULL transitions from ever ending up in - * the ZERO state, which is invalid. 
- */ - static_call_update(cond_resched, __cond_resched); - static_call_update(might_resched, __cond_resched); - static_call_update(preempt_schedule, __preempt_schedule_func); - static_call_update(preempt_schedule_notrace, __preempt_schedule_notrace_func); - static_call_update(irqentry_exit_cond_resched, irqentry_exit_cond_resched); - - switch (mode) { - case preempt_dynamic_none: - static_call_update(cond_resched, __cond_resched); - static_call_update(might_resched, (void *)&__static_call_return0); - static_call_update(preempt_schedule, NULL); - static_call_update(preempt_schedule_notrace, NULL); - static_call_update(irqentry_exit_cond_resched, NULL); - pr_info("Dynamic Preempt: none\n"); - break; - - case preempt_dynamic_voluntary: - static_call_update(cond_resched, __cond_resched); - static_call_update(might_resched, __cond_resched); - static_call_update(preempt_schedule, NULL); - static_call_update(preempt_schedule_notrace, NULL); - static_call_update(irqentry_exit_cond_resched, NULL); - pr_info("Dynamic Preempt: voluntary\n"); - break; - - case preempt_dynamic_full: - static_call_update(cond_resched, (void *)&__static_call_return0); - static_call_update(might_resched, (void *)&__static_call_return0); - static_call_update(preempt_schedule, __preempt_schedule_func); - static_call_update(preempt_schedule_notrace, __preempt_schedule_notrace_func); - static_call_update(irqentry_exit_cond_resched, irqentry_exit_cond_resched); - pr_info("Dynamic Preempt: full\n"); - break; - } - - preempt_dynamic_mode = mode; -} - -static int __init setup_preempt_mode(char *str) -{ - int mode = sched_dynamic_mode(str); - if (mode < 0) { - pr_warn("Dynamic Preempt: unsupported mode: %s\n", str); - return 1; - } - - sched_dynamic_update(mode); - return 0; -} -__setup("preempt=", setup_preempt_mode); - -static void __init preempt_dynamic_init(void) +DEFINE_STATIC_CALL(preempt_schedule_notrace, preempt_schedule_notrace_dynamic_enabled); +EXPORT_STATIC_CALL_TRAMP(preempt_schedule_notrace); +#elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY) +static DEFINE_STATIC_KEY_TRUE(sk_dynamic_preempt_schedule_notrace); +void __sched notrace dynamic_preempt_schedule_notrace(void) { - if (preempt_dynamic_mode == preempt_dynamic_undefined) { - if (IS_ENABLED(CONFIG_PREEMPT_NONE)) { - sched_dynamic_update(preempt_dynamic_none); - } else if (IS_ENABLED(CONFIG_PREEMPT_VOLUNTARY)) { - sched_dynamic_update(preempt_dynamic_voluntary); - } else { - /* Default static call setting, nothing to do */ - WARN_ON_ONCE(!IS_ENABLED(CONFIG_PREEMPT)); - preempt_dynamic_mode = preempt_dynamic_full; - pr_info("Dynamic Preempt: full\n"); - } - } + if (!static_branch_unlikely(&sk_dynamic_preempt_schedule_notrace)) + return; + preempt_schedule_notrace(); } +NOKPROBE_SYMBOL(dynamic_preempt_schedule_notrace); +EXPORT_SYMBOL(dynamic_preempt_schedule_notrace); +#endif +#endif -#else /* !CONFIG_PREEMPT_DYNAMIC */ - -static inline void preempt_dynamic_init(void) { } - -#endif /* #ifdef CONFIG_PREEMPT_DYNAMIC */ +#endif /* CONFIG_PREEMPTION */ /* * This is the entry point to schedule() from kernel preemption @@ -7126,7 +7148,7 @@ unsigned long effective_cpu_util(int cpu, unsigned long util_cfs, unsigned long sched_cpu_util(int cpu, unsigned long max) { - return effective_cpu_util(cpu, cpu_util_cfs(cpu_rq(cpu)), max, + return effective_cpu_util(cpu, cpu_util_cfs(cpu), max, ENERGY_UTIL, NULL); } #endif /* CONFIG_SMP */ @@ -8152,11 +8174,35 @@ EXPORT_SYMBOL(__cond_resched); #endif #ifdef CONFIG_PREEMPT_DYNAMIC +#if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL) 
+#define cond_resched_dynamic_enabled __cond_resched +#define cond_resched_dynamic_disabled ((void *)&__static_call_return0) DEFINE_STATIC_CALL_RET0(cond_resched, __cond_resched); EXPORT_STATIC_CALL_TRAMP(cond_resched); +#define might_resched_dynamic_enabled __cond_resched +#define might_resched_dynamic_disabled ((void *)&__static_call_return0) DEFINE_STATIC_CALL_RET0(might_resched, __cond_resched); EXPORT_STATIC_CALL_TRAMP(might_resched); +#elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY) +static DEFINE_STATIC_KEY_FALSE(sk_dynamic_cond_resched); +int __sched dynamic_cond_resched(void) +{ + if (!static_branch_unlikely(&sk_dynamic_cond_resched)) + return 0; + return __cond_resched(); +} +EXPORT_SYMBOL(dynamic_cond_resched); + +static DEFINE_STATIC_KEY_FALSE(sk_dynamic_might_resched); +int __sched dynamic_might_resched(void) +{ + if (!static_branch_unlikely(&sk_dynamic_might_resched)) + return 0; + return __cond_resched(); +} +EXPORT_SYMBOL(dynamic_might_resched); +#endif #endif /* @@ -8176,9 +8222,7 @@ int __cond_resched_lock(spinlock_t *lock) if (spin_needbreak(lock) || resched) { spin_unlock(lock); - if (resched) - preempt_schedule_common(); - else + if (!_cond_resched()) cpu_relax(); ret = 1; spin_lock(lock); @@ -8196,9 +8240,7 @@ int __cond_resched_rwlock_read(rwlock_t *lock) if (rwlock_needbreak(lock) || resched) { read_unlock(lock); - if (resched) - preempt_schedule_common(); - else + if (!_cond_resched()) cpu_relax(); ret = 1; read_lock(lock); @@ -8216,9 +8258,7 @@ int __cond_resched_rwlock_write(rwlock_t *lock) if (rwlock_needbreak(lock) || resched) { write_unlock(lock); - if (resched) - preempt_schedule_common(); - else + if (!_cond_resched()) cpu_relax(); ret = 1; write_lock(lock); @@ -8227,6 +8267,154 @@ int __cond_resched_rwlock_write(rwlock_t *lock) } EXPORT_SYMBOL(__cond_resched_rwlock_write); +#ifdef CONFIG_PREEMPT_DYNAMIC + +#ifdef CONFIG_GENERIC_ENTRY +#include <linux/entry-common.h> +#endif + +/* + * SC:cond_resched + * SC:might_resched + * SC:preempt_schedule + * SC:preempt_schedule_notrace + * SC:irqentry_exit_cond_resched + * + * + * NONE: + * cond_resched <- __cond_resched + * might_resched <- RET0 + * preempt_schedule <- NOP + * preempt_schedule_notrace <- NOP + * irqentry_exit_cond_resched <- NOP + * + * VOLUNTARY: + * cond_resched <- __cond_resched + * might_resched <- __cond_resched + * preempt_schedule <- NOP + * preempt_schedule_notrace <- NOP + * irqentry_exit_cond_resched <- NOP + * + * FULL: + * cond_resched <- RET0 + * might_resched <- RET0 + * preempt_schedule <- preempt_schedule + * preempt_schedule_notrace <- preempt_schedule_notrace + * irqentry_exit_cond_resched <- irqentry_exit_cond_resched + */ + +enum { + preempt_dynamic_undefined = -1, + preempt_dynamic_none, + preempt_dynamic_voluntary, + preempt_dynamic_full, +}; + +int preempt_dynamic_mode = preempt_dynamic_undefined; + +int sched_dynamic_mode(const char *str) +{ + if (!strcmp(str, "none")) + return preempt_dynamic_none; + + if (!strcmp(str, "voluntary")) + return preempt_dynamic_voluntary; + + if (!strcmp(str, "full")) + return preempt_dynamic_full; + + return -EINVAL; +} + +#if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL) +#define preempt_dynamic_enable(f) static_call_update(f, f##_dynamic_enabled) +#define preempt_dynamic_disable(f) static_call_update(f, f##_dynamic_disabled) +#elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY) +#define preempt_dynamic_enable(f) static_key_enable(&sk_dynamic_##f.key) +#define preempt_dynamic_disable(f) static_key_disable(&sk_dynamic_##f.key) +#else +#error "Unsupported 
PREEMPT_DYNAMIC mechanism" +#endif + +void sched_dynamic_update(int mode) +{ + /* + * Avoid {NONE,VOLUNTARY} -> FULL transitions from ever ending up in + * the ZERO state, which is invalid. + */ + preempt_dynamic_enable(cond_resched); + preempt_dynamic_enable(might_resched); + preempt_dynamic_enable(preempt_schedule); + preempt_dynamic_enable(preempt_schedule_notrace); + preempt_dynamic_enable(irqentry_exit_cond_resched); + + switch (mode) { + case preempt_dynamic_none: + preempt_dynamic_enable(cond_resched); + preempt_dynamic_disable(might_resched); + preempt_dynamic_disable(preempt_schedule); + preempt_dynamic_disable(preempt_schedule_notrace); + preempt_dynamic_disable(irqentry_exit_cond_resched); + pr_info("Dynamic Preempt: none\n"); + break; + + case preempt_dynamic_voluntary: + preempt_dynamic_enable(cond_resched); + preempt_dynamic_enable(might_resched); + preempt_dynamic_disable(preempt_schedule); + preempt_dynamic_disable(preempt_schedule_notrace); + preempt_dynamic_disable(irqentry_exit_cond_resched); + pr_info("Dynamic Preempt: voluntary\n"); + break; + + case preempt_dynamic_full: + preempt_dynamic_disable(cond_resched); + preempt_dynamic_disable(might_resched); + preempt_dynamic_enable(preempt_schedule); + preempt_dynamic_enable(preempt_schedule_notrace); + preempt_dynamic_enable(irqentry_exit_cond_resched); + pr_info("Dynamic Preempt: full\n"); + break; + } + + preempt_dynamic_mode = mode; +} + +static int __init setup_preempt_mode(char *str) +{ + int mode = sched_dynamic_mode(str); + if (mode < 0) { + pr_warn("Dynamic Preempt: unsupported mode: %s\n", str); + return 0; + } + + sched_dynamic_update(mode); + return 1; +} +__setup("preempt=", setup_preempt_mode); + +static void __init preempt_dynamic_init(void) +{ + if (preempt_dynamic_mode == preempt_dynamic_undefined) { + if (IS_ENABLED(CONFIG_PREEMPT_NONE)) { + sched_dynamic_update(preempt_dynamic_none); + } else if (IS_ENABLED(CONFIG_PREEMPT_VOLUNTARY)) { + sched_dynamic_update(preempt_dynamic_voluntary); + } else { + /* Default static call setting, nothing to do */ + WARN_ON_ONCE(!IS_ENABLED(CONFIG_PREEMPT)); + preempt_dynamic_mode = preempt_dynamic_full; + pr_info("Dynamic Preempt: full\n"); + } + } +} + +#else /* !CONFIG_PREEMPT_DYNAMIC */ + +static inline void preempt_dynamic_init(void) { } + +#endif /* #ifdef CONFIG_PREEMPT_DYNAMIC */ + /** * yield - yield the current processor to other threads. * @@ -8335,9 +8523,7 @@ int io_schedule_prepare(void) int old_iowait = current->in_iowait; current->in_iowait = 1; - if (current->plug) - blk_flush_plug(current->plug, true); - + blk_flush_plug(current->plug, true); return old_iowait; } @@ -8520,7 +8706,7 @@ void sched_show_task(struct task_struct *p) rcu_read_unlock(); pr_cont(" stack:%5lu pid:%5d ppid:%6d flags:0x%08lx\n", free, task_pid_nr(p), ppid, - (unsigned long)task_thread_info(p)->flags); + read_task_thread_flags(p)); print_worker_info(KERN_INFO, p); print_stop_info(KERN_INFO, p); @@ -8599,14 +8785,6 @@ void __init init_idle(struct task_struct *idle, int cpu) __sched_fork(0, idle); - /* - * The idle task doesn't need the kthread struct to function, but it - * is dressed up as a per-CPU kthread and thus needs to play the part - * if we want to avoid special-casing it in code that deals with per-CPU - * kthreads. 
- */ - set_kthread_struct(idle); - raw_spin_lock_irqsave(&idle->pi_lock, flags); raw_spin_rq_lock(rq); @@ -8619,9 +8797,6 @@ void __init init_idle(struct task_struct *idle, int cpu) idle->flags |= PF_IDLE | PF_KTHREAD | PF_NO_SETAFFINITY; kthread_set_per_cpu(idle, cpu); - scs_task_reset(idle); - kasan_unpoison_task_stack(idle); - #ifdef CONFIG_SMP /* * It's possible that init_idle() gets called multiple times on a task, @@ -8675,7 +8850,7 @@ int cpuset_cpumask_can_shrink(const struct cpumask *cur, { int ret = 1; - if (!cpumask_weight(cur)) + if (cpumask_empty(cur)) return ret; ret = dl_cpuset_cpumask_can_shrink(cur, trial); @@ -8703,8 +8878,11 @@ int task_can_attach(struct task_struct *p, } if (dl_task(p) && !cpumask_intersects(task_rq(p)->rd->span, - cs_cpus_allowed)) - ret = dl_task_can_attach(p, cs_cpus_allowed); + cs_cpus_allowed)) { + int cpu = cpumask_any_and(cpu_active_mask, cs_cpus_allowed); + + ret = dl_cpu_busy(cpu, p); + } out: return ret; @@ -8777,7 +8955,6 @@ void idle_task_exit(void) finish_arch_post_lock_switch(); } - scs_task_reset(current); /* finish_cpu(), as ran on the BP, will clean up the active_mm state */ } @@ -8989,8 +9166,10 @@ static void cpuset_cpu_active(void) static int cpuset_cpu_inactive(unsigned int cpu) { if (!cpuhp_tasks_frozen) { - if (dl_cpu_busy(cpu)) - return -EBUSY; + int ret = dl_cpu_busy(cpu, NULL); + + if (ret) + return ret; cpuset_update_active_cpus(); } else { num_cpus_frozen++; @@ -9020,6 +9199,7 @@ int sched_cpu_activate(unsigned int cpu) set_cpu_active(cpu, true); if (sched_smp_initialized) { + sched_update_numa(cpu, true); sched_domains_numa_masks_set(cpu); cpuset_cpu_active(); } @@ -9098,10 +9278,12 @@ int sched_cpu_deactivate(unsigned int cpu) if (!sched_smp_initialized) return 0; + sched_update_numa(cpu, false); ret = cpuset_cpu_inactive(cpu); if (ret) { balance_push_set(cpu, false); set_cpu_active(cpu, true); + sched_update_numa(cpu, true); return ret; } sched_domains_numa_masks_clear(cpu); @@ -9204,7 +9386,7 @@ int sched_cpu_dying(unsigned int cpu) void __init sched_init_smp(void) { - sched_init_numa(); + sched_init_numa(NUMA_NO_NODE); /* * There's no userspace yet to cause hotplug operations; hence all the @@ -9216,7 +9398,7 @@ void __init sched_init_smp(void) mutex_unlock(&sched_domains_mutex); /* Move init over to a non-isolated CPU */ - if (set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_FLAG_DOMAIN)) < 0) + if (set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_TYPE_DOMAIN)) < 0) BUG(); current->flags &= ~PF_NO_SETAFFINITY; sched_init_granularity(); @@ -9316,7 +9498,6 @@ void __init sched_init(void) #endif /* CONFIG_CPUMASK_OFFSTACK */ init_rt_bandwidth(&def_rt_bandwidth, global_rt_period(), global_rt_runtime()); - init_dl_bandwidth(&def_dl_bandwidth, global_rt_period(), global_rt_runtime()); #ifdef CONFIG_SMP init_defrootdomain(); @@ -9413,7 +9594,9 @@ void __init sched_init(void) rq->core_pick = NULL; rq->core_enabled = 0; rq->core_tree = RB_ROOT; - rq->core_forceidle = false; + rq->core_forceidle_count = 0; + rq->core_forceidle_occupation = 0; + rq->core_forceidle_start = 0; rq->core_cookie = 0UL; #endif @@ -9428,6 +9611,14 @@ void __init sched_init(void) enter_lazy_tlb(&init_mm, current); /* + * The idle task doesn't need the kthread struct to function, but it + * is dressed up as a per-CPU kthread and thus needs to play the part + * if we want to avoid special-casing it in code that deals with per-CPU + * kthreads. + */ + WARN_ON(!set_kthread_struct(current)); + + /* * Make us the idle thread. 
Technically, schedule() should not be
 	 * called from this thread, however somewhere below it might be,
 	 * but because we are the idle thread, we just pick up running again
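A large part of the middle of this diff reworks PREEMPT_DYNAMIC so that each preemption hook is toggled through paired preempt_dynamic_enable()/preempt_dynamic_disable() macros, which either retarget a static call (CONFIG_HAVE_PREEMPT_DYNAMIC_CALL) or flip a static key consulted by a dynamic_*() wrapper such as dynamic_cond_resched() (CONFIG_HAVE_PREEMPT_DYNAMIC_KEY). As a rough userspace analogue of the two dispatch styles (hypothetical names; a function pointer stands in for a static call and a plain bool for a static key):

/* Standalone sketch of the two PREEMPT_DYNAMIC dispatch styles (not kernel code). */
#include <stdio.h>
#include <stdbool.h>

static int real_cond_resched(void) { puts("resched point"); return 1; }
static int ret0(void) { return 0; }

/* "static call" flavour: the call target itself is repointed. */
static int (*cond_resched_call)(void) = real_cond_resched;

/* "static key" flavour: a fixed wrapper checks a branch and either
 * returns early or falls through to the real implementation. */
static bool cond_resched_key = true;
static int dynamic_cond_resched(void)
{
	if (!cond_resched_key)
		return 0;
	return real_cond_resched();
}

/* Rough analogue of sched_dynamic_update(): "full" preemption disables
 * the voluntary resched points, "none"/"voluntary" keep them enabled. */
static void demo_dynamic_update(bool voluntary)
{
	cond_resched_call = voluntary ? real_cond_resched : ret0;
	cond_resched_key = voluntary;
}

int main(void)
{
	demo_dynamic_update(true);
	printf("voluntary: call=%d key=%d\n", cond_resched_call(), dynamic_cond_resched());
	demo_dynamic_update(false);
	printf("full:      call=%d key=%d\n", cond_resched_call(), dynamic_cond_resched());
	return 0;
}

The static-key flavour keeps the call target fixed and pays only for a predictable branch, which is why the dynamic_*() wrappers in the hunks above simply return early when the corresponding feature is disabled.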