diff options
Diffstat (limited to 'kernel/sched/core.c')
-rw-r--r-- | kernel/sched/core.c | 540 |
1 files changed, 389 insertions, 151 deletions
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index f3951e4a55e5..43e453ab7e20 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -163,13 +163,19 @@ static inline int __task_prio(const struct task_struct *p) if (p->sched_class == &stop_sched_class) /* trumps deadline */ return -2; - if (rt_prio(p->prio)) /* includes deadline */ + if (p->dl_server) + return -1; /* deadline */ + + if (rt_or_dl_prio(p->prio)) return p->prio; /* [-1, 99] */ if (p->sched_class == &idle_sched_class) return MAX_RT_PRIO + NICE_WIDTH; /* 140 */ - return MAX_RT_PRIO + MAX_NICE; /* 120, squash fair */ + if (task_on_scx(p)) + return MAX_RT_PRIO + MAX_NICE + 1; /* 120, squash ext */ + + return MAX_RT_PRIO + MAX_NICE; /* 119, squash fair */ } /* @@ -192,12 +198,33 @@ static inline bool prio_less(const struct task_struct *a, if (-pb < -pa) return false; - if (pa == -1) /* dl_prio() doesn't work because of stop_class above */ - return !dl_time_before(a->dl.deadline, b->dl.deadline); + if (pa == -1) { /* dl_prio() doesn't work because of stop_class above */ + const struct sched_dl_entity *a_dl, *b_dl; + + a_dl = &a->dl; + /* + * Since,'a' and 'b' can be CFS tasks served by DL server, + * __task_prio() can return -1 (for DL) even for those. In that + * case, get to the dl_server's DL entity. + */ + if (a->dl_server) + a_dl = a->dl_server; + + b_dl = &b->dl; + if (b->dl_server) + b_dl = b->dl_server; + + return !dl_time_before(a_dl->deadline, b_dl->deadline); + } if (pa == MAX_RT_PRIO + MAX_NICE) /* fair */ return cfs_prio_less(a, b, in_fi); +#ifdef CONFIG_SCHED_CLASS_EXT + if (pa == MAX_RT_PRIO + MAX_NICE + 1) /* ext */ + return scx_prio_less(a, b, in_fi); +#endif + return false; } @@ -240,6 +267,9 @@ static inline int rb_sched_core_cmp(const void *key, const struct rb_node *node) void sched_core_enqueue(struct rq *rq, struct task_struct *p) { + if (p->se.sched_delayed) + return; + rq->core->core_task_seq++; if (!p->core_cookie) @@ -250,6 +280,9 @@ void sched_core_enqueue(struct rq *rq, struct task_struct *p) void sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags) { + if (p->se.sched_delayed) + return; + rq->core->core_task_seq++; if (sched_core_enqueued(p)) { @@ -1255,11 +1288,14 @@ bool sched_can_stop_tick(struct rq *rq) return true; /* - * If there are no DL,RR/FIFO tasks, there must only be CFS tasks left; - * if there's more than one we need the tick for involuntary - * preemption. + * If there are no DL,RR/FIFO tasks, there must only be CFS or SCX tasks + * left. For CFS, if there's more than one we need the tick for + * involuntary preemption. For SCX, ask. */ - if (rq->nr_running > 1) + if (scx_enabled() && !scx_can_stop_tick(rq)) + return false; + + if (rq->cfs.nr_running > 1) return false; /* @@ -1269,7 +1305,7 @@ bool sched_can_stop_tick(struct rq *rq) * dequeued by migrating while the constrained task continues to run. * E.g. going from 2->1 without going through pick_next_task(). */ - if (sched_feat(HZ_BW) && __need_bw_check(rq, rq->curr)) { + if (__need_bw_check(rq, rq->curr)) { if (cfs_task_bw_constrained(rq->curr)) return false; } @@ -1341,8 +1377,8 @@ void set_load_weight(struct task_struct *p, bool update_load) * SCHED_OTHER tasks have to update their load when changing their * weight */ - if (update_load && p->sched_class == &fair_sched_class) - reweight_task(p, &lw); + if (update_load && p->sched_class->reweight_task) + p->sched_class->reweight_task(task_rq(p), p, &lw); else p->se.load = lw; } @@ -1672,6 +1708,9 @@ static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p) if (unlikely(!p->sched_class->uclamp_enabled)) return; + if (p->se.sched_delayed) + return; + for_each_clamp_id(clamp_id) uclamp_rq_inc_id(rq, p, clamp_id); @@ -1696,6 +1735,9 @@ static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p) if (unlikely(!p->sched_class->uclamp_enabled)) return; + if (p->se.sched_delayed) + return; + for_each_clamp_id(clamp_id) uclamp_rq_dec_id(rq, p, clamp_id); } @@ -1975,14 +2017,21 @@ void enqueue_task(struct rq *rq, struct task_struct *p, int flags) psi_enqueue(p, (flags & ENQUEUE_WAKEUP) && !(flags & ENQUEUE_MIGRATED)); } - uclamp_rq_inc(rq, p); p->sched_class->enqueue_task(rq, p, flags); + /* + * Must be after ->enqueue_task() because ENQUEUE_DELAYED can clear + * ->sched_delayed. + */ + uclamp_rq_inc(rq, p); if (sched_core_enabled(rq)) sched_core_enqueue(rq, p); } -void dequeue_task(struct rq *rq, struct task_struct *p, int flags) +/* + * Must only return false when DEQUEUE_SLEEP. + */ +inline bool dequeue_task(struct rq *rq, struct task_struct *p, int flags) { if (sched_core_enabled(rq)) sched_core_dequeue(rq, p, flags); @@ -1995,8 +2044,12 @@ void dequeue_task(struct rq *rq, struct task_struct *p, int flags) psi_dequeue(p, flags & DEQUEUE_SLEEP); } + /* + * Must be before ->dequeue_task() because ->dequeue_task() can 'fail' + * and mark the task ->sched_delayed. + */ uclamp_rq_dec(rq, p); - p->sched_class->dequeue_task(rq, p, flags); + return p->sched_class->dequeue_task(rq, p, flags); } void activate_task(struct rq *rq, struct task_struct *p, int flags) @@ -2014,12 +2067,25 @@ void activate_task(struct rq *rq, struct task_struct *p, int flags) void deactivate_task(struct rq *rq, struct task_struct *p, int flags) { - WRITE_ONCE(p->on_rq, (flags & DEQUEUE_SLEEP) ? 0 : TASK_ON_RQ_MIGRATING); + SCHED_WARN_ON(flags & DEQUEUE_SLEEP); + + WRITE_ONCE(p->on_rq, TASK_ON_RQ_MIGRATING); ASSERT_EXCLUSIVE_WRITER(p->on_rq); + /* + * Code explicitly relies on TASK_ON_RQ_MIGRATING begin set *before* + * dequeue_task() and cleared *after* enqueue_task(). + */ + dequeue_task(rq, p, flags); } +static void block_task(struct rq *rq, struct task_struct *p, int flags) +{ + if (dequeue_task(rq, p, DEQUEUE_SLEEP | flags)) + __block_task(rq, p); +} + /** * task_curr - is this task currently executing on a CPU? * @p: the task in question. @@ -2032,6 +2098,17 @@ inline int task_curr(const struct task_struct *p) } /* + * ->switching_to() is called with the pi_lock and rq_lock held and must not + * mess with locking. + */ +void check_class_changing(struct rq *rq, struct task_struct *p, + const struct sched_class *prev_class) +{ + if (prev_class != p->sched_class && p->sched_class->switching_to) + p->sched_class->switching_to(rq, p); +} + +/* * switched_from, switched_to and prio_changed must _NOT_ drop rq->lock, * use the balance_callback list if you want balancing. * @@ -2233,6 +2310,12 @@ void migrate_disable(void) struct task_struct *p = current; if (p->migration_disabled) { +#ifdef CONFIG_DEBUG_PREEMPT + /* + *Warn about overflow half-way through the range. + */ + WARN_ON_ONCE((s16)p->migration_disabled < 0); +#endif p->migration_disabled++; return; } @@ -2251,14 +2334,20 @@ void migrate_enable(void) .flags = SCA_MIGRATE_ENABLE, }; +#ifdef CONFIG_DEBUG_PREEMPT + /* + * Check both overflow from migrate_disable() and superfluous + * migrate_enable(). + */ + if (WARN_ON_ONCE((s16)p->migration_disabled <= 0)) + return; +#endif + if (p->migration_disabled > 1) { p->migration_disabled--; return; } - if (WARN_ON_ONCE(!p->migration_disabled)) - return; - /* * Ensure stop_task runs either before or after this, and that * __set_cpus_allowed_ptr(SCA_MIGRATE_ENABLE) doesn't schedule(). @@ -2289,7 +2378,7 @@ static inline bool rq_has_pinned_tasks(struct rq *rq) static inline bool is_cpu_allowed(struct task_struct *p, int cpu) { /* When not in the task's cpumask, no point in looking further. */ - if (!cpumask_test_cpu(cpu, p->cpus_ptr)) + if (!task_allowed_on_cpu(p, cpu)) return false; /* migrate_disabled() must be allowed to finish. */ @@ -2298,7 +2387,7 @@ static inline bool is_cpu_allowed(struct task_struct *p, int cpu) /* Non kernel threads are not allowed during either online or offline. */ if (!(p->flags & PF_KTHREAD)) - return cpu_active(cpu) && task_cpu_possible(cpu, p); + return cpu_active(cpu); /* KTHREAD_IS_PER_CPU is always allowed. */ if (kthread_is_per_cpu(p)) @@ -3607,8 +3696,6 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags, rq->idle_stamp = 0; } #endif - - p->dl_server = NULL; } /* @@ -3644,12 +3731,14 @@ static int ttwu_runnable(struct task_struct *p, int wake_flags) rq = __task_rq_lock(p, &rf); if (task_on_rq_queued(p)) { + update_rq_clock(rq); + if (p->se.sched_delayed) + enqueue_task(rq, p, ENQUEUE_NOCLOCK | ENQUEUE_DELAYED); if (!task_on_cpu(rq, p)) { /* * When on_rq && !on_cpu the task is preempted, see if * it should preempt the task that is current now. */ - update_rq_clock(rq); wakeup_preempt(rq, p, wake_flags); } ttwu_do_wakeup(p); @@ -3776,6 +3865,15 @@ bool cpus_share_resources(int this_cpu, int that_cpu) static inline bool ttwu_queue_cond(struct task_struct *p, int cpu) { /* + * The BPF scheduler may depend on select_task_rq() being invoked during + * wakeups. In addition, @p may end up executing on a different CPU + * regardless of what happens in the wakeup path making the ttwu_queue + * optimization less meaningful. Skip if on SCX. + */ + if (task_on_scx(p)) + return false; + + /* * Do not complicate things with the async wake_list while the CPU is * in hotplug state. */ @@ -4029,11 +4127,16 @@ int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) * case the whole 'p->on_rq && ttwu_runnable()' case below * without taking any locks. * + * Specifically, given current runs ttwu() we must be before + * schedule()'s block_task(), as such this must not observe + * sched_delayed. + * * In particular: * - we rely on Program-Order guarantees for all the ordering, * - we're serialized against set_special_state() by virtue of * it disabling IRQs (this allows not taking ->pi_lock). */ + SCHED_WARN_ON(p->se.sched_delayed); if (!ttwu_state_match(p, state, &success)) goto out; @@ -4322,9 +4425,11 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) p->se.nr_migrations = 0; p->se.vruntime = 0; p->se.vlag = 0; - p->se.slice = sysctl_sched_base_slice; INIT_LIST_HEAD(&p->se.group_node); + /* A delayed task cannot be in clone(). */ + SCHED_WARN_ON(p->se.sched_delayed); + #ifdef CONFIG_FAIR_GROUP_SCHED p->se.cfs_rq = NULL; #endif @@ -4342,6 +4447,10 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) p->rt.on_rq = 0; p->rt.on_list = 0; +#ifdef CONFIG_SCHED_CLASS_EXT + init_scx_entity(&p->scx); +#endif + #ifdef CONFIG_PREEMPT_NOTIFIERS INIT_HLIST_HEAD(&p->preempt_notifiers); #endif @@ -4572,6 +4681,8 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) p->prio = p->normal_prio = p->static_prio; set_load_weight(p, false); + p->se.custom_slice = 0; + p->se.slice = sysctl_sched_base_slice; /* * We don't need the reset flag anymore after the fork. It has @@ -4582,10 +4693,18 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) if (dl_prio(p->prio)) return -EAGAIN; - else if (rt_prio(p->prio)) + + scx_pre_fork(p); + + if (rt_prio(p->prio)) { p->sched_class = &rt_sched_class; - else +#ifdef CONFIG_SCHED_CLASS_EXT + } else if (task_should_scx(p)) { + p->sched_class = &ext_sched_class; +#endif + } else { p->sched_class = &fair_sched_class; + } init_entity_runnable_average(&p->se); @@ -4605,7 +4724,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) return 0; } -void sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs) +int sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs) { unsigned long flags; @@ -4632,11 +4751,19 @@ void sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs) if (p->sched_class->task_fork) p->sched_class->task_fork(p); raw_spin_unlock_irqrestore(&p->pi_lock, flags); + + return scx_fork(p); +} + +void sched_cancel_fork(struct task_struct *p) +{ + scx_cancel_fork(p); } void sched_post_fork(struct task_struct *p) { uclamp_post_fork(p); + scx_post_fork(p); } unsigned long to_ratio(u64 period, u64 runtime) @@ -4686,7 +4813,7 @@ void wake_up_new_task(struct task_struct *p) update_rq_clock(rq); post_init_entity_util_avg(p); - activate_task(rq, p, ENQUEUE_NOCLOCK); + activate_task(rq, p, ENQUEUE_NOCLOCK | ENQUEUE_INITIAL); trace_sched_wakeup_new(p); wakeup_preempt(rq, p, WF_FORK); #ifdef CONFIG_SMP @@ -5469,6 +5596,7 @@ void sched_tick(void) calc_global_load_tick(rq); sched_core_tick(rq); task_tick_mm_cid(rq, curr); + scx_tick(rq); rq_unlock(rq, &rf); @@ -5481,8 +5609,10 @@ void sched_tick(void) wq_worker_tick(curr); #ifdef CONFIG_SMP - rq->idle_balance = idle_cpu(cpu); - sched_balance_trigger(rq); + if (!scx_switched_all()) { + rq->idle_balance = idle_cpu(cpu); + sched_balance_trigger(rq); + } #endif } @@ -5762,18 +5892,29 @@ static inline void schedule_debug(struct task_struct *prev, bool preempt) preempt_count_set(PREEMPT_DISABLED); } rcu_sleep_check(); - SCHED_WARN_ON(ct_state() == CONTEXT_USER); + SCHED_WARN_ON(ct_state() == CT_STATE_USER); profile_hit(SCHED_PROFILING, __builtin_return_address(0)); schedstat_inc(this_rq()->sched_count); } -static void put_prev_task_balance(struct rq *rq, struct task_struct *prev, - struct rq_flags *rf) +static void prev_balance(struct rq *rq, struct task_struct *prev, + struct rq_flags *rf) { -#ifdef CONFIG_SMP + const struct sched_class *start_class = prev->sched_class; const struct sched_class *class; + +#ifdef CONFIG_SCHED_CLASS_EXT + /* + * SCX requires a balance() call before every pick_next_task() including + * when waking up from SCHED_IDLE. If @start_class is below SCX, start + * from SCX instead. + */ + if (scx_enabled() && sched_class_above(&ext_sched_class, start_class)) + start_class = &ext_sched_class; +#endif + /* * We must do the balancing pass before put_prev_task(), such * that when we release the rq->lock the task is in the same @@ -5782,13 +5923,10 @@ static void put_prev_task_balance(struct rq *rq, struct task_struct *prev, * We can terminate the balance pass as soon as we know there is * a runnable task of @class priority or higher. */ - for_class_range(class, prev->sched_class, &idle_sched_class) { - if (class->balance(rq, prev, rf)) + for_active_class_range(class, start_class, &idle_sched_class) { + if (class->balance && class->balance(rq, prev, rf)) break; } -#endif - - put_prev_task(rq, prev); } /* @@ -5800,6 +5938,11 @@ __pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) const struct sched_class *class; struct task_struct *p; + rq->dl_server = NULL; + + if (scx_enabled()) + goto restart; + /* * Optimization: we know that if all tasks are in the fair class we can * call that function directly, but only if the @prev task wasn't of a @@ -5815,35 +5958,28 @@ __pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) /* Assume the next prioritized class is idle_sched_class */ if (!p) { - put_prev_task(rq, prev); - p = pick_next_task_idle(rq); + p = pick_task_idle(rq); + put_prev_set_next_task(rq, prev, p); } - /* - * This is the fast path; it cannot be a DL server pick; - * therefore even if @p == @prev, ->dl_server must be NULL. - */ - if (p->dl_server) - p->dl_server = NULL; - return p; } restart: - put_prev_task_balance(rq, prev, rf); - - /* - * We've updated @prev and no longer need the server link, clear it. - * Must be done before ->pick_next_task() because that can (re)set - * ->dl_server. - */ - if (prev->dl_server) - prev->dl_server = NULL; + prev_balance(rq, prev, rf); - for_each_class(class) { - p = class->pick_next_task(rq); - if (p) - return p; + for_each_active_class(class) { + if (class->pick_next_task) { + p = class->pick_next_task(rq, prev); + if (p) + return p; + } else { + p = class->pick_task(rq); + if (p) { + put_prev_set_next_task(rq, prev, p); + return p; + } + } } BUG(); /* The idle class should always have a runnable task. */ @@ -5873,7 +6009,9 @@ static inline struct task_struct *pick_task(struct rq *rq) const struct sched_class *class; struct task_struct *p; - for_each_class(class) { + rq->dl_server = NULL; + + for_each_active_class(class) { p = class->pick_task(rq); if (p) return p; @@ -5911,6 +6049,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) * another cpu during offline. */ rq->core_pick = NULL; + rq->core_dl_server = NULL; return __pick_next_task(rq, prev, rf); } @@ -5929,16 +6068,13 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) WRITE_ONCE(rq->core_sched_seq, rq->core->core_pick_seq); next = rq->core_pick; - if (next != prev) { - put_prev_task(rq, prev); - set_next_task(rq, next); - } - + rq->dl_server = rq->core_dl_server; rq->core_pick = NULL; - goto out; + rq->core_dl_server = NULL; + goto out_set_next; } - put_prev_task_balance(rq, prev, rf); + prev_balance(rq, prev, rf); smt_mask = cpu_smt_mask(cpu); need_sync = !!rq->core->core_cookie; @@ -5979,6 +6115,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) next = pick_task(rq); if (!next->core_cookie) { rq->core_pick = NULL; + rq->core_dl_server = NULL; /* * For robustness, update the min_vruntime_fi for * unconstrained picks as well. @@ -6006,7 +6143,9 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) if (i != cpu && (rq_i != rq->core || !core_clock_updated)) update_rq_clock(rq_i); - p = rq_i->core_pick = pick_task(rq_i); + rq_i->core_pick = p = pick_task(rq_i); + rq_i->core_dl_server = rq_i->dl_server; + if (!max || prio_less(max, p, fi_before)) max = p; } @@ -6030,6 +6169,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) } rq_i->core_pick = p; + rq_i->core_dl_server = NULL; if (p == rq_i->idle) { if (rq_i->nr_running) { @@ -6090,6 +6230,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) if (i == cpu) { rq_i->core_pick = NULL; + rq_i->core_dl_server = NULL; continue; } @@ -6098,6 +6239,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) if (rq_i->curr == rq_i->core_pick) { rq_i->core_pick = NULL; + rq_i->core_dl_server = NULL; continue; } @@ -6105,8 +6247,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) } out_set_next: - set_next_task(rq, next); -out: + put_prev_set_next_task(rq, prev, next); if (rq->core->core_forceidle_count && next == rq->idle) queue_core_balance(rq); @@ -6342,19 +6483,12 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) * Constants for the sched_mode argument of __schedule(). * * The mode argument allows RT enabled kernels to differentiate a - * preemption from blocking on an 'sleeping' spin/rwlock. Note that - * SM_MASK_PREEMPT for !RT has all bits set, which allows the compiler to - * optimize the AND operation out and just check for zero. + * preemption from blocking on an 'sleeping' spin/rwlock. */ -#define SM_NONE 0x0 -#define SM_PREEMPT 0x1 -#define SM_RTLOCK_WAIT 0x2 - -#ifndef CONFIG_PREEMPT_RT -# define SM_MASK_PREEMPT (~0U) -#else -# define SM_MASK_PREEMPT SM_PREEMPT -#endif +#define SM_IDLE (-1) +#define SM_NONE 0 +#define SM_PREEMPT 1 +#define SM_RTLOCK_WAIT 2 /* * __schedule() is the main scheduler function. @@ -6395,9 +6529,14 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) * * WARNING: must be called with preemption disabled! */ -static void __sched notrace __schedule(unsigned int sched_mode) +static void __sched notrace __schedule(int sched_mode) { struct task_struct *prev, *next; + /* + * On PREEMPT_RT kernel, SM_RTLOCK_WAIT is noted + * as a preemption by schedule_debug() and RCU. + */ + bool preempt = sched_mode > SM_NONE; unsigned long *switch_count; unsigned long prev_state; struct rq_flags rf; @@ -6408,13 +6547,13 @@ static void __sched notrace __schedule(unsigned int sched_mode) rq = cpu_rq(cpu); prev = rq->curr; - schedule_debug(prev, !!sched_mode); + schedule_debug(prev, preempt); if (sched_feat(HRTICK) || sched_feat(HRTICK_DL)) hrtick_clear(rq); local_irq_disable(); - rcu_note_context_switch(!!sched_mode); + rcu_note_context_switch(preempt); /* * Make sure that signal_pending_state()->signal_pending() below @@ -6443,22 +6582,33 @@ static void __sched notrace __schedule(unsigned int sched_mode) switch_count = &prev->nivcsw; + /* Task state changes only considers SM_PREEMPT as preemption */ + preempt = sched_mode == SM_PREEMPT; + /* * We must load prev->state once (task_struct::state is volatile), such * that we form a control dependency vs deactivate_task() below. */ prev_state = READ_ONCE(prev->__state); - if (!(sched_mode & SM_MASK_PREEMPT) && prev_state) { + if (sched_mode == SM_IDLE) { + /* SCX must consult the BPF scheduler to tell if rq is empty */ + if (!rq->nr_running && !scx_enabled()) { + next = prev; + goto picked; + } + } else if (!preempt && prev_state) { if (signal_pending_state(prev_state, prev)) { WRITE_ONCE(prev->__state, TASK_RUNNING); } else { + int flags = DEQUEUE_NOCLOCK; + prev->sched_contributes_to_load = (prev_state & TASK_UNINTERRUPTIBLE) && !(prev_state & TASK_NOLOAD) && !(prev_state & TASK_FROZEN); - if (prev->sched_contributes_to_load) - rq->nr_uninterruptible++; + if (unlikely(is_special_task_state(prev_state))) + flags |= DEQUEUE_SPECIAL; /* * __schedule() ttwu() @@ -6471,17 +6621,13 @@ static void __sched notrace __schedule(unsigned int sched_mode) * * After this, schedule() must not care about p->state any more. */ - deactivate_task(rq, prev, DEQUEUE_SLEEP | DEQUEUE_NOCLOCK); - - if (prev->in_iowait) { - atomic_inc(&rq->nr_iowait); - delayacct_blkio_start(); - } + block_task(rq, prev, flags); } switch_count = &prev->nvcsw; } next = pick_next_task(rq, prev, &rf); +picked: clear_tsk_need_resched(prev); clear_preempt_need_resched(); #ifdef CONFIG_SCHED_DEBUG @@ -6523,7 +6669,7 @@ static void __sched notrace __schedule(unsigned int sched_mode) psi_account_irqtime(rq, prev, next); psi_sched_switch(prev, next, !task_on_rq_queued(prev)); - trace_sched_switch(sched_mode & SM_MASK_PREEMPT, prev, next, prev_state); + trace_sched_switch(preempt, prev, next, prev_state); /* Also unlocks the rq: */ rq = context_switch(rq, prev, next, &rf); @@ -6599,7 +6745,7 @@ static void sched_update_worker(struct task_struct *tsk) } } -static __always_inline void __schedule_loop(unsigned int sched_mode) +static __always_inline void __schedule_loop(int sched_mode) { do { preempt_disable(); @@ -6644,7 +6790,7 @@ void __sched schedule_idle(void) */ WARN_ON_ONCE(current->__state); do { - __schedule(SM_NONE); + __schedule(SM_IDLE); } while (need_resched()); } @@ -6658,7 +6804,7 @@ asmlinkage __visible void __sched schedule_user(void) * we find a better solution. * * NB: There are buggy callers of this function. Ideally we - * should warn if prev_state != CONTEXT_USER, but that will trigger + * should warn if prev_state != CT_STATE_USER, but that will trigger * too frequently to make sense yet. */ enum ctx_state prev_state = exception_enter(); @@ -6870,6 +7016,10 @@ void __setscheduler_prio(struct task_struct *p, int prio) p->sched_class = &dl_sched_class; else if (rt_prio(prio)) p->sched_class = &rt_sched_class; +#ifdef CONFIG_SCHED_CLASS_EXT + else if (task_should_scx(p)) + p->sched_class = &ext_sched_class; +#endif else p->sched_class = &fair_sched_class; @@ -7015,6 +7165,7 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task) } __setscheduler_prio(p, prio); + check_class_changing(rq, p, prev_class); if (queued) enqueue_task(rq, p, queue_flag); @@ -7405,7 +7556,7 @@ EXPORT_SYMBOL(io_schedule); void sched_show_task(struct task_struct *p) { - unsigned long free = 0; + unsigned long free; int ppid; if (!try_get_task_stack(p)) @@ -7415,9 +7566,7 @@ void sched_show_task(struct task_struct *p) if (task_is_running(p)) pr_cont(" running task "); -#ifdef CONFIG_DEBUG_STACK_USAGE free = stack_not_used(p); -#endif ppid = 0; rcu_read_lock(); if (pid_alive(p)) @@ -7429,6 +7578,7 @@ void sched_show_task(struct task_struct *p) print_worker_info(KERN_INFO, p); print_stop_info(KERN_INFO, p); + print_scx_info(KERN_INFO, p); show_stack(p, NULL, KERN_INFO); put_task_stack(p); } @@ -7957,6 +8107,8 @@ int sched_cpu_activate(unsigned int cpu) cpuset_cpu_active(); } + scx_rq_activate(rq); + /* * Put the rq online, if not already. This happens: * @@ -8006,6 +8158,8 @@ int sched_cpu_deactivate(unsigned int cpu) sched_set_rq_offline(rq, cpu); + scx_rq_deactivate(rq); + /* * When going down, decrement the number of cores with SMT present. */ @@ -8190,11 +8344,15 @@ void __init sched_init(void) int i; /* Make sure the linker didn't screw up */ - BUG_ON(&idle_sched_class != &fair_sched_class + 1 || - &fair_sched_class != &rt_sched_class + 1 || - &rt_sched_class != &dl_sched_class + 1); #ifdef CONFIG_SMP - BUG_ON(&dl_sched_class != &stop_sched_class + 1); + BUG_ON(!sched_class_above(&stop_sched_class, &dl_sched_class)); +#endif + BUG_ON(!sched_class_above(&dl_sched_class, &rt_sched_class)); + BUG_ON(!sched_class_above(&rt_sched_class, &fair_sched_class)); + BUG_ON(!sched_class_above(&fair_sched_class, &idle_sched_class)); +#ifdef CONFIG_SCHED_CLASS_EXT + BUG_ON(!sched_class_above(&fair_sched_class, &ext_sched_class)); + BUG_ON(!sched_class_above(&ext_sched_class, &idle_sched_class)); #endif wait_bit_init(); @@ -8218,6 +8376,9 @@ void __init sched_init(void) root_task_group.shares = ROOT_TASK_GROUP_LOAD; init_cfs_bandwidth(&root_task_group.cfs_bandwidth, NULL); #endif /* CONFIG_FAIR_GROUP_SCHED */ +#ifdef CONFIG_EXT_GROUP_SCHED + root_task_group.scx_weight = CGROUP_WEIGHT_DFL; +#endif /* CONFIG_EXT_GROUP_SCHED */ #ifdef CONFIG_RT_GROUP_SCHED root_task_group.rt_se = (struct sched_rt_entity **)ptr; ptr += nr_cpu_ids * sizeof(void **); @@ -8228,8 +8389,6 @@ void __init sched_init(void) #endif /* CONFIG_RT_GROUP_SCHED */ } - init_rt_bandwidth(&def_rt_bandwidth, global_rt_period(), global_rt_runtime()); - #ifdef CONFIG_SMP init_defrootdomain(); #endif @@ -8284,8 +8443,13 @@ void __init sched_init(void) init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL); #endif /* CONFIG_FAIR_GROUP_SCHED */ - rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime; #ifdef CONFIG_RT_GROUP_SCHED + /* + * This is required for init cpu because rt.c:__enable_runtime() + * starts working after scheduler_running, which is not the case + * yet. + */ + rq->rt.rt_runtime = global_rt_runtime(); init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL); #endif #ifdef CONFIG_SMP @@ -8317,10 +8481,12 @@ void __init sched_init(void) #endif /* CONFIG_SMP */ hrtick_rq_init(rq); atomic_set(&rq->nr_iowait, 0); + fair_server_init(rq); #ifdef CONFIG_SCHED_CORE rq->core = rq; rq->core_pick = NULL; + rq->core_dl_server = NULL; rq->core_enabled = 0; rq->core_tree = RB_ROOT; rq->core_forceidle_count = 0; @@ -8333,6 +8499,7 @@ void __init sched_init(void) } set_load_weight(&init_task, false); + init_task.se.slice = sysctl_sched_base_slice, /* * The boot idle thread does lazy MMU switching as well: @@ -8363,6 +8530,7 @@ void __init sched_init(void) balance_push_set(smp_processor_id(), false); #endif init_sched_fair_class(); + init_sched_ext_class(); psi_init(); @@ -8548,7 +8716,7 @@ void normalize_rt_tasks(void) schedstat_set(p->stats.sleep_start, 0); schedstat_set(p->stats.block_start, 0); - if (!dl_task(p) && !rt_task(p)) { + if (!rt_or_dl_task(p)) { /* * Renice negative nice level userspace * tasks back to 0: @@ -8648,6 +8816,7 @@ struct task_group *sched_create_group(struct task_group *parent) if (!alloc_rt_sched_group(tg, parent)) goto err; + scx_group_set_weight(tg, CGROUP_WEIGHT_DFL); alloc_uclamp_sched_group(tg, parent); return tg; @@ -8775,6 +8944,7 @@ void sched_move_task(struct task_struct *tsk) put_prev_task(rq, tsk); sched_change_group(tsk, group); + scx_move_task(tsk); if (queued) enqueue_task(rq, tsk, queue_flags); @@ -8789,11 +8959,6 @@ void sched_move_task(struct task_struct *tsk) } } -static inline struct task_group *css_tg(struct cgroup_subsys_state *css) -{ - return css ? container_of(css, struct task_group, css) : NULL; -} - static struct cgroup_subsys_state * cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) { @@ -8817,6 +8982,11 @@ static int cpu_cgroup_css_online(struct cgroup_subsys_state *css) { struct task_group *tg = css_tg(css); struct task_group *parent = css_tg(css->parent); + int ret; + + ret = scx_tg_online(tg); + if (ret) + return ret; if (parent) sched_online_group(tg, parent); @@ -8831,6 +9001,13 @@ static int cpu_cgroup_css_online(struct cgroup_subsys_state *css) return 0; } +static void cpu_cgroup_css_offline(struct cgroup_subsys_state *css) +{ + struct task_group *tg = css_tg(css); + + scx_tg_offline(tg); +} + static void cpu_cgroup_css_released(struct cgroup_subsys_state *css) { struct task_group *tg = css_tg(css); @@ -8848,9 +9025,9 @@ static void cpu_cgroup_css_free(struct cgroup_subsys_state *css) sched_unregister_group(tg); } -#ifdef CONFIG_RT_GROUP_SCHED static int cpu_cgroup_can_attach(struct cgroup_taskset *tset) { +#ifdef CONFIG_RT_GROUP_SCHED struct task_struct *task; struct cgroup_subsys_state *css; @@ -8858,9 +9035,9 @@ static int cpu_cgroup_can_attach(struct cgroup_taskset *tset) if (!sched_rt_can_attach(css_tg(css), task)) return -EINVAL; } - return 0; -} #endif + return scx_cgroup_can_attach(tset); +} static void cpu_cgroup_attach(struct cgroup_taskset *tset) { @@ -8869,6 +9046,13 @@ static void cpu_cgroup_attach(struct cgroup_taskset *tset) cgroup_taskset_for_each(task, css, tset) sched_move_task(task); + + scx_cgroup_finish_attach(); +} + +static void cpu_cgroup_cancel_attach(struct cgroup_taskset *tset) +{ + scx_cgroup_cancel_attach(tset); } #ifdef CONFIG_UCLAMP_TASK_GROUP @@ -9045,22 +9229,36 @@ static int cpu_uclamp_max_show(struct seq_file *sf, void *v) } #endif /* CONFIG_UCLAMP_TASK_GROUP */ +#ifdef CONFIG_GROUP_SCHED_WEIGHT +static unsigned long tg_weight(struct task_group *tg) +{ #ifdef CONFIG_FAIR_GROUP_SCHED + return scale_load_down(tg->shares); +#else + return sched_weight_from_cgroup(tg->scx_weight); +#endif +} + static int cpu_shares_write_u64(struct cgroup_subsys_state *css, struct cftype *cftype, u64 shareval) { + int ret; + if (shareval > scale_load_down(ULONG_MAX)) shareval = MAX_SHARES; - return sched_group_set_shares(css_tg(css), scale_load(shareval)); + ret = sched_group_set_shares(css_tg(css), scale_load(shareval)); + if (!ret) + scx_group_set_weight(css_tg(css), + sched_weight_to_cgroup(shareval)); + return ret; } static u64 cpu_shares_read_u64(struct cgroup_subsys_state *css, struct cftype *cft) { - struct task_group *tg = css_tg(css); - - return (u64) scale_load_down(tg->shares); + return tg_weight(css_tg(css)); } +#endif /* CONFIG_GROUP_SCHED_WEIGHT */ #ifdef CONFIG_CFS_BANDWIDTH static DEFINE_MUTEX(cfs_constraints_mutex); @@ -9406,7 +9604,6 @@ static int cpu_cfs_local_stat_show(struct seq_file *sf, void *v) return 0; } #endif /* CONFIG_CFS_BANDWIDTH */ -#endif /* CONFIG_FAIR_GROUP_SCHED */ #ifdef CONFIG_RT_GROUP_SCHED static int cpu_rt_runtime_write(struct cgroup_subsys_state *css, @@ -9434,7 +9631,7 @@ static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state *css, } #endif /* CONFIG_RT_GROUP_SCHED */ -#ifdef CONFIG_FAIR_GROUP_SCHED +#ifdef CONFIG_GROUP_SCHED_WEIGHT static s64 cpu_idle_read_s64(struct cgroup_subsys_state *css, struct cftype *cft) { @@ -9444,12 +9641,17 @@ static s64 cpu_idle_read_s64(struct cgroup_subsys_state *css, static int cpu_idle_write_s64(struct cgroup_subsys_state *css, struct cftype *cft, s64 idle) { - return sched_group_set_idle(css_tg(css), idle); + int ret; + + ret = sched_group_set_idle(css_tg(css), idle); + if (!ret) + scx_group_set_idle(css_tg(css), idle); + return ret; } #endif static struct cftype cpu_legacy_files[] = { -#ifdef CONFIG_FAIR_GROUP_SCHED +#ifdef CONFIG_GROUP_SCHED_WEIGHT { .name = "shares", .read_u64 = cpu_shares_read_u64, @@ -9559,38 +9761,35 @@ static int cpu_local_stat_show(struct seq_file *sf, return 0; } -#ifdef CONFIG_FAIR_GROUP_SCHED +#ifdef CONFIG_GROUP_SCHED_WEIGHT + static u64 cpu_weight_read_u64(struct cgroup_subsys_state *css, struct cftype *cft) { - struct task_group *tg = css_tg(css); - u64 weight = scale_load_down(tg->shares); - - return DIV_ROUND_CLOSEST_ULL(weight * CGROUP_WEIGHT_DFL, 1024); + return sched_weight_to_cgroup(tg_weight(css_tg(css))); } static int cpu_weight_write_u64(struct cgroup_subsys_state *css, - struct cftype *cft, u64 weight) + struct cftype *cft, u64 cgrp_weight) { - /* - * cgroup weight knobs should use the common MIN, DFL and MAX - * values which are 1, 100 and 10000 respectively. While it loses - * a bit of range on both ends, it maps pretty well onto the shares - * value used by scheduler and the round-trip conversions preserve - * the original value over the entire range. - */ - if (weight < CGROUP_WEIGHT_MIN || weight > CGROUP_WEIGHT_MAX) + unsigned long weight; + int ret; + + if (cgrp_weight < CGROUP_WEIGHT_MIN || cgrp_weight > CGROUP_WEIGHT_MAX) return -ERANGE; - weight = DIV_ROUND_CLOSEST_ULL(weight * 1024, CGROUP_WEIGHT_DFL); + weight = sched_weight_from_cgroup(cgrp_weight); - return sched_group_set_shares(css_tg(css), scale_load(weight)); + ret = sched_group_set_shares(css_tg(css), scale_load(weight)); + if (!ret) + scx_group_set_weight(css_tg(css), cgrp_weight); + return ret; } static s64 cpu_weight_nice_read_s64(struct cgroup_subsys_state *css, struct cftype *cft) { - unsigned long weight = scale_load_down(css_tg(css)->shares); + unsigned long weight = tg_weight(css_tg(css)); int last_delta = INT_MAX; int prio, delta; @@ -9609,7 +9808,7 @@ static int cpu_weight_nice_write_s64(struct cgroup_subsys_state *css, struct cftype *cft, s64 nice) { unsigned long weight; - int idx; + int idx, ret; if (nice < MIN_NICE || nice > MAX_NICE) return -ERANGE; @@ -9618,9 +9817,13 @@ static int cpu_weight_nice_write_s64(struct cgroup_subsys_state *css, idx = array_index_nospec(idx, 40); weight = sched_prio_to_weight[idx]; - return sched_group_set_shares(css_tg(css), scale_load(weight)); + ret = sched_group_set_shares(css_tg(css), scale_load(weight)); + if (!ret) + scx_group_set_weight(css_tg(css), + sched_weight_to_cgroup(weight)); + return ret; } -#endif +#endif /* CONFIG_GROUP_SCHED_WEIGHT */ static void __maybe_unused cpu_period_quota_print(struct seq_file *sf, long period, long quota) @@ -9680,7 +9883,7 @@ static ssize_t cpu_max_write(struct kernfs_open_file *of, #endif static struct cftype cpu_files[] = { -#ifdef CONFIG_FAIR_GROUP_SCHED +#ifdef CONFIG_GROUP_SCHED_WEIGHT { .name = "weight", .flags = CFTYPE_NOT_ON_ROOT, @@ -9734,14 +9937,14 @@ static struct cftype cpu_files[] = { struct cgroup_subsys cpu_cgrp_subsys = { .css_alloc = cpu_cgroup_css_alloc, .css_online = cpu_cgroup_css_online, + .css_offline = cpu_cgroup_css_offline, .css_released = cpu_cgroup_css_released, .css_free = cpu_cgroup_css_free, .css_extra_stat_show = cpu_extra_stat_show, .css_local_stat_show = cpu_local_stat_show, -#ifdef CONFIG_RT_GROUP_SCHED .can_attach = cpu_cgroup_can_attach, -#endif .attach = cpu_cgroup_attach, + .cancel_attach = cpu_cgroup_cancel_attach, .legacy_cftypes = cpu_legacy_files, .dfl_cftypes = cpu_files, .early_init = true, @@ -9752,7 +9955,7 @@ struct cgroup_subsys cpu_cgrp_subsys = { void dump_cpu_task(int cpu) { - if (cpu == smp_processor_id() && in_hardirq()) { + if (in_hardirq() && cpu == smp_processor_id()) { struct pt_regs *regs; regs = get_irq_regs(); @@ -10331,3 +10534,38 @@ void sched_mm_cid_fork(struct task_struct *t) t->mm_cid_active = 1; } #endif + +#ifdef CONFIG_SCHED_CLASS_EXT +void sched_deq_and_put_task(struct task_struct *p, int queue_flags, + struct sched_enq_and_set_ctx *ctx) +{ + struct rq *rq = task_rq(p); + + lockdep_assert_rq_held(rq); + + *ctx = (struct sched_enq_and_set_ctx){ + .p = p, + .queue_flags = queue_flags, + .queued = task_on_rq_queued(p), + .running = task_current(rq, p), + }; + + update_rq_clock(rq); + if (ctx->queued) + dequeue_task(rq, p, queue_flags | DEQUEUE_NOCLOCK); + if (ctx->running) + put_prev_task(rq, p); +} + +void sched_enq_and_set_task(struct sched_enq_and_set_ctx *ctx) +{ + struct rq *rq = task_rq(ctx->p); + + lockdep_assert_rq_held(rq); + + if (ctx->queued) + enqueue_task(rq, ctx->p, ctx->queue_flags | ENQUEUE_NOCLOCK); + if (ctx->running) + set_next_task(rq, ctx->p); +} +#endif /* CONFIG_SCHED_CLASS_EXT */ |