Diffstat (limited to 'kernel/sched')

 kernel/sched/core.c              | 289
 kernel/sched/cpufreq_schedutil.c |   3
 kernel/sched/deadline.c          |  57
 kernel/sched/debug.c             |   7
 kernel/sched/ext.c               | 975
 kernel/sched/fair.c              |  42
 kernel/sched/features.h          |   3
 kernel/sched/idle.c              |   5
 kernel/sched/pelt.c              |   2
 kernel/sched/rt.c                |  67
 kernel/sched/sched.h             | 155
 kernel/sched/stats.h             |  29
 kernel/sched/syscalls.c          |  46
 kernel/sched/wait_bit.c          |  90

 14 files changed, 1139 insertions(+), 631 deletions(-)
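
The core.c hunks below add a second, lazy resched flavor: resched_curr_lazy() sets TIF_NEED_RESCHED_LAZY, which does not send a reschedule IPI and is only upgraded to a full resched at the next scheduler tick (see the sched_tick() hunk), while the idle task is always preempted eagerly. A minimal userspace sketch of that decision logic follows; the bit values and helper names here are illustrative stand-ins, not the kernel's:

    #include <stdbool.h>
    #include <stdio.h>

    /* Illustrative stand-ins for the kernel's thread-info flag bits. */
    enum tif_bit { TIF_NEED_RESCHED, TIF_NEED_RESCHED_LAZY };

    /* Models get_lazy_tif_bit(): without lazy preemption enabled, a
     * "lazy" request degenerates to an ordinary resched request. */
    static enum tif_bit lazy_tif_bit(bool preempt_lazy_enabled)
    {
            return preempt_lazy_enabled ? TIF_NEED_RESCHED_LAZY
                                        : TIF_NEED_RESCHED;
    }

    /* Models __resched_curr()'s escalation rule: the idle task is
     * always preempted eagerly, so a lazy request against it is
     * upgraded. Only TIF_NEED_RESCHED triggers a reschedule IPI; the
     * lazy bit waits to be folded in at the next tick. */
    static enum tif_bit resched_bit(bool curr_is_idle, enum tif_bit tif)
    {
            if (curr_is_idle && tif == TIF_NEED_RESCHED_LAZY)
                    return TIF_NEED_RESCHED;
            return tif;
    }

    int main(void)
    {
            printf("%d\n", resched_bit(false, lazy_tif_bit(true)));  /* 1: lazy */
            printf("%d\n", resched_bit(true,  lazy_tif_bit(true)));  /* 0: eager */
            printf("%d\n", resched_bit(false, lazy_tif_bit(false))); /* 0: eager */
            return 0;
    }

With PREEMPT_DYNAMIC on a CONFIG_ARCH_HAS_PREEMPT_LAZY architecture, the debug.c hunk additionally exposes the new mode as "lazy" through /sys/kernel/debug/sched/preempt.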
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index a1c353a62c56..95e40895a519 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -832,7 +832,7 @@ static enum hrtimer_restart hrtick(struct hrtimer *timer) rq_lock(rq, &rf); update_rq_clock(rq); - rq->curr->sched_class->task_tick(rq, rq->curr, 1); + rq->donor->sched_class->task_tick(rq, rq->curr, 1); rq_unlock(rq, &rf); return HRTIMER_NORESTART; @@ -941,10 +941,9 @@ static inline void hrtick_rq_init(struct rq *rq) * this avoids any races wrt polling state changes and thereby avoids * spurious IPIs. */ -static inline bool set_nr_and_not_polling(struct task_struct *p) +static inline bool set_nr_and_not_polling(struct thread_info *ti, int tif) { - struct thread_info *ti = task_thread_info(p); - return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG); + return !(fetch_or(&ti->flags, 1 << tif) & _TIF_POLLING_NRFLAG); } /* @@ -969,9 +968,9 @@ static bool set_nr_if_polling(struct task_struct *p) } #else -static inline bool set_nr_and_not_polling(struct task_struct *p) +static inline bool set_nr_and_not_polling(struct thread_info *ti, int tif) { - set_tsk_need_resched(p); + set_ti_thread_flag(ti, tif); return true; } @@ -1076,28 +1075,70 @@ void wake_up_q(struct wake_q_head *head) * might also involve a cross-CPU call to trigger the scheduler on * the target CPU. */ -void resched_curr(struct rq *rq) +static void __resched_curr(struct rq *rq, int tif) { struct task_struct *curr = rq->curr; + struct thread_info *cti = task_thread_info(curr); int cpu; lockdep_assert_rq_held(rq); - if (test_tsk_need_resched(curr)) + /* + * Always immediately preempt the idle task; no point in delaying doing + * actual work. + */ + if (is_idle_task(curr) && tif == TIF_NEED_RESCHED_LAZY) + tif = TIF_NEED_RESCHED; + + if (cti->flags & ((1 << tif) | _TIF_NEED_RESCHED)) return; cpu = cpu_of(rq); if (cpu == smp_processor_id()) { - set_tsk_need_resched(curr); - set_preempt_need_resched(); + set_ti_thread_flag(cti, tif); + if (tif == TIF_NEED_RESCHED) + set_preempt_need_resched(); return; } - if (set_nr_and_not_polling(curr)) - smp_send_reschedule(cpu); - else + if (set_nr_and_not_polling(cti, tif)) { + if (tif == TIF_NEED_RESCHED) + smp_send_reschedule(cpu); + } else { trace_sched_wake_idle_without_ipi(cpu); + } +} + +void resched_curr(struct rq *rq) +{ + __resched_curr(rq, TIF_NEED_RESCHED); +} + +#ifdef CONFIG_PREEMPT_DYNAMIC +static DEFINE_STATIC_KEY_FALSE(sk_dynamic_preempt_lazy); +static __always_inline bool dynamic_preempt_lazy(void) +{ + return static_branch_unlikely(&sk_dynamic_preempt_lazy); +} +#else +static __always_inline bool dynamic_preempt_lazy(void) +{ + return IS_ENABLED(CONFIG_PREEMPT_LAZY); +} +#endif + +static __always_inline int get_lazy_tif_bit(void) +{ + if (dynamic_preempt_lazy()) + return TIF_NEED_RESCHED_LAZY; + + return TIF_NEED_RESCHED; +} + +void resched_curr_lazy(struct rq *rq) +{ + __resched_curr(rq, get_lazy_tif_bit()); } void resched_cpu(int cpu) @@ -1192,7 +1233,7 @@ static void wake_up_idle_cpu(int cpu) * and testing of the above solutions didn't appear to report * much benefits. */ - if (set_nr_and_not_polling(rq->idle)) + if (set_nr_and_not_polling(task_thread_info(rq->idle), TIF_NEED_RESCHED)) smp_send_reschedule(cpu); else trace_sched_wake_idle_without_ipi(cpu); @@ -1399,7 +1440,7 @@ void set_load_weight(struct task_struct *p, bool update_load) * requests are serialized using a mutex to reduce the risk of conflicting * updates or API abuses. 
*/ -static DEFINE_MUTEX(uclamp_mutex); +static __maybe_unused DEFINE_MUTEX(uclamp_mutex); /* Max allowed minimum utilization */ static unsigned int __maybe_unused sysctl_sched_uclamp_util_min = SCHED_CAPACITY_SCALE; @@ -2024,10 +2065,10 @@ void enqueue_task(struct rq *rq, struct task_struct *p, int flags) */ uclamp_rq_inc(rq, p); - if (!(flags & ENQUEUE_RESTORE)) { + psi_enqueue(p, flags); + + if (!(flags & ENQUEUE_RESTORE)) sched_info_enqueue(rq, p); - psi_enqueue(p, flags & ENQUEUE_MIGRATED); - } if (sched_core_enabled(rq)) sched_core_enqueue(rq, p); @@ -2044,10 +2085,10 @@ inline bool dequeue_task(struct rq *rq, struct task_struct *p, int flags) if (!(flags & DEQUEUE_NOCLOCK)) update_rq_clock(rq); - if (!(flags & DEQUEUE_SAVE)) { + if (!(flags & DEQUEUE_SAVE)) sched_info_dequeue(rq, p); - psi_dequeue(p, !(flags & DEQUEUE_SLEEP)); - } + + psi_dequeue(p, flags); /* * Must be before ->dequeue_task() because ->dequeue_task() can 'fail' @@ -2135,16 +2176,18 @@ void check_class_changed(struct rq *rq, struct task_struct *p, void wakeup_preempt(struct rq *rq, struct task_struct *p, int flags) { - if (p->sched_class == rq->curr->sched_class) - rq->curr->sched_class->wakeup_preempt(rq, p, flags); - else if (sched_class_above(p->sched_class, rq->curr->sched_class)) + struct task_struct *donor = rq->donor; + + if (p->sched_class == donor->sched_class) + donor->sched_class->wakeup_preempt(rq, p, flags); + else if (sched_class_above(p->sched_class, donor->sched_class)) resched_curr(rq); /* * A queue event has occurred, and we're going to schedule. In * this case, we can save a useless back to back clock update. */ - if (task_on_rq_queued(rq->curr) && test_tsk_need_resched(rq->curr)) + if (task_on_rq_queued(donor) && test_tsk_need_resched(rq->curr)) rq_clock_skip_update(rq); } @@ -2620,9 +2663,7 @@ int push_cpu_stop(void *arg) // XXX validate p is still the highest prio task if (task_rq(p) == rq) { - deactivate_task(rq, p, 0); - set_task_cpu(p, lowest_rq->cpu); - activate_task(lowest_rq, p, 0); + move_queued_task_locked(rq, lowest_rq, p); resched_curr(lowest_rq); } @@ -2682,7 +2723,7 @@ __do_set_cpus_allowed(struct task_struct *p, struct affinity_context *ctx) lockdep_assert_held(&p->pi_lock); queued = task_on_rq_queued(p); - running = task_current(rq, p); + running = task_current_donor(rq, p); if (queued) { /* @@ -2696,6 +2737,7 @@ __do_set_cpus_allowed(struct task_struct *p, struct affinity_context *ctx) put_prev_task(rq, p); p->sched_class->set_cpus_allowed(p, ctx); + mm_set_cpus_allowed(p->mm, ctx->new_mask); if (queued) enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK); @@ -3308,9 +3350,7 @@ static void __migrate_swap_task(struct task_struct *p, int cpu) rq_pin_lock(src_rq, &srf); rq_pin_lock(dst_rq, &drf); - deactivate_task(src_rq, p, 0); - set_task_cpu(p, cpu); - activate_task(dst_rq, p, 0); + move_queued_task_locked(src_rq, dst_rq, p); wakeup_preempt(dst_rq, p, 0); rq_unpin_lock(dst_rq, &drf); @@ -4424,7 +4464,8 @@ int wake_up_state(struct task_struct *p, unsigned int state) * Perform scheduler related setup for a newly forked process p. * p is forked by current. * - * __sched_fork() is basic setup used by init_idle() too: + * __sched_fork() is basic setup which is also used by sched_init() to + * initialize the boot CPU's idle task. */ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) { @@ -5517,7 +5558,7 @@ unsigned long long task_sched_runtime(struct task_struct *p) * project cycles that may never be accounted to this * thread, breaking clock_gettime(). 
*/ - if (task_current(rq, p) && task_on_rq_queued(p)) { + if (task_current_donor(rq, p) && task_on_rq_queued(p)) { prefetch_curr_exec_start(p); update_rq_clock(rq); p->sched_class->update_curr(rq); @@ -5585,7 +5626,8 @@ void sched_tick(void) { int cpu = smp_processor_id(); struct rq *rq = cpu_rq(cpu); - struct task_struct *curr; + /* accounting goes to the donor task */ + struct task_struct *donor; struct rq_flags rf; unsigned long hw_pressure; u64 resched_latency; @@ -5596,19 +5638,23 @@ void sched_tick(void) sched_clock_tick(); rq_lock(rq, &rf); + donor = rq->donor; - curr = rq->curr; - psi_account_irqtime(rq, curr, NULL); + psi_account_irqtime(rq, donor, NULL); update_rq_clock(rq); hw_pressure = arch_scale_hw_pressure(cpu_of(rq)); update_hw_load_avg(rq_clock_task(rq), rq, hw_pressure); - curr->sched_class->task_tick(rq, curr, 0); + + if (dynamic_preempt_lazy() && tif_test_bit(TIF_NEED_RESCHED_LAZY)) + resched_curr(rq); + + donor->sched_class->task_tick(rq, donor, 0); if (sched_feat(LATENCY_WARN)) resched_latency = cpu_resched_latency(rq); calc_global_load_tick(rq); sched_core_tick(rq); - task_tick_mm_cid(rq, curr); + task_tick_mm_cid(rq, donor); scx_tick(rq); rq_unlock(rq, &rf); @@ -5618,8 +5664,8 @@ void sched_tick(void) perf_event_task_tick(); - if (curr->flags & PF_WQ_WORKER) - wq_worker_tick(curr); + if (donor->flags & PF_WQ_WORKER) + wq_worker_tick(donor); #ifdef CONFIG_SMP if (!scx_switched_all()) { @@ -5686,6 +5732,12 @@ static void sched_tick_remote(struct work_struct *work) struct task_struct *curr = rq->curr; if (cpu_online(cpu)) { + /* + * Since this is a remote tick for full dynticks mode, + * we are always sure that there is no proxy (only a + * single task is running). + */ + SCHED_WARN_ON(rq->curr != rq->donor); update_rq_clock(rq); if (!is_idle_task(curr)) { @@ -6309,10 +6361,7 @@ static bool try_steal_cookie(int this, int that) if (sched_task_is_throttled(p, this)) goto next; - deactivate_task(src, p, 0); - set_task_cpu(p, this); - activate_task(dst, p, 0); - + move_queued_task_locked(src, dst, p); resched_curr(dst); success = true; @@ -6507,6 +6556,45 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) #define SM_RTLOCK_WAIT 2 /* + * Helper function for __schedule() + * + * If a task does not have signals pending, deactivate it + * Otherwise marks the task's __state as RUNNING + */ +static bool try_to_block_task(struct rq *rq, struct task_struct *p, + unsigned long task_state) +{ + int flags = DEQUEUE_NOCLOCK; + + if (signal_pending_state(task_state, p)) { + WRITE_ONCE(p->__state, TASK_RUNNING); + return false; + } + + p->sched_contributes_to_load = + (task_state & TASK_UNINTERRUPTIBLE) && + !(task_state & TASK_NOLOAD) && + !(task_state & TASK_FROZEN); + + if (unlikely(is_special_task_state(task_state))) + flags |= DEQUEUE_SPECIAL; + + /* + * __schedule() ttwu() + * prev_state = prev->state; if (p->on_rq && ...) + * if (prev_state) goto out; + * p->on_rq = 0; smp_acquire__after_ctrl_dep(); + * p->state = TASK_WAKING + * + * Where __schedule() and ttwu() have matching control dependencies. + * + * After this, schedule() must not care about p->state any more. + */ + block_task(rq, p, flags); + return true; +} + +/* * __schedule() is the main scheduler function. 
* * The main means of driving the scheduler and thus entering this function are: @@ -6614,37 +6702,12 @@ static void __sched notrace __schedule(int sched_mode) goto picked; } } else if (!preempt && prev_state) { - if (signal_pending_state(prev_state, prev)) { - WRITE_ONCE(prev->__state, TASK_RUNNING); - } else { - int flags = DEQUEUE_NOCLOCK; - - prev->sched_contributes_to_load = - (prev_state & TASK_UNINTERRUPTIBLE) && - !(prev_state & TASK_NOLOAD) && - !(prev_state & TASK_FROZEN); - - if (unlikely(is_special_task_state(prev_state))) - flags |= DEQUEUE_SPECIAL; - - /* - * __schedule() ttwu() - * prev_state = prev->state; if (p->on_rq && ...) - * if (prev_state) goto out; - * p->on_rq = 0; smp_acquire__after_ctrl_dep(); - * p->state = TASK_WAKING - * - * Where __schedule() and ttwu() have matching control dependencies. - * - * After this, schedule() must not care about p->state any more. - */ - block_task(rq, prev, flags); - block = true; - } + block = try_to_block_task(rq, prev, prev_state); switch_count = &prev->nvcsw; } next = pick_next_task(rq, prev, &rf); + rq_set_donor(rq, next); picked: clear_tsk_need_resched(prev); clear_preempt_need_resched(); @@ -7151,7 +7214,7 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task) dequeue_task(rq, p, DEQUEUE_SLEEP | DEQUEUE_DELAYED | DEQUEUE_NOCLOCK); queued = task_on_rq_queued(p); - running = task_current(rq, p); + running = task_current_donor(rq, p); if (queued) dequeue_task(rq, p, queue_flag); if (running) @@ -7351,6 +7414,7 @@ EXPORT_SYMBOL(__cond_resched_rwlock_write); * preempt_schedule <- NOP * preempt_schedule_notrace <- NOP * irqentry_exit_cond_resched <- NOP + * dynamic_preempt_lazy <- false * * VOLUNTARY: * cond_resched <- __cond_resched @@ -7358,6 +7422,7 @@ EXPORT_SYMBOL(__cond_resched_rwlock_write); * preempt_schedule <- NOP * preempt_schedule_notrace <- NOP * irqentry_exit_cond_resched <- NOP + * dynamic_preempt_lazy <- false * * FULL: * cond_resched <- RET0 @@ -7365,6 +7430,15 @@ EXPORT_SYMBOL(__cond_resched_rwlock_write); * preempt_schedule <- preempt_schedule * preempt_schedule_notrace <- preempt_schedule_notrace * irqentry_exit_cond_resched <- irqentry_exit_cond_resched + * dynamic_preempt_lazy <- false + * + * LAZY: + * cond_resched <- RET0 + * might_resched <- RET0 + * preempt_schedule <- preempt_schedule + * preempt_schedule_notrace <- preempt_schedule_notrace + * irqentry_exit_cond_resched <- irqentry_exit_cond_resched + * dynamic_preempt_lazy <- true */ enum { @@ -7372,30 +7446,41 @@ enum { preempt_dynamic_none, preempt_dynamic_voluntary, preempt_dynamic_full, + preempt_dynamic_lazy, }; int preempt_dynamic_mode = preempt_dynamic_undefined; int sched_dynamic_mode(const char *str) { +#ifndef CONFIG_PREEMPT_RT if (!strcmp(str, "none")) return preempt_dynamic_none; if (!strcmp(str, "voluntary")) return preempt_dynamic_voluntary; +#endif if (!strcmp(str, "full")) return preempt_dynamic_full; +#ifdef CONFIG_ARCH_HAS_PREEMPT_LAZY + if (!strcmp(str, "lazy")) + return preempt_dynamic_lazy; +#endif + return -EINVAL; } +#define preempt_dynamic_key_enable(f) static_key_enable(&sk_dynamic_##f.key) +#define preempt_dynamic_key_disable(f) static_key_disable(&sk_dynamic_##f.key) + #if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL) #define preempt_dynamic_enable(f) static_call_update(f, f##_dynamic_enabled) #define preempt_dynamic_disable(f) static_call_update(f, f##_dynamic_disabled) #elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY) -#define preempt_dynamic_enable(f) static_key_enable(&sk_dynamic_##f.key) -#define 
preempt_dynamic_disable(f) static_key_disable(&sk_dynamic_##f.key) +#define preempt_dynamic_enable(f) preempt_dynamic_key_enable(f) +#define preempt_dynamic_disable(f) preempt_dynamic_key_disable(f) #else #error "Unsupported PREEMPT_DYNAMIC mechanism" #endif @@ -7415,6 +7500,7 @@ static void __sched_dynamic_update(int mode) preempt_dynamic_enable(preempt_schedule); preempt_dynamic_enable(preempt_schedule_notrace); preempt_dynamic_enable(irqentry_exit_cond_resched); + preempt_dynamic_key_disable(preempt_lazy); switch (mode) { case preempt_dynamic_none: @@ -7424,6 +7510,7 @@ static void __sched_dynamic_update(int mode) preempt_dynamic_disable(preempt_schedule); preempt_dynamic_disable(preempt_schedule_notrace); preempt_dynamic_disable(irqentry_exit_cond_resched); + preempt_dynamic_key_disable(preempt_lazy); if (mode != preempt_dynamic_mode) pr_info("Dynamic Preempt: none\n"); break; @@ -7435,6 +7522,7 @@ static void __sched_dynamic_update(int mode) preempt_dynamic_disable(preempt_schedule); preempt_dynamic_disable(preempt_schedule_notrace); preempt_dynamic_disable(irqentry_exit_cond_resched); + preempt_dynamic_key_disable(preempt_lazy); if (mode != preempt_dynamic_mode) pr_info("Dynamic Preempt: voluntary\n"); break; @@ -7446,9 +7534,22 @@ static void __sched_dynamic_update(int mode) preempt_dynamic_enable(preempt_schedule); preempt_dynamic_enable(preempt_schedule_notrace); preempt_dynamic_enable(irqentry_exit_cond_resched); + preempt_dynamic_key_disable(preempt_lazy); if (mode != preempt_dynamic_mode) pr_info("Dynamic Preempt: full\n"); break; + + case preempt_dynamic_lazy: + if (!klp_override) + preempt_dynamic_disable(cond_resched); + preempt_dynamic_disable(might_resched); + preempt_dynamic_enable(preempt_schedule); + preempt_dynamic_enable(preempt_schedule_notrace); + preempt_dynamic_enable(irqentry_exit_cond_resched); + preempt_dynamic_key_enable(preempt_lazy); + if (mode != preempt_dynamic_mode) + pr_info("Dynamic Preempt: lazy\n"); + break; } preempt_dynamic_mode = mode; @@ -7511,6 +7612,8 @@ static void __init preempt_dynamic_init(void) sched_dynamic_update(preempt_dynamic_none); } else if (IS_ENABLED(CONFIG_PREEMPT_VOLUNTARY)) { sched_dynamic_update(preempt_dynamic_voluntary); + } else if (IS_ENABLED(CONFIG_PREEMPT_LAZY)) { + sched_dynamic_update(preempt_dynamic_lazy); } else { /* Default static call setting, nothing to do */ WARN_ON_ONCE(!IS_ENABLED(CONFIG_PREEMPT)); @@ -7531,6 +7634,7 @@ static void __init preempt_dynamic_init(void) PREEMPT_MODEL_ACCESSOR(none); PREEMPT_MODEL_ACCESSOR(voluntary); PREEMPT_MODEL_ACCESSOR(full); +PREEMPT_MODEL_ACCESSOR(lazy); #else /* !CONFIG_PREEMPT_DYNAMIC: */ @@ -7683,8 +7787,6 @@ void __init init_idle(struct task_struct *idle, int cpu) struct rq *rq = cpu_rq(cpu); unsigned long flags; - __sched_fork(0, idle); - raw_spin_lock_irqsave(&idle->pi_lock, flags); raw_spin_rq_lock(rq); @@ -7699,10 +7801,8 @@ void __init init_idle(struct task_struct *idle, int cpu) #ifdef CONFIG_SMP /* - * It's possible that init_idle() gets called multiple times on a task, - * in that case do_set_cpus_allowed() will not do the right thing. - * - * And since this is boot we can forgo the serialization. + * No validation and serialization required at boot time and for + * setting up the idle tasks of not yet online CPUs. 
*/ set_cpus_allowed_common(idle, &ac); #endif @@ -7721,6 +7821,7 @@ void __init init_idle(struct task_struct *idle, int cpu) rcu_read_unlock(); rq->idle = idle; + rq_set_donor(rq, idle); rcu_assign_pointer(rq->curr, idle); idle->on_rq = TASK_ON_RQ_QUEUED; #ifdef CONFIG_SMP @@ -7810,7 +7911,7 @@ void sched_setnuma(struct task_struct *p, int nid) rq = task_rq_lock(p, &rf); queued = task_on_rq_queued(p); - running = task_current(rq, p); + running = task_current_donor(rq, p); if (queued) dequeue_task(rq, p, DEQUEUE_SAVE); @@ -8546,6 +8647,7 @@ void __init sched_init(void) * but because we are the idle thread, we just pick up running again * when this runqueue becomes "idle". */ + __sched_fork(0, current); init_idle(current, smp_processor_id()); calc_load_update = jiffies + LOAD_FREQ; @@ -8960,7 +9062,7 @@ void sched_move_task(struct task_struct *tsk) update_rq_clock(rq); - running = task_current(rq, tsk); + running = task_current_donor(rq, tsk); queued = task_on_rq_queued(tsk); if (queued) @@ -10253,6 +10355,7 @@ int __sched_mm_cid_migrate_from_try_steal_cid(struct rq *src_rq, */ if (!try_cmpxchg(&src_pcpu_cid->cid, &lazy_cid, MM_CID_UNSET)) return -1; + WRITE_ONCE(src_pcpu_cid->recent_cid, MM_CID_UNSET); return src_cid; } @@ -10265,7 +10368,8 @@ void sched_mm_cid_migrate_to(struct rq *dst_rq, struct task_struct *t) { struct mm_cid *src_pcpu_cid, *dst_pcpu_cid; struct mm_struct *mm = t->mm; - int src_cid, dst_cid, src_cpu; + int src_cid, src_cpu; + bool dst_cid_is_set; struct rq *src_rq; lockdep_assert_rq_held(dst_rq); @@ -10282,9 +10386,9 @@ void sched_mm_cid_migrate_to(struct rq *dst_rq, struct task_struct *t) * allocation closest to 0 in cases where few threads migrate around * many CPUs. * - * If destination cid is already set, we may have to just clear - * the src cid to ensure compactness in frequent migrations - * scenarios. + * If destination cid or recent cid is already set, we may have + * to just clear the src cid to ensure compactness in frequent + * migrations scenarios. * * It is not useful to clear the src cid when the number of threads is * greater or equal to the number of allowed CPUs, because user-space @@ -10292,9 +10396,9 @@ void sched_mm_cid_migrate_to(struct rq *dst_rq, struct task_struct *t) * allowed CPUs. */ dst_pcpu_cid = per_cpu_ptr(mm->pcpu_cid, cpu_of(dst_rq)); - dst_cid = READ_ONCE(dst_pcpu_cid->cid); - if (!mm_cid_is_unset(dst_cid) && - atomic_read(&mm->mm_users) >= t->nr_cpus_allowed) + dst_cid_is_set = !mm_cid_is_unset(READ_ONCE(dst_pcpu_cid->cid)) || + !mm_cid_is_unset(READ_ONCE(dst_pcpu_cid->recent_cid)); + if (dst_cid_is_set && atomic_read(&mm->mm_users) >= READ_ONCE(mm->nr_cpus_allowed)) return; src_pcpu_cid = per_cpu_ptr(mm->pcpu_cid, src_cpu); src_rq = cpu_rq(src_cpu); @@ -10305,13 +10409,14 @@ void sched_mm_cid_migrate_to(struct rq *dst_rq, struct task_struct *t) src_cid); if (src_cid == -1) return; - if (!mm_cid_is_unset(dst_cid)) { + if (dst_cid_is_set) { __mm_cid_put(mm, src_cid); return; } /* Move src_cid to dst cpu. */ mm_cid_snapshot_time(dst_rq, mm); WRITE_ONCE(dst_pcpu_cid->cid, src_cid); + WRITE_ONCE(dst_pcpu_cid->recent_cid, src_cid); } static void sched_mm_cid_remote_clear(struct mm_struct *mm, struct mm_cid *pcpu_cid, @@ -10550,7 +10655,7 @@ void sched_mm_cid_after_execve(struct task_struct *t) * Matches barrier in sched_mm_cid_remote_clear_old(). 
*/ smp_mb(); - t->last_mm_cid = t->mm_cid = mm_cid_get(rq, mm); + t->last_mm_cid = t->mm_cid = mm_cid_get(rq, t, mm); } rseq_set_notify_resume(t); } diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index c6ba15388ea7..28c77904ea74 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -783,9 +783,8 @@ static int sugov_init(struct cpufreq_policy *policy) if (ret) goto fail; - sugov_eas_rebuild_sd(); - out: + sugov_eas_rebuild_sd(); mutex_unlock(&global_tunables_lock); return 0; diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index be1b917dc8ce..d9d5a702f1a6 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1339,7 +1339,7 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) #endif enqueue_task_dl(rq, p, ENQUEUE_REPLENISH); - if (dl_task(rq->curr)) + if (dl_task(rq->donor)) wakeup_preempt_dl(rq, p, 0); else resched_curr(rq); @@ -1736,11 +1736,11 @@ int dl_server_apply_params(struct sched_dl_entity *dl_se, u64 runtime, u64 perio */ static void update_curr_dl(struct rq *rq) { - struct task_struct *curr = rq->curr; - struct sched_dl_entity *dl_se = &curr->dl; + struct task_struct *donor = rq->donor; + struct sched_dl_entity *dl_se = &donor->dl; s64 delta_exec; - if (!dl_task(curr) || !on_dl_rq(dl_se)) + if (!dl_task(donor) || !on_dl_rq(dl_se)) return; /* @@ -2213,7 +2213,7 @@ static int find_later_rq(struct task_struct *task); static int select_task_rq_dl(struct task_struct *p, int cpu, int flags) { - struct task_struct *curr; + struct task_struct *curr, *donor; bool select_rq; struct rq *rq; @@ -2224,6 +2224,7 @@ select_task_rq_dl(struct task_struct *p, int cpu, int flags) rcu_read_lock(); curr = READ_ONCE(rq->curr); /* unlocked access */ + donor = READ_ONCE(rq->donor); /* * If we are dealing with a -deadline task, we must @@ -2234,9 +2235,9 @@ select_task_rq_dl(struct task_struct *p, int cpu, int flags) * other hand, if it has a shorter deadline, we * try to make it stay here, it might be important. */ - select_rq = unlikely(dl_task(curr)) && + select_rq = unlikely(dl_task(donor)) && (curr->nr_cpus_allowed < 2 || - !dl_entity_preempt(&p->dl, &curr->dl)) && + !dl_entity_preempt(&p->dl, &donor->dl)) && p->nr_cpus_allowed > 1; /* @@ -2299,7 +2300,7 @@ static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p) * let's hope p can move out. */ if (rq->curr->nr_cpus_allowed == 1 || - !cpudl_find(&rq->rd->cpudl, rq->curr, NULL)) + !cpudl_find(&rq->rd->cpudl, rq->donor, NULL)) return; /* @@ -2338,7 +2339,7 @@ static int balance_dl(struct rq *rq, struct task_struct *p, struct rq_flags *rf) static void wakeup_preempt_dl(struct rq *rq, struct task_struct *p, int flags) { - if (dl_entity_preempt(&p->dl, &rq->curr->dl)) { + if (dl_entity_preempt(&p->dl, &rq->donor->dl)) { resched_curr(rq); return; } @@ -2348,7 +2349,7 @@ static void wakeup_preempt_dl(struct rq *rq, struct task_struct *p, * In the unlikely case current and p have the same deadline * let us try to decide what's the best thing to do... 
*/ - if ((p->dl.deadline == rq->curr->dl.deadline) && + if ((p->dl.deadline == rq->donor->dl.deadline) && !test_tsk_need_resched(rq->curr)) check_preempt_equal_dl(rq, p); #endif /* CONFIG_SMP */ @@ -2380,7 +2381,7 @@ static void set_next_task_dl(struct rq *rq, struct task_struct *p, bool first) if (!first) return; - if (rq->curr->sched_class != &dl_sched_class) + if (rq->donor->sched_class != &dl_sched_class) update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 0); deadline_queue_push_tasks(rq); @@ -2487,14 +2488,6 @@ static void task_fork_dl(struct task_struct *p) /* Only try algorithms three times */ #define DL_MAX_TRIES 3 -static int pick_dl_task(struct rq *rq, struct task_struct *p, int cpu) -{ - if (!task_on_cpu(rq, p) && - cpumask_test_cpu(cpu, &p->cpus_mask)) - return 1; - return 0; -} - /* * Return the earliest pushable rq's task, which is suitable to be executed * on the CPU, NULL otherwise: @@ -2513,7 +2506,7 @@ next_node: if (next_node) { p = __node_2_pdl(next_node); - if (pick_dl_task(rq, p, cpu)) + if (task_is_pushable(rq, p, cpu)) return p; next_node = rb_next(next_node); @@ -2707,8 +2700,8 @@ retry: * can move away, it makes sense to just reschedule * without going further in pushing next_task. */ - if (dl_task(rq->curr) && - dl_time_before(next_task->dl.deadline, rq->curr->dl.deadline) && + if (dl_task(rq->donor) && + dl_time_before(next_task->dl.deadline, rq->donor->dl.deadline) && rq->curr->nr_cpus_allowed > 1) { resched_curr(rq); return 0; @@ -2751,9 +2744,7 @@ retry: goto retry; } - deactivate_task(rq, next_task, 0); - set_task_cpu(next_task, later_rq->cpu); - activate_task(later_rq, next_task, 0); + move_queued_task_locked(rq, later_rq, next_task); ret = 1; resched_curr(later_rq); @@ -2833,15 +2824,13 @@ static void pull_dl_task(struct rq *this_rq) * deadline than the current task of its runqueue. 
*/ if (dl_time_before(p->dl.deadline, - src_rq->curr->dl.deadline)) + src_rq->donor->dl.deadline)) goto skip; if (is_migration_disabled(p)) { push_task = get_push_task(src_rq); } else { - deactivate_task(src_rq, p, 0); - set_task_cpu(p, this_cpu); - activate_task(this_rq, p, 0); + move_queued_task_locked(src_rq, this_rq, p); dmin = p->dl.deadline; resched = true; } @@ -2874,9 +2863,9 @@ static void task_woken_dl(struct rq *rq, struct task_struct *p) if (!task_on_cpu(rq, p) && !test_tsk_need_resched(rq->curr) && p->nr_cpus_allowed > 1 && - dl_task(rq->curr) && + dl_task(rq->donor) && (rq->curr->nr_cpus_allowed < 2 || - !dl_entity_preempt(&p->dl, &rq->curr->dl))) { + !dl_entity_preempt(&p->dl, &rq->donor->dl))) { push_dl_tasks(rq); } } @@ -3051,12 +3040,12 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p) return; } - if (rq->curr != p) { + if (rq->donor != p) { #ifdef CONFIG_SMP if (p->nr_cpus_allowed > 1 && rq->dl.overloaded) deadline_queue_push_tasks(rq); #endif - if (dl_task(rq->curr)) + if (dl_task(rq->donor)) wakeup_preempt_dl(rq, p, 0); else resched_curr(rq); @@ -3085,7 +3074,7 @@ static void prio_changed_dl(struct rq *rq, struct task_struct *p, if (!rq->dl.overloaded) deadline_queue_pull_task(rq); - if (task_current(rq, p)) { + if (task_current_donor(rq, p)) { /* * If we now have a earlier deadline task than p, * then reschedule, provided p is still on this diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index f4035c7a0fa1..a48b2a701ec2 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -245,11 +245,12 @@ static ssize_t sched_dynamic_write(struct file *filp, const char __user *ubuf, static int sched_dynamic_show(struct seq_file *m, void *v) { static const char * preempt_modes[] = { - "none", "voluntary", "full" + "none", "voluntary", "full", "lazy", }; - int i; + int j = ARRAY_SIZE(preempt_modes) - !IS_ENABLED(CONFIG_ARCH_HAS_PREEMPT_LAZY); + int i = IS_ENABLED(CONFIG_PREEMPT_RT) * 2; - for (i = 0; i < ARRAY_SIZE(preempt_modes); i++) { + for (; i < j; i++) { if (preempt_dynamic_mode == i) seq_puts(m, "("); seq_puts(m, preempt_modes[i]); diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 751d73d500e5..7fff1d045477 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -199,8 +199,10 @@ struct scx_dump_ctx { /** * struct sched_ext_ops - Operation table for BPF scheduler implementation * - * Userland can implement an arbitrary scheduling policy by implementing and - * loading operations in this table. + * A BPF scheduler can implement an arbitrary scheduling policy by + * implementing and loading operations in this table. Note that a userland + * scheduling policy can also be implemented using the BPF scheduler + * as a shim layer. */ struct sched_ext_ops { /** @@ -218,10 +220,15 @@ struct sched_ext_ops { * dispatch. While an explicit custom mechanism can be added, * select_cpu() serves as the default way to wake up idle CPUs. * - * @p may be dispatched directly by calling scx_bpf_dispatch(). If @p - * is dispatched, the ops.enqueue() callback will be skipped. Finally, - * if @p is dispatched to SCX_DSQ_LOCAL, it will be dispatched to the - * local DSQ of whatever CPU is returned by this callback. + * @p may be inserted into a DSQ directly by calling + * scx_bpf_dsq_insert(). If so, the ops.enqueue() will be skipped. + * Directly inserting into %SCX_DSQ_LOCAL will put @p in the local DSQ + * of the CPU returned by this operation. 
+ * + * Note that select_cpu() is never called for tasks that can only run + * on a single CPU or tasks with migration disabled, as they don't have + * the option to select a different CPU. See select_task_rq() for + * details. */ s32 (*select_cpu)(struct task_struct *p, s32 prev_cpu, u64 wake_flags); @@ -230,12 +237,12 @@ struct sched_ext_ops { * @p: task being enqueued * @enq_flags: %SCX_ENQ_* * - * @p is ready to run. Dispatch directly by calling scx_bpf_dispatch() - * or enqueue on the BPF scheduler. If not directly dispatched, the bpf - * scheduler owns @p and if it fails to dispatch @p, the task will - * stall. + * @p is ready to run. Insert directly into a DSQ by calling + * scx_bpf_dsq_insert() or enqueue on the BPF scheduler. If not directly + * inserted, the bpf scheduler owns @p and if it fails to dispatch @p, + * the task will stall. * - * If @p was dispatched from ops.select_cpu(), this callback is + * If @p was inserted into a DSQ from ops.select_cpu(), this callback is * skipped. */ void (*enqueue)(struct task_struct *p, u64 enq_flags); @@ -257,17 +264,17 @@ struct sched_ext_ops { void (*dequeue)(struct task_struct *p, u64 deq_flags); /** - * dispatch - Dispatch tasks from the BPF scheduler and/or consume DSQs + * dispatch - Dispatch tasks from the BPF scheduler and/or user DSQs * @cpu: CPU to dispatch tasks for * @prev: previous task being switched out * * Called when a CPU's local dsq is empty. The operation should dispatch * one or more tasks from the BPF scheduler into the DSQs using - * scx_bpf_dispatch() and/or consume user DSQs into the local DSQ using - * scx_bpf_consume(). + * scx_bpf_dsq_insert() and/or move from user DSQs into the local DSQ + * using scx_bpf_dsq_move_to_local(). * - * The maximum number of times scx_bpf_dispatch() can be called without - * an intervening scx_bpf_consume() is specified by + * The maximum number of times scx_bpf_dsq_insert() can be called + * without an intervening scx_bpf_dsq_move_to_local() is specified by * ops.dispatch_max_batch. See the comments on top of the two functions * for more details. * @@ -275,7 +282,7 @@ struct sched_ext_ops { * @prev is still runnable as indicated by set %SCX_TASK_QUEUED in * @prev->scx.flags, it is not enqueued yet and will be enqueued after * ops.dispatch() returns. To keep executing @prev, return without - * dispatching or consuming any tasks. Also see %SCX_OPS_ENQ_LAST. + * dispatching or moving any tasks. Also see %SCX_OPS_ENQ_LAST. */ void (*dispatch)(s32 cpu, struct task_struct *prev); @@ -594,7 +601,7 @@ struct sched_ext_ops { * Update @tg's weight to @weight. */ void (*cgroup_set_weight)(struct cgroup *cgrp, u32 weight); -#endif /* CONFIG_CGROUPS */ +#endif /* CONFIG_EXT_GROUP_SCHED */ /* * All online ops must come before ops.cpu_online(). @@ -707,7 +714,7 @@ enum scx_enq_flags { /* * Set the following to trigger preemption when calling - * scx_bpf_dispatch() with a local dsq as the target. The slice of the + * scx_bpf_dsq_insert() with a local dsq as the target. The slice of the * current task is cleared to zero and the CPU is kicked into the * scheduling path. Implies %SCX_ENQ_HEAD. 
*/ @@ -862,8 +869,9 @@ static DEFINE_MUTEX(scx_ops_enable_mutex); DEFINE_STATIC_KEY_FALSE(__scx_ops_enabled); DEFINE_STATIC_PERCPU_RWSEM(scx_fork_rwsem); static atomic_t scx_ops_enable_state_var = ATOMIC_INIT(SCX_OPS_DISABLED); +static unsigned long scx_in_softlockup; +static atomic_t scx_ops_breather_depth = ATOMIC_INIT(0); static int scx_ops_bypass_depth; -static DEFINE_RAW_SPINLOCK(__scx_ops_bypass_lock); static bool scx_ops_init_task_enabled; static bool scx_switching_all; DEFINE_STATIC_KEY_FALSE(__scx_switched_all); @@ -876,6 +884,11 @@ static DEFINE_STATIC_KEY_FALSE(scx_ops_enq_exiting); static DEFINE_STATIC_KEY_FALSE(scx_ops_cpu_preempt); static DEFINE_STATIC_KEY_FALSE(scx_builtin_idle_enabled); +#ifdef CONFIG_SMP +static DEFINE_STATIC_KEY_FALSE(scx_selcpu_topo_llc); +static DEFINE_STATIC_KEY_FALSE(scx_selcpu_topo_numa); +#endif + static struct static_key_false scx_has_op[SCX_OPI_END] = { [0 ... SCX_OPI_END-1] = STATIC_KEY_FALSE_INIT }; @@ -2309,7 +2322,7 @@ static bool task_can_run_on_remote_rq(struct task_struct *p, struct rq *rq, /* * We don't require the BPF scheduler to avoid dispatching to offline * CPUs mostly for convenience but also because CPUs can go offline - * between scx_bpf_dispatch() calls and here. Trigger error iff the + * between scx_bpf_dsq_insert() calls and here. Trigger error iff the * picked CPU is outside the allowed mask. */ if (!task_allowed_on_cpu(p, cpu)) { @@ -2397,11 +2410,115 @@ static inline bool task_can_run_on_remote_rq(struct task_struct *p, struct rq *r static inline bool consume_remote_task(struct rq *this_rq, struct task_struct *p, struct scx_dispatch_q *dsq, struct rq *task_rq) { return false; } #endif /* CONFIG_SMP */ +/** + * move_task_between_dsqs() - Move a task from one DSQ to another + * @p: target task + * @enq_flags: %SCX_ENQ_* + * @src_dsq: DSQ @p is currently on, must not be a local DSQ + * @dst_dsq: DSQ @p is being moved to, can be any DSQ + * + * Must be called with @p's task_rq and @src_dsq locked. If @dst_dsq is a local + * DSQ and @p is on a different CPU, @p will be migrated and thus its task_rq + * will change. As @p's task_rq is locked, this function doesn't need to use the + * holding_cpu mechanism. + * + * On return, @src_dsq is unlocked and only @p's new task_rq, which is the + * return value, is locked. + */ +static struct rq *move_task_between_dsqs(struct task_struct *p, u64 enq_flags, + struct scx_dispatch_q *src_dsq, + struct scx_dispatch_q *dst_dsq) +{ + struct rq *src_rq = task_rq(p), *dst_rq; + + BUG_ON(src_dsq->id == SCX_DSQ_LOCAL); + lockdep_assert_held(&src_dsq->lock); + lockdep_assert_rq_held(src_rq); + + if (dst_dsq->id == SCX_DSQ_LOCAL) { + dst_rq = container_of(dst_dsq, struct rq, scx.local_dsq); + if (!task_can_run_on_remote_rq(p, dst_rq, true)) { + dst_dsq = find_global_dsq(p); + dst_rq = src_rq; + } + } else { + /* no need to migrate if destination is a non-local DSQ */ + dst_rq = src_rq; + } + + /* + * Move @p into $dst_dsq. If $dst_dsq is the local DSQ of a different + * CPU, @p will be migrated. + */ + if (dst_dsq->id == SCX_DSQ_LOCAL) { + /* @p is going from a non-local DSQ to a local DSQ */ + if (src_rq == dst_rq) { + task_unlink_from_dsq(p, src_dsq); + move_local_task_to_local_dsq(p, enq_flags, + src_dsq, dst_rq); + raw_spin_unlock(&src_dsq->lock); + } else { + raw_spin_unlock(&src_dsq->lock); + move_remote_task_to_local_dsq(p, enq_flags, + src_rq, dst_rq); + } + } else { + /* + * @p is going from a non-local DSQ to a non-local DSQ. As + * $src_dsq is already locked, do an abbreviated dequeue. 
+ */ + task_unlink_from_dsq(p, src_dsq); + p->scx.dsq = NULL; + raw_spin_unlock(&src_dsq->lock); + + dispatch_enqueue(dst_dsq, p, enq_flags); + } + + return dst_rq; +} + +/* + * A poorly behaving BPF scheduler can live-lock the system by e.g. incessantly + * banging on the same DSQ on a large NUMA system to the point where switching + * to the bypass mode can take a long time. Inject artifical delays while the + * bypass mode is switching to guarantee timely completion. + */ +static void scx_ops_breather(struct rq *rq) +{ + u64 until; + + lockdep_assert_rq_held(rq); + + if (likely(!atomic_read(&scx_ops_breather_depth))) + return; + + raw_spin_rq_unlock(rq); + + until = ktime_get_ns() + NSEC_PER_MSEC; + + do { + int cnt = 1024; + while (atomic_read(&scx_ops_breather_depth) && --cnt) + cpu_relax(); + } while (atomic_read(&scx_ops_breather_depth) && + time_before64(ktime_get_ns(), until)); + + raw_spin_rq_lock(rq); +} + static bool consume_dispatch_q(struct rq *rq, struct scx_dispatch_q *dsq) { struct task_struct *p; retry: /* + * This retry loop can repeatedly race against scx_ops_bypass() + * dequeueing tasks from @dsq trying to put the system into the bypass + * mode. On some multi-socket machines (e.g. 2x Intel 8480c), this can + * live-lock the machine into soft lockups. Give a breather. + */ + scx_ops_breather(rq); + + /* * The caller can't expect to successfully consume a task if the task's * addition to @dsq isn't guaranteed to be visible somehow. Test * @dsq->list without locking and skip if it seems empty. @@ -2541,7 +2658,7 @@ static void dispatch_to_local_dsq(struct rq *rq, struct scx_dispatch_q *dst_dsq, * Dispatching to local DSQs may need to wait for queueing to complete or * require rq lock dancing. As we don't wanna do either while inside * ops.dispatch() to avoid locking order inversion, we split dispatching into - * two parts. scx_bpf_dispatch() which is called by ops.dispatch() records the + * two parts. scx_bpf_dsq_insert() which is called by ops.dispatch() records the * task and its qseq. Once ops.dispatch() returns, this function is called to * finish up. * @@ -2573,7 +2690,7 @@ retry: /* * If qseq doesn't match, @p has gone through at least one * dispatch/dequeue and re-enqueue cycle between - * scx_bpf_dispatch() and here and we have no claim on it. + * scx_bpf_dsq_insert() and here and we have no claim on it. */ if ((opss & SCX_OPSS_QSEQ_MASK) != qseq_at_dispatch) return; @@ -2642,7 +2759,7 @@ static int balance_one(struct rq *rq, struct task_struct *prev) * If the previous sched_class for the current CPU was not SCX, * notify the BPF scheduler that it again has control of the * core. This callback complements ->cpu_release(), which is - * emitted in scx_next_task_picked(). + * emitted in switch_class(). */ if (SCX_HAS_OP(cpu_acquire)) SCX_CALL_OP(SCX_KF_REST, cpu_acquire, cpu_of(rq), NULL); @@ -3098,28 +3215,216 @@ found: goto retry; } +/* + * Return true if the LLC domains do not perfectly overlap with the NUMA + * domains, false otherwise. + */ +static bool llc_numa_mismatch(void) +{ + int cpu; + + /* + * We need to scan all online CPUs to verify whether their scheduling + * domains overlap. + * + * While it is rare to encounter architectures with asymmetric NUMA + * topologies, CPU hotplugging or virtualized environments can result + * in asymmetric configurations. 
+ * + * For example: + * + * NUMA 0: + * - LLC 0: cpu0..cpu7 + * - LLC 1: cpu8..cpu15 [offline] + * + * NUMA 1: + * - LLC 0: cpu16..cpu23 + * - LLC 1: cpu24..cpu31 + * + * In this case, if we only check the first online CPU (cpu0), we might + * incorrectly assume that the LLC and NUMA domains are fully + * overlapping, which is incorrect (as NUMA 1 has two distinct LLC + * domains). + */ + for_each_online_cpu(cpu) { + const struct cpumask *numa_cpus; + struct sched_domain *sd; + + sd = rcu_dereference(per_cpu(sd_llc, cpu)); + if (!sd) + return true; + + numa_cpus = cpumask_of_node(cpu_to_node(cpu)); + if (sd->span_weight != cpumask_weight(numa_cpus)) + return true; + } + + return false; +} + +/* + * Initialize topology-aware scheduling. + * + * Detect if the system has multiple LLC or multiple NUMA domains and enable + * cache-aware / NUMA-aware scheduling optimizations in the default CPU idle + * selection policy. + * + * Assumption: the kernel's internal topology representation assumes that each + * CPU belongs to a single LLC domain, and that each LLC domain is entirely + * contained within a single NUMA node. + */ +static void update_selcpu_topology(void) +{ + bool enable_llc = false, enable_numa = false; + struct sched_domain *sd; + const struct cpumask *cpus; + s32 cpu = cpumask_first(cpu_online_mask); + + /* + * Enable LLC domain optimization only when there are multiple LLC + * domains among the online CPUs. If all online CPUs are part of a + * single LLC domain, the idle CPU selection logic can choose any + * online CPU without bias. + * + * Note that it is sufficient to check the LLC domain of the first + * online CPU to determine whether a single LLC domain includes all + * CPUs. + */ + rcu_read_lock(); + sd = rcu_dereference(per_cpu(sd_llc, cpu)); + if (sd) { + if (sd->span_weight < num_online_cpus()) + enable_llc = true; + } + + /* + * Enable NUMA optimization only when there are multiple NUMA domains + * among the online CPUs and the NUMA domains don't perfectly overlaps + * with the LLC domains. + * + * If all CPUs belong to the same NUMA node and the same LLC domain, + * enabling both NUMA and LLC optimizations is unnecessary, as checking + * for an idle CPU in the same domain twice is redundant. + */ + cpus = cpumask_of_node(cpu_to_node(cpu)); + if ((cpumask_weight(cpus) < num_online_cpus()) && llc_numa_mismatch()) + enable_numa = true; + rcu_read_unlock(); + + pr_debug("sched_ext: LLC idle selection %s\n", + enable_llc ? "enabled" : "disabled"); + pr_debug("sched_ext: NUMA idle selection %s\n", + enable_numa ? "enabled" : "disabled"); + + if (enable_llc) + static_branch_enable_cpuslocked(&scx_selcpu_topo_llc); + else + static_branch_disable_cpuslocked(&scx_selcpu_topo_llc); + if (enable_numa) + static_branch_enable_cpuslocked(&scx_selcpu_topo_numa); + else + static_branch_disable_cpuslocked(&scx_selcpu_topo_numa); +} + +/* + * Built-in CPU idle selection policy: + * + * 1. Prioritize full-idle cores: + * - always prioritize CPUs from fully idle cores (both logical CPUs are + * idle) to avoid interference caused by SMT. + * + * 2. Reuse the same CPU: + * - prefer the last used CPU to take advantage of cached data (L1, L2) and + * branch prediction optimizations. + * + * 3. Pick a CPU within the same LLC (Last-Level Cache): + * - if the above conditions aren't met, pick a CPU that shares the same LLC + * to maintain cache locality. + * + * 4. Pick a CPU within the same NUMA node, if enabled: + * - choose a CPU from the same NUMA node to reduce memory access latency. 
+ * + * Step 3 and 4 are performed only if the system has, respectively, multiple + * LLC domains / multiple NUMA nodes (see scx_selcpu_topo_llc and + * scx_selcpu_topo_numa). + * + * NOTE: tasks that can only run on 1 CPU are excluded by this logic, because + * we never call ops.select_cpu() for them, see select_task_rq(). + */ static s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, bool *found) { + const struct cpumask *llc_cpus = NULL; + const struct cpumask *numa_cpus = NULL; s32 cpu; *found = false; + + /* + * This is necessary to protect llc_cpus. + */ + rcu_read_lock(); + + /* + * Determine the scheduling domain only if the task is allowed to run + * on all CPUs. + * + * This is done primarily for efficiency, as it avoids the overhead of + * updating a cpumask every time we need to select an idle CPU (which + * can be costly in large SMP systems), but it also aligns logically: + * if a task's scheduling domain is restricted by user-space (through + * CPU affinity), the task will simply use the flat scheduling domain + * defined by user-space. + */ + if (p->nr_cpus_allowed >= num_possible_cpus()) { + if (static_branch_maybe(CONFIG_NUMA, &scx_selcpu_topo_numa)) + numa_cpus = cpumask_of_node(cpu_to_node(prev_cpu)); + + if (static_branch_maybe(CONFIG_SCHED_MC, &scx_selcpu_topo_llc)) { + struct sched_domain *sd; + + sd = rcu_dereference(per_cpu(sd_llc, prev_cpu)); + if (sd) + llc_cpus = sched_domain_span(sd); + } + } + /* - * If WAKE_SYNC, the waker's local DSQ is empty, and the system is - * under utilized, wake up @p to the local DSQ of the waker. Checking - * only for an empty local DSQ is insufficient as it could give the - * wakee an unfair advantage when the system is oversaturated. - * Checking only for the presence of idle CPUs is also insufficient as - * the local DSQ of the waker could have tasks piled up on it even if - * there is an idle core elsewhere on the system. - */ - cpu = smp_processor_id(); - if ((wake_flags & SCX_WAKE_SYNC) && - !cpumask_empty(idle_masks.cpu) && !(current->flags & PF_EXITING) && - cpu_rq(cpu)->scx.local_dsq.nr == 0) { - if (cpumask_test_cpu(cpu, p->cpus_ptr)) + * If WAKE_SYNC, try to migrate the wakee to the waker's CPU. + */ + if (wake_flags & SCX_WAKE_SYNC) { + cpu = smp_processor_id(); + + /* + * If the waker's CPU is cache affine and prev_cpu is idle, + * then avoid a migration. + */ + if (cpus_share_cache(cpu, prev_cpu) && + test_and_clear_cpu_idle(prev_cpu)) { + cpu = prev_cpu; goto cpu_found; + } + + /* + * If the waker's local DSQ is empty, and the system is under + * utilized, try to wake up @p to the local DSQ of the waker. + * + * Checking only for an empty local DSQ is insufficient as it + * could give the wakee an unfair advantage when the system is + * oversaturated. + * + * Checking only for the presence of idle CPUs is also + * insufficient as the local DSQ of the waker could have tasks + * piled up on it even if there is an idle core elsewhere on + * the system. + */ + if (!cpumask_empty(idle_masks.cpu) && + !(current->flags & PF_EXITING) && + cpu_rq(cpu)->scx.local_dsq.nr == 0) { + if (cpumask_test_cpu(cpu, p->cpus_ptr)) + goto cpu_found; + } } /* @@ -3127,29 +3432,80 @@ static s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, * partially idle @prev_cpu. */ if (sched_smt_active()) { + /* + * Keep using @prev_cpu if it's part of a fully idle core. 
+ */ if (cpumask_test_cpu(prev_cpu, idle_masks.smt) && test_and_clear_cpu_idle(prev_cpu)) { cpu = prev_cpu; goto cpu_found; } + /* + * Search for any fully idle core in the same LLC domain. + */ + if (llc_cpus) { + cpu = scx_pick_idle_cpu(llc_cpus, SCX_PICK_IDLE_CORE); + if (cpu >= 0) + goto cpu_found; + } + + /* + * Search for any fully idle core in the same NUMA node. + */ + if (numa_cpus) { + cpu = scx_pick_idle_cpu(numa_cpus, SCX_PICK_IDLE_CORE); + if (cpu >= 0) + goto cpu_found; + } + + /* + * Search for any full idle core usable by the task. + */ cpu = scx_pick_idle_cpu(p->cpus_ptr, SCX_PICK_IDLE_CORE); if (cpu >= 0) goto cpu_found; } + /* + * Use @prev_cpu if it's idle. + */ if (test_and_clear_cpu_idle(prev_cpu)) { cpu = prev_cpu; goto cpu_found; } + /* + * Search for any idle CPU in the same LLC domain. + */ + if (llc_cpus) { + cpu = scx_pick_idle_cpu(llc_cpus, 0); + if (cpu >= 0) + goto cpu_found; + } + + /* + * Search for any idle CPU in the same NUMA node. + */ + if (numa_cpus) { + cpu = scx_pick_idle_cpu(numa_cpus, 0); + if (cpu >= 0) + goto cpu_found; + } + + /* + * Search for any idle CPU usable by the task. + */ cpu = scx_pick_idle_cpu(p->cpus_ptr, 0); if (cpu >= 0) goto cpu_found; + rcu_read_unlock(); return prev_cpu; cpu_found: + rcu_read_unlock(); + *found = true; return cpu; } @@ -3272,6 +3628,9 @@ static void handle_hotplug(struct rq *rq, bool online) atomic_long_inc(&scx_hotplug_seq); + if (scx_enabled()) + update_selcpu_topology(); + if (online && SCX_HAS_OP(cpu_online)) SCX_CALL_OP(SCX_KF_UNLOCKED, cpu_online, cpu); else if (!online && SCX_HAS_OP(cpu_offline)) @@ -3567,12 +3926,7 @@ static void scx_ops_exit_task(struct task_struct *p) void init_scx_entity(struct sched_ext_entity *scx) { - /* - * init_idle() calls this function again after fork sequence is - * complete. Don't touch ->tasks_node as it's already linked. - */ - memset(scx, 0, offsetof(struct sched_ext_entity, tasks_node)); - + memset(scx, 0, sizeof(*scx)); INIT_LIST_HEAD(&scx->dsq_list.node); RB_CLEAR_NODE(&scx->dsq_priq); scx->sticky_cpu = -1; @@ -4286,6 +4640,49 @@ bool task_should_scx(int policy) } /** + * scx_softlockup - sched_ext softlockup handler + * + * On some multi-socket setups (e.g. 2x Intel 8480c), the BPF scheduler can + * live-lock the system by making many CPUs target the same DSQ to the point + * where soft-lockup detection triggers. This function is called from + * soft-lockup watchdog when the triggering point is close and tries to unjam + * the system by enabling the breather and aborting the BPF scheduler. + */ +void scx_softlockup(u32 dur_s) +{ + switch (scx_ops_enable_state()) { + case SCX_OPS_ENABLING: + case SCX_OPS_ENABLED: + break; + default: + return; + } + + /* allow only one instance, cleared at the end of scx_ops_bypass() */ + if (test_and_set_bit(0, &scx_in_softlockup)) + return; + + printk_deferred(KERN_ERR "sched_ext: Soft lockup - CPU%d stuck for %us, disabling \"%s\"\n", + smp_processor_id(), dur_s, scx_ops.name); + + /* + * Some CPUs may be trapped in the dispatch paths. Enable breather + * immediately; otherwise, we might even be able to get to + * scx_ops_bypass(). 
+ */ + atomic_inc(&scx_ops_breather_depth); + + scx_ops_error("soft lockup - CPU#%d stuck for %us", + smp_processor_id(), dur_s); +} + +static void scx_clear_softlockup(void) +{ + if (test_and_clear_bit(0, &scx_in_softlockup)) + atomic_dec(&scx_ops_breather_depth); +} + +/** * scx_ops_bypass - [Un]bypass scx_ops and guarantee forward progress * * Bypassing guarantees that all runnable tasks make forward progress without @@ -4317,10 +4714,11 @@ bool task_should_scx(int policy) */ static void scx_ops_bypass(bool bypass) { + static DEFINE_RAW_SPINLOCK(bypass_lock); int cpu; unsigned long flags; - raw_spin_lock_irqsave(&__scx_ops_bypass_lock, flags); + raw_spin_lock_irqsave(&bypass_lock, flags); if (bypass) { scx_ops_bypass_depth++; WARN_ON_ONCE(scx_ops_bypass_depth <= 0); @@ -4333,6 +4731,8 @@ static void scx_ops_bypass(bool bypass) goto unlock; } + atomic_inc(&scx_ops_breather_depth); + /* * No task property is changing. We just need to make sure all currently * queued tasks are re-queued according to the new scx_rq_bypassing() @@ -4388,8 +4788,11 @@ static void scx_ops_bypass(bool bypass) /* resched to restore ticks and idle state */ resched_cpu(cpu); } + + atomic_dec(&scx_ops_breather_depth); unlock: - raw_spin_unlock_irqrestore(&__scx_ops_bypass_lock, flags); + raw_spin_unlock_irqrestore(&bypass_lock, flags); + scx_clear_softlockup(); } static void free_exit_info(struct scx_exit_info *ei) @@ -5100,6 +5503,9 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) static_branch_enable_cpuslocked(&scx_has_op[i]); check_hotplug_seq(ops); +#ifdef CONFIG_SMP + update_selcpu_topology(); +#endif cpus_read_unlock(); ret = validate_ops(ops); @@ -5307,67 +5713,7 @@ err_disable: #include <linux/bpf.h> #include <linux/btf.h> -extern struct btf *btf_vmlinux; static const struct btf_type *task_struct_type; -static u32 task_struct_type_id; - -static bool set_arg_maybe_null(const char *op, int arg_n, int off, int size, - enum bpf_access_type type, - const struct bpf_prog *prog, - struct bpf_insn_access_aux *info) -{ - struct btf *btf = bpf_get_btf_vmlinux(); - const struct bpf_struct_ops_desc *st_ops_desc; - const struct btf_member *member; - const struct btf_type *t; - u32 btf_id, member_idx; - const char *mname; - - /* struct_ops op args are all sequential, 64-bit numbers */ - if (off != arg_n * sizeof(__u64)) - return false; - - /* btf_id should be the type id of struct sched_ext_ops */ - btf_id = prog->aux->attach_btf_id; - st_ops_desc = bpf_struct_ops_find(btf, btf_id); - if (!st_ops_desc) - return false; - - /* BTF type of struct sched_ext_ops */ - t = st_ops_desc->type; - - member_idx = prog->expected_attach_type; - if (member_idx >= btf_type_vlen(t)) - return false; - - /* - * Get the member name of this struct_ops program, which corresponds to - * a field in struct sched_ext_ops. For example, the member name of the - * dispatch struct_ops program (callback) is "dispatch". - */ - member = &btf_type_member(t)[member_idx]; - mname = btf_name_by_offset(btf_vmlinux, member->name_off); - - if (!strcmp(mname, op)) { - /* - * The value is a pointer to a type (struct task_struct) given - * by a BTF ID (PTR_TO_BTF_ID). It is trusted (PTR_TRUSTED), - * however, can be a NULL (PTR_MAYBE_NULL). The BPF program - * should check the pointer to make sure it is not NULL before - * using it, or the verifier will reject the program. - * - * Longer term, this is something that should be addressed by - * BTF, and be fully contained within the verifier. 
- */ - info->reg_type = PTR_MAYBE_NULL | PTR_TO_BTF_ID | PTR_TRUSTED; - info->btf = btf_vmlinux; - info->btf_id = task_struct_type_id; - - return true; - } - - return false; -} static bool bpf_scx_is_valid_access(int off, int size, enum bpf_access_type type, @@ -5376,9 +5722,6 @@ static bool bpf_scx_is_valid_access(int off, int size, { if (type != BPF_READ) return false; - if (set_arg_maybe_null("dispatch", 1, off, size, type, prog, info) || - set_arg_maybe_null("yield", 1, off, size, type, prog, info)) - return true; if (off < 0 || off >= sizeof(__u64) * MAX_BPF_FUNC_ARGS) return false; if (off % size != 0) @@ -5513,13 +5856,7 @@ static void bpf_scx_unreg(void *kdata, struct bpf_link *link) static int bpf_scx_init(struct btf *btf) { - s32 type_id; - - type_id = btf_find_by_name_kind(btf, "task_struct", BTF_KIND_STRUCT); - if (type_id < 0) - return -EINVAL; - task_struct_type = btf_type_by_id(btf, type_id); - task_struct_type_id = type_id; + task_struct_type = btf_type_by_id(btf, btf_tracing_ids[BTF_TRACING_TYPE_TASK]); return 0; } @@ -5541,78 +5878,78 @@ static int bpf_scx_validate(void *kdata) return 0; } -static s32 select_cpu_stub(struct task_struct *p, s32 prev_cpu, u64 wake_flags) { return -EINVAL; } -static void enqueue_stub(struct task_struct *p, u64 enq_flags) {} -static void dequeue_stub(struct task_struct *p, u64 enq_flags) {} -static void dispatch_stub(s32 prev_cpu, struct task_struct *p) {} -static void tick_stub(struct task_struct *p) {} -static void runnable_stub(struct task_struct *p, u64 enq_flags) {} -static void running_stub(struct task_struct *p) {} -static void stopping_stub(struct task_struct *p, bool runnable) {} -static void quiescent_stub(struct task_struct *p, u64 deq_flags) {} -static bool yield_stub(struct task_struct *from, struct task_struct *to) { return false; } -static bool core_sched_before_stub(struct task_struct *a, struct task_struct *b) { return false; } -static void set_weight_stub(struct task_struct *p, u32 weight) {} -static void set_cpumask_stub(struct task_struct *p, const struct cpumask *mask) {} -static void update_idle_stub(s32 cpu, bool idle) {} -static void cpu_acquire_stub(s32 cpu, struct scx_cpu_acquire_args *args) {} -static void cpu_release_stub(s32 cpu, struct scx_cpu_release_args *args) {} -static s32 init_task_stub(struct task_struct *p, struct scx_init_task_args *args) { return -EINVAL; } -static void exit_task_stub(struct task_struct *p, struct scx_exit_task_args *args) {} -static void enable_stub(struct task_struct *p) {} -static void disable_stub(struct task_struct *p) {} +static s32 sched_ext_ops__select_cpu(struct task_struct *p, s32 prev_cpu, u64 wake_flags) { return -EINVAL; } +static void sched_ext_ops__enqueue(struct task_struct *p, u64 enq_flags) {} +static void sched_ext_ops__dequeue(struct task_struct *p, u64 enq_flags) {} +static void sched_ext_ops__dispatch(s32 prev_cpu, struct task_struct *prev__nullable) {} +static void sched_ext_ops__tick(struct task_struct *p) {} +static void sched_ext_ops__runnable(struct task_struct *p, u64 enq_flags) {} +static void sched_ext_ops__running(struct task_struct *p) {} +static void sched_ext_ops__stopping(struct task_struct *p, bool runnable) {} +static void sched_ext_ops__quiescent(struct task_struct *p, u64 deq_flags) {} +static bool sched_ext_ops__yield(struct task_struct *from, struct task_struct *to__nullable) { return false; } +static bool sched_ext_ops__core_sched_before(struct task_struct *a, struct task_struct *b) { return false; } +static void sched_ext_ops__set_weight(struct 
task_struct *p, u32 weight) {} +static void sched_ext_ops__set_cpumask(struct task_struct *p, const struct cpumask *mask) {} +static void sched_ext_ops__update_idle(s32 cpu, bool idle) {} +static void sched_ext_ops__cpu_acquire(s32 cpu, struct scx_cpu_acquire_args *args) {} +static void sched_ext_ops__cpu_release(s32 cpu, struct scx_cpu_release_args *args) {} +static s32 sched_ext_ops__init_task(struct task_struct *p, struct scx_init_task_args *args) { return -EINVAL; } +static void sched_ext_ops__exit_task(struct task_struct *p, struct scx_exit_task_args *args) {} +static void sched_ext_ops__enable(struct task_struct *p) {} +static void sched_ext_ops__disable(struct task_struct *p) {} #ifdef CONFIG_EXT_GROUP_SCHED -static s32 cgroup_init_stub(struct cgroup *cgrp, struct scx_cgroup_init_args *args) { return -EINVAL; } -static void cgroup_exit_stub(struct cgroup *cgrp) {} -static s32 cgroup_prep_move_stub(struct task_struct *p, struct cgroup *from, struct cgroup *to) { return -EINVAL; } -static void cgroup_move_stub(struct task_struct *p, struct cgroup *from, struct cgroup *to) {} -static void cgroup_cancel_move_stub(struct task_struct *p, struct cgroup *from, struct cgroup *to) {} -static void cgroup_set_weight_stub(struct cgroup *cgrp, u32 weight) {} +static s32 sched_ext_ops__cgroup_init(struct cgroup *cgrp, struct scx_cgroup_init_args *args) { return -EINVAL; } +static void sched_ext_ops__cgroup_exit(struct cgroup *cgrp) {} +static s32 sched_ext_ops__cgroup_prep_move(struct task_struct *p, struct cgroup *from, struct cgroup *to) { return -EINVAL; } +static void sched_ext_ops__cgroup_move(struct task_struct *p, struct cgroup *from, struct cgroup *to) {} +static void sched_ext_ops__cgroup_cancel_move(struct task_struct *p, struct cgroup *from, struct cgroup *to) {} +static void sched_ext_ops__cgroup_set_weight(struct cgroup *cgrp, u32 weight) {} #endif -static void cpu_online_stub(s32 cpu) {} -static void cpu_offline_stub(s32 cpu) {} -static s32 init_stub(void) { return -EINVAL; } -static void exit_stub(struct scx_exit_info *info) {} -static void dump_stub(struct scx_dump_ctx *ctx) {} -static void dump_cpu_stub(struct scx_dump_ctx *ctx, s32 cpu, bool idle) {} -static void dump_task_stub(struct scx_dump_ctx *ctx, struct task_struct *p) {} +static void sched_ext_ops__cpu_online(s32 cpu) {} +static void sched_ext_ops__cpu_offline(s32 cpu) {} +static s32 sched_ext_ops__init(void) { return -EINVAL; } +static void sched_ext_ops__exit(struct scx_exit_info *info) {} +static void sched_ext_ops__dump(struct scx_dump_ctx *ctx) {} +static void sched_ext_ops__dump_cpu(struct scx_dump_ctx *ctx, s32 cpu, bool idle) {} +static void sched_ext_ops__dump_task(struct scx_dump_ctx *ctx, struct task_struct *p) {} static struct sched_ext_ops __bpf_ops_sched_ext_ops = { - .select_cpu = select_cpu_stub, - .enqueue = enqueue_stub, - .dequeue = dequeue_stub, - .dispatch = dispatch_stub, - .tick = tick_stub, - .runnable = runnable_stub, - .running = running_stub, - .stopping = stopping_stub, - .quiescent = quiescent_stub, - .yield = yield_stub, - .core_sched_before = core_sched_before_stub, - .set_weight = set_weight_stub, - .set_cpumask = set_cpumask_stub, - .update_idle = update_idle_stub, - .cpu_acquire = cpu_acquire_stub, - .cpu_release = cpu_release_stub, - .init_task = init_task_stub, - .exit_task = exit_task_stub, - .enable = enable_stub, - .disable = disable_stub, + .select_cpu = sched_ext_ops__select_cpu, + .enqueue = sched_ext_ops__enqueue, + .dequeue = sched_ext_ops__dequeue, + .dispatch = 
sched_ext_ops__dispatch, + .tick = sched_ext_ops__tick, + .runnable = sched_ext_ops__runnable, + .running = sched_ext_ops__running, + .stopping = sched_ext_ops__stopping, + .quiescent = sched_ext_ops__quiescent, + .yield = sched_ext_ops__yield, + .core_sched_before = sched_ext_ops__core_sched_before, + .set_weight = sched_ext_ops__set_weight, + .set_cpumask = sched_ext_ops__set_cpumask, + .update_idle = sched_ext_ops__update_idle, + .cpu_acquire = sched_ext_ops__cpu_acquire, + .cpu_release = sched_ext_ops__cpu_release, + .init_task = sched_ext_ops__init_task, + .exit_task = sched_ext_ops__exit_task, + .enable = sched_ext_ops__enable, + .disable = sched_ext_ops__disable, #ifdef CONFIG_EXT_GROUP_SCHED - .cgroup_init = cgroup_init_stub, - .cgroup_exit = cgroup_exit_stub, - .cgroup_prep_move = cgroup_prep_move_stub, - .cgroup_move = cgroup_move_stub, - .cgroup_cancel_move = cgroup_cancel_move_stub, - .cgroup_set_weight = cgroup_set_weight_stub, + .cgroup_init = sched_ext_ops__cgroup_init, + .cgroup_exit = sched_ext_ops__cgroup_exit, + .cgroup_prep_move = sched_ext_ops__cgroup_prep_move, + .cgroup_move = sched_ext_ops__cgroup_move, + .cgroup_cancel_move = sched_ext_ops__cgroup_cancel_move, + .cgroup_set_weight = sched_ext_ops__cgroup_set_weight, #endif - .cpu_online = cpu_online_stub, - .cpu_offline = cpu_offline_stub, - .init = init_stub, - .exit = exit_stub, - .dump = dump_stub, - .dump_cpu = dump_cpu_stub, - .dump_task = dump_task_stub, + .cpu_online = sched_ext_ops__cpu_online, + .cpu_offline = sched_ext_ops__cpu_offline, + .init = sched_ext_ops__init, + .exit = sched_ext_ops__exit, + .dump = sched_ext_ops__dump, + .dump_cpu = sched_ext_ops__dump_cpu, + .dump_task = sched_ext_ops__dump_task, }; static struct bpf_struct_ops bpf_sched_ext_ops = { @@ -5759,7 +6096,7 @@ static void kick_cpus_irq_workfn(struct irq_work *irq_work) if (cpu != cpu_of(this_rq)) { /* * Pairs with smp_store_release() issued by this CPU in - * scx_next_task_picked() on the resched path. + * switch_class() on the resched path. * * We busy-wait here to guarantee that no other task can * be scheduled on our core before the target CPU has @@ -5944,7 +6281,7 @@ static const struct btf_kfunc_id_set scx_kfunc_set_select_cpu = { .set = &scx_kfunc_ids_select_cpu, }; -static bool scx_dispatch_preamble(struct task_struct *p, u64 enq_flags) +static bool scx_dsq_insert_preamble(struct task_struct *p, u64 enq_flags) { if (!scx_kf_allowed(SCX_KF_ENQUEUE | SCX_KF_DISPATCH)) return false; @@ -5964,7 +6301,8 @@ static bool scx_dispatch_preamble(struct task_struct *p, u64 enq_flags) return true; } -static void scx_dispatch_commit(struct task_struct *p, u64 dsq_id, u64 enq_flags) +static void scx_dsq_insert_commit(struct task_struct *p, u64 dsq_id, + u64 enq_flags) { struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx); struct task_struct *ddsp_task; @@ -5991,14 +6329,14 @@ static void scx_dispatch_commit(struct task_struct *p, u64 dsq_id, u64 enq_flags __bpf_kfunc_start_defs(); /** - * scx_bpf_dispatch - Dispatch a task into the FIFO queue of a DSQ - * @p: task_struct to dispatch - * @dsq_id: DSQ to dispatch to + * scx_bpf_dsq_insert - Insert a task into the FIFO queue of a DSQ + * @p: task_struct to insert + * @dsq_id: DSQ to insert into * @slice: duration @p can run for in nsecs, 0 to keep the current value * @enq_flags: SCX_ENQ_* * - * Dispatch @p into the FIFO queue of the DSQ identified by @dsq_id. It is safe - * to call this function spuriously. 
Can be called from ops.enqueue(),
+ * ops.select_cpu(), and ops.dispatch().
  *
  * When called from ops.select_cpu() or ops.enqueue(), it's for direct dispatch
@@ -6007,14 +6345,14 @@ __bpf_kfunc_start_defs();
  * ops.select_cpu() to be on the target CPU in the first place.
  *
  * When called from ops.select_cpu(), @enq_flags and @dsq_id are stored, and @p
- * will be directly dispatched to the corresponding dispatch queue after
- * ops.select_cpu() returns. If @p is dispatched to SCX_DSQ_LOCAL, it will be
- * dispatched to the local DSQ of the CPU returned by ops.select_cpu().
+ * will be directly inserted into the corresponding dispatch queue after
+ * ops.select_cpu() returns. If @p is inserted into SCX_DSQ_LOCAL, it will be
+ * inserted into the local DSQ of the CPU returned by ops.select_cpu().
  * @enq_flags are OR'd with the enqueue flags on the enqueue path before the
- * task is dispatched.
+ * task is inserted.
  *
  * When called from ops.dispatch(), there are no restrictions on @p or @dsq_id
- * and this function can be called upto ops.dispatch_max_batch times to dispatch
+ * and this function can be called up to ops.dispatch_max_batch times to insert
  * multiple tasks. scx_bpf_dispatch_nr_slots() returns the number of the
  * remaining slots. scx_bpf_consume() flushes the batch and resets the counter.
  *
@@ -6026,10 +6364,10 @@ __bpf_kfunc_start_defs();
  * %SCX_SLICE_INF, @p never expires and the BPF scheduler must kick the CPU with
  * scx_bpf_kick_cpu() to trigger scheduling.
  */
-__bpf_kfunc void scx_bpf_dispatch(struct task_struct *p, u64 dsq_id, u64 slice,
-				  u64 enq_flags)
+__bpf_kfunc void scx_bpf_dsq_insert(struct task_struct *p, u64 dsq_id, u64 slice,
+				    u64 enq_flags)
 {
-	if (!scx_dispatch_preamble(p, enq_flags))
+	if (!scx_dsq_insert_preamble(p, enq_flags))
 		return;
 
 	if (slice)
@@ -6037,30 +6375,42 @@ __bpf_kfunc void scx_bpf_dispatch(struct task_struct *p, u64 dsq_id, u64 slice,
 	else
 		p->scx.slice = p->scx.slice ?: 1;
 
-	scx_dispatch_commit(p, dsq_id, enq_flags);
+	scx_dsq_insert_commit(p, dsq_id, enq_flags);
+}
+
+/* for backward compatibility, will be removed in v6.15 */
+__bpf_kfunc void scx_bpf_dispatch(struct task_struct *p, u64 dsq_id, u64 slice,
+				  u64 enq_flags)
+{
+	printk_deferred_once(KERN_WARNING "sched_ext: scx_bpf_dispatch() renamed to scx_bpf_dsq_insert()");
+	scx_bpf_dsq_insert(p, dsq_id, slice, enq_flags);
 }
 
 /**
- * scx_bpf_dispatch_vtime - Dispatch a task into the vtime priority queue of a DSQ
- * @p: task_struct to dispatch
- * @dsq_id: DSQ to dispatch to
+ * scx_bpf_dsq_insert_vtime - Insert a task into the vtime priority queue of a DSQ
+ * @p: task_struct to insert
+ * @dsq_id: DSQ to insert into
  * @slice: duration @p can run for in nsecs, 0 to keep the current value
  * @vtime: @p's ordering inside the vtime-sorted queue of the target DSQ
  * @enq_flags: SCX_ENQ_*
  *
- * Dispatch @p into the vtime priority queue of the DSQ identified by @dsq_id.
- * Tasks queued into the priority queue are ordered by @vtime and always
- * consumed after the tasks in the FIFO queue. All other aspects are identical
- * to scx_bpf_dispatch().
+ * Insert @p into the vtime priority queue of the DSQ identified by @dsq_id.
+ * Tasks queued into the priority queue are ordered by @vtime. All other aspects
+ * are identical to scx_bpf_dsq_insert().
 *
 * @vtime ordering is according to time_before64() which considers wrapping.
A * numerically larger vtime may indicate an earlier position in the ordering and * vice-versa. + * + * A DSQ can only be used as a FIFO or priority queue at any given time and this + * function must not be called on a DSQ which already has one or more FIFO tasks + * queued and vice-versa. Also, the built-in DSQs (SCX_DSQ_LOCAL and + * SCX_DSQ_GLOBAL) cannot be used as priority queues. */ -__bpf_kfunc void scx_bpf_dispatch_vtime(struct task_struct *p, u64 dsq_id, - u64 slice, u64 vtime, u64 enq_flags) +__bpf_kfunc void scx_bpf_dsq_insert_vtime(struct task_struct *p, u64 dsq_id, + u64 slice, u64 vtime, u64 enq_flags) { - if (!scx_dispatch_preamble(p, enq_flags)) + if (!scx_dsq_insert_preamble(p, enq_flags)) return; if (slice) @@ -6070,12 +6420,22 @@ __bpf_kfunc void scx_bpf_dispatch_vtime(struct task_struct *p, u64 dsq_id, p->scx.dsq_vtime = vtime; - scx_dispatch_commit(p, dsq_id, enq_flags | SCX_ENQ_DSQ_PRIQ); + scx_dsq_insert_commit(p, dsq_id, enq_flags | SCX_ENQ_DSQ_PRIQ); +} + +/* for backward compatibility, will be removed in v6.15 */ +__bpf_kfunc void scx_bpf_dispatch_vtime(struct task_struct *p, u64 dsq_id, + u64 slice, u64 vtime, u64 enq_flags) +{ + printk_deferred_once(KERN_WARNING "sched_ext: scx_bpf_dispatch_vtime() renamed to scx_bpf_dsq_insert_vtime()"); + scx_bpf_dsq_insert_vtime(p, dsq_id, slice, vtime, enq_flags); } __bpf_kfunc_end_defs(); BTF_KFUNCS_START(scx_kfunc_ids_enqueue_dispatch) +BTF_ID_FLAGS(func, scx_bpf_dsq_insert, KF_RCU) +BTF_ID_FLAGS(func, scx_bpf_dsq_insert_vtime, KF_RCU) BTF_ID_FLAGS(func, scx_bpf_dispatch, KF_RCU) BTF_ID_FLAGS(func, scx_bpf_dispatch_vtime, KF_RCU) BTF_KFUNCS_END(scx_kfunc_ids_enqueue_dispatch) @@ -6085,12 +6445,11 @@ static const struct btf_kfunc_id_set scx_kfunc_set_enqueue_dispatch = { .set = &scx_kfunc_ids_enqueue_dispatch, }; -static bool scx_dispatch_from_dsq(struct bpf_iter_scx_dsq_kern *kit, - struct task_struct *p, u64 dsq_id, - u64 enq_flags) +static bool scx_dsq_move(struct bpf_iter_scx_dsq_kern *kit, + struct task_struct *p, u64 dsq_id, u64 enq_flags) { struct scx_dispatch_q *src_dsq = kit->dsq, *dst_dsq; - struct rq *this_rq, *src_rq, *dst_rq, *locked_rq; + struct rq *this_rq, *src_rq, *locked_rq; bool dispatched = false; bool in_balance; unsigned long flags; @@ -6118,6 +6477,13 @@ static bool scx_dispatch_from_dsq(struct bpf_iter_scx_dsq_kern *kit, raw_spin_rq_lock(src_rq); } + /* + * If the BPF scheduler keeps calling this function repeatedly, it can + * cause similar live-lock conditions as consume_dispatch_q(). Insert a + * breather if necessary. + */ + scx_ops_breather(src_rq); + locked_rq = src_rq; raw_spin_lock(&src_dsq->lock); @@ -6136,51 +6502,18 @@ static bool scx_dispatch_from_dsq(struct bpf_iter_scx_dsq_kern *kit, /* @p is still on $src_dsq and stable, determine the destination */ dst_dsq = find_dsq_for_dispatch(this_rq, dsq_id, p); - if (dst_dsq->id == SCX_DSQ_LOCAL) { - dst_rq = container_of(dst_dsq, struct rq, scx.local_dsq); - if (!task_can_run_on_remote_rq(p, dst_rq, true)) { - dst_dsq = find_global_dsq(p); - dst_rq = src_rq; - } - } else { - /* no need to migrate if destination is a non-local DSQ */ - dst_rq = src_rq; - } - /* - * Move @p into $dst_dsq. If $dst_dsq is the local DSQ of a different - * CPU, @p will be migrated. + * Apply vtime and slice updates before moving so that the new time is + * visible before inserting into $dst_dsq. @p is still on $src_dsq but + * this is safe as we're locking it. 
*/ - if (dst_dsq->id == SCX_DSQ_LOCAL) { - /* @p is going from a non-local DSQ to a local DSQ */ - if (src_rq == dst_rq) { - task_unlink_from_dsq(p, src_dsq); - move_local_task_to_local_dsq(p, enq_flags, - src_dsq, dst_rq); - raw_spin_unlock(&src_dsq->lock); - } else { - raw_spin_unlock(&src_dsq->lock); - move_remote_task_to_local_dsq(p, enq_flags, - src_rq, dst_rq); - locked_rq = dst_rq; - } - } else { - /* - * @p is going from a non-local DSQ to a non-local DSQ. As - * $src_dsq is already locked, do an abbreviated dequeue. - */ - task_unlink_from_dsq(p, src_dsq); - p->scx.dsq = NULL; - raw_spin_unlock(&src_dsq->lock); - - if (kit->cursor.flags & __SCX_DSQ_ITER_HAS_VTIME) - p->scx.dsq_vtime = kit->vtime; - dispatch_enqueue(dst_dsq, p, enq_flags); - } - + if (kit->cursor.flags & __SCX_DSQ_ITER_HAS_VTIME) + p->scx.dsq_vtime = kit->vtime; if (kit->cursor.flags & __SCX_DSQ_ITER_HAS_SLICE) p->scx.slice = kit->slice; + /* execute move */ + locked_rq = move_task_between_dsqs(p, enq_flags, src_dsq, dst_dsq); dispatched = true; out: if (in_balance) { @@ -6232,21 +6565,20 @@ __bpf_kfunc void scx_bpf_dispatch_cancel(void) } /** - * scx_bpf_consume - Transfer a task from a DSQ to the current CPU's local DSQ - * @dsq_id: DSQ to consume + * scx_bpf_dsq_move_to_local - move a task from a DSQ to the current CPU's local DSQ + * @dsq_id: DSQ to move task from * - * Consume a task from the non-local DSQ identified by @dsq_id and transfer it - * to the current CPU's local DSQ for execution. Can only be called from - * ops.dispatch(). + * Move a task from the non-local DSQ identified by @dsq_id to the current CPU's + * local DSQ for execution. Can only be called from ops.dispatch(). * - * This function flushes the in-flight dispatches from scx_bpf_dispatch() before - * trying to consume the specified DSQ. It may also grab rq locks and thus can't - * be called under any BPF locks. + * This function flushes the in-flight dispatches from scx_bpf_dsq_insert() + * before trying to move from the specified DSQ. It may also grab rq locks and + * thus can't be called under any BPF locks. * - * Returns %true if a task has been consumed, %false if there isn't any task to - * consume. + * Returns %true if a task has been moved, %false if there isn't any task to + * move. */ -__bpf_kfunc bool scx_bpf_consume(u64 dsq_id) +__bpf_kfunc bool scx_bpf_dsq_move_to_local(u64 dsq_id) { struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx); struct scx_dispatch_q *dsq; @@ -6276,17 +6608,24 @@ __bpf_kfunc bool scx_bpf_consume(u64 dsq_id) } } +/* for backward compatibility, will be removed in v6.15 */ +__bpf_kfunc bool scx_bpf_consume(u64 dsq_id) +{ + printk_deferred_once(KERN_WARNING "sched_ext: scx_bpf_consume() renamed to scx_bpf_dsq_move_to_local()"); + return scx_bpf_dsq_move_to_local(dsq_id); +} + /** - * scx_bpf_dispatch_from_dsq_set_slice - Override slice when dispatching from DSQ + * scx_bpf_dsq_move_set_slice - Override slice when moving between DSQs * @it__iter: DSQ iterator in progress - * @slice: duration the dispatched task can run for in nsecs + * @slice: duration the moved task can run for in nsecs * - * Override the slice of the next task that will be dispatched from @it__iter - * using scx_bpf_dispatch_from_dsq[_vtime](). If this function is not called, - * the previous slice duration is kept. + * Override the slice of the next task that will be moved from @it__iter using + * scx_bpf_dsq_move[_vtime](). If this function is not called, the previous + * slice duration is kept. 
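+ *
+ * For example, from inside a DSQ iteration, a sketch (BPF_FOR_EACH_ITER
+ * names the in-progress iterator as in the scx example schedulers; the
+ * 5ms slice value is arbitrary):
+ *
+ *	scx_bpf_dsq_move_set_slice(BPF_FOR_EACH_ITER, 5000000);
+ *	scx_bpf_dsq_move(BPF_FOR_EACH_ITER, p, dst_dsq_id, 0);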
 */
-__bpf_kfunc void scx_bpf_dispatch_from_dsq_set_slice(
-	struct bpf_iter_scx_dsq *it__iter, u64 slice)
+__bpf_kfunc void scx_bpf_dsq_move_set_slice(struct bpf_iter_scx_dsq *it__iter,
+					    u64 slice)
 {
 	struct bpf_iter_scx_dsq_kern *kit = (void *)it__iter;
 
@@ -6294,18 +6633,26 @@ __bpf_kfunc void scx_bpf_dispatch_from_dsq_set_slice(
 	kit->cursor.flags |= __SCX_DSQ_ITER_HAS_SLICE;
 }
 
+/* for backward compatibility, will be removed in v6.15 */
+__bpf_kfunc void scx_bpf_dispatch_from_dsq_set_slice(
+	struct bpf_iter_scx_dsq *it__iter, u64 slice)
+{
+	printk_deferred_once(KERN_WARNING "sched_ext: scx_bpf_dispatch_from_dsq_set_slice() renamed to scx_bpf_dsq_move_set_slice()");
+	scx_bpf_dsq_move_set_slice(it__iter, slice);
+}
+
 /**
- * scx_bpf_dispatch_from_dsq_set_vtime - Override vtime when dispatching from DSQ
+ * scx_bpf_dsq_move_set_vtime - Override vtime when moving between DSQs
  * @it__iter: DSQ iterator in progress
  * @vtime: task's ordering inside the vtime-sorted queue of the target DSQ
  *
- * Override the vtime of the next task that will be dispatched from @it__iter
- * using scx_bpf_dispatch_from_dsq_vtime(). If this function is not called, the
- * previous slice vtime is kept. If scx_bpf_dispatch_from_dsq() is used to
- * dispatch the next task, the override is ignored and cleared.
+ * Override the vtime of the next task that will be moved from @it__iter using
+ * scx_bpf_dsq_move_vtime(). If this function is not called, the previous
+ * vtime is kept. If scx_bpf_dsq_move() is used to move the next task, the
+ * override is ignored and cleared.
 */
-__bpf_kfunc void scx_bpf_dispatch_from_dsq_set_vtime(
-	struct bpf_iter_scx_dsq *it__iter, u64 vtime)
+__bpf_kfunc void scx_bpf_dsq_move_set_vtime(struct bpf_iter_scx_dsq *it__iter,
+					    u64 vtime)
 {
 	struct bpf_iter_scx_dsq_kern *kit = (void *)it__iter;
 
@@ -6313,8 +6660,16 @@ __bpf_kfunc void scx_bpf_dispatch_from_dsq_set_vtime(
 	kit->cursor.flags |= __SCX_DSQ_ITER_HAS_VTIME;
 }
 
+/* for backward compatibility, will be removed in v6.15 */
+__bpf_kfunc void scx_bpf_dispatch_from_dsq_set_vtime(
+	struct bpf_iter_scx_dsq *it__iter, u64 vtime)
+{
+	printk_deferred_once(KERN_WARNING "sched_ext: scx_bpf_dispatch_from_dsq_set_vtime() renamed to scx_bpf_dsq_move_set_vtime()");
+	scx_bpf_dsq_move_set_vtime(it__iter, vtime);
+}
+
 /**
- * scx_bpf_dispatch_from_dsq - Move a task from DSQ iteration to a DSQ
+ * scx_bpf_dsq_move - Move a task from DSQ iteration to a DSQ
  * @it__iter: DSQ iterator in progress
  * @p: task to transfer
  * @dsq_id: DSQ to move @p to
@@ -6329,8 +6684,7 @@ __bpf_kfunc void scx_bpf_dispatch_from_dsq_set_vtime(
  * @p was obtained from the DSQ iteration. @p just has to be on the DSQ and have
  * been queued before the iteration started.
  *
- * @p's slice is kept by default. Use scx_bpf_dispatch_from_dsq_set_slice() to
- * update.
+ * @p's slice is kept by default. Use scx_bpf_dsq_move_set_slice() to update.
 *
 * Can be called from ops.dispatch() or any BPF context which doesn't hold a rq
 * lock (e.g. BPF timers or SYSCALL programs).
@@ -6338,16 +6692,25 @@ __bpf_kfunc void scx_bpf_dispatch_from_dsq_set_vtime(
 * Returns %true if @p has been consumed, %false if @p had already been consumed
 * or dequeued.
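+ *
+ * A minimal sketch (the DSQ ids and the should_migrate() predicate are
+ * illustrative; bpf_for_each(scx_dsq, ...) is the iterator this kfunc
+ * pairs with):
+ *
+ *	struct task_struct *p;
+ *
+ *	bpf_for_each(scx_dsq, p, src_dsq_id, 0) {
+ *		if (should_migrate(p)) {
+ *			scx_bpf_dsq_move(BPF_FOR_EACH_ITER, p, dst_dsq_id, 0);
+ *			break;
+ *		}
+ *	}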
 */
+__bpf_kfunc bool scx_bpf_dsq_move(struct bpf_iter_scx_dsq *it__iter,
+				  struct task_struct *p, u64 dsq_id,
+				  u64 enq_flags)
+{
+	return scx_dsq_move((struct bpf_iter_scx_dsq_kern *)it__iter,
+			    p, dsq_id, enq_flags);
+}
+
+/* for backward compatibility, will be removed in v6.15 */
 __bpf_kfunc bool scx_bpf_dispatch_from_dsq(struct bpf_iter_scx_dsq *it__iter,
					   struct task_struct *p, u64 dsq_id,
					   u64 enq_flags)
 {
-	return scx_dispatch_from_dsq((struct bpf_iter_scx_dsq_kern *)it__iter,
-				     p, dsq_id, enq_flags);
+	printk_deferred_once(KERN_WARNING "sched_ext: scx_bpf_dispatch_from_dsq() renamed to scx_bpf_dsq_move()");
+	return scx_bpf_dsq_move(it__iter, p, dsq_id, enq_flags);
 }
 
 /**
- * scx_bpf_dispatch_vtime_from_dsq - Move a task from DSQ iteration to a PRIQ DSQ
+ * scx_bpf_dsq_move_vtime - Move a task from DSQ iteration to a PRIQ DSQ
  * @it__iter: DSQ iterator in progress
  * @p: task to transfer
  * @dsq_id: DSQ to move @p to
@@ -6357,19 +6720,27 @@ __bpf_kfunc bool scx_bpf_dispatch_from_dsq(struct bpf_iter_scx_dsq *it__iter,
  * priority queue of the DSQ specified by @dsq_id. The destination must be a
  * user DSQ as only user DSQs support priority queues.
  *
- * @p's slice and vtime are kept by default. Use
- * scx_bpf_dispatch_from_dsq_set_slice() and
- * scx_bpf_dispatch_from_dsq_set_vtime() to update.
+ * @p's slice and vtime are kept by default. Use scx_bpf_dsq_move_set_slice()
+ * and scx_bpf_dsq_move_set_vtime() to update.
 *
- * All other aspects are identical to scx_bpf_dispatch_from_dsq(). See
- * scx_bpf_dispatch_vtime() for more information on @vtime.
+ * All other aspects are identical to scx_bpf_dsq_move(). See
+ * scx_bpf_dsq_insert_vtime() for more information on @vtime.
 */
+__bpf_kfunc bool scx_bpf_dsq_move_vtime(struct bpf_iter_scx_dsq *it__iter,
+					struct task_struct *p, u64 dsq_id,
+					u64 enq_flags)
+{
+	return scx_dsq_move((struct bpf_iter_scx_dsq_kern *)it__iter,
+			    p, dsq_id, enq_flags | SCX_ENQ_DSQ_PRIQ);
+}
+
+/* for backward compatibility, will be removed in v6.15 */
 __bpf_kfunc bool scx_bpf_dispatch_vtime_from_dsq(struct bpf_iter_scx_dsq *it__iter,
						 struct task_struct *p, u64 dsq_id,
						 u64 enq_flags)
 {
-	return scx_dispatch_from_dsq((struct bpf_iter_scx_dsq_kern *)it__iter,
-				     p, dsq_id, enq_flags | SCX_ENQ_DSQ_PRIQ);
+	printk_deferred_once(KERN_WARNING "sched_ext: scx_bpf_dispatch_vtime_from_dsq() renamed to scx_bpf_dsq_move_vtime()");
+	return scx_bpf_dsq_move_vtime(it__iter, p, dsq_id, enq_flags);
 }
 
 __bpf_kfunc_end_defs();
@@ -6377,7 +6748,12 @@ __bpf_kfunc_end_defs();
 BTF_KFUNCS_START(scx_kfunc_ids_dispatch)
 BTF_ID_FLAGS(func, scx_bpf_dispatch_nr_slots)
 BTF_ID_FLAGS(func, scx_bpf_dispatch_cancel)
+BTF_ID_FLAGS(func, scx_bpf_dsq_move_to_local)
 BTF_ID_FLAGS(func, scx_bpf_consume)
+BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_slice)
+BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_vtime)
+BTF_ID_FLAGS(func, scx_bpf_dsq_move, KF_RCU)
+BTF_ID_FLAGS(func, scx_bpf_dsq_move_vtime, KF_RCU)
 BTF_ID_FLAGS(func, scx_bpf_dispatch_from_dsq_set_slice)
 BTF_ID_FLAGS(func, scx_bpf_dispatch_from_dsq_set_vtime)
 BTF_ID_FLAGS(func, scx_bpf_dispatch_from_dsq, KF_RCU)
@@ -6478,6 +6854,12 @@ __bpf_kfunc_end_defs();
 
 BTF_KFUNCS_START(scx_kfunc_ids_unlocked)
 BTF_ID_FLAGS(func, scx_bpf_create_dsq, KF_SLEEPABLE)
+BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_slice)
+BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_vtime)
+BTF_ID_FLAGS(func, scx_bpf_dsq_move, KF_RCU)
+BTF_ID_FLAGS(func, scx_bpf_dsq_move_vtime, KF_RCU)
+BTF_ID_FLAGS(func, scx_bpf_dispatch_from_dsq_set_slice)
+BTF_ID_FLAGS(func, scx_bpf_dispatch_from_dsq_set_vtime)
BTF_ID_FLAGS(func, scx_bpf_dispatch_from_dsq, KF_RCU) BTF_ID_FLAGS(func, scx_bpf_dispatch_vtime_from_dsq, KF_RCU) BTF_KFUNCS_END(scx_kfunc_ids_unlocked) @@ -7153,15 +7535,8 @@ __bpf_kfunc struct cgroup *scx_bpf_task_cgroup(struct task_struct *p) if (!scx_kf_allowed_on_arg_tasks(__SCX_KF_RQ_LOCKED, p)) goto out; - /* - * A task_group may either be a cgroup or an autogroup. In the latter - * case, @tg->css.cgroup is %NULL. A task_group can't become the other - * kind once created. - */ - if (tg && tg->css.cgroup) - cgrp = tg->css.cgroup; - else - cgrp = &cgrp_dfl_root.cgrp; + cgrp = tg_cgrp(tg); + out: cgroup_get(cgrp); return cgrp; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 2d16c8545c71..fbdca89c677f 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1200,12 +1200,12 @@ static inline bool do_preempt_short(struct cfs_rq *cfs_rq, */ s64 update_curr_common(struct rq *rq) { - struct task_struct *curr = rq->curr; + struct task_struct *donor = rq->donor; s64 delta_exec; - delta_exec = update_curr_se(rq, &curr->se); + delta_exec = update_curr_se(rq, &donor->se); if (likely(delta_exec > 0)) - update_curr_task(curr, delta_exec); + update_curr_task(donor, delta_exec); return delta_exec; } @@ -1251,14 +1251,14 @@ static void update_curr(struct cfs_rq *cfs_rq) return; if (resched || did_preempt_short(cfs_rq, curr)) { - resched_curr(rq); + resched_curr_lazy(rq); clear_buddies(cfs_rq, curr); } } static void update_curr_fair(struct rq *rq) { - update_curr(cfs_rq_of(&rq->curr->se)); + update_curr(cfs_rq_of(&rq->donor->se)); } static inline void @@ -5280,7 +5280,7 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) * * EEVDF: placement strategy #1 / #2 */ - if (sched_feat(PLACE_LAG) && cfs_rq->nr_running) { + if (sched_feat(PLACE_LAG) && cfs_rq->nr_running && se->vlag) { struct sched_entity *curr = cfs_rq->curr; unsigned long load; @@ -5678,15 +5678,9 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) * validating it and just reschedule. 
*/ if (queued) { - resched_curr(rq_of(cfs_rq)); + resched_curr_lazy(rq_of(cfs_rq)); return; } - /* - * don't let the period tick interfere with the hrtick preemption - */ - if (!sched_feat(DOUBLE_TICK) && - hrtimer_active(&rq_of(cfs_rq)->hrtick_timer)) - return; #endif } @@ -6822,7 +6816,7 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p) s64 delta = slice - ran; if (delta < 0) { - if (task_current(rq, p)) + if (task_current_donor(rq, p)) resched_curr(rq); return; } @@ -6837,12 +6831,12 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p) */ static void hrtick_update(struct rq *rq) { - struct task_struct *curr = rq->curr; + struct task_struct *donor = rq->donor; - if (!hrtick_enabled_fair(rq) || curr->sched_class != &fair_sched_class) + if (!hrtick_enabled_fair(rq) || donor->sched_class != &fair_sched_class) return; - hrtick_start_fair(rq, curr); + hrtick_start_fair(rq, donor); } #else /* !CONFIG_SCHED_HRTICK */ static inline void @@ -8763,9 +8757,9 @@ static void set_next_buddy(struct sched_entity *se) */ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int wake_flags) { - struct task_struct *curr = rq->curr; - struct sched_entity *se = &curr->se, *pse = &p->se; - struct cfs_rq *cfs_rq = task_cfs_rq(curr); + struct task_struct *donor = rq->donor; + struct sched_entity *se = &donor->se, *pse = &p->se; + struct cfs_rq *cfs_rq = task_cfs_rq(donor); int cse_is_idle, pse_is_idle; if (unlikely(se == pse)) @@ -8794,7 +8788,7 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int * prevents us from potentially nominating it as a false LAST_BUDDY * below. */ - if (test_tsk_need_resched(curr)) + if (test_tsk_need_resched(rq->curr)) return; if (!sched_feat(WAKEUP_PREEMPTION)) @@ -8842,7 +8836,7 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int return; preempt: - resched_curr(rq); + resched_curr_lazy(rq); } static struct task_struct *pick_task_fair(struct rq *rq) @@ -13093,7 +13087,7 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio) * our priority decreased, or if we are not currently running on * this runqueue and our priority is higher than the current's */ - if (task_current(rq, p)) { + if (task_current_donor(rq, p)) { if (p->prio > oldprio) resched_curr(rq); } else @@ -13200,7 +13194,7 @@ static void switched_to_fair(struct rq *rq, struct task_struct *p) * kick off the schedule if running, otherwise just see * if we can still preempt the current task. */ - if (task_current(rq, p)) + if (task_current_donor(rq, p)) resched_curr(rq); else wakeup_preempt(rq, p, 0); diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 290874079f60..a3d331dd2d8f 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -19,7 +19,7 @@ SCHED_FEAT(PLACE_REL_DEADLINE, true) */ SCHED_FEAT(RUN_TO_PARITY, true) /* - * Allow wakeup of tasks with a shorter slice to cancel RESPECT_SLICE for + * Allow wakeup of tasks with a shorter slice to cancel RUN_TO_PARITY for * current. 
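 *
 * (With CONFIG_SCHED_DEBUG, this can be toggled at runtime by writing
 * PREEMPT_SHORT or NO_PREEMPT_SHORT to /sys/kernel/debug/sched/features.)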
*/ SCHED_FEAT(PREEMPT_SHORT, true) @@ -56,7 +56,6 @@ SCHED_FEAT(WAKEUP_PREEMPTION, true) SCHED_FEAT(HRTICK, false) SCHED_FEAT(HRTICK_DL, false) -SCHED_FEAT(DOUBLE_TICK, false) /* * Decrement CPU capacity based on time not spent running tasks diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index d2f096bb274c..621696269584 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -271,7 +271,6 @@ static void do_idle(void) tick_nohz_idle_enter(); while (!need_resched()) { - rmb(); /* * Interrupts shouldn't be re-enabled from that point on until @@ -399,8 +398,8 @@ void play_idle_precise(u64 duration_ns, u64 latency_ns) cpuidle_use_deepest_state(latency_ns); it.done = 0; - hrtimer_init_on_stack(&it.timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); - it.timer.function = idle_inject_timer_fn; + hrtimer_setup_on_stack(&it.timer, idle_inject_timer_fn, CLOCK_MONOTONIC, + HRTIMER_MODE_REL_HARD); hrtimer_start(&it.timer, ns_to_ktime(duration_ns), HRTIMER_MODE_REL_PINNED_HARD); diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c index a9c65d97b3ca..fc07382361a8 100644 --- a/kernel/sched/pelt.c +++ b/kernel/sched/pelt.c @@ -476,7 +476,7 @@ int update_irq_load_avg(struct rq *rq, u64 running) bool update_other_load_avgs(struct rq *rq) { u64 now = rq_clock_pelt(rq); - const struct sched_class *curr_class = rq->curr->sched_class; + const struct sched_class *curr_class = rq->donor->sched_class; unsigned long hw_pressure = arch_scale_hw_pressure(cpu_of(rq)); lockdep_assert_rq_held(rq); diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 172c588de542..bd66a46b06ac 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -528,7 +528,7 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags) static void sched_rt_rq_enqueue(struct rt_rq *rt_rq) { - struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr; + struct task_struct *donor = rq_of_rt_rq(rt_rq)->donor; struct rq *rq = rq_of_rt_rq(rt_rq); struct sched_rt_entity *rt_se; @@ -542,7 +542,7 @@ static void sched_rt_rq_enqueue(struct rt_rq *rt_rq) else if (!on_rt_rq(rt_se)) enqueue_rt_entity(rt_se, 0); - if (rt_rq->highest_prio.curr < curr->prio) + if (rt_rq->highest_prio.curr < donor->prio) resched_curr(rq); } } @@ -988,10 +988,10 @@ static inline int rt_se_prio(struct sched_rt_entity *rt_se) */ static void update_curr_rt(struct rq *rq) { - struct task_struct *curr = rq->curr; + struct task_struct *donor = rq->donor; s64 delta_exec; - if (curr->sched_class != &rt_sched_class) + if (donor->sched_class != &rt_sched_class) return; delta_exec = update_curr_common(rq); @@ -999,7 +999,7 @@ static void update_curr_rt(struct rq *rq) return; #ifdef CONFIG_RT_GROUP_SCHED - struct sched_rt_entity *rt_se = &curr->rt; + struct sched_rt_entity *rt_se = &donor->rt; if (!rt_bandwidth_enabled()) return; @@ -1535,7 +1535,7 @@ static int find_lowest_rq(struct task_struct *task); static int select_task_rq_rt(struct task_struct *p, int cpu, int flags) { - struct task_struct *curr; + struct task_struct *curr, *donor; struct rq *rq; bool test; @@ -1547,6 +1547,7 @@ select_task_rq_rt(struct task_struct *p, int cpu, int flags) rcu_read_lock(); curr = READ_ONCE(rq->curr); /* unlocked access */ + donor = READ_ONCE(rq->donor); /* * If the current task on @p's runqueue is an RT task, then @@ -1575,8 +1576,8 @@ select_task_rq_rt(struct task_struct *p, int cpu, int flags) * systems like big.LITTLE. 
*/ test = curr && - unlikely(rt_task(curr)) && - (curr->nr_cpus_allowed < 2 || curr->prio <= p->prio); + unlikely(rt_task(donor)) && + (curr->nr_cpus_allowed < 2 || donor->prio <= p->prio); if (test || !rt_task_fits_capacity(p, cpu)) { int target = find_lowest_rq(p); @@ -1606,12 +1607,8 @@ out: static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) { - /* - * Current can't be migrated, useless to reschedule, - * let's hope p can move out. - */ if (rq->curr->nr_cpus_allowed == 1 || - !cpupri_find(&rq->rd->cpupri, rq->curr, NULL)) + !cpupri_find(&rq->rd->cpupri, rq->donor, NULL)) return; /* @@ -1654,7 +1651,9 @@ static int balance_rt(struct rq *rq, struct task_struct *p, struct rq_flags *rf) */ static void wakeup_preempt_rt(struct rq *rq, struct task_struct *p, int flags) { - if (p->prio < rq->curr->prio) { + struct task_struct *donor = rq->donor; + + if (p->prio < donor->prio) { resched_curr(rq); return; } @@ -1672,7 +1671,7 @@ static void wakeup_preempt_rt(struct rq *rq, struct task_struct *p, int flags) * to move current somewhere else, making room for our non-migratable * task. */ - if (p->prio == rq->curr->prio && !test_tsk_need_resched(rq->curr)) + if (p->prio == donor->prio && !test_tsk_need_resched(rq->curr)) check_preempt_equal_prio(rq, p); #endif } @@ -1697,7 +1696,7 @@ static inline void set_next_task_rt(struct rq *rq, struct task_struct *p, bool f * utilization. We only care of the case where we start to schedule a * rt task */ - if (rq->curr->sched_class != &rt_sched_class) + if (rq->donor->sched_class != &rt_sched_class) update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 0); rt_queue_push_tasks(rq); @@ -1773,15 +1772,6 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p, struct task_s /* Only try algorithms three times */ #define RT_MAX_TRIES 3 -static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu) -{ - if (!task_on_cpu(rq, p) && - cpumask_test_cpu(cpu, &p->cpus_mask)) - return 1; - - return 0; -} - /* * Return the highest pushable rq's task, which is suitable to be executed * on the CPU, NULL otherwise @@ -1795,7 +1785,7 @@ static struct task_struct *pick_highest_pushable_task(struct rq *rq, int cpu) return NULL; plist_for_each_entry(p, head, pushable_tasks) { - if (pick_rt_task(rq, p, cpu)) + if (task_is_pushable(rq, p, cpu)) return p; } @@ -1968,6 +1958,7 @@ static struct task_struct *pick_next_pushable_task(struct rq *rq) BUG_ON(rq->cpu != task_cpu(p)); BUG_ON(task_current(rq, p)); + BUG_ON(task_current_donor(rq, p)); BUG_ON(p->nr_cpus_allowed <= 1); BUG_ON(!task_on_rq_queued(p)); @@ -2000,7 +1991,7 @@ retry: * higher priority than current. If that's the case * just reschedule current. */ - if (unlikely(next_task->prio < rq->curr->prio)) { + if (unlikely(next_task->prio < rq->donor->prio)) { resched_curr(rq); return 0; } @@ -2021,7 +2012,7 @@ retry: * Note that the stoppers are masqueraded as SCHED_FIFO * (cf. sched_set_stop_task()), so we can't rely on rt_task(). 
*/ - if (rq->curr->sched_class != &rt_sched_class) + if (rq->donor->sched_class != &rt_sched_class) return 0; cpu = find_lowest_rq(rq->curr); @@ -2088,9 +2079,7 @@ retry: goto retry; } - deactivate_task(rq, next_task, 0); - set_task_cpu(next_task, lowest_rq->cpu); - activate_task(lowest_rq, next_task, 0); + move_queued_task_locked(rq, lowest_rq, next_task); resched_curr(lowest_rq); ret = 1; @@ -2355,15 +2344,13 @@ static void pull_rt_task(struct rq *this_rq) * p if it is lower in priority than the * current task on the run queue */ - if (p->prio < src_rq->curr->prio) + if (p->prio < src_rq->donor->prio) goto skip; if (is_migration_disabled(p)) { push_task = get_push_task(src_rq); } else { - deactivate_task(src_rq, p, 0); - set_task_cpu(p, this_cpu); - activate_task(this_rq, p, 0); + move_queued_task_locked(src_rq, this_rq, p); resched = true; } /* @@ -2399,9 +2386,9 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p) bool need_to_push = !task_on_cpu(rq, p) && !test_tsk_need_resched(rq->curr) && p->nr_cpus_allowed > 1 && - (dl_task(rq->curr) || rt_task(rq->curr)) && + (dl_task(rq->donor) || rt_task(rq->donor)) && (rq->curr->nr_cpus_allowed < 2 || - rq->curr->prio <= p->prio); + rq->donor->prio <= p->prio); if (need_to_push) push_rt_tasks(rq); @@ -2485,7 +2472,7 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p) if (p->nr_cpus_allowed > 1 && rq->rt.overloaded) rt_queue_push_tasks(rq); #endif /* CONFIG_SMP */ - if (p->prio < rq->curr->prio && cpu_online(cpu_of(rq))) + if (p->prio < rq->donor->prio && cpu_online(cpu_of(rq))) resched_curr(rq); } } @@ -2500,7 +2487,7 @@ prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio) if (!task_on_rq_queued(p)) return; - if (task_current(rq, p)) { + if (task_current_donor(rq, p)) { #ifdef CONFIG_SMP /* * If our priority decreases while running, we @@ -2526,7 +2513,7 @@ prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio) * greater than the current running task * then reschedule. 
*/ - if (p->prio < rq->curr->prio) + if (p->prio < rq->donor->prio) resched_curr(rq); } } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index c03b3d7b320e..76f5f53a645f 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1148,7 +1148,10 @@ struct rq { */ unsigned int nr_uninterruptible; - struct task_struct __rcu *curr; + union { + struct task_struct __rcu *donor; /* Scheduler context */ + struct task_struct __rcu *curr; /* Execution context */ + }; struct sched_dl_entity *dl_server; struct task_struct *idle; struct task_struct *stop; @@ -1345,6 +1348,11 @@ DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); #define cpu_curr(cpu) (cpu_rq(cpu)->curr) #define raw_rq() raw_cpu_ptr(&runqueues) +static inline void rq_set_donor(struct rq *rq, struct task_struct *t) +{ + /* Do nothing */ +} + #ifdef CONFIG_SCHED_CORE static inline struct cpumask *sched_group_span(struct sched_group *sg); @@ -2086,34 +2094,6 @@ static inline const struct cpumask *task_user_cpus(struct task_struct *p) #endif /* CONFIG_SMP */ -#include "stats.h" - -#if defined(CONFIG_SCHED_CORE) && defined(CONFIG_SCHEDSTATS) - -extern void __sched_core_account_forceidle(struct rq *rq); - -static inline void sched_core_account_forceidle(struct rq *rq) -{ - if (schedstat_enabled()) - __sched_core_account_forceidle(rq); -} - -extern void __sched_core_tick(struct rq *rq); - -static inline void sched_core_tick(struct rq *rq) -{ - if (sched_core_enabled(rq) && schedstat_enabled()) - __sched_core_tick(rq); -} - -#else /* !(CONFIG_SCHED_CORE && CONFIG_SCHEDSTATS): */ - -static inline void sched_core_account_forceidle(struct rq *rq) { } - -static inline void sched_core_tick(struct rq *rq) { } - -#endif /* !(CONFIG_SCHED_CORE && CONFIG_SCHEDSTATS) */ - #ifdef CONFIG_CGROUP_SCHED /* @@ -2261,11 +2241,25 @@ static inline u64 global_rt_runtime(void) return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC; } +/* + * Is p the current execution context? + */ static inline int task_current(struct rq *rq, struct task_struct *p) { return rq->curr == p; } +/* + * Is p the current scheduling context? + * + * Note that it might be the current execution context at the same time if + * rq->curr == rq->donor == p. 
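+ *
+ * For example (a sketch mirroring the rt.c and fair.c conversions in
+ * this patch), requeue decisions test the scheduling context:
+ *
+ *	if (task_current_donor(rq, p))
+ *		resched_curr(rq);
+ *	else
+ *		wakeup_preempt(rq, p, 0);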
+ */
+static inline int task_current_donor(struct rq *rq, struct task_struct *p)
+{
+	return rq->donor == p;
+}
+
 static inline int task_on_cpu(struct rq *rq, struct task_struct *p)
 {
 #ifdef CONFIG_SMP
@@ -2452,7 +2446,7 @@ struct sched_class {
 
 static inline void put_prev_task(struct rq *rq, struct task_struct *prev)
 {
-	WARN_ON_ONCE(rq->curr != prev);
+	WARN_ON_ONCE(rq->donor != prev);
 	prev->sched_class->put_prev_task(rq, prev, NULL);
 }
 
@@ -2616,7 +2610,7 @@ static inline cpumask_t *alloc_user_cpus_ptr(int node)
 
 static inline struct task_struct *get_push_task(struct rq *rq)
 {
-	struct task_struct *p = rq->curr;
+	struct task_struct *p = rq->donor;
 
 	lockdep_assert_rq_held(rq);
 
@@ -2696,6 +2690,7 @@ extern void init_sched_rt_class(void);
 extern void init_sched_fair_class(void);
 
 extern void resched_curr(struct rq *rq);
+extern void resched_curr_lazy(struct rq *rq);
 extern void resched_cpu(int cpu);
 
 extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime);
@@ -3200,6 +3195,34 @@ extern void nohz_run_idle_balance(int cpu);
 static inline void nohz_run_idle_balance(int cpu) { }
 #endif
 
+#include "stats.h"
+
+#if defined(CONFIG_SCHED_CORE) && defined(CONFIG_SCHEDSTATS)
+
+extern void __sched_core_account_forceidle(struct rq *rq);
+
+static inline void sched_core_account_forceidle(struct rq *rq)
+{
+	if (schedstat_enabled())
+		__sched_core_account_forceidle(rq);
+}
+
+extern void __sched_core_tick(struct rq *rq);
+
+static inline void sched_core_tick(struct rq *rq)
+{
+	if (sched_core_enabled(rq) && schedstat_enabled())
+		__sched_core_tick(rq);
+}
+
+#else /* !(CONFIG_SCHED_CORE && CONFIG_SCHEDSTATS): */
+
+static inline void sched_core_account_forceidle(struct rq *rq) { }
+
+static inline void sched_core_tick(struct rq *rq) { }
+
+#endif /* !(CONFIG_SCHED_CORE && CONFIG_SCHEDSTATS) */
+
 #ifdef CONFIG_IRQ_TIME_ACCOUNTING
 
 struct irqtime {
@@ -3630,24 +3653,41 @@ static inline void mm_cid_put(struct mm_struct *mm)
 	__mm_cid_put(mm, mm_cid_clear_lazy_put(cid));
 }
 
-static inline int __mm_cid_try_get(struct mm_struct *mm)
+static inline int __mm_cid_try_get(struct task_struct *t, struct mm_struct *mm)
 {
-	struct cpumask *cpumask;
-	int cid;
+	struct cpumask *cidmask = mm_cidmask(mm);
+	struct mm_cid __percpu *pcpu_cid = mm->pcpu_cid;
+	int cid = __this_cpu_read(pcpu_cid->recent_cid);
 
-	cpumask = mm_cidmask(mm);
+	/* Try to re-use recent cid. This improves cache locality. */
+	if (!mm_cid_is_unset(cid) && !cpumask_test_and_set_cpu(cid, cidmask))
+		return cid;
 	/*
+	 * Expand cid allocation if the maximum number of concurrency
+	 * IDs allocated (max_nr_cid) is below the number of CPUs allowed
+	 * and the number of threads. Expanding cid allocation as much as
+	 * possible improves cache locality.
+	 */
+	cid = atomic_read(&mm->max_nr_cid);
+	while (cid < READ_ONCE(mm->nr_cpus_allowed) && cid < atomic_read(&mm->mm_users)) {
+		if (!atomic_try_cmpxchg(&mm->max_nr_cid, &cid, cid + 1))
+			continue;
+		if (!cpumask_test_and_set_cpu(cid, cidmask))
+			return cid;
	}
	/*
+	 * Find the first available concurrency id.
	 * Retry finding first zero bit if the mask is temporarily
	 * filled. This only happens during concurrent remote-clear
	 * which owns a cid without holding a rq lock.
*/ for (;;) { - cid = cpumask_first_zero(cpumask); - if (cid < nr_cpu_ids) + cid = cpumask_first_zero(cidmask); + if (cid < READ_ONCE(mm->nr_cpus_allowed)) break; cpu_relax(); } - if (cpumask_test_and_set_cpu(cid, cpumask)) + if (cpumask_test_and_set_cpu(cid, cidmask)) return -1; return cid; @@ -3665,7 +3705,8 @@ static inline void mm_cid_snapshot_time(struct rq *rq, struct mm_struct *mm) WRITE_ONCE(pcpu_cid->time, rq->clock); } -static inline int __mm_cid_get(struct rq *rq, struct mm_struct *mm) +static inline int __mm_cid_get(struct rq *rq, struct task_struct *t, + struct mm_struct *mm) { int cid; @@ -3675,13 +3716,13 @@ static inline int __mm_cid_get(struct rq *rq, struct mm_struct *mm) * guarantee forward progress. */ if (!READ_ONCE(use_cid_lock)) { - cid = __mm_cid_try_get(mm); + cid = __mm_cid_try_get(t, mm); if (cid >= 0) goto end; raw_spin_lock(&cid_lock); } else { raw_spin_lock(&cid_lock); - cid = __mm_cid_try_get(mm); + cid = __mm_cid_try_get(t, mm); if (cid >= 0) goto unlock; } @@ -3701,7 +3742,7 @@ static inline int __mm_cid_get(struct rq *rq, struct mm_struct *mm) * all newcoming allocations observe the use_cid_lock flag set. */ do { - cid = __mm_cid_try_get(mm); + cid = __mm_cid_try_get(t, mm); cpu_relax(); } while (cid < 0); /* @@ -3718,7 +3759,8 @@ end: return cid; } -static inline int mm_cid_get(struct rq *rq, struct mm_struct *mm) +static inline int mm_cid_get(struct rq *rq, struct task_struct *t, + struct mm_struct *mm) { struct mm_cid __percpu *pcpu_cid = mm->pcpu_cid; struct cpumask *cpumask; @@ -3735,8 +3777,9 @@ static inline int mm_cid_get(struct rq *rq, struct mm_struct *mm) if (try_cmpxchg(&this_cpu_ptr(pcpu_cid)->cid, &cid, MM_CID_UNSET)) __mm_cid_put(mm, mm_cid_clear_lazy_put(cid)); } - cid = __mm_cid_get(rq, mm); + cid = __mm_cid_get(rq, t, mm); __this_cpu_write(pcpu_cid->cid, cid); + __this_cpu_write(pcpu_cid->recent_cid, cid); return cid; } @@ -3789,7 +3832,7 @@ static inline void switch_mm_cid(struct rq *rq, prev->mm_cid = -1; } if (next->mm_cid_active) - next->last_mm_cid = next->mm_cid = mm_cid_get(rq, next->mm); + next->last_mm_cid = next->mm_cid = mm_cid_get(rq, next, next->mm); } #else /* !CONFIG_SCHED_MM_CID: */ @@ -3802,6 +3845,28 @@ static inline void init_sched_mm_cid(struct task_struct *t) { } extern u64 avg_vruntime(struct cfs_rq *cfs_rq); extern int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se); +#ifdef CONFIG_SMP +static inline +void move_queued_task_locked(struct rq *src_rq, struct rq *dst_rq, struct task_struct *task) +{ + lockdep_assert_rq_held(src_rq); + lockdep_assert_rq_held(dst_rq); + + deactivate_task(src_rq, task, 0); + set_task_cpu(task, dst_rq->cpu); + activate_task(dst_rq, task, 0); +} + +static inline +bool task_is_pushable(struct rq *rq, struct task_struct *p, int cpu) +{ + if (!task_on_cpu(rq, p) && + cpumask_test_cpu(cpu, &p->cpus_mask)) + return true; + + return false; +} +#endif #ifdef CONFIG_RT_MUTEXES diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index 767e098a3bd1..8ee0add5a48a 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h @@ -127,21 +127,25 @@ static inline void psi_account_irqtime(struct rq *rq, struct task_struct *curr, * go through migration requeues. In this case, *sleeping* states need * to be transferred. 
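 *
 * Summarizing the flag handling below:
 *
 *	ENQUEUE_RESTORE			same rq, psi state unchanged
 *	sched_delayed + ENQUEUE_MIGRATED	transfer TSK_MEMSTALL/TSK_IOWAIT
 *	ENQUEUE_MIGRATED		set TSK_RUNNING (plus memstall/iowait)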
*/ -static inline void psi_enqueue(struct task_struct *p, bool migrate) +static inline void psi_enqueue(struct task_struct *p, int flags) { int clear = 0, set = 0; if (static_branch_likely(&psi_disabled)) return; + /* Same runqueue, nothing changed for psi */ + if (flags & ENQUEUE_RESTORE) + return; + if (p->se.sched_delayed) { /* CPU migration of "sleeping" task */ - SCHED_WARN_ON(!migrate); + SCHED_WARN_ON(!(flags & ENQUEUE_MIGRATED)); if (p->in_memstall) set |= TSK_MEMSTALL; if (p->in_iowait) set |= TSK_IOWAIT; - } else if (migrate) { + } else if (flags & ENQUEUE_MIGRATED) { /* CPU migration of runnable task */ set = TSK_RUNNING; if (p->in_memstall) @@ -158,17 +162,14 @@ static inline void psi_enqueue(struct task_struct *p, bool migrate) psi_task_change(p, clear, set); } -static inline void psi_dequeue(struct task_struct *p, bool migrate) +static inline void psi_dequeue(struct task_struct *p, int flags) { if (static_branch_likely(&psi_disabled)) return; - /* - * When migrating a task to another CPU, clear all psi - * state. The enqueue callback above will work it out. - */ - if (migrate) - psi_task_change(p, p->psi_flags, 0); + /* Same runqueue, nothing changed for psi */ + if (flags & DEQUEUE_SAVE) + return; /* * A voluntary sleep is a dequeue followed by a task switch. To @@ -176,6 +177,14 @@ static inline void psi_dequeue(struct task_struct *p, bool migrate) * TSK_RUNNING and TSK_IOWAIT for us when it moves TSK_ONCPU. * Do nothing here. */ + if (flags & DEQUEUE_SLEEP) + return; + + /* + * When migrating a task to another CPU, clear all psi + * state. The enqueue callback above will work it out. + */ + psi_task_change(p, p->psi_flags, 0); } static inline void psi_ttwu_dequeue(struct task_struct *p) diff --git a/kernel/sched/syscalls.c b/kernel/sched/syscalls.c index 24f9f90b6574..0d71fcbaf1e3 100644 --- a/kernel/sched/syscalls.c +++ b/kernel/sched/syscalls.c @@ -91,7 +91,7 @@ void set_user_nice(struct task_struct *p, long nice) } queued = task_on_rq_queued(p); - running = task_current(rq, p); + running = task_current_donor(rq, p); if (queued) dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK); if (running) @@ -713,7 +713,7 @@ change: dequeue_task(rq, p, DEQUEUE_SLEEP | DEQUEUE_DELAYED | DEQUEUE_NOCLOCK); queued = task_on_rq_queued(p); - running = task_current(rq, p); + running = task_current_donor(rq, p); if (queued) dequeue_task(rq, p, queue_flags); if (running) @@ -1081,45 +1081,6 @@ SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param) return copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0; } -/* - * Copy the kernel size attribute structure (which might be larger - * than what user-space knows about) to user-space. - * - * Note that all cases are valid: user-space buffer can be larger or - * smaller than the kernel-space buffer. The usual case is that both - * have the same size. - */ -static int -sched_attr_copy_to_user(struct sched_attr __user *uattr, - struct sched_attr *kattr, - unsigned int usize) -{ - unsigned int ksize = sizeof(*kattr); - - if (!access_ok(uattr, usize)) - return -EFAULT; - - /* - * sched_getattr() ABI forwards and backwards compatibility: - * - * If usize == ksize then we just copy everything to user-space and all is good. - * - * If usize < ksize then we only copy as much as user-space has space for, - * this keeps ABI compatibility as well. We skip the rest. - * - * If usize > ksize then user-space is using a newer version of the ABI, - * which part the kernel doesn't know about. 
Just ignore it - tooling can - * detect the kernel's knowledge of attributes from the attr->size value - * which is set to ksize in this case. - */ - kattr->size = min(usize, ksize); - - if (copy_to_user(uattr, kattr, kattr->size)) - return -EFAULT; - - return 0; -} - /** * sys_sched_getattr - similar to sched_getparam, but with sched_attr * @pid: the pid in question. @@ -1164,7 +1125,8 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, #endif } - return sched_attr_copy_to_user(uattr, &kattr, usize); + kattr.size = min(usize, sizeof(kattr)); + return copy_struct_to_user(uattr, usize, &kattr, sizeof(kattr), NULL); } #ifdef CONFIG_SMP diff --git a/kernel/sched/wait_bit.c b/kernel/sched/wait_bit.c index 134d7112ef71..b410b61cec95 100644 --- a/kernel/sched/wait_bit.c +++ b/kernel/sched/wait_bit.c @@ -9,7 +9,7 @@ static wait_queue_head_t bit_wait_table[WAIT_TABLE_SIZE] __cacheline_aligned; -wait_queue_head_t *bit_waitqueue(void *word, int bit) +wait_queue_head_t *bit_waitqueue(unsigned long *word, int bit) { const int shift = BITS_PER_LONG == 32 ? 5 : 6; unsigned long val = (unsigned long)word << shift | bit; @@ -55,7 +55,7 @@ __wait_on_bit(struct wait_queue_head *wq_head, struct wait_bit_queue_entry *wbq_ } EXPORT_SYMBOL(__wait_on_bit); -int __sched out_of_line_wait_on_bit(void *word, int bit, +int __sched out_of_line_wait_on_bit(unsigned long *word, int bit, wait_bit_action_f *action, unsigned mode) { struct wait_queue_head *wq_head = bit_waitqueue(word, bit); @@ -66,7 +66,7 @@ int __sched out_of_line_wait_on_bit(void *word, int bit, EXPORT_SYMBOL(out_of_line_wait_on_bit); int __sched out_of_line_wait_on_bit_timeout( - void *word, int bit, wait_bit_action_f *action, + unsigned long *word, int bit, wait_bit_action_f *action, unsigned mode, unsigned long timeout) { struct wait_queue_head *wq_head = bit_waitqueue(word, bit); @@ -108,7 +108,7 @@ __wait_on_bit_lock(struct wait_queue_head *wq_head, struct wait_bit_queue_entry } EXPORT_SYMBOL(__wait_on_bit_lock); -int __sched out_of_line_wait_on_bit_lock(void *word, int bit, +int __sched out_of_line_wait_on_bit_lock(unsigned long *word, int bit, wait_bit_action_f *action, unsigned mode) { struct wait_queue_head *wq_head = bit_waitqueue(word, bit); @@ -118,7 +118,7 @@ int __sched out_of_line_wait_on_bit_lock(void *word, int bit, } EXPORT_SYMBOL(out_of_line_wait_on_bit_lock); -void __wake_up_bit(struct wait_queue_head *wq_head, void *word, int bit) +void __wake_up_bit(struct wait_queue_head *wq_head, unsigned long *word, int bit) { struct wait_bit_key key = __WAIT_BIT_KEY_INITIALIZER(word, bit); @@ -128,23 +128,31 @@ void __wake_up_bit(struct wait_queue_head *wq_head, void *word, int bit) EXPORT_SYMBOL(__wake_up_bit); /** - * wake_up_bit - wake up a waiter on a bit - * @word: the word being waited on, a kernel virtual address - * @bit: the bit of the word being waited on + * wake_up_bit - wake up waiters on a bit + * @word: the address containing the bit being waited on + * @bit: the bit at that address being waited on * - * There is a standard hashed waitqueue table for generic use. This - * is the part of the hash-table's accessor API that wakes up waiters - * on a bit. For instance, if one were to have waiters on a bitflag, - * one would call wake_up_bit() after clearing the bit. + * Wake up any process waiting in wait_on_bit() or similar for the + * given bit to be cleared. 
 *
- * In order for this to function properly, as it uses waitqueue_active()
- * internally, some kind of memory barrier must be done prior to calling
- * this. Typically, this will be smp_mb__after_atomic(), but in some
- * cases where bitflags are manipulated non-atomically under a lock, one
- * may need to use a less regular barrier, such fs/inode.c's smp_mb(),
- * because spin_unlock() does not guarantee a memory barrier.
+ * The wake-up is sent to tasks in a waitqueue selected by hash from a
+ * shared pool. Only those tasks on that queue which have requested
+ * wake_up on this specific address and bit will be woken, and only if the
+ * bit is clear.
+ *
+ * In order for this to function properly there must be a full memory
+ * barrier after the bit is cleared and before this function is called.
+ * If the bit was cleared atomically, such as by clear_bit(), then
+ * smp_mb__after_atomic() can be used; otherwise smp_mb() is needed.
+ * If the bit was cleared with a fully-ordered operation, no further
+ * barrier is required.
+ *
+ * Normally the bit should be cleared by an operation with RELEASE
+ * semantics so that any changes to memory made before the bit is
+ * cleared are guaranteed to be visible after the matching wait_on_bit()
+ * completes.
 */
-void wake_up_bit(void *word, int bit)
+void wake_up_bit(unsigned long *word, int bit)
 {
 	__wake_up_bit(bit_waitqueue(word, bit), word, bit);
 }
@@ -188,6 +196,36 @@ void init_wait_var_entry(struct wait_bit_queue_entry *wbq_entry, void *var, int
 }
 EXPORT_SYMBOL(init_wait_var_entry);
 
+/**
+ * wake_up_var - wake up waiters on a variable (kernel address)
+ * @var: the address of the variable being waited on
+ *
+ * Wake up any process waiting in wait_var_event() or similar for the
+ * given variable to change. wait_var_event() can be waiting for an
+ * arbitrary condition to be true and associates that condition with an
+ * address. Calling wake_up_var() suggests that the condition has been
+ * made true, but does not strictly require the condition to use the
+ * address given.
+ *
+ * The wake-up is sent to tasks in a waitqueue selected by hash from a
+ * shared pool. Only those tasks on that queue which have requested
+ * wake_up on this specific address will be woken.
+ *
+ * In order for this to function properly there must be a full memory
+ * barrier after the variable is updated (or more accurately, after the
+ * condition waited on has been made to be true) and before this function
+ * is called. If the variable was updated atomically, such as by
+ * atomic_dec(), then smp_mb__after_atomic() can be used. If the
+ * variable was updated by a fully ordered operation such as
+ * atomic_dec_and_test() then no extra barrier is required. Otherwise
+ * smp_mb() is needed.
+ *
+ * Normally the variable should be updated (the condition should be made
+ * to be true) by an operation with RELEASE semantics such as
+ * smp_store_release() so that any changes to memory made before the
+ * variable was updated are guaranteed to be visible after the matching
+ * wait_var_event() completes.
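+ *
+ * A minimal sketch of the intended pairing (@foo and its refs field are
+ * illustrative):
+ *
+ *	waker:	atomic_dec(&foo->refs);
+ *		smp_mb__after_atomic();
+ *		wake_up_var(&foo->refs);
+ *
+ *	waiter:	wait_var_event(&foo->refs, !atomic_read(&foo->refs));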
+ */ void wake_up_var(void *var) { __wake_up_bit(__var_waitqueue(var), var, -1); @@ -228,20 +266,6 @@ __sched int bit_wait_timeout(struct wait_bit_key *word, int mode) } EXPORT_SYMBOL_GPL(bit_wait_timeout); -__sched int bit_wait_io_timeout(struct wait_bit_key *word, int mode) -{ - unsigned long now = READ_ONCE(jiffies); - - if (time_after_eq(now, word->timeout)) - return -EAGAIN; - io_schedule_timeout(word->timeout - now); - if (signal_pending_state(mode, current)) - return -EINTR; - - return 0; -} -EXPORT_SYMBOL_GPL(bit_wait_io_timeout); - void __init wait_bit_init(void) { int i; |
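
To illustrate the retyped bit-wait API above, a minimal waiter/waker sketch; my_flags, the bit number, and both helpers are hypothetical (not part of this diff) and assume <linux/wait_bit.h>:

static unsigned long my_flags;			/* hypothetical flag word */

static void producer_done(void)
{
	clear_bit(0, &my_flags);	/* atomically clear the awaited bit */
	smp_mb__after_atomic();		/* full barrier before the wakeup */
	wake_up_bit(&my_flags, 0);	/* now takes unsigned long *, so &my_flags matches */
}

static int consumer_wait(void)
{
	/* sleeps on the hashed bit-waitqueue until bit 0 of my_flags clears */
	return wait_on_bit(&my_flags, 0, TASK_UNINTERRUPTIBLE);
}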