aboutsummaryrefslogtreecommitdiff
path: root/kernel/sched
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/sched')
-rw-r--r--kernel/sched/core.c289
-rw-r--r--kernel/sched/cpufreq_schedutil.c3
-rw-r--r--kernel/sched/deadline.c57
-rw-r--r--kernel/sched/debug.c7
-rw-r--r--kernel/sched/ext.c975
-rw-r--r--kernel/sched/fair.c42
-rw-r--r--kernel/sched/features.h3
-rw-r--r--kernel/sched/idle.c5
-rw-r--r--kernel/sched/pelt.c2
-rw-r--r--kernel/sched/rt.c67
-rw-r--r--kernel/sched/sched.h155
-rw-r--r--kernel/sched/stats.h29
-rw-r--r--kernel/sched/syscalls.c46
-rw-r--r--kernel/sched/wait_bit.c90
14 files changed, 1139 insertions, 631 deletions
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index a1c353a62c56..95e40895a519 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -832,7 +832,7 @@ static enum hrtimer_restart hrtick(struct hrtimer *timer)
rq_lock(rq, &rf);
update_rq_clock(rq);
- rq->curr->sched_class->task_tick(rq, rq->curr, 1);
+ rq->donor->sched_class->task_tick(rq, rq->curr, 1);
rq_unlock(rq, &rf);
return HRTIMER_NORESTART;
@@ -941,10 +941,9 @@ static inline void hrtick_rq_init(struct rq *rq)
* this avoids any races wrt polling state changes and thereby avoids
* spurious IPIs.
*/
-static inline bool set_nr_and_not_polling(struct task_struct *p)
+static inline bool set_nr_and_not_polling(struct thread_info *ti, int tif)
{
- struct thread_info *ti = task_thread_info(p);
- return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG);
+ return !(fetch_or(&ti->flags, 1 << tif) & _TIF_POLLING_NRFLAG);
}
/*
@@ -969,9 +968,9 @@ static bool set_nr_if_polling(struct task_struct *p)
}
#else
-static inline bool set_nr_and_not_polling(struct task_struct *p)
+static inline bool set_nr_and_not_polling(struct thread_info *ti, int tif)
{
- set_tsk_need_resched(p);
+ set_ti_thread_flag(ti, tif);
return true;
}
@@ -1076,28 +1075,70 @@ void wake_up_q(struct wake_q_head *head)
* might also involve a cross-CPU call to trigger the scheduler on
* the target CPU.
*/
-void resched_curr(struct rq *rq)
+static void __resched_curr(struct rq *rq, int tif)
{
struct task_struct *curr = rq->curr;
+ struct thread_info *cti = task_thread_info(curr);
int cpu;
lockdep_assert_rq_held(rq);
- if (test_tsk_need_resched(curr))
+ /*
+ * Always immediately preempt the idle task; no point in delaying doing
+ * actual work.
+ */
+ if (is_idle_task(curr) && tif == TIF_NEED_RESCHED_LAZY)
+ tif = TIF_NEED_RESCHED;
+
+ if (cti->flags & ((1 << tif) | _TIF_NEED_RESCHED))
return;
cpu = cpu_of(rq);
if (cpu == smp_processor_id()) {
- set_tsk_need_resched(curr);
- set_preempt_need_resched();
+ set_ti_thread_flag(cti, tif);
+ if (tif == TIF_NEED_RESCHED)
+ set_preempt_need_resched();
return;
}
- if (set_nr_and_not_polling(curr))
- smp_send_reschedule(cpu);
- else
+ if (set_nr_and_not_polling(cti, tif)) {
+ if (tif == TIF_NEED_RESCHED)
+ smp_send_reschedule(cpu);
+ } else {
trace_sched_wake_idle_without_ipi(cpu);
+ }
+}
+
+void resched_curr(struct rq *rq)
+{
+ __resched_curr(rq, TIF_NEED_RESCHED);
+}
+
+#ifdef CONFIG_PREEMPT_DYNAMIC
+static DEFINE_STATIC_KEY_FALSE(sk_dynamic_preempt_lazy);
+static __always_inline bool dynamic_preempt_lazy(void)
+{
+ return static_branch_unlikely(&sk_dynamic_preempt_lazy);
+}
+#else
+static __always_inline bool dynamic_preempt_lazy(void)
+{
+ return IS_ENABLED(CONFIG_PREEMPT_LAZY);
+}
+#endif
+
+static __always_inline int get_lazy_tif_bit(void)
+{
+ if (dynamic_preempt_lazy())
+ return TIF_NEED_RESCHED_LAZY;
+
+ return TIF_NEED_RESCHED;
+}
+
+void resched_curr_lazy(struct rq *rq)
+{
+ __resched_curr(rq, get_lazy_tif_bit());
}
void resched_cpu(int cpu)
@@ -1192,7 +1233,7 @@ static void wake_up_idle_cpu(int cpu)
* and testing of the above solutions didn't appear to report
* much benefits.
*/
- if (set_nr_and_not_polling(rq->idle))
+ if (set_nr_and_not_polling(task_thread_info(rq->idle), TIF_NEED_RESCHED))
smp_send_reschedule(cpu);
else
trace_sched_wake_idle_without_ipi(cpu);
@@ -1399,7 +1440,7 @@ void set_load_weight(struct task_struct *p, bool update_load)
* requests are serialized using a mutex to reduce the risk of conflicting
* updates or API abuses.
*/
-static DEFINE_MUTEX(uclamp_mutex);
+static __maybe_unused DEFINE_MUTEX(uclamp_mutex);
/* Max allowed minimum utilization */
static unsigned int __maybe_unused sysctl_sched_uclamp_util_min = SCHED_CAPACITY_SCALE;
@@ -2024,10 +2065,10 @@ void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
*/
uclamp_rq_inc(rq, p);
- if (!(flags & ENQUEUE_RESTORE)) {
+ psi_enqueue(p, flags);
+
+ if (!(flags & ENQUEUE_RESTORE))
sched_info_enqueue(rq, p);
- psi_enqueue(p, flags & ENQUEUE_MIGRATED);
- }
if (sched_core_enabled(rq))
sched_core_enqueue(rq, p);
@@ -2044,10 +2085,10 @@ inline bool dequeue_task(struct rq *rq, struct task_struct *p, int flags)
if (!(flags & DEQUEUE_NOCLOCK))
update_rq_clock(rq);
- if (!(flags & DEQUEUE_SAVE)) {
+ if (!(flags & DEQUEUE_SAVE))
sched_info_dequeue(rq, p);
- psi_dequeue(p, !(flags & DEQUEUE_SLEEP));
- }
+
+ psi_dequeue(p, flags);
/*
* Must be before ->dequeue_task() because ->dequeue_task() can 'fail'
@@ -2135,16 +2176,18 @@ void check_class_changed(struct rq *rq, struct task_struct *p,
void wakeup_preempt(struct rq *rq, struct task_struct *p, int flags)
{
- if (p->sched_class == rq->curr->sched_class)
- rq->curr->sched_class->wakeup_preempt(rq, p, flags);
- else if (sched_class_above(p->sched_class, rq->curr->sched_class))
+ struct task_struct *donor = rq->donor;
+
+ if (p->sched_class == donor->sched_class)
+ donor->sched_class->wakeup_preempt(rq, p, flags);
+ else if (sched_class_above(p->sched_class, donor->sched_class))
resched_curr(rq);
/*
* A queue event has occurred, and we're going to schedule. In
* this case, we can save a useless back to back clock update.
*/
- if (task_on_rq_queued(rq->curr) && test_tsk_need_resched(rq->curr))
+ if (task_on_rq_queued(donor) && test_tsk_need_resched(rq->curr))
rq_clock_skip_update(rq);
}
@@ -2620,9 +2663,7 @@ int push_cpu_stop(void *arg)
// XXX validate p is still the highest prio task
if (task_rq(p) == rq) {
- deactivate_task(rq, p, 0);
- set_task_cpu(p, lowest_rq->cpu);
- activate_task(lowest_rq, p, 0);
+ move_queued_task_locked(rq, lowest_rq, p);
resched_curr(lowest_rq);
}
@@ -2682,7 +2723,7 @@ __do_set_cpus_allowed(struct task_struct *p, struct affinity_context *ctx)
lockdep_assert_held(&p->pi_lock);
queued = task_on_rq_queued(p);
- running = task_current(rq, p);
+ running = task_current_donor(rq, p);
if (queued) {
/*
@@ -2696,6 +2737,7 @@ __do_set_cpus_allowed(struct task_struct *p, struct affinity_context *ctx)
put_prev_task(rq, p);
p->sched_class->set_cpus_allowed(p, ctx);
+ mm_set_cpus_allowed(p->mm, ctx->new_mask);
if (queued)
enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
@@ -3308,9 +3350,7 @@ static void __migrate_swap_task(struct task_struct *p, int cpu)
rq_pin_lock(src_rq, &srf);
rq_pin_lock(dst_rq, &drf);
- deactivate_task(src_rq, p, 0);
- set_task_cpu(p, cpu);
- activate_task(dst_rq, p, 0);
+ move_queued_task_locked(src_rq, dst_rq, p);
wakeup_preempt(dst_rq, p, 0);
rq_unpin_lock(dst_rq, &drf);
@@ -4424,7 +4464,8 @@ int wake_up_state(struct task_struct *p, unsigned int state)
* Perform scheduler related setup for a newly forked process p.
* p is forked by current.
*
- * __sched_fork() is basic setup used by init_idle() too:
+ * __sched_fork() is basic setup which is also used by sched_init() to
+ * initialize the boot CPU's idle task.
*/
static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
{
@@ -5517,7 +5558,7 @@ unsigned long long task_sched_runtime(struct task_struct *p)
* project cycles that may never be accounted to this
* thread, breaking clock_gettime().
*/
- if (task_current(rq, p) && task_on_rq_queued(p)) {
+ if (task_current_donor(rq, p) && task_on_rq_queued(p)) {
prefetch_curr_exec_start(p);
update_rq_clock(rq);
p->sched_class->update_curr(rq);
@@ -5585,7 +5626,8 @@ void sched_tick(void)
{
int cpu = smp_processor_id();
struct rq *rq = cpu_rq(cpu);
- struct task_struct *curr;
+ /* accounting goes to the donor task */
+ struct task_struct *donor;
struct rq_flags rf;
unsigned long hw_pressure;
u64 resched_latency;
@@ -5596,19 +5638,23 @@ void sched_tick(void)
sched_clock_tick();
rq_lock(rq, &rf);
+ donor = rq->donor;
- curr = rq->curr;
- psi_account_irqtime(rq, curr, NULL);
+ psi_account_irqtime(rq, donor, NULL);
update_rq_clock(rq);
hw_pressure = arch_scale_hw_pressure(cpu_of(rq));
update_hw_load_avg(rq_clock_task(rq), rq, hw_pressure);
- curr->sched_class->task_tick(rq, curr, 0);
+
+ if (dynamic_preempt_lazy() && tif_test_bit(TIF_NEED_RESCHED_LAZY))
+ resched_curr(rq);
+
+ donor->sched_class->task_tick(rq, donor, 0);
if (sched_feat(LATENCY_WARN))
resched_latency = cpu_resched_latency(rq);
calc_global_load_tick(rq);
sched_core_tick(rq);
- task_tick_mm_cid(rq, curr);
+ task_tick_mm_cid(rq, donor);
scx_tick(rq);
rq_unlock(rq, &rf);
@@ -5618,8 +5664,8 @@ void sched_tick(void)
perf_event_task_tick();
- if (curr->flags & PF_WQ_WORKER)
- wq_worker_tick(curr);
+ if (donor->flags & PF_WQ_WORKER)
+ wq_worker_tick(donor);
#ifdef CONFIG_SMP
if (!scx_switched_all()) {
@@ -5686,6 +5732,12 @@ static void sched_tick_remote(struct work_struct *work)
struct task_struct *curr = rq->curr;
if (cpu_online(cpu)) {
+ /*
+ * Since this is a remote tick for full dynticks mode,
+ * we are always sure that there is no proxy (only a
+ * single task is running).
+ */
+ SCHED_WARN_ON(rq->curr != rq->donor);
update_rq_clock(rq);
if (!is_idle_task(curr)) {
@@ -6309,10 +6361,7 @@ static bool try_steal_cookie(int this, int that)
if (sched_task_is_throttled(p, this))
goto next;
- deactivate_task(src, p, 0);
- set_task_cpu(p, this);
- activate_task(dst, p, 0);
-
+ move_queued_task_locked(src, dst, p);
resched_curr(dst);
success = true;
@@ -6507,6 +6556,45 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
#define SM_RTLOCK_WAIT 2
/*
+ * Helper function for __schedule()
+ *
+ * If a task does not have signals pending, deactivate it
+ * Otherwise marks the task's __state as RUNNING
+ */
+static bool try_to_block_task(struct rq *rq, struct task_struct *p,
+ unsigned long task_state)
+{
+ int flags = DEQUEUE_NOCLOCK;
+
+ if (signal_pending_state(task_state, p)) {
+ WRITE_ONCE(p->__state, TASK_RUNNING);
+ return false;
+ }
+
+ p->sched_contributes_to_load =
+ (task_state & TASK_UNINTERRUPTIBLE) &&
+ !(task_state & TASK_NOLOAD) &&
+ !(task_state & TASK_FROZEN);
+
+ if (unlikely(is_special_task_state(task_state)))
+ flags |= DEQUEUE_SPECIAL;
+
+ /*
+ * __schedule() ttwu()
+ * prev_state = prev->state; if (p->on_rq && ...)
+ * if (prev_state) goto out;
+ * p->on_rq = 0; smp_acquire__after_ctrl_dep();
+ * p->state = TASK_WAKING
+ *
+ * Where __schedule() and ttwu() have matching control dependencies.
+ *
+ * After this, schedule() must not care about p->state any more.
+ */
+ block_task(rq, p, flags);
+ return true;
+}
+
+/*
* __schedule() is the main scheduler function.
*
* The main means of driving the scheduler and thus entering this function are:
@@ -6614,37 +6702,12 @@ static void __sched notrace __schedule(int sched_mode)
goto picked;
}
} else if (!preempt && prev_state) {
- if (signal_pending_state(prev_state, prev)) {
- WRITE_ONCE(prev->__state, TASK_RUNNING);
- } else {
- int flags = DEQUEUE_NOCLOCK;
-
- prev->sched_contributes_to_load =
- (prev_state & TASK_UNINTERRUPTIBLE) &&
- !(prev_state & TASK_NOLOAD) &&
- !(prev_state & TASK_FROZEN);
-
- if (unlikely(is_special_task_state(prev_state)))
- flags |= DEQUEUE_SPECIAL;
-
- /*
- * __schedule() ttwu()
- * prev_state = prev->state; if (p->on_rq && ...)
- * if (prev_state) goto out;
- * p->on_rq = 0; smp_acquire__after_ctrl_dep();
- * p->state = TASK_WAKING
- *
- * Where __schedule() and ttwu() have matching control dependencies.
- *
- * After this, schedule() must not care about p->state any more.
- */
- block_task(rq, prev, flags);
- block = true;
- }
+ block = try_to_block_task(rq, prev, prev_state);
switch_count = &prev->nvcsw;
}
next = pick_next_task(rq, prev, &rf);
+ rq_set_donor(rq, next);
picked:
clear_tsk_need_resched(prev);
clear_preempt_need_resched();
@@ -7151,7 +7214,7 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
dequeue_task(rq, p, DEQUEUE_SLEEP | DEQUEUE_DELAYED | DEQUEUE_NOCLOCK);
queued = task_on_rq_queued(p);
- running = task_current(rq, p);
+ running = task_current_donor(rq, p);
if (queued)
dequeue_task(rq, p, queue_flag);
if (running)
@@ -7351,6 +7414,7 @@ EXPORT_SYMBOL(__cond_resched_rwlock_write);
* preempt_schedule <- NOP
* preempt_schedule_notrace <- NOP
* irqentry_exit_cond_resched <- NOP
+ * dynamic_preempt_lazy <- false
*
* VOLUNTARY:
* cond_resched <- __cond_resched
@@ -7358,6 +7422,7 @@ EXPORT_SYMBOL(__cond_resched_rwlock_write);
* preempt_schedule <- NOP
* preempt_schedule_notrace <- NOP
* irqentry_exit_cond_resched <- NOP
+ * dynamic_preempt_lazy <- false
*
* FULL:
* cond_resched <- RET0
@@ -7365,6 +7430,15 @@ EXPORT_SYMBOL(__cond_resched_rwlock_write);
* preempt_schedule <- preempt_schedule
* preempt_schedule_notrace <- preempt_schedule_notrace
* irqentry_exit_cond_resched <- irqentry_exit_cond_resched
+ * dynamic_preempt_lazy <- false
+ *
+ * LAZY:
+ * cond_resched <- RET0
+ * might_resched <- RET0
+ * preempt_schedule <- preempt_schedule
+ * preempt_schedule_notrace <- preempt_schedule_notrace
+ * irqentry_exit_cond_resched <- irqentry_exit_cond_resched
+ * dynamic_preempt_lazy <- true
*/
enum {
@@ -7372,30 +7446,41 @@ enum {
preempt_dynamic_none,
preempt_dynamic_voluntary,
preempt_dynamic_full,
+ preempt_dynamic_lazy,
};
int preempt_dynamic_mode = preempt_dynamic_undefined;
int sched_dynamic_mode(const char *str)
{
+#ifndef CONFIG_PREEMPT_RT
if (!strcmp(str, "none"))
return preempt_dynamic_none;
if (!strcmp(str, "voluntary"))
return preempt_dynamic_voluntary;
+#endif
if (!strcmp(str, "full"))
return preempt_dynamic_full;
+#ifdef CONFIG_ARCH_HAS_PREEMPT_LAZY
+ if (!strcmp(str, "lazy"))
+ return preempt_dynamic_lazy;
+#endif
+
return -EINVAL;
}
+#define preempt_dynamic_key_enable(f) static_key_enable(&sk_dynamic_##f.key)
+#define preempt_dynamic_key_disable(f) static_key_disable(&sk_dynamic_##f.key)
+
#if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL)
#define preempt_dynamic_enable(f) static_call_update(f, f##_dynamic_enabled)
#define preempt_dynamic_disable(f) static_call_update(f, f##_dynamic_disabled)
#elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY)
-#define preempt_dynamic_enable(f) static_key_enable(&sk_dynamic_##f.key)
-#define preempt_dynamic_disable(f) static_key_disable(&sk_dynamic_##f.key)
+#define preempt_dynamic_enable(f) preempt_dynamic_key_enable(f)
+#define preempt_dynamic_disable(f) preempt_dynamic_key_disable(f)
#else
#error "Unsupported PREEMPT_DYNAMIC mechanism"
#endif
@@ -7415,6 +7500,7 @@ static void __sched_dynamic_update(int mode)
preempt_dynamic_enable(preempt_schedule);
preempt_dynamic_enable(preempt_schedule_notrace);
preempt_dynamic_enable(irqentry_exit_cond_resched);
+ preempt_dynamic_key_disable(preempt_lazy);
switch (mode) {
case preempt_dynamic_none:
@@ -7424,6 +7510,7 @@ static void __sched_dynamic_update(int mode)
preempt_dynamic_disable(preempt_schedule);
preempt_dynamic_disable(preempt_schedule_notrace);
preempt_dynamic_disable(irqentry_exit_cond_resched);
+ preempt_dynamic_key_disable(preempt_lazy);
if (mode != preempt_dynamic_mode)
pr_info("Dynamic Preempt: none\n");
break;
@@ -7435,6 +7522,7 @@ static void __sched_dynamic_update(int mode)
preempt_dynamic_disable(preempt_schedule);
preempt_dynamic_disable(preempt_schedule_notrace);
preempt_dynamic_disable(irqentry_exit_cond_resched);
+ preempt_dynamic_key_disable(preempt_lazy);
if (mode != preempt_dynamic_mode)
pr_info("Dynamic Preempt: voluntary\n");
break;
@@ -7446,9 +7534,22 @@ static void __sched_dynamic_update(int mode)
preempt_dynamic_enable(preempt_schedule);
preempt_dynamic_enable(preempt_schedule_notrace);
preempt_dynamic_enable(irqentry_exit_cond_resched);
+ preempt_dynamic_key_disable(preempt_lazy);
if (mode != preempt_dynamic_mode)
pr_info("Dynamic Preempt: full\n");
break;
+
+ case preempt_dynamic_lazy:
+ if (!klp_override)
+ preempt_dynamic_disable(cond_resched);
+ preempt_dynamic_disable(might_resched);
+ preempt_dynamic_enable(preempt_schedule);
+ preempt_dynamic_enable(preempt_schedule_notrace);
+ preempt_dynamic_enable(irqentry_exit_cond_resched);
+ preempt_dynamic_key_enable(preempt_lazy);
+ if (mode != preempt_dynamic_mode)
+ pr_info("Dynamic Preempt: lazy\n");
+ break;
}
preempt_dynamic_mode = mode;
@@ -7511,6 +7612,8 @@ static void __init preempt_dynamic_init(void)
sched_dynamic_update(preempt_dynamic_none);
} else if (IS_ENABLED(CONFIG_PREEMPT_VOLUNTARY)) {
sched_dynamic_update(preempt_dynamic_voluntary);
+ } else if (IS_ENABLED(CONFIG_PREEMPT_LAZY)) {
+ sched_dynamic_update(preempt_dynamic_lazy);
} else {
/* Default static call setting, nothing to do */
WARN_ON_ONCE(!IS_ENABLED(CONFIG_PREEMPT));
@@ -7531,6 +7634,7 @@ static void __init preempt_dynamic_init(void)
PREEMPT_MODEL_ACCESSOR(none);
PREEMPT_MODEL_ACCESSOR(voluntary);
PREEMPT_MODEL_ACCESSOR(full);
+PREEMPT_MODEL_ACCESSOR(lazy);
#else /* !CONFIG_PREEMPT_DYNAMIC: */
@@ -7683,8 +7787,6 @@ void __init init_idle(struct task_struct *idle, int cpu)
struct rq *rq = cpu_rq(cpu);
unsigned long flags;
- __sched_fork(0, idle);
-
raw_spin_lock_irqsave(&idle->pi_lock, flags);
raw_spin_rq_lock(rq);
@@ -7699,10 +7801,8 @@ void __init init_idle(struct task_struct *idle, int cpu)
#ifdef CONFIG_SMP
/*
- * It's possible that init_idle() gets called multiple times on a task,
- * in that case do_set_cpus_allowed() will not do the right thing.
- *
- * And since this is boot we can forgo the serialization.
+ * No validation and serialization required at boot time and for
+ * setting up the idle tasks of not yet online CPUs.
*/
set_cpus_allowed_common(idle, &ac);
#endif
@@ -7721,6 +7821,7 @@ void __init init_idle(struct task_struct *idle, int cpu)
rcu_read_unlock();
rq->idle = idle;
+ rq_set_donor(rq, idle);
rcu_assign_pointer(rq->curr, idle);
idle->on_rq = TASK_ON_RQ_QUEUED;
#ifdef CONFIG_SMP
@@ -7810,7 +7911,7 @@ void sched_setnuma(struct task_struct *p, int nid)
rq = task_rq_lock(p, &rf);
queued = task_on_rq_queued(p);
- running = task_current(rq, p);
+ running = task_current_donor(rq, p);
if (queued)
dequeue_task(rq, p, DEQUEUE_SAVE);
@@ -8546,6 +8647,7 @@ void __init sched_init(void)
* but because we are the idle thread, we just pick up running again
* when this runqueue becomes "idle".
*/
+ __sched_fork(0, current);
init_idle(current, smp_processor_id());
calc_load_update = jiffies + LOAD_FREQ;
@@ -8960,7 +9062,7 @@ void sched_move_task(struct task_struct *tsk)
update_rq_clock(rq);
- running = task_current(rq, tsk);
+ running = task_current_donor(rq, tsk);
queued = task_on_rq_queued(tsk);
if (queued)
@@ -10253,6 +10355,7 @@ int __sched_mm_cid_migrate_from_try_steal_cid(struct rq *src_rq,
*/
if (!try_cmpxchg(&src_pcpu_cid->cid, &lazy_cid, MM_CID_UNSET))
return -1;
+ WRITE_ONCE(src_pcpu_cid->recent_cid, MM_CID_UNSET);
return src_cid;
}
@@ -10265,7 +10368,8 @@ void sched_mm_cid_migrate_to(struct rq *dst_rq, struct task_struct *t)
{
struct mm_cid *src_pcpu_cid, *dst_pcpu_cid;
struct mm_struct *mm = t->mm;
- int src_cid, dst_cid, src_cpu;
+ int src_cid, src_cpu;
+ bool dst_cid_is_set;
struct rq *src_rq;
lockdep_assert_rq_held(dst_rq);
@@ -10282,9 +10386,9 @@ void sched_mm_cid_migrate_to(struct rq *dst_rq, struct task_struct *t)
* allocation closest to 0 in cases where few threads migrate around
* many CPUs.
*
- * If destination cid is already set, we may have to just clear
- * the src cid to ensure compactness in frequent migrations
- * scenarios.
+ * If destination cid or recent cid is already set, we may have
+ * to just clear the src cid to ensure compactness in frequent
+ * migrations scenarios.
*
* It is not useful to clear the src cid when the number of threads is
* greater or equal to the number of allowed CPUs, because user-space
@@ -10292,9 +10396,9 @@ void sched_mm_cid_migrate_to(struct rq *dst_rq, struct task_struct *t)
* allowed CPUs.
*/
dst_pcpu_cid = per_cpu_ptr(mm->pcpu_cid, cpu_of(dst_rq));
- dst_cid = READ_ONCE(dst_pcpu_cid->cid);
- if (!mm_cid_is_unset(dst_cid) &&
- atomic_read(&mm->mm_users) >= t->nr_cpus_allowed)
+ dst_cid_is_set = !mm_cid_is_unset(READ_ONCE(dst_pcpu_cid->cid)) ||
+ !mm_cid_is_unset(READ_ONCE(dst_pcpu_cid->recent_cid));
+ if (dst_cid_is_set && atomic_read(&mm->mm_users) >= READ_ONCE(mm->nr_cpus_allowed))
return;
src_pcpu_cid = per_cpu_ptr(mm->pcpu_cid, src_cpu);
src_rq = cpu_rq(src_cpu);
@@ -10305,13 +10409,14 @@ void sched_mm_cid_migrate_to(struct rq *dst_rq, struct task_struct *t)
src_cid);
if (src_cid == -1)
return;
- if (!mm_cid_is_unset(dst_cid)) {
+ if (dst_cid_is_set) {
__mm_cid_put(mm, src_cid);
return;
}
/* Move src_cid to dst cpu. */
mm_cid_snapshot_time(dst_rq, mm);
WRITE_ONCE(dst_pcpu_cid->cid, src_cid);
+ WRITE_ONCE(dst_pcpu_cid->recent_cid, src_cid);
}
static void sched_mm_cid_remote_clear(struct mm_struct *mm, struct mm_cid *pcpu_cid,
@@ -10550,7 +10655,7 @@ void sched_mm_cid_after_execve(struct task_struct *t)
* Matches barrier in sched_mm_cid_remote_clear_old().
*/
smp_mb();
- t->last_mm_cid = t->mm_cid = mm_cid_get(rq, mm);
+ t->last_mm_cid = t->mm_cid = mm_cid_get(rq, t, mm);
}
rseq_set_notify_resume(t);
}
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index c6ba15388ea7..28c77904ea74 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -783,9 +783,8 @@ static int sugov_init(struct cpufreq_policy *policy)
if (ret)
goto fail;
- sugov_eas_rebuild_sd();
-
out:
+ sugov_eas_rebuild_sd();
mutex_unlock(&global_tunables_lock);
return 0;
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index be1b917dc8ce..d9d5a702f1a6 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -1339,7 +1339,7 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
#endif
enqueue_task_dl(rq, p, ENQUEUE_REPLENISH);
- if (dl_task(rq->curr))
+ if (dl_task(rq->donor))
wakeup_preempt_dl(rq, p, 0);
else
resched_curr(rq);
@@ -1736,11 +1736,11 @@ int dl_server_apply_params(struct sched_dl_entity *dl_se, u64 runtime, u64 perio
*/
static void update_curr_dl(struct rq *rq)
{
- struct task_struct *curr = rq->curr;
- struct sched_dl_entity *dl_se = &curr->dl;
+ struct task_struct *donor = rq->donor;
+ struct sched_dl_entity *dl_se = &donor->dl;
s64 delta_exec;
- if (!dl_task(curr) || !on_dl_rq(dl_se))
+ if (!dl_task(donor) || !on_dl_rq(dl_se))
return;
/*
@@ -2213,7 +2213,7 @@ static int find_later_rq(struct task_struct *task);
static int
select_task_rq_dl(struct task_struct *p, int cpu, int flags)
{
- struct task_struct *curr;
+ struct task_struct *curr, *donor;
bool select_rq;
struct rq *rq;
@@ -2224,6 +2224,7 @@ select_task_rq_dl(struct task_struct *p, int cpu, int flags)
rcu_read_lock();
curr = READ_ONCE(rq->curr); /* unlocked access */
+ donor = READ_ONCE(rq->donor);
/*
* If we are dealing with a -deadline task, we must
@@ -2234,9 +2235,9 @@ select_task_rq_dl(struct task_struct *p, int cpu, int flags)
* other hand, if it has a shorter deadline, we
* try to make it stay here, it might be important.
*/
- select_rq = unlikely(dl_task(curr)) &&
+ select_rq = unlikely(dl_task(donor)) &&
(curr->nr_cpus_allowed < 2 ||
- !dl_entity_preempt(&p->dl, &curr->dl)) &&
+ !dl_entity_preempt(&p->dl, &donor->dl)) &&
p->nr_cpus_allowed > 1;
/*
@@ -2299,7 +2300,7 @@ static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p)
* let's hope p can move out.
*/
if (rq->curr->nr_cpus_allowed == 1 ||
- !cpudl_find(&rq->rd->cpudl, rq->curr, NULL))
+ !cpudl_find(&rq->rd->cpudl, rq->donor, NULL))
return;
/*
@@ -2338,7 +2339,7 @@ static int balance_dl(struct rq *rq, struct task_struct *p, struct rq_flags *rf)
static void wakeup_preempt_dl(struct rq *rq, struct task_struct *p,
int flags)
{
- if (dl_entity_preempt(&p->dl, &rq->curr->dl)) {
+ if (dl_entity_preempt(&p->dl, &rq->donor->dl)) {
resched_curr(rq);
return;
}
@@ -2348,7 +2349,7 @@ static void wakeup_preempt_dl(struct rq *rq, struct task_struct *p,
* In the unlikely case current and p have the same deadline
* let us try to decide what's the best thing to do...
*/
- if ((p->dl.deadline == rq->curr->dl.deadline) &&
+ if ((p->dl.deadline == rq->donor->dl.deadline) &&
!test_tsk_need_resched(rq->curr))
check_preempt_equal_dl(rq, p);
#endif /* CONFIG_SMP */
@@ -2380,7 +2381,7 @@ static void set_next_task_dl(struct rq *rq, struct task_struct *p, bool first)
if (!first)
return;
- if (rq->curr->sched_class != &dl_sched_class)
+ if (rq->donor->sched_class != &dl_sched_class)
update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 0);
deadline_queue_push_tasks(rq);
@@ -2487,14 +2488,6 @@ static void task_fork_dl(struct task_struct *p)
/* Only try algorithms three times */
#define DL_MAX_TRIES 3
-static int pick_dl_task(struct rq *rq, struct task_struct *p, int cpu)
-{
- if (!task_on_cpu(rq, p) &&
- cpumask_test_cpu(cpu, &p->cpus_mask))
- return 1;
- return 0;
-}
-
/*
* Return the earliest pushable rq's task, which is suitable to be executed
* on the CPU, NULL otherwise:
@@ -2513,7 +2506,7 @@ next_node:
if (next_node) {
p = __node_2_pdl(next_node);
- if (pick_dl_task(rq, p, cpu))
+ if (task_is_pushable(rq, p, cpu))
return p;
next_node = rb_next(next_node);
@@ -2707,8 +2700,8 @@ retry:
* can move away, it makes sense to just reschedule
* without going further in pushing next_task.
*/
- if (dl_task(rq->curr) &&
- dl_time_before(next_task->dl.deadline, rq->curr->dl.deadline) &&
+ if (dl_task(rq->donor) &&
+ dl_time_before(next_task->dl.deadline, rq->donor->dl.deadline) &&
rq->curr->nr_cpus_allowed > 1) {
resched_curr(rq);
return 0;
@@ -2751,9 +2744,7 @@ retry:
goto retry;
}
- deactivate_task(rq, next_task, 0);
- set_task_cpu(next_task, later_rq->cpu);
- activate_task(later_rq, next_task, 0);
+ move_queued_task_locked(rq, later_rq, next_task);
ret = 1;
resched_curr(later_rq);
@@ -2833,15 +2824,13 @@ static void pull_dl_task(struct rq *this_rq)
* deadline than the current task of its runqueue.
*/
if (dl_time_before(p->dl.deadline,
- src_rq->curr->dl.deadline))
+ src_rq->donor->dl.deadline))
goto skip;
if (is_migration_disabled(p)) {
push_task = get_push_task(src_rq);
} else {
- deactivate_task(src_rq, p, 0);
- set_task_cpu(p, this_cpu);
- activate_task(this_rq, p, 0);
+ move_queued_task_locked(src_rq, this_rq, p);
dmin = p->dl.deadline;
resched = true;
}
@@ -2874,9 +2863,9 @@ static void task_woken_dl(struct rq *rq, struct task_struct *p)
if (!task_on_cpu(rq, p) &&
!test_tsk_need_resched(rq->curr) &&
p->nr_cpus_allowed > 1 &&
- dl_task(rq->curr) &&
+ dl_task(rq->donor) &&
(rq->curr->nr_cpus_allowed < 2 ||
- !dl_entity_preempt(&p->dl, &rq->curr->dl))) {
+ !dl_entity_preempt(&p->dl, &rq->donor->dl))) {
push_dl_tasks(rq);
}
}
@@ -3051,12 +3040,12 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p)
return;
}
- if (rq->curr != p) {
+ if (rq->donor != p) {
#ifdef CONFIG_SMP
if (p->nr_cpus_allowed > 1 && rq->dl.overloaded)
deadline_queue_push_tasks(rq);
#endif
- if (dl_task(rq->curr))
+ if (dl_task(rq->donor))
wakeup_preempt_dl(rq, p, 0);
else
resched_curr(rq);
@@ -3085,7 +3074,7 @@ static void prio_changed_dl(struct rq *rq, struct task_struct *p,
if (!rq->dl.overloaded)
deadline_queue_pull_task(rq);
- if (task_current(rq, p)) {
+ if (task_current_donor(rq, p)) {
/*
* If we now have a earlier deadline task than p,
* then reschedule, provided p is still on this
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index f4035c7a0fa1..a48b2a701ec2 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -245,11 +245,12 @@ static ssize_t sched_dynamic_write(struct file *filp, const char __user *ubuf,
static int sched_dynamic_show(struct seq_file *m, void *v)
{
static const char * preempt_modes[] = {
- "none", "voluntary", "full"
+ "none", "voluntary", "full", "lazy",
};
- int i;
+ int j = ARRAY_SIZE(preempt_modes) - !IS_ENABLED(CONFIG_ARCH_HAS_PREEMPT_LAZY);
+ int i = IS_ENABLED(CONFIG_PREEMPT_RT) * 2;
- for (i = 0; i < ARRAY_SIZE(preempt_modes); i++) {
+ for (; i < j; i++) {
if (preempt_dynamic_mode == i)
seq_puts(m, "(");
seq_puts(m, preempt_modes[i]);
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 751d73d500e5..7fff1d045477 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -199,8 +199,10 @@ struct scx_dump_ctx {
/**
* struct sched_ext_ops - Operation table for BPF scheduler implementation
*
- * Userland can implement an arbitrary scheduling policy by implementing and
- * loading operations in this table.
+ * A BPF scheduler can implement an arbitrary scheduling policy by
+ * implementing and loading operations in this table. Note that a userland
+ * scheduling policy can also be implemented using the BPF scheduler
+ * as a shim layer.
*/
struct sched_ext_ops {
/**
@@ -218,10 +220,15 @@ struct sched_ext_ops {
* dispatch. While an explicit custom mechanism can be added,
* select_cpu() serves as the default way to wake up idle CPUs.
*
- * @p may be dispatched directly by calling scx_bpf_dispatch(). If @p
- * is dispatched, the ops.enqueue() callback will be skipped. Finally,
- * if @p is dispatched to SCX_DSQ_LOCAL, it will be dispatched to the
- * local DSQ of whatever CPU is returned by this callback.
+ * @p may be inserted into a DSQ directly by calling
+ * scx_bpf_dsq_insert(). If so, the ops.enqueue() will be skipped.
+ * Directly inserting into %SCX_DSQ_LOCAL will put @p in the local DSQ
+ * of the CPU returned by this operation.
+ *
+ * Note that select_cpu() is never called for tasks that can only run
+ * on a single CPU or tasks with migration disabled, as they don't have
+ * the option to select a different CPU. See select_task_rq() for
+ * details.
*/
s32 (*select_cpu)(struct task_struct *p, s32 prev_cpu, u64 wake_flags);
@@ -230,12 +237,12 @@ struct sched_ext_ops {
* @p: task being enqueued
* @enq_flags: %SCX_ENQ_*
*
- * @p is ready to run. Dispatch directly by calling scx_bpf_dispatch()
- * or enqueue on the BPF scheduler. If not directly dispatched, the bpf
- * scheduler owns @p and if it fails to dispatch @p, the task will
- * stall.
+ * @p is ready to run. Insert directly into a DSQ by calling
+ * scx_bpf_dsq_insert() or enqueue on the BPF scheduler. If not directly
+ * inserted, the bpf scheduler owns @p and if it fails to dispatch @p,
+ * the task will stall.
*
- * If @p was dispatched from ops.select_cpu(), this callback is
+ * If @p was inserted into a DSQ from ops.select_cpu(), this callback is
* skipped.
*/
void (*enqueue)(struct task_struct *p, u64 enq_flags);
@@ -257,17 +264,17 @@ struct sched_ext_ops {
void (*dequeue)(struct task_struct *p, u64 deq_flags);
/**
- * dispatch - Dispatch tasks from the BPF scheduler and/or consume DSQs
+ * dispatch - Dispatch tasks from the BPF scheduler and/or user DSQs
* @cpu: CPU to dispatch tasks for
* @prev: previous task being switched out
*
* Called when a CPU's local dsq is empty. The operation should dispatch
* one or more tasks from the BPF scheduler into the DSQs using
- * scx_bpf_dispatch() and/or consume user DSQs into the local DSQ using
- * scx_bpf_consume().
+ * scx_bpf_dsq_insert() and/or move from user DSQs into the local DSQ
+ * using scx_bpf_dsq_move_to_local().
*
- * The maximum number of times scx_bpf_dispatch() can be called without
- * an intervening scx_bpf_consume() is specified by
+ * The maximum number of times scx_bpf_dsq_insert() can be called
+ * without an intervening scx_bpf_dsq_move_to_local() is specified by
* ops.dispatch_max_batch. See the comments on top of the two functions
* for more details.
*
@@ -275,7 +282,7 @@ struct sched_ext_ops {
* @prev is still runnable as indicated by set %SCX_TASK_QUEUED in
* @prev->scx.flags, it is not enqueued yet and will be enqueued after
* ops.dispatch() returns. To keep executing @prev, return without
- * dispatching or consuming any tasks. Also see %SCX_OPS_ENQ_LAST.
+ * dispatching or moving any tasks. Also see %SCX_OPS_ENQ_LAST.
*/
void (*dispatch)(s32 cpu, struct task_struct *prev);
@@ -594,7 +601,7 @@ struct sched_ext_ops {
* Update @tg's weight to @weight.
*/
void (*cgroup_set_weight)(struct cgroup *cgrp, u32 weight);
-#endif /* CONFIG_CGROUPS */
+#endif /* CONFIG_EXT_GROUP_SCHED */
/*
* All online ops must come before ops.cpu_online().
@@ -707,7 +714,7 @@ enum scx_enq_flags {
/*
* Set the following to trigger preemption when calling
- * scx_bpf_dispatch() with a local dsq as the target. The slice of the
+ * scx_bpf_dsq_insert() with a local dsq as the target. The slice of the
* current task is cleared to zero and the CPU is kicked into the
* scheduling path. Implies %SCX_ENQ_HEAD.
*/
@@ -862,8 +869,9 @@ static DEFINE_MUTEX(scx_ops_enable_mutex);
DEFINE_STATIC_KEY_FALSE(__scx_ops_enabled);
DEFINE_STATIC_PERCPU_RWSEM(scx_fork_rwsem);
static atomic_t scx_ops_enable_state_var = ATOMIC_INIT(SCX_OPS_DISABLED);
+static unsigned long scx_in_softlockup;
+static atomic_t scx_ops_breather_depth = ATOMIC_INIT(0);
static int scx_ops_bypass_depth;
-static DEFINE_RAW_SPINLOCK(__scx_ops_bypass_lock);
static bool scx_ops_init_task_enabled;
static bool scx_switching_all;
DEFINE_STATIC_KEY_FALSE(__scx_switched_all);
@@ -876,6 +884,11 @@ static DEFINE_STATIC_KEY_FALSE(scx_ops_enq_exiting);
static DEFINE_STATIC_KEY_FALSE(scx_ops_cpu_preempt);
static DEFINE_STATIC_KEY_FALSE(scx_builtin_idle_enabled);
+#ifdef CONFIG_SMP
+static DEFINE_STATIC_KEY_FALSE(scx_selcpu_topo_llc);
+static DEFINE_STATIC_KEY_FALSE(scx_selcpu_topo_numa);
+#endif
+
static struct static_key_false scx_has_op[SCX_OPI_END] =
{ [0 ... SCX_OPI_END-1] = STATIC_KEY_FALSE_INIT };
@@ -2309,7 +2322,7 @@ static bool task_can_run_on_remote_rq(struct task_struct *p, struct rq *rq,
/*
* We don't require the BPF scheduler to avoid dispatching to offline
* CPUs mostly for convenience but also because CPUs can go offline
- * between scx_bpf_dispatch() calls and here. Trigger error iff the
+ * between scx_bpf_dsq_insert() calls and here. Trigger error iff the
* picked CPU is outside the allowed mask.
*/
if (!task_allowed_on_cpu(p, cpu)) {
@@ -2397,11 +2410,115 @@ static inline bool task_can_run_on_remote_rq(struct task_struct *p, struct rq *r
static inline bool consume_remote_task(struct rq *this_rq, struct task_struct *p, struct scx_dispatch_q *dsq, struct rq *task_rq) { return false; }
#endif /* CONFIG_SMP */
+/**
+ * move_task_between_dsqs() - Move a task from one DSQ to another
+ * @p: target task
+ * @enq_flags: %SCX_ENQ_*
+ * @src_dsq: DSQ @p is currently on, must not be a local DSQ
+ * @dst_dsq: DSQ @p is being moved to, can be any DSQ
+ *
+ * Must be called with @p's task_rq and @src_dsq locked. If @dst_dsq is a local
+ * DSQ and @p is on a different CPU, @p will be migrated and thus its task_rq
+ * will change. As @p's task_rq is locked, this function doesn't need to use the
+ * holding_cpu mechanism.
+ *
+ * On return, @src_dsq is unlocked and only @p's new task_rq, which is the
+ * return value, is locked.
+ */
+static struct rq *move_task_between_dsqs(struct task_struct *p, u64 enq_flags,
+ struct scx_dispatch_q *src_dsq,
+ struct scx_dispatch_q *dst_dsq)
+{
+ struct rq *src_rq = task_rq(p), *dst_rq;
+
+ BUG_ON(src_dsq->id == SCX_DSQ_LOCAL);
+ lockdep_assert_held(&src_dsq->lock);
+ lockdep_assert_rq_held(src_rq);
+
+ if (dst_dsq->id == SCX_DSQ_LOCAL) {
+ dst_rq = container_of(dst_dsq, struct rq, scx.local_dsq);
+ if (!task_can_run_on_remote_rq(p, dst_rq, true)) {
+ dst_dsq = find_global_dsq(p);
+ dst_rq = src_rq;
+ }
+ } else {
+ /* no need to migrate if destination is a non-local DSQ */
+ dst_rq = src_rq;
+ }
+
+ /*
+ * Move @p into $dst_dsq. If $dst_dsq is the local DSQ of a different
+ * CPU, @p will be migrated.
+ */
+ if (dst_dsq->id == SCX_DSQ_LOCAL) {
+ /* @p is going from a non-local DSQ to a local DSQ */
+ if (src_rq == dst_rq) {
+ task_unlink_from_dsq(p, src_dsq);
+ move_local_task_to_local_dsq(p, enq_flags,
+ src_dsq, dst_rq);
+ raw_spin_unlock(&src_dsq->lock);
+ } else {
+ raw_spin_unlock(&src_dsq->lock);
+ move_remote_task_to_local_dsq(p, enq_flags,
+ src_rq, dst_rq);
+ }
+ } else {
+ /*
+ * @p is going from a non-local DSQ to a non-local DSQ. As
+ * $src_dsq is already locked, do an abbreviated dequeue.
+ */
+ task_unlink_from_dsq(p, src_dsq);
+ p->scx.dsq = NULL;
+ raw_spin_unlock(&src_dsq->lock);
+
+ dispatch_enqueue(dst_dsq, p, enq_flags);
+ }
+
+ return dst_rq;
+}
+
+/*
+ * A poorly behaving BPF scheduler can live-lock the system by e.g. incessantly
+ * banging on the same DSQ on a large NUMA system to the point where switching
+ * to the bypass mode can take a long time. Inject artifical delays while the
+ * bypass mode is switching to guarantee timely completion.
+ */
+static void scx_ops_breather(struct rq *rq)
+{
+ u64 until;
+
+ lockdep_assert_rq_held(rq);
+
+ if (likely(!atomic_read(&scx_ops_breather_depth)))
+ return;
+
+ raw_spin_rq_unlock(rq);
+
+ until = ktime_get_ns() + NSEC_PER_MSEC;
+
+ do {
+ int cnt = 1024;
+ while (atomic_read(&scx_ops_breather_depth) && --cnt)
+ cpu_relax();
+ } while (atomic_read(&scx_ops_breather_depth) &&
+ time_before64(ktime_get_ns(), until));
+
+ raw_spin_rq_lock(rq);
+}
+
static bool consume_dispatch_q(struct rq *rq, struct scx_dispatch_q *dsq)
{
struct task_struct *p;
retry:
/*
+ * This retry loop can repeatedly race against scx_ops_bypass()
+ * dequeueing tasks from @dsq trying to put the system into the bypass
+ * mode. On some multi-socket machines (e.g. 2x Intel 8480c), this can
+ * live-lock the machine into soft lockups. Give a breather.
+ */
+ scx_ops_breather(rq);
+
+ /*
* The caller can't expect to successfully consume a task if the task's
* addition to @dsq isn't guaranteed to be visible somehow. Test
* @dsq->list without locking and skip if it seems empty.
@@ -2541,7 +2658,7 @@ static void dispatch_to_local_dsq(struct rq *rq, struct scx_dispatch_q *dst_dsq,
* Dispatching to local DSQs may need to wait for queueing to complete or
* require rq lock dancing. As we don't wanna do either while inside
* ops.dispatch() to avoid locking order inversion, we split dispatching into
- * two parts. scx_bpf_dispatch() which is called by ops.dispatch() records the
+ * two parts. scx_bpf_dsq_insert() which is called by ops.dispatch() records the
* task and its qseq. Once ops.dispatch() returns, this function is called to
* finish up.
*
@@ -2573,7 +2690,7 @@ retry:
/*
* If qseq doesn't match, @p has gone through at least one
* dispatch/dequeue and re-enqueue cycle between
- * scx_bpf_dispatch() and here and we have no claim on it.
+ * scx_bpf_dsq_insert() and here and we have no claim on it.
*/
if ((opss & SCX_OPSS_QSEQ_MASK) != qseq_at_dispatch)
return;
@@ -2642,7 +2759,7 @@ static int balance_one(struct rq *rq, struct task_struct *prev)
* If the previous sched_class for the current CPU was not SCX,
* notify the BPF scheduler that it again has control of the
* core. This callback complements ->cpu_release(), which is
- * emitted in scx_next_task_picked().
+ * emitted in switch_class().
*/
if (SCX_HAS_OP(cpu_acquire))
SCX_CALL_OP(SCX_KF_REST, cpu_acquire, cpu_of(rq), NULL);
@@ -3098,28 +3215,216 @@ found:
goto retry;
}
+/*
+ * Return true if the LLC domains do not perfectly overlap with the NUMA
+ * domains, false otherwise.
+ */
+static bool llc_numa_mismatch(void)
+{
+ int cpu;
+
+ /*
+ * We need to scan all online CPUs to verify whether their scheduling
+ * domains overlap.
+ *
+ * While it is rare to encounter architectures with asymmetric NUMA
+ * topologies, CPU hotplugging or virtualized environments can result
+ * in asymmetric configurations.
+ *
+ * For example:
+ *
+ * NUMA 0:
+ * - LLC 0: cpu0..cpu7
+ * - LLC 1: cpu8..cpu15 [offline]
+ *
+ * NUMA 1:
+ * - LLC 0: cpu16..cpu23
+ * - LLC 1: cpu24..cpu31
+ *
+ * In this case, if we only check the first online CPU (cpu0), we might
+ * incorrectly assume that the LLC and NUMA domains are fully
+ * overlapping, which is incorrect (as NUMA 1 has two distinct LLC
+ * domains).
+ */
+ for_each_online_cpu(cpu) {
+ const struct cpumask *numa_cpus;
+ struct sched_domain *sd;
+
+ sd = rcu_dereference(per_cpu(sd_llc, cpu));
+ if (!sd)
+ return true;
+
+ numa_cpus = cpumask_of_node(cpu_to_node(cpu));
+ if (sd->span_weight != cpumask_weight(numa_cpus))
+ return true;
+ }
+
+ return false;
+}
+
+/*
+ * Initialize topology-aware scheduling.
+ *
+ * Detect if the system has multiple LLC or multiple NUMA domains and enable
+ * cache-aware / NUMA-aware scheduling optimizations in the default CPU idle
+ * selection policy.
+ *
+ * Assumption: the kernel's internal topology representation assumes that each
+ * CPU belongs to a single LLC domain, and that each LLC domain is entirely
+ * contained within a single NUMA node.
+ */
+static void update_selcpu_topology(void)
+{
+ bool enable_llc = false, enable_numa = false;
+ struct sched_domain *sd;
+ const struct cpumask *cpus;
+ s32 cpu = cpumask_first(cpu_online_mask);
+
+ /*
+ * Enable LLC domain optimization only when there are multiple LLC
+ * domains among the online CPUs. If all online CPUs are part of a
+ * single LLC domain, the idle CPU selection logic can choose any
+ * online CPU without bias.
+ *
+ * Note that it is sufficient to check the LLC domain of the first
+ * online CPU to determine whether a single LLC domain includes all
+ * CPUs.
+ */
+ rcu_read_lock();
+ sd = rcu_dereference(per_cpu(sd_llc, cpu));
+ if (sd) {
+ if (sd->span_weight < num_online_cpus())
+ enable_llc = true;
+ }
+
+ /*
+ * Enable NUMA optimization only when there are multiple NUMA domains
+ * among the online CPUs and the NUMA domains don't perfectly overlaps
+ * with the LLC domains.
+ *
+ * If all CPUs belong to the same NUMA node and the same LLC domain,
+ * enabling both NUMA and LLC optimizations is unnecessary, as checking
+ * for an idle CPU in the same domain twice is redundant.
+ */
+ cpus = cpumask_of_node(cpu_to_node(cpu));
+ if ((cpumask_weight(cpus) < num_online_cpus()) && llc_numa_mismatch())
+ enable_numa = true;
+ rcu_read_unlock();
+
+ pr_debug("sched_ext: LLC idle selection %s\n",
+ enable_llc ? "enabled" : "disabled");
+ pr_debug("sched_ext: NUMA idle selection %s\n",
+ enable_numa ? "enabled" : "disabled");
+
+ if (enable_llc)
+ static_branch_enable_cpuslocked(&scx_selcpu_topo_llc);
+ else
+ static_branch_disable_cpuslocked(&scx_selcpu_topo_llc);
+ if (enable_numa)
+ static_branch_enable_cpuslocked(&scx_selcpu_topo_numa);
+ else
+ static_branch_disable_cpuslocked(&scx_selcpu_topo_numa);
+}
+
+/*
+ * Built-in CPU idle selection policy:
+ *
+ * 1. Prioritize full-idle cores:
+ * - always prioritize CPUs from fully idle cores (both logical CPUs are
+ * idle) to avoid interference caused by SMT.
+ *
+ * 2. Reuse the same CPU:
+ * - prefer the last used CPU to take advantage of cached data (L1, L2) and
+ * branch prediction optimizations.
+ *
+ * 3. Pick a CPU within the same LLC (Last-Level Cache):
+ * - if the above conditions aren't met, pick a CPU that shares the same LLC
+ * to maintain cache locality.
+ *
+ * 4. Pick a CPU within the same NUMA node, if enabled:
+ * - choose a CPU from the same NUMA node to reduce memory access latency.
+ *
+ * Step 3 and 4 are performed only if the system has, respectively, multiple
+ * LLC domains / multiple NUMA nodes (see scx_selcpu_topo_llc and
+ * scx_selcpu_topo_numa).
+ *
+ * NOTE: tasks that can only run on 1 CPU are excluded by this logic, because
+ * we never call ops.select_cpu() for them, see select_task_rq().
+ */
static s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu,
u64 wake_flags, bool *found)
{
+ const struct cpumask *llc_cpus = NULL;
+ const struct cpumask *numa_cpus = NULL;
s32 cpu;
*found = false;
+
+ /*
+ * This is necessary to protect llc_cpus.
+ */
+ rcu_read_lock();
+
+ /*
+ * Determine the scheduling domain only if the task is allowed to run
+ * on all CPUs.
+ *
+ * This is done primarily for efficiency, as it avoids the overhead of
+ * updating a cpumask every time we need to select an idle CPU (which
+ * can be costly in large SMP systems), but it also aligns logically:
+ * if a task's scheduling domain is restricted by user-space (through
+ * CPU affinity), the task will simply use the flat scheduling domain
+ * defined by user-space.
+ */
+ if (p->nr_cpus_allowed >= num_possible_cpus()) {
+ if (static_branch_maybe(CONFIG_NUMA, &scx_selcpu_topo_numa))
+ numa_cpus = cpumask_of_node(cpu_to_node(prev_cpu));
+
+ if (static_branch_maybe(CONFIG_SCHED_MC, &scx_selcpu_topo_llc)) {
+ struct sched_domain *sd;
+
+ sd = rcu_dereference(per_cpu(sd_llc, prev_cpu));
+ if (sd)
+ llc_cpus = sched_domain_span(sd);
+ }
+ }
+
/*
- * If WAKE_SYNC, the waker's local DSQ is empty, and the system is
- * under utilized, wake up @p to the local DSQ of the waker. Checking
- * only for an empty local DSQ is insufficient as it could give the
- * wakee an unfair advantage when the system is oversaturated.
- * Checking only for the presence of idle CPUs is also insufficient as
- * the local DSQ of the waker could have tasks piled up on it even if
- * there is an idle core elsewhere on the system.
- */
- cpu = smp_processor_id();
- if ((wake_flags & SCX_WAKE_SYNC) &&
- !cpumask_empty(idle_masks.cpu) && !(current->flags & PF_EXITING) &&
- cpu_rq(cpu)->scx.local_dsq.nr == 0) {
- if (cpumask_test_cpu(cpu, p->cpus_ptr))
+ * If WAKE_SYNC, try to migrate the wakee to the waker's CPU.
+ */
+ if (wake_flags & SCX_WAKE_SYNC) {
+ cpu = smp_processor_id();
+
+ /*
+ * If the waker's CPU is cache affine and prev_cpu is idle,
+ * then avoid a migration.
+ */
+ if (cpus_share_cache(cpu, prev_cpu) &&
+ test_and_clear_cpu_idle(prev_cpu)) {
+ cpu = prev_cpu;
goto cpu_found;
+ }
+
+ /*
+ * If the waker's local DSQ is empty, and the system is under
+ * utilized, try to wake up @p to the local DSQ of the waker.
+ *
+ * Checking only for an empty local DSQ is insufficient as it
+ * could give the wakee an unfair advantage when the system is
+ * oversaturated.
+ *
+ * Checking only for the presence of idle CPUs is also
+ * insufficient as the local DSQ of the waker could have tasks
+ * piled up on it even if there is an idle core elsewhere on
+ * the system.
+ */
+ if (!cpumask_empty(idle_masks.cpu) &&
+ !(current->flags & PF_EXITING) &&
+ cpu_rq(cpu)->scx.local_dsq.nr == 0) {
+ if (cpumask_test_cpu(cpu, p->cpus_ptr))
+ goto cpu_found;
+ }
}
/*
@@ -3127,29 +3432,80 @@ static s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu,
* partially idle @prev_cpu.
*/
if (sched_smt_active()) {
+ /*
+ * Keep using @prev_cpu if it's part of a fully idle core.
+ */
if (cpumask_test_cpu(prev_cpu, idle_masks.smt) &&
test_and_clear_cpu_idle(prev_cpu)) {
cpu = prev_cpu;
goto cpu_found;
}
+ /*
+ * Search for any fully idle core in the same LLC domain.
+ */
+ if (llc_cpus) {
+ cpu = scx_pick_idle_cpu(llc_cpus, SCX_PICK_IDLE_CORE);
+ if (cpu >= 0)
+ goto cpu_found;
+ }
+
+ /*
+ * Search for any fully idle core in the same NUMA node.
+ */
+ if (numa_cpus) {
+ cpu = scx_pick_idle_cpu(numa_cpus, SCX_PICK_IDLE_CORE);
+ if (cpu >= 0)
+ goto cpu_found;
+ }
+
+ /*
+ * Search for any full idle core usable by the task.
+ */
cpu = scx_pick_idle_cpu(p->cpus_ptr, SCX_PICK_IDLE_CORE);
if (cpu >= 0)
goto cpu_found;
}
+ /*
+ * Use @prev_cpu if it's idle.
+ */
if (test_and_clear_cpu_idle(prev_cpu)) {
cpu = prev_cpu;
goto cpu_found;
}
+ /*
+ * Search for any idle CPU in the same LLC domain.
+ */
+ if (llc_cpus) {
+ cpu = scx_pick_idle_cpu(llc_cpus, 0);
+ if (cpu >= 0)
+ goto cpu_found;
+ }
+
+ /*
+ * Search for any idle CPU in the same NUMA node.
+ */
+ if (numa_cpus) {
+ cpu = scx_pick_idle_cpu(numa_cpus, 0);
+ if (cpu >= 0)
+ goto cpu_found;
+ }
+
+ /*
+ * Search for any idle CPU usable by the task.
+ */
cpu = scx_pick_idle_cpu(p->cpus_ptr, 0);
if (cpu >= 0)
goto cpu_found;
+ rcu_read_unlock();
return prev_cpu;
cpu_found:
+ rcu_read_unlock();
+
*found = true;
return cpu;
}
@@ -3272,6 +3628,9 @@ static void handle_hotplug(struct rq *rq, bool online)
atomic_long_inc(&scx_hotplug_seq);
+ if (scx_enabled())
+ update_selcpu_topology();
+
if (online && SCX_HAS_OP(cpu_online))
SCX_CALL_OP(SCX_KF_UNLOCKED, cpu_online, cpu);
else if (!online && SCX_HAS_OP(cpu_offline))
@@ -3567,12 +3926,7 @@ static void scx_ops_exit_task(struct task_struct *p)
void init_scx_entity(struct sched_ext_entity *scx)
{
- /*
- * init_idle() calls this function again after fork sequence is
- * complete. Don't touch ->tasks_node as it's already linked.
- */
- memset(scx, 0, offsetof(struct sched_ext_entity, tasks_node));
-
+ memset(scx, 0, sizeof(*scx));
INIT_LIST_HEAD(&scx->dsq_list.node);
RB_CLEAR_NODE(&scx->dsq_priq);
scx->sticky_cpu = -1;
@@ -4286,6 +4640,49 @@ bool task_should_scx(int policy)
}
/**
+ * scx_softlockup - sched_ext softlockup handler
+ *
+ * On some multi-socket setups (e.g. 2x Intel 8480c), the BPF scheduler can
+ * live-lock the system by making many CPUs target the same DSQ to the point
+ * where soft-lockup detection triggers. This function is called from
+ * soft-lockup watchdog when the triggering point is close and tries to unjam
+ * the system by enabling the breather and aborting the BPF scheduler.
+ */
+void scx_softlockup(u32 dur_s)
+{
+ switch (scx_ops_enable_state()) {
+ case SCX_OPS_ENABLING:
+ case SCX_OPS_ENABLED:
+ break;
+ default:
+ return;
+ }
+
+ /* allow only one instance, cleared at the end of scx_ops_bypass() */
+ if (test_and_set_bit(0, &scx_in_softlockup))
+ return;
+
+ printk_deferred(KERN_ERR "sched_ext: Soft lockup - CPU%d stuck for %us, disabling \"%s\"\n",
+ smp_processor_id(), dur_s, scx_ops.name);
+
+ /*
+ * Some CPUs may be trapped in the dispatch paths. Enable breather
+ * immediately; otherwise, we might even be able to get to
+ * scx_ops_bypass().
+ */
+ atomic_inc(&scx_ops_breather_depth);
+
+ scx_ops_error("soft lockup - CPU#%d stuck for %us",
+ smp_processor_id(), dur_s);
+}
+
+static void scx_clear_softlockup(void)
+{
+ if (test_and_clear_bit(0, &scx_in_softlockup))
+ atomic_dec(&scx_ops_breather_depth);
+}
+
+/**
* scx_ops_bypass - [Un]bypass scx_ops and guarantee forward progress
*
* Bypassing guarantees that all runnable tasks make forward progress without
@@ -4317,10 +4714,11 @@ bool task_should_scx(int policy)
*/
static void scx_ops_bypass(bool bypass)
{
+ static DEFINE_RAW_SPINLOCK(bypass_lock);
int cpu;
unsigned long flags;
- raw_spin_lock_irqsave(&__scx_ops_bypass_lock, flags);
+ raw_spin_lock_irqsave(&bypass_lock, flags);
if (bypass) {
scx_ops_bypass_depth++;
WARN_ON_ONCE(scx_ops_bypass_depth <= 0);
@@ -4333,6 +4731,8 @@ static void scx_ops_bypass(bool bypass)
goto unlock;
}
+ atomic_inc(&scx_ops_breather_depth);
+
/*
* No task property is changing. We just need to make sure all currently
* queued tasks are re-queued according to the new scx_rq_bypassing()
@@ -4388,8 +4788,11 @@ static void scx_ops_bypass(bool bypass)
/* resched to restore ticks and idle state */
resched_cpu(cpu);
}
+
+ atomic_dec(&scx_ops_breather_depth);
unlock:
- raw_spin_unlock_irqrestore(&__scx_ops_bypass_lock, flags);
+ raw_spin_unlock_irqrestore(&bypass_lock, flags);
+ scx_clear_softlockup();
}
static void free_exit_info(struct scx_exit_info *ei)
@@ -5100,6 +5503,9 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
static_branch_enable_cpuslocked(&scx_has_op[i]);
check_hotplug_seq(ops);
+#ifdef CONFIG_SMP
+ update_selcpu_topology();
+#endif
cpus_read_unlock();
ret = validate_ops(ops);
@@ -5307,67 +5713,7 @@ err_disable:
#include <linux/bpf.h>
#include <linux/btf.h>
-extern struct btf *btf_vmlinux;
static const struct btf_type *task_struct_type;
-static u32 task_struct_type_id;
-
-static bool set_arg_maybe_null(const char *op, int arg_n, int off, int size,
- enum bpf_access_type type,
- const struct bpf_prog *prog,
- struct bpf_insn_access_aux *info)
-{
- struct btf *btf = bpf_get_btf_vmlinux();
- const struct bpf_struct_ops_desc *st_ops_desc;
- const struct btf_member *member;
- const struct btf_type *t;
- u32 btf_id, member_idx;
- const char *mname;
-
- /* struct_ops op args are all sequential, 64-bit numbers */
- if (off != arg_n * sizeof(__u64))
- return false;
-
- /* btf_id should be the type id of struct sched_ext_ops */
- btf_id = prog->aux->attach_btf_id;
- st_ops_desc = bpf_struct_ops_find(btf, btf_id);
- if (!st_ops_desc)
- return false;
-
- /* BTF type of struct sched_ext_ops */
- t = st_ops_desc->type;
-
- member_idx = prog->expected_attach_type;
- if (member_idx >= btf_type_vlen(t))
- return false;
-
- /*
- * Get the member name of this struct_ops program, which corresponds to
- * a field in struct sched_ext_ops. For example, the member name of the
- * dispatch struct_ops program (callback) is "dispatch".
- */
- member = &btf_type_member(t)[member_idx];
- mname = btf_name_by_offset(btf_vmlinux, member->name_off);
-
- if (!strcmp(mname, op)) {
- /*
- * The value is a pointer to a type (struct task_struct) given
- * by a BTF ID (PTR_TO_BTF_ID). It is trusted (PTR_TRUSTED),
- * however, can be a NULL (PTR_MAYBE_NULL). The BPF program
- * should check the pointer to make sure it is not NULL before
- * using it, or the verifier will reject the program.
- *
- * Longer term, this is something that should be addressed by
- * BTF, and be fully contained within the verifier.
- */
- info->reg_type = PTR_MAYBE_NULL | PTR_TO_BTF_ID | PTR_TRUSTED;
- info->btf = btf_vmlinux;
- info->btf_id = task_struct_type_id;
-
- return true;
- }
-
- return false;
-}
static bool bpf_scx_is_valid_access(int off, int size,
enum bpf_access_type type,
@@ -5376,9 +5722,6 @@ static bool bpf_scx_is_valid_access(int off, int size,
{
if (type != BPF_READ)
return false;
- if (set_arg_maybe_null("dispatch", 1, off, size, type, prog, info) ||
- set_arg_maybe_null("yield", 1, off, size, type, prog, info))
- return true;
if (off < 0 || off >= sizeof(__u64) * MAX_BPF_FUNC_ARGS)
return false;
if (off % size != 0)
@@ -5513,13 +5856,7 @@ static void bpf_scx_unreg(void *kdata, struct bpf_link *link)
static int bpf_scx_init(struct btf *btf)
{
- s32 type_id;
-
- type_id = btf_find_by_name_kind(btf, "task_struct", BTF_KIND_STRUCT);
- if (type_id < 0)
- return -EINVAL;
- task_struct_type = btf_type_by_id(btf, type_id);
- task_struct_type_id = type_id;
+ task_struct_type = btf_type_by_id(btf, btf_tracing_ids[BTF_TRACING_TYPE_TASK]);
return 0;
}
@@ -5541,78 +5878,78 @@ static int bpf_scx_validate(void *kdata)
return 0;
}
-static s32 select_cpu_stub(struct task_struct *p, s32 prev_cpu, u64 wake_flags) { return -EINVAL; }
-static void enqueue_stub(struct task_struct *p, u64 enq_flags) {}
-static void dequeue_stub(struct task_struct *p, u64 enq_flags) {}
-static void dispatch_stub(s32 prev_cpu, struct task_struct *p) {}
-static void tick_stub(struct task_struct *p) {}
-static void runnable_stub(struct task_struct *p, u64 enq_flags) {}
-static void running_stub(struct task_struct *p) {}
-static void stopping_stub(struct task_struct *p, bool runnable) {}
-static void quiescent_stub(struct task_struct *p, u64 deq_flags) {}
-static bool yield_stub(struct task_struct *from, struct task_struct *to) { return false; }
-static bool core_sched_before_stub(struct task_struct *a, struct task_struct *b) { return false; }
-static void set_weight_stub(struct task_struct *p, u32 weight) {}
-static void set_cpumask_stub(struct task_struct *p, const struct cpumask *mask) {}
-static void update_idle_stub(s32 cpu, bool idle) {}
-static void cpu_acquire_stub(s32 cpu, struct scx_cpu_acquire_args *args) {}
-static void cpu_release_stub(s32 cpu, struct scx_cpu_release_args *args) {}
-static s32 init_task_stub(struct task_struct *p, struct scx_init_task_args *args) { return -EINVAL; }
-static void exit_task_stub(struct task_struct *p, struct scx_exit_task_args *args) {}
-static void enable_stub(struct task_struct *p) {}
-static void disable_stub(struct task_struct *p) {}
+static s32 sched_ext_ops__select_cpu(struct task_struct *p, s32 prev_cpu, u64 wake_flags) { return -EINVAL; }
+static void sched_ext_ops__enqueue(struct task_struct *p, u64 enq_flags) {}
+static void sched_ext_ops__dequeue(struct task_struct *p, u64 enq_flags) {}
+static void sched_ext_ops__dispatch(s32 prev_cpu, struct task_struct *prev__nullable) {}
+static void sched_ext_ops__tick(struct task_struct *p) {}
+static void sched_ext_ops__runnable(struct task_struct *p, u64 enq_flags) {}
+static void sched_ext_ops__running(struct task_struct *p) {}
+static void sched_ext_ops__stopping(struct task_struct *p, bool runnable) {}
+static void sched_ext_ops__quiescent(struct task_struct *p, u64 deq_flags) {}
+static bool sched_ext_ops__yield(struct task_struct *from, struct task_struct *to__nullable) { return false; }
+static bool sched_ext_ops__core_sched_before(struct task_struct *a, struct task_struct *b) { return false; }
+static void sched_ext_ops__set_weight(struct task_struct *p, u32 weight) {}
+static void sched_ext_ops__set_cpumask(struct task_struct *p, const struct cpumask *mask) {}
+static void sched_ext_ops__update_idle(s32 cpu, bool idle) {}
+static void sched_ext_ops__cpu_acquire(s32 cpu, struct scx_cpu_acquire_args *args) {}
+static void sched_ext_ops__cpu_release(s32 cpu, struct scx_cpu_release_args *args) {}
+static s32 sched_ext_ops__init_task(struct task_struct *p, struct scx_init_task_args *args) { return -EINVAL; }
+static void sched_ext_ops__exit_task(struct task_struct *p, struct scx_exit_task_args *args) {}
+static void sched_ext_ops__enable(struct task_struct *p) {}
+static void sched_ext_ops__disable(struct task_struct *p) {}
#ifdef CONFIG_EXT_GROUP_SCHED
-static s32 cgroup_init_stub(struct cgroup *cgrp, struct scx_cgroup_init_args *args) { return -EINVAL; }
-static void cgroup_exit_stub(struct cgroup *cgrp) {}
-static s32 cgroup_prep_move_stub(struct task_struct *p, struct cgroup *from, struct cgroup *to) { return -EINVAL; }
-static void cgroup_move_stub(struct task_struct *p, struct cgroup *from, struct cgroup *to) {}
-static void cgroup_cancel_move_stub(struct task_struct *p, struct cgroup *from, struct cgroup *to) {}
-static void cgroup_set_weight_stub(struct cgroup *cgrp, u32 weight) {}
+static s32 sched_ext_ops__cgroup_init(struct cgroup *cgrp, struct scx_cgroup_init_args *args) { return -EINVAL; }
+static void sched_ext_ops__cgroup_exit(struct cgroup *cgrp) {}
+static s32 sched_ext_ops__cgroup_prep_move(struct task_struct *p, struct cgroup *from, struct cgroup *to) { return -EINVAL; }
+static void sched_ext_ops__cgroup_move(struct task_struct *p, struct cgroup *from, struct cgroup *to) {}
+static void sched_ext_ops__cgroup_cancel_move(struct task_struct *p, struct cgroup *from, struct cgroup *to) {}
+static void sched_ext_ops__cgroup_set_weight(struct cgroup *cgrp, u32 weight) {}
#endif
-static void cpu_online_stub(s32 cpu) {}
-static void cpu_offline_stub(s32 cpu) {}
-static s32 init_stub(void) { return -EINVAL; }
-static void exit_stub(struct scx_exit_info *info) {}
-static void dump_stub(struct scx_dump_ctx *ctx) {}
-static void dump_cpu_stub(struct scx_dump_ctx *ctx, s32 cpu, bool idle) {}
-static void dump_task_stub(struct scx_dump_ctx *ctx, struct task_struct *p) {}
+static void sched_ext_ops__cpu_online(s32 cpu) {}
+static void sched_ext_ops__cpu_offline(s32 cpu) {}
+static s32 sched_ext_ops__init(void) { return -EINVAL; }
+static void sched_ext_ops__exit(struct scx_exit_info *info) {}
+static void sched_ext_ops__dump(struct scx_dump_ctx *ctx) {}
+static void sched_ext_ops__dump_cpu(struct scx_dump_ctx *ctx, s32 cpu, bool idle) {}
+static void sched_ext_ops__dump_task(struct scx_dump_ctx *ctx, struct task_struct *p) {}
static struct sched_ext_ops __bpf_ops_sched_ext_ops = {
- .select_cpu = select_cpu_stub,
- .enqueue = enqueue_stub,
- .dequeue = dequeue_stub,
- .dispatch = dispatch_stub,
- .tick = tick_stub,
- .runnable = runnable_stub,
- .running = running_stub,
- .stopping = stopping_stub,
- .quiescent = quiescent_stub,
- .yield = yield_stub,
- .core_sched_before = core_sched_before_stub,
- .set_weight = set_weight_stub,
- .set_cpumask = set_cpumask_stub,
- .update_idle = update_idle_stub,
- .cpu_acquire = cpu_acquire_stub,
- .cpu_release = cpu_release_stub,
- .init_task = init_task_stub,
- .exit_task = exit_task_stub,
- .enable = enable_stub,
- .disable = disable_stub,
+ .select_cpu = sched_ext_ops__select_cpu,
+ .enqueue = sched_ext_ops__enqueue,
+ .dequeue = sched_ext_ops__dequeue,
+ .dispatch = sched_ext_ops__dispatch,
+ .tick = sched_ext_ops__tick,
+ .runnable = sched_ext_ops__runnable,
+ .running = sched_ext_ops__running,
+ .stopping = sched_ext_ops__stopping,
+ .quiescent = sched_ext_ops__quiescent,
+ .yield = sched_ext_ops__yield,
+ .core_sched_before = sched_ext_ops__core_sched_before,
+ .set_weight = sched_ext_ops__set_weight,
+ .set_cpumask = sched_ext_ops__set_cpumask,
+ .update_idle = sched_ext_ops__update_idle,
+ .cpu_acquire = sched_ext_ops__cpu_acquire,
+ .cpu_release = sched_ext_ops__cpu_release,
+ .init_task = sched_ext_ops__init_task,
+ .exit_task = sched_ext_ops__exit_task,
+ .enable = sched_ext_ops__enable,
+ .disable = sched_ext_ops__disable,
#ifdef CONFIG_EXT_GROUP_SCHED
- .cgroup_init = cgroup_init_stub,
- .cgroup_exit = cgroup_exit_stub,
- .cgroup_prep_move = cgroup_prep_move_stub,
- .cgroup_move = cgroup_move_stub,
- .cgroup_cancel_move = cgroup_cancel_move_stub,
- .cgroup_set_weight = cgroup_set_weight_stub,
+ .cgroup_init = sched_ext_ops__cgroup_init,
+ .cgroup_exit = sched_ext_ops__cgroup_exit,
+ .cgroup_prep_move = sched_ext_ops__cgroup_prep_move,
+ .cgroup_move = sched_ext_ops__cgroup_move,
+ .cgroup_cancel_move = sched_ext_ops__cgroup_cancel_move,
+ .cgroup_set_weight = sched_ext_ops__cgroup_set_weight,
#endif
- .cpu_online = cpu_online_stub,
- .cpu_offline = cpu_offline_stub,
- .init = init_stub,
- .exit = exit_stub,
- .dump = dump_stub,
- .dump_cpu = dump_cpu_stub,
- .dump_task = dump_task_stub,
+ .cpu_online = sched_ext_ops__cpu_online,
+ .cpu_offline = sched_ext_ops__cpu_offline,
+ .init = sched_ext_ops__init,
+ .exit = sched_ext_ops__exit,
+ .dump = sched_ext_ops__dump,
+ .dump_cpu = sched_ext_ops__dump_cpu,
+ .dump_task = sched_ext_ops__dump_task,
};
static struct bpf_struct_ops bpf_sched_ext_ops = {
@@ -5759,7 +6096,7 @@ static void kick_cpus_irq_workfn(struct irq_work *irq_work)
if (cpu != cpu_of(this_rq)) {
/*
* Pairs with smp_store_release() issued by this CPU in
- * scx_next_task_picked() on the resched path.
+ * switch_class() on the resched path.
*
* We busy-wait here to guarantee that no other task can
* be scheduled on our core before the target CPU has
@@ -5944,7 +6281,7 @@ static const struct btf_kfunc_id_set scx_kfunc_set_select_cpu = {
.set = &scx_kfunc_ids_select_cpu,
};
-static bool scx_dispatch_preamble(struct task_struct *p, u64 enq_flags)
+static bool scx_dsq_insert_preamble(struct task_struct *p, u64 enq_flags)
{
if (!scx_kf_allowed(SCX_KF_ENQUEUE | SCX_KF_DISPATCH))
return false;
@@ -5964,7 +6301,8 @@ static bool scx_dispatch_preamble(struct task_struct *p, u64 enq_flags)
return true;
}
-static void scx_dispatch_commit(struct task_struct *p, u64 dsq_id, u64 enq_flags)
+static void scx_dsq_insert_commit(struct task_struct *p, u64 dsq_id,
+ u64 enq_flags)
{
struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx);
struct task_struct *ddsp_task;
@@ -5991,14 +6329,14 @@ static void scx_dispatch_commit(struct task_struct *p, u64 dsq_id, u64 enq_flags
__bpf_kfunc_start_defs();
/**
- * scx_bpf_dispatch - Dispatch a task into the FIFO queue of a DSQ
- * @p: task_struct to dispatch
- * @dsq_id: DSQ to dispatch to
+ * scx_bpf_dsq_insert - Insert a task into the FIFO queue of a DSQ
+ * @p: task_struct to insert
+ * @dsq_id: DSQ to insert into
* @slice: duration @p can run for in nsecs, 0 to keep the current value
* @enq_flags: SCX_ENQ_*
*
- * Dispatch @p into the FIFO queue of the DSQ identified by @dsq_id. It is safe
- * to call this function spuriously. Can be called from ops.enqueue(),
+ * Insert @p into the FIFO queue of the DSQ identified by @dsq_id. It is safe to
+ * call this function spuriously. Can be called from ops.enqueue(),
* ops.select_cpu(), and ops.dispatch().
*
* When called from ops.select_cpu() or ops.enqueue(), it's for direct dispatch
@@ -6007,14 +6345,14 @@ __bpf_kfunc_start_defs();
* ops.select_cpu() to be on the target CPU in the first place.
*
* When called from ops.select_cpu(), @enq_flags and @dsp_id are stored, and @p
- * will be directly dispatched to the corresponding dispatch queue after
- * ops.select_cpu() returns. If @p is dispatched to SCX_DSQ_LOCAL, it will be
- * dispatched to the local DSQ of the CPU returned by ops.select_cpu().
+ * will be directly inserted into the corresponding dispatch queue after
+ * ops.select_cpu() returns. If @p is inserted into SCX_DSQ_LOCAL, it will be
+ * inserted into the local DSQ of the CPU returned by ops.select_cpu().
* @enq_flags are OR'd with the enqueue flags on the enqueue path before the
- * task is dispatched.
+ * task is inserted.
*
* When called from ops.dispatch(), there are no restrictions on @p or @dsq_id
- * and this function can be called upto ops.dispatch_max_batch times to dispatch
+ * and this function can be called upto ops.dispatch_max_batch times to insert
* multiple tasks. scx_bpf_dispatch_nr_slots() returns the number of the
* remaining slots. scx_bpf_consume() flushes the batch and resets the counter.
*
@@ -6026,10 +6364,10 @@ __bpf_kfunc_start_defs();
* %SCX_SLICE_INF, @p never expires and the BPF scheduler must kick the CPU with
* scx_bpf_kick_cpu() to trigger scheduling.
*/
-__bpf_kfunc void scx_bpf_dispatch(struct task_struct *p, u64 dsq_id, u64 slice,
- u64 enq_flags)
+__bpf_kfunc void scx_bpf_dsq_insert(struct task_struct *p, u64 dsq_id, u64 slice,
+ u64 enq_flags)
{
- if (!scx_dispatch_preamble(p, enq_flags))
+ if (!scx_dsq_insert_preamble(p, enq_flags))
return;
if (slice)
@@ -6037,30 +6375,42 @@ __bpf_kfunc void scx_bpf_dispatch(struct task_struct *p, u64 dsq_id, u64 slice,
else
p->scx.slice = p->scx.slice ?: 1;
- scx_dispatch_commit(p, dsq_id, enq_flags);
+ scx_dsq_insert_commit(p, dsq_id, enq_flags);
+}
+
+/* for backward compatibility, will be removed in v6.15 */
+__bpf_kfunc void scx_bpf_dispatch(struct task_struct *p, u64 dsq_id, u64 slice,
+ u64 enq_flags)
+{
+ printk_deferred_once(KERN_WARNING "sched_ext: scx_bpf_dispatch() renamed to scx_bpf_dsq_insert()");
+ scx_bpf_dsq_insert(p, dsq_id, slice, enq_flags);
}
/**
- * scx_bpf_dispatch_vtime - Dispatch a task into the vtime priority queue of a DSQ
- * @p: task_struct to dispatch
- * @dsq_id: DSQ to dispatch to
+ * scx_bpf_dsq_insert_vtime - Insert a task into the vtime priority queue of a DSQ
+ * @p: task_struct to insert
+ * @dsq_id: DSQ to insert into
* @slice: duration @p can run for in nsecs, 0 to keep the current value
* @vtime: @p's ordering inside the vtime-sorted queue of the target DSQ
* @enq_flags: SCX_ENQ_*
*
- * Dispatch @p into the vtime priority queue of the DSQ identified by @dsq_id.
- * Tasks queued into the priority queue are ordered by @vtime and always
- * consumed after the tasks in the FIFO queue. All other aspects are identical
- * to scx_bpf_dispatch().
+ * Insert @p into the vtime priority queue of the DSQ identified by @dsq_id.
+ * Tasks queued into the priority queue are ordered by @vtime. All other aspects
+ * are identical to scx_bpf_dsq_insert().
*
* @vtime ordering is according to time_before64() which considers wrapping. A
* numerically larger vtime may indicate an earlier position in the ordering and
* vice-versa.
+ *
+ * A DSQ can only be used as a FIFO or priority queue at any given time and this
+ * function must not be called on a DSQ which already has one or more FIFO tasks
+ * queued and vice-versa. Also, the built-in DSQs (SCX_DSQ_LOCAL and
+ * SCX_DSQ_GLOBAL) cannot be used as priority queues.
*/
-__bpf_kfunc void scx_bpf_dispatch_vtime(struct task_struct *p, u64 dsq_id,
- u64 slice, u64 vtime, u64 enq_flags)
+__bpf_kfunc void scx_bpf_dsq_insert_vtime(struct task_struct *p, u64 dsq_id,
+ u64 slice, u64 vtime, u64 enq_flags)
{
- if (!scx_dispatch_preamble(p, enq_flags))
+ if (!scx_dsq_insert_preamble(p, enq_flags))
return;
if (slice)
@@ -6070,12 +6420,22 @@ __bpf_kfunc void scx_bpf_dispatch_vtime(struct task_struct *p, u64 dsq_id,
p->scx.dsq_vtime = vtime;
- scx_dispatch_commit(p, dsq_id, enq_flags | SCX_ENQ_DSQ_PRIQ);
+ scx_dsq_insert_commit(p, dsq_id, enq_flags | SCX_ENQ_DSQ_PRIQ);
+}
+
+/* for backward compatibility, will be removed in v6.15 */
+__bpf_kfunc void scx_bpf_dispatch_vtime(struct task_struct *p, u64 dsq_id,
+ u64 slice, u64 vtime, u64 enq_flags)
+{
+ printk_deferred_once(KERN_WARNING "sched_ext: scx_bpf_dispatch_vtime() renamed to scx_bpf_dsq_insert_vtime()");
+ scx_bpf_dsq_insert_vtime(p, dsq_id, slice, vtime, enq_flags);
}
__bpf_kfunc_end_defs();
BTF_KFUNCS_START(scx_kfunc_ids_enqueue_dispatch)
+BTF_ID_FLAGS(func, scx_bpf_dsq_insert, KF_RCU)
+BTF_ID_FLAGS(func, scx_bpf_dsq_insert_vtime, KF_RCU)
BTF_ID_FLAGS(func, scx_bpf_dispatch, KF_RCU)
BTF_ID_FLAGS(func, scx_bpf_dispatch_vtime, KF_RCU)
BTF_KFUNCS_END(scx_kfunc_ids_enqueue_dispatch)
@@ -6085,12 +6445,11 @@ static const struct btf_kfunc_id_set scx_kfunc_set_enqueue_dispatch = {
.set = &scx_kfunc_ids_enqueue_dispatch,
};
-static bool scx_dispatch_from_dsq(struct bpf_iter_scx_dsq_kern *kit,
- struct task_struct *p, u64 dsq_id,
- u64 enq_flags)
+static bool scx_dsq_move(struct bpf_iter_scx_dsq_kern *kit,
+ struct task_struct *p, u64 dsq_id, u64 enq_flags)
{
struct scx_dispatch_q *src_dsq = kit->dsq, *dst_dsq;
- struct rq *this_rq, *src_rq, *dst_rq, *locked_rq;
+ struct rq *this_rq, *src_rq, *locked_rq;
bool dispatched = false;
bool in_balance;
unsigned long flags;
@@ -6118,6 +6477,13 @@ static bool scx_dispatch_from_dsq(struct bpf_iter_scx_dsq_kern *kit,
raw_spin_rq_lock(src_rq);
}
+ /*
+ * If the BPF scheduler keeps calling this function repeatedly, it can
+ * cause similar live-lock conditions as consume_dispatch_q(). Insert a
+ * breather if necessary.
+ */
+ scx_ops_breather(src_rq);
+
locked_rq = src_rq;
raw_spin_lock(&src_dsq->lock);
@@ -6136,51 +6502,18 @@ static bool scx_dispatch_from_dsq(struct bpf_iter_scx_dsq_kern *kit,
/* @p is still on $src_dsq and stable, determine the destination */
dst_dsq = find_dsq_for_dispatch(this_rq, dsq_id, p);
- if (dst_dsq->id == SCX_DSQ_LOCAL) {
- dst_rq = container_of(dst_dsq, struct rq, scx.local_dsq);
- if (!task_can_run_on_remote_rq(p, dst_rq, true)) {
- dst_dsq = find_global_dsq(p);
- dst_rq = src_rq;
- }
- } else {
- /* no need to migrate if destination is a non-local DSQ */
- dst_rq = src_rq;
- }
-
/*
- * Move @p into $dst_dsq. If $dst_dsq is the local DSQ of a different
- * CPU, @p will be migrated.
+ * Apply vtime and slice updates before moving so that the new time is
+ * visible before inserting into $dst_dsq. @p is still on $src_dsq but
+ * this is safe as we're locking it.
*/
- if (dst_dsq->id == SCX_DSQ_LOCAL) {
- /* @p is going from a non-local DSQ to a local DSQ */
- if (src_rq == dst_rq) {
- task_unlink_from_dsq(p, src_dsq);
- move_local_task_to_local_dsq(p, enq_flags,
- src_dsq, dst_rq);
- raw_spin_unlock(&src_dsq->lock);
- } else {
- raw_spin_unlock(&src_dsq->lock);
- move_remote_task_to_local_dsq(p, enq_flags,
- src_rq, dst_rq);
- locked_rq = dst_rq;
- }
- } else {
- /*
- * @p is going from a non-local DSQ to a non-local DSQ. As
- * $src_dsq is already locked, do an abbreviated dequeue.
- */
- task_unlink_from_dsq(p, src_dsq);
- p->scx.dsq = NULL;
- raw_spin_unlock(&src_dsq->lock);
-
- if (kit->cursor.flags & __SCX_DSQ_ITER_HAS_VTIME)
- p->scx.dsq_vtime = kit->vtime;
- dispatch_enqueue(dst_dsq, p, enq_flags);
- }
-
+ if (kit->cursor.flags & __SCX_DSQ_ITER_HAS_VTIME)
+ p->scx.dsq_vtime = kit->vtime;
if (kit->cursor.flags & __SCX_DSQ_ITER_HAS_SLICE)
p->scx.slice = kit->slice;
+ /* execute move */
+ locked_rq = move_task_between_dsqs(p, enq_flags, src_dsq, dst_dsq);
dispatched = true;
out:
if (in_balance) {
@@ -6232,21 +6565,20 @@ __bpf_kfunc void scx_bpf_dispatch_cancel(void)
}
/**
- * scx_bpf_consume - Transfer a task from a DSQ to the current CPU's local DSQ
- * @dsq_id: DSQ to consume
+ * scx_bpf_dsq_move_to_local - move a task from a DSQ to the current CPU's local DSQ
+ * @dsq_id: DSQ to move task from
*
- * Consume a task from the non-local DSQ identified by @dsq_id and transfer it
- * to the current CPU's local DSQ for execution. Can only be called from
- * ops.dispatch().
+ * Move a task from the non-local DSQ identified by @dsq_id to the current CPU's
+ * local DSQ for execution. Can only be called from ops.dispatch().
*
- * This function flushes the in-flight dispatches from scx_bpf_dispatch() before
- * trying to consume the specified DSQ. It may also grab rq locks and thus can't
- * be called under any BPF locks.
+ * This function flushes the in-flight dispatches from scx_bpf_dsq_insert()
+ * before trying to move from the specified DSQ. It may also grab rq locks and
+ * thus can't be called under any BPF locks.
*
- * Returns %true if a task has been consumed, %false if there isn't any task to
- * consume.
+ * Returns %true if a task has been moved, %false if there isn't any task to
+ * move.
*/
-__bpf_kfunc bool scx_bpf_consume(u64 dsq_id)
+__bpf_kfunc bool scx_bpf_dsq_move_to_local(u64 dsq_id)
{
struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx);
struct scx_dispatch_q *dsq;
@@ -6276,17 +6608,24 @@ __bpf_kfunc bool scx_bpf_consume(u64 dsq_id)
}
}
+/* for backward compatibility, will be removed in v6.15 */
+__bpf_kfunc bool scx_bpf_consume(u64 dsq_id)
+{
+ printk_deferred_once(KERN_WARNING "sched_ext: scx_bpf_consume() renamed to scx_bpf_dsq_move_to_local()");
+ return scx_bpf_dsq_move_to_local(dsq_id);
+}
+
/**
- * scx_bpf_dispatch_from_dsq_set_slice - Override slice when dispatching from DSQ
+ * scx_bpf_dsq_move_set_slice - Override slice when moving between DSQs
* @it__iter: DSQ iterator in progress
- * @slice: duration the dispatched task can run for in nsecs
+ * @slice: duration the moved task can run for in nsecs
*
- * Override the slice of the next task that will be dispatched from @it__iter
- * using scx_bpf_dispatch_from_dsq[_vtime](). If this function is not called,
- * the previous slice duration is kept.
+ * Override the slice of the next task that will be moved from @it__iter using
+ * scx_bpf_dsq_move[_vtime](). If this function is not called, the previous
+ * slice duration is kept.
*/
-__bpf_kfunc void scx_bpf_dispatch_from_dsq_set_slice(
- struct bpf_iter_scx_dsq *it__iter, u64 slice)
+__bpf_kfunc void scx_bpf_dsq_move_set_slice(struct bpf_iter_scx_dsq *it__iter,
+ u64 slice)
{
struct bpf_iter_scx_dsq_kern *kit = (void *)it__iter;
@@ -6294,18 +6633,26 @@ __bpf_kfunc void scx_bpf_dispatch_from_dsq_set_slice(
kit->cursor.flags |= __SCX_DSQ_ITER_HAS_SLICE;
}
+/* for backward compatibility, will be removed in v6.15 */
+__bpf_kfunc void scx_bpf_dispatch_from_dsq_set_slice(
+ struct bpf_iter_scx_dsq *it__iter, u64 slice)
+{
+ printk_deferred_once(KERN_WARNING "sched_ext: scx_bpf_dispatch_from_dsq_set_slice() renamed to scx_bpf_dsq_move_set_slice()");
+ scx_bpf_dsq_move_set_slice(it__iter, slice);
+}
+
/**
- * scx_bpf_dispatch_from_dsq_set_vtime - Override vtime when dispatching from DSQ
+ * scx_bpf_dsq_move_set_vtime - Override vtime when moving between DSQs
* @it__iter: DSQ iterator in progress
* @vtime: task's ordering inside the vtime-sorted queue of the target DSQ
*
- * Override the vtime of the next task that will be dispatched from @it__iter
- * using scx_bpf_dispatch_from_dsq_vtime(). If this function is not called, the
- * previous slice vtime is kept. If scx_bpf_dispatch_from_dsq() is used to
- * dispatch the next task, the override is ignored and cleared.
+ * Override the vtime of the next task that will be moved from @it__iter using
+ * scx_bpf_dsq_move_vtime(). If this function is not called, the previous slice
+ * vtime is kept. If scx_bpf_dsq_move() is used to dispatch the next task, the
+ * override is ignored and cleared.
*/
-__bpf_kfunc void scx_bpf_dispatch_from_dsq_set_vtime(
- struct bpf_iter_scx_dsq *it__iter, u64 vtime)
+__bpf_kfunc void scx_bpf_dsq_move_set_vtime(struct bpf_iter_scx_dsq *it__iter,
+ u64 vtime)
{
struct bpf_iter_scx_dsq_kern *kit = (void *)it__iter;
@@ -6313,8 +6660,16 @@ __bpf_kfunc void scx_bpf_dispatch_from_dsq_set_vtime(
kit->cursor.flags |= __SCX_DSQ_ITER_HAS_VTIME;
}
+/* for backward compatibility, will be removed in v6.15 */
+__bpf_kfunc void scx_bpf_dispatch_from_dsq_set_vtime(
+ struct bpf_iter_scx_dsq *it__iter, u64 vtime)
+{
+ printk_deferred_once(KERN_WARNING "sched_ext: scx_bpf_dispatch_from_dsq_set_vtime() renamed to scx_bpf_dsq_move_set_vtime()");
+ scx_bpf_dsq_move_set_vtime(it__iter, vtime);
+}
+
/**
- * scx_bpf_dispatch_from_dsq - Move a task from DSQ iteration to a DSQ
+ * scx_bpf_dsq_move - Move a task from DSQ iteration to a DSQ
* @it__iter: DSQ iterator in progress
* @p: task to transfer
* @dsq_id: DSQ to move @p to
@@ -6329,8 +6684,7 @@ __bpf_kfunc void scx_bpf_dispatch_from_dsq_set_vtime(
* @p was obtained from the DSQ iteration. @p just has to be on the DSQ and have
* been queued before the iteration started.
*
- * @p's slice is kept by default. Use scx_bpf_dispatch_from_dsq_set_slice() to
- * update.
+ * @p's slice is kept by default. Use scx_bpf_dsq_move_set_slice() to update.
*
* Can be called from ops.dispatch() or any BPF context which doesn't hold a rq
* lock (e.g. BPF timers or SYSCALL programs).
@@ -6338,16 +6692,25 @@ __bpf_kfunc void scx_bpf_dispatch_from_dsq_set_vtime(
* Returns %true if @p has been consumed, %false if @p had already been consumed
* or dequeued.
*/
+__bpf_kfunc bool scx_bpf_dsq_move(struct bpf_iter_scx_dsq *it__iter,
+ struct task_struct *p, u64 dsq_id,
+ u64 enq_flags)
+{
+ return scx_dsq_move((struct bpf_iter_scx_dsq_kern *)it__iter,
+ p, dsq_id, enq_flags);
+}
+
+/* for backward compatibility, will be removed in v6.15 */
__bpf_kfunc bool scx_bpf_dispatch_from_dsq(struct bpf_iter_scx_dsq *it__iter,
struct task_struct *p, u64 dsq_id,
u64 enq_flags)
{
- return scx_dispatch_from_dsq((struct bpf_iter_scx_dsq_kern *)it__iter,
- p, dsq_id, enq_flags);
+ printk_deferred_once(KERN_WARNING "sched_ext: scx_bpf_dispatch_from_dsq() renamed to scx_bpf_dsq_move()");
+ return scx_bpf_dsq_move(it__iter, p, dsq_id, enq_flags);
}
/**
- * scx_bpf_dispatch_vtime_from_dsq - Move a task from DSQ iteration to a PRIQ DSQ
+ * scx_bpf_dsq_move_vtime - Move a task from DSQ iteration to a PRIQ DSQ
* @it__iter: DSQ iterator in progress
* @p: task to transfer
* @dsq_id: DSQ to move @p to
@@ -6357,19 +6720,27 @@ __bpf_kfunc bool scx_bpf_dispatch_from_dsq(struct bpf_iter_scx_dsq *it__iter,
* priority queue of the DSQ specified by @dsq_id. The destination must be a
* user DSQ as only user DSQs support priority queue.
*
- * @p's slice and vtime are kept by default. Use
- * scx_bpf_dispatch_from_dsq_set_slice() and
- * scx_bpf_dispatch_from_dsq_set_vtime() to update.
+ * @p's slice and vtime are kept by default. Use scx_bpf_dsq_move_set_slice()
+ * and scx_bpf_dsq_move_set_vtime() to update.
*
- * All other aspects are identical to scx_bpf_dispatch_from_dsq(). See
- * scx_bpf_dispatch_vtime() for more information on @vtime.
+ * All other aspects are identical to scx_bpf_dsq_move(). See
+ * scx_bpf_dsq_insert_vtime() for more information on @vtime.
*/
+__bpf_kfunc bool scx_bpf_dsq_move_vtime(struct bpf_iter_scx_dsq *it__iter,
+ struct task_struct *p, u64 dsq_id,
+ u64 enq_flags)
+{
+ return scx_dsq_move((struct bpf_iter_scx_dsq_kern *)it__iter,
+ p, dsq_id, enq_flags | SCX_ENQ_DSQ_PRIQ);
+}
+
+/* for backward compatibility, will be removed in v6.15 */
__bpf_kfunc bool scx_bpf_dispatch_vtime_from_dsq(struct bpf_iter_scx_dsq *it__iter,
struct task_struct *p, u64 dsq_id,
u64 enq_flags)
{
- return scx_dispatch_from_dsq((struct bpf_iter_scx_dsq_kern *)it__iter,
- p, dsq_id, enq_flags | SCX_ENQ_DSQ_PRIQ);
+ printk_deferred_once(KERN_WARNING "sched_ext: scx_bpf_dispatch_from_dsq_vtime() renamed to scx_bpf_dsq_move_vtime()");
+ return scx_bpf_dsq_move_vtime(it__iter, p, dsq_id, enq_flags);
}
__bpf_kfunc_end_defs();
@@ -6377,7 +6748,12 @@ __bpf_kfunc_end_defs();
BTF_KFUNCS_START(scx_kfunc_ids_dispatch)
BTF_ID_FLAGS(func, scx_bpf_dispatch_nr_slots)
BTF_ID_FLAGS(func, scx_bpf_dispatch_cancel)
+BTF_ID_FLAGS(func, scx_bpf_dsq_move_to_local)
BTF_ID_FLAGS(func, scx_bpf_consume)
+BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_slice)
+BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_vtime)
+BTF_ID_FLAGS(func, scx_bpf_dsq_move, KF_RCU)
+BTF_ID_FLAGS(func, scx_bpf_dsq_move_vtime, KF_RCU)
BTF_ID_FLAGS(func, scx_bpf_dispatch_from_dsq_set_slice)
BTF_ID_FLAGS(func, scx_bpf_dispatch_from_dsq_set_vtime)
BTF_ID_FLAGS(func, scx_bpf_dispatch_from_dsq, KF_RCU)
@@ -6478,6 +6854,12 @@ __bpf_kfunc_end_defs();
BTF_KFUNCS_START(scx_kfunc_ids_unlocked)
BTF_ID_FLAGS(func, scx_bpf_create_dsq, KF_SLEEPABLE)
+BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_slice)
+BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_vtime)
+BTF_ID_FLAGS(func, scx_bpf_dsq_move, KF_RCU)
+BTF_ID_FLAGS(func, scx_bpf_dsq_move_vtime, KF_RCU)
+BTF_ID_FLAGS(func, scx_bpf_dispatch_from_dsq_set_slice)
+BTF_ID_FLAGS(func, scx_bpf_dispatch_from_dsq_set_vtime)
BTF_ID_FLAGS(func, scx_bpf_dispatch_from_dsq, KF_RCU)
BTF_ID_FLAGS(func, scx_bpf_dispatch_vtime_from_dsq, KF_RCU)
BTF_KFUNCS_END(scx_kfunc_ids_unlocked)
@@ -7153,15 +7535,8 @@ __bpf_kfunc struct cgroup *scx_bpf_task_cgroup(struct task_struct *p)
if (!scx_kf_allowed_on_arg_tasks(__SCX_KF_RQ_LOCKED, p))
goto out;
- /*
- * A task_group may either be a cgroup or an autogroup. In the latter
- * case, @tg->css.cgroup is %NULL. A task_group can't become the other
- * kind once created.
- */
- if (tg && tg->css.cgroup)
- cgrp = tg->css.cgroup;
- else
- cgrp = &cgrp_dfl_root.cgrp;
+ cgrp = tg_cgrp(tg);
+
out:
cgroup_get(cgrp);
return cgrp;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 2d16c8545c71..fbdca89c677f 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1200,12 +1200,12 @@ static inline bool do_preempt_short(struct cfs_rq *cfs_rq,
*/
s64 update_curr_common(struct rq *rq)
{
- struct task_struct *curr = rq->curr;
+ struct task_struct *donor = rq->donor;
s64 delta_exec;
- delta_exec = update_curr_se(rq, &curr->se);
+ delta_exec = update_curr_se(rq, &donor->se);
if (likely(delta_exec > 0))
- update_curr_task(curr, delta_exec);
+ update_curr_task(donor, delta_exec);
return delta_exec;
}
@@ -1251,14 +1251,14 @@ static void update_curr(struct cfs_rq *cfs_rq)
return;
if (resched || did_preempt_short(cfs_rq, curr)) {
- resched_curr(rq);
+ resched_curr_lazy(rq);
clear_buddies(cfs_rq, curr);
}
}
static void update_curr_fair(struct rq *rq)
{
- update_curr(cfs_rq_of(&rq->curr->se));
+ update_curr(cfs_rq_of(&rq->donor->se));
}
static inline void
@@ -5280,7 +5280,7 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
*
* EEVDF: placement strategy #1 / #2
*/
- if (sched_feat(PLACE_LAG) && cfs_rq->nr_running) {
+ if (sched_feat(PLACE_LAG) && cfs_rq->nr_running && se->vlag) {
struct sched_entity *curr = cfs_rq->curr;
unsigned long load;
@@ -5678,15 +5678,9 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
* validating it and just reschedule.
*/
if (queued) {
- resched_curr(rq_of(cfs_rq));
+ resched_curr_lazy(rq_of(cfs_rq));
return;
}
- /*
- * don't let the period tick interfere with the hrtick preemption
- */
- if (!sched_feat(DOUBLE_TICK) &&
- hrtimer_active(&rq_of(cfs_rq)->hrtick_timer))
- return;
#endif
}
@@ -6822,7 +6816,7 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
s64 delta = slice - ran;
if (delta < 0) {
- if (task_current(rq, p))
+ if (task_current_donor(rq, p))
resched_curr(rq);
return;
}
@@ -6837,12 +6831,12 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
*/
static void hrtick_update(struct rq *rq)
{
- struct task_struct *curr = rq->curr;
+ struct task_struct *donor = rq->donor;
- if (!hrtick_enabled_fair(rq) || curr->sched_class != &fair_sched_class)
+ if (!hrtick_enabled_fair(rq) || donor->sched_class != &fair_sched_class)
return;
- hrtick_start_fair(rq, curr);
+ hrtick_start_fair(rq, donor);
}
#else /* !CONFIG_SCHED_HRTICK */
static inline void
@@ -8763,9 +8757,9 @@ static void set_next_buddy(struct sched_entity *se)
*/
static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int wake_flags)
{
- struct task_struct *curr = rq->curr;
- struct sched_entity *se = &curr->se, *pse = &p->se;
- struct cfs_rq *cfs_rq = task_cfs_rq(curr);
+ struct task_struct *donor = rq->donor;
+ struct sched_entity *se = &donor->se, *pse = &p->se;
+ struct cfs_rq *cfs_rq = task_cfs_rq(donor);
int cse_is_idle, pse_is_idle;
if (unlikely(se == pse))
@@ -8794,7 +8788,7 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int
* prevents us from potentially nominating it as a false LAST_BUDDY
* below.
*/
- if (test_tsk_need_resched(curr))
+ if (test_tsk_need_resched(rq->curr))
return;
if (!sched_feat(WAKEUP_PREEMPTION))
@@ -8842,7 +8836,7 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int
return;
preempt:
- resched_curr(rq);
+ resched_curr_lazy(rq);
}
static struct task_struct *pick_task_fair(struct rq *rq)
@@ -13093,7 +13087,7 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
* our priority decreased, or if we are not currently running on
* this runqueue and our priority is higher than the current's
*/
- if (task_current(rq, p)) {
+ if (task_current_donor(rq, p)) {
if (p->prio > oldprio)
resched_curr(rq);
} else
@@ -13200,7 +13194,7 @@ static void switched_to_fair(struct rq *rq, struct task_struct *p)
* kick off the schedule if running, otherwise just see
* if we can still preempt the current task.
*/
- if (task_current(rq, p))
+ if (task_current_donor(rq, p))
resched_curr(rq);
else
wakeup_preempt(rq, p, 0);
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 290874079f60..a3d331dd2d8f 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -19,7 +19,7 @@ SCHED_FEAT(PLACE_REL_DEADLINE, true)
*/
SCHED_FEAT(RUN_TO_PARITY, true)
/*
- * Allow wakeup of tasks with a shorter slice to cancel RESPECT_SLICE for
+ * Allow wakeup of tasks with a shorter slice to cancel RUN_TO_PARITY for
* current.
*/
SCHED_FEAT(PREEMPT_SHORT, true)
@@ -56,7 +56,6 @@ SCHED_FEAT(WAKEUP_PREEMPTION, true)
SCHED_FEAT(HRTICK, false)
SCHED_FEAT(HRTICK_DL, false)
-SCHED_FEAT(DOUBLE_TICK, false)
/*
* Decrement CPU capacity based on time not spent running tasks
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index d2f096bb274c..621696269584 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -271,7 +271,6 @@ static void do_idle(void)
tick_nohz_idle_enter();
while (!need_resched()) {
- rmb();
/*
* Interrupts shouldn't be re-enabled from that point on until
@@ -399,8 +398,8 @@ void play_idle_precise(u64 duration_ns, u64 latency_ns)
cpuidle_use_deepest_state(latency_ns);
it.done = 0;
- hrtimer_init_on_stack(&it.timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
- it.timer.function = idle_inject_timer_fn;
+ hrtimer_setup_on_stack(&it.timer, idle_inject_timer_fn, CLOCK_MONOTONIC,
+ HRTIMER_MODE_REL_HARD);
hrtimer_start(&it.timer, ns_to_ktime(duration_ns),
HRTIMER_MODE_REL_PINNED_HARD);
diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c
index a9c65d97b3ca..fc07382361a8 100644
--- a/kernel/sched/pelt.c
+++ b/kernel/sched/pelt.c
@@ -476,7 +476,7 @@ int update_irq_load_avg(struct rq *rq, u64 running)
bool update_other_load_avgs(struct rq *rq)
{
u64 now = rq_clock_pelt(rq);
- const struct sched_class *curr_class = rq->curr->sched_class;
+ const struct sched_class *curr_class = rq->donor->sched_class;
unsigned long hw_pressure = arch_scale_hw_pressure(cpu_of(rq));
lockdep_assert_rq_held(rq);
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 172c588de542..bd66a46b06ac 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -528,7 +528,7 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
{
- struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr;
+ struct task_struct *donor = rq_of_rt_rq(rt_rq)->donor;
struct rq *rq = rq_of_rt_rq(rt_rq);
struct sched_rt_entity *rt_se;
@@ -542,7 +542,7 @@ static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
else if (!on_rt_rq(rt_se))
enqueue_rt_entity(rt_se, 0);
- if (rt_rq->highest_prio.curr < curr->prio)
+ if (rt_rq->highest_prio.curr < donor->prio)
resched_curr(rq);
}
}
@@ -988,10 +988,10 @@ static inline int rt_se_prio(struct sched_rt_entity *rt_se)
*/
static void update_curr_rt(struct rq *rq)
{
- struct task_struct *curr = rq->curr;
+ struct task_struct *donor = rq->donor;
s64 delta_exec;
- if (curr->sched_class != &rt_sched_class)
+ if (donor->sched_class != &rt_sched_class)
return;
delta_exec = update_curr_common(rq);
@@ -999,7 +999,7 @@ static void update_curr_rt(struct rq *rq)
return;
#ifdef CONFIG_RT_GROUP_SCHED
- struct sched_rt_entity *rt_se = &curr->rt;
+ struct sched_rt_entity *rt_se = &donor->rt;
if (!rt_bandwidth_enabled())
return;
@@ -1535,7 +1535,7 @@ static int find_lowest_rq(struct task_struct *task);
static int
select_task_rq_rt(struct task_struct *p, int cpu, int flags)
{
- struct task_struct *curr;
+ struct task_struct *curr, *donor;
struct rq *rq;
bool test;
@@ -1547,6 +1547,7 @@ select_task_rq_rt(struct task_struct *p, int cpu, int flags)
rcu_read_lock();
curr = READ_ONCE(rq->curr); /* unlocked access */
+ donor = READ_ONCE(rq->donor);
/*
* If the current task on @p's runqueue is an RT task, then
@@ -1575,8 +1576,8 @@ select_task_rq_rt(struct task_struct *p, int cpu, int flags)
* systems like big.LITTLE.
*/
test = curr &&
- unlikely(rt_task(curr)) &&
- (curr->nr_cpus_allowed < 2 || curr->prio <= p->prio);
+ unlikely(rt_task(donor)) &&
+ (curr->nr_cpus_allowed < 2 || donor->prio <= p->prio);
if (test || !rt_task_fits_capacity(p, cpu)) {
int target = find_lowest_rq(p);
@@ -1606,12 +1607,8 @@ out:
static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
{
- /*
- * Current can't be migrated, useless to reschedule,
- * let's hope p can move out.
- */
if (rq->curr->nr_cpus_allowed == 1 ||
- !cpupri_find(&rq->rd->cpupri, rq->curr, NULL))
+ !cpupri_find(&rq->rd->cpupri, rq->donor, NULL))
return;
/*
@@ -1654,7 +1651,9 @@ static int balance_rt(struct rq *rq, struct task_struct *p, struct rq_flags *rf)
*/
static void wakeup_preempt_rt(struct rq *rq, struct task_struct *p, int flags)
{
- if (p->prio < rq->curr->prio) {
+ struct task_struct *donor = rq->donor;
+
+ if (p->prio < donor->prio) {
resched_curr(rq);
return;
}
@@ -1672,7 +1671,7 @@ static void wakeup_preempt_rt(struct rq *rq, struct task_struct *p, int flags)
* to move current somewhere else, making room for our non-migratable
* task.
*/
- if (p->prio == rq->curr->prio && !test_tsk_need_resched(rq->curr))
+ if (p->prio == donor->prio && !test_tsk_need_resched(rq->curr))
check_preempt_equal_prio(rq, p);
#endif
}
@@ -1697,7 +1696,7 @@ static inline void set_next_task_rt(struct rq *rq, struct task_struct *p, bool f
* utilization. We only care of the case where we start to schedule a
* rt task
*/
- if (rq->curr->sched_class != &rt_sched_class)
+ if (rq->donor->sched_class != &rt_sched_class)
update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 0);
rt_queue_push_tasks(rq);
@@ -1773,15 +1772,6 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p, struct task_s
/* Only try algorithms three times */
#define RT_MAX_TRIES 3
-static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
-{
- if (!task_on_cpu(rq, p) &&
- cpumask_test_cpu(cpu, &p->cpus_mask))
- return 1;
-
- return 0;
-}
-
/*
* Return the highest pushable rq's task, which is suitable to be executed
* on the CPU, NULL otherwise
@@ -1795,7 +1785,7 @@ static struct task_struct *pick_highest_pushable_task(struct rq *rq, int cpu)
return NULL;
plist_for_each_entry(p, head, pushable_tasks) {
- if (pick_rt_task(rq, p, cpu))
+ if (task_is_pushable(rq, p, cpu))
return p;
}
@@ -1968,6 +1958,7 @@ static struct task_struct *pick_next_pushable_task(struct rq *rq)
BUG_ON(rq->cpu != task_cpu(p));
BUG_ON(task_current(rq, p));
+ BUG_ON(task_current_donor(rq, p));
BUG_ON(p->nr_cpus_allowed <= 1);
BUG_ON(!task_on_rq_queued(p));
@@ -2000,7 +1991,7 @@ retry:
* higher priority than current. If that's the case
* just reschedule current.
*/
- if (unlikely(next_task->prio < rq->curr->prio)) {
+ if (unlikely(next_task->prio < rq->donor->prio)) {
resched_curr(rq);
return 0;
}
@@ -2021,7 +2012,7 @@ retry:
* Note that the stoppers are masqueraded as SCHED_FIFO
* (cf. sched_set_stop_task()), so we can't rely on rt_task().
*/
- if (rq->curr->sched_class != &rt_sched_class)
+ if (rq->donor->sched_class != &rt_sched_class)
return 0;
cpu = find_lowest_rq(rq->curr);
@@ -2088,9 +2079,7 @@ retry:
goto retry;
}
- deactivate_task(rq, next_task, 0);
- set_task_cpu(next_task, lowest_rq->cpu);
- activate_task(lowest_rq, next_task, 0);
+ move_queued_task_locked(rq, lowest_rq, next_task);
resched_curr(lowest_rq);
ret = 1;
@@ -2355,15 +2344,13 @@ static void pull_rt_task(struct rq *this_rq)
* p if it is lower in priority than the
* current task on the run queue
*/
- if (p->prio < src_rq->curr->prio)
+ if (p->prio < src_rq->donor->prio)
goto skip;
if (is_migration_disabled(p)) {
push_task = get_push_task(src_rq);
} else {
- deactivate_task(src_rq, p, 0);
- set_task_cpu(p, this_cpu);
- activate_task(this_rq, p, 0);
+ move_queued_task_locked(src_rq, this_rq, p);
resched = true;
}
/*
@@ -2399,9 +2386,9 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p)
bool need_to_push = !task_on_cpu(rq, p) &&
!test_tsk_need_resched(rq->curr) &&
p->nr_cpus_allowed > 1 &&
- (dl_task(rq->curr) || rt_task(rq->curr)) &&
+ (dl_task(rq->donor) || rt_task(rq->donor)) &&
(rq->curr->nr_cpus_allowed < 2 ||
- rq->curr->prio <= p->prio);
+ rq->donor->prio <= p->prio);
if (need_to_push)
push_rt_tasks(rq);
@@ -2485,7 +2472,7 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p)
if (p->nr_cpus_allowed > 1 && rq->rt.overloaded)
rt_queue_push_tasks(rq);
#endif /* CONFIG_SMP */
- if (p->prio < rq->curr->prio && cpu_online(cpu_of(rq)))
+ if (p->prio < rq->donor->prio && cpu_online(cpu_of(rq)))
resched_curr(rq);
}
}
@@ -2500,7 +2487,7 @@ prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio)
if (!task_on_rq_queued(p))
return;
- if (task_current(rq, p)) {
+ if (task_current_donor(rq, p)) {
#ifdef CONFIG_SMP
/*
* If our priority decreases while running, we
@@ -2526,7 +2513,7 @@ prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio)
* greater than the current running task
* then reschedule.
*/
- if (p->prio < rq->curr->prio)
+ if (p->prio < rq->donor->prio)
resched_curr(rq);
}
}
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index c03b3d7b320e..76f5f53a645f 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1148,7 +1148,10 @@ struct rq {
*/
unsigned int nr_uninterruptible;
- struct task_struct __rcu *curr;
+ union {
+ struct task_struct __rcu *donor; /* Scheduler context */
+ struct task_struct __rcu *curr; /* Execution context */
+ };
struct sched_dl_entity *dl_server;
struct task_struct *idle;
struct task_struct *stop;
@@ -1345,6 +1348,11 @@ DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
#define raw_rq() raw_cpu_ptr(&runqueues)
+static inline void rq_set_donor(struct rq *rq, struct task_struct *t)
+{
+ /* Do nothing */
+}
+
#ifdef CONFIG_SCHED_CORE
static inline struct cpumask *sched_group_span(struct sched_group *sg);
@@ -2086,34 +2094,6 @@ static inline const struct cpumask *task_user_cpus(struct task_struct *p)
#endif /* CONFIG_SMP */
-#include "stats.h"
-
-#if defined(CONFIG_SCHED_CORE) && defined(CONFIG_SCHEDSTATS)
-
-extern void __sched_core_account_forceidle(struct rq *rq);
-
-static inline void sched_core_account_forceidle(struct rq *rq)
-{
- if (schedstat_enabled())
- __sched_core_account_forceidle(rq);
-}
-
-extern void __sched_core_tick(struct rq *rq);
-
-static inline void sched_core_tick(struct rq *rq)
-{
- if (sched_core_enabled(rq) && schedstat_enabled())
- __sched_core_tick(rq);
-}
-
-#else /* !(CONFIG_SCHED_CORE && CONFIG_SCHEDSTATS): */
-
-static inline void sched_core_account_forceidle(struct rq *rq) { }
-
-static inline void sched_core_tick(struct rq *rq) { }
-
-#endif /* !(CONFIG_SCHED_CORE && CONFIG_SCHEDSTATS) */
-
#ifdef CONFIG_CGROUP_SCHED
/*
@@ -2261,11 +2241,25 @@ static inline u64 global_rt_runtime(void)
return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC;
}
+/*
+ * Is p the current execution context?
+ */
static inline int task_current(struct rq *rq, struct task_struct *p)
{
return rq->curr == p;
}
+/*
+ * Is p the current scheduling context?
+ *
+ * Note that it might be the current execution context at the same time if
+ * rq->curr == rq->donor == p.
+ */
+static inline int task_current_donor(struct rq *rq, struct task_struct *p)
+{
+ return rq->donor == p;
+}
+
static inline int task_on_cpu(struct rq *rq, struct task_struct *p)
{
#ifdef CONFIG_SMP
@@ -2452,7 +2446,7 @@ struct sched_class {
static inline void put_prev_task(struct rq *rq, struct task_struct *prev)
{
- WARN_ON_ONCE(rq->curr != prev);
+ WARN_ON_ONCE(rq->donor != prev);
prev->sched_class->put_prev_task(rq, prev, NULL);
}
@@ -2616,7 +2610,7 @@ static inline cpumask_t *alloc_user_cpus_ptr(int node)
static inline struct task_struct *get_push_task(struct rq *rq)
{
- struct task_struct *p = rq->curr;
+ struct task_struct *p = rq->donor;
lockdep_assert_rq_held(rq);
@@ -2696,6 +2690,7 @@ extern void init_sched_rt_class(void);
extern void init_sched_fair_class(void);
extern void resched_curr(struct rq *rq);
+extern void resched_curr_lazy(struct rq *rq);
extern void resched_cpu(int cpu);
extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime);
@@ -3200,6 +3195,34 @@ extern void nohz_run_idle_balance(int cpu);
static inline void nohz_run_idle_balance(int cpu) { }
#endif
+#include "stats.h"
+
+#if defined(CONFIG_SCHED_CORE) && defined(CONFIG_SCHEDSTATS)
+
+extern void __sched_core_account_forceidle(struct rq *rq);
+
+static inline void sched_core_account_forceidle(struct rq *rq)
+{
+ if (schedstat_enabled())
+ __sched_core_account_forceidle(rq);
+}
+
+extern void __sched_core_tick(struct rq *rq);
+
+static inline void sched_core_tick(struct rq *rq)
+{
+ if (sched_core_enabled(rq) && schedstat_enabled())
+ __sched_core_tick(rq);
+}
+
+#else /* !(CONFIG_SCHED_CORE && CONFIG_SCHEDSTATS): */
+
+static inline void sched_core_account_forceidle(struct rq *rq) { }
+
+static inline void sched_core_tick(struct rq *rq) { }
+
+#endif /* !(CONFIG_SCHED_CORE && CONFIG_SCHEDSTATS) */
+
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
struct irqtime {
@@ -3630,24 +3653,41 @@ static inline void mm_cid_put(struct mm_struct *mm)
__mm_cid_put(mm, mm_cid_clear_lazy_put(cid));
}
-static inline int __mm_cid_try_get(struct mm_struct *mm)
+static inline int __mm_cid_try_get(struct task_struct *t, struct mm_struct *mm)
{
- struct cpumask *cpumask;
- int cid;
+ struct cpumask *cidmask = mm_cidmask(mm);
+ struct mm_cid __percpu *pcpu_cid = mm->pcpu_cid;
+ int cid = __this_cpu_read(pcpu_cid->recent_cid);
- cpumask = mm_cidmask(mm);
+ /* Try to re-use recent cid. This improves cache locality. */
+ if (!mm_cid_is_unset(cid) && !cpumask_test_and_set_cpu(cid, cidmask))
+ return cid;
/*
+ * Expand cid allocation if the maximum number of concurrency
+ * IDs allocated (max_nr_cid) is below the number cpus allowed
+ * and number of threads. Expanding cid allocation as much as
+ * possible improves cache locality.
+ */
+ cid = atomic_read(&mm->max_nr_cid);
+ while (cid < READ_ONCE(mm->nr_cpus_allowed) && cid < atomic_read(&mm->mm_users)) {
+ if (!atomic_try_cmpxchg(&mm->max_nr_cid, &cid, cid + 1))
+ continue;
+ if (!cpumask_test_and_set_cpu(cid, cidmask))
+ return cid;
+ }
+ /*
+ * Find the first available concurrency id.
* Retry finding first zero bit if the mask is temporarily
* filled. This only happens during concurrent remote-clear
* which owns a cid without holding a rq lock.
*/
for (;;) {
- cid = cpumask_first_zero(cpumask);
- if (cid < nr_cpu_ids)
+ cid = cpumask_first_zero(cidmask);
+ if (cid < READ_ONCE(mm->nr_cpus_allowed))
break;
cpu_relax();
}
- if (cpumask_test_and_set_cpu(cid, cpumask))
+ if (cpumask_test_and_set_cpu(cid, cidmask))
return -1;
return cid;
@@ -3665,7 +3705,8 @@ static inline void mm_cid_snapshot_time(struct rq *rq, struct mm_struct *mm)
WRITE_ONCE(pcpu_cid->time, rq->clock);
}
-static inline int __mm_cid_get(struct rq *rq, struct mm_struct *mm)
+static inline int __mm_cid_get(struct rq *rq, struct task_struct *t,
+ struct mm_struct *mm)
{
int cid;
@@ -3675,13 +3716,13 @@ static inline int __mm_cid_get(struct rq *rq, struct mm_struct *mm)
* guarantee forward progress.
*/
if (!READ_ONCE(use_cid_lock)) {
- cid = __mm_cid_try_get(mm);
+ cid = __mm_cid_try_get(t, mm);
if (cid >= 0)
goto end;
raw_spin_lock(&cid_lock);
} else {
raw_spin_lock(&cid_lock);
- cid = __mm_cid_try_get(mm);
+ cid = __mm_cid_try_get(t, mm);
if (cid >= 0)
goto unlock;
}
@@ -3701,7 +3742,7 @@ static inline int __mm_cid_get(struct rq *rq, struct mm_struct *mm)
* all newcoming allocations observe the use_cid_lock flag set.
*/
do {
- cid = __mm_cid_try_get(mm);
+ cid = __mm_cid_try_get(t, mm);
cpu_relax();
} while (cid < 0);
/*
@@ -3718,7 +3759,8 @@ end:
return cid;
}
-static inline int mm_cid_get(struct rq *rq, struct mm_struct *mm)
+static inline int mm_cid_get(struct rq *rq, struct task_struct *t,
+ struct mm_struct *mm)
{
struct mm_cid __percpu *pcpu_cid = mm->pcpu_cid;
struct cpumask *cpumask;
@@ -3735,8 +3777,9 @@ static inline int mm_cid_get(struct rq *rq, struct mm_struct *mm)
if (try_cmpxchg(&this_cpu_ptr(pcpu_cid)->cid, &cid, MM_CID_UNSET))
__mm_cid_put(mm, mm_cid_clear_lazy_put(cid));
}
- cid = __mm_cid_get(rq, mm);
+ cid = __mm_cid_get(rq, t, mm);
__this_cpu_write(pcpu_cid->cid, cid);
+ __this_cpu_write(pcpu_cid->recent_cid, cid);
return cid;
}
@@ -3789,7 +3832,7 @@ static inline void switch_mm_cid(struct rq *rq,
prev->mm_cid = -1;
}
if (next->mm_cid_active)
- next->last_mm_cid = next->mm_cid = mm_cid_get(rq, next->mm);
+ next->last_mm_cid = next->mm_cid = mm_cid_get(rq, next, next->mm);
}
#else /* !CONFIG_SCHED_MM_CID: */
@@ -3802,6 +3845,28 @@ static inline void init_sched_mm_cid(struct task_struct *t) { }
extern u64 avg_vruntime(struct cfs_rq *cfs_rq);
extern int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se);
+#ifdef CONFIG_SMP
+static inline
+void move_queued_task_locked(struct rq *src_rq, struct rq *dst_rq, struct task_struct *task)
+{
+ lockdep_assert_rq_held(src_rq);
+ lockdep_assert_rq_held(dst_rq);
+
+ deactivate_task(src_rq, task, 0);
+ set_task_cpu(task, dst_rq->cpu);
+ activate_task(dst_rq, task, 0);
+}
+
+static inline
+bool task_is_pushable(struct rq *rq, struct task_struct *p, int cpu)
+{
+ if (!task_on_cpu(rq, p) &&
+ cpumask_test_cpu(cpu, &p->cpus_mask))
+ return true;
+
+ return false;
+}
+#endif
#ifdef CONFIG_RT_MUTEXES
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index 767e098a3bd1..8ee0add5a48a 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -127,21 +127,25 @@ static inline void psi_account_irqtime(struct rq *rq, struct task_struct *curr,
* go through migration requeues. In this case, *sleeping* states need
* to be transferred.
*/
-static inline void psi_enqueue(struct task_struct *p, bool migrate)
+static inline void psi_enqueue(struct task_struct *p, int flags)
{
int clear = 0, set = 0;
if (static_branch_likely(&psi_disabled))
return;
+ /* Same runqueue, nothing changed for psi */
+ if (flags & ENQUEUE_RESTORE)
+ return;
+
if (p->se.sched_delayed) {
/* CPU migration of "sleeping" task */
- SCHED_WARN_ON(!migrate);
+ SCHED_WARN_ON(!(flags & ENQUEUE_MIGRATED));
if (p->in_memstall)
set |= TSK_MEMSTALL;
if (p->in_iowait)
set |= TSK_IOWAIT;
- } else if (migrate) {
+ } else if (flags & ENQUEUE_MIGRATED) {
/* CPU migration of runnable task */
set = TSK_RUNNING;
if (p->in_memstall)
@@ -158,17 +162,14 @@ static inline void psi_enqueue(struct task_struct *p, bool migrate)
psi_task_change(p, clear, set);
}
-static inline void psi_dequeue(struct task_struct *p, bool migrate)
+static inline void psi_dequeue(struct task_struct *p, int flags)
{
if (static_branch_likely(&psi_disabled))
return;
- /*
- * When migrating a task to another CPU, clear all psi
- * state. The enqueue callback above will work it out.
- */
- if (migrate)
- psi_task_change(p, p->psi_flags, 0);
+ /* Same runqueue, nothing changed for psi */
+ if (flags & DEQUEUE_SAVE)
+ return;
/*
* A voluntary sleep is a dequeue followed by a task switch. To
@@ -176,6 +177,14 @@ static inline void psi_dequeue(struct task_struct *p, bool migrate)
* TSK_RUNNING and TSK_IOWAIT for us when it moves TSK_ONCPU.
* Do nothing here.
*/
+ if (flags & DEQUEUE_SLEEP)
+ return;
+
+ /*
+ * When migrating a task to another CPU, clear all psi
+ * state. The enqueue callback above will work it out.
+ */
+ psi_task_change(p, p->psi_flags, 0);
}
static inline void psi_ttwu_dequeue(struct task_struct *p)
diff --git a/kernel/sched/syscalls.c b/kernel/sched/syscalls.c
index 24f9f90b6574..0d71fcbaf1e3 100644
--- a/kernel/sched/syscalls.c
+++ b/kernel/sched/syscalls.c
@@ -91,7 +91,7 @@ void set_user_nice(struct task_struct *p, long nice)
}
queued = task_on_rq_queued(p);
- running = task_current(rq, p);
+ running = task_current_donor(rq, p);
if (queued)
dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK);
if (running)
@@ -713,7 +713,7 @@ change:
dequeue_task(rq, p, DEQUEUE_SLEEP | DEQUEUE_DELAYED | DEQUEUE_NOCLOCK);
queued = task_on_rq_queued(p);
- running = task_current(rq, p);
+ running = task_current_donor(rq, p);
if (queued)
dequeue_task(rq, p, queue_flags);
if (running)
@@ -1081,45 +1081,6 @@ SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
return copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
}
-/*
- * Copy the kernel size attribute structure (which might be larger
- * than what user-space knows about) to user-space.
- *
- * Note that all cases are valid: user-space buffer can be larger or
- * smaller than the kernel-space buffer. The usual case is that both
- * have the same size.
- */
-static int
-sched_attr_copy_to_user(struct sched_attr __user *uattr,
- struct sched_attr *kattr,
- unsigned int usize)
-{
- unsigned int ksize = sizeof(*kattr);
-
- if (!access_ok(uattr, usize))
- return -EFAULT;
-
- /*
- * sched_getattr() ABI forwards and backwards compatibility:
- *
- * If usize == ksize then we just copy everything to user-space and all is good.
- *
- * If usize < ksize then we only copy as much as user-space has space for,
- * this keeps ABI compatibility as well. We skip the rest.
- *
- * If usize > ksize then user-space is using a newer version of the ABI,
- * which part the kernel doesn't know about. Just ignore it - tooling can
- * detect the kernel's knowledge of attributes from the attr->size value
- * which is set to ksize in this case.
- */
- kattr->size = min(usize, ksize);
-
- if (copy_to_user(uattr, kattr, kattr->size))
- return -EFAULT;
-
- return 0;
-}
-
/**
* sys_sched_getattr - similar to sched_getparam, but with sched_attr
* @pid: the pid in question.
@@ -1164,7 +1125,8 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
#endif
}
- return sched_attr_copy_to_user(uattr, &kattr, usize);
+ kattr.size = min(usize, sizeof(kattr));
+ return copy_struct_to_user(uattr, usize, &kattr, sizeof(kattr), NULL);
}
#ifdef CONFIG_SMP
diff --git a/kernel/sched/wait_bit.c b/kernel/sched/wait_bit.c
index 134d7112ef71..b410b61cec95 100644
--- a/kernel/sched/wait_bit.c
+++ b/kernel/sched/wait_bit.c
@@ -9,7 +9,7 @@
static wait_queue_head_t bit_wait_table[WAIT_TABLE_SIZE] __cacheline_aligned;
-wait_queue_head_t *bit_waitqueue(void *word, int bit)
+wait_queue_head_t *bit_waitqueue(unsigned long *word, int bit)
{
const int shift = BITS_PER_LONG == 32 ? 5 : 6;
unsigned long val = (unsigned long)word << shift | bit;
@@ -55,7 +55,7 @@ __wait_on_bit(struct wait_queue_head *wq_head, struct wait_bit_queue_entry *wbq_
}
EXPORT_SYMBOL(__wait_on_bit);
-int __sched out_of_line_wait_on_bit(void *word, int bit,
+int __sched out_of_line_wait_on_bit(unsigned long *word, int bit,
wait_bit_action_f *action, unsigned mode)
{
struct wait_queue_head *wq_head = bit_waitqueue(word, bit);
@@ -66,7 +66,7 @@ int __sched out_of_line_wait_on_bit(void *word, int bit,
EXPORT_SYMBOL(out_of_line_wait_on_bit);
int __sched out_of_line_wait_on_bit_timeout(
- void *word, int bit, wait_bit_action_f *action,
+ unsigned long *word, int bit, wait_bit_action_f *action,
unsigned mode, unsigned long timeout)
{
struct wait_queue_head *wq_head = bit_waitqueue(word, bit);
@@ -108,7 +108,7 @@ __wait_on_bit_lock(struct wait_queue_head *wq_head, struct wait_bit_queue_entry
}
EXPORT_SYMBOL(__wait_on_bit_lock);
-int __sched out_of_line_wait_on_bit_lock(void *word, int bit,
+int __sched out_of_line_wait_on_bit_lock(unsigned long *word, int bit,
wait_bit_action_f *action, unsigned mode)
{
struct wait_queue_head *wq_head = bit_waitqueue(word, bit);
@@ -118,7 +118,7 @@ int __sched out_of_line_wait_on_bit_lock(void *word, int bit,
}
EXPORT_SYMBOL(out_of_line_wait_on_bit_lock);
-void __wake_up_bit(struct wait_queue_head *wq_head, void *word, int bit)
+void __wake_up_bit(struct wait_queue_head *wq_head, unsigned long *word, int bit)
{
struct wait_bit_key key = __WAIT_BIT_KEY_INITIALIZER(word, bit);
@@ -128,23 +128,31 @@ void __wake_up_bit(struct wait_queue_head *wq_head, void *word, int bit)
EXPORT_SYMBOL(__wake_up_bit);
/**
- * wake_up_bit - wake up a waiter on a bit
- * @word: the word being waited on, a kernel virtual address
- * @bit: the bit of the word being waited on
+ * wake_up_bit - wake up waiters on a bit
+ * @word: the address containing the bit being waited on
+ * @bit: the bit at that address being waited on
*
- * There is a standard hashed waitqueue table for generic use. This
- * is the part of the hash-table's accessor API that wakes up waiters
- * on a bit. For instance, if one were to have waiters on a bitflag,
- * one would call wake_up_bit() after clearing the bit.
+ * Wake up any process waiting in wait_on_bit() or similar for the
+ * given bit to be cleared.
*
- * In order for this to function properly, as it uses waitqueue_active()
- * internally, some kind of memory barrier must be done prior to calling
- * this. Typically, this will be smp_mb__after_atomic(), but in some
- * cases where bitflags are manipulated non-atomically under a lock, one
- * may need to use a less regular barrier, such fs/inode.c's smp_mb(),
- * because spin_unlock() does not guarantee a memory barrier.
+ * The wake-up is sent to tasks in a waitqueue selected by hash from a
+ * shared pool. Only those tasks on that queue which have requested
+ * wake_up on this specific address and bit will be woken, and only if the
+ * bit is clear.
+ *
+ * In order for this to function properly there must be a full memory
+ * barrier after the bit is cleared and before this function is called.
+ * If the bit was cleared atomically, such as a by clear_bit() then
+ * smb_mb__after_atomic() can be used, othwewise smb_mb() is needed.
+ * If the bit was cleared with a fully-ordered operation, no further
+ * barrier is required.
+ *
+ * Normally the bit should be cleared by an operation with RELEASE
+ * semantics so that any changes to memory made before the bit is
+ * cleared are guaranteed to be visible after the matching wait_on_bit()
+ * completes.
*/
-void wake_up_bit(void *word, int bit)
+void wake_up_bit(unsigned long *word, int bit)
{
__wake_up_bit(bit_waitqueue(word, bit), word, bit);
}
@@ -188,6 +196,36 @@ void init_wait_var_entry(struct wait_bit_queue_entry *wbq_entry, void *var, int
}
EXPORT_SYMBOL(init_wait_var_entry);
+/**
+ * wake_up_var - wake up waiters on a variable (kernel address)
+ * @var: the address of the variable being waited on
+ *
+ * Wake up any process waiting in wait_var_event() or similar for the
+ * given variable to change. wait_var_event() can be waiting for an
+ * arbitrary condition to be true and associates that condition with an
+ * address. Calling wake_up_var() suggests that the condition has been
+ * made true, but does not strictly require the condtion to use the
+ * address given.
+ *
+ * The wake-up is sent to tasks in a waitqueue selected by hash from a
+ * shared pool. Only those tasks on that queue which have requested
+ * wake_up on this specific address will be woken.
+ *
+ * In order for this to function properly there must be a full memory
+ * barrier after the variable is updated (or more accurately, after the
+ * condition waited on has been made to be true) and before this function
+ * is called. If the variable was updated atomically, such as a by
+ * atomic_dec() then smb_mb__after_atomic() can be used. If the
+ * variable was updated by a fully ordered operation such as
+ * atomic_dec_and_test() then no extra barrier is required. Otherwise
+ * smb_mb() is needed.
+ *
+ * Normally the variable should be updated (the condition should be made
+ * to be true) by an operation with RELEASE semantics such as
+ * smp_store_release() so that any changes to memory made before the
+ * variable was updated are guaranteed to be visible after the matching
+ * wait_var_event() completes.
+ */
void wake_up_var(void *var)
{
__wake_up_bit(__var_waitqueue(var), var, -1);
@@ -228,20 +266,6 @@ __sched int bit_wait_timeout(struct wait_bit_key *word, int mode)
}
EXPORT_SYMBOL_GPL(bit_wait_timeout);
-__sched int bit_wait_io_timeout(struct wait_bit_key *word, int mode)
-{
- unsigned long now = READ_ONCE(jiffies);
-
- if (time_after_eq(now, word->timeout))
- return -EAGAIN;
- io_schedule_timeout(word->timeout - now);
- if (signal_pending_state(mode, current))
- return -EINTR;
-
- return 0;
-}
-EXPORT_SYMBOL_GPL(bit_wait_io_timeout);
-
void __init wait_bit_init(void)
{
int i;