aboutsummaryrefslogtreecommitdiff
path: root/kernel/sched
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/sched')
-rw-r--r--kernel/sched/core.c82
-rw-r--r--kernel/sched/deadline.c2
-rw-r--r--kernel/sched/ext.c222
-rw-r--r--kernel/sched/fair.c27
-rw-r--r--kernel/sched/sched.h5
-rw-r--r--kernel/sched/stats.h48
-rw-r--r--kernel/sched/syscalls.c13
7 files changed, 230 insertions, 169 deletions
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 43e453ab7e20..dbfb5717d6af 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -548,6 +548,11 @@ sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags) { }
* ON_RQ_MIGRATING state is used for migration without holding both
* rq->locks. It indicates task_cpu() is not stable, see task_rq_lock().
*
+ * Additionally it is possible to be ->on_rq but still be considered not
+ * runnable when p->se.sched_delayed is true. These tasks are on the runqueue
+ * but will be dequeued as soon as they get picked again. See the
+ * task_is_runnable() helper.
+ *
* p->on_cpu <- { 0, 1 }:
*
* is set by prepare_task() and cleared by finish_task() such that it will be
@@ -2012,11 +2017,6 @@ void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
if (!(flags & ENQUEUE_NOCLOCK))
update_rq_clock(rq);
- if (!(flags & ENQUEUE_RESTORE)) {
- sched_info_enqueue(rq, p);
- psi_enqueue(p, (flags & ENQUEUE_WAKEUP) && !(flags & ENQUEUE_MIGRATED));
- }
-
p->sched_class->enqueue_task(rq, p, flags);
/*
* Must be after ->enqueue_task() because ENQUEUE_DELAYED can clear
@@ -2024,6 +2024,11 @@ void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
*/
uclamp_rq_inc(rq, p);
+ if (!(flags & ENQUEUE_RESTORE)) {
+ sched_info_enqueue(rq, p);
+ psi_enqueue(p, flags & ENQUEUE_MIGRATED);
+ }
+
if (sched_core_enabled(rq))
sched_core_enqueue(rq, p);
}
@@ -2041,7 +2046,7 @@ inline bool dequeue_task(struct rq *rq, struct task_struct *p, int flags)
if (!(flags & DEQUEUE_SAVE)) {
sched_info_dequeue(rq, p);
- psi_dequeue(p, flags & DEQUEUE_SLEEP);
+ psi_dequeue(p, !(flags & DEQUEUE_SLEEP));
}
/*
@@ -3518,14 +3523,16 @@ out:
* The caller (fork, wakeup) owns p->pi_lock, ->cpus_ptr is stable.
*/
static inline
-int select_task_rq(struct task_struct *p, int cpu, int wake_flags)
+int select_task_rq(struct task_struct *p, int cpu, int *wake_flags)
{
lockdep_assert_held(&p->pi_lock);
- if (p->nr_cpus_allowed > 1 && !is_migration_disabled(p))
- cpu = p->sched_class->select_task_rq(p, cpu, wake_flags);
- else
+ if (p->nr_cpus_allowed > 1 && !is_migration_disabled(p)) {
+ cpu = p->sched_class->select_task_rq(p, cpu, *wake_flags);
+ *wake_flags |= WF_RQ_SELECTED;
+ } else {
cpu = cpumask_any(p->cpus_ptr);
+ }
/*
* In order not to call set_task_cpu() on a blocking task we need
@@ -3659,6 +3666,8 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags,
rq->nr_uninterruptible--;
#ifdef CONFIG_SMP
+ if (wake_flags & WF_RQ_SELECTED)
+ en_flags |= ENQUEUE_RQ_SELECTED;
if (wake_flags & WF_MIGRATED)
en_flags |= ENQUEUE_MIGRATED;
else
@@ -4120,6 +4129,8 @@ int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
guard(preempt)();
int cpu, success = 0;
+ wake_flags |= WF_TTWU;
+
if (p == current) {
/*
* We're waking current, this means 'p->on_rq' and 'task_cpu(p)
@@ -4252,7 +4263,7 @@ int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
*/
smp_cond_load_acquire(&p->on_cpu, !VAL);
- cpu = select_task_rq(p, p->wake_cpu, wake_flags | WF_TTWU);
+ cpu = select_task_rq(p, p->wake_cpu, &wake_flags);
if (task_cpu(p) != cpu) {
if (p->in_iowait) {
delayacct_blkio_end(p);
@@ -4317,9 +4328,10 @@ static bool __task_needs_rq_lock(struct task_struct *p)
* @arg: Argument to function.
*
* Fix the task in it's current state by avoiding wakeups and or rq operations
- * and call @func(@arg) on it. This function can use ->on_rq and task_curr()
- * to work out what the state is, if required. Given that @func can be invoked
- * with a runqueue lock held, it had better be quite lightweight.
+ * and call @func(@arg) on it. This function can use task_is_runnable() and
+ * task_curr() to work out what the state is, if required. Given that @func
+ * can be invoked with a runqueue lock held, it had better be quite
+ * lightweight.
*
* Returns:
* Whatever @func returns
@@ -4793,6 +4805,7 @@ void wake_up_new_task(struct task_struct *p)
{
struct rq_flags rf;
struct rq *rq;
+ int wake_flags = WF_FORK;
raw_spin_lock_irqsave(&p->pi_lock, rf.flags);
WRITE_ONCE(p->__state, TASK_RUNNING);
@@ -4807,7 +4820,7 @@ void wake_up_new_task(struct task_struct *p)
*/
p->recent_used_cpu = task_cpu(p);
rseq_migrate(p);
- __set_task_cpu(p, select_task_rq(p, task_cpu(p), WF_FORK));
+ __set_task_cpu(p, select_task_rq(p, task_cpu(p), &wake_flags));
#endif
rq = __task_rq_lock(p, &rf);
update_rq_clock(rq);
@@ -4815,7 +4828,7 @@ void wake_up_new_task(struct task_struct *p)
activate_task(rq, p, ENQUEUE_NOCLOCK | ENQUEUE_INITIAL);
trace_sched_wakeup_new(p);
- wakeup_preempt(rq, p, WF_FORK);
+ wakeup_preempt(rq, p, wake_flags);
#ifdef CONFIG_SMP
if (p->sched_class->task_woken) {
/*
@@ -6537,6 +6550,7 @@ static void __sched notrace __schedule(int sched_mode)
* as a preemption by schedule_debug() and RCU.
*/
bool preempt = sched_mode > SM_NONE;
+ bool block = false;
unsigned long *switch_count;
unsigned long prev_state;
struct rq_flags rf;
@@ -6622,6 +6636,7 @@ static void __sched notrace __schedule(int sched_mode)
* After this, schedule() must not care about p->state any more.
*/
block_task(rq, prev, flags);
+ block = true;
}
switch_count = &prev->nvcsw;
}
@@ -6667,7 +6682,7 @@ picked:
migrate_disable_switch(rq, prev);
psi_account_irqtime(rq, prev, next);
- psi_sched_switch(prev, next, !task_on_rq_queued(prev));
+ psi_sched_switch(prev, next, block);
trace_sched_switch(preempt, prev, next, prev_state);
@@ -7010,20 +7025,20 @@ int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flag
}
EXPORT_SYMBOL(default_wake_function);
-void __setscheduler_prio(struct task_struct *p, int prio)
+const struct sched_class *__setscheduler_class(struct task_struct *p, int prio)
{
if (dl_prio(prio))
- p->sched_class = &dl_sched_class;
- else if (rt_prio(prio))
- p->sched_class = &rt_sched_class;
+ return &dl_sched_class;
+
+ if (rt_prio(prio))
+ return &rt_sched_class;
+
#ifdef CONFIG_SCHED_CLASS_EXT
- else if (task_should_scx(p))
- p->sched_class = &ext_sched_class;
+ if (task_should_scx(p))
+ return &ext_sched_class;
#endif
- else
- p->sched_class = &fair_sched_class;
- p->prio = prio;
+ return &fair_sched_class;
}
#ifdef CONFIG_RT_MUTEXES
@@ -7069,7 +7084,7 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
{
int prio, oldprio, queued, running, queue_flag =
DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
- const struct sched_class *prev_class;
+ const struct sched_class *prev_class, *next_class;
struct rq_flags rf;
struct rq *rq;
@@ -7127,6 +7142,11 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
queue_flag &= ~DEQUEUE_MOVE;
prev_class = p->sched_class;
+ next_class = __setscheduler_class(p, prio);
+
+ if (prev_class != next_class && p->se.sched_delayed)
+ dequeue_task(rq, p, DEQUEUE_SLEEP | DEQUEUE_DELAYED | DEQUEUE_NOCLOCK);
+
queued = task_on_rq_queued(p);
running = task_current(rq, p);
if (queued)
@@ -7164,7 +7184,9 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
p->rt.timeout = 0;
}
- __setscheduler_prio(p, prio);
+ p->sched_class = next_class;
+ p->prio = prio;
+
check_class_changing(rq, p, prev_class);
if (queued)
@@ -10458,7 +10480,9 @@ void task_tick_mm_cid(struct rq *rq, struct task_struct *curr)
return;
if (time_before(now, READ_ONCE(curr->mm->mm_cid_next_scan)))
return;
- task_work_add(curr, work, TWA_RESUME);
+
+ /* No page allocation under rq lock */
+ task_work_add(curr, work, TWA_RESUME | TWAF_NO_ALLOC);
}
void sched_mm_cid_exit_signals(struct task_struct *t)
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 9ce93d0bf452..be1b917dc8ce 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -2385,7 +2385,7 @@ static void set_next_task_dl(struct rq *rq, struct task_struct *p, bool first)
deadline_queue_push_tasks(rq);
- if (hrtick_enabled(rq))
+ if (hrtick_enabled_dl(rq))
start_hrtick_dl(rq, &p->dl);
}
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 3cd7c50a51c5..5900b06fd036 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -9,7 +9,6 @@
#define SCX_OP_IDX(op) (offsetof(struct sched_ext_ops, op) / sizeof(void (*)(void)))
enum scx_consts {
- SCX_SLICE_BYPASS = SCX_SLICE_DFL / 4,
SCX_DSP_DFL_MAX_BATCH = 32,
SCX_DSP_MAX_LOOPS = 32,
SCX_WATCHDOG_MAX_TIMEOUT = 30 * HZ,
@@ -19,6 +18,12 @@ enum scx_consts {
SCX_EXIT_DUMP_DFL_LEN = 32768,
SCX_CPUPERF_ONE = SCHED_CAPACITY_SCALE,
+
+ /*
+ * Iterating all tasks may take a while. Periodically drop
+ * scx_tasks_lock to avoid causing e.g. CSD and RCU stalls.
+ */
+ SCX_OPS_TASK_ITER_BATCH = 32,
};
enum scx_exit_kind {
@@ -625,6 +630,10 @@ struct sched_ext_ops {
/**
* exit - Clean up after the BPF scheduler
* @info: Exit info
+ *
+ * ops.exit() is also called on ops.init() failure, which is a bit
+ * unusual. This is to allow rich reporting through @info on how
+ * ops.init() failed.
*/
void (*exit)(struct scx_exit_info *info);
@@ -692,6 +701,7 @@ enum scx_enq_flags {
/* expose select ENQUEUE_* flags as enums */
SCX_ENQ_WAKEUP = ENQUEUE_WAKEUP,
SCX_ENQ_HEAD = ENQUEUE_HEAD,
+ SCX_ENQ_CPU_SELECTED = ENQUEUE_RQ_SELECTED,
/* high 32bits are SCX specific */
@@ -1269,86 +1279,104 @@ struct scx_task_iter {
struct task_struct *locked;
struct rq *rq;
struct rq_flags rf;
+ u32 cnt;
};
/**
- * scx_task_iter_init - Initialize a task iterator
+ * scx_task_iter_start - Lock scx_tasks_lock and start a task iteration
* @iter: iterator to init
*
- * Initialize @iter. Must be called with scx_tasks_lock held. Once initialized,
- * @iter must eventually be exited with scx_task_iter_exit().
+ * Initialize @iter and return with scx_tasks_lock held. Once initialized, @iter
+ * must eventually be stopped with scx_task_iter_stop().
*
- * scx_tasks_lock may be released between this and the first next() call or
- * between any two next() calls. If scx_tasks_lock is released between two
- * next() calls, the caller is responsible for ensuring that the task being
- * iterated remains accessible either through RCU read lock or obtaining a
- * reference count.
+ * scx_tasks_lock and the rq lock may be released using scx_task_iter_unlock()
+ * between this and the first next() call or between any two next() calls. If
+ * the locks are released between two next() calls, the caller is responsible
+ * for ensuring that the task being iterated remains accessible either through
+ * RCU read lock or obtaining a reference count.
*
* All tasks which existed when the iteration started are guaranteed to be
* visited as long as they still exist.
*/
-static void scx_task_iter_init(struct scx_task_iter *iter)
+static void scx_task_iter_start(struct scx_task_iter *iter)
{
- lockdep_assert_held(&scx_tasks_lock);
-
BUILD_BUG_ON(__SCX_DSQ_ITER_ALL_FLAGS &
((1U << __SCX_DSQ_LNODE_PRIV_SHIFT) - 1));
+ spin_lock_irq(&scx_tasks_lock);
+
iter->cursor = (struct sched_ext_entity){ .flags = SCX_TASK_CURSOR };
list_add(&iter->cursor.tasks_node, &scx_tasks);
iter->locked = NULL;
+ iter->cnt = 0;
+}
+
+static void __scx_task_iter_rq_unlock(struct scx_task_iter *iter)
+{
+ if (iter->locked) {
+ task_rq_unlock(iter->rq, iter->locked, &iter->rf);
+ iter->locked = NULL;
+ }
}
/**
- * scx_task_iter_rq_unlock - Unlock rq locked by a task iterator
- * @iter: iterator to unlock rq for
+ * scx_task_iter_unlock - Unlock rq and scx_tasks_lock held by a task iterator
+ * @iter: iterator to unlock
*
* If @iter is in the middle of a locked iteration, it may be locking the rq of
- * the task currently being visited. Unlock the rq if so. This function can be
- * safely called anytime during an iteration.
+ * the task currently being visited in addition to scx_tasks_lock. Unlock both.
+ * This function can be safely called anytime during an iteration.
+ */
+static void scx_task_iter_unlock(struct scx_task_iter *iter)
+{
+ __scx_task_iter_rq_unlock(iter);
+ spin_unlock_irq(&scx_tasks_lock);
+}
+
+/**
+ * scx_task_iter_relock - Lock scx_tasks_lock released by scx_task_iter_unlock()
+ * @iter: iterator to re-lock
*
- * Returns %true if the rq @iter was locking is unlocked. %false if @iter was
- * not locking an rq.
+ * Re-lock scx_tasks_lock unlocked by scx_task_iter_unlock(). Note that it
+ * doesn't re-lock the rq lock. Must be called before other iterator operations.
*/
-static bool scx_task_iter_rq_unlock(struct scx_task_iter *iter)
+static void scx_task_iter_relock(struct scx_task_iter *iter)
{
- if (iter->locked) {
- task_rq_unlock(iter->rq, iter->locked, &iter->rf);
- iter->locked = NULL;
- return true;
- } else {
- return false;
- }
+ spin_lock_irq(&scx_tasks_lock);
}
/**
- * scx_task_iter_exit - Exit a task iterator
+ * scx_task_iter_stop - Stop a task iteration and unlock scx_tasks_lock
* @iter: iterator to exit
*
- * Exit a previously initialized @iter. Must be called with scx_tasks_lock held.
- * If the iterator holds a task's rq lock, that rq lock is released. See
- * scx_task_iter_init() for details.
+ * Exit a previously initialized @iter. Must be called with scx_tasks_lock held
+ * which is released on return. If the iterator holds a task's rq lock, that rq
+ * lock is also released. See scx_task_iter_start() for details.
*/
-static void scx_task_iter_exit(struct scx_task_iter *iter)
+static void scx_task_iter_stop(struct scx_task_iter *iter)
{
- lockdep_assert_held(&scx_tasks_lock);
-
- scx_task_iter_rq_unlock(iter);
list_del_init(&iter->cursor.tasks_node);
+ scx_task_iter_unlock(iter);
}
/**
* scx_task_iter_next - Next task
* @iter: iterator to walk
*
- * Visit the next task. See scx_task_iter_init() for details.
+ * Visit the next task. See scx_task_iter_start() for details. Locks are dropped
+ * and re-acquired every %SCX_OPS_TASK_ITER_BATCH iterations to avoid causing
+ * stalls by holding scx_tasks_lock for too long.
*/
static struct task_struct *scx_task_iter_next(struct scx_task_iter *iter)
{
struct list_head *cursor = &iter->cursor.tasks_node;
struct sched_ext_entity *pos;
- lockdep_assert_held(&scx_tasks_lock);
+ if (!(++iter->cnt % SCX_OPS_TASK_ITER_BATCH)) {
+ scx_task_iter_unlock(iter);
+ cond_resched();
+ scx_task_iter_relock(iter);
+ }
list_for_each_entry(pos, cursor, tasks_node) {
if (&pos->tasks_node == &scx_tasks)
@@ -1369,14 +1397,14 @@ static struct task_struct *scx_task_iter_next(struct scx_task_iter *iter)
* @include_dead: Whether we should include dead tasks in the iteration
*
* Visit the non-idle task with its rq lock held. Allows callers to specify
- * whether they would like to filter out dead tasks. See scx_task_iter_init()
+ * whether they would like to filter out dead tasks. See scx_task_iter_start()
* for details.
*/
static struct task_struct *scx_task_iter_next_locked(struct scx_task_iter *iter)
{
struct task_struct *p;
- scx_task_iter_rq_unlock(iter);
+ __scx_task_iter_rq_unlock(iter);
while ((p = scx_task_iter_next(iter))) {
/*
@@ -1944,7 +1972,6 @@ static bool scx_rq_online(struct rq *rq)
static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags,
int sticky_cpu)
{
- bool bypassing = scx_rq_bypassing(rq);
struct task_struct **ddsp_taskp;
unsigned long qseq;
@@ -1962,7 +1989,7 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags,
if (!scx_rq_online(rq))
goto local;
- if (bypassing)
+ if (scx_rq_bypassing(rq))
goto global;
if (p->scx.ddsp_dsq_id != SCX_DSQ_INVALID)
@@ -2017,7 +2044,7 @@ local_norefill:
global:
touch_core_sched(rq, p); /* see the comment in local: */
- p->scx.slice = bypassing ? SCX_SLICE_BYPASS : SCX_SLICE_DFL;
+ p->scx.slice = SCX_SLICE_DFL;
dispatch_enqueue(find_global_dsq(p), p, enq_flags);
}
@@ -2953,8 +2980,8 @@ static struct task_struct *pick_task_scx(struct rq *rq)
if (unlikely(!p->scx.slice)) {
if (!scx_rq_bypassing(rq) && !scx_warned_zero_slice) {
- printk_deferred(KERN_WARNING "sched_ext: %s[%d] has zero slice in pick_next_task_scx()\n",
- p->comm, p->pid);
+ printk_deferred(KERN_WARNING "sched_ext: %s[%d] has zero slice in %s()\n",
+ p->comm, p->pid, __func__);
scx_warned_zero_slice = true;
}
p->scx.slice = SCX_SLICE_DFL;
@@ -3059,11 +3086,6 @@ static s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu,
*found = false;
- if (!static_branch_likely(&scx_builtin_idle_enabled)) {
- scx_ops_error("built-in idle tracking is disabled");
- return prev_cpu;
- }
-
/*
* If WAKE_SYNC, the waker's local DSQ is empty, and the system is
* under utilized, wake up @p to the local DSQ of the waker. Checking
@@ -3128,7 +3150,7 @@ static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flag
if (unlikely(wake_flags & WF_EXEC))
return prev_cpu;
- if (SCX_HAS_OP(select_cpu)) {
+ if (SCX_HAS_OP(select_cpu) && !scx_rq_bypassing(task_rq(p))) {
s32 cpu;
struct task_struct **ddsp_taskp;
@@ -3193,7 +3215,7 @@ void __scx_update_idle(struct rq *rq, bool idle)
{
int cpu = cpu_of(rq);
- if (SCX_HAS_OP(update_idle)) {
+ if (SCX_HAS_OP(update_idle) && !scx_rq_bypassing(rq)) {
SCX_CALL_OP(SCX_KF_REST, update_idle, cpu_of(rq), idle);
if (!static_branch_unlikely(&scx_builtin_idle_enabled))
return;
@@ -4048,7 +4070,6 @@ static void scx_cgroup_exit(void)
percpu_rwsem_assert_held(&scx_cgroup_rwsem);
- WARN_ON_ONCE(!scx_cgroup_enabled);
scx_cgroup_enabled = false;
/*
@@ -4117,6 +4138,7 @@ static int scx_cgroup_init(void)
css->cgroup, &args);
if (ret) {
css_put(css);
+ scx_ops_error("ops.cgroup_init() failed (%d)", ret);
return ret;
}
tg->scx_flags |= SCX_TG_INITED;
@@ -4256,21 +4278,23 @@ bool task_should_scx(struct task_struct *p)
* the DISABLING state and then cycling the queued tasks through dequeue/enqueue
* to force global FIFO scheduling.
*
- * a. ops.enqueue() is ignored and tasks are queued in simple global FIFO order.
- * %SCX_OPS_ENQ_LAST is also ignored.
+ * - ops.select_cpu() is ignored and the default select_cpu() is used.
+ *
+ * - ops.enqueue() is ignored and tasks are queued in simple global FIFO order.
+ * %SCX_OPS_ENQ_LAST is also ignored.
*
- * b. ops.dispatch() is ignored.
+ * - ops.dispatch() is ignored.
*
- * c. balance_scx() does not set %SCX_RQ_BAL_KEEP on non-zero slice as slice
- * can't be trusted. Whenever a tick triggers, the running task is rotated to
- * the tail of the queue with core_sched_at touched.
+ * - balance_scx() does not set %SCX_RQ_BAL_KEEP on non-zero slice as slice
+ * can't be trusted. Whenever a tick triggers, the running task is rotated to
+ * the tail of the queue with core_sched_at touched.
*
- * d. pick_next_task() suppresses zero slice warning.
+ * - pick_next_task() suppresses zero slice warning.
*
- * e. scx_bpf_kick_cpu() is disabled to avoid irq_work malfunction during PM
- * operations.
+ * - scx_bpf_kick_cpu() is disabled to avoid irq_work malfunction during PM
+ * operations.
*
- * f. scx_prio_less() reverts to the default core_sched_at order.
+ * - scx_prio_less() reverts to the default core_sched_at order.
*/
static void scx_ops_bypass(bool bypass)
{
@@ -4340,7 +4364,7 @@ static void scx_ops_bypass(bool bypass)
rq_unlock_irqrestore(rq, &rf);
- /* kick to restore ticks */
+ /* resched to restore ticks and idle state */
resched_cpu(cpu);
}
}
@@ -4462,16 +4486,14 @@ static void scx_ops_disable_workfn(struct kthread_work *work)
scx_ops_init_task_enabled = false;
- spin_lock_irq(&scx_tasks_lock);
- scx_task_iter_init(&sti);
+ scx_task_iter_start(&sti);
while ((p = scx_task_iter_next_locked(&sti))) {
const struct sched_class *old_class = p->sched_class;
struct sched_enq_and_set_ctx ctx;
sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx);
- p->scx.slice = min_t(u64, p->scx.slice, SCX_SLICE_DFL);
- __setscheduler_prio(p, p->prio);
+ p->sched_class = __setscheduler_class(p, p->prio);
check_class_changing(task_rq(p), p, old_class);
sched_enq_and_set_task(&ctx);
@@ -4479,8 +4501,7 @@ static void scx_ops_disable_workfn(struct kthread_work *work)
check_class_changed(task_rq(p), p, old_class, p->prio);
scx_ops_exit_task(p);
}
- scx_task_iter_exit(&sti);
- spin_unlock_irq(&scx_tasks_lock);
+ scx_task_iter_stop(&sti);
percpu_up_write(&scx_fork_rwsem);
/* no task is on scx, turn off all the switches and flush in-progress calls */
@@ -5041,6 +5062,7 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
if (ret) {
ret = ops_sanitize_err("init", ret);
cpus_read_unlock();
+ scx_ops_error("ops.init() failed (%d)", ret);
goto err_disable;
}
}
@@ -5130,8 +5152,7 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
if (ret)
goto err_disable_unlock_all;
- spin_lock_irq(&scx_tasks_lock);
- scx_task_iter_init(&sti);
+ scx_task_iter_start(&sti);
while ((p = scx_task_iter_next_locked(&sti))) {
/*
* @p may already be dead, have lost all its usages counts and
@@ -5141,27 +5162,24 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
if (!tryget_task_struct(p))
continue;
- scx_task_iter_rq_unlock(&sti);
- spin_unlock_irq(&scx_tasks_lock);
+ scx_task_iter_unlock(&sti);
ret = scx_ops_init_task(p, task_group(p), false);
if (ret) {
put_task_struct(p);
- spin_lock_irq(&scx_tasks_lock);
- scx_task_iter_exit(&sti);
- spin_unlock_irq(&scx_tasks_lock);
- pr_err("sched_ext: ops.init_task() failed (%d) for %s[%d] while loading\n",
- ret, p->comm, p->pid);
+ scx_task_iter_relock(&sti);
+ scx_task_iter_stop(&sti);
+ scx_ops_error("ops.init_task() failed (%d) for %s[%d]",
+ ret, p->comm, p->pid);
goto err_disable_unlock_all;
}
scx_set_task_state(p, SCX_TASK_READY);
put_task_struct(p);
- spin_lock_irq(&scx_tasks_lock);
+ scx_task_iter_relock(&sti);
}
- scx_task_iter_exit(&sti);
- spin_unlock_irq(&scx_tasks_lock);
+ scx_task_iter_stop(&sti);
scx_cgroup_unlock();
percpu_up_write(&scx_fork_rwsem);
@@ -5178,35 +5196,28 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
* scx_tasks_lock.
*/
percpu_down_write(&scx_fork_rwsem);
- spin_lock_irq(&scx_tasks_lock);
- scx_task_iter_init(&sti);
+ scx_task_iter_start(&sti);
while ((p = scx_task_iter_next_locked(&sti))) {
const struct sched_class *old_class = p->sched_class;
struct sched_enq_and_set_ctx ctx;
sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx);
- __setscheduler_prio(p, p->prio);
+ p->scx.slice = SCX_SLICE_DFL;
+ p->sched_class = __setscheduler_class(p, p->prio);
check_class_changing(task_rq(p), p, old_class);
sched_enq_and_set_task(&ctx);
check_class_changed(task_rq(p), p, old_class, p->prio);
}
- scx_task_iter_exit(&sti);
- spin_unlock_irq(&scx_tasks_lock);
+ scx_task_iter_stop(&sti);
percpu_up_write(&scx_fork_rwsem);
scx_ops_bypass(false);
- /*
- * Returning an error code here would lose the recorded error
- * information. Exit indicating success so that the error is notified
- * through ops.exit() with all the details.
- */
if (!scx_ops_tryset_enable_state(SCX_OPS_ENABLED, SCX_OPS_ENABLING)) {
WARN_ON_ONCE(atomic_read(&scx_exit_kind) == SCX_EXIT_NONE);
- ret = 0;
goto err_disable;
}
@@ -5241,10 +5252,18 @@ err_disable_unlock_all:
scx_ops_bypass(false);
err_disable:
mutex_unlock(&scx_ops_enable_mutex);
- /* must be fully disabled before returning */
- scx_ops_disable(SCX_EXIT_ERROR);
+ /*
+ * Returning an error code here would not pass all the error information
+ * to userspace. Record errno using scx_ops_error() for cases
+ * scx_ops_error() wasn't already invoked and exit indicating success so
+ * that the error is notified through ops.exit() with all the details.
+ *
+ * Flush scx_ops_disable_work to ensure that error is reported before
+ * init completion.
+ */
+ scx_ops_error("scx_ops_enable() failed (%d)", ret);
kthread_flush_work(&scx_ops_disable_work);
- return ret;
+ return 0;
}
@@ -5864,16 +5883,21 @@ __bpf_kfunc_start_defs();
__bpf_kfunc s32 scx_bpf_select_cpu_dfl(struct task_struct *p, s32 prev_cpu,
u64 wake_flags, bool *is_idle)
{
- if (!scx_kf_allowed(SCX_KF_SELECT_CPU)) {
- *is_idle = false;
- return prev_cpu;
+ if (!static_branch_likely(&scx_builtin_idle_enabled)) {
+ scx_ops_error("built-in idle tracking is disabled");
+ goto prev_cpu;
}
+
+ if (!scx_kf_allowed(SCX_KF_SELECT_CPU))
+ goto prev_cpu;
+
#ifdef CONFIG_SMP
return scx_select_cpu_dfl(p, prev_cpu, wake_flags, is_idle);
-#else
+#endif
+
+prev_cpu:
*is_idle = false;
return prev_cpu;
-#endif
}
__bpf_kfunc_end_defs();
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 225b31aaee55..c157d4860a3b 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1247,7 +1247,7 @@ static void update_curr(struct cfs_rq *cfs_rq)
account_cfs_rq_runtime(cfs_rq, delta_exec);
- if (rq->nr_running == 1)
+ if (cfs_rq->nr_running == 1)
return;
if (resched || did_preempt_short(cfs_rq, curr)) {
@@ -6058,10 +6058,13 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
for_each_sched_entity(se) {
struct cfs_rq *qcfs_rq = cfs_rq_of(se);
- if (se->on_rq) {
- SCHED_WARN_ON(se->sched_delayed);
+ /* Handle any unfinished DELAY_DEQUEUE business first. */
+ if (se->sched_delayed) {
+ int flags = DEQUEUE_SLEEP | DEQUEUE_DELAYED;
+
+ dequeue_entity(qcfs_rq, se, flags);
+ } else if (se->on_rq)
break;
- }
enqueue_entity(qcfs_rq, se, ENQUEUE_WAKEUP);
if (cfs_rq_is_idle(group_cfs_rq(se)))
@@ -13174,22 +13177,6 @@ static void attach_task_cfs_rq(struct task_struct *p)
static void switched_from_fair(struct rq *rq, struct task_struct *p)
{
detach_task_cfs_rq(p);
- /*
- * Since this is called after changing class, this is a little weird
- * and we cannot use DEQUEUE_DELAYED.
- */
- if (p->se.sched_delayed) {
- /* First, dequeue it from its new class' structures */
- dequeue_task(rq, p, DEQUEUE_NOCLOCK | DEQUEUE_SLEEP);
- /*
- * Now, clean up the fair_sched_class side of things
- * related to sched_delayed being true and that wasn't done
- * due to the generic dequeue not using DEQUEUE_DELAYED.
- */
- finish_delayed_dequeue_entity(&p->se);
- p->se.rel_deadline = 0;
- __block_task(rq, p);
- }
}
static void switched_to_fair(struct rq *rq, struct task_struct *p)
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index b1c3588a8f00..081519ffab46 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2292,6 +2292,7 @@ static inline int task_on_rq_migrating(struct task_struct *p)
#define WF_SYNC 0x10 /* Waker goes to sleep after wakeup */
#define WF_MIGRATED 0x20 /* Internal use, task got migrated */
#define WF_CURRENT_CPU 0x40 /* Prefer to move the wakee to the current CPU. */
+#define WF_RQ_SELECTED 0x80 /* ->select_task_rq() was called */
#ifdef CONFIG_SMP
static_assert(WF_EXEC == SD_BALANCE_EXEC);
@@ -2334,6 +2335,7 @@ extern const u32 sched_prio_to_wmult[40];
* ENQUEUE_HEAD - place at front of runqueue (tail if not specified)
* ENQUEUE_REPLENISH - CBS (replenish runtime and postpone deadline)
* ENQUEUE_MIGRATED - the task was migrated during wakeup
+ * ENQUEUE_RQ_SELECTED - ->select_task_rq() was called
*
*/
@@ -2360,6 +2362,7 @@ extern const u32 sched_prio_to_wmult[40];
#define ENQUEUE_INITIAL 0x80
#define ENQUEUE_MIGRATING 0x100
#define ENQUEUE_DELAYED 0x200
+#define ENQUEUE_RQ_SELECTED 0x400
#define RETRY_TASK ((void *)-1UL)
@@ -3797,7 +3800,7 @@ static inline int rt_effective_prio(struct task_struct *p, int prio)
extern int __sched_setscheduler(struct task_struct *p, const struct sched_attr *attr, bool user, bool pi);
extern int __sched_setaffinity(struct task_struct *p, struct affinity_context *ctx);
-extern void __setscheduler_prio(struct task_struct *p, int prio);
+extern const struct sched_class *__setscheduler_class(struct task_struct *p, int prio);
extern void set_load_weight(struct task_struct *p, bool update_load);
extern void enqueue_task(struct rq *rq, struct task_struct *p, int flags);
extern bool dequeue_task(struct rq *rq, struct task_struct *p, int flags);
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index 237780aa3c53..767e098a3bd1 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -119,45 +119,63 @@ static inline void psi_account_irqtime(struct rq *rq, struct task_struct *curr,
/*
* PSI tracks state that persists across sleeps, such as iowaits and
* memory stalls. As a result, it has to distinguish between sleeps,
- * where a task's runnable state changes, and requeues, where a task
- * and its state are being moved between CPUs and runqueues.
+ * where a task's runnable state changes, and migrations, where a task
+ * and its runnable state are being moved between CPUs and runqueues.
+ *
+ * A notable case is a task whose dequeue is delayed. PSI considers
+ * those sleeping, but because they are still on the runqueue they can
+ * go through migration requeues. In this case, *sleeping* states need
+ * to be transferred.
*/
-static inline void psi_enqueue(struct task_struct *p, bool wakeup)
+static inline void psi_enqueue(struct task_struct *p, bool migrate)
{
- int clear = 0, set = TSK_RUNNING;
+ int clear = 0, set = 0;
if (static_branch_likely(&psi_disabled))
return;
- if (p->in_memstall)
- set |= TSK_MEMSTALL_RUNNING;
-
- if (!wakeup) {
+ if (p->se.sched_delayed) {
+ /* CPU migration of "sleeping" task */
+ SCHED_WARN_ON(!migrate);
if (p->in_memstall)
set |= TSK_MEMSTALL;
+ if (p->in_iowait)
+ set |= TSK_IOWAIT;
+ } else if (migrate) {
+ /* CPU migration of runnable task */
+ set = TSK_RUNNING;
+ if (p->in_memstall)
+ set |= TSK_MEMSTALL | TSK_MEMSTALL_RUNNING;
} else {
+ /* Wakeup of new or sleeping task */
if (p->in_iowait)
clear |= TSK_IOWAIT;
+ set = TSK_RUNNING;
+ if (p->in_memstall)
+ set |= TSK_MEMSTALL_RUNNING;
}
psi_task_change(p, clear, set);
}
-static inline void psi_dequeue(struct task_struct *p, bool sleep)
+static inline void psi_dequeue(struct task_struct *p, bool migrate)
{
if (static_branch_likely(&psi_disabled))
return;
/*
+ * When migrating a task to another CPU, clear all psi
+ * state. The enqueue callback above will work it out.
+ */
+ if (migrate)
+ psi_task_change(p, p->psi_flags, 0);
+
+ /*
* A voluntary sleep is a dequeue followed by a task switch. To
* avoid walking all ancestors twice, psi_task_switch() handles
* TSK_RUNNING and TSK_IOWAIT for us when it moves TSK_ONCPU.
* Do nothing here.
*/
- if (sleep)
- return;
-
- psi_task_change(p, p->psi_flags, 0);
}
static inline void psi_ttwu_dequeue(struct task_struct *p)
@@ -190,8 +208,8 @@ static inline void psi_sched_switch(struct task_struct *prev,
}
#else /* CONFIG_PSI */
-static inline void psi_enqueue(struct task_struct *p, bool wakeup) {}
-static inline void psi_dequeue(struct task_struct *p, bool sleep) {}
+static inline void psi_enqueue(struct task_struct *p, bool migrate) {}
+static inline void psi_dequeue(struct task_struct *p, bool migrate) {}
static inline void psi_ttwu_dequeue(struct task_struct *p) {}
static inline void psi_sched_switch(struct task_struct *prev,
struct task_struct *next,
diff --git a/kernel/sched/syscalls.c b/kernel/sched/syscalls.c
index aa70beee9895..0470bcc3d204 100644
--- a/kernel/sched/syscalls.c
+++ b/kernel/sched/syscalls.c
@@ -529,7 +529,7 @@ int __sched_setscheduler(struct task_struct *p,
{
int oldpolicy = -1, policy = attr->sched_policy;
int retval, oldprio, newprio, queued, running;
- const struct sched_class *prev_class;
+ const struct sched_class *prev_class, *next_class;
struct balance_callback *head;
struct rq_flags rf;
int reset_on_fork;
@@ -706,6 +706,12 @@ change:
queue_flags &= ~DEQUEUE_MOVE;
}
+ prev_class = p->sched_class;
+ next_class = __setscheduler_class(p, newprio);
+
+ if (prev_class != next_class && p->se.sched_delayed)
+ dequeue_task(rq, p, DEQUEUE_SLEEP | DEQUEUE_DELAYED | DEQUEUE_NOCLOCK);
+
queued = task_on_rq_queued(p);
running = task_current(rq, p);
if (queued)
@@ -713,11 +719,10 @@ change:
if (running)
put_prev_task(rq, p);
- prev_class = p->sched_class;
-
if (!(attr->sched_flags & SCHED_FLAG_KEEP_PARAMS)) {
__setscheduler_params(p, attr);
- __setscheduler_prio(p, newprio);
+ p->sched_class = next_class;
+ p->prio = newprio;
}
__setscheduler_uclamp(p, attr);
check_class_changing(rq, p, prev_class);