aboutsummaryrefslogtreecommitdiff
path: root/kernel/sched/core.c
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/sched/core.c')
-rw-r--r--kernel/sched/core.c1239
1 files changed, 602 insertions, 637 deletions
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index c52c2eba7c73..a708d225c28e 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -85,7 +85,6 @@
#include "sched.h"
#include "stats.h"
-#include "autogroup.h"
#include "autogroup.h"
#include "pelt.h"
@@ -114,6 +113,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(sched_overutilized_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_cfs_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_se_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(sched_update_nr_running_tp);
+EXPORT_TRACEPOINT_SYMBOL_GPL(sched_compute_energy_tp);
DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
@@ -919,14 +919,13 @@ static bool set_nr_if_polling(struct task_struct *p)
struct thread_info *ti = task_thread_info(p);
typeof(ti->flags) val = READ_ONCE(ti->flags);
- for (;;) {
+ do {
if (!(val & _TIF_POLLING_NRFLAG))
return false;
if (val & _TIF_NEED_RESCHED)
return true;
- if (try_cmpxchg(&ti->flags, &val, val | _TIF_NEED_RESCHED))
- break;
- }
+ } while (!try_cmpxchg(&ti->flags, &val, val | _TIF_NEED_RESCHED));
+
return true;
}
@@ -1097,25 +1096,22 @@ int get_nohz_timer_target(void)
hk_mask = housekeeping_cpumask(HK_TYPE_TIMER);
- rcu_read_lock();
+ guard(rcu)();
+
for_each_domain(cpu, sd) {
for_each_cpu_and(i, sched_domain_span(sd), hk_mask) {
if (cpu == i)
continue;
- if (!idle_cpu(i)) {
- cpu = i;
- goto unlock;
- }
+ if (!idle_cpu(i))
+ return i;
}
}
if (default_cpu == -1)
default_cpu = housekeeping_any_cpu(HK_TYPE_TIMER);
- cpu = default_cpu;
-unlock:
- rcu_read_unlock();
- return cpu;
+
+ return default_cpu;
}
/*
@@ -1194,6 +1190,20 @@ static void nohz_csd_func(void *info)
#endif /* CONFIG_NO_HZ_COMMON */
#ifdef CONFIG_NO_HZ_FULL
+static inline bool __need_bw_check(struct rq *rq, struct task_struct *p)
+{
+ if (rq->nr_running != 1)
+ return false;
+
+ if (p->sched_class != &fair_sched_class)
+ return false;
+
+ if (!task_on_rq_queued(p))
+ return false;
+
+ return true;
+}
+
bool sched_can_stop_tick(struct rq *rq)
{
int fifo_nr_running;
@@ -1229,6 +1239,18 @@ bool sched_can_stop_tick(struct rq *rq)
if (rq->nr_running > 1)
return false;
+ /*
+ * If there is one task and it has CFS runtime bandwidth constraints
+ * and it's on the cpu now we don't want to stop the tick.
+ * This check prevents clearing the bit if a newly enqueued task here is
+ * dequeued by migrating while the constrained task continues to run.
+ * E.g. going from 2->1 without going through pick_next_task().
+ */
+ if (sched_feat(HZ_BW) && __need_bw_check(rq, rq->curr)) {
+ if (cfs_task_bw_constrained(rq->curr))
+ return false;
+ }
+
return true;
}
#endif /* CONFIG_NO_HZ_FULL */
@@ -1457,16 +1479,12 @@ static void __uclamp_update_util_min_rt_default(struct task_struct *p)
static void uclamp_update_util_min_rt_default(struct task_struct *p)
{
- struct rq_flags rf;
- struct rq *rq;
-
if (!rt_task(p))
return;
/* Protect updates to p->uclamp_* */
- rq = task_rq_lock(p, &rf);
+ guard(task_rq_lock)(p);
__uclamp_update_util_min_rt_default(p);
- task_rq_unlock(rq, p, &rf);
}
static inline struct uclamp_se
@@ -1762,9 +1780,8 @@ static void uclamp_update_root_tg(void)
uclamp_se_set(&tg->uclamp_req[UCLAMP_MAX],
sysctl_sched_uclamp_util_max, false);
- rcu_read_lock();
+ guard(rcu)();
cpu_util_update_eff(&root_task_group.css);
- rcu_read_unlock();
}
#else
static void uclamp_update_root_tg(void) { }
@@ -1791,10 +1808,9 @@ static void uclamp_sync_util_min_rt_default(void)
smp_mb__after_spinlock();
read_unlock(&tasklist_lock);
- rcu_read_lock();
+ guard(rcu)();
for_each_process_thread(g, p)
uclamp_update_util_min_rt_default(p);
- rcu_read_unlock();
}
static int sysctl_sched_uclamp_handler(struct ctl_table *table, int write,
@@ -1804,7 +1820,8 @@ static int sysctl_sched_uclamp_handler(struct ctl_table *table, int write,
int old_min, old_max, old_min_rt;
int result;
- mutex_lock(&uclamp_mutex);
+ guard(mutex)(&uclamp_mutex);
+
old_min = sysctl_sched_uclamp_util_min;
old_max = sysctl_sched_uclamp_util_max;
old_min_rt = sysctl_sched_uclamp_util_min_rt_default;
@@ -1813,7 +1830,7 @@ static int sysctl_sched_uclamp_handler(struct ctl_table *table, int write,
if (result)
goto undo;
if (!write)
- goto done;
+ return 0;
if (sysctl_sched_uclamp_util_min > sysctl_sched_uclamp_util_max ||
sysctl_sched_uclamp_util_max > SCHED_CAPACITY_SCALE ||
@@ -1849,16 +1866,12 @@ static int sysctl_sched_uclamp_handler(struct ctl_table *table, int write,
* Otherwise, keep it simple and do just a lazy update at each next
* task enqueue time.
*/
-
- goto done;
+ return 0;
undo:
sysctl_sched_uclamp_util_min = old_min;
sysctl_sched_uclamp_util_max = old_max;
sysctl_sched_uclamp_util_min_rt_default = old_min_rt;
-done:
- mutex_unlock(&uclamp_mutex);
-
return result;
}
#endif
@@ -2198,10 +2211,10 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p,
p->sched_class->prio_changed(rq, p, oldprio);
}
-void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
+void wakeup_preempt(struct rq *rq, struct task_struct *p, int flags)
{
if (p->sched_class == rq->curr->sched_class)
- rq->curr->sched_class->check_preempt_curr(rq, p, flags);
+ rq->curr->sched_class->wakeup_preempt(rq, p, flags);
else if (sched_class_above(p->sched_class, rq->curr->sched_class))
resched_curr(rq);
@@ -2219,31 +2232,21 @@ int __task_state_match(struct task_struct *p, unsigned int state)
if (READ_ONCE(p->__state) & state)
return 1;
-#ifdef CONFIG_PREEMPT_RT
if (READ_ONCE(p->saved_state) & state)
return -1;
-#endif
+
return 0;
}
static __always_inline
int task_state_match(struct task_struct *p, unsigned int state)
{
-#ifdef CONFIG_PREEMPT_RT
- int match;
-
/*
- * Serialize against current_save_and_set_rtlock_wait_state() and
- * current_restore_rtlock_saved_state().
+ * Serialize against current_save_and_set_rtlock_wait_state(),
+ * current_restore_rtlock_saved_state(), and __refrigerator().
*/
- raw_spin_lock_irq(&p->pi_lock);
- match = __task_state_match(p, state);
- raw_spin_unlock_irq(&p->pi_lock);
-
- return match;
-#else
+ guard(raw_spinlock_irq)(&p->pi_lock);
return __task_state_match(p, state);
-#endif
}
/*
@@ -2397,10 +2400,9 @@ void migrate_disable(void)
return;
}
- preempt_disable();
+ guard(preempt)();
this_rq()->nr_pinned++;
p->migration_disabled = 1;
- preempt_enable();
}
EXPORT_SYMBOL_GPL(migrate_disable);
@@ -2424,7 +2426,7 @@ void migrate_enable(void)
* Ensure stop_task runs either before or after this, and that
* __set_cpus_allowed_ptr(SCA_MIGRATE_ENABLE) doesn't schedule().
*/
- preempt_disable();
+ guard(preempt)();
if (p->cpus_ptr != &p->cpus_mask)
__set_cpus_allowed_ptr(p, &ac);
/*
@@ -2435,7 +2437,6 @@ void migrate_enable(void)
barrier();
p->migration_disabled = 0;
this_rq()->nr_pinned--;
- preempt_enable();
}
EXPORT_SYMBOL_GPL(migrate_enable);
@@ -2507,7 +2508,7 @@ static struct rq *move_queued_task(struct rq *rq, struct rq_flags *rf,
rq_lock(rq, rf);
WARN_ON_ONCE(task_cpu(p) != new_cpu);
activate_task(rq, p, 0);
- check_preempt_curr(rq, p, 0);
+ wakeup_preempt(rq, p, 0);
return rq;
}
@@ -2644,9 +2645,11 @@ static int migration_cpu_stop(void *data)
* it.
*/
WARN_ON_ONCE(!pending->stop_pending);
+ preempt_disable();
task_rq_unlock(rq, p, &rf);
stop_one_cpu_nowait(task_cpu(p), migration_cpu_stop,
&pending->arg, &pending->stop_work);
+ preempt_enable();
return 0;
}
out:
@@ -2966,12 +2969,13 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag
complete = true;
}
+ preempt_disable();
task_rq_unlock(rq, p, rf);
-
if (push_task) {
stop_one_cpu_nowait(rq->cpu, push_cpu_stop,
p, &rq->push_work);
}
+ preempt_enable();
if (complete)
complete_all(&pending->done);
@@ -3037,12 +3041,13 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag
if (flags & SCA_MIGRATE_ENABLE)
p->migration_flags &= ~MDF_PUSH;
+ preempt_disable();
task_rq_unlock(rq, p, rf);
-
if (!stop_pending) {
stop_one_cpu_nowait(cpu_of(rq), migration_cpu_stop,
&pending->arg, &pending->stop_work);
}
+ preempt_enable();
if (flags & SCA_MIGRATE_ENABLE)
return 0;
@@ -3389,7 +3394,7 @@ static void __migrate_swap_task(struct task_struct *p, int cpu)
deactivate_task(src_rq, p, 0);
set_task_cpu(p, cpu);
activate_task(dst_rq, p, 0);
- check_preempt_curr(dst_rq, p, 0);
+ wakeup_preempt(dst_rq, p, 0);
rq_unpin_lock(dst_rq, &drf);
rq_unpin_lock(src_rq, &srf);
@@ -3413,7 +3418,6 @@ static int migrate_swap_stop(void *data)
{
struct migration_swap_arg *arg = data;
struct rq *src_rq, *dst_rq;
- int ret = -EAGAIN;
if (!cpu_active(arg->src_cpu) || !cpu_active(arg->dst_cpu))
return -EAGAIN;
@@ -3421,33 +3425,25 @@ static int migrate_swap_stop(void *data)
src_rq = cpu_rq(arg->src_cpu);
dst_rq = cpu_rq(arg->dst_cpu);
- double_raw_lock(&arg->src_task->pi_lock,
- &arg->dst_task->pi_lock);
- double_rq_lock(src_rq, dst_rq);
+ guard(double_raw_spinlock)(&arg->src_task->pi_lock, &arg->dst_task->pi_lock);
+ guard(double_rq_lock)(src_rq, dst_rq);
if (task_cpu(arg->dst_task) != arg->dst_cpu)
- goto unlock;
+ return -EAGAIN;
if (task_cpu(arg->src_task) != arg->src_cpu)
- goto unlock;
+ return -EAGAIN;
if (!cpumask_test_cpu(arg->dst_cpu, arg->src_task->cpus_ptr))
- goto unlock;
+ return -EAGAIN;
if (!cpumask_test_cpu(arg->src_cpu, arg->dst_task->cpus_ptr))
- goto unlock;
+ return -EAGAIN;
__migrate_swap_task(arg->src_task, arg->dst_cpu);
__migrate_swap_task(arg->dst_task, arg->src_cpu);
- ret = 0;
-
-unlock:
- double_rq_unlock(src_rq, dst_rq);
- raw_spin_unlock(&arg->dst_task->pi_lock);
- raw_spin_unlock(&arg->src_task->pi_lock);
-
- return ret;
+ return 0;
}
/*
@@ -3505,13 +3501,11 @@ out:
*/
void kick_process(struct task_struct *p)
{
- int cpu;
+ guard(preempt)();
+ int cpu = task_cpu(p);
- preempt_disable();
- cpu = task_cpu(p);
if ((cpu != smp_processor_id()) && task_curr(p))
smp_send_reschedule(cpu);
- preempt_enable();
}
EXPORT_SYMBOL_GPL(kick_process);
@@ -3722,14 +3716,14 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
struct sched_domain *sd;
__schedstat_inc(p->stats.nr_wakeups_remote);
- rcu_read_lock();
+
+ guard(rcu)();
for_each_domain(rq->cpu, sd) {
if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
__schedstat_inc(sd->ttwu_wake_remote);
break;
}
}
- rcu_read_unlock();
}
if (wake_flags & WF_MIGRATED)
@@ -3774,7 +3768,7 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags,
}
activate_task(rq, p, en_flags);
- check_preempt_curr(rq, p, wake_flags);
+ wakeup_preempt(rq, p, wake_flags);
ttwu_do_wakeup(p);
@@ -3798,9 +3792,6 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags,
if (rq->avg_idle > max)
rq->avg_idle = max;
- rq->wake_stamp = jiffies;
- rq->wake_avg_idle = rq->avg_idle / 2;
-
rq->idle_stamp = 0;
}
#endif
@@ -3845,7 +3836,7 @@ static int ttwu_runnable(struct task_struct *p, int wake_flags)
* it should preempt the task that is current now.
*/
update_rq_clock(rq);
- check_preempt_curr(rq, p, wake_flags);
+ wakeup_preempt(rq, p, wake_flags);
}
ttwu_do_wakeup(p);
ret = 1;
@@ -3928,21 +3919,13 @@ static void __ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags
void wake_up_if_idle(int cpu)
{
struct rq *rq = cpu_rq(cpu);
- struct rq_flags rf;
-
- rcu_read_lock();
-
- if (!is_idle_task(rcu_dereference(rq->curr)))
- goto out;
- rq_lock_irqsave(rq, &rf);
- if (is_idle_task(rq->curr))
- resched_curr(rq);
- /* Else CPU is not idle, do nothing here: */
- rq_unlock_irqrestore(rq, &rf);
-
-out:
- rcu_read_unlock();
+ guard(rcu)();
+ if (is_idle_task(rcu_dereference(rq->curr))) {
+ guard(rq_lock_irqsave)(rq);
+ if (is_idle_task(rq->curr))
+ resched_curr(rq);
+ }
}
bool cpus_share_cache(int this_cpu, int that_cpu)
@@ -3953,6 +3936,18 @@ bool cpus_share_cache(int this_cpu, int that_cpu)
return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);
}
+/*
+ * Whether CPUs are share cache resources, which means LLC on non-cluster
+ * machines and LLC tag or L2 on machines with clusters.
+ */
+bool cpus_share_resources(int this_cpu, int that_cpu)
+{
+ if (this_cpu == that_cpu)
+ return true;
+
+ return per_cpu(sd_share_id, this_cpu) == per_cpu(sd_share_id, that_cpu);
+}
+
static inline bool ttwu_queue_cond(struct task_struct *p, int cpu)
{
/*
@@ -4033,13 +4028,17 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
* The caller holds p::pi_lock if p != current or has preemption
* disabled when p == current.
*
- * The rules of PREEMPT_RT saved_state:
+ * The rules of saved_state:
*
* The related locking code always holds p::pi_lock when updating
* p::saved_state, which means the code is fully serialized in both cases.
*
- * The lock wait and lock wakeups happen via TASK_RTLOCK_WAIT. No other
- * bits set. This allows to distinguish all wakeup scenarios.
+ * For PREEMPT_RT, the lock wait and lock wakeups happen via TASK_RTLOCK_WAIT.
+ * No other bits set. This allows to distinguish all wakeup scenarios.
+ *
+ * For FREEZER, the wakeup happens via TASK_FROZEN. No other bits set. This
+ * allows us to prevent early wakeup of tasks before they can be run on
+ * asymmetric ISA architectures (eg ARMv9).
*/
static __always_inline
bool ttwu_state_match(struct task_struct *p, unsigned int state, int *success)
@@ -4053,13 +4052,13 @@ bool ttwu_state_match(struct task_struct *p, unsigned int state, int *success)
*success = !!(match = __task_state_match(p, state));
-#ifdef CONFIG_PREEMPT_RT
/*
* Saved state preserves the task state across blocking on
- * an RT lock. If the state matches, set p::saved_state to
- * TASK_RUNNING, but do not wake the task because it waits
- * for a lock wakeup. Also indicate success because from
- * the regular waker's point of view this has succeeded.
+ * an RT lock or TASK_FREEZABLE tasks. If the state matches,
+ * set p::saved_state to TASK_RUNNING, but do not wake the task
+ * because it waits for a lock wakeup or __thaw_task(). Also
+ * indicate success because from the regular waker's point of
+ * view this has succeeded.
*
* After acquiring the lock the task will restore p::__state
* from p::saved_state which ensures that the regular
@@ -4069,7 +4068,7 @@ bool ttwu_state_match(struct task_struct *p, unsigned int state, int *success)
*/
if (match < 0)
p->saved_state = TASK_RUNNING;
-#endif
+
return match > 0;
}
@@ -4193,13 +4192,11 @@ bool ttwu_state_match(struct task_struct *p, unsigned int state, int *success)
* Return: %true if @p->state changes (an actual wakeup was done),
* %false otherwise.
*/
-static int
-try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
+int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
{
- unsigned long flags;
+ guard(preempt)();
int cpu, success = 0;
- preempt_disable();
if (p == current) {
/*
* We're waking current, this means 'p->on_rq' and 'task_cpu(p)
@@ -4226,129 +4223,127 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
* reordered with p->state check below. This pairs with smp_store_mb()
* in set_current_state() that the waiting thread does.
*/
- raw_spin_lock_irqsave(&p->pi_lock, flags);
- smp_mb__after_spinlock();
- if (!ttwu_state_match(p, state, &success))
- goto unlock;
+ scoped_guard (raw_spinlock_irqsave, &p->pi_lock) {
+ smp_mb__after_spinlock();
+ if (!ttwu_state_match(p, state, &success))
+ break;
- trace_sched_waking(p);
+ trace_sched_waking(p);
- /*
- * Ensure we load p->on_rq _after_ p->state, otherwise it would
- * be possible to, falsely, observe p->on_rq == 0 and get stuck
- * in smp_cond_load_acquire() below.
- *
- * sched_ttwu_pending() try_to_wake_up()
- * STORE p->on_rq = 1 LOAD p->state
- * UNLOCK rq->lock
- *
- * __schedule() (switch to task 'p')
- * LOCK rq->lock smp_rmb();
- * smp_mb__after_spinlock();
- * UNLOCK rq->lock
- *
- * [task p]
- * STORE p->state = UNINTERRUPTIBLE LOAD p->on_rq
- *
- * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in
- * __schedule(). See the comment for smp_mb__after_spinlock().
- *
- * A similar smb_rmb() lives in try_invoke_on_locked_down_task().
- */
- smp_rmb();
- if (READ_ONCE(p->on_rq) && ttwu_runnable(p, wake_flags))
- goto unlock;
+ /*
+ * Ensure we load p->on_rq _after_ p->state, otherwise it would
+ * be possible to, falsely, observe p->on_rq == 0 and get stuck
+ * in smp_cond_load_acquire() below.
+ *
+ * sched_ttwu_pending() try_to_wake_up()
+ * STORE p->on_rq = 1 LOAD p->state
+ * UNLOCK rq->lock
+ *
+ * __schedule() (switch to task 'p')
+ * LOCK rq->lock smp_rmb();
+ * smp_mb__after_spinlock();
+ * UNLOCK rq->lock
+ *
+ * [task p]
+ * STORE p->state = UNINTERRUPTIBLE LOAD p->on_rq
+ *
+ * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in
+ * __schedule(). See the comment for smp_mb__after_spinlock().
+ *
+ * A similar smp_rmb() lives in __task_needs_rq_lock().
+ */
+ smp_rmb();
+ if (READ_ONCE(p->on_rq) && ttwu_runnable(p, wake_flags))
+ break;
#ifdef CONFIG_SMP
- /*
- * Ensure we load p->on_cpu _after_ p->on_rq, otherwise it would be
- * possible to, falsely, observe p->on_cpu == 0.
- *
- * One must be running (->on_cpu == 1) in order to remove oneself
- * from the runqueue.
- *
- * __schedule() (switch to task 'p') try_to_wake_up()
- * STORE p->on_cpu = 1 LOAD p->on_rq
- * UNLOCK rq->lock
- *
- * __schedule() (put 'p' to sleep)
- * LOCK rq->lock smp_rmb();
- * smp_mb__after_spinlock();
- * STORE p->on_rq = 0 LOAD p->on_cpu
- *
- * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in
- * __schedule(). See the comment for smp_mb__after_spinlock().
- *
- * Form a control-dep-acquire with p->on_rq == 0 above, to ensure
- * schedule()'s deactivate_task() has 'happened' and p will no longer
- * care about it's own p->state. See the comment in __schedule().
- */
- smp_acquire__after_ctrl_dep();
+ /*
+ * Ensure we load p->on_cpu _after_ p->on_rq, otherwise it would be
+ * possible to, falsely, observe p->on_cpu == 0.
+ *
+ * One must be running (->on_cpu == 1) in order to remove oneself
+ * from the runqueue.
+ *
+ * __schedule() (switch to task 'p') try_to_wake_up()
+ * STORE p->on_cpu = 1 LOAD p->on_rq
+ * UNLOCK rq->lock
+ *
+ * __schedule() (put 'p' to sleep)
+ * LOCK rq->lock smp_rmb();
+ * smp_mb__after_spinlock();
+ * STORE p->on_rq = 0 LOAD p->on_cpu
+ *
+ * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in
+ * __schedule(). See the comment for smp_mb__after_spinlock().
+ *
+ * Form a control-dep-acquire with p->on_rq == 0 above, to ensure
+ * schedule()'s deactivate_task() has 'happened' and p will no longer
+ * care about it's own p->state. See the comment in __schedule().
+ */
+ smp_acquire__after_ctrl_dep();
- /*
- * We're doing the wakeup (@success == 1), they did a dequeue (p->on_rq
- * == 0), which means we need to do an enqueue, change p->state to
- * TASK_WAKING such that we can unlock p->pi_lock before doing the
- * enqueue, such as ttwu_queue_wakelist().
- */
- WRITE_ONCE(p->__state, TASK_WAKING);
+ /*
+ * We're doing the wakeup (@success == 1), they did a dequeue (p->on_rq
+ * == 0), which means we need to do an enqueue, change p->state to
+ * TASK_WAKING such that we can unlock p->pi_lock before doing the
+ * enqueue, such as ttwu_queue_wakelist().
+ */
+ WRITE_ONCE(p->__state, TASK_WAKING);
- /*
- * If the owning (remote) CPU is still in the middle of schedule() with
- * this task as prev, considering queueing p on the remote CPUs wake_list
- * which potentially sends an IPI instead of spinning on p->on_cpu to
- * let the waker make forward progress. This is safe because IRQs are
- * disabled and the IPI will deliver after on_cpu is cleared.
- *
- * Ensure we load task_cpu(p) after p->on_cpu:
- *
- * set_task_cpu(p, cpu);
- * STORE p->cpu = @cpu
- * __schedule() (switch to task 'p')
- * LOCK rq->lock
- * smp_mb__after_spin_lock() smp_cond_load_acquire(&p->on_cpu)
- * STORE p->on_cpu = 1 LOAD p->cpu
- *
- * to ensure we observe the correct CPU on which the task is currently
- * scheduling.
- */
- if (smp_load_acquire(&p->on_cpu) &&
- ttwu_queue_wakelist(p, task_cpu(p), wake_flags))
- goto unlock;
+ /*
+ * If the owning (remote) CPU is still in the middle of schedule() with
+ * this task as prev, considering queueing p on the remote CPUs wake_list
+ * which potentially sends an IPI instead of spinning on p->on_cpu to
+ * let the waker make forward progress. This is safe because IRQs are
+ * disabled and the IPI will deliver after on_cpu is cleared.
+ *
+ * Ensure we load task_cpu(p) after p->on_cpu:
+ *
+ * set_task_cpu(p, cpu);
+ * STORE p->cpu = @cpu
+ * __schedule() (switch to task 'p')
+ * LOCK rq->lock
+ * smp_mb__after_spin_lock() smp_cond_load_acquire(&p->on_cpu)
+ * STORE p->on_cpu = 1 LOAD p->cpu
+ *
+ * to ensure we observe the correct CPU on which the task is currently
+ * scheduling.
+ */
+ if (smp_load_acquire(&p->on_cpu) &&
+ ttwu_queue_wakelist(p, task_cpu(p), wake_flags))
+ break;
- /*
- * If the owning (remote) CPU is still in the middle of schedule() with
- * this task as prev, wait until it's done referencing the task.
- *
- * Pairs with the smp_store_release() in finish_task().
- *
- * This ensures that tasks getting woken will be fully ordered against
- * their previous state and preserve Program Order.
- */
- smp_cond_load_acquire(&p->on_cpu, !VAL);
+ /*
+ * If the owning (remote) CPU is still in the middle of schedule() with
+ * this task as prev, wait until it's done referencing the task.
+ *
+ * Pairs with the smp_store_release() in finish_task().
+ *
+ * This ensures that tasks getting woken will be fully ordered against
+ * their previous state and preserve Program Order.
+ */
+ smp_cond_load_acquire(&p->on_cpu, !VAL);
- cpu = select_task_rq(p, p->wake_cpu, wake_flags | WF_TTWU);
- if (task_cpu(p) != cpu) {
- if (p->in_iowait) {
- delayacct_blkio_end(p);
- atomic_dec(&task_rq(p)->nr_iowait);
- }
+ cpu = select_task_rq(p, p->wake_cpu, wake_flags | WF_TTWU);
+ if (task_cpu(p) != cpu) {
+ if (p->in_iowait) {
+ delayacct_blkio_end(p);
+ atomic_dec(&task_rq(p)->nr_iowait);
+ }
- wake_flags |= WF_MIGRATED;
- psi_ttwu_dequeue(p);
- set_task_cpu(p, cpu);
- }
+ wake_flags |= WF_MIGRATED;
+ psi_ttwu_dequeue(p);
+ set_task_cpu(p, cpu);
+ }
#else
- cpu = task_cpu(p);
+ cpu = task_cpu(p);
#endif /* CONFIG_SMP */
- ttwu_queue(p, cpu, wake_flags);
-unlock:
- raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+ ttwu_queue(p, cpu, wake_flags);
+ }
out:
if (success)
ttwu_stat(p, task_cpu(p), wake_flags);
- preempt_enable();
return success;
}
@@ -4501,6 +4496,8 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
p->se.prev_sum_exec_runtime = 0;
p->se.nr_migrations = 0;
p->se.vruntime = 0;
+ p->se.vlag = 0;
+ p->se.slice = sysctl_sched_base_slice;
INIT_LIST_HEAD(&p->se.group_node);
#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -4870,7 +4867,7 @@ void wake_up_new_task(struct task_struct *p)
activate_task(rq, p, ENQUEUE_NOCLOCK);
trace_sched_wakeup_new(p);
- check_preempt_curr(rq, p, WF_FORK);
+ wakeup_preempt(rq, p, WF_FORK);
#ifdef CONFIG_SMP
if (p->sched_class->task_woken) {
/*
@@ -5373,8 +5370,6 @@ context_switch(struct rq *rq, struct task_struct *prev,
/* switch_mm_cid() requires the memory barriers above. */
switch_mm_cid(rq, prev, next);
- rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
-
prepare_lock_switch(rq, next, rf);
/* Here we just switch the register state and the stack. */
@@ -5496,23 +5491,20 @@ unsigned int nr_iowait(void)
void sched_exec(void)
{
struct task_struct *p = current;
- unsigned long flags;
+ struct migration_arg arg;
int dest_cpu;
- raw_spin_lock_irqsave(&p->pi_lock, flags);
- dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), WF_EXEC);
- if (dest_cpu == smp_processor_id())
- goto unlock;
+ scoped_guard (raw_spinlock_irqsave, &p->pi_lock) {
+ dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), WF_EXEC);
+ if (dest_cpu == smp_processor_id())
+ return;
- if (likely(cpu_active(dest_cpu))) {
- struct migration_arg arg = { p, dest_cpu };
+ if (unlikely(!cpu_active(dest_cpu)))
+ return;
- raw_spin_unlock_irqrestore(&p->pi_lock, flags);
- stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg);
- return;
+ arg = (struct migration_arg){ p, dest_cpu };
}
-unlock:
- raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+ stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg);
}
#endif
@@ -5722,9 +5714,6 @@ static void sched_tick_remote(struct work_struct *work)
struct tick_work *twork = container_of(dwork, struct tick_work, work);
int cpu = twork->cpu;
struct rq *rq = cpu_rq(cpu);
- struct task_struct *curr;
- struct rq_flags rf;
- u64 delta;
int os;
/*
@@ -5734,30 +5723,26 @@ static void sched_tick_remote(struct work_struct *work)
* statistics and checks timeslices in a time-independent way, regardless
* of when exactly it is running.
*/
- if (!tick_nohz_tick_stopped_cpu(cpu))
- goto out_requeue;
+ if (tick_nohz_tick_stopped_cpu(cpu)) {
+ guard(rq_lock_irq)(rq);
+ struct task_struct *curr = rq->curr;
- rq_lock_irq(rq, &rf);
- curr = rq->curr;
- if (cpu_is_offline(cpu))
- goto out_unlock;
+ if (cpu_online(cpu)) {
+ update_rq_clock(rq);
- update_rq_clock(rq);
+ if (!is_idle_task(curr)) {
+ /*
+ * Make sure the next tick runs within a
+ * reasonable amount of time.
+ */
+ u64 delta = rq_clock_task(rq) - curr->se.exec_start;
+ WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3);
+ }
+ curr->sched_class->task_tick(rq, curr, 0);
- if (!is_idle_task(curr)) {
- /*
- * Make sure the next tick runs within a reasonable
- * amount of time.
- */
- delta = rq_clock_task(rq) - curr->se.exec_start;
- WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3);
+ calc_load_nohz_remote(rq);
+ }
}
- curr->sched_class->task_tick(rq, curr, 0);
-
- calc_load_nohz_remote(rq);
-out_unlock:
- rq_unlock_irq(rq, &rf);
-out_requeue:
/*
* Run the remote tick once per second (1Hz). This arbitrary
@@ -5925,8 +5910,7 @@ static noinline void __schedule_bug(struct task_struct *prev)
print_modules();
if (irqs_disabled())
print_irqtrace_events(prev);
- if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)
- && in_atomic_preempt_off()) {
+ if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)) {
pr_err("Preemption disabled at:");
print_ip_sym(KERN_ERR, preempt_disable_ip);
}
@@ -6306,19 +6290,19 @@ static bool try_steal_cookie(int this, int that)
unsigned long cookie;
bool success = false;
- local_irq_disable();
- double_rq_lock(dst, src);
+ guard(irq)();
+ guard(double_rq_lock)(dst, src);
cookie = dst->core->core_cookie;
if (!cookie)
- goto unlock;
+ return false;
if (dst->curr != dst->idle)
- goto unlock;
+ return false;
p = sched_core_find(src, cookie);
if (!p)
- goto unlock;
+ return false;
do {
if (p == src->core_pick || p == src->curr)
@@ -6330,9 +6314,10 @@ static bool try_steal_cookie(int this, int that)
if (p->core_occupation > dst->idle->core_occupation)
goto next;
/*
- * sched_core_find() and sched_core_next() will ensure that task @p
- * is not throttled now, we also need to check whether the runqueue
- * of the destination CPU is being throttled.
+ * sched_core_find() and sched_core_next() will ensure
+ * that task @p is not throttled now, we also need to
+ * check whether the runqueue of the destination CPU is
+ * being throttled.
*/
if (sched_task_is_throttled(p, this))
goto next;
@@ -6350,10 +6335,6 @@ next:
p = sched_core_next(p, cookie);
} while (p);
-unlock:
- double_rq_unlock(dst, src);
- local_irq_enable();
-
return success;
}
@@ -6380,8 +6361,9 @@ static void sched_core_balance(struct rq *rq)
struct sched_domain *sd;
int cpu = cpu_of(rq);
- preempt_disable();
- rcu_read_lock();
+ guard(preempt)();
+ guard(rcu)();
+
raw_spin_rq_unlock_irq(rq);
for_each_domain(cpu, sd) {
if (need_resched())
@@ -6391,8 +6373,6 @@ static void sched_core_balance(struct rq *rq)
break;
}
raw_spin_rq_lock_irq(rq);
- rcu_read_unlock();
- preempt_enable();
}
static DEFINE_PER_CPU(struct balance_callback, core_balance_head);
@@ -6411,20 +6391,24 @@ static void queue_core_balance(struct rq *rq)
queue_balance_callback(rq, &per_cpu(core_balance_head, rq->cpu), sched_core_balance);
}
+DEFINE_LOCK_GUARD_1(core_lock, int,
+ sched_core_lock(*_T->lock, &_T->flags),
+ sched_core_unlock(*_T->lock, &_T->flags),
+ unsigned long flags)
+
static void sched_core_cpu_starting(unsigned int cpu)
{
const struct cpumask *smt_mask = cpu_smt_mask(cpu);
struct rq *rq = cpu_rq(cpu), *core_rq = NULL;
- unsigned long flags;
int t;
- sched_core_lock(cpu, &flags);
+ guard(core_lock)(&cpu);
WARN_ON_ONCE(rq->core != rq);
/* if we're the first, we'll be our own leader */
if (cpumask_weight(smt_mask) == 1)
- goto unlock;
+ return;
/* find the leader */
for_each_cpu(t, smt_mask) {
@@ -6438,7 +6422,7 @@ static void sched_core_cpu_starting(unsigned int cpu)
}
if (WARN_ON_ONCE(!core_rq)) /* whoopsie */
- goto unlock;
+ return;
/* install and validate core_rq */
for_each_cpu(t, smt_mask) {
@@ -6449,29 +6433,25 @@ static void sched_core_cpu_starting(unsigned int cpu)
WARN_ON_ONCE(rq->core != core_rq);
}
-
-unlock:
- sched_core_unlock(cpu, &flags);
}
static void sched_core_cpu_deactivate(unsigned int cpu)
{
const struct cpumask *smt_mask = cpu_smt_mask(cpu);
struct rq *rq = cpu_rq(cpu), *core_rq = NULL;
- unsigned long flags;
int t;
- sched_core_lock(cpu, &flags);
+ guard(core_lock)(&cpu);
/* if we're the last man standing, nothing to do */
if (cpumask_weight(smt_mask) == 1) {
WARN_ON_ONCE(rq->core != rq);
- goto unlock;
+ return;
}
/* if we're not the leader, nothing to do */
if (rq->core != rq)
- goto unlock;
+ return;
/* find a new leader */
for_each_cpu(t, smt_mask) {
@@ -6482,7 +6462,7 @@ static void sched_core_cpu_deactivate(unsigned int cpu)
}
if (WARN_ON_ONCE(!core_rq)) /* impossible */
- goto unlock;
+ return;
/* copy the shared state to the new leader */
core_rq->core_task_seq = rq->core_task_seq;
@@ -6504,9 +6484,6 @@ static void sched_core_cpu_deactivate(unsigned int cpu)
rq = cpu_rq(t);
rq->core = core_rq;
}
-
-unlock:
- sched_core_unlock(cpu, &flags);
}
static inline void sched_core_cpu_dying(unsigned int cpu)
@@ -6630,6 +6607,7 @@ static void __sched notrace __schedule(unsigned int sched_mode)
/* Promote REQ to ACT */
rq->clock_update_flags <<= 1;
update_rq_clock(rq);
+ rq->clock_update_flags = RQCF_UPDATED;
switch_count = &prev->nivcsw;
@@ -6709,8 +6687,6 @@ static void __sched notrace __schedule(unsigned int sched_mode)
/* Also unlocks the rq: */
rq = context_switch(rq, prev, next, &rf);
} else {
- rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
-
rq_unpin_lock(rq, &rf);
__balance_callbacks(rq);
raw_spin_rq_unlock_irq(rq);
@@ -6735,22 +6711,24 @@ void __noreturn do_task_dead(void)
static inline void sched_submit_work(struct task_struct *tsk)
{
+ static DEFINE_WAIT_OVERRIDE_MAP(sched_map, LD_WAIT_CONFIG);
unsigned int task_flags;
- if (task_is_running(tsk))
- return;
+ /*
+ * Establish LD_WAIT_CONFIG context to ensure none of the code called
+ * will use a blocking primitive -- which would lead to recursion.
+ */
+ lock_map_acquire_try(&sched_map);
task_flags = tsk->flags;
/*
* If a worker goes to sleep, notify and ask workqueue whether it
* wants to wake up a task to maintain concurrency.
*/
- if (task_flags & (PF_WQ_WORKER | PF_IO_WORKER)) {
- if (task_flags & PF_WQ_WORKER)
- wq_worker_sleeping(tsk);
- else
- io_wq_worker_sleeping(tsk);
- }
+ if (task_flags & PF_WQ_WORKER)
+ wq_worker_sleeping(tsk);
+ else if (task_flags & PF_IO_WORKER)
+ io_wq_worker_sleeping(tsk);
/*
* spinlock and rwlock must not flush block requests. This will
@@ -6764,6 +6742,8 @@ static inline void sched_submit_work(struct task_struct *tsk)
* make sure to submit it to avoid deadlocks.
*/
blk_flush_plug(tsk->plug, true);
+
+ lock_map_release(&sched_map);
}
static void sched_update_worker(struct task_struct *tsk)
@@ -6776,16 +6756,26 @@ static void sched_update_worker(struct task_struct *tsk)
}
}
-asmlinkage __visible void __sched schedule(void)
+static __always_inline void __schedule_loop(unsigned int sched_mode)
{
- struct task_struct *tsk = current;
-
- sched_submit_work(tsk);
do {
preempt_disable();
- __schedule(SM_NONE);
+ __schedule(sched_mode);
sched_preempt_enable_no_resched();
} while (need_resched());
+}
+
+asmlinkage __visible void __sched schedule(void)
+{
+ struct task_struct *tsk = current;
+
+#ifdef CONFIG_RT_MUTEXES
+ lockdep_assert(!tsk->sched_rt_mutex);
+#endif
+
+ if (!task_is_running(tsk))
+ sched_submit_work(tsk);
+ __schedule_loop(SM_NONE);
sched_update_worker(tsk);
}
EXPORT_SYMBOL(schedule);
@@ -6849,11 +6839,7 @@ void __sched schedule_preempt_disabled(void)
#ifdef CONFIG_PREEMPT_RT
void __sched notrace schedule_rtlock(void)
{
- do {
- preempt_disable();
- __schedule(SM_RTLOCK_WAIT);
- sched_preempt_enable_no_resched();
- } while (need_resched());
+ __schedule_loop(SM_RTLOCK_WAIT);
}
NOKPROBE_SYMBOL(schedule_rtlock);
#endif
@@ -7030,7 +7016,7 @@ asmlinkage __visible void __sched preempt_schedule_irq(void)
int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flags,
void *key)
{
- WARN_ON_ONCE(IS_ENABLED(CONFIG_SCHED_DEBUG) && wake_flags & ~WF_SYNC);
+ WARN_ON_ONCE(IS_ENABLED(CONFIG_SCHED_DEBUG) && wake_flags & ~(WF_SYNC|WF_CURRENT_CPU));
return try_to_wake_up(curr->private, mode, wake_flags);
}
EXPORT_SYMBOL(default_wake_function);
@@ -7049,6 +7035,32 @@ static void __setscheduler_prio(struct task_struct *p, int prio)
#ifdef CONFIG_RT_MUTEXES
+/*
+ * Would be more useful with typeof()/auto_type but they don't mix with
+ * bit-fields. Since it's a local thing, use int. Keep the generic sounding
+ * name such that if someone were to implement this function we get to compare
+ * notes.
+ */
+#define fetch_and_set(x, v) ({ int _x = (x); (x) = (v); _x; })
+
+void rt_mutex_pre_schedule(void)
+{
+ lockdep_assert(!fetch_and_set(current->sched_rt_mutex, 1));
+ sched_submit_work(current);
+}
+
+void rt_mutex_schedule(void)
+{
+ lockdep_assert(current->sched_rt_mutex);
+ __schedule_loop(SM_NONE);
+}
+
+void rt_mutex_post_schedule(void)
+{
+ sched_update_worker(current);
+ lockdep_assert(fetch_and_set(current->sched_rt_mutex, 0));
+}
+
static inline int __rt_effective_prio(struct task_struct *pi_task, int prio)
{
if (pi_task)
@@ -7202,9 +7214,8 @@ static inline int rt_effective_prio(struct task_struct *p, int prio)
void set_user_nice(struct task_struct *p, long nice)
{
bool queued, running;
- int old_prio;
- struct rq_flags rf;
struct rq *rq;
+ int old_prio;
if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE)
return;
@@ -7212,7 +7223,9 @@ void set_user_nice(struct task_struct *p, long nice)
* We have to be careful, if called from sys_setpriority(),
* the task might be in the middle of scheduling on another CPU.
*/
- rq = task_rq_lock(p, &rf);
+ CLASS(task_rq_lock, rq_guard)(p);
+ rq = rq_guard.rq;
+
update_rq_clock(rq);
/*
@@ -7223,8 +7236,9 @@ void set_user_nice(struct task_struct *p, long nice)
*/
if (task_has_dl_policy(p) || task_has_rt_policy(p)) {
p->static_prio = NICE_TO_PRIO(nice);
- goto out_unlock;
+ return;
}
+
queued = task_on_rq_queued(p);
running = task_current(rq, p);
if (queued)
@@ -7247,9 +7261,6 @@ void set_user_nice(struct task_struct *p, long nice)
* lowered its priority, then reschedule its CPU:
*/
p->sched_class->prio_changed(rq, p, old_prio);
-
-out_unlock:
- task_rq_unlock(rq, p, &rf);
}
EXPORT_SYMBOL(set_user_nice);
@@ -7383,6 +7394,19 @@ struct task_struct *idle_task(int cpu)
return cpu_rq(cpu)->idle;
}
+#ifdef CONFIG_SCHED_CORE
+int sched_core_idle_cpu(int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+
+ if (sched_core_enabled(rq) && rq->curr == rq->idle)
+ return 1;
+
+ return idle_cpu(cpu);
+}
+
+#endif
+
#ifdef CONFIG_SMP
/*
* This function computes an effective utilization for the given CPU, to be
@@ -7509,6 +7533,21 @@ static struct task_struct *find_process_by_pid(pid_t pid)
return pid ? find_task_by_vpid(pid) : current;
}
+static struct task_struct *find_get_task(pid_t pid)
+{
+ struct task_struct *p;
+ guard(rcu)();
+
+ p = find_process_by_pid(pid);
+ if (likely(p))
+ get_task_struct(p);
+
+ return p;
+}
+
+DEFINE_CLASS(find_get_task, struct task_struct *, if (_T) put_task_struct(_T),
+ find_get_task(pid), pid_t pid)
+
/*
* sched_setparam() passes in -1 for its policy, to let the functions
* it calls know not to change it.
@@ -7546,14 +7585,11 @@ static void __setscheduler_params(struct task_struct *p,
static bool check_same_owner(struct task_struct *p)
{
const struct cred *cred = current_cred(), *pcred;
- bool match;
+ guard(rcu)();
- rcu_read_lock();
pcred = __task_cred(p);
- match = (uid_eq(cred->euid, pcred->euid) ||
- uid_eq(cred->euid, pcred->uid));
- rcu_read_unlock();
- return match;
+ return (uid_eq(cred->euid, pcred->euid) ||
+ uid_eq(cred->euid, pcred->uid));
}
/*
@@ -7965,27 +8001,17 @@ static int
do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
{
struct sched_param lparam;
- struct task_struct *p;
- int retval;
if (!param || pid < 0)
return -EINVAL;
if (copy_from_user(&lparam, param, sizeof(struct sched_param)))
return -EFAULT;
- rcu_read_lock();
- retval = -ESRCH;
- p = find_process_by_pid(pid);
- if (likely(p))
- get_task_struct(p);
- rcu_read_unlock();
-
- if (likely(p)) {
- retval = sched_setscheduler(p, policy, &lparam);
- put_task_struct(p);
- }
+ CLASS(find_get_task, p)(pid);
+ if (!p)
+ return -ESRCH;
- return retval;
+ return sched_setscheduler(p, policy, &lparam);
}
/*
@@ -8081,7 +8107,6 @@ SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr,
unsigned int, flags)
{
struct sched_attr attr;
- struct task_struct *p;
int retval;
if (!uattr || pid < 0 || flags)
@@ -8096,21 +8121,14 @@ SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr,
if (attr.sched_flags & SCHED_FLAG_KEEP_POLICY)
attr.sched_policy = SETPARAM_POLICY;
- rcu_read_lock();
- retval = -ESRCH;
- p = find_process_by_pid(pid);
- if (likely(p))
- get_task_struct(p);
- rcu_read_unlock();
+ CLASS(find_get_task, p)(pid);
+ if (!p)
+ return -ESRCH;
- if (likely(p)) {
- if (attr.sched_flags & SCHED_FLAG_KEEP_PARAMS)
- get_params(p, &attr);
- retval = sched_setattr(p, &attr);
- put_task_struct(p);
- }
+ if (attr.sched_flags & SCHED_FLAG_KEEP_PARAMS)
+ get_params(p, &attr);
- return retval;
+ return sched_setattr(p, &attr);
}
/**
@@ -8128,16 +8146,17 @@ SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
if (pid < 0)
return -EINVAL;
- retval = -ESRCH;
- rcu_read_lock();
+ guard(rcu)();
p = find_process_by_pid(pid);
- if (p) {
- retval = security_task_getscheduler(p);
- if (!retval)
- retval = p->policy
- | (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0);
+ if (!p)
+ return -ESRCH;
+
+ retval = security_task_getscheduler(p);
+ if (!retval) {
+ retval = p->policy;
+ if (p->sched_reset_on_fork)
+ retval |= SCHED_RESET_ON_FORK;
}
- rcu_read_unlock();
return retval;
}
@@ -8158,30 +8177,23 @@ SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
if (!param || pid < 0)
return -EINVAL;
- rcu_read_lock();
- p = find_process_by_pid(pid);
- retval = -ESRCH;
- if (!p)
- goto out_unlock;
+ scoped_guard (rcu) {
+ p = find_process_by_pid(pid);
+ if (!p)
+ return -ESRCH;
- retval = security_task_getscheduler(p);
- if (retval)
- goto out_unlock;
+ retval = security_task_getscheduler(p);
+ if (retval)
+ return retval;
- if (task_has_rt_policy(p))
- lp.sched_priority = p->rt_priority;
- rcu_read_unlock();
+ if (task_has_rt_policy(p))
+ lp.sched_priority = p->rt_priority;
+ }
/*
* This one might sleep, we cannot do it with a spinlock held ...
*/
- retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
-
- return retval;
-
-out_unlock:
- rcu_read_unlock();
- return retval;
+ return copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
}
/*
@@ -8241,46 +8253,38 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
usize < SCHED_ATTR_SIZE_VER0 || flags)
return -EINVAL;
- rcu_read_lock();
- p = find_process_by_pid(pid);
- retval = -ESRCH;
- if (!p)
- goto out_unlock;
+ scoped_guard (rcu) {
+ p = find_process_by_pid(pid);
+ if (!p)
+ return -ESRCH;
- retval = security_task_getscheduler(p);
- if (retval)
- goto out_unlock;
+ retval = security_task_getscheduler(p);
+ if (retval)
+ return retval;
- kattr.sched_policy = p->policy;
- if (p->sched_reset_on_fork)
- kattr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
- get_params(p, &kattr);
- kattr.sched_flags &= SCHED_FLAG_ALL;
+ kattr.sched_policy = p->policy;
+ if (p->sched_reset_on_fork)
+ kattr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
+ get_params(p, &kattr);
+ kattr.sched_flags &= SCHED_FLAG_ALL;
#ifdef CONFIG_UCLAMP_TASK
- /*
- * This could race with another potential updater, but this is fine
- * because it'll correctly read the old or the new value. We don't need
- * to guarantee who wins the race as long as it doesn't return garbage.
- */
- kattr.sched_util_min = p->uclamp_req[UCLAMP_MIN].value;
- kattr.sched_util_max = p->uclamp_req[UCLAMP_MAX].value;
+ /*
+ * This could race with another potential updater, but this is fine
+ * because it'll correctly read the old or the new value. We don't need
+ * to guarantee who wins the race as long as it doesn't return garbage.
+ */
+ kattr.sched_util_min = p->uclamp_req[UCLAMP_MIN].value;
+ kattr.sched_util_max = p->uclamp_req[UCLAMP_MAX].value;
#endif
-
- rcu_read_unlock();
+ }
return sched_attr_copy_to_user(uattr, &kattr, usize);
-
-out_unlock:
- rcu_read_unlock();
- return retval;
}
#ifdef CONFIG_SMP
int dl_task_check_affinity(struct task_struct *p, const struct cpumask *mask)
{
- int ret = 0;
-
/*
* If the task isn't a deadline task or admission control is
* disabled then we don't care about affinity changes.
@@ -8294,11 +8298,11 @@ int dl_task_check_affinity(struct task_struct *p, const struct cpumask *mask)
* tasks allowed to run on all the CPUs in the task's
* root_domain.
*/
- rcu_read_lock();
+ guard(rcu)();
if (!cpumask_subset(task_rq(p)->rd->span, mask))
- ret = -EBUSY;
- rcu_read_unlock();
- return ret;
+ return -EBUSY;
+
+ return 0;
}
#endif
@@ -8368,39 +8372,24 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
{
struct affinity_context ac;
struct cpumask *user_mask;
- struct task_struct *p;
int retval;
- rcu_read_lock();
-
- p = find_process_by_pid(pid);
- if (!p) {
- rcu_read_unlock();
+ CLASS(find_get_task, p)(pid);
+ if (!p)
return -ESRCH;
- }
- /* Prevent p going away */
- get_task_struct(p);
- rcu_read_unlock();
-
- if (p->flags & PF_NO_SETAFFINITY) {
- retval = -EINVAL;
- goto out_put_task;
- }
+ if (p->flags & PF_NO_SETAFFINITY)
+ return -EINVAL;
if (!check_same_owner(p)) {
- rcu_read_lock();
- if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) {
- rcu_read_unlock();
- retval = -EPERM;
- goto out_put_task;
- }
- rcu_read_unlock();
+ guard(rcu)();
+ if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE))
+ return -EPERM;
}
retval = security_task_setscheduler(p);
if (retval)
- goto out_put_task;
+ return retval;
/*
* With non-SMP configs, user_cpus_ptr/user_mask isn't used and
@@ -8410,8 +8399,7 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
if (user_mask) {
cpumask_copy(user_mask, in_mask);
} else if (IS_ENABLED(CONFIG_SMP)) {
- retval = -ENOMEM;
- goto out_put_task;
+ return -ENOMEM;
}
ac = (struct affinity_context){
@@ -8423,8 +8411,6 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
retval = __sched_setaffinity(p, &ac);
kfree(ac.user_mask);
-out_put_task:
- put_task_struct(p);
return retval;
}
@@ -8466,28 +8452,21 @@ SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len,
long sched_getaffinity(pid_t pid, struct cpumask *mask)
{
struct task_struct *p;
- unsigned long flags;
int retval;
- rcu_read_lock();
-
- retval = -ESRCH;
+ guard(rcu)();
p = find_process_by_pid(pid);
if (!p)
- goto out_unlock;
+ return -ESRCH;
retval = security_task_getscheduler(p);
if (retval)
- goto out_unlock;
+ return retval;
- raw_spin_lock_irqsave(&p->pi_lock, flags);
+ guard(raw_spinlock_irqsave)(&p->pi_lock);
cpumask_and(mask, &p->cpus_mask, cpu_active_mask);
- raw_spin_unlock_irqrestore(&p->pi_lock, flags);
-out_unlock:
- rcu_read_unlock();
-
- return retval;
+ return 0;
}
/**
@@ -8934,55 +8913,46 @@ int __sched yield_to(struct task_struct *p, bool preempt)
{
struct task_struct *curr = current;
struct rq *rq, *p_rq;
- unsigned long flags;
int yielded = 0;
- local_irq_save(flags);
- rq = this_rq();
+ scoped_guard (irqsave) {
+ rq = this_rq();
again:
- p_rq = task_rq(p);
- /*
- * If we're the only runnable task on the rq and target rq also
- * has only one task, there's absolutely no point in yielding.
- */
- if (rq->nr_running == 1 && p_rq->nr_running == 1) {
- yielded = -ESRCH;
- goto out_irq;
- }
+ p_rq = task_rq(p);
+ /*
+ * If we're the only runnable task on the rq and target rq also
+ * has only one task, there's absolutely no point in yielding.
+ */
+ if (rq->nr_running == 1 && p_rq->nr_running == 1)
+ return -ESRCH;
- double_rq_lock(rq, p_rq);
- if (task_rq(p) != p_rq) {
- double_rq_unlock(rq, p_rq);
- goto again;
- }
+ guard(double_rq_lock)(rq, p_rq);
+ if (task_rq(p) != p_rq)
+ goto again;
- if (!curr->sched_class->yield_to_task)
- goto out_unlock;
+ if (!curr->sched_class->yield_to_task)
+ return 0;
- if (curr->sched_class != p->sched_class)
- goto out_unlock;
+ if (curr->sched_class != p->sched_class)
+ return 0;
- if (task_on_cpu(p_rq, p) || !task_is_running(p))
- goto out_unlock;
+ if (task_on_cpu(p_rq, p) || !task_is_running(p))
+ return 0;
- yielded = curr->sched_class->yield_to_task(rq, p);
- if (yielded) {
- schedstat_inc(rq->yld_count);
- /*
- * Make p's CPU reschedule; pick_next_entity takes care of
- * fairness.
- */
- if (preempt && rq != p_rq)
- resched_curr(p_rq);
+ yielded = curr->sched_class->yield_to_task(rq, p);
+ if (yielded) {
+ schedstat_inc(rq->yld_count);
+ /*
+ * Make p's CPU reschedule; pick_next_entity
+ * takes care of fairness.
+ */
+ if (preempt && rq != p_rq)
+ resched_curr(p_rq);
+ }
}
-out_unlock:
- double_rq_unlock(rq, p_rq);
-out_irq:
- local_irq_restore(flags);
-
- if (yielded > 0)
+ if (yielded)
schedule();
return yielded;
@@ -9085,38 +9055,30 @@ SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
static int sched_rr_get_interval(pid_t pid, struct timespec64 *t)
{
- struct task_struct *p;
- unsigned int time_slice;
- struct rq_flags rf;
- struct rq *rq;
+ unsigned int time_slice = 0;
int retval;
if (pid < 0)
return -EINVAL;
- retval = -ESRCH;
- rcu_read_lock();
- p = find_process_by_pid(pid);
- if (!p)
- goto out_unlock;
+ scoped_guard (rcu) {
+ struct task_struct *p = find_process_by_pid(pid);
+ if (!p)
+ return -ESRCH;
- retval = security_task_getscheduler(p);
- if (retval)
- goto out_unlock;
+ retval = security_task_getscheduler(p);
+ if (retval)
+ return retval;
- rq = task_rq_lock(p, &rf);
- time_slice = 0;
- if (p->sched_class->get_rr_interval)
- time_slice = p->sched_class->get_rr_interval(rq, p);
- task_rq_unlock(rq, p, &rf);
+ scoped_guard (task_rq_lock, p) {
+ struct rq *rq = scope.rq;
+ if (p->sched_class->get_rr_interval)
+ time_slice = p->sched_class->get_rr_interval(rq, p);
+ }
+ }
- rcu_read_unlock();
jiffies_to_timespec64(time_slice, t);
return 0;
-
-out_unlock:
- rcu_read_unlock();
- return retval;
}
/**
@@ -9175,9 +9137,9 @@ void sched_show_task(struct task_struct *p)
if (pid_alive(p))
ppid = task_pid_nr(rcu_dereference(p->real_parent));
rcu_read_unlock();
- pr_cont(" stack:%-5lu pid:%-5d ppid:%-6d flags:0x%08lx\n",
- free, task_pid_nr(p), ppid,
- read_task_thread_flags(p));
+ pr_cont(" stack:%-5lu pid:%-5d tgid:%-5d ppid:%-6d flags:0x%08lx\n",
+ free, task_pid_nr(p), task_tgid_nr(p),
+ ppid, read_task_thread_flags(p));
print_worker_info(KERN_INFO, p);
print_stop_info(KERN_INFO, p);
@@ -9271,7 +9233,7 @@ void __init init_idle(struct task_struct *idle, int cpu)
* PF_KTHREAD should already be set at this point; regardless, make it
* look like a proper per-CPU kthread.
*/
- idle->flags |= PF_IDLE | PF_KTHREAD | PF_NO_SETAFFINITY;
+ idle->flags |= PF_KTHREAD | PF_NO_SETAFFINITY;
kthread_set_per_cpu(idle, cpu);
#ifdef CONFIG_SMP
@@ -9507,9 +9469,11 @@ static void balance_push(struct rq *rq)
* Temporarily drop rq->lock such that we can wake-up the stop task.
* Both preemption and IRQs are still disabled.
*/
+ preempt_disable();
raw_spin_rq_unlock(rq);
stop_one_cpu_nowait(rq->cpu, __balance_push_cpu_stop, push_task,
this_cpu_ptr(&push_work));
+ preempt_enable();
/*
* At this point need_resched() is true and we'll take the loop in
* schedule(). The next pick is obviously going to be the stop task
@@ -9905,7 +9869,7 @@ struct task_group root_task_group;
LIST_HEAD(task_groups);
/* Cacheline aligned slab cache for task_group */
-static struct kmem_cache *task_group_cache __read_mostly;
+static struct kmem_cache *task_group_cache __ro_after_init;
#endif
void __init sched_init(void)
@@ -9940,7 +9904,7 @@ void __init sched_init(void)
ptr += nr_cpu_ids * sizeof(void **);
root_task_group.shares = ROOT_TASK_GROUP_LOAD;
- init_cfs_bandwidth(&root_task_group.cfs_bandwidth);
+ init_cfs_bandwidth(&root_task_group.cfs_bandwidth, NULL);
#endif /* CONFIG_FAIR_GROUP_SCHED */
#ifdef CONFIG_RT_GROUP_SCHED
root_task_group.rt_se = (struct sched_rt_entity **)ptr;
@@ -10015,7 +9979,7 @@ void __init sched_init(void)
#ifdef CONFIG_SMP
rq->sd = NULL;
rq->rd = NULL;
- rq->cpu_capacity = rq->cpu_capacity_orig = SCHED_CAPACITY_SCALE;
+ rq->cpu_capacity = SCHED_CAPACITY_SCALE;
rq->balance_callback = &balance_push_callback;
rq->active_balance = 0;
rq->next_balance = jiffies;
@@ -10024,8 +9988,6 @@ void __init sched_init(void)
rq->online = 0;
rq->idle_stamp = 0;
rq->avg_idle = 2*sysctl_sched_migration_cost;
- rq->wake_stamp = jiffies;
- rq->wake_avg_idle = rq->avg_idle;
rq->max_idle_balance_cost = sysctl_sched_migration_cost;
INIT_LIST_HEAD(&rq->cfs_tasks);
@@ -10291,9 +10253,9 @@ void normalize_rt_tasks(void)
#endif /* CONFIG_MAGIC_SYSRQ */
-#if defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB)
+#if defined(CONFIG_KGDB_KDB)
/*
- * These functions are only useful for the IA64 MCA handling, or kdb.
+ * These functions are only useful for kdb.
*
* They can only be called when the whole system has been
* stopped - every CPU needs to be quiescent, and no scheduling
@@ -10315,30 +10277,7 @@ struct task_struct *curr_task(int cpu)
return cpu_curr(cpu);
}
-#endif /* defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) */
-
-#ifdef CONFIG_IA64
-/**
- * ia64_set_curr_task - set the current task for a given CPU.
- * @cpu: the processor in question.
- * @p: the task pointer to set.
- *
- * Description: This function must only be used when non-maskable interrupts
- * are serviced on a separate stack. It allows the architecture to switch the
- * notion of the current task on a CPU in a non-blocking manner. This function
- * must be called with all CPU's synchronized, and interrupts disabled, the
- * and caller must save the original value of the current task (see
- * curr_task() above) and restore that value before reenabling interrupts and
- * re-starting the system.
- *
- * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
- */
-void ia64_set_curr_task(int cpu, struct task_struct *p)
-{
- cpu_curr(cpu) = p;
-}
-
-#endif
+#endif /* defined(CONFIG_KGDB_KDB) */
#ifdef CONFIG_CGROUP_SCHED
/* task_group_lock serializes the addition/removal of task groups */
@@ -10500,17 +10439,18 @@ void sched_move_task(struct task_struct *tsk)
int queued, running, queue_flags =
DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
struct task_group *group;
- struct rq_flags rf;
struct rq *rq;
- rq = task_rq_lock(tsk, &rf);
+ CLASS(task_rq_lock, rq_guard)(tsk);
+ rq = rq_guard.rq;
+
/*
* Esp. with SCHED_AUTOGROUP enabled it is possible to get superfluous
* group changes.
*/
group = sched_get_task_group(tsk);
if (group == tsk->sched_task_group)
- goto unlock;
+ return;
update_rq_clock(rq);
@@ -10535,9 +10475,6 @@ void sched_move_task(struct task_struct *tsk)
*/
resched_curr(rq);
}
-
-unlock:
- task_rq_unlock(rq, tsk, &rf);
}
static inline struct task_group *css_tg(struct cgroup_subsys_state *css)
@@ -10574,11 +10511,9 @@ static int cpu_cgroup_css_online(struct cgroup_subsys_state *css)
#ifdef CONFIG_UCLAMP_TASK_GROUP
/* Propagate the effective uclamp value for the new group */
- mutex_lock(&uclamp_mutex);
- rcu_read_lock();
+ guard(mutex)(&uclamp_mutex);
+ guard(rcu)();
cpu_util_update_eff(css);
- rcu_read_unlock();
- mutex_unlock(&uclamp_mutex);
#endif
return 0;
@@ -10729,8 +10664,8 @@ static ssize_t cpu_uclamp_write(struct kernfs_open_file *of, char *buf,
static_branch_enable(&sched_uclamp_used);
- mutex_lock(&uclamp_mutex);
- rcu_read_lock();
+ guard(mutex)(&uclamp_mutex);
+ guard(rcu)();
tg = css_tg(of_css(of));
if (tg->uclamp_req[clamp_id].value != req.util)
@@ -10745,9 +10680,6 @@ static ssize_t cpu_uclamp_write(struct kernfs_open_file *of, char *buf,
/* Update effective clamps to track the most restrictive value */
cpu_util_update_eff(of_css(of));
- rcu_read_unlock();
- mutex_unlock(&uclamp_mutex);
-
return nbytes;
}
@@ -10773,10 +10705,10 @@ static inline void cpu_uclamp_print(struct seq_file *sf,
u64 percent;
u32 rem;
- rcu_read_lock();
- tg = css_tg(seq_css(sf));
- util_clamp = tg->uclamp_req[clamp_id].value;
- rcu_read_unlock();
+ scoped_guard (rcu) {
+ tg = css_tg(seq_css(sf));
+ util_clamp = tg->uclamp_req[clamp_id].value;
+ }
if (util_clamp == SCHED_CAPACITY_SCALE) {
seq_puts(sf, "max\n");
@@ -10867,11 +10799,12 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota,
* Prevent race between setting of cfs_rq->runtime_enabled and
* unthrottle_offline_cfs_rqs().
*/
- cpus_read_lock();
- mutex_lock(&cfs_constraints_mutex);
+ guard(cpus_read_lock)();
+ guard(mutex)(&cfs_constraints_mutex);
+
ret = __cfs_schedulable(tg, period, quota);
if (ret)
- goto out_unlock;
+ return ret;
runtime_enabled = quota != RUNTIME_INF;
runtime_was_enabled = cfs_b->quota != RUNTIME_INF;
@@ -10881,39 +10814,38 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota,
*/
if (runtime_enabled && !runtime_was_enabled)
cfs_bandwidth_usage_inc();
- raw_spin_lock_irq(&cfs_b->lock);
- cfs_b->period = ns_to_ktime(period);
- cfs_b->quota = quota;
- cfs_b->burst = burst;
- __refill_cfs_bandwidth_runtime(cfs_b);
+ scoped_guard (raw_spinlock_irq, &cfs_b->lock) {
+ cfs_b->period = ns_to_ktime(period);
+ cfs_b->quota = quota;
+ cfs_b->burst = burst;
- /* Restart the period timer (if active) to handle new period expiry: */
- if (runtime_enabled)
- start_cfs_bandwidth(cfs_b);
+ __refill_cfs_bandwidth_runtime(cfs_b);
- raw_spin_unlock_irq(&cfs_b->lock);
+ /*
+ * Restart the period timer (if active) to handle new
+ * period expiry:
+ */
+ if (runtime_enabled)
+ start_cfs_bandwidth(cfs_b);
+ }
for_each_online_cpu(i) {
struct cfs_rq *cfs_rq = tg->cfs_rq[i];
struct rq *rq = cfs_rq->rq;
- struct rq_flags rf;
- rq_lock_irq(rq, &rf);
+ guard(rq_lock_irq)(rq);
cfs_rq->runtime_enabled = runtime_enabled;
cfs_rq->runtime_remaining = 0;
if (cfs_rq->throttled)
unthrottle_cfs_rq(cfs_rq);
- rq_unlock_irq(rq, &rf);
}
+
if (runtime_was_enabled && !runtime_enabled)
cfs_bandwidth_usage_dec();
-out_unlock:
- mutex_unlock(&cfs_constraints_mutex);
- cpus_read_unlock();
- return ret;
+ return 0;
}
static int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us)
@@ -11074,11 +11006,16 @@ static int tg_cfs_schedulable_down(struct task_group *tg, void *data)
/*
* Ensure max(child_quota) <= parent_quota. On cgroup2,
- * always take the min. On cgroup1, only inherit when no
- * limit is set:
+ * always take the non-RUNTIME_INF min. On cgroup1, only
+ * inherit when no limit is set. In both cases this is used
+ * by the scheduler to determine if a given CFS task has a
+ * bandwidth constraint at some higher level.
*/
if (cgroup_subsys_on_dfl(cpu_cgrp_subsys)) {
- quota = min(quota, parent_quota);
+ if (quota == RUNTIME_INF)
+ quota = parent_quota;
+ else if (parent_quota != RUNTIME_INF)
+ quota = min(quota, parent_quota);
} else {
if (quota == RUNTIME_INF)
quota = parent_quota;
@@ -11093,7 +11030,6 @@ static int tg_cfs_schedulable_down(struct task_group *tg, void *data)
static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota)
{
- int ret;
struct cfs_schedulable_data data = {
.tg = tg,
.period = period,
@@ -11105,11 +11041,8 @@ static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota)
do_div(data.quota, NSEC_PER_USEC);
}
- rcu_read_lock();
- ret = walk_tg_tree(tg_cfs_schedulable_down, tg_nop, &data);
- rcu_read_unlock();
-
- return ret;
+ guard(rcu)();
+ return walk_tg_tree(tg_cfs_schedulable_down, tg_nop, &data);
}
static int cpu_cfs_stat_show(struct seq_file *sf, void *v)
@@ -11139,6 +11072,27 @@ static int cpu_cfs_stat_show(struct seq_file *sf, void *v)
return 0;
}
+
+static u64 throttled_time_self(struct task_group *tg)
+{
+ int i;
+ u64 total = 0;
+
+ for_each_possible_cpu(i) {
+ total += READ_ONCE(tg->cfs_rq[i]->throttled_clock_self_time);
+ }
+
+ return total;
+}
+
+static int cpu_cfs_local_stat_show(struct seq_file *sf, void *v)
+{
+ struct task_group *tg = css_tg(seq_css(sf));
+
+ seq_printf(sf, "throttled_time %llu\n", throttled_time_self(tg));
+
+ return 0;
+}
#endif /* CONFIG_CFS_BANDWIDTH */
#endif /* CONFIG_FAIR_GROUP_SCHED */
@@ -11215,6 +11169,10 @@ static struct cftype cpu_legacy_files[] = {
.name = "stat",
.seq_show = cpu_cfs_stat_show,
},
+ {
+ .name = "stat.local",
+ .seq_show = cpu_cfs_local_stat_show,
+ },
#endif
#ifdef CONFIG_RT_GROUP_SCHED
{
@@ -11271,6 +11229,24 @@ static int cpu_extra_stat_show(struct seq_file *sf,
return 0;
}
+static int cpu_local_stat_show(struct seq_file *sf,
+ struct cgroup_subsys_state *css)
+{
+#ifdef CONFIG_CFS_BANDWIDTH
+ {
+ struct task_group *tg = css_tg(css);
+ u64 throttled_self_usec;
+
+ throttled_self_usec = throttled_time_self(tg);
+ do_div(throttled_self_usec, NSEC_PER_USEC);
+
+ seq_printf(sf, "throttled_usec %llu\n",
+ throttled_self_usec);
+ }
+#endif
+ return 0;
+}
+
#ifdef CONFIG_FAIR_GROUP_SCHED
static u64 cpu_weight_read_u64(struct cgroup_subsys_state *css,
struct cftype *cft)
@@ -11449,6 +11425,7 @@ struct cgroup_subsys cpu_cgrp_subsys = {
.css_released = cpu_cgroup_css_released,
.css_free = cpu_cgroup_css_free,
.css_extra_stat_show = cpu_extra_stat_show,
+ .css_local_stat_show = cpu_local_stat_show,
#ifdef CONFIG_RT_GROUP_SCHED
.can_attach = cpu_cgroup_can_attach,
#endif
@@ -11670,14 +11647,12 @@ int __sched_mm_cid_migrate_from_fetch_cid(struct rq *src_rq,
* are not the last task to be migrated from this cpu for this mm, so
* there is no need to move src_cid to the destination cpu.
*/
- rcu_read_lock();
+ guard(rcu)();
src_task = rcu_dereference(src_rq->curr);
if (READ_ONCE(src_task->mm_cid_active) && src_task->mm == mm) {
- rcu_read_unlock();
t->last_mm_cid = -1;
return -1;
}
- rcu_read_unlock();
return src_cid;
}
@@ -11721,18 +11696,17 @@ int __sched_mm_cid_migrate_from_try_steal_cid(struct rq *src_rq,
* the lazy-put flag, this task will be responsible for transitioning
* from lazy-put flag set to MM_CID_UNSET.
*/
- rcu_read_lock();
- src_task = rcu_dereference(src_rq->curr);
- if (READ_ONCE(src_task->mm_cid_active) && src_task->mm == mm) {
- rcu_read_unlock();
- /*
- * We observed an active task for this mm, there is therefore
- * no point in moving this cid to the destination cpu.
- */
- t->last_mm_cid = -1;
- return -1;
+ scoped_guard (rcu) {
+ src_task = rcu_dereference(src_rq->curr);
+ if (READ_ONCE(src_task->mm_cid_active) && src_task->mm == mm) {
+ /*
+ * We observed an active task for this mm, there is therefore
+ * no point in moving this cid to the destination cpu.
+ */
+ t->last_mm_cid = -1;
+ return -1;
+ }
}
- rcu_read_unlock();
/*
* The src_cid is unused, so it can be unset.
@@ -11805,7 +11779,6 @@ static void sched_mm_cid_remote_clear(struct mm_struct *mm, struct mm_cid *pcpu_
{
struct rq *rq = cpu_rq(cpu);
struct task_struct *t;
- unsigned long flags;
int cid, lazy_cid;
cid = READ_ONCE(pcpu_cid->cid);
@@ -11840,23 +11813,21 @@ static void sched_mm_cid_remote_clear(struct mm_struct *mm, struct mm_cid *pcpu_
* the lazy-put flag, that task will be responsible for transitioning
* from lazy-put flag set to MM_CID_UNSET.
*/
- rcu_read_lock();
- t = rcu_dereference(rq->curr);
- if (READ_ONCE(t->mm_cid_active) && t->mm == mm) {
- rcu_read_unlock();
- return;
+ scoped_guard (rcu) {
+ t = rcu_dereference(rq->curr);
+ if (READ_ONCE(t->mm_cid_active) && t->mm == mm)
+ return;
}
- rcu_read_unlock();
/*
* The cid is unused, so it can be unset.
* Disable interrupts to keep the window of cid ownership without rq
* lock small.
*/
- local_irq_save(flags);
- if (try_cmpxchg(&pcpu_cid->cid, &lazy_cid, MM_CID_UNSET))
- __mm_cid_put(mm, cid);
- local_irq_restore(flags);
+ scoped_guard (irqsave) {
+ if (try_cmpxchg(&pcpu_cid->cid, &lazy_cid, MM_CID_UNSET))
+ __mm_cid_put(mm, cid);
+ }
}
static void sched_mm_cid_remote_clear_old(struct mm_struct *mm, int cpu)
@@ -11878,14 +11849,13 @@ static void sched_mm_cid_remote_clear_old(struct mm_struct *mm, int cpu)
* snapshot associated with this cid if an active task using the mm is
* observed on this rq.
*/
- rcu_read_lock();
- curr = rcu_dereference(rq->curr);
- if (READ_ONCE(curr->mm_cid_active) && curr->mm == mm) {
- WRITE_ONCE(pcpu_cid->time, rq_clock);
- rcu_read_unlock();
- return;
+ scoped_guard (rcu) {
+ curr = rcu_dereference(rq->curr);
+ if (READ_ONCE(curr->mm_cid_active) && curr->mm == mm) {
+ WRITE_ONCE(pcpu_cid->time, rq_clock);
+ return;
+ }
}
- rcu_read_unlock();
if (rq_clock < pcpu_cid->time + SCHED_MM_CID_PERIOD_NS)
return;
@@ -11979,7 +11949,6 @@ void task_tick_mm_cid(struct rq *rq, struct task_struct *curr)
void sched_mm_cid_exit_signals(struct task_struct *t)
{
struct mm_struct *mm = t->mm;
- struct rq_flags rf;
struct rq *rq;
if (!mm)
@@ -11987,7 +11956,7 @@ void sched_mm_cid_exit_signals(struct task_struct *t)
preempt_disable();
rq = this_rq();
- rq_lock_irqsave(rq, &rf);
+ guard(rq_lock_irqsave)(rq);
preempt_enable_no_resched(); /* holding spinlock */
WRITE_ONCE(t->mm_cid_active, 0);
/*
@@ -11997,13 +11966,11 @@ void sched_mm_cid_exit_signals(struct task_struct *t)
smp_mb();
mm_cid_put(mm);
t->last_mm_cid = t->mm_cid = -1;
- rq_unlock_irqrestore(rq, &rf);
}
void sched_mm_cid_before_execve(struct task_struct *t)
{
struct mm_struct *mm = t->mm;
- struct rq_flags rf;
struct rq *rq;
if (!mm)
@@ -12011,7 +11978,7 @@ void sched_mm_cid_before_execve(struct task_struct *t)
preempt_disable();
rq = this_rq();
- rq_lock_irqsave(rq, &rf);
+ guard(rq_lock_irqsave)(rq);
preempt_enable_no_resched(); /* holding spinlock */
WRITE_ONCE(t->mm_cid_active, 0);
/*
@@ -12021,13 +11988,11 @@ void sched_mm_cid_before_execve(struct task_struct *t)
smp_mb();
mm_cid_put(mm);
t->last_mm_cid = t->mm_cid = -1;
- rq_unlock_irqrestore(rq, &rf);
}
void sched_mm_cid_after_execve(struct task_struct *t)
{
struct mm_struct *mm = t->mm;
- struct rq_flags rf;
struct rq *rq;
if (!mm)
@@ -12035,16 +12000,16 @@ void sched_mm_cid_after_execve(struct task_struct *t)
preempt_disable();
rq = this_rq();
- rq_lock_irqsave(rq, &rf);
- preempt_enable_no_resched(); /* holding spinlock */
- WRITE_ONCE(t->mm_cid_active, 1);
- /*
- * Store t->mm_cid_active before loading per-mm/cpu cid.
- * Matches barrier in sched_mm_cid_remote_clear_old().
- */
- smp_mb();
- t->last_mm_cid = t->mm_cid = mm_cid_get(rq, mm);
- rq_unlock_irqrestore(rq, &rf);
+ scoped_guard (rq_lock_irqsave, rq) {
+ preempt_enable_no_resched(); /* holding spinlock */
+ WRITE_ONCE(t->mm_cid_active, 1);
+ /*
+ * Store t->mm_cid_active before loading per-mm/cpu cid.
+ * Matches barrier in sched_mm_cid_remote_clear_old().
+ */
+ smp_mb();
+ t->last_mm_cid = t->mm_cid = mm_cid_get(rq, mm);
+ }
rseq_set_notify_resume(t);
}