aboutsummaryrefslogtreecommitdiff
path: root/kernel/sched/core.c
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/sched/core.c')
-rw-r--r--kernel/sched/core.c188
1 files changed, 131 insertions, 57 deletions
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 2e696b03e99d..ead464a0f2e5 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -24,7 +24,7 @@
DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
-#if defined(CONFIG_SCHED_DEBUG) && defined(HAVE_JUMP_LABEL)
+#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_JUMP_LABEL)
/*
* Debugging: various feature bits
*
@@ -107,11 +107,12 @@ struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf)
* [L] ->on_rq
* RELEASE (rq->lock)
*
- * If we observe the old CPU in task_rq_lock, the acquire of
+ * If we observe the old CPU in task_rq_lock(), the acquire of
* the old rq->lock will fully serialize against the stores.
*
- * If we observe the new CPU in task_rq_lock, the acquire will
- * pair with the WMB to ensure we must then also see migrating.
+ * If we observe the new CPU in task_rq_lock(), the address
+ * dependency headed by '[L] rq = task_rq()' and the acquire
+ * will pair with the WMB to ensure we then also see migrating.
*/
if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) {
rq_pin_lock(rq, rf);
@@ -180,6 +181,7 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
if ((irq_delta + steal) && sched_feat(NONTASK_CAPACITY))
update_irq_load_avg(rq, irq_delta + steal);
#endif
+ update_rq_clock_pelt(rq, delta);
}
void update_rq_clock(struct rq *rq)
@@ -396,7 +398,7 @@ static bool set_nr_if_polling(struct task_struct *p)
#endif
#endif
-void wake_q_add(struct wake_q_head *head, struct task_struct *task)
+static bool __wake_q_add(struct wake_q_head *head, struct task_struct *task)
{
struct wake_q_node *node = &task->wake_q;
@@ -405,19 +407,60 @@ void wake_q_add(struct wake_q_head *head, struct task_struct *task)
* its already queued (either by us or someone else) and will get the
* wakeup due to that.
*
- * This cmpxchg() executes a full barrier, which pairs with the full
- * barrier executed by the wakeup in wake_up_q().
+ * In order to ensure that a pending wakeup will observe our pending
+ * state, even in the failed case, an explicit smp_mb() must be used.
*/
- if (cmpxchg(&node->next, NULL, WAKE_Q_TAIL))
- return;
-
- get_task_struct(task);
+ smp_mb__before_atomic();
+ if (unlikely(cmpxchg_relaxed(&node->next, NULL, WAKE_Q_TAIL)))
+ return false;
/*
* The head is context local, there can be no concurrency.
*/
*head->lastp = node;
head->lastp = &node->next;
+ return true;
+}
+
+/**
+ * wake_q_add() - queue a wakeup for 'later' waking.
+ * @head: the wake_q_head to add @task to
+ * @task: the task to queue for 'later' wakeup
+ *
+ * Queue a task for later wakeup, most likely by the wake_up_q() call in the
+ * same context, _HOWEVER_ this is not guaranteed, the wakeup can come
+ * instantly.
+ *
+ * This function must be used as-if it were wake_up_process(); IOW the task
+ * must be ready to be woken at this location.
+ */
+void wake_q_add(struct wake_q_head *head, struct task_struct *task)
+{
+ if (__wake_q_add(head, task))
+ get_task_struct(task);
+}
+
+/**
+ * wake_q_add_safe() - safely queue a wakeup for 'later' waking.
+ * @head: the wake_q_head to add @task to
+ * @task: the task to queue for 'later' wakeup
+ *
+ * Queue a task for later wakeup, most likely by the wake_up_q() call in the
+ * same context, _HOWEVER_ this is not guaranteed, the wakeup can come
+ * instantly.
+ *
+ * This function must be used as-if it were wake_up_process(); IOW the task
+ * must be ready to be woken at this location.
+ *
+ * This function is essentially a task-safe equivalent to wake_q_add(). Callers
+ * that already hold reference to @task can call the 'safe' version and trust
+ * wake_q to do the right thing depending whether or not the @task is already
+ * queued for wakeup.
+ */
+void wake_q_add_safe(struct wake_q_head *head, struct task_struct *task)
+{
+ if (!__wake_q_add(head, task))
+ put_task_struct(task);
}
void wake_up_q(struct wake_q_head *head)
@@ -697,7 +740,7 @@ static void set_load_weight(struct task_struct *p, bool update_load)
/*
* SCHED_IDLE tasks get minimal weight:
*/
- if (idle_policy(p->policy)) {
+ if (task_has_idle_policy(p)) {
load->weight = scale_load(WEIGHT_IDLEPRIO);
load->inv_weight = WMULT_IDLEPRIO;
p->se.runnable_weight = load->weight;
@@ -722,8 +765,10 @@ static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
if (!(flags & ENQUEUE_NOCLOCK))
update_rq_clock(rq);
- if (!(flags & ENQUEUE_RESTORE))
+ if (!(flags & ENQUEUE_RESTORE)) {
sched_info_queued(rq, p);
+ psi_enqueue(p, flags & ENQUEUE_WAKEUP);
+ }
p->sched_class->enqueue_task(rq, p, flags);
}
@@ -733,8 +778,10 @@ static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
if (!(flags & DEQUEUE_NOCLOCK))
update_rq_clock(rq);
- if (!(flags & DEQUEUE_SAVE))
+ if (!(flags & DEQUEUE_SAVE)) {
sched_info_dequeued(rq, p);
+ psi_dequeue(p, flags & DEQUEUE_SLEEP);
+ }
p->sched_class->dequeue_task(rq, p, flags);
}
@@ -911,7 +958,7 @@ static struct rq *move_queued_task(struct rq *rq, struct rq_flags *rf,
{
lockdep_assert_held(&rq->lock);
- p->on_rq = TASK_ON_RQ_MIGRATING;
+ WRITE_ONCE(p->on_rq, TASK_ON_RQ_MIGRATING);
dequeue_task(rq, p, DEQUEUE_NOCLOCK);
set_task_cpu(p, new_cpu);
rq_unlock(rq, rf);
@@ -2037,6 +2084,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags);
if (task_cpu(p) != cpu) {
wake_flags |= WF_MIGRATED;
+ psi_ttwu_dequeue(p);
set_task_cpu(p, cpu);
}
@@ -2172,6 +2220,9 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
INIT_HLIST_HEAD(&p->preempt_notifiers);
#endif
+#ifdef CONFIG_COMPACTION
+ p->capture_control = NULL;
+#endif
init_numa_balancing(clone_flags, p);
}
@@ -2413,7 +2464,7 @@ void wake_up_new_task(struct task_struct *p)
#endif
rq = __task_rq_lock(p, &rf);
update_rq_clock(rq);
- post_init_entity_util_avg(&p->se);
+ post_init_entity_util_avg(p);
activate_task(rq, p, ENQUEUE_NOCLOCK);
p->on_rq = TASK_ON_RQ_QUEUED;
@@ -2852,7 +2903,7 @@ unsigned long nr_running(void)
* preemption, thus the result might have a time-of-check-to-time-of-use
* race. The caller is responsible to use it correctly, for example:
*
- * - from a non-preemptable section (of course)
+ * - from a non-preemptible section (of course)
*
* - from a thread that is bound to a single CPU
*
@@ -2876,6 +2927,18 @@ unsigned long long nr_context_switches(void)
}
/*
+ * Consumers of these two interfaces, like for example the cpuidle menu
+ * governor, are using nonsensical data. Preferring shallow idle state selection
+ * for a CPU that has IO-wait which might not even end up running the task when
+ * it does become runnable.
+ */
+
+unsigned long nr_iowait_cpu(int cpu)
+{
+ return atomic_read(&cpu_rq(cpu)->nr_iowait);
+}
+
+/*
* IO-wait accounting, and how its mostly bollocks (on SMP).
*
* The idea behind IO-wait account is to account the idle time that we could
@@ -2910,31 +2973,11 @@ unsigned long nr_iowait(void)
unsigned long i, sum = 0;
for_each_possible_cpu(i)
- sum += atomic_read(&cpu_rq(i)->nr_iowait);
+ sum += nr_iowait_cpu(i);
return sum;
}
-/*
- * Consumers of these two interfaces, like for example the cpuidle menu
- * governor, are using nonsensical data. Preferring shallow idle state selection
- * for a CPU that has IO-wait which might not even end up running the task when
- * it does become runnable.
- */
-
-unsigned long nr_iowait_cpu(int cpu)
-{
- struct rq *this = cpu_rq(cpu);
- return atomic_read(&this->nr_iowait);
-}
-
-void get_iowait_load(unsigned long *nr_waiters, unsigned long *load)
-{
- struct rq *rq = this_rq();
- *nr_waiters = atomic_read(&rq->nr_iowait);
- *load = rq->load.weight;
-}
-
#ifdef CONFIG_SMP
/*
@@ -3051,6 +3094,7 @@ void scheduler_tick(void)
curr->sched_class->task_tick(rq, curr, 0);
cpu_load_update_active(rq);
calc_global_load_tick(rq);
+ psi_task_tick(rq);
rq_unlock(rq, &rf);
@@ -3418,7 +3462,7 @@ static void __sched notrace __schedule(bool preempt)
switch_count = &prev->nivcsw;
if (!preempt && prev->state) {
- if (unlikely(signal_pending_state(prev->state, prev))) {
+ if (signal_pending_state(prev->state, prev)) {
prev->state = TASK_RUNNING;
} else {
deactivate_task(rq, prev, DEQUEUE_SLEEP | DEQUEUE_NOCLOCK);
@@ -4193,7 +4237,7 @@ recheck:
* Treat SCHED_IDLE as nice 20. Only allow a switch to
* SCHED_NORMAL if the RLIMIT_NICE would normally permit it.
*/
- if (idle_policy(p->policy) && !idle_policy(policy)) {
+ if (task_has_idle_policy(p) && !idle_policy(policy)) {
if (!can_nice(p, task_nice(p)))
return -EPERM;
}
@@ -4452,7 +4496,7 @@ static int sched_copy_attr(struct sched_attr __user *uattr, struct sched_attr *a
u32 size;
int ret;
- if (!access_ok(VERIFY_WRITE, uattr, SCHED_ATTR_SIZE_VER0))
+ if (!access_ok(uattr, SCHED_ATTR_SIZE_VER0))
return -EFAULT;
/* Zero the full structure, so that a short copy will be nice: */
@@ -4652,7 +4696,7 @@ static int sched_read_attr(struct sched_attr __user *uattr,
{
int ret;
- if (!access_ok(VERIFY_WRITE, uattr, usize))
+ if (!access_ok(uattr, usize))
return -EFAULT;
/*
@@ -4933,9 +4977,7 @@ static void do_sched_yield(void)
struct rq_flags rf;
struct rq *rq;
- local_irq_disable();
- rq = this_rq();
- rq_lock(rq, &rf);
+ rq = this_rq_lock_irq(&rf);
schedstat_inc(rq->yld_count);
current->sched_class->yield_task(rq);
@@ -5256,9 +5298,8 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
}
#ifdef CONFIG_COMPAT_32BIT_TIME
-COMPAT_SYSCALL_DEFINE2(sched_rr_get_interval,
- compat_pid_t, pid,
- struct old_timespec32 __user *, interval)
+SYSCALL_DEFINE2(sched_rr_get_interval_time32, pid_t, pid,
+ struct old_timespec32 __user *, interval)
{
struct timespec64 t;
int retval = sched_rr_get_interval(pid, &t);
@@ -5742,15 +5783,10 @@ int sched_cpu_activate(unsigned int cpu)
#ifdef CONFIG_SCHED_SMT
/*
- * The sched_smt_present static key needs to be evaluated on every
- * hotplug event because at boot time SMT might be disabled when
- * the number of booted CPUs is limited.
- *
- * If then later a sibling gets hotplugged, then the key would stay
- * off and SMT scheduling would never be functional.
+ * When going up, increment the number of cores with SMT present.
*/
- if (cpumask_weight(cpu_smt_mask(cpu)) > 1)
- static_branch_enable_cpuslocked(&sched_smt_present);
+ if (cpumask_weight(cpu_smt_mask(cpu)) == 2)
+ static_branch_inc_cpuslocked(&sched_smt_present);
#endif
set_cpu_active(cpu, true);
@@ -5792,7 +5828,15 @@ int sched_cpu_deactivate(unsigned int cpu)
*
* Do sync before park smpboot threads to take care the rcu boost case.
*/
- synchronize_rcu_mult(call_rcu, call_rcu_sched);
+ synchronize_rcu();
+
+#ifdef CONFIG_SCHED_SMT
+ /*
+ * When going down, decrement the number of cores with SMT present.
+ */
+ if (cpumask_weight(cpu_smt_mask(cpu)) == 2)
+ static_branch_dec_cpuslocked(&sched_smt_present);
+#endif
if (!sched_smp_initialized)
return 0;
@@ -6069,6 +6113,8 @@ void __init sched_init(void)
init_schedstats();
+ psi_init();
+
scheduler_running = 1;
}
@@ -6145,6 +6191,34 @@ void ___might_sleep(const char *file, int line, int preempt_offset)
add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
}
EXPORT_SYMBOL(___might_sleep);
+
+void __cant_sleep(const char *file, int line, int preempt_offset)
+{
+ static unsigned long prev_jiffy;
+
+ if (irqs_disabled())
+ return;
+
+ if (!IS_ENABLED(CONFIG_PREEMPT_COUNT))
+ return;
+
+ if (preempt_count() > preempt_offset)
+ return;
+
+ if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
+ return;
+ prev_jiffy = jiffies;
+
+ printk(KERN_ERR "BUG: assuming atomic context at %s:%d\n", file, line);
+ printk(KERN_ERR "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n",
+ in_atomic(), irqs_disabled(),
+ current->pid, current->comm);
+
+ debug_show_held_locks(current);
+ dump_stack();
+ add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
+}
+EXPORT_SYMBOL_GPL(__cant_sleep);
#endif
#ifdef CONFIG_MAGIC_SYSRQ