aboutsummaryrefslogtreecommitdiff
path: root/kernel/sched/fair.c
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/sched/fair.c')
-rw-r--r--kernel/sched/fair.c894
1 files changed, 631 insertions, 263 deletions
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index ee271bb661cc..ea74d43924b2 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -38,7 +38,7 @@
* (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)
*/
unsigned int sysctl_sched_latency = 6000000ULL;
-unsigned int normalized_sysctl_sched_latency = 6000000ULL;
+static unsigned int normalized_sysctl_sched_latency = 6000000ULL;
/*
* The initial- and re-scaling of tunables is configurable
@@ -58,8 +58,8 @@ enum sched_tunable_scaling sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_L
*
* (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)
*/
-unsigned int sysctl_sched_min_granularity = 750000ULL;
-unsigned int normalized_sysctl_sched_min_granularity = 750000ULL;
+unsigned int sysctl_sched_min_granularity = 750000ULL;
+static unsigned int normalized_sysctl_sched_min_granularity = 750000ULL;
/*
* This value is kept at sysctl_sched_latency/sysctl_sched_min_granularity
@@ -81,8 +81,8 @@ unsigned int sysctl_sched_child_runs_first __read_mostly;
*
* (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
*/
-unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
-unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;
+unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
+static unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;
const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
@@ -94,6 +94,14 @@ int __weak arch_asym_cpu_priority(int cpu)
{
return -cpu;
}
+
+/*
+ * The margin used when comparing utilization with CPU capacity:
+ * util * margin < capacity * 1024
+ *
+ * (default: ~20%)
+ */
+static unsigned int capacity_margin = 1280;
#endif
#ifdef CONFIG_CFS_BANDWIDTH
@@ -110,14 +118,6 @@ int __weak arch_asym_cpu_priority(int cpu)
unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
#endif
-/*
- * The margin used when comparing utilization with CPU capacity:
- * util * margin < capacity * 1024
- *
- * (default: ~20%)
- */
-unsigned int capacity_margin = 1280;
-
static inline void update_load_add(struct load_weight *lw, unsigned long inc)
{
lw->weight += inc;
@@ -248,13 +248,6 @@ const struct sched_class fair_sched_class;
*/
#ifdef CONFIG_FAIR_GROUP_SCHED
-
-/* cpu runqueue to which this cfs_rq is attached */
-static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
-{
- return cfs_rq->rq;
-}
-
static inline struct task_struct *task_of(struct sched_entity *se)
{
SCHED_WARN_ON(!entity_is_task(se));
@@ -282,76 +275,99 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
return grp->my_q;
}
-static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
+static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
{
- if (!cfs_rq->on_list) {
- struct rq *rq = rq_of(cfs_rq);
- int cpu = cpu_of(rq);
+ struct rq *rq = rq_of(cfs_rq);
+ int cpu = cpu_of(rq);
+
+ if (cfs_rq->on_list)
+ return rq->tmp_alone_branch == &rq->leaf_cfs_rq_list;
+
+ cfs_rq->on_list = 1;
+
+ /*
+ * Ensure we either appear before our parent (if already
+ * enqueued) or force our parent to appear after us when it is
+ * enqueued. The fact that we always enqueue bottom-up
+ * reduces this to two cases and a special case for the root
+ * cfs_rq. Furthermore, it also means that we will always reset
+ * tmp_alone_branch either when the branch is connected
+ * to a tree or when we reach the top of the tree
+ */
+ if (cfs_rq->tg->parent &&
+ cfs_rq->tg->parent->cfs_rq[cpu]->on_list) {
/*
- * Ensure we either appear before our parent (if already
- * enqueued) or force our parent to appear after us when it is
- * enqueued. The fact that we always enqueue bottom-up
- * reduces this to two cases and a special case for the root
- * cfs_rq. Furthermore, it also means that we will always reset
- * tmp_alone_branch either when the branch is connected
- * to a tree or when we reach the beg of the tree
+ * If parent is already on the list, we add the child
+ * just before. Thanks to circular linked property of
+ * the list, this means to put the child at the tail
+ * of the list that starts by parent.
*/
- if (cfs_rq->tg->parent &&
- cfs_rq->tg->parent->cfs_rq[cpu]->on_list) {
- /*
- * If parent is already on the list, we add the child
- * just before. Thanks to circular linked property of
- * the list, this means to put the child at the tail
- * of the list that starts by parent.
- */
- list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
- &(cfs_rq->tg->parent->cfs_rq[cpu]->leaf_cfs_rq_list));
- /*
- * The branch is now connected to its tree so we can
- * reset tmp_alone_branch to the beginning of the
- * list.
- */
- rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
- } else if (!cfs_rq->tg->parent) {
- /*
- * cfs rq without parent should be put
- * at the tail of the list.
- */
- list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
- &rq->leaf_cfs_rq_list);
- /*
- * We have reach the beg of a tree so we can reset
- * tmp_alone_branch to the beginning of the list.
- */
- rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
- } else {
- /*
- * The parent has not already been added so we want to
- * make sure that it will be put after us.
- * tmp_alone_branch points to the beg of the branch
- * where we will add parent.
- */
- list_add_rcu(&cfs_rq->leaf_cfs_rq_list,
- rq->tmp_alone_branch);
- /*
- * update tmp_alone_branch to points to the new beg
- * of the branch
- */
- rq->tmp_alone_branch = &cfs_rq->leaf_cfs_rq_list;
- }
+ list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
+ &(cfs_rq->tg->parent->cfs_rq[cpu]->leaf_cfs_rq_list));
+ /*
+ * The branch is now connected to its tree so we can
+ * reset tmp_alone_branch to the beginning of the
+ * list.
+ */
+ rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
+ return true;
+ }
- cfs_rq->on_list = 1;
+ if (!cfs_rq->tg->parent) {
+ /*
+ * cfs rq without parent should be put
+ * at the tail of the list.
+ */
+ list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
+ &rq->leaf_cfs_rq_list);
+ /*
+ * We have reach the top of a tree so we can reset
+ * tmp_alone_branch to the beginning of the list.
+ */
+ rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
+ return true;
}
+
+ /*
+ * The parent has not already been added so we want to
+ * make sure that it will be put after us.
+ * tmp_alone_branch points to the begin of the branch
+ * where we will add parent.
+ */
+ list_add_rcu(&cfs_rq->leaf_cfs_rq_list, rq->tmp_alone_branch);
+ /*
+ * update tmp_alone_branch to points to the new begin
+ * of the branch
+ */
+ rq->tmp_alone_branch = &cfs_rq->leaf_cfs_rq_list;
+ return false;
}
static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
{
if (cfs_rq->on_list) {
+ struct rq *rq = rq_of(cfs_rq);
+
+ /*
+ * With cfs_rq being unthrottled/throttled during an enqueue,
+ * it can happen the tmp_alone_branch points the a leaf that
+ * we finally want to del. In this case, tmp_alone_branch moves
+ * to the prev element but it will point to rq->leaf_cfs_rq_list
+ * at the end of the enqueue.
+ */
+ if (rq->tmp_alone_branch == &cfs_rq->leaf_cfs_rq_list)
+ rq->tmp_alone_branch = cfs_rq->leaf_cfs_rq_list.prev;
+
list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
cfs_rq->on_list = 0;
}
}
+static inline void assert_list_leaf_cfs_rq(struct rq *rq)
+{
+ SCHED_WARN_ON(rq->tmp_alone_branch != &rq->leaf_cfs_rq_list);
+}
+
/* Iterate thr' all leaf cfs_rq's on a runqueue */
#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) \
list_for_each_entry_safe(cfs_rq, pos, &rq->leaf_cfs_rq_list, \
@@ -411,12 +427,6 @@ static inline struct task_struct *task_of(struct sched_entity *se)
return container_of(se, struct task_struct, se);
}
-static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
-{
- return container_of(cfs_rq, struct rq, cfs);
-}
-
-
#define for_each_sched_entity(se) \
for (; se; se = NULL)
@@ -439,14 +449,19 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
return NULL;
}
-static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
+static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
{
+ return true;
}
static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
{
}
+static inline void assert_list_leaf_cfs_rq(struct rq *rq)
+{
+}
+
#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) \
for (cfs_rq = &rq->cfs, pos = NULL; cfs_rq; cfs_rq = pos)
@@ -687,9 +702,8 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
return calc_delta_fair(sched_slice(cfs_rq, se), se);
}
-#ifdef CONFIG_SMP
#include "pelt.h"
-#include "sched-pelt.h"
+#ifdef CONFIG_SMP
static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu);
static unsigned long task_h_load(struct task_struct *p);
@@ -703,9 +717,9 @@ void init_entity_runnable_average(struct sched_entity *se)
memset(sa, 0, sizeof(*sa));
/*
- * Tasks are intialized with full load to be seen as heavy tasks until
+ * Tasks are initialized with full load to be seen as heavy tasks until
* they get a chance to stabilize to their real load level.
- * Group entities are intialized with zero load to reflect the fact that
+ * Group entities are initialized with zero load to reflect the fact that
* nothing has been attached to the task group yet.
*/
if (entity_is_task(se))
@@ -745,8 +759,9 @@ static void attach_entity_cfs_rq(struct sched_entity *se);
* Finally, that extrapolated util_avg is clamped to the cap (util_avg_cap)
* if util_avg > util_avg_cap.
*/
-void post_init_entity_util_avg(struct sched_entity *se)
+void post_init_entity_util_avg(struct task_struct *p)
{
+ struct sched_entity *se = &p->se;
struct cfs_rq *cfs_rq = cfs_rq_of(se);
struct sched_avg *sa = &se->avg;
long cpu_scale = arch_scale_cpu_capacity(NULL, cpu_of(rq_of(cfs_rq)));
@@ -764,22 +779,19 @@ void post_init_entity_util_avg(struct sched_entity *se)
}
}
- if (entity_is_task(se)) {
- struct task_struct *p = task_of(se);
- if (p->sched_class != &fair_sched_class) {
- /*
- * For !fair tasks do:
- *
- update_cfs_rq_load_avg(now, cfs_rq);
- attach_entity_load_avg(cfs_rq, se, 0);
- switched_from_fair(rq, p);
- *
- * such that the next switched_to_fair() has the
- * expected state.
- */
- se->avg.last_update_time = cfs_rq_clock_task(cfs_rq);
- return;
- }
+ if (p->sched_class != &fair_sched_class) {
+ /*
+ * For !fair tasks do:
+ *
+ update_cfs_rq_load_avg(now, cfs_rq);
+ attach_entity_load_avg(cfs_rq, se, 0);
+ switched_from_fair(rq, p);
+ *
+ * such that the next switched_to_fair() has the
+ * expected state.
+ */
+ se->avg.last_update_time = cfs_rq_clock_pelt(cfs_rq);
+ return;
}
attach_entity_cfs_rq(se);
@@ -789,7 +801,7 @@ void post_init_entity_util_avg(struct sched_entity *se)
void init_entity_runnable_average(struct sched_entity *se)
{
}
-void post_init_entity_util_avg(struct sched_entity *se)
+void post_init_entity_util_avg(struct task_struct *p)
{
}
static void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
@@ -1036,7 +1048,7 @@ unsigned int sysctl_numa_balancing_scan_size = 256;
unsigned int sysctl_numa_balancing_scan_delay = 1000;
struct numa_group {
- atomic_t refcount;
+ refcount_t refcount;
spinlock_t lock; /* nr_tasks, tasks */
int nr_tasks;
@@ -1105,7 +1117,7 @@ static unsigned int task_scan_start(struct task_struct *p)
unsigned long shared = group_faults_shared(ng);
unsigned long private = group_faults_priv(ng);
- period *= atomic_read(&ng->refcount);
+ period *= refcount_read(&ng->refcount);
period *= shared + 1;
period /= private + shared + 1;
}
@@ -1128,7 +1140,7 @@ static unsigned int task_scan_max(struct task_struct *p)
unsigned long private = group_faults_priv(ng);
unsigned long period = smax;
- period *= atomic_read(&ng->refcount);
+ period *= refcount_read(&ng->refcount);
period *= shared + 1;
period /= private + shared + 1;
@@ -1161,7 +1173,7 @@ void init_numa_balancing(unsigned long clone_flags, struct task_struct *p)
/* New address space, reset the preferred nid */
if (!(clone_flags & CLONE_VM)) {
- p->numa_preferred_nid = -1;
+ p->numa_preferred_nid = NUMA_NO_NODE;
return;
}
@@ -1181,13 +1193,13 @@ void init_numa_balancing(unsigned long clone_flags, struct task_struct *p)
static void account_numa_enqueue(struct rq *rq, struct task_struct *p)
{
- rq->nr_numa_running += (p->numa_preferred_nid != -1);
+ rq->nr_numa_running += (p->numa_preferred_nid != NUMA_NO_NODE);
rq->nr_preferred_running += (p->numa_preferred_nid == task_node(p));
}
static void account_numa_dequeue(struct rq *rq, struct task_struct *p)
{
- rq->nr_numa_running -= (p->numa_preferred_nid != -1);
+ rq->nr_numa_running -= (p->numa_preferred_nid != NUMA_NO_NODE);
rq->nr_preferred_running -= (p->numa_preferred_nid == task_node(p));
}
@@ -1401,7 +1413,7 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
* two full passes of the "multi-stage node selection" test that is
* executed below.
*/
- if ((p->numa_preferred_nid == -1 || p->numa_scan_seq <= 4) &&
+ if ((p->numa_preferred_nid == NUMA_NO_NODE || p->numa_scan_seq <= 4) &&
(cpupid_pid_unset(last_cpupid) || cpupid_match_pid(p, last_cpupid)))
return true;
@@ -1849,7 +1861,7 @@ static void numa_migrate_preferred(struct task_struct *p)
unsigned long interval = HZ;
/* This task has no NUMA fault statistics yet */
- if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults))
+ if (unlikely(p->numa_preferred_nid == NUMA_NO_NODE || !p->numa_faults))
return;
/* Periodically retry migrating the task to the preferred node */
@@ -2096,7 +2108,7 @@ static int preferred_group_nid(struct task_struct *p, int nid)
static void task_numa_placement(struct task_struct *p)
{
- int seq, nid, max_nid = -1;
+ int seq, nid, max_nid = NUMA_NO_NODE;
unsigned long max_faults = 0;
unsigned long fault_types[2] = { 0, 0 };
unsigned long total_faults;
@@ -2204,12 +2216,12 @@ static void task_numa_placement(struct task_struct *p)
static inline int get_numa_group(struct numa_group *grp)
{
- return atomic_inc_not_zero(&grp->refcount);
+ return refcount_inc_not_zero(&grp->refcount);
}
static inline void put_numa_group(struct numa_group *grp)
{
- if (atomic_dec_and_test(&grp->refcount))
+ if (refcount_dec_and_test(&grp->refcount))
kfree_rcu(grp, rcu);
}
@@ -2230,7 +2242,7 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags,
if (!grp)
return;
- atomic_set(&grp->refcount, 1);
+ refcount_set(&grp->refcount, 1);
grp->active_nodes = 1;
grp->max_faults_cpu = 0;
spin_lock_init(&grp->lock);
@@ -2400,8 +2412,8 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
local = 1;
/*
- * Retry task to preferred node migration periodically, in case it
- * case it previously failed, or the scheduler moved us.
+ * Retry to migrate task to preferred node periodically, in case it
+ * previously failed, or the scheduler moved us.
*/
if (time_after(jiffies, p->numa_migrate_retry)) {
task_numa_placement(p);
@@ -2639,7 +2651,8 @@ static void update_scan_period(struct task_struct *p, int new_cpu)
* the preferred node.
*/
if (dst_nid == p->numa_preferred_nid ||
- (p->numa_preferred_nid != -1 && src_nid != p->numa_preferred_nid))
+ (p->numa_preferred_nid != NUMA_NO_NODE &&
+ src_nid != p->numa_preferred_nid))
return;
}
@@ -2734,6 +2747,17 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
WRITE_ONCE(*ptr, res); \
} while (0)
+/*
+ * Remove and clamp on negative, from a local variable.
+ *
+ * A variant of sub_positive(), which does not use explicit load-store
+ * and is thus optimized for local variable updates.
+ */
+#define lsub_positive(_ptr, _val) do { \
+ typeof(_ptr) ptr = (_ptr); \
+ *ptr -= min_t(typeof(*ptr), *ptr, _val); \
+} while (0)
+
#ifdef CONFIG_SMP
static inline void
enqueue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
@@ -3112,7 +3136,7 @@ void set_task_rq_fair(struct sched_entity *se,
p_last_update_time = prev->avg.last_update_time;
n_last_update_time = next->avg.last_update_time;
#endif
- __update_load_avg_blocked_se(p_last_update_time, cpu_of(rq_of(prev)), se);
+ __update_load_avg_blocked_se(p_last_update_time, se);
se->avg.last_update_time = n_last_update_time;
}
@@ -3247,11 +3271,11 @@ update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cf
/*
* runnable_sum can't be lower than running_sum
- * As running sum is scale with CPU capacity wehreas the runnable sum
- * is not we rescale running_sum 1st
+ * Rescale running sum to be in the same range as runnable sum
+ * running_sum is in [0 : LOAD_AVG_MAX << SCHED_CAPACITY_SHIFT]
+ * runnable_sum is in [0 : LOAD_AVG_MAX]
*/
- running_sum = se->avg.util_sum /
- arch_scale_cpu_capacity(NULL, cpu_of(rq_of(cfs_rq)));
+ running_sum = se->avg.util_sum >> SCHED_CAPACITY_SHIFT;
runnable_sum = max(runnable_sum, running_sum);
load_sum = (s64)se_weight(se) * runnable_sum;
@@ -3354,7 +3378,7 @@ static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum
/**
* update_cfs_rq_load_avg - update the cfs_rq's load/util averages
- * @now: current time, as per cfs_rq_clock_task()
+ * @now: current time, as per cfs_rq_clock_pelt()
* @cfs_rq: cfs_rq to update
*
* The cfs_rq avg is the direct sum of all its entities (blocked and runnable)
@@ -3399,7 +3423,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
decayed = 1;
}
- decayed |= __update_load_avg_cfs_rq(now, cpu_of(rq_of(cfs_rq)), cfs_rq);
+ decayed |= __update_load_avg_cfs_rq(now, cfs_rq);
#ifndef CONFIG_64BIT
smp_wmb();
@@ -3489,9 +3513,7 @@ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
/* Update task and its cfs_rq load average */
static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
- u64 now = cfs_rq_clock_task(cfs_rq);
- struct rq *rq = rq_of(cfs_rq);
- int cpu = cpu_of(rq);
+ u64 now = cfs_rq_clock_pelt(cfs_rq);
int decayed;
/*
@@ -3499,7 +3521,7 @@ static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
* track group sched_entity load average for task_h_load calc in migration
*/
if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD))
- __update_load_avg_se(now, cpu, cfs_rq, se);
+ __update_load_avg_se(now, cfs_rq, se);
decayed = update_cfs_rq_load_avg(now, cfs_rq);
decayed |= propagate_entity_load_avg(se);
@@ -3551,7 +3573,7 @@ void sync_entity_load_avg(struct sched_entity *se)
u64 last_update_time;
last_update_time = cfs_rq_last_update_time(cfs_rq);
- __update_load_avg_blocked_se(last_update_time, cpu_of(rq_of(cfs_rq)), se);
+ __update_load_avg_blocked_se(last_update_time, se);
}
/*
@@ -3567,10 +3589,6 @@ void remove_entity_load_avg(struct sched_entity *se)
* tasks cannot exit without having gone through wake_up_new_task() ->
* post_init_entity_util_avg() which will have added things to the
* cfs_rq, so we can remove unconditionally.
- *
- * Similarly for groups, they will have passed through
- * post_init_entity_util_avg() before unregister_sched_fair_group()
- * calls this.
*/
sync_entity_load_avg(se);
@@ -3604,7 +3622,7 @@ static inline unsigned long _task_util_est(struct task_struct *p)
{
struct util_est ue = READ_ONCE(p->se.avg.util_est);
- return max(ue.ewma, ue.enqueued);
+ return (max(ue.ewma, ue.enqueued) | UTIL_AVG_UNCHANGED);
}
static inline unsigned long task_util_est(struct task_struct *p)
@@ -3622,7 +3640,7 @@ static inline void util_est_enqueue(struct cfs_rq *cfs_rq,
/* Update root cfs_rq's estimated utilization */
enqueued = cfs_rq->avg.util_est.enqueued;
- enqueued += (_task_util_est(p) | UTIL_AVG_UNCHANGED);
+ enqueued += _task_util_est(p);
WRITE_ONCE(cfs_rq->avg.util_est.enqueued, enqueued);
}
@@ -3644,14 +3662,14 @@ util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep)
{
long last_ewma_diff;
struct util_est ue;
+ int cpu;
if (!sched_feat(UTIL_EST))
return;
/* Update root cfs_rq's estimated utilization */
ue.enqueued = cfs_rq->avg.util_est.enqueued;
- ue.enqueued -= min_t(unsigned int, ue.enqueued,
- (_task_util_est(p) | UTIL_AVG_UNCHANGED));
+ ue.enqueued -= min_t(unsigned int, ue.enqueued, _task_util_est(p));
WRITE_ONCE(cfs_rq->avg.util_est.enqueued, ue.enqueued);
/*
@@ -3679,6 +3697,14 @@ util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep)
return;
/*
+ * To avoid overestimation of actual task utilization, skip updates if
+ * we cannot grant there is idle time in this CPU.
+ */
+ cpu = cpu_of(rq_of(cfs_rq));
+ if (task_util(p) > capacity_orig_of(cpu))
+ return;
+
+ /*
* Update Task's estimated utilization
*
* When *p completes an activation we can consolidate another sample
@@ -3966,8 +3992,8 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
/*
* When dequeuing a sched_entity, we must:
* - Update loads to have both entity and cfs_rq synced with now.
- * - Substract its load from the cfs_rq->runnable_avg.
- * - Substract its previous weight from cfs_rq->load.weight.
+ * - Subtract its load from the cfs_rq->runnable_avg.
+ * - Subtract its previous weight from cfs_rq->load.weight.
* - For group entity, update its weight to reflect the new share
* of its group cfs_rq.
*/
@@ -4208,7 +4234,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
#ifdef CONFIG_CFS_BANDWIDTH
-#ifdef HAVE_JUMP_LABEL
+#ifdef CONFIG_JUMP_LABEL
static struct static_key __cfs_bandwidth_used;
static inline bool cfs_bandwidth_used(void)
@@ -4225,7 +4251,7 @@ void cfs_bandwidth_usage_dec(void)
{
static_key_slow_dec_cpuslocked(&__cfs_bandwidth_used);
}
-#else /* HAVE_JUMP_LABEL */
+#else /* CONFIG_JUMP_LABEL */
static bool cfs_bandwidth_used(void)
{
return true;
@@ -4233,7 +4259,7 @@ static bool cfs_bandwidth_used(void)
void cfs_bandwidth_usage_inc(void) {}
void cfs_bandwidth_usage_dec(void) {}
-#endif /* HAVE_JUMP_LABEL */
+#endif /* CONFIG_JUMP_LABEL */
/*
* default period for cfs group bandwidth.
@@ -4420,6 +4446,10 @@ static int tg_unthrottle_up(struct task_group *tg, void *data)
/* adjust cfs_rq_clock_task() */
cfs_rq->throttled_clock_task_time += rq_clock_task(rq) -
cfs_rq->throttled_clock_task;
+
+ /* Add cfs_rq with already running entity in the list */
+ if (cfs_rq->nr_running >= 1)
+ list_add_leaf_cfs_rq(cfs_rq);
}
return 0;
@@ -4431,8 +4461,10 @@ static int tg_throttle_down(struct task_group *tg, void *data)
struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
/* group is entering throttled state, stop time */
- if (!cfs_rq->throttle_count)
+ if (!cfs_rq->throttle_count) {
cfs_rq->throttled_clock_task = rq_clock_task(rq);
+ list_del_leaf_cfs_rq(cfs_rq);
+ }
cfs_rq->throttle_count++;
return 0;
@@ -4535,6 +4567,8 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
break;
}
+ assert_list_leaf_cfs_rq(rq);
+
if (!se)
add_nr_running(rq, task_delta);
@@ -4556,7 +4590,7 @@ static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
struct rq *rq = rq_of(cfs_rq);
struct rq_flags rf;
- rq_lock(rq, &rf);
+ rq_lock_irqsave(rq, &rf);
if (!cfs_rq_throttled(cfs_rq))
goto next;
@@ -4573,7 +4607,7 @@ static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
unthrottle_cfs_rq(cfs_rq);
next:
- rq_unlock(rq, &rf);
+ rq_unlock_irqrestore(rq, &rf);
if (!remaining)
break;
@@ -4589,7 +4623,7 @@ next:
* period the timer is deactivated until scheduling resumes; cfs_b->idle is
* used to track this state.
*/
-static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
+static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, unsigned long flags)
{
u64 runtime, runtime_expires;
int throttled;
@@ -4631,16 +4665,16 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
while (throttled && cfs_b->runtime > 0 && !cfs_b->distribute_running) {
runtime = cfs_b->runtime;
cfs_b->distribute_running = 1;
- raw_spin_unlock(&cfs_b->lock);
+ raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
/* we can't nest cfs_b->lock while distributing bandwidth */
runtime = distribute_cfs_runtime(cfs_b, runtime,
runtime_expires);
- raw_spin_lock(&cfs_b->lock);
+ raw_spin_lock_irqsave(&cfs_b->lock, flags);
cfs_b->distribute_running = 0;
throttled = !list_empty(&cfs_b->throttled_cfs_rq);
- cfs_b->runtime -= min(runtime, cfs_b->runtime);
+ lsub_positive(&cfs_b->runtime, runtime);
}
/*
@@ -4744,17 +4778,18 @@ static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
{
u64 runtime = 0, slice = sched_cfs_bandwidth_slice();
+ unsigned long flags;
u64 expires;
/* confirm we're still not at a refresh boundary */
- raw_spin_lock(&cfs_b->lock);
+ raw_spin_lock_irqsave(&cfs_b->lock, flags);
if (cfs_b->distribute_running) {
- raw_spin_unlock(&cfs_b->lock);
+ raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
return;
}
if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) {
- raw_spin_unlock(&cfs_b->lock);
+ raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
return;
}
@@ -4765,18 +4800,18 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
if (runtime)
cfs_b->distribute_running = 1;
- raw_spin_unlock(&cfs_b->lock);
+ raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
if (!runtime)
return;
runtime = distribute_cfs_runtime(cfs_b, runtime, expires);
- raw_spin_lock(&cfs_b->lock);
+ raw_spin_lock_irqsave(&cfs_b->lock, flags);
if (expires == cfs_b->runtime_expires)
- cfs_b->runtime -= min(runtime, cfs_b->runtime);
+ lsub_positive(&cfs_b->runtime, runtime);
cfs_b->distribute_running = 0;
- raw_spin_unlock(&cfs_b->lock);
+ raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
}
/*
@@ -4854,20 +4889,21 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
{
struct cfs_bandwidth *cfs_b =
container_of(timer, struct cfs_bandwidth, period_timer);
+ unsigned long flags;
int overrun;
int idle = 0;
- raw_spin_lock(&cfs_b->lock);
+ raw_spin_lock_irqsave(&cfs_b->lock, flags);
for (;;) {
overrun = hrtimer_forward_now(timer, cfs_b->period);
if (!overrun)
break;
- idle = do_sched_cfs_period_timer(cfs_b, overrun);
+ idle = do_sched_cfs_period_timer(cfs_b, overrun, flags);
}
if (idle)
cfs_b->period_active = 0;
- raw_spin_unlock(&cfs_b->lock);
+ raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
}
@@ -4977,6 +5013,12 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
}
#else /* CONFIG_CFS_BANDWIDTH */
+
+static inline bool cfs_bandwidth_used(void)
+{
+ return false;
+}
+
static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
{
return rq_clock_task(rq_of(cfs_rq));
@@ -5072,6 +5114,24 @@ static inline void hrtick_update(struct rq *rq)
}
#endif
+#ifdef CONFIG_SMP
+static inline unsigned long cpu_util(int cpu);
+static unsigned long capacity_of(int cpu);
+
+static inline bool cpu_overutilized(int cpu)
+{
+ return (capacity_of(cpu) * 1024) < (cpu_util(cpu) * capacity_margin);
+}
+
+static inline void update_overutilized_status(struct rq *rq)
+{
+ if (!READ_ONCE(rq->rd->overutilized) && cpu_overutilized(rq->cpu))
+ WRITE_ONCE(rq->rd->overutilized, SG_OVERUTILIZED);
+}
+#else
+static inline void update_overutilized_status(struct rq *rq) { }
+#endif
+
/*
* The enqueue_task method is called before nr_running is
* increased. Here we update the fair scheduling stats and
@@ -5129,8 +5189,43 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
update_cfs_group(se);
}
- if (!se)
+ if (!se) {
add_nr_running(rq, 1);
+ /*
+ * Since new tasks are assigned an initial util_avg equal to
+ * half of the spare capacity of their CPU, tiny tasks have the
+ * ability to cross the overutilized threshold, which will
+ * result in the load balancer ruining all the task placement
+ * done by EAS. As a way to mitigate that effect, do not account
+ * for the first enqueue operation of new tasks during the
+ * overutilized flag detection.
+ *
+ * A better way of solving this problem would be to wait for
+ * the PELT signals of tasks to converge before taking them
+ * into account, but that is not straightforward to implement,
+ * and the following generally works well enough in practice.
+ */
+ if (flags & ENQUEUE_WAKEUP)
+ update_overutilized_status(rq);
+
+ }
+
+ if (cfs_bandwidth_used()) {
+ /*
+ * When bandwidth control is enabled; the cfs_rq_throttled()
+ * breaks in the above iteration can result in incomplete
+ * leaf list maintenance, resulting in triggering the assertion
+ * below.
+ */
+ for_each_sched_entity(se) {
+ cfs_rq = cfs_rq_of(se);
+
+ if (list_add_leaf_cfs_rq(cfs_rq))
+ break;
+ }
+ }
+
+ assert_list_leaf_cfs_rq(rq);
hrtick_update(rq);
}
@@ -5511,11 +5606,6 @@ static unsigned long capacity_of(int cpu)
return cpu_rq(cpu)->cpu_capacity;
}
-static unsigned long capacity_orig_of(int cpu)
-{
- return cpu_rq(cpu)->cpu_capacity_orig;
-}
-
static unsigned long cpu_avg_load_per_task(int cpu)
{
struct rq *rq = cpu_rq(cpu);
@@ -5674,11 +5764,11 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p,
return target;
}
-static unsigned long cpu_util_wake(int cpu, struct task_struct *p);
+static unsigned long cpu_util_without(int cpu, struct task_struct *p);
-static unsigned long capacity_spare_wake(int cpu, struct task_struct *p)
+static unsigned long capacity_spare_without(int cpu, struct task_struct *p)
{
- return max_t(long, capacity_of(cpu) - cpu_util_wake(cpu, p), 0);
+ return max_t(long, capacity_of(cpu) - cpu_util_without(cpu, p), 0);
}
/*
@@ -5738,7 +5828,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
avg_load += cfs_rq_load_avg(&cpu_rq(i)->cfs);
- spare_cap = capacity_spare_wake(i, p);
+ spare_cap = capacity_spare_without(i, p);
if (spare_cap > max_spare_cap)
max_spare_cap = spare_cap;
@@ -5889,8 +5979,8 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p
return prev_cpu;
/*
- * We need task's util for capacity_spare_wake, sync it up to prev_cpu's
- * last_update_time.
+ * We need task's util for capacity_spare_without, sync it up to
+ * prev_cpu's last_update_time.
*/
if (!(sd_flag & SD_BALANCE_FORK))
sync_entity_load_avg(&p->se);
@@ -5935,6 +6025,7 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p
#ifdef CONFIG_SCHED_SMT
DEFINE_STATIC_KEY_FALSE(sched_smt_present);
+EXPORT_SYMBOL_GPL(sched_smt_present);
static inline void set_idle_cores(int cpu, int val)
{
@@ -6007,7 +6098,7 @@ static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int
bool idle = true;
for_each_cpu(cpu, cpu_smt_mask(core)) {
- cpumask_clear_cpu(cpu, cpus);
+ __cpumask_clear_cpu(cpu, cpus);
if (!available_idle_cpu(cpu))
idle = false;
}
@@ -6027,7 +6118,7 @@ static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int
/*
* Scan the local SMT mask for idle CPUs.
*/
-static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
+static int select_idle_smt(struct task_struct *p, int target)
{
int cpu;
@@ -6051,7 +6142,7 @@ static inline int select_idle_core(struct task_struct *p, struct sched_domain *s
return -1;
}
-static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
+static inline int select_idle_smt(struct task_struct *p, int target)
{
return -1;
}
@@ -6156,7 +6247,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
if ((unsigned)i < nr_cpumask_bits)
return i;
- i = select_idle_smt(p, sd, target);
+ i = select_idle_smt(p, target);
if ((unsigned)i < nr_cpumask_bits)
return i;
@@ -6216,10 +6307,19 @@ static inline unsigned long cpu_util(int cpu)
}
/*
- * cpu_util_wake: Compute CPU utilization with any contributions from
- * the waking task p removed.
+ * cpu_util_without: compute cpu utilization without any contributions from *p
+ * @cpu: the CPU which utilization is requested
+ * @p: the task which utilization should be discounted
+ *
+ * The utilization of a CPU is defined by the utilization of tasks currently
+ * enqueued on that CPU as well as tasks which are currently sleeping after an
+ * execution on that CPU.
+ *
+ * This method returns the utilization of the specified CPU by discounting the
+ * utilization of the specified task, whenever the task is currently
+ * contributing to the CPU utilization.
*/
-static unsigned long cpu_util_wake(int cpu, struct task_struct *p)
+static unsigned long cpu_util_without(int cpu, struct task_struct *p)
{
struct cfs_rq *cfs_rq;
unsigned int util;
@@ -6231,8 +6331,8 @@ static unsigned long cpu_util_wake(int cpu, struct task_struct *p)
cfs_rq = &cpu_rq(cpu)->cfs;
util = READ_ONCE(cfs_rq->avg.util_avg);
- /* Discount task's blocked util from CPU's util */
- util -= min_t(unsigned int, util, task_util(p));
+ /* Discount task's util from CPU's util */
+ lsub_positive(&util, task_util(p));
/*
* Covered cases:
@@ -6240,14 +6340,14 @@ static unsigned long cpu_util_wake(int cpu, struct task_struct *p)
* a) if *p is the only task sleeping on this CPU, then:
* cpu_util (== task_util) > util_est (== 0)
* and thus we return:
- * cpu_util_wake = (cpu_util - task_util) = 0
+ * cpu_util_without = (cpu_util - task_util) = 0
*
* b) if other tasks are SLEEPING on this CPU, which is now exiting
* IDLE, then:
* cpu_util >= task_util
* cpu_util > util_est (== 0)
* and thus we discount *p's blocked utilization to return:
- * cpu_util_wake = (cpu_util - task_util) >= 0
+ * cpu_util_without = (cpu_util - task_util) >= 0
*
* c) if other tasks are RUNNABLE on that CPU and
* util_est > cpu_util
@@ -6260,8 +6360,32 @@ static unsigned long cpu_util_wake(int cpu, struct task_struct *p)
* covered by the following code when estimated utilization is
* enabled.
*/
- if (sched_feat(UTIL_EST))
- util = max(util, READ_ONCE(cfs_rq->avg.util_est.enqueued));
+ if (sched_feat(UTIL_EST)) {
+ unsigned int estimated =
+ READ_ONCE(cfs_rq->avg.util_est.enqueued);
+
+ /*
+ * Despite the following checks we still have a small window
+ * for a possible race, when an execl's select_task_rq_fair()
+ * races with LB's detach_task():
+ *
+ * detach_task()
+ * p->on_rq = TASK_ON_RQ_MIGRATING;
+ * ---------------------------------- A
+ * deactivate_task() \
+ * dequeue_task() + RaceTime
+ * util_est_dequeue() /
+ * ---------------------------------- B
+ *
+ * The additional check on "current == p" it's required to
+ * properly fix the execl regression and it helps in further
+ * reducing the chances for the above race.
+ */
+ if (unlikely(task_on_rq_queued(p) || current == p))
+ lsub_positive(&estimated, _task_util_est(p));
+
+ util = max(util, estimated);
+ }
/*
* Utilization (estimated) can exceed the CPU capacity, thus let's
@@ -6299,6 +6423,213 @@ static int wake_cap(struct task_struct *p, int cpu, int prev_cpu)
}
/*
+ * Predicts what cpu_util(@cpu) would return if @p was migrated (and enqueued)
+ * to @dst_cpu.
+ */
+static unsigned long cpu_util_next(int cpu, struct task_struct *p, int dst_cpu)
+{
+ struct cfs_rq *cfs_rq = &cpu_rq(cpu)->cfs;
+ unsigned long util_est, util = READ_ONCE(cfs_rq->avg.util_avg);
+
+ /*
+ * If @p migrates from @cpu to another, remove its contribution. Or,
+ * if @p migrates from another CPU to @cpu, add its contribution. In
+ * the other cases, @cpu is not impacted by the migration, so the
+ * util_avg should already be correct.
+ */
+ if (task_cpu(p) == cpu && dst_cpu != cpu)
+ sub_positive(&util, task_util(p));
+ else if (task_cpu(p) != cpu && dst_cpu == cpu)
+ util += task_util(p);
+
+ if (sched_feat(UTIL_EST)) {
+ util_est = READ_ONCE(cfs_rq->avg.util_est.enqueued);
+
+ /*
+ * During wake-up, the task isn't enqueued yet and doesn't
+ * appear in the cfs_rq->avg.util_est.enqueued of any rq,
+ * so just add it (if needed) to "simulate" what will be
+ * cpu_util() after the task has been enqueued.
+ */
+ if (dst_cpu == cpu)
+ util_est += _task_util_est(p);
+
+ util = max(util, util_est);
+ }
+
+ return min(util, capacity_orig_of(cpu));
+}
+
+/*
+ * compute_energy(): Estimates the energy that would be consumed if @p was
+ * migrated to @dst_cpu. compute_energy() predicts what will be the utilization
+ * landscape of the * CPUs after the task migration, and uses the Energy Model
+ * to compute what would be the energy if we decided to actually migrate that
+ * task.
+ */
+static long
+compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd)
+{
+ long util, max_util, sum_util, energy = 0;
+ int cpu;
+
+ for (; pd; pd = pd->next) {
+ max_util = sum_util = 0;
+ /*
+ * The capacity state of CPUs of the current rd can be driven by
+ * CPUs of another rd if they belong to the same performance
+ * domain. So, account for the utilization of these CPUs too
+ * by masking pd with cpu_online_mask instead of the rd span.
+ *
+ * If an entire performance domain is outside of the current rd,
+ * it will not appear in its pd list and will not be accounted
+ * by compute_energy().
+ */
+ for_each_cpu_and(cpu, perf_domain_span(pd), cpu_online_mask) {
+ util = cpu_util_next(cpu, p, dst_cpu);
+ util = schedutil_energy_util(cpu, util);
+ max_util = max(util, max_util);
+ sum_util += util;
+ }
+
+ energy += em_pd_energy(pd->em_pd, max_util, sum_util);
+ }
+
+ return energy;
+}
+
+/*
+ * find_energy_efficient_cpu(): Find most energy-efficient target CPU for the
+ * waking task. find_energy_efficient_cpu() looks for the CPU with maximum
+ * spare capacity in each performance domain and uses it as a potential
+ * candidate to execute the task. Then, it uses the Energy Model to figure
+ * out which of the CPU candidates is the most energy-efficient.
+ *
+ * The rationale for this heuristic is as follows. In a performance domain,
+ * all the most energy efficient CPU candidates (according to the Energy
+ * Model) are those for which we'll request a low frequency. When there are
+ * several CPUs for which the frequency request will be the same, we don't
+ * have enough data to break the tie between them, because the Energy Model
+ * only includes active power costs. With this model, if we assume that
+ * frequency requests follow utilization (e.g. using schedutil), the CPU with
+ * the maximum spare capacity in a performance domain is guaranteed to be among
+ * the best candidates of the performance domain.
+ *
+ * In practice, it could be preferable from an energy standpoint to pack
+ * small tasks on a CPU in order to let other CPUs go in deeper idle states,
+ * but that could also hurt our chances to go cluster idle, and we have no
+ * ways to tell with the current Energy Model if this is actually a good
+ * idea or not. So, find_energy_efficient_cpu() basically favors
+ * cluster-packing, and spreading inside a cluster. That should at least be
+ * a good thing for latency, and this is consistent with the idea that most
+ * of the energy savings of EAS come from the asymmetry of the system, and
+ * not so much from breaking the tie between identical CPUs. That's also the
+ * reason why EAS is enabled in the topology code only for systems where
+ * SD_ASYM_CPUCAPACITY is set.
+ *
+ * NOTE: Forkees are not accepted in the energy-aware wake-up path because
+ * they don't have any useful utilization data yet and it's not possible to
+ * forecast their impact on energy consumption. Consequently, they will be
+ * placed by find_idlest_cpu() on the least loaded CPU, which might turn out
+ * to be energy-inefficient in some use-cases. The alternative would be to
+ * bias new tasks towards specific types of CPUs first, or to try to infer
+ * their util_avg from the parent task, but those heuristics could hurt
+ * other use-cases too. So, until someone finds a better way to solve this,
+ * let's keep things simple by re-using the existing slow path.
+ */
+
+static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
+{
+ unsigned long prev_energy = ULONG_MAX, best_energy = ULONG_MAX;
+ struct root_domain *rd = cpu_rq(smp_processor_id())->rd;
+ int cpu, best_energy_cpu = prev_cpu;
+ struct perf_domain *head, *pd;
+ unsigned long cpu_cap, util;
+ struct sched_domain *sd;
+
+ rcu_read_lock();
+ pd = rcu_dereference(rd->pd);
+ if (!pd || READ_ONCE(rd->overutilized))
+ goto fail;
+ head = pd;
+
+ /*
+ * Energy-aware wake-up happens on the lowest sched_domain starting
+ * from sd_asym_cpucapacity spanning over this_cpu and prev_cpu.
+ */
+ sd = rcu_dereference(*this_cpu_ptr(&sd_asym_cpucapacity));
+ while (sd && !cpumask_test_cpu(prev_cpu, sched_domain_span(sd)))
+ sd = sd->parent;
+ if (!sd)
+ goto fail;
+
+ sync_entity_load_avg(&p->se);
+ if (!task_util_est(p))
+ goto unlock;
+
+ for (; pd; pd = pd->next) {
+ unsigned long cur_energy, spare_cap, max_spare_cap = 0;
+ int max_spare_cap_cpu = -1;
+
+ for_each_cpu_and(cpu, perf_domain_span(pd), sched_domain_span(sd)) {
+ if (!cpumask_test_cpu(cpu, &p->cpus_allowed))
+ continue;
+
+ /* Skip CPUs that will be overutilized. */
+ util = cpu_util_next(cpu, p, cpu);
+ cpu_cap = capacity_of(cpu);
+ if (cpu_cap * 1024 < util * capacity_margin)
+ continue;
+
+ /* Always use prev_cpu as a candidate. */
+ if (cpu == prev_cpu) {
+ prev_energy = compute_energy(p, prev_cpu, head);
+ best_energy = min(best_energy, prev_energy);
+ continue;
+ }
+
+ /*
+ * Find the CPU with the maximum spare capacity in
+ * the performance domain
+ */
+ spare_cap = cpu_cap - util;
+ if (spare_cap > max_spare_cap) {
+ max_spare_cap = spare_cap;
+ max_spare_cap_cpu = cpu;
+ }
+ }
+
+ /* Evaluate the energy impact of using this CPU. */
+ if (max_spare_cap_cpu >= 0) {
+ cur_energy = compute_energy(p, max_spare_cap_cpu, head);
+ if (cur_energy < best_energy) {
+ best_energy = cur_energy;
+ best_energy_cpu = max_spare_cap_cpu;
+ }
+ }
+ }
+unlock:
+ rcu_read_unlock();
+
+ /*
+ * Pick the best CPU if prev_cpu cannot be used, or if it saves at
+ * least 6% of the energy used by prev_cpu.
+ */
+ if (prev_energy == ULONG_MAX)
+ return best_energy_cpu;
+
+ if ((prev_energy - best_energy) > (prev_energy >> 4))
+ return best_energy_cpu;
+
+ return prev_cpu;
+
+fail:
+ rcu_read_unlock();
+
+ return -1;
+}
+
+/*
* select_task_rq_fair: Select target runqueue for the waking task in domains
* that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE,
* SD_BALANCE_FORK, or SD_BALANCE_EXEC.
@@ -6321,8 +6652,16 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
if (sd_flag & SD_BALANCE_WAKE) {
record_wakee(p);
- want_affine = !wake_wide(p) && !wake_cap(p, cpu, prev_cpu)
- && cpumask_test_cpu(cpu, &p->cpus_allowed);
+
+ if (sched_energy_enabled()) {
+ new_cpu = find_energy_efficient_cpu(p, prev_cpu);
+ if (new_cpu >= 0)
+ return new_cpu;
+ new_cpu = prev_cpu;
+ }
+
+ want_affine = !wake_wide(p) && !wake_cap(p, cpu, prev_cpu) &&
+ cpumask_test_cpu(cpu, &p->cpus_allowed);
}
rcu_read_lock();
@@ -6486,7 +6825,7 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
static void set_last_buddy(struct sched_entity *se)
{
- if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
+ if (entity_is_task(se) && unlikely(task_has_idle_policy(task_of(se))))
return;
for_each_sched_entity(se) {
@@ -6498,7 +6837,7 @@ static void set_last_buddy(struct sched_entity *se)
static void set_next_buddy(struct sched_entity *se)
{
- if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
+ if (entity_is_task(se) && unlikely(task_has_idle_policy(task_of(se))))
return;
for_each_sched_entity(se) {
@@ -6556,8 +6895,8 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
return;
/* Idle tasks are by definition preempted by non-idle tasks. */
- if (unlikely(curr->policy == SCHED_IDLE) &&
- likely(p->policy != SCHED_IDLE))
+ if (unlikely(task_has_idle_policy(curr)) &&
+ likely(!task_has_idle_policy(p)))
goto preempt;
/*
@@ -6733,6 +7072,12 @@ idle:
if (new_tasks > 0)
goto again;
+ /*
+ * rq is about to be idle, check if we need to update the
+ * lost_idle_time of clock_pelt
+ */
+ update_idle_rq_clock_pelt(rq);
+
return NULL;
}
@@ -6978,7 +7323,7 @@ static int task_hot(struct task_struct *p, struct lb_env *env)
if (p->sched_class != &fair_sched_class)
return 0;
- if (unlikely(p->policy == SCHED_IDLE))
+ if (unlikely(task_has_idle_policy(p)))
return 0;
/*
@@ -7388,11 +7733,7 @@ static void update_blocked_averages(int cpu)
for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) {
struct sched_entity *se;
- /* throttled entities do not contribute to load */
- if (throttled_hierarchy(cfs_rq))
- continue;
-
- if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq))
+ if (update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq))
update_tg_load_avg(cfs_rq, 0);
/* Propagate pending load changes to the parent, if any: */
@@ -7413,8 +7754,8 @@ static void update_blocked_averages(int cpu)
}
curr_class = rq->curr->sched_class;
- update_rt_rq_load_avg(rq_clock_task(rq), rq, curr_class == &rt_sched_class);
- update_dl_rq_load_avg(rq_clock_task(rq), rq, curr_class == &dl_sched_class);
+ update_rt_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &rt_sched_class);
+ update_dl_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &dl_sched_class);
update_irq_load_avg(rq, 0);
/* Don't need periodic decay once load/util_avg are null */
if (others_have_blocked(rq))
@@ -7484,11 +7825,11 @@ static inline void update_blocked_averages(int cpu)
rq_lock_irqsave(rq, &rf);
update_rq_clock(rq);
- update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq);
+ update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq);
curr_class = rq->curr->sched_class;
- update_rt_rq_load_avg(rq_clock_task(rq), rq, curr_class == &rt_sched_class);
- update_dl_rq_load_avg(rq_clock_task(rq), rq, curr_class == &dl_sched_class);
+ update_rt_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &rt_sched_class);
+ update_dl_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &dl_sched_class);
update_irq_load_avg(rq, 0);
#ifdef CONFIG_NO_HZ_COMMON
rq->last_blocked_load_update_tick = jiffies;
@@ -7862,16 +8203,16 @@ static bool update_nohz_stats(struct rq *rq, bool force)
* update_sg_lb_stats - Update sched_group's statistics for load balancing.
* @env: The load balancing environment.
* @group: sched_group whose statistics are to be updated.
- * @load_idx: Load index of sched_domain of this_cpu for load calc.
- * @local_group: Does group contain this_cpu.
* @sgs: variable to hold the statistics for this group.
- * @overload: Indicate pullable load (e.g. >1 runnable task).
+ * @sg_status: Holds flag indicating the status of the sched_group
*/
static inline void update_sg_lb_stats(struct lb_env *env,
- struct sched_group *group, int load_idx,
- int local_group, struct sg_lb_stats *sgs,
- bool *overload)
+ struct sched_group *group,
+ struct sg_lb_stats *sgs,
+ int *sg_status)
{
+ int local_group = cpumask_test_cpu(env->dst_cpu, sched_group_span(group));
+ int load_idx = get_sd_load_idx(env->sd, env->idle);
unsigned long load;
int i, nr_running;
@@ -7895,7 +8236,10 @@ static inline void update_sg_lb_stats(struct lb_env *env,
nr_running = rq->nr_running;
if (nr_running > 1)
- *overload = true;
+ *sg_status |= SG_OVERLOAD;
+
+ if (cpu_overutilized(i))
+ *sg_status |= SG_OVERUTILIZED;
#ifdef CONFIG_NUMA_BALANCING
sgs->nr_numa_running += rq->nr_numa_running;
@@ -7911,7 +8255,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
if (env->sd->flags & SD_ASYM_CPUCAPACITY &&
sgs->group_misfit_task_load < rq->misfit_task_load) {
sgs->group_misfit_task_load = rq->misfit_task_load;
- *overload = 1;
+ *sg_status |= SG_OVERLOAD;
}
}
@@ -8056,17 +8400,14 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
struct sched_group *sg = env->sd->groups;
struct sg_lb_stats *local = &sds->local_stat;
struct sg_lb_stats tmp_sgs;
- int load_idx;
- bool overload = false;
bool prefer_sibling = child && child->flags & SD_PREFER_SIBLING;
+ int sg_status = 0;
#ifdef CONFIG_NO_HZ_COMMON
if (env->idle == CPU_NEWLY_IDLE && READ_ONCE(nohz.has_blocked))
env->flags |= LBF_NOHZ_STATS;
#endif
- load_idx = get_sd_load_idx(env->sd, env->idle);
-
do {
struct sg_lb_stats *sgs = &tmp_sgs;
int local_group;
@@ -8081,8 +8422,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
update_group_capacity(env->sd, env->dst_cpu);
}
- update_sg_lb_stats(env, sg, load_idx, local_group, sgs,
- &overload);
+ update_sg_lb_stats(env, sg, sgs, &sg_status);
if (local_group)
goto next_group;
@@ -8131,9 +8471,15 @@ next_group:
env->fbq_type = fbq_classify_group(&sds->busiest_stat);
if (!env->sd->parent) {
+ struct root_domain *rd = env->dst_rq->rd;
+
/* update overload indicator if we are at root domain */
- if (READ_ONCE(env->dst_rq->rd->overload) != overload)
- WRITE_ONCE(env->dst_rq->rd->overload, overload);
+ WRITE_ONCE(rd->overload, sg_status & SG_OVERLOAD);
+
+ /* Update over-utilization (tipping point, U >= 0) indicator */
+ WRITE_ONCE(rd->overutilized, sg_status & SG_OVERUTILIZED);
+ } else if (sg_status & SG_OVERUTILIZED) {
+ WRITE_ONCE(env->dst_rq->rd->overutilized, SG_OVERUTILIZED);
}
}
@@ -8177,9 +8523,7 @@ static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds)
if (sched_asym_prefer(busiest_cpu, env->dst_cpu))
return 0;
- env->imbalance = DIV_ROUND_CLOSEST(
- sds->busiest_stat.avg_load * sds->busiest_stat.group_capacity,
- SCHED_CAPACITY_SCALE);
+ env->imbalance = sds->busiest_stat.group_load;
return 1;
}
@@ -8360,6 +8704,14 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
* this level.
*/
update_sd_lb_stats(env, &sds);
+
+ if (sched_energy_enabled()) {
+ struct root_domain *rd = env->dst_rq->rd;
+
+ if (rcu_dereference(rd->pd) && !READ_ONCE(rd->overutilized))
+ goto out_balanced;
+ }
+
local = &sds.local_stat;
busiest = &sds.busiest_stat;
@@ -8544,21 +8896,25 @@ static struct rq *find_busiest_queue(struct lb_env *env,
*/
#define MAX_PINNED_INTERVAL 512
-static int need_active_balance(struct lb_env *env)
+static inline bool
+asym_active_balance(struct lb_env *env)
{
- struct sched_domain *sd = env->sd;
+ /*
+ * ASYM_PACKING needs to force migrate tasks from busy but
+ * lower priority CPUs in order to pack all tasks in the
+ * highest priority CPUs.
+ */
+ return env->idle != CPU_NOT_IDLE && (env->sd->flags & SD_ASYM_PACKING) &&
+ sched_asym_prefer(env->dst_cpu, env->src_cpu);
+}
- if (env->idle == CPU_NEWLY_IDLE) {
+static inline bool
+voluntary_active_balance(struct lb_env *env)
+{
+ struct sched_domain *sd = env->sd;
- /*
- * ASYM_PACKING needs to force migrate tasks from busy but
- * lower priority CPUs in order to pack all tasks in the
- * highest priority CPUs.
- */
- if ((sd->flags & SD_ASYM_PACKING) &&
- sched_asym_prefer(env->dst_cpu, env->src_cpu))
- return 1;
- }
+ if (asym_active_balance(env))
+ return 1;
/*
* The dst_cpu is idle and the src_cpu CPU has only 1 CFS task.
@@ -8576,6 +8932,16 @@ static int need_active_balance(struct lb_env *env)
if (env->src_grp_type == group_misfit_task)
return 1;
+ return 0;
+}
+
+static int need_active_balance(struct lb_env *env)
+{
+ struct sched_domain *sd = env->sd;
+
+ if (voluntary_active_balance(env))
+ return 1;
+
return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
}
@@ -8740,7 +9106,7 @@ more_balance:
if ((env.flags & LBF_DST_PINNED) && env.imbalance > 0) {
/* Prevent to re-select dst_cpu via env's CPUs */
- cpumask_clear_cpu(env.dst_cpu, env.cpus);
+ __cpumask_clear_cpu(env.dst_cpu, env.cpus);
env.dst_rq = cpu_rq(env.new_dst_cpu);
env.dst_cpu = env.new_dst_cpu;
@@ -8767,7 +9133,7 @@ more_balance:
/* All tasks on this runqueue were pinned by CPU affinity */
if (unlikely(env.flags & LBF_ALL_PINNED)) {
- cpumask_clear_cpu(cpu_of(busiest), cpus);
+ __cpumask_clear_cpu(cpu_of(busiest), cpus);
/*
* Attempting to continue load balancing at the current
* sched_domain level only makes sense if there are
@@ -8837,7 +9203,7 @@ more_balance:
} else
sd->nr_balance_failed = 0;
- if (likely(!active_balance)) {
+ if (likely(!active_balance) || voluntary_active_balance(&env)) {
/* We were unbalanced, so reset the balancing interval */
sd->balance_interval = sd->min_interval;
} else {
@@ -8876,13 +9242,22 @@ out_all_pinned:
sd->nr_balance_failed = 0;
out_one_pinned:
+ ld_moved = 0;
+
+ /*
+ * idle_balance() disregards balance intervals, so we could repeatedly
+ * reach this code, which would lead to balance_interval skyrocketting
+ * in a short amount of time. Skip the balance_interval increase logic
+ * to avoid that.
+ */
+ if (env.idle == CPU_NEWLY_IDLE)
+ goto out;
+
/* tune up the balancing interval */
- if (((env.flags & LBF_ALL_PINNED) &&
- sd->balance_interval < MAX_PINNED_INTERVAL) ||
- (sd->balance_interval < sd->max_interval))
+ if ((env.flags & LBF_ALL_PINNED &&
+ sd->balance_interval < MAX_PINNED_INTERVAL) ||
+ sd->balance_interval < sd->max_interval)
sd->balance_interval *= 2;
-
- ld_moved = 0;
out:
return ld_moved;
}
@@ -9177,15 +9552,8 @@ static void kick_ilb(unsigned int flags)
}
/*
- * Current heuristic for kicking the idle load balancer in the presence
- * of an idle cpu in the system.
- * - This rq has more than one task.
- * - This rq has at least one CFS task and the capacity of the CPU is
- * significantly reduced because of RT tasks or IRQs.
- * - At parent of LLC scheduler domain level, this cpu's scheduler group has
- * multiple busy cpu.
- * - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler
- * domain span are idle.
+ * Current decision point for kicking the idle load balancer in the presence
+ * of idle CPUs in the system.
*/
static void nohz_balancer_kick(struct rq *rq)
{
@@ -9227,8 +9595,13 @@ static void nohz_balancer_kick(struct rq *rq)
sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
if (sds) {
/*
- * XXX: write a coherent comment on why we do this.
- * See also: http://lkml.kernel.org/r/20111202010832.602203411@sbsiddha-desk.sc.intel.com
+ * If there is an imbalance between LLC domains (IOW we could
+ * increase the overall cache use), we need some less-loaded LLC
+ * domain to pull some load. Likewise, we may need to spread
+ * load within the current LLC domain (e.g. packed SMT cores but
+ * other CPUs are idle). We can't really know from here how busy
+ * the others are - so just get a nohz balance going if it looks
+ * like this LLC domain has tasks we could move.
*/
nr_busy = atomic_read(&sds->nr_busy_cpus);
if (nr_busy > 1) {
@@ -9241,19 +9614,15 @@ static void nohz_balancer_kick(struct rq *rq)
sd = rcu_dereference(rq->sd);
if (sd) {
if ((rq->cfs.h_nr_running >= 1) &&
- check_cpu_capacity(rq, sd)) {
+ check_cpu_capacity(rq, sd)) {
flags = NOHZ_KICK_MASK;
goto unlock;
}
}
- sd = rcu_dereference(per_cpu(sd_asym, cpu));
+ sd = rcu_dereference(per_cpu(sd_asym_packing, cpu));
if (sd) {
- for_each_cpu(i, sched_domain_span(sd)) {
- if (i == cpu ||
- !cpumask_test_cpu(i, nohz.idle_cpus_mask))
- continue;
-
+ for_each_cpu_and(i, sched_domain_span(sd), nohz.idle_cpus_mask) {
if (sched_asym_prefer(i, cpu)) {
flags = NOHZ_KICK_MASK;
goto unlock;
@@ -9499,9 +9868,7 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
return false;
}
- /*
- * barrier, pairs with nohz_balance_enter_idle(), ensures ...
- */
+ /* could be _relaxed() */
flags = atomic_fetch_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
if (!(flags & NOHZ_KICK_MASK))
return false;
@@ -9751,6 +10118,7 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
task_tick_numa(rq, curr);
update_misfit_status(curr, rq);
+ update_overutilized_status(task_rq(curr));
}
/*