author    | Linus Torvalds <torvalds@linux-foundation.org> | 2021-04-28 13:33:57 -0700
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2021-04-28 13:33:57 -0700
commit    | 16b3d0cf5bad844daaf436ad2e9061de0fe36e5c (patch)
tree      | d553a51e6d95fb166df7fa62264e9a27e4c438a4 /kernel/sched/psi.c
parent    | 42dec9a936e7696bea1f27d3c5a0068cd9aa95fd (diff)
parent    | 2ea46c6fc9452ac100ad907b051d797225847e33 (diff)
Merge tag 'sched-core-2021-04-28' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler updates from Ingo Molnar:
- Clean up SCHED_DEBUG: move the decades-old mess of sysctl, procfs and
debugfs interfaces to a unified debugfs interface.
- Signals: Allow caching one sigqueue object per task, to improve
performance & latencies.
- Improve newidle_balance() irq-off latencies on systems with a large
number of CPU cgroups.
- Improve energy-aware scheduling
- Improve the PELT metrics for certain workloads
- Reintroduce select_idle_smt() to improve load-balancing locality -
but without the previous regressions
- Add 'scheduler latency debugging': warn after long periods of pending
need_resched. This is an opt-in feature that requires enabling the
LATENCY_WARN scheduler feature or using the resched_latency_warn_ms=xx
boot parameter (see the sketch after this list).
- CPU hotplug fixes for HP-rollback, and for the 'fail' interface. Fix
remaining balance_push() vs. hotplug holes/races
- PSI fixes, plus allow /proc/pressure/ files to be written by
CAP_SYS_RESOURCE tasks as well
- Fix/improve various load-balancing corner cases vs. capacity margins
- Fix sched topology on systems with NUMA diameter of 3 or above
- Fix PF_KTHREAD vs to_kthread() race
- Minor rseq optimizations
- Misc cleanups, optimizations, fixes and smaller updates
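A minimal userspace sketch of the LATENCY_WARN opt-in mentioned above (not part of the pull request itself): it writes the feature name into the unified sched debugfs directory this series introduces. The exact path /sys/kernel/debug/sched/features is an assumption here and needs root, CONFIG_SCHED_DEBUG and a mounted debugfs; the resched_latency_warn_ms=xx boot parameter quoted above is the boot-time alternative.

/*
 * Sketch only: opt in to the LATENCY_WARN scheduler feature at runtime.
 * The debugfs path below is an assumption (the unified sched debugfs
 * directory added by this series); requires root and a mounted debugfs.
 */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	const char *path = "/sys/kernel/debug/sched/features";	/* assumed path */
	const char *feat = "LATENCY_WARN";
	int fd = open(path, O_WRONLY);

	if (fd < 0) {
		perror("open sched features");
		return 1;
	}
	if (write(fd, feat, strlen(feat)) < 0)
		perror("write LATENCY_WARN");
	close(fd);
	return 0;
}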
* tag 'sched-core-2021-04-28' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (61 commits)
cpumask/hotplug: Fix cpu_dying() state tracking
kthread: Fix PF_KTHREAD vs to_kthread() race
sched/debug: Fix cgroup_path[] serialization
sched,psi: Handle potential task count underflow bugs more gracefully
sched: Warn on long periods of pending need_resched
sched/fair: Move update_nohz_stats() to the CONFIG_NO_HZ_COMMON block to simplify the code & fix an unused function warning
sched/debug: Rename the sched_debug parameter to sched_verbose
sched,fair: Alternative sched_slice()
sched: Move /proc/sched_debug to debugfs
sched,debug: Convert sysctl sched_domains to debugfs
debugfs: Implement debugfs_create_str()
sched,preempt: Move preempt_dynamic to debug.c
sched: Move SCHED_DEBUG sysctl to debugfs
sched: Don't make LATENCYTOP select SCHED_DEBUG
sched: Remove sched_schedstats sysctl out from under SCHED_DEBUG
sched/numa: Allow runtime enabling/disabling of NUMA balance without SCHED_DEBUG
sched: Use cpu_dying() to fix balance_push vs hotplug-rollback
cpumask: Introduce DYING mask
cpumask: Make cpu_{online,possible,present,active}() inline
rseq: Optimise rseq_get_rseq_cs() and clear_rseq_cs()
...
Diffstat (limited to 'kernel/sched/psi.c')
-rw-r--r-- | kernel/sched/psi.c | 164
1 file changed, 90 insertions, 74 deletions
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index 967732c0766c..db27b69fa92a 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -34,7 +34,10 @@
  * delayed on that resource such that nobody is advancing and the CPU
  * goes idle. This leaves both workload and CPU unproductive.
  *
- * (Naturally, the FULL state doesn't exist for the CPU resource.)
+ * Naturally, the FULL state doesn't exist for the CPU resource at the
+ * system level, but exist at the cgroup level, means all non-idle tasks
+ * in a cgroup are delayed on the CPU resource which used by others outside
+ * of the cgroup or throttled by the cgroup cpu.max configuration.
  *
  * SOME = nr_delayed_tasks != 0
  * FULL = nr_delayed_tasks != 0 && nr_running_tasks == 0
@@ -59,7 +62,7 @@
  * states, we would have to conclude a CPU SOME pressure number of
  * 100%, since *somebody* is waiting on a runqueue at all
  * times. However, that is clearly not the amount of contention the
- * workload is experiencing: only one out of 256 possible exceution
+ * workload is experiencing: only one out of 256 possible execution
  * threads will be contended at any given time, or about 0.4%.
  *
  * Conversely, consider a scenario of 4 tasks and 4 CPUs where at any
@@ -73,7 +76,7 @@
  * we have to base our calculation on the number of non-idle tasks in
  * conjunction with the number of available CPUs, which is the number
  * of potential execution threads. SOME becomes then the proportion of
- * delayed tasks to possibe threads, and FULL is the share of possible
+ * delayed tasks to possible threads, and FULL is the share of possible
  * threads that are unproductive due to delays:
  *
  *	threads = min(nr_nonidle_tasks, nr_cpus)
@@ -216,15 +219,17 @@ static bool test_state(unsigned int *tasks, enum psi_states state)
 {
 	switch (state) {
 	case PSI_IO_SOME:
-		return tasks[NR_IOWAIT];
+		return unlikely(tasks[NR_IOWAIT]);
 	case PSI_IO_FULL:
-		return tasks[NR_IOWAIT] && !tasks[NR_RUNNING];
+		return unlikely(tasks[NR_IOWAIT] && !tasks[NR_RUNNING]);
 	case PSI_MEM_SOME:
-		return tasks[NR_MEMSTALL];
+		return unlikely(tasks[NR_MEMSTALL]);
 	case PSI_MEM_FULL:
-		return tasks[NR_MEMSTALL] && !tasks[NR_RUNNING];
+		return unlikely(tasks[NR_MEMSTALL] && !tasks[NR_RUNNING]);
 	case PSI_CPU_SOME:
-		return tasks[NR_RUNNING] > tasks[NR_ONCPU];
+		return unlikely(tasks[NR_RUNNING] > tasks[NR_ONCPU]);
+	case PSI_CPU_FULL:
+		return unlikely(tasks[NR_RUNNING] && !tasks[NR_ONCPU]);
 	case PSI_NONIDLE:
 		return tasks[NR_IOWAIT] || tasks[NR_MEMSTALL] ||
 			tasks[NR_RUNNING];
@@ -441,7 +446,7 @@ static void psi_avgs_work(struct work_struct *work)
 	mutex_unlock(&group->avgs_lock);
 }
 
-/* Trigger tracking window manupulations */
+/* Trigger tracking window manipulations */
 static void window_reset(struct psi_window *win, u64 now, u64 value,
 			 u64 prev_growth)
 {
@@ -639,13 +644,10 @@ static void poll_timer_fn(struct timer_list *t)
 	wake_up_interruptible(&group->poll_wait);
 }
 
-static void record_times(struct psi_group_cpu *groupc, int cpu,
-			 bool memstall_tick)
+static void record_times(struct psi_group_cpu *groupc, u64 now)
 {
 	u32 delta;
-	u64 now;
 
-	now = cpu_clock(cpu);
 	delta = now - groupc->state_start;
 	groupc->state_start = now;
 
@@ -659,34 +661,20 @@ static void record_times(struct psi_group_cpu *groupc, int cpu,
 		groupc->times[PSI_MEM_SOME] += delta;
 		if (groupc->state_mask & (1 << PSI_MEM_FULL))
 			groupc->times[PSI_MEM_FULL] += delta;
-		else if (memstall_tick) {
-			u32 sample;
-			/*
-			 * Since we care about lost potential, a
-			 * memstall is FULL when there are no other
-			 * working tasks, but also when the CPU is
-			 * actively reclaiming and nothing productive
-			 * could run even if it were runnable.
-			 *
-			 * When the timer tick sees a reclaiming CPU,
-			 * regardless of runnable tasks, sample a FULL
-			 * tick (or less if it hasn't been a full tick
-			 * since the last state change).
-			 */
-			sample = min(delta, (u32)jiffies_to_nsecs(1));
-			groupc->times[PSI_MEM_FULL] += sample;
-		}
 	}
 
-	if (groupc->state_mask & (1 << PSI_CPU_SOME))
+	if (groupc->state_mask & (1 << PSI_CPU_SOME)) {
 		groupc->times[PSI_CPU_SOME] += delta;
+		if (groupc->state_mask & (1 << PSI_CPU_FULL))
+			groupc->times[PSI_CPU_FULL] += delta;
+	}
 
 	if (groupc->state_mask & (1 << PSI_NONIDLE))
 		groupc->times[PSI_NONIDLE] += delta;
 }
 
 static void psi_group_change(struct psi_group *group, int cpu,
-			     unsigned int clear, unsigned int set,
+			     unsigned int clear, unsigned int set, u64 now,
 			     bool wake_clock)
 {
 	struct psi_group_cpu *groupc;
@@ -706,19 +694,20 @@ static void psi_group_change(struct psi_group *group, int cpu,
 	 */
 	write_seqcount_begin(&groupc->seq);
 
-	record_times(groupc, cpu, false);
+	record_times(groupc, now);
 
 	for (t = 0, m = clear; m; m &= ~(1 << t), t++) {
 		if (!(m & (1 << t)))
 			continue;
-		if (groupc->tasks[t] == 0 && !psi_bug) {
+		if (groupc->tasks[t]) {
+			groupc->tasks[t]--;
+		} else if (!psi_bug) {
 			printk_deferred(KERN_ERR "psi: task underflow! cpu=%d t=%d tasks=[%u %u %u %u] clear=%x set=%x\n",
 					cpu, t, groupc->tasks[0],
 					groupc->tasks[1], groupc->tasks[2],
 					groupc->tasks[3], clear, set);
 			psi_bug = 1;
 		}
-		groupc->tasks[t]--;
 	}
 
 	for (t = 0; set; set &= ~(1 << t), t++)
@@ -730,6 +719,18 @@ static void psi_group_change(struct psi_group *group, int cpu,
 		if (test_state(groupc->tasks, s))
 			state_mask |= (1 << s);
 	}
+
+	/*
+	 * Since we care about lost potential, a memstall is FULL
+	 * when there are no other working tasks, but also when
+	 * the CPU is actively reclaiming and nothing productive
+	 * could run even if it were runnable. So when the current
+	 * task in a cgroup is in_memstall, the corresponding groupc
+	 * on that cpu is in PSI_MEM_FULL state.
+	 */
+	if (unlikely(groupc->tasks[NR_ONCPU] && cpu_curr(cpu)->in_memstall))
+		state_mask |= (1 << PSI_MEM_FULL);
+
 	groupc->state_mask = state_mask;
 
 	write_seqcount_end(&groupc->seq);
@@ -786,12 +787,14 @@ void psi_task_change(struct task_struct *task, int clear, int set)
 	struct psi_group *group;
 	bool wake_clock = true;
 	void *iter = NULL;
+	u64 now;
 
 	if (!task->pid)
 		return;
 
 	psi_flags_change(task, clear, set);
 
+	now = cpu_clock(cpu);
 	/*
 	 * Periodic aggregation shuts off if there is a period of no
 	 * task changes, so we wake it back up if necessary. However,
@@ -804,7 +807,7 @@ void psi_task_change(struct task_struct *task, int clear, int set)
 		wake_clock = false;
 
 	while ((group = iterate_groups(task, &iter)))
-		psi_group_change(group, cpu, clear, set, wake_clock);
+		psi_group_change(group, cpu, clear, set, now, wake_clock);
 }
 
 void psi_task_switch(struct task_struct *prev, struct task_struct *next,
@@ -813,56 +816,61 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next,
 	struct psi_group *group, *common = NULL;
 	int cpu = task_cpu(prev);
 	void *iter;
+	u64 now = cpu_clock(cpu);
 
 	if (next->pid) {
+		bool identical_state;
+
 		psi_flags_change(next, 0, TSK_ONCPU);
 		/*
-		 * When moving state between tasks, the group that
-		 * contains them both does not change: we can stop
-		 * updating the tree once we reach the first common
-		 * ancestor. Iterate @next's ancestors until we
-		 * encounter @prev's state.
+		 * When switching between tasks that have an identical
+		 * runtime state, the cgroup that contains both tasks
+		 * runtime state, the cgroup that contains both tasks
+		 * we reach the first common ancestor. Iterate @next's
+		 * ancestors only until we encounter @prev's ONCPU.
		 */
+		identical_state = prev->psi_flags == next->psi_flags;
 		iter = NULL;
 		while ((group = iterate_groups(next, &iter))) {
-			if (per_cpu_ptr(group->pcpu, cpu)->tasks[NR_ONCPU]) {
+			if (identical_state &&
+			    per_cpu_ptr(group->pcpu, cpu)->tasks[NR_ONCPU]) {
 				common = group;
 				break;
 			}
 
-			psi_group_change(group, cpu, 0, TSK_ONCPU, true);
+			psi_group_change(group, cpu, 0, TSK_ONCPU, now, true);
 		}
 	}
 
-	/*
-	 * If this is a voluntary sleep, dequeue will have taken care
-	 * of the outgoing TSK_ONCPU alongside TSK_RUNNING already. We
-	 * only need to deal with it during preemption.
-	 */
-	if (sleep)
-		return;
-
 	if (prev->pid) {
-		psi_flags_change(prev, TSK_ONCPU, 0);
+		int clear = TSK_ONCPU, set = 0;
 
-		iter = NULL;
-		while ((group = iterate_groups(prev, &iter)) && group != common)
-			psi_group_change(group, cpu, TSK_ONCPU, 0, true);
-	}
-}
+		/*
+		 * When we're going to sleep, psi_dequeue() lets us handle
+		 * TSK_RUNNING and TSK_IOWAIT here, where we can combine it
+		 * with TSK_ONCPU and save walking common ancestors twice.
+		 */
+		if (sleep) {
+			clear |= TSK_RUNNING;
+			if (prev->in_iowait)
+				set |= TSK_IOWAIT;
+		}
 
-void psi_memstall_tick(struct task_struct *task, int cpu)
-{
-	struct psi_group *group;
-	void *iter = NULL;
+		psi_flags_change(prev, clear, set);
 
-	while ((group = iterate_groups(task, &iter))) {
-		struct psi_group_cpu *groupc;
+		iter = NULL;
+		while ((group = iterate_groups(prev, &iter)) && group != common)
+			psi_group_change(group, cpu, clear, set, now, true);
 
-		groupc = per_cpu_ptr(group->pcpu, cpu);
-		write_seqcount_begin(&groupc->seq);
-		record_times(groupc, cpu, true);
-		write_seqcount_end(&groupc->seq);
+		/*
+		 * TSK_ONCPU is handled up to the common ancestor. If we're tasked
+		 * with dequeuing too, finish that for the rest of the hierarchy.
+		 */
+		if (sleep) {
+			clear &= ~TSK_ONCPU;
+			for (; group; group = iterate_groups(prev, &iter))
+				psi_group_change(group, cpu, clear, set, now, true);
+		}
 	}
 }
 
@@ -1018,7 +1026,7 @@ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res)
 		group->avg_next_update = update_averages(group, now);
 	mutex_unlock(&group->avgs_lock);
 
-	for (full = 0; full < 2 - (res == PSI_CPU); full++) {
+	for (full = 0; full < 2; full++) {
 		unsigned long avg[3];
 		u64 total;
 		int w;
@@ -1054,19 +1062,27 @@ static int psi_cpu_show(struct seq_file *m, void *v)
 	return psi_show(m, &psi_system, PSI_CPU);
 }
 
+static int psi_open(struct file *file, int (*psi_show)(struct seq_file *, void *))
+{
+	if (file->f_mode & FMODE_WRITE && !capable(CAP_SYS_RESOURCE))
+		return -EPERM;
+
+	return single_open(file, psi_show, NULL);
+}
+
 static int psi_io_open(struct inode *inode, struct file *file)
 {
-	return single_open(file, psi_io_show, NULL);
+	return psi_open(file, psi_io_show);
 }
 
 static int psi_memory_open(struct inode *inode, struct file *file)
 {
-	return single_open(file, psi_memory_show, NULL);
+	return psi_open(file, psi_memory_show);
 }
 
 static int psi_cpu_open(struct inode *inode, struct file *file)
 {
-	return single_open(file, psi_cpu_show, NULL);
+	return psi_open(file, psi_cpu_show);
 }
 
 struct psi_trigger *psi_trigger_create(struct psi_group *group,
@@ -1346,9 +1362,9 @@ static int __init psi_proc_init(void)
 {
 	if (psi_enable) {
 		proc_mkdir("pressure", NULL);
-		proc_create("pressure/io", 0, NULL, &psi_io_proc_ops);
-		proc_create("pressure/memory", 0, NULL, &psi_memory_proc_ops);
-		proc_create("pressure/cpu", 0, NULL, &psi_cpu_proc_ops);
+		proc_create("pressure/io", 0666, NULL, &psi_io_proc_ops);
+		proc_create("pressure/memory", 0666, NULL, &psi_memory_proc_ops);
+		proc_create("pressure/cpu", 0666, NULL, &psi_cpu_proc_ops);
 	}
 	return 0;
 }
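The last two hunks above relax the /proc/pressure/ file modes to 0666 while psi_open() rejects write opens from tasks lacking CAP_SYS_RESOURCE, matching the changelog item about CAP_SYS_RESOURCE writers. Below is a rough userspace sketch of the documented PSI trigger interface (Documentation/accounting/psi.rst) that exercises exactly this write path; the 150ms threshold over a 1s window is an arbitrary example value:

/*
 * Sketch of the PSI trigger interface touched by the psi_open() hunk:
 * registering a trigger means opening a pressure file for writing, which
 * after this change is gated on CAP_SYS_RESOURCE instead of file ownership.
 * Example values: wake up when memory "some" stall time exceeds 150ms
 * within a 1s window (both in microseconds).
 */
#include <fcntl.h>
#include <poll.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	const char *trig = "some 150000 1000000";
	struct pollfd fds;
	int n;

	fds.fd = open("/proc/pressure/memory", O_RDWR | O_NONBLOCK);
	if (fds.fd < 0) {
		/* EPERM here now means the task lacks CAP_SYS_RESOURCE */
		perror("open /proc/pressure/memory");
		return 1;
	}
	if (write(fds.fd, trig, strlen(trig) + 1) < 0) {
		perror("write trigger");
		return 1;
	}
	fds.events = POLLPRI;

	while (1) {
		n = poll(&fds, 1, -1);
		if (n < 0) {
			perror("poll");
			break;
		}
		if (fds.revents & POLLERR) {
			fprintf(stderr, "trigger file went away\n");
			break;
		}
		if (fds.revents & POLLPRI)
			printf("memory pressure event\n");
	}
	close(fds.fd);
	return 0;
}

Note also that the psi_show() hunk makes the CPU resource report a "full" line alongside "some"; per the updated comment at the top of the file, CPU FULL is only meaningful at the cgroup level.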