diff options
Diffstat (limited to 'kernel/sched/psi.c')
-rw-r--r-- | kernel/sched/psi.c | 253 |
1 files changed, 129 insertions, 124 deletions
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 1652f2bb54b7..a4fa3aadfcba 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Pressure stall information for CPU, memory and IO * @@ -34,13 +35,19 @@ * delayed on that resource such that nobody is advancing and the CPU * goes idle. This leaves both workload and CPU unproductive. * - * Naturally, the FULL state doesn't exist for the CPU resource at the - * system level, but exist at the cgroup level, means all non-idle tasks - * in a cgroup are delayed on the CPU resource which used by others outside - * of the cgroup or throttled by the cgroup cpu.max configuration. - * * SOME = nr_delayed_tasks != 0 - * FULL = nr_delayed_tasks != 0 && nr_running_tasks == 0 + * FULL = nr_delayed_tasks != 0 && nr_productive_tasks == 0 + * + * What it means for a task to be productive is defined differently + * for each resource. For IO, productive means a running task. For + * memory, productive means a running task that isn't a reclaimer. For + * CPU, productive means an oncpu task. + * + * Naturally, the FULL state doesn't exist for the CPU resource at the + * system level, but exist at the cgroup level. At the cgroup level, + * FULL means all non-idle tasks in the cgroup are delayed on the CPU + * resource which is being used by others outside of the cgroup or + * throttled by the cgroup cpu.max configuration. * * The percentage of wallclock time spent in those compound stall * states gives pressure numbers between 0 and 100 for each resource, @@ -81,13 +88,13 @@ * * threads = min(nr_nonidle_tasks, nr_cpus) * SOME = min(nr_delayed_tasks / threads, 1) - * FULL = (threads - min(nr_running_tasks, threads)) / threads + * FULL = (threads - min(nr_productive_tasks, threads)) / threads * * For the 257 number crunchers on 256 CPUs, this yields: * * threads = min(257, 256) * SOME = min(1 / 256, 1) = 0.4% - * FULL = (256 - min(257, 256)) / 256 = 0% + * FULL = (256 - min(256, 256)) / 256 = 0% * * For the 1 out of 4 memory-delayed tasks, this yields: * @@ -112,7 +119,7 @@ * For each runqueue, we track: * * tSOME[cpu] = time(nr_delayed_tasks[cpu] != 0) - * tFULL[cpu] = time(nr_delayed_tasks[cpu] && !nr_running_tasks[cpu]) + * tFULL[cpu] = time(nr_delayed_tasks[cpu] && !nr_productive_tasks[cpu]) * tNONIDLE[cpu] = time(nr_nonidle_tasks[cpu] != 0) * * and then periodically aggregate: @@ -130,21 +137,6 @@ * sampling of the aggregate task states would be. */ -#include "../workqueue_internal.h" -#include <linux/sched/loadavg.h> -#include <linux/seq_file.h> -#include <linux/proc_fs.h> -#include <linux/seqlock.h> -#include <linux/uaccess.h> -#include <linux/cgroup.h> -#include <linux/module.h> -#include <linux/sched.h> -#include <linux/ctype.h> -#include <linux/file.h> -#include <linux/poll.h> -#include <linux/psi.h> -#include "sched.h" - static int psi_bug __read_mostly; DEFINE_STATIC_KEY_FALSE(psi_disabled); @@ -233,7 +225,8 @@ static bool test_state(unsigned int *tasks, enum psi_states state) case PSI_MEM_SOME: return unlikely(tasks[NR_MEMSTALL]); case PSI_MEM_FULL: - return unlikely(tasks[NR_MEMSTALL] && !tasks[NR_RUNNING]); + return unlikely(tasks[NR_MEMSTALL] && + tasks[NR_RUNNING] == tasks[NR_MEMSTALL_RUNNING]); case PSI_CPU_SOME: return unlikely(tasks[NR_RUNNING] > tasks[NR_ONCPU]); case PSI_CPU_FULL: @@ -515,7 +508,7 @@ static void init_triggers(struct psi_group *group, u64 now) static u64 update_triggers(struct psi_group *group, u64 now) { struct psi_trigger *t; - bool new_stall = false; + bool update_total = false; u64 *total = group->total[PSI_POLL]; /* @@ -524,24 +517,35 @@ static u64 update_triggers(struct psi_group *group, u64 now) */ list_for_each_entry(t, &group->triggers, node) { u64 growth; + bool new_stall; - /* Check for stall activity */ - if (group->polling_total[t->state] == total[t->state]) - continue; + new_stall = group->polling_total[t->state] != total[t->state]; + /* Check for stall activity or a previous threshold breach */ + if (!new_stall && !t->pending_event) + continue; /* - * Multiple triggers might be looking at the same state, - * remember to update group->polling_total[] once we've - * been through all of them. Also remember to extend the - * polling time if we see new stall activity. + * Check for new stall activity, as well as deferred + * events that occurred in the last window after the + * trigger had already fired (we want to ratelimit + * events without dropping any). */ - new_stall = true; - - /* Calculate growth since last update */ - growth = window_update(&t->win, now, total[t->state]); - if (growth < t->threshold) - continue; - + if (new_stall) { + /* + * Multiple triggers might be looking at the same state, + * remember to update group->polling_total[] once we've + * been through all of them. Also remember to extend the + * polling time if we see new stall activity. + */ + update_total = true; + + /* Calculate growth since last update */ + growth = window_update(&t->win, now, total[t->state]); + if (growth < t->threshold) + continue; + + t->pending_event = true; + } /* Limit event signaling to once per window */ if (now < t->last_event_time + t->win.size) continue; @@ -550,9 +554,11 @@ static u64 update_triggers(struct psi_group *group, u64 now) if (cmpxchg(&t->event, 0, 1) == 0) wake_up_interruptible(&t->event_wait); t->last_event_time = now; + /* Reset threshold breach flag once event got generated */ + t->pending_event = false; } - if (new_stall) + if (update_total) memcpy(group->polling_total, total, sizeof(group->polling_total)); @@ -710,10 +716,11 @@ static void psi_group_change(struct psi_group *group, int cpu, if (groupc->tasks[t]) { groupc->tasks[t]--; } else if (!psi_bug) { - printk_deferred(KERN_ERR "psi: task underflow! cpu=%d t=%d tasks=[%u %u %u %u] clear=%x set=%x\n", + printk_deferred(KERN_ERR "psi: task underflow! cpu=%d t=%d tasks=[%u %u %u %u %u] clear=%x set=%x\n", cpu, t, groupc->tasks[0], groupc->tasks[1], groupc->tasks[2], - groupc->tasks[3], clear, set); + groupc->tasks[3], groupc->tasks[4], + clear, set); psi_bug = 1; } } @@ -833,7 +840,6 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next, /* * When switching between tasks that have an identical * runtime state, the cgroup that contains both tasks - * runtime state, the cgroup that contains both tasks * we reach the first common ancestor. Iterate @next's * ancestors only until we encounter @prev's ONCPU. */ @@ -854,12 +860,15 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next, int clear = TSK_ONCPU, set = 0; /* - * When we're going to sleep, psi_dequeue() lets us handle - * TSK_RUNNING and TSK_IOWAIT here, where we can combine it - * with TSK_ONCPU and save walking common ancestors twice. + * When we're going to sleep, psi_dequeue() lets us + * handle TSK_RUNNING, TSK_MEMSTALL_RUNNING and + * TSK_IOWAIT here, where we can combine it with + * TSK_ONCPU and save walking common ancestors twice. */ if (sleep) { clear |= TSK_RUNNING; + if (prev->in_memstall) + clear |= TSK_MEMSTALL_RUNNING; if (prev->in_iowait) set |= TSK_IOWAIT; } @@ -908,7 +917,7 @@ void psi_memstall_enter(unsigned long *flags) rq = this_rq_lock_irq(&rf); current->in_memstall = 1; - psi_task_change(current, 0, TSK_MEMSTALL); + psi_task_change(current, 0, TSK_MEMSTALL | TSK_MEMSTALL_RUNNING); rq_unlock_irq(rq, &rf); } @@ -937,7 +946,7 @@ void psi_memstall_leave(unsigned long *flags) rq = this_rq_lock_irq(&rf); current->in_memstall = 0; - psi_task_change(current, TSK_MEMSTALL, 0); + psi_task_change(current, TSK_MEMSTALL | TSK_MEMSTALL_RUNNING, 0); rq_unlock_irq(rq, &rf); } @@ -1071,44 +1080,6 @@ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res) return 0; } -static int psi_io_show(struct seq_file *m, void *v) -{ - return psi_show(m, &psi_system, PSI_IO); -} - -static int psi_memory_show(struct seq_file *m, void *v) -{ - return psi_show(m, &psi_system, PSI_MEM); -} - -static int psi_cpu_show(struct seq_file *m, void *v) -{ - return psi_show(m, &psi_system, PSI_CPU); -} - -static int psi_open(struct file *file, int (*psi_show)(struct seq_file *, void *)) -{ - if (file->f_mode & FMODE_WRITE && !capable(CAP_SYS_RESOURCE)) - return -EPERM; - - return single_open(file, psi_show, NULL); -} - -static int psi_io_open(struct inode *inode, struct file *file) -{ - return psi_open(file, psi_io_show); -} - -static int psi_memory_open(struct inode *inode, struct file *file) -{ - return psi_open(file, psi_memory_show); -} - -static int psi_cpu_open(struct inode *inode, struct file *file) -{ - return psi_open(file, psi_cpu_show); -} - struct psi_trigger *psi_trigger_create(struct psi_group *group, char *buf, size_t nbytes, enum psi_res res) { @@ -1151,7 +1122,7 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group, t->event = 0; t->last_event_time = 0; init_waitqueue_head(&t->event_wait); - kref_init(&t->refcount); + t->pending_event = false; mutex_lock(&group->trigger_lock); @@ -1180,15 +1151,19 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group, return t; } -static void psi_trigger_destroy(struct kref *ref) +void psi_trigger_destroy(struct psi_trigger *t) { - struct psi_trigger *t = container_of(ref, struct psi_trigger, refcount); - struct psi_group *group = t->group; + struct psi_group *group; struct task_struct *task_to_destroy = NULL; - if (static_branch_likely(&psi_disabled)) + /* + * We do not check psi_disabled since it might have been disabled after + * the trigger got created. + */ + if (!t) return; + group = t->group; /* * Wakeup waiters to stop polling. Can happen if cgroup is deleted * from under a polling process. @@ -1224,9 +1199,9 @@ static void psi_trigger_destroy(struct kref *ref) mutex_unlock(&group->trigger_lock); /* - * Wait for both *trigger_ptr from psi_trigger_replace and - * poll_task RCUs to complete their read-side critical sections - * before destroying the trigger and optionally the poll_task + * Wait for psi_schedule_poll_work RCU to complete its read-side + * critical section before destroying the trigger and optionally the + * poll_task. */ synchronize_rcu(); /* @@ -1243,18 +1218,6 @@ static void psi_trigger_destroy(struct kref *ref) kfree(t); } -void psi_trigger_replace(void **trigger_ptr, struct psi_trigger *new) -{ - struct psi_trigger *old = *trigger_ptr; - - if (static_branch_likely(&psi_disabled)) - return; - - rcu_assign_pointer(*trigger_ptr, new); - if (old) - kref_put(&old->refcount, psi_trigger_destroy); -} - __poll_t psi_trigger_poll(void **trigger_ptr, struct file *file, poll_table *wait) { @@ -1264,27 +1227,57 @@ __poll_t psi_trigger_poll(void **trigger_ptr, if (static_branch_likely(&psi_disabled)) return DEFAULT_POLLMASK | EPOLLERR | EPOLLPRI; - rcu_read_lock(); - - t = rcu_dereference(*(void __rcu __force **)trigger_ptr); - if (!t) { - rcu_read_unlock(); + t = smp_load_acquire(trigger_ptr); + if (!t) return DEFAULT_POLLMASK | EPOLLERR | EPOLLPRI; - } - kref_get(&t->refcount); - - rcu_read_unlock(); poll_wait(file, &t->event_wait, wait); if (cmpxchg(&t->event, 1, 0) == 1) ret |= EPOLLPRI; - kref_put(&t->refcount, psi_trigger_destroy); - return ret; } +#ifdef CONFIG_PROC_FS +static int psi_io_show(struct seq_file *m, void *v) +{ + return psi_show(m, &psi_system, PSI_IO); +} + +static int psi_memory_show(struct seq_file *m, void *v) +{ + return psi_show(m, &psi_system, PSI_MEM); +} + +static int psi_cpu_show(struct seq_file *m, void *v) +{ + return psi_show(m, &psi_system, PSI_CPU); +} + +static int psi_open(struct file *file, int (*psi_show)(struct seq_file *, void *)) +{ + if (file->f_mode & FMODE_WRITE && !capable(CAP_SYS_RESOURCE)) + return -EPERM; + + return single_open(file, psi_show, NULL); +} + +static int psi_io_open(struct inode *inode, struct file *file) +{ + return psi_open(file, psi_io_show); +} + +static int psi_memory_open(struct inode *inode, struct file *file) +{ + return psi_open(file, psi_memory_show); +} + +static int psi_cpu_open(struct inode *inode, struct file *file) +{ + return psi_open(file, psi_cpu_show); +} + static ssize_t psi_write(struct file *file, const char __user *user_buf, size_t nbytes, enum psi_res res) { @@ -1305,14 +1298,24 @@ static ssize_t psi_write(struct file *file, const char __user *user_buf, buf[buf_size - 1] = '\0'; - new = psi_trigger_create(&psi_system, buf, nbytes, res); - if (IS_ERR(new)) - return PTR_ERR(new); - seq = file->private_data; + /* Take seq->lock to protect seq->private from concurrent writes */ mutex_lock(&seq->lock); - psi_trigger_replace(&seq->private, new); + + /* Allow only one trigger per file descriptor */ + if (seq->private) { + mutex_unlock(&seq->lock); + return -EBUSY; + } + + new = psi_trigger_create(&psi_system, buf, nbytes, res); + if (IS_ERR(new)) { + mutex_unlock(&seq->lock); + return PTR_ERR(new); + } + + smp_store_release(&seq->private, new); mutex_unlock(&seq->lock); return nbytes; @@ -1347,7 +1350,7 @@ static int psi_fop_release(struct inode *inode, struct file *file) { struct seq_file *seq = file->private_data; - psi_trigger_replace(&seq->private, NULL); + psi_trigger_destroy(seq->private); return single_release(inode, file); } @@ -1389,3 +1392,5 @@ static int __init psi_proc_init(void) return 0; } module_init(psi_proc_init); + +#endif /* CONFIG_PROC_FS */ |