aboutsummaryrefslogtreecommitdiff
path: root/kernel/sched/psi.c
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/sched/psi.c')
-rw-r--r--kernel/sched/psi.c253
1 files changed, 129 insertions, 124 deletions
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index 1652f2bb54b7..a4fa3aadfcba 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Pressure stall information for CPU, memory and IO
*
@@ -34,13 +35,19 @@
* delayed on that resource such that nobody is advancing and the CPU
* goes idle. This leaves both workload and CPU unproductive.
*
- * Naturally, the FULL state doesn't exist for the CPU resource at the
- * system level, but exist at the cgroup level, means all non-idle tasks
- * in a cgroup are delayed on the CPU resource which used by others outside
- * of the cgroup or throttled by the cgroup cpu.max configuration.
- *
* SOME = nr_delayed_tasks != 0
- * FULL = nr_delayed_tasks != 0 && nr_running_tasks == 0
+ * FULL = nr_delayed_tasks != 0 && nr_productive_tasks == 0
+ *
+ * What it means for a task to be productive is defined differently
+ * for each resource. For IO, productive means a running task. For
+ * memory, productive means a running task that isn't a reclaimer. For
+ * CPU, productive means an oncpu task.
+ *
+ * Naturally, the FULL state doesn't exist for the CPU resource at the
+ * system level, but exist at the cgroup level. At the cgroup level,
+ * FULL means all non-idle tasks in the cgroup are delayed on the CPU
+ * resource which is being used by others outside of the cgroup or
+ * throttled by the cgroup cpu.max configuration.
*
* The percentage of wallclock time spent in those compound stall
* states gives pressure numbers between 0 and 100 for each resource,
@@ -81,13 +88,13 @@
*
* threads = min(nr_nonidle_tasks, nr_cpus)
* SOME = min(nr_delayed_tasks / threads, 1)
- * FULL = (threads - min(nr_running_tasks, threads)) / threads
+ * FULL = (threads - min(nr_productive_tasks, threads)) / threads
*
* For the 257 number crunchers on 256 CPUs, this yields:
*
* threads = min(257, 256)
* SOME = min(1 / 256, 1) = 0.4%
- * FULL = (256 - min(257, 256)) / 256 = 0%
+ * FULL = (256 - min(256, 256)) / 256 = 0%
*
* For the 1 out of 4 memory-delayed tasks, this yields:
*
@@ -112,7 +119,7 @@
* For each runqueue, we track:
*
* tSOME[cpu] = time(nr_delayed_tasks[cpu] != 0)
- * tFULL[cpu] = time(nr_delayed_tasks[cpu] && !nr_running_tasks[cpu])
+ * tFULL[cpu] = time(nr_delayed_tasks[cpu] && !nr_productive_tasks[cpu])
* tNONIDLE[cpu] = time(nr_nonidle_tasks[cpu] != 0)
*
* and then periodically aggregate:
@@ -130,21 +137,6 @@
* sampling of the aggregate task states would be.
*/
-#include "../workqueue_internal.h"
-#include <linux/sched/loadavg.h>
-#include <linux/seq_file.h>
-#include <linux/proc_fs.h>
-#include <linux/seqlock.h>
-#include <linux/uaccess.h>
-#include <linux/cgroup.h>
-#include <linux/module.h>
-#include <linux/sched.h>
-#include <linux/ctype.h>
-#include <linux/file.h>
-#include <linux/poll.h>
-#include <linux/psi.h>
-#include "sched.h"
-
static int psi_bug __read_mostly;
DEFINE_STATIC_KEY_FALSE(psi_disabled);
@@ -233,7 +225,8 @@ static bool test_state(unsigned int *tasks, enum psi_states state)
case PSI_MEM_SOME:
return unlikely(tasks[NR_MEMSTALL]);
case PSI_MEM_FULL:
- return unlikely(tasks[NR_MEMSTALL] && !tasks[NR_RUNNING]);
+ return unlikely(tasks[NR_MEMSTALL] &&
+ tasks[NR_RUNNING] == tasks[NR_MEMSTALL_RUNNING]);
case PSI_CPU_SOME:
return unlikely(tasks[NR_RUNNING] > tasks[NR_ONCPU]);
case PSI_CPU_FULL:
@@ -515,7 +508,7 @@ static void init_triggers(struct psi_group *group, u64 now)
static u64 update_triggers(struct psi_group *group, u64 now)
{
struct psi_trigger *t;
- bool new_stall = false;
+ bool update_total = false;
u64 *total = group->total[PSI_POLL];
/*
@@ -524,24 +517,35 @@ static u64 update_triggers(struct psi_group *group, u64 now)
*/
list_for_each_entry(t, &group->triggers, node) {
u64 growth;
+ bool new_stall;
- /* Check for stall activity */
- if (group->polling_total[t->state] == total[t->state])
- continue;
+ new_stall = group->polling_total[t->state] != total[t->state];
+ /* Check for stall activity or a previous threshold breach */
+ if (!new_stall && !t->pending_event)
+ continue;
/*
- * Multiple triggers might be looking at the same state,
- * remember to update group->polling_total[] once we've
- * been through all of them. Also remember to extend the
- * polling time if we see new stall activity.
+ * Check for new stall activity, as well as deferred
+ * events that occurred in the last window after the
+ * trigger had already fired (we want to ratelimit
+ * events without dropping any).
*/
- new_stall = true;
-
- /* Calculate growth since last update */
- growth = window_update(&t->win, now, total[t->state]);
- if (growth < t->threshold)
- continue;
-
+ if (new_stall) {
+ /*
+ * Multiple triggers might be looking at the same state,
+ * remember to update group->polling_total[] once we've
+ * been through all of them. Also remember to extend the
+ * polling time if we see new stall activity.
+ */
+ update_total = true;
+
+ /* Calculate growth since last update */
+ growth = window_update(&t->win, now, total[t->state]);
+ if (growth < t->threshold)
+ continue;
+
+ t->pending_event = true;
+ }
/* Limit event signaling to once per window */
if (now < t->last_event_time + t->win.size)
continue;
@@ -550,9 +554,11 @@ static u64 update_triggers(struct psi_group *group, u64 now)
if (cmpxchg(&t->event, 0, 1) == 0)
wake_up_interruptible(&t->event_wait);
t->last_event_time = now;
+ /* Reset threshold breach flag once event got generated */
+ t->pending_event = false;
}
- if (new_stall)
+ if (update_total)
memcpy(group->polling_total, total,
sizeof(group->polling_total));
@@ -710,10 +716,11 @@ static void psi_group_change(struct psi_group *group, int cpu,
if (groupc->tasks[t]) {
groupc->tasks[t]--;
} else if (!psi_bug) {
- printk_deferred(KERN_ERR "psi: task underflow! cpu=%d t=%d tasks=[%u %u %u %u] clear=%x set=%x\n",
+ printk_deferred(KERN_ERR "psi: task underflow! cpu=%d t=%d tasks=[%u %u %u %u %u] clear=%x set=%x\n",
cpu, t, groupc->tasks[0],
groupc->tasks[1], groupc->tasks[2],
- groupc->tasks[3], clear, set);
+ groupc->tasks[3], groupc->tasks[4],
+ clear, set);
psi_bug = 1;
}
}
@@ -833,7 +840,6 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next,
/*
* When switching between tasks that have an identical
* runtime state, the cgroup that contains both tasks
- * runtime state, the cgroup that contains both tasks
* we reach the first common ancestor. Iterate @next's
* ancestors only until we encounter @prev's ONCPU.
*/
@@ -854,12 +860,15 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next,
int clear = TSK_ONCPU, set = 0;
/*
- * When we're going to sleep, psi_dequeue() lets us handle
- * TSK_RUNNING and TSK_IOWAIT here, where we can combine it
- * with TSK_ONCPU and save walking common ancestors twice.
+ * When we're going to sleep, psi_dequeue() lets us
+ * handle TSK_RUNNING, TSK_MEMSTALL_RUNNING and
+ * TSK_IOWAIT here, where we can combine it with
+ * TSK_ONCPU and save walking common ancestors twice.
*/
if (sleep) {
clear |= TSK_RUNNING;
+ if (prev->in_memstall)
+ clear |= TSK_MEMSTALL_RUNNING;
if (prev->in_iowait)
set |= TSK_IOWAIT;
}
@@ -908,7 +917,7 @@ void psi_memstall_enter(unsigned long *flags)
rq = this_rq_lock_irq(&rf);
current->in_memstall = 1;
- psi_task_change(current, 0, TSK_MEMSTALL);
+ psi_task_change(current, 0, TSK_MEMSTALL | TSK_MEMSTALL_RUNNING);
rq_unlock_irq(rq, &rf);
}
@@ -937,7 +946,7 @@ void psi_memstall_leave(unsigned long *flags)
rq = this_rq_lock_irq(&rf);
current->in_memstall = 0;
- psi_task_change(current, TSK_MEMSTALL, 0);
+ psi_task_change(current, TSK_MEMSTALL | TSK_MEMSTALL_RUNNING, 0);
rq_unlock_irq(rq, &rf);
}
@@ -1071,44 +1080,6 @@ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res)
return 0;
}
-static int psi_io_show(struct seq_file *m, void *v)
-{
- return psi_show(m, &psi_system, PSI_IO);
-}
-
-static int psi_memory_show(struct seq_file *m, void *v)
-{
- return psi_show(m, &psi_system, PSI_MEM);
-}
-
-static int psi_cpu_show(struct seq_file *m, void *v)
-{
- return psi_show(m, &psi_system, PSI_CPU);
-}
-
-static int psi_open(struct file *file, int (*psi_show)(struct seq_file *, void *))
-{
- if (file->f_mode & FMODE_WRITE && !capable(CAP_SYS_RESOURCE))
- return -EPERM;
-
- return single_open(file, psi_show, NULL);
-}
-
-static int psi_io_open(struct inode *inode, struct file *file)
-{
- return psi_open(file, psi_io_show);
-}
-
-static int psi_memory_open(struct inode *inode, struct file *file)
-{
- return psi_open(file, psi_memory_show);
-}
-
-static int psi_cpu_open(struct inode *inode, struct file *file)
-{
- return psi_open(file, psi_cpu_show);
-}
-
struct psi_trigger *psi_trigger_create(struct psi_group *group,
char *buf, size_t nbytes, enum psi_res res)
{
@@ -1151,7 +1122,7 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group,
t->event = 0;
t->last_event_time = 0;
init_waitqueue_head(&t->event_wait);
- kref_init(&t->refcount);
+ t->pending_event = false;
mutex_lock(&group->trigger_lock);
@@ -1180,15 +1151,19 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group,
return t;
}
-static void psi_trigger_destroy(struct kref *ref)
+void psi_trigger_destroy(struct psi_trigger *t)
{
- struct psi_trigger *t = container_of(ref, struct psi_trigger, refcount);
- struct psi_group *group = t->group;
+ struct psi_group *group;
struct task_struct *task_to_destroy = NULL;
- if (static_branch_likely(&psi_disabled))
+ /*
+ * We do not check psi_disabled since it might have been disabled after
+ * the trigger got created.
+ */
+ if (!t)
return;
+ group = t->group;
/*
* Wakeup waiters to stop polling. Can happen if cgroup is deleted
* from under a polling process.
@@ -1224,9 +1199,9 @@ static void psi_trigger_destroy(struct kref *ref)
mutex_unlock(&group->trigger_lock);
/*
- * Wait for both *trigger_ptr from psi_trigger_replace and
- * poll_task RCUs to complete their read-side critical sections
- * before destroying the trigger and optionally the poll_task
+ * Wait for psi_schedule_poll_work RCU to complete its read-side
+ * critical section before destroying the trigger and optionally the
+ * poll_task.
*/
synchronize_rcu();
/*
@@ -1243,18 +1218,6 @@ static void psi_trigger_destroy(struct kref *ref)
kfree(t);
}
-void psi_trigger_replace(void **trigger_ptr, struct psi_trigger *new)
-{
- struct psi_trigger *old = *trigger_ptr;
-
- if (static_branch_likely(&psi_disabled))
- return;
-
- rcu_assign_pointer(*trigger_ptr, new);
- if (old)
- kref_put(&old->refcount, psi_trigger_destroy);
-}
-
__poll_t psi_trigger_poll(void **trigger_ptr,
struct file *file, poll_table *wait)
{
@@ -1264,27 +1227,57 @@ __poll_t psi_trigger_poll(void **trigger_ptr,
if (static_branch_likely(&psi_disabled))
return DEFAULT_POLLMASK | EPOLLERR | EPOLLPRI;
- rcu_read_lock();
-
- t = rcu_dereference(*(void __rcu __force **)trigger_ptr);
- if (!t) {
- rcu_read_unlock();
+ t = smp_load_acquire(trigger_ptr);
+ if (!t)
return DEFAULT_POLLMASK | EPOLLERR | EPOLLPRI;
- }
- kref_get(&t->refcount);
-
- rcu_read_unlock();
poll_wait(file, &t->event_wait, wait);
if (cmpxchg(&t->event, 1, 0) == 1)
ret |= EPOLLPRI;
- kref_put(&t->refcount, psi_trigger_destroy);
-
return ret;
}
+#ifdef CONFIG_PROC_FS
+static int psi_io_show(struct seq_file *m, void *v)
+{
+ return psi_show(m, &psi_system, PSI_IO);
+}
+
+static int psi_memory_show(struct seq_file *m, void *v)
+{
+ return psi_show(m, &psi_system, PSI_MEM);
+}
+
+static int psi_cpu_show(struct seq_file *m, void *v)
+{
+ return psi_show(m, &psi_system, PSI_CPU);
+}
+
+static int psi_open(struct file *file, int (*psi_show)(struct seq_file *, void *))
+{
+ if (file->f_mode & FMODE_WRITE && !capable(CAP_SYS_RESOURCE))
+ return -EPERM;
+
+ return single_open(file, psi_show, NULL);
+}
+
+static int psi_io_open(struct inode *inode, struct file *file)
+{
+ return psi_open(file, psi_io_show);
+}
+
+static int psi_memory_open(struct inode *inode, struct file *file)
+{
+ return psi_open(file, psi_memory_show);
+}
+
+static int psi_cpu_open(struct inode *inode, struct file *file)
+{
+ return psi_open(file, psi_cpu_show);
+}
+
static ssize_t psi_write(struct file *file, const char __user *user_buf,
size_t nbytes, enum psi_res res)
{
@@ -1305,14 +1298,24 @@ static ssize_t psi_write(struct file *file, const char __user *user_buf,
buf[buf_size - 1] = '\0';
- new = psi_trigger_create(&psi_system, buf, nbytes, res);
- if (IS_ERR(new))
- return PTR_ERR(new);
-
seq = file->private_data;
+
/* Take seq->lock to protect seq->private from concurrent writes */
mutex_lock(&seq->lock);
- psi_trigger_replace(&seq->private, new);
+
+ /* Allow only one trigger per file descriptor */
+ if (seq->private) {
+ mutex_unlock(&seq->lock);
+ return -EBUSY;
+ }
+
+ new = psi_trigger_create(&psi_system, buf, nbytes, res);
+ if (IS_ERR(new)) {
+ mutex_unlock(&seq->lock);
+ return PTR_ERR(new);
+ }
+
+ smp_store_release(&seq->private, new);
mutex_unlock(&seq->lock);
return nbytes;
@@ -1347,7 +1350,7 @@ static int psi_fop_release(struct inode *inode, struct file *file)
{
struct seq_file *seq = file->private_data;
- psi_trigger_replace(&seq->private, NULL);
+ psi_trigger_destroy(seq->private);
return single_release(inode, file);
}
@@ -1389,3 +1392,5 @@ static int __init psi_proc_init(void)
return 0;
}
module_init(psi_proc_init);
+
+#endif /* CONFIG_PROC_FS */