author    | Linus Torvalds <torvalds@linux-foundation.org> | 2024-09-19 15:55:58 +0200
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2024-09-19 15:55:58 +0200
commit    | 2004cef11ea072838f99bd95cefa5c8e45df0847 (patch)
tree      | d7162235ad3c3985abbb5233657eef0c03819b28 /kernel/sched/sched.h
parent    | 509d2cd12a10d057fdf72f565b930f9a81140d59 (diff)
parent    | bc9057da1a220ff2cb6c8885fd5352558aceba2c (diff)
Merge tag 'sched-core-2024-09-19' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler updates from Ingo Molnar:
- Implement the SCHED_DEADLINE server infrastructure - Daniel Bristot
de Oliveira's last major contribution to the kernel:
"SCHED_DEADLINE servers can help fixing starvation issues of low
priority tasks (e.g., SCHED_OTHER) when higher priority tasks
monopolize CPU cycles. Today we have RT Throttling; DEADLINE
servers should be able to replace and improve that."
(Daniel Bristot de Oliveira, Peter Zijlstra, Joel Fernandes, Youssef
Esmat, Huang Shijie)
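
  A minimal kernel-style sketch of the resulting API, using only the hooks
  declared in the sched.h diff at the bottom of this page (dl_server_init(),
  dl_server_apply_params(), dl_server_start()); the my_*() helpers and the
  50 ms / 1 s budget are illustrative assumptions, and fair_server_init() is
  the one real in-tree user:

    /* Hypothetical wiring of a DL server for the fair class; the real
     * version lives in fair.c behind fair_server_init(). */
    static struct rq *my_rq_of(struct sched_dl_entity *dl_se)
    {
            /* The fair server entity is embedded in struct rq. */
            return container_of(dl_se, struct rq, fair_server);
    }

    static bool my_has_tasks(struct sched_dl_entity *dl_se)
    {
            return !!my_rq_of(dl_se)->cfs.h_nr_running;
    }

    static struct task_struct *my_pick_task(struct sched_dl_entity *dl_se)
    {
            return pick_task_fair(my_rq_of(dl_se));  /* illustrative */
    }

    static void my_server_setup(struct rq *rq)
    {
            struct sched_dl_entity *dl_se = &rq->fair_server;

            dl_server_init(dl_se, rq, my_has_tasks, my_pick_task);
            /* Reserve 50 ms of CPU every 1 s for fair tasks; all
             * parameters are in nanoseconds. */
            dl_server_apply_params(dl_se, 50 * NSEC_PER_MSEC, NSEC_PER_SEC, true);
            dl_server_start(dl_se);
    }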
- Preparatory changes for sched_ext integration:
- Use set_next_task(.first) where required
- Fix up set_next_task() implementations
- Clean up DL server vs. core sched
- Split up put_prev_task_balance()
- Rework pick_next_task()
- Combine the last put_prev_task() and the first set_next_task()
- Rework dl_server
- Add put_prev_task(.next)
(Peter Zijlstra, with a fix by Tejun Heo)
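
  The resulting pick path can be sketched as follows. This is a simplified,
  hypothetical rendering of the reworked core loop, built only from the hooks
  visible in the sched.h diff at the bottom of this page: pick_task() is the
  primary interface, pick_next_task(.prev) is an optional optimized path, and
  put_prev_set_next_task() performs the combined handover:

    static struct task_struct *
    pick_next_task_sketch(struct rq *rq, struct task_struct *prev)
    {
            const struct sched_class *class;
            struct task_struct *next;

            for_each_class(class) {
                    if (class->pick_next_task) {
                            /* Optional optimized path (e.g. fair). */
                            next = class->pick_next_task(rq, prev);
                            if (next)
                                    return next;
                    } else {
                            next = class->pick_task(rq);
                            if (next) {
                                    /* Last put_prev_task() + first set_next_task(). */
                                    put_prev_set_next_task(rq, prev, next);
                                    return next;
                            }
                    }
            }

            BUG();  /* The idle class should always have a runnable task. */
    }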
- Complete the EEVDF transition and refine EEVDF scheduling:
- Implement delayed dequeue
- Allow shorter slices to wakeup-preempt
- Use sched_attr::sched_runtime to set request/slice suggestion
- Document the new feature flags
- Remove unused and duplicate-functionality fields
- Simplify & unify pick_next_task_fair()
- Misc debuggability enhancements
(Peter Zijlstra, with fixes/cleanups by Dietmar Eggemann, Valentin
Schneider and Chuyi Zhou)
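
  For the sched_runtime request/slice suggestion, a userspace sketch; it
  assumes a glibc without a sched_setattr() wrapper (hence the raw syscall),
  and the 3 ms value is arbitrary:

    #define _GNU_SOURCE
    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>
    #include <sys/syscall.h>
    #include <linux/sched.h>        /* SCHED_NORMAL (a.k.a. SCHED_OTHER) */
    #include <linux/sched/types.h>  /* struct sched_attr */

    int main(void)
    {
            struct sched_attr attr;

            memset(&attr, 0, sizeof(attr));
            attr.size = sizeof(attr);
            attr.sched_policy = SCHED_NORMAL;
            attr.sched_runtime = 3 * 1000 * 1000;  /* 3 ms slice hint, in ns */

            /* pid 0 means the calling task. */
            if (syscall(SYS_sched_setattr, 0, &attr, 0))
                    perror("sched_setattr");

            return 0;
    }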
- Initialize the vruntime of a new task when it is first enqueued,
resulting in significant decrease in latency of newly woken tasks
(Zhang Qiao)
- Introduce SM_IDLE and an idle re-entry fast-path in __schedule()
(K Prateek Nayak, Peter Zijlstra)
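
  Roughly, as a hedged fragment (names and surrounding logic are simplified
  from the actual __schedule()):

    /* Fast path: the idle task rescheduling with nothing runnable can
     * re-enter idle without a full pick_next_task() class walk. */
    if (sched_mode == SM_IDLE && !rq->nr_running) {
            next = prev;
            goto picked;    /* hypothetical label */
    }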
- Clean up and clarify the usage of rt_task()
(Qais Yousef)
- Preempt SCHED_IDLE entities in strict cgroup hierarchies
(Tianchen Ding)
- Clarify the documentation of time units for deadline scheduler
parameters (Christian Loehle)
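
  Concretely, all three deadline fields are nanoseconds; reusing the
  sched_attr setup from the sketch further above, a 60 Hz-style reservation
  might look like this (values illustrative):

    attr.sched_policy   = SCHED_DEADLINE;
    attr.sched_runtime  =  5 * 1000 * 1000;  /*  5 ms, in ns */
    attr.sched_deadline = 10 * 1000 * 1000;  /* 10 ms, in ns */
    attr.sched_period   = 16666666;          /* ~16.7 ms (60 Hz), in ns */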
- Remove the HZ_BW chicken-bit feature flag introduced a year ago,
the original change seems to be working fine (Phil Auld)
- Misc fixes and cleanups (Chen Yu, Dan Carpenter, Huang Shijie,
Peilin He, Qais Yousef and Vincent Guittot)
* tag 'sched-core-2024-09-19' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (64 commits)
sched/cpufreq: Use NSEC_PER_MSEC for deadline task
cpufreq/cppc: Use NSEC_PER_MSEC for deadline task
sched/deadline: Clarify nanoseconds in uapi
sched/deadline: Convert schedtool example to chrt
sched/debug: Fix the runnable tasks output
sched: Fix sched_delayed vs sched_core
kernel/sched: Fix util_est accounting for DELAY_DEQUEUE
kthread: Fix task state in kthread worker if being frozen
sched/pelt: Use rq_clock_task() for hw_pressure
sched/fair: Move effective_cpu_util() and sched_cpu_util() into fair.c
sched/core: Introduce SM_IDLE and an idle re-entry fast-path in __schedule()
sched: Add put_prev_task(.next)
sched: Rework dl_server
sched: Combine the last put_prev_task() and the first set_next_task()
sched: Rework pick_next_task()
sched: Split up put_prev_task_balance()
sched: Clean up DL server vs core sched
sched: Fixup set_next_task() implementations
sched: Use set_next_task(.first) where required
sched/fair: Properly deactivate sched_delayed task upon class change
...
Diffstat (limited to 'kernel/sched/sched.h')
-rw-r--r-- | kernel/sched/sched.h | 101
1 file changed, 78 insertions(+), 23 deletions(-)
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 4c36cc680361..3744f16a1293 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -68,6 +68,7 @@
 #include <linux/wait_api.h>
 #include <linux/wait_bit.h>
 #include <linux/workqueue_api.h>
+#include <linux/delayacct.h>
 
 #include <trace/events/power.h>
 #include <trace/events/sched.h>
@@ -335,7 +336,7 @@ extern bool __checkparam_dl(const struct sched_attr *attr);
 extern bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr);
 extern int  dl_cpuset_cpumask_can_shrink(const struct cpumask *cur, const struct cpumask *trial);
 extern int  dl_bw_check_overflow(int cpu);
-
+extern s64 dl_scaled_delta_exec(struct rq *rq, struct sched_dl_entity *dl_se, s64 delta_exec);
 /*
  * SCHED_DEADLINE supports servers (nested scheduling) with the following
  * interface:
@@ -361,7 +362,14 @@
 extern void dl_server_start(struct sched_dl_entity *dl_se);
 extern void dl_server_stop(struct sched_dl_entity *dl_se);
 extern void dl_server_init(struct sched_dl_entity *dl_se, struct rq *rq,
 		    dl_server_has_tasks_f has_tasks,
-		    dl_server_pick_f pick);
+		    dl_server_pick_f pick_task);
+
+extern void dl_server_update_idle_time(struct rq *rq,
+		    struct task_struct *p);
+extern void fair_server_init(struct rq *rq);
+extern void __dl_server_attach_root(struct sched_dl_entity *dl_se, struct rq *rq);
+extern int dl_server_apply_params(struct sched_dl_entity *dl_se,
+		    u64 runtime, u64 period, bool init);
 
 #ifdef CONFIG_CGROUP_SCHED
@@ -599,17 +607,12 @@ struct cfs_rq {
 	s64			avg_vruntime;
 	u64			avg_load;
 
-	u64			exec_clock;
 	u64			min_vruntime;
 #ifdef CONFIG_SCHED_CORE
 	unsigned int		forceidle_seq;
 	u64			min_vruntime_fi;
 #endif
 
-#ifndef CONFIG_64BIT
-	u64			min_vruntime_copy;
-#endif
-
 	struct rb_root_cached	tasks_timeline;
 
 	/*
@@ -619,10 +622,6 @@ struct cfs_rq {
 	struct sched_entity	*curr;
 	struct sched_entity	*next;
 
-#ifdef CONFIG_SCHED_DEBUG
-	unsigned int		nr_spread_over;
-#endif
-
 #ifdef CONFIG_SMP
 	/*
 	 * CFS load tracking
@@ -726,13 +725,13 @@ struct rt_rq {
 #endif /* CONFIG_SMP */
 	int			rt_queued;
 
+#ifdef CONFIG_RT_GROUP_SCHED
 	int			rt_throttled;
 	u64			rt_time;
 	u64			rt_runtime;
 	/* Nests inside the rq lock: */
 	raw_spinlock_t		rt_runtime_lock;
 
-#ifdef CONFIG_RT_GROUP_SCHED
 	unsigned int		rt_nr_boosted;
 
 	struct rq		*rq;
@@ -820,6 +819,9 @@ static inline void se_update_runnable(struct sched_entity *se)
 
 static inline long se_runnable(struct sched_entity *se)
 {
+	if (se->sched_delayed)
+		return false;
+
 	if (entity_is_task(se))
 		return !!se->on_rq;
 	else
@@ -834,6 +836,9 @@ static inline void se_update_runnable(struct sched_entity *se) { }
 
 static inline long se_runnable(struct sched_entity *se)
 {
+	if (se->sched_delayed)
+		return false;
+
 	return !!se->on_rq;
 }
 
@@ -1044,6 +1049,8 @@ struct rq {
 	struct rt_rq		rt;
 	struct dl_rq		dl;
 
+	struct sched_dl_entity	fair_server;
+
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	/* list of leaf cfs_rq on this CPU: */
 	struct list_head	leaf_cfs_rq_list;
@@ -1059,6 +1066,7 @@ struct rq {
 	unsigned int		nr_uninterruptible;
 
 	struct task_struct __rcu	*curr;
+	struct sched_dl_entity	*dl_server;
 	struct task_struct	*idle;
 	struct task_struct	*stop;
 	unsigned long		next_balance;
@@ -1158,7 +1166,6 @@ struct rq {
 	/* latency stats */
 	struct sched_info	rq_sched_info;
 	unsigned long long	rq_cpu_time;
-	/* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */
 
 	/* sys_sched_yield() stats */
 	unsigned int		yld_count;
@@ -1187,6 +1194,7 @@ struct rq {
 	/* per rq */
 	struct rq		*core;
 	struct task_struct	*core_pick;
+	struct sched_dl_entity	*core_dl_server;
 	unsigned int		core_enabled;
 	unsigned int		core_sched_seq;
 	struct rb_root		core_tree;
@@ -2247,11 +2255,13 @@ extern const u32 sched_prio_to_wmult[40];
  *
  */
 
-#define DEQUEUE_SLEEP		0x01
+#define DEQUEUE_SLEEP		0x01 /* Matches ENQUEUE_WAKEUP */
 #define DEQUEUE_SAVE		0x02 /* Matches ENQUEUE_RESTORE */
 #define DEQUEUE_MOVE		0x04 /* Matches ENQUEUE_MOVE */
 #define DEQUEUE_NOCLOCK		0x08 /* Matches ENQUEUE_NOCLOCK */
+#define DEQUEUE_SPECIAL		0x10
 #define DEQUEUE_MIGRATING	0x100 /* Matches ENQUEUE_MIGRATING */
+#define DEQUEUE_DELAYED		0x200 /* Matches ENQUEUE_DELAYED */
 
 #define ENQUEUE_WAKEUP		0x01
 #define ENQUEUE_RESTORE		0x02
@@ -2267,6 +2277,7 @@
 #endif
 #define ENQUEUE_INITIAL		0x80
 #define ENQUEUE_MIGRATING	0x100
+#define ENQUEUE_DELAYED		0x200
 
 #define RETRY_TASK		((void *)-1UL)
 
@@ -2285,23 +2296,31 @@ struct sched_class {
 #endif
 
 	void (*enqueue_task)  (struct rq *rq, struct task_struct *p, int flags);
-	void (*dequeue_task)  (struct rq *rq, struct task_struct *p, int flags);
+	bool (*dequeue_task)  (struct rq *rq, struct task_struct *p, int flags);
 	void (*yield_task)    (struct rq *rq);
 	bool (*yield_to_task) (struct rq *rq, struct task_struct *p);
 
 	void (*wakeup_preempt)(struct rq *rq, struct task_struct *p, int flags);
 
-	struct task_struct *(*pick_next_task)(struct rq *rq);
+	struct task_struct *(*pick_task)(struct rq *rq);
+	/*
+	 * Optional! When implemented pick_next_task() should be equivalent to:
+	 *
+	 *   next = pick_task();
+	 *   if (next) {
+	 *       put_prev_task(prev);
+	 *       set_next_task_first(next);
+	 *   }
+	 */
+	struct task_struct *(*pick_next_task)(struct rq *rq, struct task_struct *prev);
 
-	void (*put_prev_task)(struct rq *rq, struct task_struct *p);
+	void (*put_prev_task)(struct rq *rq, struct task_struct *p, struct task_struct *next);
 	void (*set_next_task)(struct rq *rq, struct task_struct *p, bool first);
 
 #ifdef CONFIG_SMP
 	int (*balance)(struct rq *rq, struct task_struct *prev, struct rq_flags *rf);
 	int  (*select_task_rq)(struct task_struct *p, int task_cpu, int flags);
 
-	struct task_struct * (*pick_task)(struct rq *rq);
-
 	void (*migrate_task_rq)(struct task_struct *p, int new_cpu);
 
 	void (*task_woken)(struct rq *this_rq, struct task_struct *task);
@@ -2345,7 +2364,7 @@ struct sched_class {
 static inline void put_prev_task(struct rq *rq, struct task_struct *prev)
 {
 	WARN_ON_ONCE(rq->curr != prev);
-	prev->sched_class->put_prev_task(rq, prev);
+	prev->sched_class->put_prev_task(rq, prev, NULL);
 }
 
 static inline void set_next_task(struct rq *rq, struct task_struct *next)
@@ -2353,6 +2372,30 @@ static inline void set_next_task(struct rq *rq, struct task_struct *next)
 	next->sched_class->set_next_task(rq, next, false);
 }
 
+static inline void
+__put_prev_set_next_dl_server(struct rq *rq,
+			      struct task_struct *prev,
+			      struct task_struct *next)
+{
+	prev->dl_server = NULL;
+	next->dl_server = rq->dl_server;
+	rq->dl_server = NULL;
+}
+
+static inline void put_prev_set_next_task(struct rq *rq,
+					  struct task_struct *prev,
+					  struct task_struct *next)
+{
+	WARN_ON_ONCE(rq->curr != prev);
+
+	__put_prev_set_next_dl_server(rq, prev, next);
+
+	if (next == prev)
+		return;
+
+	prev->sched_class->put_prev_task(rq, prev, next);
+	next->sched_class->set_next_task(rq, next, true);
+}
+
 /*
  * Helper to define a sched_class instance; each one is placed in a separate
@@ -2408,7 +2451,7 @@ static inline bool sched_fair_runnable(struct rq *rq)
 }
 
 extern struct task_struct *pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf);
-extern struct task_struct *pick_next_task_idle(struct rq *rq);
+extern struct task_struct *pick_task_idle(struct rq *rq);
 
 #define SCA_CHECK		0x01
 #define SCA_MIGRATE_DISABLE	0x02
@@ -2515,7 +2558,6 @@ extern void reweight_task(struct task_struct *p, const struct load_weight *lw);
 
 extern void resched_curr(struct rq *rq);
 extern void resched_cpu(int cpu);
 
-extern struct rt_bandwidth def_rt_bandwidth;
 extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime);
 extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq);
@@ -2586,6 +2628,19 @@ static inline void sub_nr_running(struct rq *rq, unsigned count)
 	sched_update_tick_dependency(rq);
 }
 
+static inline void __block_task(struct rq *rq, struct task_struct *p)
+{
+	WRITE_ONCE(p->on_rq, 0);
+	ASSERT_EXCLUSIVE_WRITER(p->on_rq);
+	if (p->sched_contributes_to_load)
+		rq->nr_uninterruptible++;
+
+	if (p->in_iowait) {
+		atomic_inc(&rq->nr_iowait);
+		delayacct_blkio_start();
+	}
+}
+
 extern void activate_task(struct rq *rq, struct task_struct *p, int flags);
 extern void deactivate_task(struct rq *rq, struct task_struct *p, int flags);
@@ -3607,7 +3662,7 @@ extern int __sched_setaffinity(struct task_struct *p, struct affinity_context *ctx);
 extern void __setscheduler_prio(struct task_struct *p, int prio);
 extern void set_load_weight(struct task_struct *p, bool update_load);
 extern void enqueue_task(struct rq *rq, struct task_struct *p, int flags);
-extern void dequeue_task(struct rq *rq, struct task_struct *p, int flags);
+extern bool dequeue_task(struct rq *rq, struct task_struct *p, int flags);
 
 extern void check_class_changed(struct rq *rq, struct task_struct *p,
 				const struct sched_class *prev_class,
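
One consequence of the new bool return of dequeue_task(), sketched under the
assumption that false means the fair class kept the task enqueued as
sched_delayed (the DEQUEUE_DELAYED / ENQUEUE_DELAYED flags above); only a task
that really left the runqueue may be accounted as blocked via __block_task():

    /* Hedged sketch; block_task_sketch() is a hypothetical caller. */
    static void block_task_sketch(struct rq *rq, struct task_struct *p,
                                  int flags)
    {
            if (dequeue_task(rq, p, DEQUEUE_SLEEP | flags))
                    __block_task(rq, p);    /* really left the runqueue */
            /* else: dequeue was delayed; p stays on_rq as sched_delayed */
    }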