Diffstat (limited to 'kernel/time')
-rw-r--r--  kernel/time/Kconfig                 |   9
-rw-r--r--  kernel/time/alarmtimer.c            |   2
-rw-r--r--  kernel/time/hrtimer.c               |  13
-rw-r--r--  kernel/time/namespace.c             |  22
-rw-r--r--  kernel/time/posix-cpu-timers.c      | 216
-rw-r--r--  kernel/time/sched_clock.c           |   2
-rw-r--r--  kernel/time/timekeeping.c           |  21
-rw-r--r--  kernel/time/timekeeping_internal.h  |  11
-rw-r--r--  kernel/time/timer.c                 | 254
-rw-r--r--  kernel/time/vsyscall.c              |  41
10 files changed, 390 insertions(+), 201 deletions(-)
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index fcc42353f125..a09b1d61df6a 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig @@ -52,6 +52,15 @@ config GENERIC_CLOCKEVENTS_MIN_ADJUST config GENERIC_CMOS_UPDATE bool +# Select to handle posix CPU timers from task_work +# and not from the timer interrupt context +config HAVE_POSIX_CPU_TIMERS_TASK_WORK + bool + +config POSIX_CPU_TIMERS_TASK_WORK + bool + default y if POSIX_TIMERS && HAVE_POSIX_CPU_TIMERS_TASK_WORK + if GENERIC_CLOCKEVENTS menu "Timers subsystem" diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 2ffb466af77e..ca223a89530a 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -192,7 +192,7 @@ static void alarmtimer_dequeue(struct alarm_base *base, struct alarm *alarm) * When a alarm timer fires, this runs through the timerqueue to * see which alarms expired, and runs those. If there are more alarm * timers queued for the future, we set the hrtimer to fire when - * when the next future alarm timer expires. + * the next future alarm timer expires. */ static enum hrtimer_restart alarmtimer_fired(struct hrtimer *timer) { diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index d89da1c7e005..c4038511d5c9 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -135,7 +135,11 @@ static const int hrtimer_clock_to_base_table[MAX_CLOCKS] = { * timer->base->cpu_base */ static struct hrtimer_cpu_base migration_cpu_base = { - .clock_base = { { .cpu_base = &migration_cpu_base, }, }, + .clock_base = { { + .cpu_base = &migration_cpu_base, + .seq = SEQCNT_RAW_SPINLOCK_ZERO(migration_cpu_base.seq, + &migration_cpu_base.lock), + }, }, }; #define migration_base migration_cpu_base.clock_base[0] @@ -1998,8 +2002,11 @@ int hrtimers_prepare_cpu(unsigned int cpu) int i; for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { - cpu_base->clock_base[i].cpu_base = cpu_base; - timerqueue_init_head(&cpu_base->clock_base[i].active); + struct hrtimer_clock_base *clock_b = &cpu_base->clock_base[i]; + + clock_b->cpu_base = cpu_base; + seqcount_raw_spinlock_init(&clock_b->seq, &cpu_base->lock); + timerqueue_init_head(&clock_b->active); } cpu_base->cpu = cpu; diff --git a/kernel/time/namespace.c b/kernel/time/namespace.c index 5d9fc22d836a..afc65e6be33e 100644 --- a/kernel/time/namespace.c +++ b/kernel/time/namespace.c @@ -280,11 +280,16 @@ static void timens_put(struct ns_common *ns) put_time_ns(to_time_ns(ns)); } +void timens_commit(struct task_struct *tsk, struct time_namespace *ns) +{ + timens_set_vvar_page(tsk, ns); + vdso_join_timens(tsk, ns); +} + static int timens_install(struct nsset *nsset, struct ns_common *new) { struct nsproxy *nsproxy = nsset->nsproxy; struct time_namespace *ns = to_time_ns(new); - int err; if (!current_is_single_threaded()) return -EUSERS; @@ -293,12 +298,6 @@ static int timens_install(struct nsset *nsset, struct ns_common *new) !ns_capable(nsset->cred->user_ns, CAP_SYS_ADMIN)) return -EPERM; - timens_set_vvar_page(current, ns); - - err = vdso_join_timens(current, ns); - if (err) - return err; - get_time_ns(ns); put_time_ns(nsproxy->time_ns); nsproxy->time_ns = ns; @@ -313,22 +312,17 @@ int timens_on_fork(struct nsproxy *nsproxy, struct task_struct *tsk) { struct ns_common *nsc = &nsproxy->time_ns_for_children->ns; struct time_namespace *ns = to_time_ns(nsc); - int err; /* create_new_namespaces() already incremented the ref counter */ if (nsproxy->time_ns == nsproxy->time_ns_for_children) return 0; - timens_set_vvar_page(tsk, ns); - - err = vdso_join_timens(tsk, 
ns); - if (err) - return err; - get_time_ns(ns); put_time_ns(nsproxy->time_ns); nsproxy->time_ns = ns; + timens_commit(tsk, ns); + return 0; } diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 165117996ea0..a71758e34e45 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -377,6 +377,7 @@ static int posix_cpu_clock_get(const clockid_t clock, struct timespec64 *tp) */ static int posix_cpu_timer_create(struct k_itimer *new_timer) { + static struct lock_class_key posix_cpu_timers_key; struct pid *pid; rcu_read_lock(); @@ -386,6 +387,17 @@ static int posix_cpu_timer_create(struct k_itimer *new_timer) return -EINVAL; } + /* + * If posix timer expiry is handled in task work context then + * timer::it_lock can be taken without disabling interrupts as all + * other locking happens in task context. This requires a seperate + * lock class key otherwise regular posix timer expiry would record + * the lock class being taken in interrupt context and generate a + * false positive warning. + */ + if (IS_ENABLED(CONFIG_POSIX_CPU_TIMERS_TASK_WORK)) + lockdep_set_class(&new_timer->it_lock, &posix_cpu_timers_key); + new_timer->kclock = &clock_posix_cpu; timerqueue_init(&new_timer->it.cpu.node); new_timer->it.cpu.pid = get_pid(pid); @@ -1080,43 +1092,163 @@ static inline bool fastpath_timer_check(struct task_struct *tsk) return false; } +static void handle_posix_cpu_timers(struct task_struct *tsk); + +#ifdef CONFIG_POSIX_CPU_TIMERS_TASK_WORK +static void posix_cpu_timers_work(struct callback_head *work) +{ + handle_posix_cpu_timers(current); +} + /* - * This is called from the timer interrupt handler. The irq handler has - * already updated our counts. We need to check if any timers fire now. - * Interrupts are disabled. + * Initialize posix CPU timers task work in init task. Out of line to + * keep the callback static and to avoid header recursion hell. */ -void run_posix_cpu_timers(void) +void __init posix_cputimers_init_work(void) { - struct task_struct *tsk = current; - struct k_itimer *timer, *next; - unsigned long flags; - LIST_HEAD(firing); + init_task_work(¤t->posix_cputimers_work.work, + posix_cpu_timers_work); +} - lockdep_assert_irqs_disabled(); +/* + * Note: All operations on tsk->posix_cputimer_work.scheduled happen either + * in hard interrupt context or in task context with interrupts + * disabled. Aside of that the writer/reader interaction is always in the + * context of the current task, which means they are strict per CPU. + */ +static inline bool posix_cpu_timers_work_scheduled(struct task_struct *tsk) +{ + return tsk->posix_cputimers_work.scheduled; +} - /* - * The fast path checks that there are no expired thread or thread - * group timers. If that's so, just return. - */ - if (!fastpath_timer_check(tsk)) +static inline void __run_posix_cpu_timers(struct task_struct *tsk) +{ + if (WARN_ON_ONCE(tsk->posix_cputimers_work.scheduled)) return; - lockdep_posixtimer_enter(); - if (!lock_task_sighand(tsk, &flags)) { - lockdep_posixtimer_exit(); - return; + /* Schedule task work to actually expire the timers */ + tsk->posix_cputimers_work.scheduled = true; + task_work_add(tsk, &tsk->posix_cputimers_work.work, TWA_RESUME); +} + +static inline bool posix_cpu_timers_enable_work(struct task_struct *tsk, + unsigned long start) +{ + bool ret = true; + + /* + * On !RT kernels interrupts are disabled while collecting expired + * timers, so no tick can happen and the fast path check can be + * reenabled without further checks. 
+ */ + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) { + tsk->posix_cputimers_work.scheduled = false; + return true; } + /* - * Here we take off tsk->signal->cpu_timers[N] and - * tsk->cpu_timers[N] all the timers that are firing, and - * put them on the firing list. + * On RT enabled kernels ticks can happen while the expired timers + * are collected under sighand lock. But any tick which observes + * the CPUTIMERS_WORK_SCHEDULED bit set, does not run the fastpath + * checks. So reenabling the tick work has do be done carefully: + * + * Disable interrupts and run the fast path check if jiffies have + * advanced since the collecting of expired timers started. If + * jiffies have not advanced or the fast path check did not find + * newly expired timers, reenable the fast path check in the timer + * interrupt. If there are newly expired timers, return false and + * let the collection loop repeat. */ - check_thread_timers(tsk, &firing); + local_irq_disable(); + if (start != jiffies && fastpath_timer_check(tsk)) + ret = false; + else + tsk->posix_cputimers_work.scheduled = false; + local_irq_enable(); + + return ret; +} +#else /* CONFIG_POSIX_CPU_TIMERS_TASK_WORK */ +static inline void __run_posix_cpu_timers(struct task_struct *tsk) +{ + lockdep_posixtimer_enter(); + handle_posix_cpu_timers(tsk); + lockdep_posixtimer_exit(); +} + +static inline bool posix_cpu_timers_work_scheduled(struct task_struct *tsk) +{ + return false; +} + +static inline bool posix_cpu_timers_enable_work(struct task_struct *tsk, + unsigned long start) +{ + return true; +} +#endif /* CONFIG_POSIX_CPU_TIMERS_TASK_WORK */ + +static void handle_posix_cpu_timers(struct task_struct *tsk) +{ + struct k_itimer *timer, *next; + unsigned long flags, start; + LIST_HEAD(firing); + + if (!lock_task_sighand(tsk, &flags)) + return; - check_process_timers(tsk, &firing); + do { + /* + * On RT locking sighand lock does not disable interrupts, + * so this needs to be careful vs. ticks. Store the current + * jiffies value. + */ + start = READ_ONCE(jiffies); + barrier(); + + /* + * Here we take off tsk->signal->cpu_timers[N] and + * tsk->cpu_timers[N] all the timers that are firing, and + * put them on the firing list. + */ + check_thread_timers(tsk, &firing); + + check_process_timers(tsk, &firing); + + /* + * The above timer checks have updated the exipry cache and + * because nothing can have queued or modified timers after + * sighand lock was taken above it is guaranteed to be + * consistent. So the next timer interrupt fastpath check + * will find valid data. + * + * If timer expiry runs in the timer interrupt context then + * the loop is not relevant as timers will be directly + * expired in interrupt context. The stub function below + * returns always true which allows the compiler to + * optimize the loop out. + * + * If timer expiry is deferred to task work context then + * the following rules apply: + * + * - On !RT kernels no tick can have happened on this CPU + * after sighand lock was acquired because interrupts are + * disabled. So reenabling task work before dropping + * sighand lock and reenabling interrupts is race free. + * + * - On RT kernels ticks might have happened but the tick + * work ignored posix CPU timer handling because the + * CPUTIMERS_WORK_SCHEDULED bit is set. Reenabling work + * must be done very carefully including a check whether + * ticks have happened since the start of the timer + * expiry checks. posix_cpu_timers_enable_work() takes + * care of that and eventually lets the expiry checks + * run again. 
+ */ + } while (!posix_cpu_timers_enable_work(tsk, start)); /* - * We must release these locks before taking any timer's lock. + * We must release sighand lock before taking any timer's lock. * There is a potential race with timer deletion here, as the * siglock now protects our private firing list. We have set * the firing flag in each timer, so that a deletion attempt @@ -1134,6 +1266,13 @@ void run_posix_cpu_timers(void) list_for_each_entry_safe(timer, next, &firing, it.cpu.elist) { int cpu_firing; + /* + * spin_lock() is sufficient here even independent of the + * expiry context. If expiry happens in hard interrupt + * context it's obvious. For task work context it's safe + * because all other operations on timer::it_lock happen in + * task context (syscall or exit). + */ spin_lock(&timer->it_lock); list_del_init(&timer->it.cpu.elist); cpu_firing = timer->it.cpu.firing; @@ -1147,7 +1286,34 @@ void run_posix_cpu_timers(void) cpu_timer_fire(timer); spin_unlock(&timer->it_lock); } - lockdep_posixtimer_exit(); +} + +/* + * This is called from the timer interrupt handler. The irq handler has + * already updated our counts. We need to check if any timers fire now. + * Interrupts are disabled. + */ +void run_posix_cpu_timers(void) +{ + struct task_struct *tsk = current; + + lockdep_assert_irqs_disabled(); + + /* + * If the actual expiry is deferred to task work context and the + * work is already scheduled there is no point to do anything here. + */ + if (posix_cpu_timers_work_scheduled(tsk)) + return; + + /* + * The fast path checks that there are no expired thread or thread + * group timers. If that's so, just return. + */ + if (!fastpath_timer_check(tsk)) + return; + + __run_posix_cpu_timers(tsk); } /* diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c index 0deaf4b79fb4..1c03eec6ca9b 100644 --- a/kernel/time/sched_clock.c +++ b/kernel/time/sched_clock.c @@ -229,7 +229,7 @@ void __init generic_sched_clock_init(void) { /* * If no sched_clock() function has been provided at that point, - * make it the final one one. + * make it the final one. */ if (cd.actual_read_sched_clock == jiffy_sched_clock_read) sched_clock_register(jiffy_sched_clock_read, BITS_PER_LONG, HZ); diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 63a632f9896c..4c47f388a83f 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -39,18 +39,19 @@ enum timekeeping_adv_mode { TK_ADV_FREQ }; +DEFINE_RAW_SPINLOCK(timekeeper_lock); + /* * The most important data for readout fits into a single 64 byte * cache line. */ static struct { - seqcount_t seq; + seqcount_raw_spinlock_t seq; struct timekeeper timekeeper; } tk_core ____cacheline_aligned = { - .seq = SEQCNT_ZERO(tk_core.seq), + .seq = SEQCNT_RAW_SPINLOCK_ZERO(tk_core.seq, &timekeeper_lock), }; -static DEFINE_RAW_SPINLOCK(timekeeper_lock); static struct timekeeper shadow_timekeeper; /** @@ -63,7 +64,7 @@ static struct timekeeper shadow_timekeeper; * See @update_fast_timekeeper() below. 
*/ struct tk_fast { - seqcount_t seq; + seqcount_raw_spinlock_t seq; struct tk_read_base base[2]; }; @@ -80,11 +81,13 @@ static struct clocksource dummy_clock = { }; static struct tk_fast tk_fast_mono ____cacheline_aligned = { + .seq = SEQCNT_RAW_SPINLOCK_ZERO(tk_fast_mono.seq, &timekeeper_lock), .base[0] = { .clock = &dummy_clock, }, .base[1] = { .clock = &dummy_clock, }, }; static struct tk_fast tk_fast_raw ____cacheline_aligned = { + .seq = SEQCNT_RAW_SPINLOCK_ZERO(tk_fast_raw.seq, &timekeeper_lock), .base[0] = { .clock = &dummy_clock, }, .base[1] = { .clock = &dummy_clock, }, }; @@ -157,7 +160,7 @@ static inline void tk_update_sleep_time(struct timekeeper *tk, ktime_t delta) * tk_clock_read - atomic clocksource read() helper * * This helper is necessary to use in the read paths because, while the - * seqlock ensures we don't return a bad value while structures are updated, + * seqcount ensures we don't return a bad value while structures are updated, * it doesn't protect from potential crashes. There is the possibility that * the tkr's clocksource may change between the read reference, and the * clock reference passed to the read function. This can cause crashes if @@ -222,10 +225,10 @@ static inline u64 timekeeping_get_delta(const struct tk_read_base *tkr) unsigned int seq; /* - * Since we're called holding a seqlock, the data may shift + * Since we're called holding a seqcount, the data may shift * under us while we're doing the calculation. This can cause * false positives, since we'd note a problem but throw the - * results away. So nest another seqlock here to atomically + * results away. So nest another seqcount here to atomically * grab the points we are checking with. */ do { @@ -486,7 +489,7 @@ EXPORT_SYMBOL_GPL(ktime_get_raw_fast_ns); * * To keep it NMI safe since we're accessing from tracing, we're not using a * separate timekeeper with updates to monotonic clock and boot offset - * protected with seqlocks. This has the following minor side effects: + * protected with seqcounts. This has the following minor side effects: * * (1) Its possible that a timestamp be taken after the boot offset is updated * but before the timekeeper is updated. If this happens, the new boot offset @@ -2001,7 +2004,7 @@ static inline unsigned int accumulate_nsecs_to_secs(struct timekeeper *tk) * logarithmic_accumulation - shifted accumulation of cycles * * This functions accumulates a shifted interval of cycles into - * into a shifted interval nanoseconds. Allows for O(log) accumulation + * a shifted interval nanoseconds. Allows for O(log) accumulation * loop. * * Returns the unconsumed cycles. diff --git a/kernel/time/timekeeping_internal.h b/kernel/time/timekeeping_internal.h index bcbb52db2256..4ca2787d1642 100644 --- a/kernel/time/timekeeping_internal.h +++ b/kernel/time/timekeeping_internal.h @@ -1,12 +1,14 @@ /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _TIMEKEEPING_INTERNAL_H #define _TIMEKEEPING_INTERNAL_H -/* - * timekeeping debug functions - */ + #include <linux/clocksource.h> +#include <linux/spinlock.h> #include <linux/time.h> +/* + * timekeeping debug functions + */ #ifdef CONFIG_DEBUG_FS extern void tk_debug_account_sleep_time(const struct timespec64 *t); #else @@ -31,4 +33,7 @@ static inline u64 clocksource_delta(u64 now, u64 last, u64 mask) } #endif +/* Semi public for serialization of non timekeeper VDSO updates. 
*/ +extern raw_spinlock_t timekeeper_lock; + #endif /* _TIMEKEEPING_INTERNAL_H */ diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 026ac01af9da..a16764b0116e 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -157,7 +157,8 @@ EXPORT_SYMBOL(jiffies_64); /* * The time start value for each level to select the bucket at enqueue - * time. + * time. We start from the last possible delta of the previous level + * so that we can later add an extra LVL_GRAN(n) to n (see calc_index()). */ #define LVL_START(n) ((LVL_SIZE - 1) << (((n) - 1) * LVL_CLK_SHIFT)) @@ -204,8 +205,8 @@ struct timer_base { unsigned long clk; unsigned long next_expiry; unsigned int cpu; + bool next_expiry_recalc; bool is_idle; - bool must_forward_clk; DECLARE_BITMAP(pending_map, WHEEL_SIZE); struct hlist_head vectors[WHEEL_SIZE]; } ____cacheline_aligned; @@ -488,35 +489,48 @@ static inline void timer_set_idx(struct timer_list *timer, unsigned int idx) * Helper function to calculate the array index for a given expiry * time. */ -static inline unsigned calc_index(unsigned expires, unsigned lvl) +static inline unsigned calc_index(unsigned long expires, unsigned lvl, + unsigned long *bucket_expiry) { + + /* + * The timer wheel has to guarantee that a timer does not fire + * early. Early expiry can happen due to: + * - Timer is armed at the edge of a tick + * - Truncation of the expiry time in the outer wheel levels + * + * Round up with level granularity to prevent this. + */ expires = (expires + LVL_GRAN(lvl)) >> LVL_SHIFT(lvl); + *bucket_expiry = expires << LVL_SHIFT(lvl); return LVL_OFFS(lvl) + (expires & LVL_MASK); } -static int calc_wheel_index(unsigned long expires, unsigned long clk) +static int calc_wheel_index(unsigned long expires, unsigned long clk, + unsigned long *bucket_expiry) { unsigned long delta = expires - clk; unsigned int idx; if (delta < LVL_START(1)) { - idx = calc_index(expires, 0); + idx = calc_index(expires, 0, bucket_expiry); } else if (delta < LVL_START(2)) { - idx = calc_index(expires, 1); + idx = calc_index(expires, 1, bucket_expiry); } else if (delta < LVL_START(3)) { - idx = calc_index(expires, 2); + idx = calc_index(expires, 2, bucket_expiry); } else if (delta < LVL_START(4)) { - idx = calc_index(expires, 3); + idx = calc_index(expires, 3, bucket_expiry); } else if (delta < LVL_START(5)) { - idx = calc_index(expires, 4); + idx = calc_index(expires, 4, bucket_expiry); } else if (delta < LVL_START(6)) { - idx = calc_index(expires, 5); + idx = calc_index(expires, 5, bucket_expiry); } else if (delta < LVL_START(7)) { - idx = calc_index(expires, 6); + idx = calc_index(expires, 6, bucket_expiry); } else if (LVL_DEPTH > 8 && delta < LVL_START(8)) { - idx = calc_index(expires, 7); + idx = calc_index(expires, 7, bucket_expiry); } else if ((long) delta < 0) { idx = clk & LVL_MASK; + *bucket_expiry = clk; } else { /* * Force expire obscene large timeouts to expire at the @@ -525,34 +539,11 @@ static int calc_wheel_index(unsigned long expires, unsigned long clk) if (delta >= WHEEL_TIMEOUT_CUTOFF) expires = clk + WHEEL_TIMEOUT_MAX; - idx = calc_index(expires, LVL_DEPTH - 1); + idx = calc_index(expires, LVL_DEPTH - 1, bucket_expiry); } return idx; } -/* - * Enqueue the timer into the hash bucket, mark it pending in - * the bitmap and store the index in the timer flags. 
- */ -static void enqueue_timer(struct timer_base *base, struct timer_list *timer, - unsigned int idx) -{ - hlist_add_head(&timer->entry, base->vectors + idx); - __set_bit(idx, base->pending_map); - timer_set_idx(timer, idx); - - trace_timer_start(timer, timer->expires, timer->flags); -} - -static void -__internal_add_timer(struct timer_base *base, struct timer_list *timer) -{ - unsigned int idx; - - idx = calc_wheel_index(timer->expires, base->clk); - enqueue_timer(base, timer, idx); -} - static void trigger_dyntick_cpu(struct timer_base *base, struct timer_list *timer) { @@ -574,34 +565,48 @@ trigger_dyntick_cpu(struct timer_base *base, struct timer_list *timer) * timer is not deferrable. If the other CPU is on the way to idle * then it can't set base->is_idle as we hold the base lock: */ - if (!base->is_idle) - return; + if (base->is_idle) + wake_up_nohz_cpu(base->cpu); +} - /* Check whether this is the new first expiring timer: */ - if (time_after_eq(timer->expires, base->next_expiry)) - return; +/* + * Enqueue the timer into the hash bucket, mark it pending in + * the bitmap, store the index in the timer flags then wake up + * the target CPU if needed. + */ +static void enqueue_timer(struct timer_base *base, struct timer_list *timer, + unsigned int idx, unsigned long bucket_expiry) +{ + + hlist_add_head(&timer->entry, base->vectors + idx); + __set_bit(idx, base->pending_map); + timer_set_idx(timer, idx); + + trace_timer_start(timer, timer->expires, timer->flags); /* - * Set the next expiry time and kick the CPU so it can reevaluate the - * wheel: + * Check whether this is the new first expiring timer. The + * effective expiry time of the timer is required here + * (bucket_expiry) instead of timer->expires. */ - if (time_before(timer->expires, base->clk)) { + if (time_before(bucket_expiry, base->next_expiry)) { /* - * Prevent from forward_timer_base() moving the base->clk - * backward + * Set the next expiry time and kick the CPU so it + * can reevaluate the wheel: */ - base->next_expiry = base->clk; - } else { - base->next_expiry = timer->expires; + base->next_expiry = bucket_expiry; + base->next_expiry_recalc = false; + trigger_dyntick_cpu(base, timer); } - wake_up_nohz_cpu(base->cpu); } -static void -internal_add_timer(struct timer_base *base, struct timer_list *timer) +static void internal_add_timer(struct timer_base *base, struct timer_list *timer) { - __internal_add_timer(base, timer); - trigger_dyntick_cpu(base, timer); + unsigned long bucket_expiry; + unsigned int idx; + + idx = calc_wheel_index(timer->expires, base->clk, &bucket_expiry); + enqueue_timer(base, timer, idx, bucket_expiry); } #ifdef CONFIG_DEBUG_OBJECTS_TIMERS @@ -834,8 +839,10 @@ static int detach_if_pending(struct timer_list *timer, struct timer_base *base, if (!timer_pending(timer)) return 0; - if (hlist_is_singular_node(&timer->entry, base->vectors + idx)) + if (hlist_is_singular_node(&timer->entry, base->vectors + idx)) { __clear_bit(idx, base->pending_map); + base->next_expiry_recalc = true; + } detach_timer(timer, clear_pending); return 1; @@ -885,20 +892,14 @@ get_target_base(struct timer_base *base, unsigned tflags) static inline void forward_timer_base(struct timer_base *base) { -#ifdef CONFIG_NO_HZ_COMMON - unsigned long jnow; + unsigned long jnow = READ_ONCE(jiffies); /* - * We only forward the base when we are idle or have just come out of - * idle (must_forward_clk logic), and have a delta between base clock - * and jiffies. In the common case, run_timers will take care of it. 
+ * No need to forward if we are close enough below jiffies. + * Also while executing timers, base->clk is 1 offset ahead + * of jiffies to avoid endless requeuing to current jffies. */ - if (likely(!base->must_forward_clk)) - return; - - jnow = READ_ONCE(jiffies); - base->must_forward_clk = base->is_idle; - if ((long)(jnow - base->clk) < 2) + if ((long)(jnow - base->clk) < 1) return; /* @@ -912,7 +913,6 @@ static inline void forward_timer_base(struct timer_base *base) return; base->clk = base->next_expiry; } -#endif } @@ -960,9 +960,9 @@ static struct timer_base *lock_timer_base(struct timer_list *timer, static inline int __mod_timer(struct timer_list *timer, unsigned long expires, unsigned int options) { + unsigned long clk = 0, flags, bucket_expiry; struct timer_base *base, *new_base; unsigned int idx = UINT_MAX; - unsigned long clk = 0, flags; int ret = 0; BUG_ON(!timer->function); @@ -1001,7 +1001,7 @@ __mod_timer(struct timer_list *timer, unsigned long expires, unsigned int option } clk = base->clk; - idx = calc_wheel_index(expires, clk); + idx = calc_wheel_index(expires, clk, &bucket_expiry); /* * Retrieve and compare the array index of the pending @@ -1054,16 +1054,13 @@ __mod_timer(struct timer_list *timer, unsigned long expires, unsigned int option /* * If 'idx' was calculated above and the base time did not advance * between calculating 'idx' and possibly switching the base, only - * enqueue_timer() and trigger_dyntick_cpu() is required. Otherwise - * we need to (re)calculate the wheel index via - * internal_add_timer(). + * enqueue_timer() is required. Otherwise we need to (re)calculate + * the wheel index via internal_add_timer(). */ - if (idx != UINT_MAX && clk == base->clk) { - enqueue_timer(base, timer, idx); - trigger_dyntick_cpu(base, timer); - } else { + if (idx != UINT_MAX && clk == base->clk) + enqueue_timer(base, timer, idx, bucket_expiry); + else internal_add_timer(base, timer); - } out_unlock: raw_spin_unlock_irqrestore(&base->lock, flags); @@ -1466,10 +1463,10 @@ static void expire_timers(struct timer_base *base, struct hlist_head *head) } } -static int __collect_expired_timers(struct timer_base *base, - struct hlist_head *heads) +static int collect_expired_timers(struct timer_base *base, + struct hlist_head *heads) { - unsigned long clk = base->clk; + unsigned long clk = base->clk = base->next_expiry; struct hlist_head *vec; int i, levels = 0; unsigned int idx; @@ -1491,7 +1488,6 @@ static int __collect_expired_timers(struct timer_base *base, return levels; } -#ifdef CONFIG_NO_HZ_COMMON /* * Find the next pending bucket of a level. Search from level start (@offset) * + @clk upwards and if nothing there, search from start of the level @@ -1524,6 +1520,7 @@ static unsigned long __next_timer_interrupt(struct timer_base *base) clk = base->clk; for (lvl = 0; lvl < LVL_DEPTH; lvl++, offset += LVL_SIZE) { int pos = next_pending_bucket(base, offset, clk & LVL_MASK); + unsigned long lvl_clk = clk & LVL_CLK_MASK; if (pos >= 0) { unsigned long tmp = clk + (unsigned long) pos; @@ -1531,6 +1528,13 @@ static unsigned long __next_timer_interrupt(struct timer_base *base) tmp <<= LVL_SHIFT(lvl); if (time_before(tmp, next)) next = tmp; + + /* + * If the next expiration happens before we reach + * the next level, no need to check further. + */ + if (pos <= ((LVL_CLK_DIV - lvl_clk) & LVL_CLK_MASK)) + break; } /* * Clock for the next level. 
If the current level clock lower @@ -1568,13 +1572,17 @@ static unsigned long __next_timer_interrupt(struct timer_base *base) * So the simple check whether the lower bits of the current * level are 0 or not is sufficient for all cases. */ - adj = clk & LVL_CLK_MASK ? 1 : 0; + adj = lvl_clk ? 1 : 0; clk >>= LVL_CLK_SHIFT; clk += adj; } + + base->next_expiry_recalc = false; + return next; } +#ifdef CONFIG_NO_HZ_COMMON /* * Check, if the next hrtimer event is before the next timer wheel * event: @@ -1631,9 +1639,11 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem) return expires; raw_spin_lock(&base->lock); - nextevt = __next_timer_interrupt(base); + if (base->next_expiry_recalc) + base->next_expiry = __next_timer_interrupt(base); + nextevt = base->next_expiry; is_max_delta = (nextevt == base->clk + NEXT_TIMER_MAX_DELTA); - base->next_expiry = nextevt; + /* * We have a fresh next event. Check whether we can forward the * base. We can only do that when @basej is past base->clk @@ -1659,10 +1669,8 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem) * logic is only maintained for the BASE_STD base, deferrable * timers may still see large granularity skew (by design). */ - if ((expires - basem) > TICK_NSEC) { - base->must_forward_clk = true; + if ((expires - basem) > TICK_NSEC) base->is_idle = true; - } } raw_spin_unlock(&base->lock); @@ -1686,42 +1694,6 @@ void timer_clear_idle(void) */ base->is_idle = false; } - -static int collect_expired_timers(struct timer_base *base, - struct hlist_head *heads) -{ - unsigned long now = READ_ONCE(jiffies); - - /* - * NOHZ optimization. After a long idle sleep we need to forward the - * base to current jiffies. Avoid a loop by searching the bitfield for - * the next expiring timer. - */ - if ((long)(now - base->clk) > 2) { - unsigned long next = __next_timer_interrupt(base); - - /* - * If the next timer is ahead of time forward to current - * jiffies, otherwise forward to the next expiry time: - */ - if (time_after(next, now)) { - /* - * The call site will increment base->clk and then - * terminate the expiry loop immediately. - */ - base->clk = now; - return 0; - } - base->clk = next; - } - return __collect_expired_timers(base, heads); -} -#else -static inline int collect_expired_timers(struct timer_base *base, - struct hlist_head *heads) -{ - return __collect_expired_timers(base, heads); -} #endif /* @@ -1761,32 +1733,23 @@ static inline void __run_timers(struct timer_base *base) struct hlist_head heads[LVL_DEPTH]; int levels; - if (!time_after_eq(jiffies, base->clk)) + if (time_before(jiffies, base->next_expiry)) return; timer_base_lock_expiry(base); raw_spin_lock_irq(&base->lock); - /* - * timer_base::must_forward_clk must be cleared before running - * timers so that any timer functions that call mod_timer() will - * not try to forward the base. Idle tracking / clock forwarding - * logic is only used with BASE_STD timers. - * - * The must_forward_clk flag is cleared unconditionally also for - * the deferrable base. The deferrable base is not affected by idle - * tracking and never forwarded, so clearing the flag is a NOOP. - * - * The fact that the deferrable base is never forwarded can cause - * large variations in granularity for deferrable timers, but they - * can be deferred for long periods due to idle anyway. 
- */ - base->must_forward_clk = false; - - while (time_after_eq(jiffies, base->clk)) { - + while (time_after_eq(jiffies, base->clk) && + time_after_eq(jiffies, base->next_expiry)) { levels = collect_expired_timers(base, heads); + /* + * The only possible reason for not finding any expired + * timer at this clk is that all matching timers have been + * dequeued. + */ + WARN_ON_ONCE(!levels && !base->next_expiry_recalc); base->clk++; + base->next_expiry = __next_timer_interrupt(base); while (levels--) expire_timers(base, heads + levels); @@ -1816,12 +1779,12 @@ void run_local_timers(void) hrtimer_run_queues(); /* Raise the softirq only if required. */ - if (time_before(jiffies, base->clk)) { + if (time_before(jiffies, base->next_expiry)) { if (!IS_ENABLED(CONFIG_NO_HZ_COMMON)) return; /* CPU is awake, so check the deferrable base. */ base++; - if (time_before(jiffies, base->clk)) + if (time_before(jiffies, base->next_expiry)) return; } raise_softirq(TIMER_SOFTIRQ); @@ -1986,7 +1949,6 @@ int timers_prepare_cpu(unsigned int cpu) base->clk = jiffies; base->next_expiry = base->clk + NEXT_TIMER_MAX_DELTA; base->is_idle = false; - base->must_forward_clk = true; } return 0; } @@ -2039,6 +2001,7 @@ static void __init init_timer_cpu(int cpu) base->cpu = cpu; raw_spin_lock_init(&base->lock); base->clk = jiffies; + base->next_expiry = base->clk + NEXT_TIMER_MAX_DELTA; timer_base_init_expiry_lock(base); } } @@ -2054,6 +2017,7 @@ static void __init init_timer_cpus(void) void __init init_timers(void) { init_timer_cpus(); + posix_cputimers_init_work(); open_softirq(TIMER_SOFTIRQ, run_timer_softirq); } diff --git a/kernel/time/vsyscall.c b/kernel/time/vsyscall.c index 54ce6eb2ca36..88e6b8ed6ca5 100644 --- a/kernel/time/vsyscall.c +++ b/kernel/time/vsyscall.c @@ -13,6 +13,8 @@ #include <vdso/helpers.h> #include <vdso/vsyscall.h> +#include "timekeeping_internal.h" + static inline void update_vdso_data(struct vdso_data *vdata, struct timekeeper *tk) { @@ -127,3 +129,42 @@ void update_vsyscall_tz(void) __arch_sync_vdso_data(vdata); } + +/** + * vdso_update_begin - Start of a VDSO update section + * + * Allows architecture code to safely update the architecture specific VDSO + * data. Disables interrupts, acquires timekeeper lock to serialize against + * concurrent updates from timekeeping and invalidates the VDSO data + * sequence counter to prevent concurrent readers from accessing + * inconsistent data. + * + * Returns: Saved interrupt flags which need to be handed in to + * vdso_update_end(). + */ +unsigned long vdso_update_begin(void) +{ + struct vdso_data *vdata = __arch_get_k_vdso_data(); + unsigned long flags; + + raw_spin_lock_irqsave(&timekeeper_lock, flags); + vdso_write_begin(vdata); + return flags; +} + +/** + * vdso_update_end - End of a VDSO update section + * @flags: Interrupt flags as returned from vdso_update_begin() + * + * Pairs with vdso_update_begin(). Marks vdso data consistent, invokes data + * synchronization if the architecture requires it, drops timekeeper lock + * and restores interrupt flags. + */ +void vdso_update_end(unsigned long flags) +{ + struct vdso_data *vdata = __arch_get_k_vdso_data(); + + vdso_write_end(vdata); + __arch_sync_vdso_data(vdata); + raw_spin_unlock_irqrestore(&timekeeper_lock, flags); +} |
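The kernel-doc for vdso_update_begin() and vdso_update_end() at the end of the vsyscall.c hunk describes how architecture code is expected to bracket its own VDSO data updates. A minimal sketch of such a caller, assuming the helpers are reachable via <vdso/vsyscall.h> and using a hypothetical arch_value field in struct vdso_data to stand in for real architecture-specific data:

#include <vdso/datapage.h>
#include <vdso/vsyscall.h>

/*
 * Sketch only: arch_vdso_refresh() and the arch_value field are
 * hypothetical. The bracketing follows the vdso_update_begin()/
 * vdso_update_end() contract added in kernel/time/vsyscall.c above.
 */
void arch_vdso_refresh(u64 new_value)
{
	struct vdso_data *vdata = __arch_get_k_vdso_data();
	unsigned long flags;

	/* Serialize against timekeeping and invalidate the VDSO seqcount */
	flags = vdso_update_begin();

	vdata->arch_value = new_value;	/* hypothetical arch-specific field */

	/* Revalidate the seqcount, sync arch data, restore interrupts */
	vdso_update_end(flags);
}

Between the two calls, vDSO readers spin on the invalidated sequence counter, so they never observe a half-written update.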
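Further up, the posix-cpu-timers.c changes move timer expiry out of hard interrupt context into task_work scheduled with TWA_RESUME, so the heavy lifting runs when the task returns to user space. A stripped-down sketch of that general deferral pattern, using hypothetical names rather than the real posix_cputimers_work plumbing:

#include <linux/sched.h>
#include <linux/task_work.h>

/* Hypothetical per-task state mirroring struct posix_cputimers_work */
struct deferred_expiry {
	struct callback_head	work;
	bool			scheduled;
};

/* Runs in task context on the way back to user space */
static void deferred_expiry_fn(struct callback_head *work)
{
	struct deferred_expiry *de =
		container_of(work, struct deferred_expiry, work);

	/* Expensive expiry processing; locks can be taken without irq-off */

	/* Re-arm: allow the interrupt path to queue the work again */
	de->scheduled = false;
}

/* Called from the timer interrupt with interrupts disabled */
static void queue_deferred_expiry(struct task_struct *tsk,
				  struct deferred_expiry *de)
{
	/* Already queued: the pending work will pick up the new state */
	if (de->scheduled)
		return;

	de->scheduled = true;
	init_task_work(&de->work, deferred_expiry_fn);
	/* TWA_RESUME: invoke the callback before returning to user space */
	task_work_add(tsk, &de->work, TWA_RESUME);
}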