Diffstat (limited to 'kernel/time/timekeeping.c')
-rw-r--r--	kernel/time/timekeeping.c	649
1 file changed, 323 insertions(+), 326 deletions(-)
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 7e6f409bf311..0ca85ff4fbb4 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -30,8 +30,9 @@
#include "timekeeping_internal.h"
#define TK_CLEAR_NTP (1 << 0)
-#define TK_MIRROR (1 << 1)
-#define TK_CLOCK_WAS_SET (1 << 2)
+#define TK_CLOCK_WAS_SET (1 << 1)
+
+#define TK_UPDATE_ALL (TK_CLEAR_NTP | TK_CLOCK_WAS_SET)
enum timekeeping_adv_mode {
/* Update timekeeper when a tick has passed */
@@ -41,20 +42,18 @@ enum timekeeping_adv_mode {
TK_ADV_FREQ
};
-DEFINE_RAW_SPINLOCK(timekeeper_lock);
-
/*
* The most important data for readout fits into a single 64 byte
* cache line.
*/
-static struct {
+struct tk_data {
seqcount_raw_spinlock_t seq;
struct timekeeper timekeeper;
-} tk_core ____cacheline_aligned = {
- .seq = SEQCNT_RAW_SPINLOCK_ZERO(tk_core.seq, &timekeeper_lock),
-};
+ struct timekeeper shadow_timekeeper;
+ raw_spinlock_t lock;
+} ____cacheline_aligned;
-static struct timekeeper shadow_timekeeper;
+static struct tk_data tk_core;
/* flag for if timekeeping is suspended */
int __read_mostly timekeeping_suspended;
@@ -114,6 +113,36 @@ static struct tk_fast tk_fast_raw ____cacheline_aligned = {
.base[1] = FAST_TK_INIT,
};
+unsigned long timekeeper_lock_irqsave(void)
+{
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&tk_core.lock, flags);
+ return flags;
+}
+
+void timekeeper_unlock_irqrestore(unsigned long flags)
+{
+ raw_spin_unlock_irqrestore(&tk_core.lock, flags);
+}
+
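
For other timekeeping core code that needs the writer lock, the two helpers above hide tk_core.lock behind the usual save/restore pattern (they are presumably exposed to the rest of the timekeeping core via timekeeping_internal.h). A minimal usage sketch; the caller is hypothetical and not part of this patch:

	unsigned long flags;

	flags = timekeeper_lock_irqsave();
	/* ... modify timekeeping state that requires the writer lock ... */
	timekeeper_unlock_irqrestore(flags);
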
+/*
+ * Multigrain timestamps require tracking the latest fine-grained timestamp
+ * that has been issued, and never returning a coarse-grained timestamp that is
+ * earlier than that value.
+ *
+ * mg_floor represents the latest fine-grained time that has been handed out as
+ * a file timestamp on the system. This is tracked as a monotonic ktime_t, and
+ * converted to a realtime clock value on an as-needed basis.
+ *
+ * Maintaining mg_floor ensures the multigrain interfaces never issue a
+ * timestamp earlier than one that has been previously issued.
+ *
+ * The exception to this rule is when there is a backward realtime clock jump. If
+ * such an event occurs, a timestamp can appear to be earlier than a previous one.
+ */
+static __cacheline_aligned_in_smp atomic64_t mg_floor;
+
static inline void tk_normalize_xtime(struct timekeeper *tk)
{
while (tk->tkr_mono.xtime_nsec >= ((u64)NSEC_PER_SEC << tk->tkr_mono.shift)) {
@@ -161,13 +190,15 @@ static void tk_set_wall_to_mono(struct timekeeper *tk, struct timespec64 wtm)
WARN_ON_ONCE(tk->offs_real != timespec64_to_ktime(tmp));
tk->wall_to_monotonic = wtm;
set_normalized_timespec64(&tmp, -wtm.tv_sec, -wtm.tv_nsec);
- tk->offs_real = timespec64_to_ktime(tmp);
- tk->offs_tai = ktime_add(tk->offs_real, ktime_set(tk->tai_offset, 0));
+ /* Paired with READ_ONCE() in ktime_mono_to_any() */
+ WRITE_ONCE(tk->offs_real, timespec64_to_ktime(tmp));
+ WRITE_ONCE(tk->offs_tai, ktime_add(tk->offs_real, ktime_set(tk->tai_offset, 0)));
}
static inline void tk_update_sleep_time(struct timekeeper *tk, ktime_t delta)
{
- tk->offs_boot = ktime_add(tk->offs_boot, delta);
+ /* Paired with READ_ONCE() in ktime_mono_to_any() */
+ WRITE_ONCE(tk->offs_boot, ktime_add(tk->offs_boot, delta));
/*
* Timespec representation for VDSO update to avoid 64bit division
* on every update.
@@ -184,7 +215,7 @@ static inline void tk_update_sleep_time(struct timekeeper *tk, ktime_t delta)
* the tkr's clocksource may change between the read reference, and the
* clock reference passed to the read function. This can cause crashes if
* the wrong clocksource is passed to the wrong read function.
- * This isn't necessary to use when holding the timekeeper_lock or doing
+ * This isn't necessary to use when holding the tk_core.lock or doing
* a read of the fast-timekeeper tkrs (which is protected by its own locking
* and update logic).
*/
@@ -195,97 +226,6 @@ static inline u64 tk_clock_read(const struct tk_read_base *tkr)
return clock->read(clock);
}
-#ifdef CONFIG_DEBUG_TIMEKEEPING
-#define WARNING_FREQ (HZ*300) /* 5 minute rate-limiting */
-
-static void timekeeping_check_update(struct timekeeper *tk, u64 offset)
-{
-
- u64 max_cycles = tk->tkr_mono.clock->max_cycles;
- const char *name = tk->tkr_mono.clock->name;
-
- if (offset > max_cycles) {
- printk_deferred("WARNING: timekeeping: Cycle offset (%lld) is larger than allowed by the '%s' clock's max_cycles value (%lld): time overflow danger\n",
- offset, name, max_cycles);
- printk_deferred(" timekeeping: Your kernel is sick, but tries to cope by capping time updates\n");
- } else {
- if (offset > (max_cycles >> 1)) {
- printk_deferred("INFO: timekeeping: Cycle offset (%lld) is larger than the '%s' clock's 50%% safety margin (%lld)\n",
- offset, name, max_cycles >> 1);
- printk_deferred(" timekeeping: Your kernel is still fine, but is feeling a bit nervous\n");
- }
- }
-
- if (tk->underflow_seen) {
- if (jiffies - tk->last_warning > WARNING_FREQ) {
- printk_deferred("WARNING: Underflow in clocksource '%s' observed, time update ignored.\n", name);
- printk_deferred(" Please report this, consider using a different clocksource, if possible.\n");
- printk_deferred(" Your kernel is probably still fine.\n");
- tk->last_warning = jiffies;
- }
- tk->underflow_seen = 0;
- }
-
- if (tk->overflow_seen) {
- if (jiffies - tk->last_warning > WARNING_FREQ) {
- printk_deferred("WARNING: Overflow in clocksource '%s' observed, time update capped.\n", name);
- printk_deferred(" Please report this, consider using a different clocksource, if possible.\n");
- printk_deferred(" Your kernel is probably still fine.\n");
- tk->last_warning = jiffies;
- }
- tk->overflow_seen = 0;
- }
-}
-
-static inline u64 timekeeping_cycles_to_ns(const struct tk_read_base *tkr, u64 cycles);
-
-static inline u64 timekeeping_debug_get_ns(const struct tk_read_base *tkr)
-{
- struct timekeeper *tk = &tk_core.timekeeper;
- u64 now, last, mask, max, delta;
- unsigned int seq;
-
- /*
- * Since we're called holding a seqcount, the data may shift
- * under us while we're doing the calculation. This can cause
- * false positives, since we'd note a problem but throw the
- * results away. So nest another seqcount here to atomically
- * grab the points we are checking with.
- */
- do {
- seq = read_seqcount_begin(&tk_core.seq);
- now = tk_clock_read(tkr);
- last = tkr->cycle_last;
- mask = tkr->mask;
- max = tkr->clock->max_cycles;
- } while (read_seqcount_retry(&tk_core.seq, seq));
-
- delta = clocksource_delta(now, last, mask);
-
- /*
- * Try to catch underflows by checking if we are seeing small
- * mask-relative negative values.
- */
- if (unlikely((~delta & mask) < (mask >> 3)))
- tk->underflow_seen = 1;
-
- /* Check for multiplication overflows */
- if (unlikely(delta > max))
- tk->overflow_seen = 1;
-
- /* timekeeping_cycles_to_ns() handles both under and overflow */
- return timekeeping_cycles_to_ns(tkr, now);
-}
-#else
-static inline void timekeeping_check_update(struct timekeeper *tk, u64 offset)
-{
-}
-static inline u64 timekeeping_debug_get_ns(const struct tk_read_base *tkr)
-{
- BUG();
-}
-#endif
-
/**
* tk_setup_internals - Set up internals to use clocksource clock.
*
@@ -390,19 +330,11 @@ static inline u64 timekeeping_cycles_to_ns(const struct tk_read_base *tkr, u64 c
return ((delta * tkr->mult) + tkr->xtime_nsec) >> tkr->shift;
}
-static __always_inline u64 __timekeeping_get_ns(const struct tk_read_base *tkr)
+static __always_inline u64 timekeeping_get_ns(const struct tk_read_base *tkr)
{
return timekeeping_cycles_to_ns(tkr, tk_clock_read(tkr));
}
-static inline u64 timekeeping_get_ns(const struct tk_read_base *tkr)
-{
- if (IS_ENABLED(CONFIG_DEBUG_TIMEKEEPING))
- return timekeeping_debug_get_ns(tkr);
-
- return __timekeeping_get_ns(tkr);
-}
-
/**
* update_fast_timekeeper - Update the fast and NMI safe monotonic timekeeper.
* @tkr: Timekeeping readout base from which we take the update
@@ -411,7 +343,7 @@ static inline u64 timekeeping_get_ns(const struct tk_read_base *tkr)
* We want to use this from any context including NMI and tracing /
* instrumenting the timekeeping code itself.
*
- * Employ the latch technique; see @raw_write_seqcount_latch.
+ * Employ the latch technique; see @write_seqcount_latch.
*
* So if a NMI hits the update of base[0] then it will use base[1]
* which is still consistent. In the worst case this can result is a
@@ -424,16 +356,18 @@ static void update_fast_timekeeper(const struct tk_read_base *tkr,
struct tk_read_base *base = tkf->base;
/* Force readers off to base[1] */
- raw_write_seqcount_latch(&tkf->seq);
+ write_seqcount_latch_begin(&tkf->seq);
/* Update base[0] */
memcpy(base, tkr, sizeof(*base));
/* Force readers back to base[0] */
- raw_write_seqcount_latch(&tkf->seq);
+ write_seqcount_latch(&tkf->seq);
/* Update base[1] */
memcpy(base + 1, base, sizeof(*base));
+
+ write_seqcount_latch_end(&tkf->seq);
}
static __always_inline u64 __ktime_get_fast_ns(struct tk_fast *tkf)
@@ -443,11 +377,11 @@ static __always_inline u64 __ktime_get_fast_ns(struct tk_fast *tkf)
u64 now;
do {
- seq = raw_read_seqcount_latch(&tkf->seq);
+ seq = read_seqcount_latch(&tkf->seq);
tkr = tkf->base + (seq & 0x01);
now = ktime_to_ns(tkr->base);
- now += __timekeeping_get_ns(tkr);
- } while (raw_read_seqcount_latch_retry(&tkf->seq, seq));
+ now += timekeeping_get_ns(tkr);
+ } while (read_seqcount_latch_retry(&tkf->seq, seq));
return now;
}
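
For reference, the latch scheme used by update_fast_timekeeper() and this reader boils down to the following generic sketch (hypothetical names, memory barriers omitted; this is an illustration, not part of the patch):

	/* Writer: seq starts even. */
	seq++;			/* odd: readers are steered to copy[1] */
	copy[0] = new_data;	/* nobody reads copy[0] while seq is odd */
	seq++;			/* even: readers are steered back to copy[0] */
	copy[1] = new_data;	/* nobody reads copy[1] while seq is even */

	/* Reader (lockless, NMI safe): */
	do {
		s = READ_ONCE(seq);
		data = copy[s & 1];	/* always the copy not being written */
	} while (READ_ONCE(seq) != s);
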
@@ -517,7 +451,7 @@ EXPORT_SYMBOL_GPL(ktime_get_raw_fast_ns);
* timekeeping_inject_sleeptime64()
* __timekeeping_inject_sleeptime(tk, delta);
* timestamp();
- * timekeeping_update(tk, TK_CLEAR_NTP...);
+ * timekeeping_update_staged(tkd, TK_CLEAR_NTP...);
*
* (2) On 32-bit systems, the 64-bit boot offset (tk->offs_boot) may be
* partially updated. Since the tk->offs_boot update is a rare event, this
@@ -562,7 +496,7 @@ static __always_inline u64 __ktime_get_real_fast(struct tk_fast *tkf, u64 *mono)
tkr = tkf->base + (seq & 0x01);
basem = ktime_to_ns(tkr->base);
baser = ktime_to_ns(tkr->base_real);
- delta = __timekeeping_get_ns(tkr);
+ delta = timekeeping_get_ns(tkr);
} while (raw_read_seqcount_latch_retry(&tkf->seq, seq));
if (mono)
@@ -676,13 +610,11 @@ static void update_pvclock_gtod(struct timekeeper *tk, bool was_set)
int pvclock_gtod_register_notifier(struct notifier_block *nb)
{
struct timekeeper *tk = &tk_core.timekeeper;
- unsigned long flags;
int ret;
- raw_spin_lock_irqsave(&timekeeper_lock, flags);
+ guard(raw_spinlock_irqsave)(&tk_core.lock);
ret = raw_notifier_chain_register(&pvclock_gtod_chain, nb);
update_pvclock_gtod(tk, true);
- raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
return ret;
}
@@ -695,14 +627,8 @@ EXPORT_SYMBOL_GPL(pvclock_gtod_register_notifier);
*/
int pvclock_gtod_unregister_notifier(struct notifier_block *nb)
{
- unsigned long flags;
- int ret;
-
- raw_spin_lock_irqsave(&timekeeper_lock, flags);
- ret = raw_notifier_chain_unregister(&pvclock_gtod_chain, nb);
- raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
-
- return ret;
+ guard(raw_spinlock_irqsave)(&tk_core.lock);
+ return raw_notifier_chain_unregister(&pvclock_gtod_chain, nb);
}
EXPORT_SYMBOL_GPL(pvclock_gtod_unregister_notifier);
@@ -718,6 +644,18 @@ static inline void tk_update_leap_state(struct timekeeper *tk)
}
/*
+ * Leap state update for both shadow and the real timekeeper
+ * Separate to spare a full memcpy() of the timekeeper.
+ */
+static void tk_update_leap_state_all(struct tk_data *tkd)
+{
+ write_seqcount_begin(&tkd->seq);
+ tk_update_leap_state(&tkd->shadow_timekeeper);
+ tkd->timekeeper.next_leap_ktime = tkd->shadow_timekeeper.next_leap_ktime;
+ write_seqcount_end(&tkd->seq);
+}
+
+/*
* Update the ktime_t based scalar nsec members of the timekeeper
*/
static inline void tk_update_ktime_data(struct timekeeper *tk)
@@ -750,9 +688,30 @@ static inline void tk_update_ktime_data(struct timekeeper *tk)
tk->tkr_raw.base = ns_to_ktime(tk->raw_sec * NSEC_PER_SEC);
}
-/* must hold timekeeper_lock */
-static void timekeeping_update(struct timekeeper *tk, unsigned int action)
+/*
+ * Restore the shadow timekeeper from the real timekeeper.
+ */
+static void timekeeping_restore_shadow(struct tk_data *tkd)
{
+ lockdep_assert_held(&tkd->lock);
+ memcpy(&tkd->shadow_timekeeper, &tkd->timekeeper, sizeof(tkd->timekeeper));
+}
+
+static void timekeeping_update_from_shadow(struct tk_data *tkd, unsigned int action)
+{
+ struct timekeeper *tk = &tk_core.shadow_timekeeper;
+
+ lockdep_assert_held(&tkd->lock);
+
+ /*
+ * Block out readers before running the updates below because that
+ * updates VDSO and other time related infrastructure. Not blocking
+ * the readers might let a reader see time going backwards when
+ * reading from the VDSO after the VDSO update and then reading in
+ * the kernel from the timekeeper before that got updated.
+ */
+ write_seqcount_begin(&tkd->seq);
+
if (action & TK_CLEAR_NTP) {
tk->ntp_error = 0;
ntp_clear();
@@ -770,14 +729,17 @@ static void timekeeping_update(struct timekeeper *tk, unsigned int action)
if (action & TK_CLOCK_WAS_SET)
tk->clock_was_set_seq++;
+
/*
- * The mirroring of the data to the shadow-timekeeper needs
- * to happen last here to ensure we don't over-write the
- * timekeeper structure on the next update with stale data
+ * Update the real timekeeper.
+ *
+ * We could avoid this memcpy() by switching pointers, but that has
+	 * the downside that the reader side no longer benefits from
+ * the cacheline optimized data layout of the timekeeper and requires
+ * another indirection.
*/
- if (action & TK_MIRROR)
- memcpy(&shadow_timekeeper, &tk_core.timekeeper,
- sizeof(tk_core.timekeeper));
+ memcpy(&tkd->timekeeper, tk, sizeof(*tk));
+ write_seqcount_end(&tkd->seq);
}
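
The call sites converted later in this patch (do_settimeofday64(), timekeeping_inject_offset(), change_clocksource(), ...) follow roughly the same shape against the shadow timekeeper. A condensed, hypothetical sketch of that pattern; the validation check stands in for whatever the particular caller has to verify:

	static int timekeeping_modify_example(void)	/* hypothetical caller */
	{
		scoped_guard (raw_spinlock_irqsave, &tk_core.lock) {
			struct timekeeper *tks = &tk_core.shadow_timekeeper;

			timekeeping_forward_now(tks);	/* bring the shadow up to now */
			/* ... modify the shadow timekeeper ... */
			if (validation_failed) {	/* caller specific check */
				timekeeping_restore_shadow(&tk_core);
				return -EINVAL;
			}
			timekeeping_update_from_shadow(&tk_core, TK_UPDATE_ALL);
		}
		return 0;
	}
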
/**
@@ -930,6 +892,14 @@ ktime_t ktime_mono_to_any(ktime_t tmono, enum tk_offsets offs)
unsigned int seq;
ktime_t tconv;
+ if (IS_ENABLED(CONFIG_64BIT)) {
+ /*
+ * Paired with WRITE_ONCE()s in tk_set_wall_to_mono() and
+ * tk_update_sleep_time().
+ */
+ return ktime_add(tmono, READ_ONCE(*offset));
+ }
+
do {
seq = read_seqcount_begin(&tk_core.seq);
tconv = ktime_add(tmono, *offset);
@@ -1060,6 +1030,7 @@ void ktime_get_snapshot(struct system_time_snapshot *systime_snapshot)
unsigned int seq;
ktime_t base_raw;
ktime_t base_real;
+ ktime_t base_boot;
u64 nsec_raw;
u64 nsec_real;
u64 now;
@@ -1074,6 +1045,8 @@ void ktime_get_snapshot(struct system_time_snapshot *systime_snapshot)
systime_snapshot->clock_was_set_seq = tk->clock_was_set_seq;
base_real = ktime_add(tk->tkr_mono.base,
tk_core.timekeeper.offs_real);
+ base_boot = ktime_add(tk->tkr_mono.base,
+ tk_core.timekeeper.offs_boot);
base_raw = tk->tkr_raw.base;
nsec_real = timekeeping_cycles_to_ns(&tk->tkr_mono, now);
nsec_raw = timekeeping_cycles_to_ns(&tk->tkr_raw, now);
@@ -1081,6 +1054,7 @@ void ktime_get_snapshot(struct system_time_snapshot *systime_snapshot)
systime_snapshot->cycles = now;
systime_snapshot->real = ktime_add_ns(base_real, nsec_real);
+ systime_snapshot->boot = ktime_add_ns(base_boot, nsec_real);
systime_snapshot->raw = ktime_add_ns(base_raw, nsec_raw);
}
EXPORT_SYMBOL_GPL(ktime_get_snapshot);
@@ -1440,45 +1414,35 @@ EXPORT_SYMBOL_GPL(timekeeping_clocksource_has_base);
*/
int do_settimeofday64(const struct timespec64 *ts)
{
- struct timekeeper *tk = &tk_core.timekeeper;
struct timespec64 ts_delta, xt;
- unsigned long flags;
- int ret = 0;
if (!timespec64_valid_settod(ts))
return -EINVAL;
- raw_spin_lock_irqsave(&timekeeper_lock, flags);
- write_seqcount_begin(&tk_core.seq);
+ scoped_guard (raw_spinlock_irqsave, &tk_core.lock) {
+ struct timekeeper *tks = &tk_core.shadow_timekeeper;
- timekeeping_forward_now(tk);
+ timekeeping_forward_now(tks);
- xt = tk_xtime(tk);
- ts_delta = timespec64_sub(*ts, xt);
+ xt = tk_xtime(tks);
+ ts_delta = timespec64_sub(*ts, xt);
- if (timespec64_compare(&tk->wall_to_monotonic, &ts_delta) > 0) {
- ret = -EINVAL;
- goto out;
- }
-
- tk_set_wall_to_mono(tk, timespec64_sub(tk->wall_to_monotonic, ts_delta));
-
- tk_set_xtime(tk, ts);
-out:
- timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET);
+ if (timespec64_compare(&tks->wall_to_monotonic, &ts_delta) > 0) {
+ timekeeping_restore_shadow(&tk_core);
+ return -EINVAL;
+ }
- write_seqcount_end(&tk_core.seq);
- raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
+ tk_set_wall_to_mono(tks, timespec64_sub(tks->wall_to_monotonic, ts_delta));
+ tk_set_xtime(tks, ts);
+ timekeeping_update_from_shadow(&tk_core, TK_UPDATE_ALL);
+ }
/* Signal hrtimers about time change */
clock_was_set(CLOCK_SET_WALL);
- if (!ret) {
- audit_tk_injoffset(ts_delta);
- add_device_randomness(ts, sizeof(*ts));
- }
-
- return ret;
+ audit_tk_injoffset(ts_delta);
+ add_device_randomness(ts, sizeof(*ts));
+ return 0;
}
EXPORT_SYMBOL(do_settimeofday64);
@@ -1490,40 +1454,31 @@ EXPORT_SYMBOL(do_settimeofday64);
*/
static int timekeeping_inject_offset(const struct timespec64 *ts)
{
- struct timekeeper *tk = &tk_core.timekeeper;
- unsigned long flags;
- struct timespec64 tmp;
- int ret = 0;
-
if (ts->tv_nsec < 0 || ts->tv_nsec >= NSEC_PER_SEC)
return -EINVAL;
- raw_spin_lock_irqsave(&timekeeper_lock, flags);
- write_seqcount_begin(&tk_core.seq);
-
- timekeeping_forward_now(tk);
-
- /* Make sure the proposed value is valid */
- tmp = timespec64_add(tk_xtime(tk), *ts);
- if (timespec64_compare(&tk->wall_to_monotonic, ts) > 0 ||
- !timespec64_valid_settod(&tmp)) {
- ret = -EINVAL;
- goto error;
- }
+ scoped_guard (raw_spinlock_irqsave, &tk_core.lock) {
+ struct timekeeper *tks = &tk_core.shadow_timekeeper;
+ struct timespec64 tmp;
- tk_xtime_add(tk, ts);
- tk_set_wall_to_mono(tk, timespec64_sub(tk->wall_to_monotonic, *ts));
+ timekeeping_forward_now(tks);
-error: /* even if we error out, we forwarded the time, so call update */
- timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET);
+ /* Make sure the proposed value is valid */
+ tmp = timespec64_add(tk_xtime(tks), *ts);
+ if (timespec64_compare(&tks->wall_to_monotonic, ts) > 0 ||
+ !timespec64_valid_settod(&tmp)) {
+ timekeeping_restore_shadow(&tk_core);
+ return -EINVAL;
+ }
- write_seqcount_end(&tk_core.seq);
- raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
+ tk_xtime_add(tks, ts);
+ tk_set_wall_to_mono(tks, timespec64_sub(tks->wall_to_monotonic, *ts));
+ timekeeping_update_from_shadow(&tk_core, TK_UPDATE_ALL);
+ }
/* Signal hrtimers about time change */
clock_was_set(CLOCK_SET_WALL);
-
- return ret;
+ return 0;
}
/*
@@ -1576,43 +1531,34 @@ static void __timekeeping_set_tai_offset(struct timekeeper *tk, s32 tai_offset)
*/
static int change_clocksource(void *data)
{
- struct timekeeper *tk = &tk_core.timekeeper;
- struct clocksource *new, *old = NULL;
- unsigned long flags;
- bool change = false;
-
- new = (struct clocksource *) data;
+ struct clocksource *new = data, *old = NULL;
/*
- * If the cs is in module, get a module reference. Succeeds
- * for built-in code (owner == NULL) as well.
+ * If the clocksource is in a module, get a module reference.
+ * Succeeds for built-in code (owner == NULL) as well. Abort if the
+ * reference can't be acquired.
*/
- if (try_module_get(new->owner)) {
- if (!new->enable || new->enable(new) == 0)
- change = true;
- else
- module_put(new->owner);
- }
-
- raw_spin_lock_irqsave(&timekeeper_lock, flags);
- write_seqcount_begin(&tk_core.seq);
-
- timekeeping_forward_now(tk);
+ if (!try_module_get(new->owner))
+ return 0;
- if (change) {
- old = tk->tkr_mono.clock;
- tk_setup_internals(tk, new);
+ /* Abort if the device can't be enabled */
+ if (new->enable && new->enable(new) != 0) {
+ module_put(new->owner);
+ return 0;
}
- timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET);
+ scoped_guard (raw_spinlock_irqsave, &tk_core.lock) {
+ struct timekeeper *tks = &tk_core.shadow_timekeeper;
- write_seqcount_end(&tk_core.seq);
- raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
+ timekeeping_forward_now(tks);
+ old = tks->tkr_mono.clock;
+ tk_setup_internals(tks, new);
+ timekeeping_update_from_shadow(&tk_core, TK_UPDATE_ALL);
+ }
if (old) {
if (old->disable)
old->disable(old);
-
module_put(old->owner);
}
@@ -1737,6 +1683,12 @@ read_persistent_wall_and_boot_offset(struct timespec64 *wall_time,
*boot_offset = ns_to_timespec64(local_clock());
}
+static __init void tkd_basic_setup(struct tk_data *tkd)
+{
+ raw_spin_lock_init(&tkd->lock);
+ seqcount_raw_spinlock_init(&tkd->seq, &tkd->lock);
+}
+
/*
* Flag reflecting whether timekeeping_resume() has injected sleeptime.
*
@@ -1761,9 +1713,10 @@ static bool persistent_clock_exists;
void __init timekeeping_init(void)
{
struct timespec64 wall_time, boot_offset, wall_to_mono;
- struct timekeeper *tk = &tk_core.timekeeper;
+ struct timekeeper *tks = &tk_core.shadow_timekeeper;
struct clocksource *clock;
- unsigned long flags;
+
+ tkd_basic_setup(&tk_core);
read_persistent_wall_and_boot_offset(&wall_time, &boot_offset);
if (timespec64_valid_settod(&wall_time) &&
@@ -1783,24 +1736,21 @@ void __init timekeeping_init(void)
*/
wall_to_mono = timespec64_sub(boot_offset, wall_time);
- raw_spin_lock_irqsave(&timekeeper_lock, flags);
- write_seqcount_begin(&tk_core.seq);
+ guard(raw_spinlock_irqsave)(&tk_core.lock);
+
ntp_init();
clock = clocksource_default_clock();
if (clock->enable)
clock->enable(clock);
- tk_setup_internals(tk, clock);
+ tk_setup_internals(tks, clock);
- tk_set_xtime(tk, &wall_time);
- tk->raw_sec = 0;
+ tk_set_xtime(tks, &wall_time);
+ tks->raw_sec = 0;
- tk_set_wall_to_mono(tk, wall_to_mono);
+ tk_set_wall_to_mono(tks, wall_to_mono);
- timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET);
-
- write_seqcount_end(&tk_core.seq);
- raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
+ timekeeping_update_from_shadow(&tk_core, TK_CLOCK_WAS_SET);
}
/* time in seconds when suspend began for persistent clock */
@@ -1878,22 +1828,14 @@ bool timekeeping_rtc_skipsuspend(void)
*/
void timekeeping_inject_sleeptime64(const struct timespec64 *delta)
{
- struct timekeeper *tk = &tk_core.timekeeper;
- unsigned long flags;
-
- raw_spin_lock_irqsave(&timekeeper_lock, flags);
- write_seqcount_begin(&tk_core.seq);
-
- suspend_timing_needed = false;
+ scoped_guard(raw_spinlock_irqsave, &tk_core.lock) {
+ struct timekeeper *tks = &tk_core.shadow_timekeeper;
- timekeeping_forward_now(tk);
-
- __timekeeping_inject_sleeptime(tk, delta);
-
- timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET);
-
- write_seqcount_end(&tk_core.seq);
- raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
+ suspend_timing_needed = false;
+ timekeeping_forward_now(tks);
+ __timekeeping_inject_sleeptime(tks, delta);
+ timekeeping_update_from_shadow(&tk_core, TK_UPDATE_ALL);
+ }
/* Signal hrtimers about time change */
clock_was_set(CLOCK_SET_WALL | CLOCK_SET_BOOT);
@@ -1905,20 +1847,19 @@ void timekeeping_inject_sleeptime64(const struct timespec64 *delta)
*/
void timekeeping_resume(void)
{
- struct timekeeper *tk = &tk_core.timekeeper;
- struct clocksource *clock = tk->tkr_mono.clock;
- unsigned long flags;
+ struct timekeeper *tks = &tk_core.shadow_timekeeper;
+ struct clocksource *clock = tks->tkr_mono.clock;
struct timespec64 ts_new, ts_delta;
- u64 cycle_now, nsec;
bool inject_sleeptime = false;
+ u64 cycle_now, nsec;
+ unsigned long flags;
read_persistent_clock64(&ts_new);
clockevents_resume();
clocksource_resume();
- raw_spin_lock_irqsave(&timekeeper_lock, flags);
- write_seqcount_begin(&tk_core.seq);
+ raw_spin_lock_irqsave(&tk_core.lock, flags);
/*
* After system resumes, we need to calculate the suspended time and
@@ -1932,7 +1873,7 @@ void timekeeping_resume(void)
* The less preferred source will only be tried if there is no better
* usable source. The rtc part is handled separately in rtc core code.
*/
- cycle_now = tk_clock_read(&tk->tkr_mono);
+ cycle_now = tk_clock_read(&tks->tkr_mono);
nsec = clocksource_stop_suspend_timing(clock, cycle_now);
if (nsec > 0) {
ts_delta = ns_to_timespec64(nsec);
@@ -1944,18 +1885,17 @@ void timekeeping_resume(void)
if (inject_sleeptime) {
suspend_timing_needed = false;
- __timekeeping_inject_sleeptime(tk, &ts_delta);
+ __timekeeping_inject_sleeptime(tks, &ts_delta);
}
/* Re-base the last cycle value */
- tk->tkr_mono.cycle_last = cycle_now;
- tk->tkr_raw.cycle_last = cycle_now;
+ tks->tkr_mono.cycle_last = cycle_now;
+ tks->tkr_raw.cycle_last = cycle_now;
- tk->ntp_error = 0;
+ tks->ntp_error = 0;
timekeeping_suspended = 0;
- timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET);
- write_seqcount_end(&tk_core.seq);
- raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
+ timekeeping_update_from_shadow(&tk_core, TK_CLOCK_WAS_SET);
+ raw_spin_unlock_irqrestore(&tk_core.lock, flags);
touch_softlockup_watchdog();
@@ -1967,11 +1907,11 @@ void timekeeping_resume(void)
int timekeeping_suspend(void)
{
- struct timekeeper *tk = &tk_core.timekeeper;
- unsigned long flags;
- struct timespec64 delta, delta_delta;
- static struct timespec64 old_delta;
+ struct timekeeper *tks = &tk_core.shadow_timekeeper;
+ struct timespec64 delta, delta_delta;
+ static struct timespec64 old_delta;
struct clocksource *curr_clock;
+ unsigned long flags;
u64 cycle_now;
read_persistent_clock64(&timekeeping_suspend_time);
@@ -1986,9 +1926,8 @@ int timekeeping_suspend(void)
suspend_timing_needed = true;
- raw_spin_lock_irqsave(&timekeeper_lock, flags);
- write_seqcount_begin(&tk_core.seq);
- timekeeping_forward_now(tk);
+ raw_spin_lock_irqsave(&tk_core.lock, flags);
+ timekeeping_forward_now(tks);
timekeeping_suspended = 1;
/*
@@ -1996,8 +1935,8 @@ int timekeeping_suspend(void)
* just read from the current clocksource. Save this to potentially
* use in suspend timing.
*/
- curr_clock = tk->tkr_mono.clock;
- cycle_now = tk->tkr_mono.cycle_last;
+ curr_clock = tks->tkr_mono.clock;
+ cycle_now = tks->tkr_mono.cycle_last;
clocksource_start_suspend_timing(curr_clock, cycle_now);
if (persistent_clock_exists) {
@@ -2007,7 +1946,7 @@ int timekeeping_suspend(void)
* try to compensate so the difference in system time
* and persistent_clock time stays close to constant.
*/
- delta = timespec64_sub(tk_xtime(tk), timekeeping_suspend_time);
+ delta = timespec64_sub(tk_xtime(tks), timekeeping_suspend_time);
delta_delta = timespec64_sub(delta, old_delta);
if (abs(delta_delta.tv_sec) >= 2) {
/*
@@ -2022,10 +1961,9 @@ int timekeeping_suspend(void)
}
}
- timekeeping_update(tk, TK_MIRROR);
- halt_fast_timekeeper(tk);
- write_seqcount_end(&tk_core.seq);
- raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
+ timekeeping_update_from_shadow(&tk_core, 0);
+ halt_fast_timekeeper(tks);
+ raw_spin_unlock_irqrestore(&tk_core.lock, flags);
tick_suspend();
clocksource_suspend();
@@ -2130,16 +2068,17 @@ static __always_inline void timekeeping_apply_adjustment(struct timekeeper *tk,
*/
static void timekeeping_adjust(struct timekeeper *tk, s64 offset)
{
+ u64 ntp_tl = ntp_tick_length();
u32 mult;
/*
* Determine the multiplier from the current NTP tick length.
* Avoid expensive division when the tick length doesn't change.
*/
- if (likely(tk->ntp_tick == ntp_tick_length())) {
+ if (likely(tk->ntp_tick == ntp_tl)) {
mult = tk->tkr_mono.mult - tk->ntp_err_mult;
} else {
- tk->ntp_tick = ntp_tick_length();
+ tk->ntp_tick = ntp_tl;
mult = div64_u64((tk->ntp_tick >> tk->ntp_error_shift) -
tk->xtime_remainder, tk->cycle_interval);
}
@@ -2278,28 +2217,24 @@ static u64 logarithmic_accumulation(struct timekeeper *tk, u64 offset,
*/
static bool timekeeping_advance(enum timekeeping_adv_mode mode)
{
+ struct timekeeper *tk = &tk_core.shadow_timekeeper;
struct timekeeper *real_tk = &tk_core.timekeeper;
- struct timekeeper *tk = &shadow_timekeeper;
- u64 offset;
- int shift = 0, maxshift;
unsigned int clock_set = 0;
- unsigned long flags;
+ int shift = 0, maxshift;
+ u64 offset;
- raw_spin_lock_irqsave(&timekeeper_lock, flags);
+ guard(raw_spinlock_irqsave)(&tk_core.lock);
/* Make sure we're fully resumed: */
if (unlikely(timekeeping_suspended))
- goto out;
+ return false;
offset = clocksource_delta(tk_clock_read(&tk->tkr_mono),
tk->tkr_mono.cycle_last, tk->tkr_mono.mask);
/* Check if there's really nothing to do */
if (offset < real_tk->cycle_interval && mode == TK_ADV_TICK)
- goto out;
-
- /* Do some additional sanity checking */
- timekeeping_check_update(tk, offset);
+ return false;
/*
* With NO_HZ we may have to accumulate many cycle_intervals
@@ -2315,8 +2250,7 @@ static bool timekeeping_advance(enum timekeeping_adv_mode mode)
maxshift = (64 - (ilog2(ntp_tick_length())+1)) - 1;
shift = min(shift, maxshift);
while (offset >= tk->cycle_interval) {
- offset = logarithmic_accumulation(tk, offset, shift,
- &clock_set);
+ offset = logarithmic_accumulation(tk, offset, shift, &clock_set);
if (offset < tk->cycle_interval<<shift)
shift--;
}
@@ -2330,23 +2264,7 @@ static bool timekeeping_advance(enum timekeeping_adv_mode mode)
*/
clock_set |= accumulate_nsecs_to_secs(tk);
- write_seqcount_begin(&tk_core.seq);
- /*
- * Update the real timekeeper.
- *
- * We could avoid this memcpy by switching pointers, but that
- * requires changes to all other timekeeper usage sites as
- * well, i.e. move the timekeeper pointer getter into the
- * spinlocked/seqcount protected sections. And we trade this
- * memcpy under the tk_core.seq against one before we start
- * updating.
- */
- timekeeping_update(tk, clock_set);
- memcpy(real_tk, tk, sizeof(*tk));
- /* The memcpy must come last. Do not put anything here! */
- write_seqcount_end(&tk_core.seq);
-out:
- raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
+ timekeeping_update_from_shadow(&tk_core, clock_set);
return !!clock_set;
}
@@ -2394,6 +2312,94 @@ void ktime_get_coarse_real_ts64(struct timespec64 *ts)
}
EXPORT_SYMBOL(ktime_get_coarse_real_ts64);
+/**
+ * ktime_get_coarse_real_ts64_mg - return the later of coarse grained time or floor
+ * @ts: timespec64 to be filled
+ *
+ * Fetch the global mg_floor value, convert it to realtime and compare it
+ * to the current coarse-grained time. Fill @ts with whichever is
+ * latest. Note that this is a filesystem-specific interface and should be
+ * avoided outside of that context.
+ */
+void ktime_get_coarse_real_ts64_mg(struct timespec64 *ts)
+{
+ struct timekeeper *tk = &tk_core.timekeeper;
+ u64 floor = atomic64_read(&mg_floor);
+ ktime_t f_real, offset, coarse;
+ unsigned int seq;
+
+ do {
+ seq = read_seqcount_begin(&tk_core.seq);
+ *ts = tk_xtime(tk);
+ offset = tk_core.timekeeper.offs_real;
+ } while (read_seqcount_retry(&tk_core.seq, seq));
+
+ coarse = timespec64_to_ktime(*ts);
+ f_real = ktime_add(floor, offset);
+ if (ktime_after(f_real, coarse))
+ *ts = ktime_to_timespec64(f_real);
+}
+
+/**
+ * ktime_get_real_ts64_mg - attempt to update floor value and return result
+ * @ts: pointer to the timespec to be set
+ *
+ * Get a monotonic fine-grained time value and attempt to swap it into
+ * mg_floor. If that succeeds then accept the new floor value. If it fails
+ * then another task raced in during the interim time and updated the
+ * floor. Since any update to the floor must be later than the previous
+ * floor, either outcome is acceptable.
+ *
+ * Typically this will be called after calling ktime_get_coarse_real_ts64_mg(),
+ * and determining that the resulting coarse-grained timestamp did not effect
+ * a change in ctime. Any more recent floor value would effect a change to
+ * ctime, so there is no need to retry the atomic64_try_cmpxchg() on failure.
+ *
+ * @ts will be filled with the latest floor value, regardless of the outcome of
+ * the cmpxchg. Note that this is a filesystem specific interface and should be
+ * avoided outside of that context.
+ */
+void ktime_get_real_ts64_mg(struct timespec64 *ts)
+{
+ struct timekeeper *tk = &tk_core.timekeeper;
+ ktime_t old = atomic64_read(&mg_floor);
+ ktime_t offset, mono;
+ unsigned int seq;
+ u64 nsecs;
+
+ do {
+ seq = read_seqcount_begin(&tk_core.seq);
+
+ ts->tv_sec = tk->xtime_sec;
+ mono = tk->tkr_mono.base;
+ nsecs = timekeeping_get_ns(&tk->tkr_mono);
+ offset = tk_core.timekeeper.offs_real;
+ } while (read_seqcount_retry(&tk_core.seq, seq));
+
+ mono = ktime_add_ns(mono, nsecs);
+
+ /*
+ * Attempt to update the floor with the new time value. As any
+	 * update must be later than the existing floor, and would effect
+ * a change to ctime from the perspective of the current task,
+ * accept the resulting floor value regardless of the outcome of
+ * the swap.
+ */
+ if (atomic64_try_cmpxchg(&mg_floor, &old, mono)) {
+ ts->tv_nsec = 0;
+ timespec64_add_ns(ts, nsecs);
+ timekeeping_inc_mg_floor_swaps();
+ } else {
+ /*
+ * Another task changed mg_floor since "old" was fetched.
+ * "old" has been updated with the latest value of "mg_floor".
+ * That value is newer than the previous floor value, which
+ * is enough to effect a change to ctime. Accept it.
+ */
+ *ts = ktime_to_timespec64(ktime_add(old, offset));
+ }
+}
+
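
These two helpers form the multigrain timestamp interface referred to by the mg_floor comment further up. A hedged sketch of how a filesystem-side caller might combine them for ctime updates; the helper name and the exact policy are hypothetical, modelled loosely on the VFS multigrain ctime logic rather than taken from this patch:

	static struct timespec64 mg_current_time(const struct timespec64 *ctime)
	{
		struct timespec64 now;

		/* Cheap path: coarse time, clamped to the fine-grained floor. */
		ktime_get_coarse_real_ts64_mg(&now);
		if (!timespec64_equal(&now, ctime))
			return now;	/* coarse value already changes ctime */

		/*
		 * The coarse value would not change ctime: hand out a
		 * fine-grained timestamp instead and advance the floor.
		 */
		ktime_get_real_ts64_mg(&now);
		return now;
	}
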
void ktime_get_coarse_ts64(struct timespec64 *ts)
{
struct timekeeper *tk = &tk_core.timekeeper;
@@ -2551,13 +2557,10 @@ EXPORT_SYMBOL_GPL(random_get_entropy_fallback);
*/
int do_adjtimex(struct __kernel_timex *txc)
{
- struct timekeeper *tk = &tk_core.timekeeper;
struct audit_ntp_data ad;
bool offset_set = false;
bool clock_set = false;
struct timespec64 ts;
- unsigned long flags;
- s32 orig_tai, tai;
int ret;
/* Validate the data before disabling interrupts */
@@ -2568,6 +2571,7 @@ int do_adjtimex(struct __kernel_timex *txc)
if (txc->modes & ADJ_SETOFFSET) {
struct timespec64 delta;
+
delta.tv_sec = txc->time.tv_sec;
delta.tv_nsec = txc->time.tv_usec;
if (!(txc->modes & ADJ_NANO))
@@ -2585,21 +2589,21 @@ int do_adjtimex(struct __kernel_timex *txc)
ktime_get_real_ts64(&ts);
add_device_randomness(&ts, sizeof(ts));
- raw_spin_lock_irqsave(&timekeeper_lock, flags);
- write_seqcount_begin(&tk_core.seq);
+ scoped_guard (raw_spinlock_irqsave, &tk_core.lock) {
+ struct timekeeper *tks = &tk_core.shadow_timekeeper;
+ s32 orig_tai, tai;
- orig_tai = tai = tk->tai_offset;
- ret = __do_adjtimex(txc, &ts, &tai, &ad);
+ orig_tai = tai = tks->tai_offset;
+ ret = __do_adjtimex(txc, &ts, &tai, &ad);
- if (tai != orig_tai) {
- __timekeeping_set_tai_offset(tk, tai);
- timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET);
- clock_set = true;
+ if (tai != orig_tai) {
+ __timekeeping_set_tai_offset(tks, tai);
+ timekeeping_update_from_shadow(&tk_core, TK_CLOCK_WAS_SET);
+ clock_set = true;
+ } else {
+ tk_update_leap_state_all(&tk_core);
+ }
}
- tk_update_leap_state(tk);
-
- write_seqcount_end(&tk_core.seq);
- raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
audit_ntp_log(&ad);
@@ -2623,15 +2627,8 @@ int do_adjtimex(struct __kernel_timex *txc)
*/
void hardpps(const struct timespec64 *phase_ts, const struct timespec64 *raw_ts)
{
- unsigned long flags;
-
- raw_spin_lock_irqsave(&timekeeper_lock, flags);
- write_seqcount_begin(&tk_core.seq);
-
+ guard(raw_spinlock_irqsave)(&tk_core.lock);
__hardpps(phase_ts, raw_ts);
-
- write_seqcount_end(&tk_core.seq);
- raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
}
EXPORT_SYMBOL(hardpps);
#endif /* CONFIG_NTP_PPS */