diff options
Diffstat (limited to 'kernel/time')
-rw-r--r-- | kernel/time/Makefile | 1 | ||||
-rw-r--r-- | kernel/time/alarmtimer.c | 141 | ||||
-rw-r--r-- | kernel/time/clocksource.c | 11 | ||||
-rw-r--r-- | kernel/time/hrtimer.c | 270 | ||||
-rw-r--r-- | kernel/time/itimer.c | 207 | ||||
-rw-r--r-- | kernel/time/namespace.c | 468 | ||||
-rw-r--r-- | kernel/time/ntp.c | 2 | ||||
-rw-r--r-- | kernel/time/posix-clock.c | 39 | ||||
-rw-r--r-- | kernel/time/posix-cpu-timers.c | 1042 | ||||
-rw-r--r-- | kernel/time/posix-stubs.c | 18 | ||||
-rw-r--r-- | kernel/time/posix-timers.c | 149 | ||||
-rw-r--r-- | kernel/time/posix-timers.h | 8 | ||||
-rw-r--r-- | kernel/time/sched_clock.c | 9 | ||||
-rw-r--r-- | kernel/time/tick-broadcast-hrtimer.c | 59 | ||||
-rw-r--r-- | kernel/time/tick-common.c | 2 | ||||
-rw-r--r-- | kernel/time/tick-sched.c | 44 | ||||
-rw-r--r-- | kernel/time/time.c | 116 | ||||
-rw-r--r-- | kernel/time/timekeeping.c | 5 | ||||
-rw-r--r-- | kernel/time/timer.c | 113 | ||||
-rw-r--r-- | kernel/time/vsyscall.c | 64 |
20 files changed, 1839 insertions, 929 deletions
diff --git a/kernel/time/Makefile b/kernel/time/Makefile index 1867044800bb..c8f00168afe8 100644 --- a/kernel/time/Makefile +++ b/kernel/time/Makefile @@ -19,3 +19,4 @@ obj-$(CONFIG_TICK_ONESHOT) += tick-oneshot.o tick-sched.o obj-$(CONFIG_HAVE_GENERIC_VDSO) += vsyscall.o obj-$(CONFIG_DEBUG_FS) += timekeeping_debug.o obj-$(CONFIG_TEST_UDELAY) += test_udelay.o +obj-$(CONFIG_TIME_NS) += namespace.o diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 57518efc3810..2ffb466af77e 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -26,6 +26,7 @@ #include <linux/freezer.h> #include <linux/compat.h> #include <linux/module.h> +#include <linux/time_namespace.h> #include "posix-timers.h" @@ -36,13 +37,15 @@ * struct alarm_base - Alarm timer bases * @lock: Lock for syncrhonized access to the base * @timerqueue: Timerqueue head managing the list of events - * @gettime: Function to read the time correlating to the base + * @get_ktime: Function to read the time correlating to the base + * @get_timespec: Function to read the namespace time correlating to the base * @base_clockid: clockid for the base */ static struct alarm_base { spinlock_t lock; struct timerqueue_head timerqueue; - ktime_t (*gettime)(void); + ktime_t (*get_ktime)(void); + void (*get_timespec)(struct timespec64 *tp); clockid_t base_clockid; } alarm_bases[ALARM_NUMTYPE]; @@ -55,8 +58,6 @@ static DEFINE_SPINLOCK(freezer_delta_lock); #endif #ifdef CONFIG_RTC_CLASS -static struct wakeup_source *ws; - /* rtc timer and device for setting alarm wakeups at suspend */ static struct rtc_timer rtctimer; static struct rtc_device *rtcdev; @@ -66,8 +67,6 @@ static DEFINE_SPINLOCK(rtcdev_lock); * alarmtimer_get_rtcdev - Return selected rtcdevice * * This function returns the rtc device to use for wakealarms. - * If one has not already been chosen, it checks to see if a - * functional rtc device is available. */ struct rtc_device *alarmtimer_get_rtcdev(void) { @@ -87,7 +86,8 @@ static int alarmtimer_rtc_add_device(struct device *dev, { unsigned long flags; struct rtc_device *rtc = to_rtc_device(dev); - struct wakeup_source *__ws; + struct platform_device *pdev; + int ret = 0; if (rtcdev) return -EBUSY; @@ -97,26 +97,31 @@ static int alarmtimer_rtc_add_device(struct device *dev, if (!device_may_wakeup(rtc->dev.parent)) return -1; - __ws = wakeup_source_register("alarmtimer"); + pdev = platform_device_register_data(dev, "alarmtimer", + PLATFORM_DEVID_AUTO, NULL, 0); + if (!IS_ERR(pdev)) + device_init_wakeup(&pdev->dev, true); spin_lock_irqsave(&rtcdev_lock, flags); - if (!rtcdev) { + if (!IS_ERR(pdev) && !rtcdev) { if (!try_module_get(rtc->owner)) { - spin_unlock_irqrestore(&rtcdev_lock, flags); - return -1; + ret = -1; + goto unlock; } rtcdev = rtc; /* hold a reference so it doesn't go away */ get_device(dev); - ws = __ws; - __ws = NULL; + pdev = NULL; + } else { + ret = -1; } +unlock: spin_unlock_irqrestore(&rtcdev_lock, flags); - wakeup_source_unregister(__ws); + platform_device_unregister(pdev); - return 0; + return ret; } static inline void alarmtimer_rtc_timer_init(void) @@ -138,11 +143,6 @@ static void alarmtimer_rtc_interface_remove(void) class_interface_unregister(&alarmtimer_rtc_interface); } #else -struct rtc_device *alarmtimer_get_rtcdev(void) -{ - return NULL; -} -#define rtcdev (NULL) static inline int alarmtimer_rtc_interface_setup(void) { return 0; } static inline void alarmtimer_rtc_interface_remove(void) { } static inline void alarmtimer_rtc_timer_init(void) { } @@ -207,7 +207,7 @@ static enum hrtimer_restart alarmtimer_fired(struct hrtimer *timer) spin_unlock_irqrestore(&base->lock, flags); if (alarm->function) - restart = alarm->function(alarm, base->gettime()); + restart = alarm->function(alarm, base->get_ktime()); spin_lock_irqsave(&base->lock, flags); if (restart != ALARMTIMER_NORESTART) { @@ -217,7 +217,7 @@ static enum hrtimer_restart alarmtimer_fired(struct hrtimer *timer) } spin_unlock_irqrestore(&base->lock, flags); - trace_alarmtimer_fired(alarm, base->gettime()); + trace_alarmtimer_fired(alarm, base->get_ktime()); return ret; } @@ -225,7 +225,7 @@ static enum hrtimer_restart alarmtimer_fired(struct hrtimer *timer) ktime_t alarm_expires_remaining(const struct alarm *alarm) { struct alarm_base *base = &alarm_bases[alarm->type]; - return ktime_sub(alarm->node.expires, base->gettime()); + return ktime_sub(alarm->node.expires, base->get_ktime()); } EXPORT_SYMBOL_GPL(alarm_expires_remaining); @@ -270,7 +270,7 @@ static int alarmtimer_suspend(struct device *dev) spin_unlock_irqrestore(&base->lock, flags); if (!next) continue; - delta = ktime_sub(next->expires, base->gettime()); + delta = ktime_sub(next->expires, base->get_ktime()); if (!min || (delta < min)) { expires = next->expires; min = delta; @@ -281,7 +281,7 @@ static int alarmtimer_suspend(struct device *dev) return 0; if (ktime_to_ns(min) < 2 * NSEC_PER_SEC) { - __pm_wakeup_event(ws, 2 * MSEC_PER_SEC); + pm_wakeup_event(dev, 2 * MSEC_PER_SEC); return -EBUSY; } @@ -296,7 +296,7 @@ static int alarmtimer_suspend(struct device *dev) /* Set alarm, if in the past reject suspend briefly to handle */ ret = rtc_timer_start(rtc, &rtctimer, now, 0); if (ret < 0) - __pm_wakeup_event(ws, MSEC_PER_SEC); + pm_wakeup_event(dev, MSEC_PER_SEC); return ret; } @@ -364,7 +364,7 @@ void alarm_start(struct alarm *alarm, ktime_t start) hrtimer_start(&alarm->timer, alarm->node.expires, HRTIMER_MODE_ABS); spin_unlock_irqrestore(&base->lock, flags); - trace_alarmtimer_start(alarm, base->gettime()); + trace_alarmtimer_start(alarm, base->get_ktime()); } EXPORT_SYMBOL_GPL(alarm_start); @@ -377,7 +377,7 @@ void alarm_start_relative(struct alarm *alarm, ktime_t start) { struct alarm_base *base = &alarm_bases[alarm->type]; - start = ktime_add_safe(start, base->gettime()); + start = ktime_add_safe(start, base->get_ktime()); alarm_start(alarm, start); } EXPORT_SYMBOL_GPL(alarm_start_relative); @@ -414,7 +414,7 @@ int alarm_try_to_cancel(struct alarm *alarm) alarmtimer_dequeue(base, alarm); spin_unlock_irqrestore(&base->lock, flags); - trace_alarmtimer_cancel(alarm, base->gettime()); + trace_alarmtimer_cancel(alarm, base->get_ktime()); return ret; } EXPORT_SYMBOL_GPL(alarm_try_to_cancel); @@ -432,7 +432,7 @@ int alarm_cancel(struct alarm *alarm) int ret = alarm_try_to_cancel(alarm); if (ret >= 0) return ret; - cpu_relax(); + hrtimer_cancel_wait_running(&alarm->timer); } } EXPORT_SYMBOL_GPL(alarm_cancel); @@ -474,7 +474,7 @@ u64 alarm_forward_now(struct alarm *alarm, ktime_t interval) { struct alarm_base *base = &alarm_bases[alarm->type]; - return alarm_forward(alarm, base->gettime(), interval); + return alarm_forward(alarm, base->get_ktime(), interval); } EXPORT_SYMBOL_GPL(alarm_forward_now); @@ -500,7 +500,7 @@ static void alarmtimer_freezerset(ktime_t absexp, enum alarmtimer_type type) return; } - delta = ktime_sub(absexp, base->gettime()); + delta = ktime_sub(absexp, base->get_ktime()); spin_lock_irqsave(&freezer_delta_lock, flags); if (!freezer_delta || (delta < freezer_delta)) { @@ -606,6 +606,19 @@ static int alarm_timer_try_to_cancel(struct k_itimer *timr) } /** + * alarm_timer_wait_running - Posix timer callback to wait for a timer + * @timr: Pointer to the posixtimer data struct + * + * Called from the core code when timer cancel detected that the callback + * is running. @timr is unlocked and rcu read lock is held to prevent it + * from being freed. + */ +static void alarm_timer_wait_running(struct k_itimer *timr) +{ + hrtimer_cancel_wait_running(&timr->it.alarm.alarmtimer.timer); +} + +/** * alarm_timer_arm - Posix timer callback to arm a timer * @timr: Pointer to the posixtimer data struct * @expires: The new expiry time @@ -619,7 +632,7 @@ static void alarm_timer_arm(struct k_itimer *timr, ktime_t expires, struct alarm_base *base = &alarm_bases[alarm->type]; if (!absolute) - expires = ktime_add_safe(expires, base->gettime()); + expires = ktime_add_safe(expires, base->get_ktime()); if (sigev_none) alarm->node.expires = expires; else @@ -644,24 +657,41 @@ static int alarm_clock_getres(const clockid_t which_clock, struct timespec64 *tp } /** - * alarm_clock_get - posix clock_get interface + * alarm_clock_get_timespec - posix clock_get_timespec interface * @which_clock: clockid * @tp: timespec to fill. * - * Provides the underlying alarm base time. + * Provides the underlying alarm base time in a tasks time namespace. */ -static int alarm_clock_get(clockid_t which_clock, struct timespec64 *tp) +static int alarm_clock_get_timespec(clockid_t which_clock, struct timespec64 *tp) { struct alarm_base *base = &alarm_bases[clock2alarm(which_clock)]; if (!alarmtimer_get_rtcdev()) return -EINVAL; - *tp = ktime_to_timespec64(base->gettime()); + base->get_timespec(tp); + return 0; } /** + * alarm_clock_get_ktime - posix clock_get_ktime interface + * @which_clock: clockid + * + * Provides the underlying alarm base time in the root namespace. + */ +static ktime_t alarm_clock_get_ktime(clockid_t which_clock) +{ + struct alarm_base *base = &alarm_bases[clock2alarm(which_clock)]; + + if (!alarmtimer_get_rtcdev()) + return -EINVAL; + + return base->get_ktime(); +} + +/** * alarm_timer_create - posix timer_create interface * @new_timer: k_itimer pointer to manage * @@ -672,7 +702,7 @@ static int alarm_timer_create(struct k_itimer *new_timer) enum alarmtimer_type type; if (!alarmtimer_get_rtcdev()) - return -ENOTSUPP; + return -EOPNOTSUPP; if (!capable(CAP_WAKE_ALARM)) return -EPERM; @@ -734,7 +764,7 @@ static int alarmtimer_do_nsleep(struct alarm *alarm, ktime_t absexp, struct timespec64 rmt; ktime_t rem; - rem = ktime_sub(absexp, alarm_bases[type].gettime()); + rem = ktime_sub(absexp, alarm_bases[type].get_ktime()); if (rem <= 0) return 0; @@ -790,7 +820,7 @@ static int alarm_timer_nsleep(const clockid_t which_clock, int flags, int ret = 0; if (!alarmtimer_get_rtcdev()) - return -ENOTSUPP; + return -EOPNOTSUPP; if (flags & ~TIMER_ABSTIME) return -EINVAL; @@ -803,9 +833,11 @@ static int alarm_timer_nsleep(const clockid_t which_clock, int flags, exp = timespec64_to_ktime(*tsreq); /* Convert (if necessary) to absolute time */ if (flags != TIMER_ABSTIME) { - ktime_t now = alarm_bases[type].gettime(); + ktime_t now = alarm_bases[type].get_ktime(); exp = ktime_add_safe(now, exp); + } else { + exp = timens_ktime_to_host(which_clock, exp); } ret = alarmtimer_do_nsleep(&alarm, exp, type); @@ -824,7 +856,8 @@ static int alarm_timer_nsleep(const clockid_t which_clock, int flags, const struct k_clock alarm_clock = { .clock_getres = alarm_clock_getres, - .clock_get = alarm_clock_get, + .clock_get_ktime = alarm_clock_get_ktime, + .clock_get_timespec = alarm_clock_get_timespec, .timer_create = alarm_timer_create, .timer_set = common_timer_set, .timer_del = common_timer_del, @@ -834,6 +867,7 @@ const struct k_clock alarm_clock = { .timer_forward = alarm_timer_forward, .timer_remaining = alarm_timer_remaining, .timer_try_to_cancel = alarm_timer_try_to_cancel, + .timer_wait_running = alarm_timer_wait_running, .nsleep = alarm_timer_nsleep, }; #endif /* CONFIG_POSIX_TIMERS */ @@ -852,6 +886,12 @@ static struct platform_driver alarmtimer_driver = { } }; +static void get_boottime_timespec(struct timespec64 *tp) +{ + ktime_get_boottime_ts64(tp); + timens_add_boottime(tp); +} + /** * alarmtimer_init - Initialize alarm timer code * @@ -860,17 +900,18 @@ static struct platform_driver alarmtimer_driver = { */ static int __init alarmtimer_init(void) { - struct platform_device *pdev; - int error = 0; + int error; int i; alarmtimer_rtc_timer_init(); /* Initialize alarm bases */ alarm_bases[ALARM_REALTIME].base_clockid = CLOCK_REALTIME; - alarm_bases[ALARM_REALTIME].gettime = &ktime_get_real; + alarm_bases[ALARM_REALTIME].get_ktime = &ktime_get_real; + alarm_bases[ALARM_REALTIME].get_timespec = ktime_get_real_ts64, alarm_bases[ALARM_BOOTTIME].base_clockid = CLOCK_BOOTTIME; - alarm_bases[ALARM_BOOTTIME].gettime = &ktime_get_boottime; + alarm_bases[ALARM_BOOTTIME].get_ktime = &ktime_get_boottime; + alarm_bases[ALARM_BOOTTIME].get_timespec = get_boottime_timespec; for (i = 0; i < ALARM_NUMTYPE; i++) { timerqueue_init_head(&alarm_bases[i].timerqueue); spin_lock_init(&alarm_bases[i].lock); @@ -884,15 +925,7 @@ static int __init alarmtimer_init(void) if (error) goto out_if; - pdev = platform_device_register_simple("alarmtimer", -1, NULL, 0); - if (IS_ERR(pdev)) { - error = PTR_ERR(pdev); - goto out_drv; - } return 0; - -out_drv: - platform_driver_unregister(&alarmtimer_driver); out_if: alarmtimer_rtc_interface_remove(); return error; diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index fff5f64981c6..428beb69426a 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -293,8 +293,15 @@ static void clocksource_watchdog(struct timer_list *unused) next_cpu = cpumask_next(raw_smp_processor_id(), cpu_online_mask); if (next_cpu >= nr_cpu_ids) next_cpu = cpumask_first(cpu_online_mask); - watchdog_timer.expires += WATCHDOG_INTERVAL; - add_timer_on(&watchdog_timer, next_cpu); + + /* + * Arm timer if not already pending: could race with concurrent + * pair clocksource_stop_watchdog() clocksource_start_watchdog(). + */ + if (!timer_pending(&watchdog_timer)) { + watchdog_timer.expires += WATCHDOG_INTERVAL; + add_timer_on(&watchdog_timer, next_cpu); + } out: spin_unlock(&watchdog_lock); } diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 5ee77f1a8a92..3a609e7344f3 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -140,6 +140,11 @@ static struct hrtimer_cpu_base migration_cpu_base = { #define migration_base migration_cpu_base.clock_base[0] +static inline bool is_migration_base(struct hrtimer_clock_base *base) +{ + return base == &migration_base; +} + /* * We are using hashed locking: holding per_cpu(hrtimer_bases)[n].lock * means that all timers which are tied to this base via timer->base are @@ -159,7 +164,7 @@ struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer, struct hrtimer_clock_base *base; for (;;) { - base = timer->base; + base = READ_ONCE(timer->base); if (likely(base != &migration_base)) { raw_spin_lock_irqsave(&base->cpu_base->lock, *flags); if (likely(base == timer->base)) @@ -239,7 +244,7 @@ again: return base; /* See the comment in lock_hrtimer_base() */ - timer->base = &migration_base; + WRITE_ONCE(timer->base, &migration_base); raw_spin_unlock(&base->cpu_base->lock); raw_spin_lock(&new_base->cpu_base->lock); @@ -248,10 +253,10 @@ again: raw_spin_unlock(&new_base->cpu_base->lock); raw_spin_lock(&base->cpu_base->lock); new_cpu_base = this_cpu_base; - timer->base = base; + WRITE_ONCE(timer->base, base); goto again; } - timer->base = new_base; + WRITE_ONCE(timer->base, new_base); } else { if (new_cpu_base != this_cpu_base && hrtimer_check_target(timer, new_base)) { @@ -264,6 +269,11 @@ again: #else /* CONFIG_SMP */ +static inline bool is_migration_base(struct hrtimer_clock_base *base) +{ + return false; +} + static inline struct hrtimer_clock_base * lock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags) { @@ -427,6 +437,17 @@ void hrtimer_init_on_stack(struct hrtimer *timer, clockid_t clock_id, } EXPORT_SYMBOL_GPL(hrtimer_init_on_stack); +static void __hrtimer_init_sleeper(struct hrtimer_sleeper *sl, + clockid_t clock_id, enum hrtimer_mode mode); + +void hrtimer_init_sleeper_on_stack(struct hrtimer_sleeper *sl, + clockid_t clock_id, enum hrtimer_mode mode) +{ + debug_object_init_on_stack(&sl->timer, &hrtimer_debug_descr); + __hrtimer_init_sleeper(sl, clock_id, mode); +} +EXPORT_SYMBOL_GPL(hrtimer_init_sleeper_on_stack); + void destroy_hrtimer_on_stack(struct hrtimer *timer) { debug_object_free(timer, &hrtimer_debug_descr); @@ -945,7 +966,8 @@ static int enqueue_hrtimer(struct hrtimer *timer, base->cpu_base->active_bases |= 1 << base->index; - timer->state = HRTIMER_STATE_ENQUEUED; + /* Pairs with the lockless read in hrtimer_is_queued() */ + WRITE_ONCE(timer->state, HRTIMER_STATE_ENQUEUED); return timerqueue_add(&base->active, &timer->node); } @@ -967,7 +989,8 @@ static void __remove_hrtimer(struct hrtimer *timer, struct hrtimer_cpu_base *cpu_base = base->cpu_base; u8 state = timer->state; - timer->state = newstate; + /* Pairs with the lockless read in hrtimer_is_queued() */ + WRITE_ONCE(timer->state, newstate); if (!(state & HRTIMER_STATE_ENQUEUED)) return; @@ -992,8 +1015,9 @@ static void __remove_hrtimer(struct hrtimer *timer, static inline int remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, bool restart) { - if (hrtimer_is_queued(timer)) { - u8 state = timer->state; + u8 state = timer->state; + + if (state & HRTIMER_STATE_ENQUEUED) { int reprogram; /* @@ -1096,9 +1120,13 @@ void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, /* * Check whether the HRTIMER_MODE_SOFT bit and hrtimer.is_soft - * match. + * match on CONFIG_PREEMPT_RT = n. With PREEMPT_RT check the hard + * expiry mode because unmarked timers are moved to softirq expiry. */ - WARN_ON_ONCE(!(mode & HRTIMER_MODE_SOFT) ^ !timer->is_soft); + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + WARN_ON_ONCE(!(mode & HRTIMER_MODE_SOFT) ^ !timer->is_soft); + else + WARN_ON_ONCE(!(mode & HRTIMER_MODE_HARD) ^ !timer->is_hard); base = lock_hrtimer_base(timer, &flags); @@ -1147,6 +1175,93 @@ int hrtimer_try_to_cancel(struct hrtimer *timer) } EXPORT_SYMBOL_GPL(hrtimer_try_to_cancel); +#ifdef CONFIG_PREEMPT_RT +static void hrtimer_cpu_base_init_expiry_lock(struct hrtimer_cpu_base *base) +{ + spin_lock_init(&base->softirq_expiry_lock); +} + +static void hrtimer_cpu_base_lock_expiry(struct hrtimer_cpu_base *base) +{ + spin_lock(&base->softirq_expiry_lock); +} + +static void hrtimer_cpu_base_unlock_expiry(struct hrtimer_cpu_base *base) +{ + spin_unlock(&base->softirq_expiry_lock); +} + +/* + * The counterpart to hrtimer_cancel_wait_running(). + * + * If there is a waiter for cpu_base->expiry_lock, then it was waiting for + * the timer callback to finish. Drop expiry_lock and reaquire it. That + * allows the waiter to acquire the lock and make progress. + */ +static void hrtimer_sync_wait_running(struct hrtimer_cpu_base *cpu_base, + unsigned long flags) +{ + if (atomic_read(&cpu_base->timer_waiters)) { + raw_spin_unlock_irqrestore(&cpu_base->lock, flags); + spin_unlock(&cpu_base->softirq_expiry_lock); + spin_lock(&cpu_base->softirq_expiry_lock); + raw_spin_lock_irq(&cpu_base->lock); + } +} + +/* + * This function is called on PREEMPT_RT kernels when the fast path + * deletion of a timer failed because the timer callback function was + * running. + * + * This prevents priority inversion: if the soft irq thread is preempted + * in the middle of a timer callback, then calling del_timer_sync() can + * lead to two issues: + * + * - If the caller is on a remote CPU then it has to spin wait for the timer + * handler to complete. This can result in unbound priority inversion. + * + * - If the caller originates from the task which preempted the timer + * handler on the same CPU, then spin waiting for the timer handler to + * complete is never going to end. + */ +void hrtimer_cancel_wait_running(const struct hrtimer *timer) +{ + /* Lockless read. Prevent the compiler from reloading it below */ + struct hrtimer_clock_base *base = READ_ONCE(timer->base); + + /* + * Just relax if the timer expires in hard interrupt context or if + * it is currently on the migration base. + */ + if (!timer->is_soft || is_migration_base(base)) { + cpu_relax(); + return; + } + + /* + * Mark the base as contended and grab the expiry lock, which is + * held by the softirq across the timer callback. Drop the lock + * immediately so the softirq can expire the next timer. In theory + * the timer could already be running again, but that's more than + * unlikely and just causes another wait loop. + */ + atomic_inc(&base->cpu_base->timer_waiters); + spin_lock_bh(&base->cpu_base->softirq_expiry_lock); + atomic_dec(&base->cpu_base->timer_waiters); + spin_unlock_bh(&base->cpu_base->softirq_expiry_lock); +} +#else +static inline void +hrtimer_cpu_base_init_expiry_lock(struct hrtimer_cpu_base *base) { } +static inline void +hrtimer_cpu_base_lock_expiry(struct hrtimer_cpu_base *base) { } +static inline void +hrtimer_cpu_base_unlock_expiry(struct hrtimer_cpu_base *base) { } +static inline void hrtimer_sync_wait_running(struct hrtimer_cpu_base *base, + unsigned long flags) { } +#endif + /** * hrtimer_cancel - cancel a timer and wait for the handler to finish. * @timer: the timer to be cancelled @@ -1157,13 +1272,15 @@ EXPORT_SYMBOL_GPL(hrtimer_try_to_cancel); */ int hrtimer_cancel(struct hrtimer *timer) { - for (;;) { - int ret = hrtimer_try_to_cancel(timer); + int ret; - if (ret >= 0) - return ret; - cpu_relax(); - } + do { + ret = hrtimer_try_to_cancel(timer); + + if (ret < 0) + hrtimer_cancel_wait_running(timer); + } while (ret < 0); + return ret; } EXPORT_SYMBOL_GPL(hrtimer_cancel); @@ -1260,8 +1377,17 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id, enum hrtimer_mode mode) { bool softtimer = !!(mode & HRTIMER_MODE_SOFT); - int base = softtimer ? HRTIMER_MAX_CLOCK_BASES / 2 : 0; struct hrtimer_cpu_base *cpu_base; + int base; + + /* + * On PREEMPT_RT enabled kernels hrtimers which are not explicitely + * marked for hard interrupt expiry mode are moved into soft + * interrupt context for latency reasons and because the callbacks + * can invoke functions which might sleep on RT, e.g. spin_lock(). + */ + if (IS_ENABLED(CONFIG_PREEMPT_RT) && !(mode & HRTIMER_MODE_HARD)) + softtimer = true; memset(timer, 0, sizeof(struct hrtimer)); @@ -1275,8 +1401,10 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id, if (clock_id == CLOCK_REALTIME && mode & HRTIMER_MODE_REL) clock_id = CLOCK_MONOTONIC; + base = softtimer ? HRTIMER_MAX_CLOCK_BASES / 2 : 0; base += hrtimer_clockid_to_base(clock_id); timer->is_soft = softtimer; + timer->is_hard = !softtimer; timer->base = &cpu_base->clock_base[base]; timerqueue_init(&timer->node); } @@ -1349,7 +1477,7 @@ EXPORT_SYMBOL_GPL(hrtimer_active); static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base, struct hrtimer_clock_base *base, struct hrtimer *timer, ktime_t *now, - unsigned long flags) + unsigned long flags) __must_hold(&cpu_base->lock) { enum hrtimer_restart (*fn)(struct hrtimer *); int restart; @@ -1449,6 +1577,8 @@ static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now, break; __run_hrtimer(cpu_base, base, timer, &basenow, flags); + if (active_mask == HRTIMER_ACTIVE_SOFT) + hrtimer_sync_wait_running(cpu_base, flags); } } } @@ -1459,6 +1589,7 @@ static __latent_entropy void hrtimer_run_softirq(struct softirq_action *h) unsigned long flags; ktime_t now; + hrtimer_cpu_base_lock_expiry(cpu_base); raw_spin_lock_irqsave(&cpu_base->lock, flags); now = hrtimer_update_base(cpu_base); @@ -1468,6 +1599,7 @@ static __latent_entropy void hrtimer_run_softirq(struct softirq_action *h) hrtimer_update_softirq_timer(cpu_base, true); raw_spin_unlock_irqrestore(&cpu_base->lock, flags); + hrtimer_cpu_base_unlock_expiry(cpu_base); } #ifdef CONFIG_HIGH_RES_TIMERS @@ -1639,10 +1771,75 @@ static enum hrtimer_restart hrtimer_wakeup(struct hrtimer *timer) return HRTIMER_NORESTART; } -void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, struct task_struct *task) +/** + * hrtimer_sleeper_start_expires - Start a hrtimer sleeper timer + * @sl: sleeper to be started + * @mode: timer mode abs/rel + * + * Wrapper around hrtimer_start_expires() for hrtimer_sleeper based timers + * to allow PREEMPT_RT to tweak the delivery mode (soft/hardirq context) + */ +void hrtimer_sleeper_start_expires(struct hrtimer_sleeper *sl, + enum hrtimer_mode mode) +{ + /* + * Make the enqueue delivery mode check work on RT. If the sleeper + * was initialized for hard interrupt delivery, force the mode bit. + * This is a special case for hrtimer_sleepers because + * hrtimer_init_sleeper() determines the delivery mode on RT so the + * fiddling with this decision is avoided at the call sites. + */ + if (IS_ENABLED(CONFIG_PREEMPT_RT) && sl->timer.is_hard) + mode |= HRTIMER_MODE_HARD; + + hrtimer_start_expires(&sl->timer, mode); +} +EXPORT_SYMBOL_GPL(hrtimer_sleeper_start_expires); + +static void __hrtimer_init_sleeper(struct hrtimer_sleeper *sl, + clockid_t clock_id, enum hrtimer_mode mode) { + /* + * On PREEMPT_RT enabled kernels hrtimers which are not explicitely + * marked for hard interrupt expiry mode are moved into soft + * interrupt context either for latency reasons or because the + * hrtimer callback takes regular spinlocks or invokes other + * functions which are not suitable for hard interrupt context on + * PREEMPT_RT. + * + * The hrtimer_sleeper callback is RT compatible in hard interrupt + * context, but there is a latency concern: Untrusted userspace can + * spawn many threads which arm timers for the same expiry time on + * the same CPU. That causes a latency spike due to the wakeup of + * a gazillion threads. + * + * OTOH, priviledged real-time user space applications rely on the + * low latency of hard interrupt wakeups. If the current task is in + * a real-time scheduling class, mark the mode for hard interrupt + * expiry. + */ + if (IS_ENABLED(CONFIG_PREEMPT_RT)) { + if (task_is_realtime(current) && !(mode & HRTIMER_MODE_SOFT)) + mode |= HRTIMER_MODE_HARD; + } + + __hrtimer_init(&sl->timer, clock_id, mode); sl->timer.function = hrtimer_wakeup; - sl->task = task; + sl->task = current; +} + +/** + * hrtimer_init_sleeper - initialize sleeper to the given clock + * @sl: sleeper to be initialized + * @clock_id: the clock to be used + * @mode: timer mode abs/rel + */ +void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, clockid_t clock_id, + enum hrtimer_mode mode) +{ + debug_init(&sl->timer, clock_id, mode); + __hrtimer_init_sleeper(sl, clock_id, mode); + } EXPORT_SYMBOL_GPL(hrtimer_init_sleeper); @@ -1669,11 +1866,9 @@ static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mod { struct restart_block *restart; - hrtimer_init_sleeper(t, current); - do { set_current_state(TASK_INTERRUPTIBLE); - hrtimer_start_expires(&t->timer, mode); + hrtimer_sleeper_start_expires(t, mode); if (likely(t->task)) freezable_schedule(); @@ -1707,17 +1902,16 @@ static long __sched hrtimer_nanosleep_restart(struct restart_block *restart) struct hrtimer_sleeper t; int ret; - hrtimer_init_on_stack(&t.timer, restart->nanosleep.clockid, - HRTIMER_MODE_ABS); + hrtimer_init_sleeper_on_stack(&t, restart->nanosleep.clockid, + HRTIMER_MODE_ABS); hrtimer_set_expires_tv64(&t.timer, restart->nanosleep.expires); - ret = do_nanosleep(&t, HRTIMER_MODE_ABS); destroy_hrtimer_on_stack(&t.timer); return ret; } -long hrtimer_nanosleep(const struct timespec64 *rqtp, - const enum hrtimer_mode mode, const clockid_t clockid) +long hrtimer_nanosleep(ktime_t rqtp, const enum hrtimer_mode mode, + const clockid_t clockid) { struct restart_block *restart; struct hrtimer_sleeper t; @@ -1728,8 +1922,8 @@ long hrtimer_nanosleep(const struct timespec64 *rqtp, if (dl_task(current) || rt_task(current)) slack = 0; - hrtimer_init_on_stack(&t.timer, clockid, mode); - hrtimer_set_expires_range_ns(&t.timer, timespec64_to_ktime(*rqtp), slack); + hrtimer_init_sleeper_on_stack(&t, clockid, mode); + hrtimer_set_expires_range_ns(&t.timer, rqtp, slack); ret = do_nanosleep(&t, mode); if (ret != -ERESTART_RESTARTBLOCK) goto out; @@ -1749,7 +1943,7 @@ out: return ret; } -#if !defined(CONFIG_64BIT_TIME) || defined(CONFIG_64BIT) +#ifdef CONFIG_64BIT SYSCALL_DEFINE2(nanosleep, struct __kernel_timespec __user *, rqtp, struct __kernel_timespec __user *, rmtp) @@ -1764,7 +1958,8 @@ SYSCALL_DEFINE2(nanosleep, struct __kernel_timespec __user *, rqtp, current->restart_block.nanosleep.type = rmtp ? TT_NATIVE : TT_NONE; current->restart_block.nanosleep.rmtp = rmtp; - return hrtimer_nanosleep(&tu, HRTIMER_MODE_REL, CLOCK_MONOTONIC); + return hrtimer_nanosleep(timespec64_to_ktime(tu), HRTIMER_MODE_REL, + CLOCK_MONOTONIC); } #endif @@ -1784,7 +1979,8 @@ SYSCALL_DEFINE2(nanosleep_time32, struct old_timespec32 __user *, rqtp, current->restart_block.nanosleep.type = rmtp ? TT_COMPAT : TT_NONE; current->restart_block.nanosleep.compat_rmtp = rmtp; - return hrtimer_nanosleep(&tu, HRTIMER_MODE_REL, CLOCK_MONOTONIC); + return hrtimer_nanosleep(timespec64_to_ktime(tu), HRTIMER_MODE_REL, + CLOCK_MONOTONIC); } #endif @@ -1809,6 +2005,7 @@ int hrtimers_prepare_cpu(unsigned int cpu) cpu_base->softirq_next_timer = NULL; cpu_base->expires_next = KTIME_MAX; cpu_base->softirq_expires_next = KTIME_MAX; + hrtimer_cpu_base_init_expiry_lock(cpu_base); return 0; } @@ -1927,12 +2124,9 @@ schedule_hrtimeout_range_clock(ktime_t *expires, u64 delta, return -EINTR; } - hrtimer_init_on_stack(&t.timer, clock_id, mode); + hrtimer_init_sleeper_on_stack(&t, clock_id, mode); hrtimer_set_expires_range_ns(&t.timer, *expires, delta); - - hrtimer_init_sleeper(&t, current); - - hrtimer_start_expires(&t.timer, mode); + hrtimer_sleeper_start_expires(&t, mode); if (likely(t.task)) schedule(); diff --git a/kernel/time/itimer.c b/kernel/time/itimer.c index 02068b2d5862..ca4e6d57d68b 100644 --- a/kernel/time/itimer.c +++ b/kernel/time/itimer.c @@ -26,7 +26,7 @@ * Returns the delta between the expiry time and now, which can be * less than zero or 1usec for an pending expired timer */ -static struct timeval itimer_get_remtime(struct hrtimer *timer) +static struct timespec64 itimer_get_remtime(struct hrtimer *timer) { ktime_t rem = __hrtimer_get_remaining(timer, true); @@ -41,11 +41,11 @@ static struct timeval itimer_get_remtime(struct hrtimer *timer) } else rem = 0; - return ktime_to_timeval(rem); + return ktime_to_timespec64(rem); } static void get_cpu_itimer(struct task_struct *tsk, unsigned int clock_id, - struct itimerval *const value) + struct itimerspec64 *const value) { u64 val, interval; struct cpu_itimer *it = &tsk->signal->it[clock_id]; @@ -55,15 +55,10 @@ static void get_cpu_itimer(struct task_struct *tsk, unsigned int clock_id, val = it->expires; interval = it->incr; if (val) { - struct task_cputime cputime; - u64 t; + u64 t, samples[CPUCLOCK_MAX]; - thread_group_cputimer(tsk, &cputime); - if (clock_id == CPUCLOCK_PROF) - t = cputime.utime + cputime.stime; - else - /* CPUCLOCK_VIRT */ - t = cputime.utime; + thread_group_sample_cputime(tsk, samples); + t = samples[clock_id]; if (val < t) /* about to fire */ @@ -74,11 +69,11 @@ static void get_cpu_itimer(struct task_struct *tsk, unsigned int clock_id, spin_unlock_irq(&tsk->sighand->siglock); - value->it_value = ns_to_timeval(val); - value->it_interval = ns_to_timeval(interval); + value->it_value = ns_to_timespec64(val); + value->it_interval = ns_to_timespec64(interval); } -int do_getitimer(int which, struct itimerval *value) +static int do_getitimer(int which, struct itimerspec64 *value) { struct task_struct *tsk = current; @@ -87,7 +82,7 @@ int do_getitimer(int which, struct itimerval *value) spin_lock_irq(&tsk->sighand->siglock); value->it_value = itimer_get_remtime(&tsk->signal->real_timer); value->it_interval = - ktime_to_timeval(tsk->signal->it_real_incr); + ktime_to_timespec64(tsk->signal->it_real_incr); spin_unlock_irq(&tsk->sighand->siglock); break; case ITIMER_VIRTUAL: @@ -102,34 +97,59 @@ int do_getitimer(int which, struct itimerval *value) return 0; } -SYSCALL_DEFINE2(getitimer, int, which, struct itimerval __user *, value) +static int put_itimerval(struct __kernel_old_itimerval __user *o, + const struct itimerspec64 *i) { - int error = -EFAULT; - struct itimerval get_buffer; + struct __kernel_old_itimerval v; - if (value) { - error = do_getitimer(which, &get_buffer); - if (!error && - copy_to_user(value, &get_buffer, sizeof(get_buffer))) - error = -EFAULT; - } + v.it_interval.tv_sec = i->it_interval.tv_sec; + v.it_interval.tv_usec = i->it_interval.tv_nsec / NSEC_PER_USEC; + v.it_value.tv_sec = i->it_value.tv_sec; + v.it_value.tv_usec = i->it_value.tv_nsec / NSEC_PER_USEC; + return copy_to_user(o, &v, sizeof(struct __kernel_old_itimerval)) ? -EFAULT : 0; +} + + +SYSCALL_DEFINE2(getitimer, int, which, struct __kernel_old_itimerval __user *, value) +{ + struct itimerspec64 get_buffer; + int error = do_getitimer(which, &get_buffer); + + if (!error && put_itimerval(value, &get_buffer)) + error = -EFAULT; return error; } -#ifdef CONFIG_COMPAT +#if defined(CONFIG_COMPAT) || defined(CONFIG_ALPHA) +struct old_itimerval32 { + struct old_timeval32 it_interval; + struct old_timeval32 it_value; +}; + +static int put_old_itimerval32(struct old_itimerval32 __user *o, + const struct itimerspec64 *i) +{ + struct old_itimerval32 v32; + + v32.it_interval.tv_sec = i->it_interval.tv_sec; + v32.it_interval.tv_usec = i->it_interval.tv_nsec / NSEC_PER_USEC; + v32.it_value.tv_sec = i->it_value.tv_sec; + v32.it_value.tv_usec = i->it_value.tv_nsec / NSEC_PER_USEC; + return copy_to_user(o, &v32, sizeof(struct old_itimerval32)) ? -EFAULT : 0; +} + COMPAT_SYSCALL_DEFINE2(getitimer, int, which, - struct compat_itimerval __user *, it) + struct old_itimerval32 __user *, value) { - struct itimerval kit; - int error = do_getitimer(which, &kit); + struct itimerspec64 get_buffer; + int error = do_getitimer(which, &get_buffer); - if (!error && put_compat_itimerval(it, &kit)) + if (!error && put_old_itimerval32(value, &get_buffer)) error = -EFAULT; return error; } #endif - /* * The timer is automagically restarted, when interval != 0 */ @@ -146,8 +166,8 @@ enum hrtimer_restart it_real_fn(struct hrtimer *timer) } static void set_cpu_itimer(struct task_struct *tsk, unsigned int clock_id, - const struct itimerval *const value, - struct itimerval *const ovalue) + const struct itimerspec64 *const value, + struct itimerspec64 *const ovalue) { u64 oval, nval, ointerval, ninterval; struct cpu_itimer *it = &tsk->signal->it[clock_id]; @@ -156,8 +176,8 @@ static void set_cpu_itimer(struct task_struct *tsk, unsigned int clock_id, * Use the to_ktime conversion because that clamps the maximum * value to KTIME_MAX and avoid multiplication overflows. */ - nval = ktime_to_ns(timeval_to_ktime(value->it_value)); - ninterval = ktime_to_ns(timeval_to_ktime(value->it_interval)); + nval = timespec64_to_ns(&value->it_value); + ninterval = timespec64_to_ns(&value->it_interval); spin_lock_irq(&tsk->sighand->siglock); @@ -176,8 +196,8 @@ static void set_cpu_itimer(struct task_struct *tsk, unsigned int clock_id, spin_unlock_irq(&tsk->sighand->siglock); if (ovalue) { - ovalue->it_value = ns_to_timeval(oval); - ovalue->it_interval = ns_to_timeval(ointerval); + ovalue->it_value = ns_to_timespec64(oval); + ovalue->it_interval = ns_to_timespec64(ointerval); } } @@ -187,19 +207,13 @@ static void set_cpu_itimer(struct task_struct *tsk, unsigned int clock_id, #define timeval_valid(t) \ (((t)->tv_sec >= 0) && (((unsigned long) (t)->tv_usec) < USEC_PER_SEC)) -int do_setitimer(int which, struct itimerval *value, struct itimerval *ovalue) +static int do_setitimer(int which, struct itimerspec64 *value, + struct itimerspec64 *ovalue) { struct task_struct *tsk = current; struct hrtimer *timer; ktime_t expires; - /* - * Validate the timevals in value. - */ - if (!timeval_valid(&value->it_value) || - !timeval_valid(&value->it_interval)) - return -EINVAL; - switch (which) { case ITIMER_REAL: again: @@ -208,17 +222,18 @@ again: if (ovalue) { ovalue->it_value = itimer_get_remtime(timer); ovalue->it_interval - = ktime_to_timeval(tsk->signal->it_real_incr); + = ktime_to_timespec64(tsk->signal->it_real_incr); } /* We are sharing ->siglock with it_real_fn() */ if (hrtimer_try_to_cancel(timer) < 0) { spin_unlock_irq(&tsk->sighand->siglock); + hrtimer_cancel_wait_running(timer); goto again; } - expires = timeval_to_ktime(value->it_value); + expires = timespec64_to_ktime(value->it_value); if (expires != 0) { tsk->signal->it_real_incr = - timeval_to_ktime(value->it_interval); + timespec64_to_ktime(value->it_interval); hrtimer_start(timer, expires, HRTIMER_MODE_REL); } else tsk->signal->it_real_incr = 0; @@ -238,6 +253,17 @@ again: return 0; } +#ifdef CONFIG_SECURITY_SELINUX +void clear_itimer(void) +{ + struct itimerspec64 v = {}; + int i; + + for (i = 0; i < 3; i++) + do_setitimer(i, &v, NULL); +} +#endif + #ifdef __ARCH_WANT_SYS_ALARM /** @@ -254,15 +280,15 @@ again: */ static unsigned int alarm_setitimer(unsigned int seconds) { - struct itimerval it_new, it_old; + struct itimerspec64 it_new, it_old; #if BITS_PER_LONG < 64 if (seconds > INT_MAX) seconds = INT_MAX; #endif it_new.it_value.tv_sec = seconds; - it_new.it_value.tv_usec = 0; - it_new.it_interval.tv_sec = it_new.it_interval.tv_usec = 0; + it_new.it_value.tv_nsec = 0; + it_new.it_interval.tv_sec = it_new.it_interval.tv_nsec = 0; do_setitimer(ITIMER_REAL, &it_new, &it_old); @@ -270,8 +296,8 @@ static unsigned int alarm_setitimer(unsigned int seconds) * We can't return 0 if we have an alarm pending ... And we'd * better return too much than too little anyway */ - if ((!it_old.it_value.tv_sec && it_old.it_value.tv_usec) || - it_old.it_value.tv_usec >= 500000) + if ((!it_old.it_value.tv_sec && it_old.it_value.tv_nsec) || + it_old.it_value.tv_nsec >= (NSEC_PER_SEC / 2)) it_old.it_value.tv_sec++; return it_old.it_value.tv_sec; @@ -288,15 +314,35 @@ SYSCALL_DEFINE1(alarm, unsigned int, seconds) #endif -SYSCALL_DEFINE3(setitimer, int, which, struct itimerval __user *, value, - struct itimerval __user *, ovalue) +static int get_itimerval(struct itimerspec64 *o, const struct __kernel_old_itimerval __user *i) { - struct itimerval set_buffer, get_buffer; + struct __kernel_old_itimerval v; + + if (copy_from_user(&v, i, sizeof(struct __kernel_old_itimerval))) + return -EFAULT; + + /* Validate the timevals in value. */ + if (!timeval_valid(&v.it_value) || + !timeval_valid(&v.it_interval)) + return -EINVAL; + + o->it_interval.tv_sec = v.it_interval.tv_sec; + o->it_interval.tv_nsec = v.it_interval.tv_usec * NSEC_PER_USEC; + o->it_value.tv_sec = v.it_value.tv_sec; + o->it_value.tv_nsec = v.it_value.tv_usec * NSEC_PER_USEC; + return 0; +} + +SYSCALL_DEFINE3(setitimer, int, which, struct __kernel_old_itimerval __user *, value, + struct __kernel_old_itimerval __user *, ovalue) +{ + struct itimerspec64 set_buffer, get_buffer; int error; if (value) { - if(copy_from_user(&set_buffer, value, sizeof(set_buffer))) - return -EFAULT; + error = get_itimerval(&set_buffer, value); + if (error) + return error; } else { memset(&set_buffer, 0, sizeof(set_buffer)); printk_once(KERN_WARNING "%s calls setitimer() with new_value NULL pointer." @@ -308,30 +354,53 @@ SYSCALL_DEFINE3(setitimer, int, which, struct itimerval __user *, value, if (error || !ovalue) return error; - if (copy_to_user(ovalue, &get_buffer, sizeof(get_buffer))) + if (put_itimerval(ovalue, &get_buffer)) + return -EFAULT; + return 0; +} + +#if defined(CONFIG_COMPAT) || defined(CONFIG_ALPHA) +static int get_old_itimerval32(struct itimerspec64 *o, const struct old_itimerval32 __user *i) +{ + struct old_itimerval32 v32; + + if (copy_from_user(&v32, i, sizeof(struct old_itimerval32))) return -EFAULT; + + /* Validate the timevals in value. */ + if (!timeval_valid(&v32.it_value) || + !timeval_valid(&v32.it_interval)) + return -EINVAL; + + o->it_interval.tv_sec = v32.it_interval.tv_sec; + o->it_interval.tv_nsec = v32.it_interval.tv_usec * NSEC_PER_USEC; + o->it_value.tv_sec = v32.it_value.tv_sec; + o->it_value.tv_nsec = v32.it_value.tv_usec * NSEC_PER_USEC; return 0; } -#ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE3(setitimer, int, which, - struct compat_itimerval __user *, in, - struct compat_itimerval __user *, out) + struct old_itimerval32 __user *, value, + struct old_itimerval32 __user *, ovalue) { - struct itimerval kin, kout; + struct itimerspec64 set_buffer, get_buffer; int error; - if (in) { - if (get_compat_itimerval(&kin, in)) - return -EFAULT; + if (value) { + error = get_old_itimerval32(&set_buffer, value); + if (error) + return error; } else { - memset(&kin, 0, sizeof(kin)); + memset(&set_buffer, 0, sizeof(set_buffer)); + printk_once(KERN_WARNING "%s calls setitimer() with new_value NULL pointer." + " Misfeature support will be removed\n", + current->comm); } - error = do_setitimer(which, &kin, out ? &kout : NULL); - if (error || !out) + error = do_setitimer(which, &set_buffer, ovalue ? &get_buffer : NULL); + if (error || !ovalue) return error; - if (put_compat_itimerval(out, &kout)) + if (put_old_itimerval32(ovalue, &get_buffer)) return -EFAULT; return 0; } diff --git a/kernel/time/namespace.c b/kernel/time/namespace.c new file mode 100644 index 000000000000..12858507d75a --- /dev/null +++ b/kernel/time/namespace.c @@ -0,0 +1,468 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Author: Andrei Vagin <avagin@openvz.org> + * Author: Dmitry Safonov <dima@arista.com> + */ + +#include <linux/time_namespace.h> +#include <linux/user_namespace.h> +#include <linux/sched/signal.h> +#include <linux/sched/task.h> +#include <linux/seq_file.h> +#include <linux/proc_ns.h> +#include <linux/export.h> +#include <linux/time.h> +#include <linux/slab.h> +#include <linux/cred.h> +#include <linux/err.h> +#include <linux/mm.h> + +#include <vdso/datapage.h> + +ktime_t do_timens_ktime_to_host(clockid_t clockid, ktime_t tim, + struct timens_offsets *ns_offsets) +{ + ktime_t offset; + + switch (clockid) { + case CLOCK_MONOTONIC: + offset = timespec64_to_ktime(ns_offsets->monotonic); + break; + case CLOCK_BOOTTIME: + case CLOCK_BOOTTIME_ALARM: + offset = timespec64_to_ktime(ns_offsets->boottime); + break; + default: + return tim; + } + + /* + * Check that @tim value is in [offset, KTIME_MAX + offset] + * and subtract offset. + */ + if (tim < offset) { + /* + * User can specify @tim *absolute* value - if it's lesser than + * the time namespace's offset - it's already expired. + */ + tim = 0; + } else { + tim = ktime_sub(tim, offset); + if (unlikely(tim > KTIME_MAX)) + tim = KTIME_MAX; + } + + return tim; +} + +static struct ucounts *inc_time_namespaces(struct user_namespace *ns) +{ + return inc_ucount(ns, current_euid(), UCOUNT_TIME_NAMESPACES); +} + +static void dec_time_namespaces(struct ucounts *ucounts) +{ + dec_ucount(ucounts, UCOUNT_TIME_NAMESPACES); +} + +/** + * clone_time_ns - Clone a time namespace + * @user_ns: User namespace which owns a new namespace. + * @old_ns: Namespace to clone + * + * Clone @old_ns and set the clone refcount to 1 + * + * Return: The new namespace or ERR_PTR. + */ +static struct time_namespace *clone_time_ns(struct user_namespace *user_ns, + struct time_namespace *old_ns) +{ + struct time_namespace *ns; + struct ucounts *ucounts; + int err; + + err = -ENOSPC; + ucounts = inc_time_namespaces(user_ns); + if (!ucounts) + goto fail; + + err = -ENOMEM; + ns = kmalloc(sizeof(*ns), GFP_KERNEL); + if (!ns) + goto fail_dec; + + kref_init(&ns->kref); + + ns->vvar_page = alloc_page(GFP_KERNEL | __GFP_ZERO); + if (!ns->vvar_page) + goto fail_free; + + err = ns_alloc_inum(&ns->ns); + if (err) + goto fail_free_page; + + ns->ucounts = ucounts; + ns->ns.ops = &timens_operations; + ns->user_ns = get_user_ns(user_ns); + ns->offsets = old_ns->offsets; + ns->frozen_offsets = false; + return ns; + +fail_free_page: + __free_page(ns->vvar_page); +fail_free: + kfree(ns); +fail_dec: + dec_time_namespaces(ucounts); +fail: + return ERR_PTR(err); +} + +/** + * copy_time_ns - Create timens_for_children from @old_ns + * @flags: Cloning flags + * @user_ns: User namespace which owns a new namespace. + * @old_ns: Namespace to clone + * + * If CLONE_NEWTIME specified in @flags, creates a new timens_for_children; + * adds a refcounter to @old_ns otherwise. + * + * Return: timens_for_children namespace or ERR_PTR. + */ +struct time_namespace *copy_time_ns(unsigned long flags, + struct user_namespace *user_ns, struct time_namespace *old_ns) +{ + if (!(flags & CLONE_NEWTIME)) + return get_time_ns(old_ns); + + return clone_time_ns(user_ns, old_ns); +} + +static struct timens_offset offset_from_ts(struct timespec64 off) +{ + struct timens_offset ret; + + ret.sec = off.tv_sec; + ret.nsec = off.tv_nsec; + + return ret; +} + +/* + * A time namespace VVAR page has the same layout as the VVAR page which + * contains the system wide VDSO data. + * + * For a normal task the VVAR pages are installed in the normal ordering: + * VVAR + * PVCLOCK + * HVCLOCK + * TIMENS <- Not really required + * + * Now for a timens task the pages are installed in the following order: + * TIMENS + * PVCLOCK + * HVCLOCK + * VVAR + * + * The check for vdso_data->clock_mode is in the unlikely path of + * the seq begin magic. So for the non-timens case most of the time + * 'seq' is even, so the branch is not taken. + * + * If 'seq' is odd, i.e. a concurrent update is in progress, the extra check + * for vdso_data->clock_mode is a non-issue. The task is spin waiting for the + * update to finish and for 'seq' to become even anyway. + * + * Timens page has vdso_data->clock_mode set to VCLOCK_TIMENS which enforces + * the time namespace handling path. + */ +static void timens_setup_vdso_data(struct vdso_data *vdata, + struct time_namespace *ns) +{ + struct timens_offset *offset = vdata->offset; + struct timens_offset monotonic = offset_from_ts(ns->offsets.monotonic); + struct timens_offset boottime = offset_from_ts(ns->offsets.boottime); + + vdata->seq = 1; + vdata->clock_mode = VCLOCK_TIMENS; + offset[CLOCK_MONOTONIC] = monotonic; + offset[CLOCK_MONOTONIC_RAW] = monotonic; + offset[CLOCK_MONOTONIC_COARSE] = monotonic; + offset[CLOCK_BOOTTIME] = boottime; + offset[CLOCK_BOOTTIME_ALARM] = boottime; +} + +/* + * Protects possibly multiple offsets writers racing each other + * and tasks entering the namespace. + */ +static DEFINE_MUTEX(offset_lock); + +static void timens_set_vvar_page(struct task_struct *task, + struct time_namespace *ns) +{ + struct vdso_data *vdata; + unsigned int i; + + if (ns == &init_time_ns) + return; + + /* Fast-path, taken by every task in namespace except the first. */ + if (likely(ns->frozen_offsets)) + return; + + mutex_lock(&offset_lock); + /* Nothing to-do: vvar_page has been already initialized. */ + if (ns->frozen_offsets) + goto out; + + ns->frozen_offsets = true; + vdata = arch_get_vdso_data(page_address(ns->vvar_page)); + + for (i = 0; i < CS_BASES; i++) + timens_setup_vdso_data(&vdata[i], ns); + +out: + mutex_unlock(&offset_lock); +} + +void free_time_ns(struct kref *kref) +{ + struct time_namespace *ns; + + ns = container_of(kref, struct time_namespace, kref); + dec_time_namespaces(ns->ucounts); + put_user_ns(ns->user_ns); + ns_free_inum(&ns->ns); + __free_page(ns->vvar_page); + kfree(ns); +} + +static struct time_namespace *to_time_ns(struct ns_common *ns) +{ + return container_of(ns, struct time_namespace, ns); +} + +static struct ns_common *timens_get(struct task_struct *task) +{ + struct time_namespace *ns = NULL; + struct nsproxy *nsproxy; + + task_lock(task); + nsproxy = task->nsproxy; + if (nsproxy) { + ns = nsproxy->time_ns; + get_time_ns(ns); + } + task_unlock(task); + + return ns ? &ns->ns : NULL; +} + +static struct ns_common *timens_for_children_get(struct task_struct *task) +{ + struct time_namespace *ns = NULL; + struct nsproxy *nsproxy; + + task_lock(task); + nsproxy = task->nsproxy; + if (nsproxy) { + ns = nsproxy->time_ns_for_children; + get_time_ns(ns); + } + task_unlock(task); + + return ns ? &ns->ns : NULL; +} + +static void timens_put(struct ns_common *ns) +{ + put_time_ns(to_time_ns(ns)); +} + +static int timens_install(struct nsproxy *nsproxy, struct ns_common *new) +{ + struct time_namespace *ns = to_time_ns(new); + int err; + + if (!current_is_single_threaded()) + return -EUSERS; + + if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN) || + !ns_capable(current_user_ns(), CAP_SYS_ADMIN)) + return -EPERM; + + timens_set_vvar_page(current, ns); + + err = vdso_join_timens(current, ns); + if (err) + return err; + + get_time_ns(ns); + put_time_ns(nsproxy->time_ns); + nsproxy->time_ns = ns; + + get_time_ns(ns); + put_time_ns(nsproxy->time_ns_for_children); + nsproxy->time_ns_for_children = ns; + return 0; +} + +int timens_on_fork(struct nsproxy *nsproxy, struct task_struct *tsk) +{ + struct ns_common *nsc = &nsproxy->time_ns_for_children->ns; + struct time_namespace *ns = to_time_ns(nsc); + int err; + + /* create_new_namespaces() already incremented the ref counter */ + if (nsproxy->time_ns == nsproxy->time_ns_for_children) + return 0; + + timens_set_vvar_page(tsk, ns); + + err = vdso_join_timens(tsk, ns); + if (err) + return err; + + get_time_ns(ns); + put_time_ns(nsproxy->time_ns); + nsproxy->time_ns = ns; + + return 0; +} + +static struct user_namespace *timens_owner(struct ns_common *ns) +{ + return to_time_ns(ns)->user_ns; +} + +static void show_offset(struct seq_file *m, int clockid, struct timespec64 *ts) +{ + seq_printf(m, "%d %lld %ld\n", clockid, ts->tv_sec, ts->tv_nsec); +} + +void proc_timens_show_offsets(struct task_struct *p, struct seq_file *m) +{ + struct ns_common *ns; + struct time_namespace *time_ns; + + ns = timens_for_children_get(p); + if (!ns) + return; + time_ns = to_time_ns(ns); + + show_offset(m, CLOCK_MONOTONIC, &time_ns->offsets.monotonic); + show_offset(m, CLOCK_BOOTTIME, &time_ns->offsets.boottime); + put_time_ns(time_ns); +} + +int proc_timens_set_offset(struct file *file, struct task_struct *p, + struct proc_timens_offset *offsets, int noffsets) +{ + struct ns_common *ns; + struct time_namespace *time_ns; + struct timespec64 tp; + int i, err; + + ns = timens_for_children_get(p); + if (!ns) + return -ESRCH; + time_ns = to_time_ns(ns); + + if (!file_ns_capable(file, time_ns->user_ns, CAP_SYS_TIME)) { + put_time_ns(time_ns); + return -EPERM; + } + + for (i = 0; i < noffsets; i++) { + struct proc_timens_offset *off = &offsets[i]; + + switch (off->clockid) { + case CLOCK_MONOTONIC: + ktime_get_ts64(&tp); + break; + case CLOCK_BOOTTIME: + ktime_get_boottime_ts64(&tp); + break; + default: + err = -EINVAL; + goto out; + } + + err = -ERANGE; + + if (off->val.tv_sec > KTIME_SEC_MAX || + off->val.tv_sec < -KTIME_SEC_MAX) + goto out; + + tp = timespec64_add(tp, off->val); + /* + * KTIME_SEC_MAX is divided by 2 to be sure that KTIME_MAX is + * still unreachable. + */ + if (tp.tv_sec < 0 || tp.tv_sec > KTIME_SEC_MAX / 2) + goto out; + } + + mutex_lock(&offset_lock); + if (time_ns->frozen_offsets) { + err = -EACCES; + goto out_unlock; + } + + err = 0; + /* Don't report errors after this line */ + for (i = 0; i < noffsets; i++) { + struct proc_timens_offset *off = &offsets[i]; + struct timespec64 *offset = NULL; + + switch (off->clockid) { + case CLOCK_MONOTONIC: + offset = &time_ns->offsets.monotonic; + break; + case CLOCK_BOOTTIME: + offset = &time_ns->offsets.boottime; + break; + } + + *offset = off->val; + } + +out_unlock: + mutex_unlock(&offset_lock); +out: + put_time_ns(time_ns); + + return err; +} + +const struct proc_ns_operations timens_operations = { + .name = "time", + .type = CLONE_NEWTIME, + .get = timens_get, + .put = timens_put, + .install = timens_install, + .owner = timens_owner, +}; + +const struct proc_ns_operations timens_for_children_operations = { + .name = "time_for_children", + .type = CLONE_NEWTIME, + .get = timens_for_children_get, + .put = timens_put, + .install = timens_install, + .owner = timens_owner, +}; + +struct time_namespace init_time_ns = { + .kref = KREF_INIT(3), + .user_ns = &init_user_ns, + .ns.inum = PROC_TIME_INIT_INO, + .ns.ops = &timens_operations, + .frozen_offsets = true, +}; + +static int __init time_ns_init(void) +{ + return 0; +} +subsys_initcall(time_ns_init); diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 65eb796610dc..069ca78fb0bf 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -771,7 +771,7 @@ int __do_adjtimex(struct __kernel_timex *txc, const struct timespec64 *ts, /* fill PPS status fields */ pps_fill_timex(txc); - txc->time.tv_sec = (time_t)ts->tv_sec; + txc->time.tv_sec = ts->tv_sec; txc->time.tv_usec = ts->tv_nsec; if (!(time_status & STA_NANO)) txc->time.tv_usec = ts->tv_nsec / NSEC_PER_USEC; diff --git a/kernel/time/posix-clock.c b/kernel/time/posix-clock.c index ec960bb939fd..77c0c2370b6d 100644 --- a/kernel/time/posix-clock.c +++ b/kernel/time/posix-clock.c @@ -14,8 +14,6 @@ #include "posix-timers.h" -static void delete_clock(struct kref *kref); - /* * Returns NULL if the posix_clock instance attached to 'fp' is old and stale. */ @@ -125,7 +123,7 @@ static int posix_clock_open(struct inode *inode, struct file *fp) err = 0; if (!err) { - kref_get(&clk->kref); + get_device(clk->dev); fp->private_data = clk; } out: @@ -141,7 +139,7 @@ static int posix_clock_release(struct inode *inode, struct file *fp) if (clk->ops.release) err = clk->ops.release(clk); - kref_put(&clk->kref, delete_clock); + put_device(clk->dev); fp->private_data = NULL; @@ -161,38 +159,35 @@ static const struct file_operations posix_clock_file_operations = { #endif }; -int posix_clock_register(struct posix_clock *clk, dev_t devid) +int posix_clock_register(struct posix_clock *clk, struct device *dev) { int err; - kref_init(&clk->kref); init_rwsem(&clk->rwsem); cdev_init(&clk->cdev, &posix_clock_file_operations); + err = cdev_device_add(&clk->cdev, dev); + if (err) { + pr_err("%s unable to add device %d:%d\n", + dev_name(dev), MAJOR(dev->devt), MINOR(dev->devt)); + return err; + } clk->cdev.owner = clk->ops.owner; - err = cdev_add(&clk->cdev, devid, 1); + clk->dev = dev; - return err; + return 0; } EXPORT_SYMBOL_GPL(posix_clock_register); -static void delete_clock(struct kref *kref) -{ - struct posix_clock *clk = container_of(kref, struct posix_clock, kref); - - if (clk->release) - clk->release(clk); -} - void posix_clock_unregister(struct posix_clock *clk) { - cdev_del(&clk->cdev); + cdev_device_del(&clk->cdev, clk->dev); down_write(&clk->rwsem); clk->zombie = true; up_write(&clk->rwsem); - kref_put(&clk->kref, delete_clock); + put_device(clk->dev); } EXPORT_SYMBOL_GPL(posix_clock_unregister); @@ -315,8 +310,8 @@ out: } const struct k_clock clock_posix_dynamic = { - .clock_getres = pc_clock_getres, - .clock_set = pc_clock_settime, - .clock_get = pc_clock_gettime, - .clock_adj = pc_clock_adjtime, + .clock_getres = pc_clock_getres, + .clock_set = pc_clock_settime, + .clock_get_timespec = pc_clock_gettime, + .clock_adj = pc_clock_adjtime, }; diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 0a426f4e3125..8ff6da77a01f 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -20,11 +20,20 @@ static void posix_cpu_timer_rearm(struct k_itimer *timer); +void posix_cputimers_group_init(struct posix_cputimers *pct, u64 cpu_limit) +{ + posix_cputimers_init(pct); + if (cpu_limit != RLIM_INFINITY) { + pct->bases[CPUCLOCK_PROF].nextevt = cpu_limit * NSEC_PER_SEC; + pct->timers_active = true; + } +} + /* * Called after updating RLIMIT_CPU to run cpu timer and update - * tsk->signal->cputime_expires expiration cache if necessary. Needs - * siglock protection since other code may update expiration cache as - * well. + * tsk->signal->posix_cputimers.bases[clock].nextevt expiration cache if + * necessary. Needs siglock protection since other code may update the + * expiration cache as well. */ void update_rlimit_cpu(struct task_struct *task, unsigned long rlim_new) { @@ -35,46 +44,97 @@ void update_rlimit_cpu(struct task_struct *task, unsigned long rlim_new) spin_unlock_irq(&task->sighand->siglock); } -static int check_clock(const clockid_t which_clock) +/* + * Functions for validating access to tasks. + */ +static struct task_struct *lookup_task(const pid_t pid, bool thread, + bool gettime) { - int error = 0; struct task_struct *p; - const pid_t pid = CPUCLOCK_PID(which_clock); - - if (CPUCLOCK_WHICH(which_clock) >= CPUCLOCK_MAX) - return -EINVAL; - if (pid == 0) - return 0; + /* + * If the encoded PID is 0, then the timer is targeted at current + * or the process to which current belongs. + */ + if (!pid) + return thread ? current : current->group_leader; - rcu_read_lock(); p = find_task_by_vpid(pid); - if (!p || !(CPUCLOCK_PERTHREAD(which_clock) ? - same_thread_group(p, current) : has_group_leader_pid(p))) { - error = -EINVAL; + if (!p) + return p; + + if (thread) + return same_thread_group(p, current) ? p : NULL; + + if (gettime) { + /* + * For clock_gettime(PROCESS) the task does not need to be + * the actual group leader. tsk->sighand gives + * access to the group's clock. + * + * Timers need the group leader because they take a + * reference on it and store the task pointer until the + * timer is destroyed. + */ + return (p == current || thread_group_leader(p)) ? p : NULL; } + + /* + * For processes require that p is group leader. + */ + return has_group_leader_pid(p) ? p : NULL; +} + +static struct task_struct *__get_task_for_clock(const clockid_t clock, + bool getref, bool gettime) +{ + const bool thread = !!CPUCLOCK_PERTHREAD(clock); + const pid_t pid = CPUCLOCK_PID(clock); + struct task_struct *p; + + if (CPUCLOCK_WHICH(clock) >= CPUCLOCK_MAX) + return NULL; + + rcu_read_lock(); + p = lookup_task(pid, thread, gettime); + if (p && getref) + get_task_struct(p); rcu_read_unlock(); + return p; +} - return error; +static inline struct task_struct *get_task_for_clock(const clockid_t clock) +{ + return __get_task_for_clock(clock, true, false); +} + +static inline struct task_struct *get_task_for_clock_get(const clockid_t clock) +{ + return __get_task_for_clock(clock, true, true); +} + +static inline int validate_clock_permissions(const clockid_t clock) +{ + return __get_task_for_clock(clock, false, false) ? 0 : -EINVAL; } /* * Update expiry time from increment, and increase overrun count, * given the current clock sample. */ -static void bump_cpu_timer(struct k_itimer *timer, u64 now) +static u64 bump_cpu_timer(struct k_itimer *timer, u64 now) { + u64 delta, incr, expires = timer->it.cpu.node.expires; int i; - u64 delta, incr; if (!timer->it_interval) - return; + return expires; - if (now < timer->it.cpu.expires) - return; + if (now < expires) + return expires; incr = timer->it_interval; - delta = now + incr - timer->it.cpu.expires; + delta = now + incr - expires; /* Don't use (incr*2 < delta), incr*2 might overflow. */ for (i = 0; incr < delta - incr; i++) @@ -84,48 +144,26 @@ static void bump_cpu_timer(struct k_itimer *timer, u64 now) if (delta < incr) continue; - timer->it.cpu.expires += incr; + timer->it.cpu.node.expires += incr; timer->it_overrun += 1LL << i; delta -= incr; } + return timer->it.cpu.node.expires; } -/** - * task_cputime_zero - Check a task_cputime struct for all zero fields. - * - * @cputime: The struct to compare. - * - * Checks @cputime to see if all fields are zero. Returns true if all fields - * are zero, false if any field is nonzero. - */ -static inline int task_cputime_zero(const struct task_cputime *cputime) +/* Check whether all cache entries contain U64_MAX, i.e. eternal expiry time */ +static inline bool expiry_cache_is_inactive(const struct posix_cputimers *pct) { - if (!cputime->utime && !cputime->stime && !cputime->sum_exec_runtime) - return 1; - return 0; -} - -static inline u64 prof_ticks(struct task_struct *p) -{ - u64 utime, stime; - - task_cputime(p, &utime, &stime); - - return utime + stime; -} -static inline u64 virt_ticks(struct task_struct *p) -{ - u64 utime, stime; - - task_cputime(p, &utime, &stime); - - return utime; + return !(~pct->bases[CPUCLOCK_PROF].nextevt | + ~pct->bases[CPUCLOCK_VIRT].nextevt | + ~pct->bases[CPUCLOCK_SCHED].nextevt); } static int posix_cpu_clock_getres(const clockid_t which_clock, struct timespec64 *tp) { - int error = check_clock(which_clock); + int error = validate_clock_permissions(which_clock); + if (!error) { tp->tv_sec = 0; tp->tv_nsec = ((NSEC_PER_SEC + HZ - 1) / HZ); @@ -142,42 +180,66 @@ posix_cpu_clock_getres(const clockid_t which_clock, struct timespec64 *tp) } static int -posix_cpu_clock_set(const clockid_t which_clock, const struct timespec64 *tp) +posix_cpu_clock_set(const clockid_t clock, const struct timespec64 *tp) { + int error = validate_clock_permissions(clock); + /* * You can never reset a CPU clock, but we check for other errors * in the call before failing with EPERM. */ - int error = check_clock(which_clock); - if (error == 0) { - error = -EPERM; - } - return error; + return error ? : -EPERM; } - /* - * Sample a per-thread clock for the given task. + * Sample a per-thread clock for the given task. clkid is validated. */ -static int cpu_clock_sample(const clockid_t which_clock, - struct task_struct *p, u64 *sample) +static u64 cpu_clock_sample(const clockid_t clkid, struct task_struct *p) { - switch (CPUCLOCK_WHICH(which_clock)) { - default: - return -EINVAL; + u64 utime, stime; + + if (clkid == CPUCLOCK_SCHED) + return task_sched_runtime(p); + + task_cputime(p, &utime, &stime); + + switch (clkid) { case CPUCLOCK_PROF: - *sample = prof_ticks(p); - break; + return utime + stime; case CPUCLOCK_VIRT: - *sample = virt_ticks(p); - break; - case CPUCLOCK_SCHED: - *sample = task_sched_runtime(p); - break; + return utime; + default: + WARN_ON_ONCE(1); } return 0; } +static inline void store_samples(u64 *samples, u64 stime, u64 utime, u64 rtime) +{ + samples[CPUCLOCK_PROF] = stime + utime; + samples[CPUCLOCK_VIRT] = utime; + samples[CPUCLOCK_SCHED] = rtime; +} + +static void task_sample_cputime(struct task_struct *p, u64 *samples) +{ + u64 stime, utime; + + task_cputime(p, &utime, &stime); + store_samples(samples, stime, utime, p->se.sum_exec_runtime); +} + +static void proc_sample_cputime_atomic(struct task_cputime_atomic *at, + u64 *samples) +{ + u64 stime, utime, rtime; + + utime = atomic64_read(&at->utime); + stime = atomic64_read(&at->stime); + rtime = atomic64_read(&at->sum_exec_runtime); + store_samples(samples, stime, utime, rtime); +} + /* * Set cputime to sum_cputime if sum_cputime > cputime. Use cmpxchg * to avoid race conditions with concurrent updates to cputime. @@ -193,29 +255,56 @@ retry: } } -static void update_gt_cputime(struct task_cputime_atomic *cputime_atomic, struct task_cputime *sum) +static void update_gt_cputime(struct task_cputime_atomic *cputime_atomic, + struct task_cputime *sum) { __update_gt_cputime(&cputime_atomic->utime, sum->utime); __update_gt_cputime(&cputime_atomic->stime, sum->stime); __update_gt_cputime(&cputime_atomic->sum_exec_runtime, sum->sum_exec_runtime); } -/* Sample task_cputime_atomic values in "atomic_timers", store results in "times". */ -static inline void sample_cputime_atomic(struct task_cputime *times, - struct task_cputime_atomic *atomic_times) +/** + * thread_group_sample_cputime - Sample cputime for a given task + * @tsk: Task for which cputime needs to be started + * @samples: Storage for time samples + * + * Called from sys_getitimer() to calculate the expiry time of an active + * timer. That means group cputime accounting is already active. Called + * with task sighand lock held. + * + * Updates @times with an uptodate sample of the thread group cputimes. + */ +void thread_group_sample_cputime(struct task_struct *tsk, u64 *samples) { - times->utime = atomic64_read(&atomic_times->utime); - times->stime = atomic64_read(&atomic_times->stime); - times->sum_exec_runtime = atomic64_read(&atomic_times->sum_exec_runtime); + struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; + struct posix_cputimers *pct = &tsk->signal->posix_cputimers; + + WARN_ON_ONCE(!pct->timers_active); + + proc_sample_cputime_atomic(&cputimer->cputime_atomic, samples); } -void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times) +/** + * thread_group_start_cputime - Start cputime and return a sample + * @tsk: Task for which cputime needs to be started + * @samples: Storage for time samples + * + * The thread group cputime accouting is avoided when there are no posix + * CPU timers armed. Before starting a timer it's required to check whether + * the time accounting is active. If not, a full update of the atomic + * accounting store needs to be done and the accounting enabled. + * + * Updates @times with an uptodate sample of the thread group cputimes. + */ +static void thread_group_start_cputime(struct task_struct *tsk, u64 *samples) { struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; - struct task_cputime sum; + struct posix_cputimers *pct = &tsk->signal->posix_cputimers; /* Check if cputimer isn't running. This is accessed without locking. */ - if (!READ_ONCE(cputimer->running)) { + if (!READ_ONCE(pct->timers_active)) { + struct task_cputime sum; + /* * The POSIX timer interface allows for absolute time expiry * values through the TIMER_ABSTIME flag, therefore we have @@ -225,94 +314,69 @@ void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times) update_gt_cputime(&cputimer->cputime_atomic, &sum); /* - * We're setting cputimer->running without a lock. Ensure - * this only gets written to in one operation. We set - * running after update_gt_cputime() as a small optimization, - * but barriers are not required because update_gt_cputime() + * We're setting timers_active without a lock. Ensure this + * only gets written to in one operation. We set it after + * update_gt_cputime() as a small optimization, but + * barriers are not required because update_gt_cputime() * can handle concurrent updates. */ - WRITE_ONCE(cputimer->running, true); + WRITE_ONCE(pct->timers_active, true); } - sample_cputime_atomic(times, &cputimer->cputime_atomic); + proc_sample_cputime_atomic(&cputimer->cputime_atomic, samples); } -/* - * Sample a process (thread group) clock for the given group_leader task. - * Must be called with task sighand lock held for safe while_each_thread() - * traversal. - */ -static int cpu_clock_sample_group(const clockid_t which_clock, - struct task_struct *p, - u64 *sample) +static void __thread_group_cputime(struct task_struct *tsk, u64 *samples) { - struct task_cputime cputime; + struct task_cputime ct; - switch (CPUCLOCK_WHICH(which_clock)) { - default: - return -EINVAL; - case CPUCLOCK_PROF: - thread_group_cputime(p, &cputime); - *sample = cputime.utime + cputime.stime; - break; - case CPUCLOCK_VIRT: - thread_group_cputime(p, &cputime); - *sample = cputime.utime; - break; - case CPUCLOCK_SCHED: - thread_group_cputime(p, &cputime); - *sample = cputime.sum_exec_runtime; - break; - } - return 0; + thread_group_cputime(tsk, &ct); + store_samples(samples, ct.stime, ct.utime, ct.sum_exec_runtime); } -static int posix_cpu_clock_get_task(struct task_struct *tsk, - const clockid_t which_clock, - struct timespec64 *tp) +/* + * Sample a process (thread group) clock for the given task clkid. If the + * group's cputime accounting is already enabled, read the atomic + * store. Otherwise a full update is required. Task's sighand lock must be + * held to protect the task traversal on a full update. clkid is already + * validated. + */ +static u64 cpu_clock_sample_group(const clockid_t clkid, struct task_struct *p, + bool start) { - int err = -EINVAL; - u64 rtn; + struct thread_group_cputimer *cputimer = &p->signal->cputimer; + struct posix_cputimers *pct = &p->signal->posix_cputimers; + u64 samples[CPUCLOCK_MAX]; - if (CPUCLOCK_PERTHREAD(which_clock)) { - if (same_thread_group(tsk, current)) - err = cpu_clock_sample(which_clock, tsk, &rtn); + if (!READ_ONCE(pct->timers_active)) { + if (start) + thread_group_start_cputime(p, samples); + else + __thread_group_cputime(p, samples); } else { - if (tsk == current || thread_group_leader(tsk)) - err = cpu_clock_sample_group(which_clock, tsk, &rtn); + proc_sample_cputime_atomic(&cputimer->cputime_atomic, samples); } - if (!err) - *tp = ns_to_timespec64(rtn); - - return err; + return samples[clkid]; } - -static int posix_cpu_clock_get(const clockid_t which_clock, struct timespec64 *tp) +static int posix_cpu_clock_get(const clockid_t clock, struct timespec64 *tp) { - const pid_t pid = CPUCLOCK_PID(which_clock); - int err = -EINVAL; + const clockid_t clkid = CPUCLOCK_WHICH(clock); + struct task_struct *tsk; + u64 t; - if (pid == 0) { - /* - * Special case constant value for our own clocks. - * We don't have to do any lookup to find ourselves. - */ - err = posix_cpu_clock_get_task(current, which_clock, tp); - } else { - /* - * Find the given PID, and validate that the caller - * should be able to see it. - */ - struct task_struct *p; - rcu_read_lock(); - p = find_task_by_vpid(pid); - if (p) - err = posix_cpu_clock_get_task(p, which_clock, tp); - rcu_read_unlock(); - } + tsk = get_task_for_clock_get(clock); + if (!tsk) + return -EINVAL; - return err; + if (CPUCLOCK_PERTHREAD(clock)) + t = cpu_clock_sample(clkid, tsk); + else + t = cpu_clock_sample_group(clkid, tsk, false); + put_task_struct(tsk); + + *tp = ns_to_timespec64(t); + return 0; } /* @@ -322,44 +386,15 @@ static int posix_cpu_clock_get(const clockid_t which_clock, struct timespec64 *t */ static int posix_cpu_timer_create(struct k_itimer *new_timer) { - int ret = 0; - const pid_t pid = CPUCLOCK_PID(new_timer->it_clock); - struct task_struct *p; + struct task_struct *p = get_task_for_clock(new_timer->it_clock); - if (CPUCLOCK_WHICH(new_timer->it_clock) >= CPUCLOCK_MAX) + if (!p) return -EINVAL; new_timer->kclock = &clock_posix_cpu; - - INIT_LIST_HEAD(&new_timer->it.cpu.entry); - - rcu_read_lock(); - if (CPUCLOCK_PERTHREAD(new_timer->it_clock)) { - if (pid == 0) { - p = current; - } else { - p = find_task_by_vpid(pid); - if (p && !same_thread_group(p, current)) - p = NULL; - } - } else { - if (pid == 0) { - p = current->group_leader; - } else { - p = find_task_by_vpid(pid); - if (p && !has_group_leader_pid(p)) - p = NULL; - } - } + timerqueue_init(&new_timer->it.cpu.node); new_timer->it.cpu.task = p; - if (p) { - get_task_struct(p); - } else { - ret = -EINVAL; - } - rcu_read_unlock(); - - return ret; + return 0; } /* @@ -370,12 +405,14 @@ static int posix_cpu_timer_create(struct k_itimer *new_timer) */ static int posix_cpu_timer_del(struct k_itimer *timer) { - int ret = 0; - unsigned long flags; + struct cpu_timer *ctmr = &timer->it.cpu; + struct task_struct *p = ctmr->task; struct sighand_struct *sighand; - struct task_struct *p = timer->it.cpu.task; + unsigned long flags; + int ret = 0; - WARN_ON_ONCE(p == NULL); + if (WARN_ON_ONCE(!p)) + return -EINVAL; /* * Protect against sighand release/switch in exit/exec and process/ @@ -384,15 +421,15 @@ static int posix_cpu_timer_del(struct k_itimer *timer) sighand = lock_task_sighand(p, &flags); if (unlikely(sighand == NULL)) { /* - * We raced with the reaping of the task. - * The deletion should have cleared us off the list. + * This raced with the reaping of the task. The exit cleanup + * should have removed this timer from the timer queue. */ - WARN_ON_ONCE(!list_empty(&timer->it.cpu.entry)); + WARN_ON_ONCE(ctmr->head || timerqueue_node_queued(&ctmr->node)); } else { if (timer->it.cpu.firing) ret = TIMER_RETRY; else - list_del(&timer->it.cpu.entry); + cpu_timer_dequeue(ctmr); unlock_task_sighand(p, &flags); } @@ -403,25 +440,30 @@ static int posix_cpu_timer_del(struct k_itimer *timer) return ret; } -static void cleanup_timers_list(struct list_head *head) +static void cleanup_timerqueue(struct timerqueue_head *head) { - struct cpu_timer_list *timer, *next; + struct timerqueue_node *node; + struct cpu_timer *ctmr; - list_for_each_entry_safe(timer, next, head, entry) - list_del_init(&timer->entry); + while ((node = timerqueue_getnext(head))) { + timerqueue_del(head, node); + ctmr = container_of(node, struct cpu_timer, node); + ctmr->head = NULL; + } } /* - * Clean out CPU timers still ticking when a thread exited. The task - * pointer is cleared, and the expiry time is replaced with the residual - * time for later timer_gettime calls to return. + * Clean out CPU timers which are still armed when a thread exits. The + * timers are only removed from the list. No other updates are done. The + * corresponding posix timers are still accessible, but cannot be rearmed. + * * This must be called with the siglock held. */ -static void cleanup_timers(struct list_head *head) +static void cleanup_timers(struct posix_cputimers *pct) { - cleanup_timers_list(head); - cleanup_timers_list(++head); - cleanup_timers_list(++head); + cleanup_timerqueue(&pct->bases[CPUCLOCK_PROF].tqhead); + cleanup_timerqueue(&pct->bases[CPUCLOCK_VIRT].tqhead); + cleanup_timerqueue(&pct->bases[CPUCLOCK_SCHED].tqhead); } /* @@ -431,16 +473,11 @@ static void cleanup_timers(struct list_head *head) */ void posix_cpu_timers_exit(struct task_struct *tsk) { - cleanup_timers(tsk->cpu_timers); + cleanup_timers(&tsk->posix_cputimers); } void posix_cpu_timers_exit_group(struct task_struct *tsk) { - cleanup_timers(tsk->signal->cpu_timers); -} - -static inline int expires_gt(u64 expires, u64 new_exp) -{ - return expires == 0 || expires > new_exp; + cleanup_timers(&tsk->signal->posix_cputimers); } /* @@ -449,58 +486,33 @@ static inline int expires_gt(u64 expires, u64 new_exp) */ static void arm_timer(struct k_itimer *timer) { - struct task_struct *p = timer->it.cpu.task; - struct list_head *head, *listpos; - struct task_cputime *cputime_expires; - struct cpu_timer_list *const nt = &timer->it.cpu; - struct cpu_timer_list *next; - - if (CPUCLOCK_PERTHREAD(timer->it_clock)) { - head = p->cpu_timers; - cputime_expires = &p->cputime_expires; - } else { - head = p->signal->cpu_timers; - cputime_expires = &p->signal->cputime_expires; - } - head += CPUCLOCK_WHICH(timer->it_clock); - - listpos = head; - list_for_each_entry(next, head, entry) { - if (nt->expires < next->expires) - break; - listpos = &next->entry; - } - list_add(&nt->entry, listpos); - - if (listpos == head) { - u64 exp = nt->expires; + int clkidx = CPUCLOCK_WHICH(timer->it_clock); + struct cpu_timer *ctmr = &timer->it.cpu; + u64 newexp = cpu_timer_getexpires(ctmr); + struct task_struct *p = ctmr->task; + struct posix_cputimer_base *base; + + if (CPUCLOCK_PERTHREAD(timer->it_clock)) + base = p->posix_cputimers.bases + clkidx; + else + base = p->signal->posix_cputimers.bases + clkidx; + + if (!cpu_timer_enqueue(&base->tqhead, ctmr)) + return; - /* - * We are the new earliest-expiring POSIX 1.b timer, hence - * need to update expiration cache. Take into account that - * for process timers we share expiration cache with itimers - * and RLIMIT_CPU and for thread timers with RLIMIT_RTTIME. - */ + /* + * We are the new earliest-expiring POSIX 1.b timer, hence + * need to update expiration cache. Take into account that + * for process timers we share expiration cache with itimers + * and RLIMIT_CPU and for thread timers with RLIMIT_RTTIME. + */ + if (newexp < base->nextevt) + base->nextevt = newexp; - switch (CPUCLOCK_WHICH(timer->it_clock)) { - case CPUCLOCK_PROF: - if (expires_gt(cputime_expires->prof_exp, exp)) - cputime_expires->prof_exp = exp; - break; - case CPUCLOCK_VIRT: - if (expires_gt(cputime_expires->virt_exp, exp)) - cputime_expires->virt_exp = exp; - break; - case CPUCLOCK_SCHED: - if (expires_gt(cputime_expires->sched_exp, exp)) - cputime_expires->sched_exp = exp; - break; - } - if (CPUCLOCK_PERTHREAD(timer->it_clock)) - tick_dep_set_task(p, TICK_DEP_BIT_POSIX_TIMER); - else - tick_dep_set_signal(p->signal, TICK_DEP_BIT_POSIX_TIMER); - } + if (CPUCLOCK_PERTHREAD(timer->it_clock)) + tick_dep_set_task(p, TICK_DEP_BIT_POSIX_TIMER); + else + tick_dep_set_signal(p->signal, TICK_DEP_BIT_POSIX_TIMER); } /* @@ -508,24 +520,26 @@ static void arm_timer(struct k_itimer *timer) */ static void cpu_timer_fire(struct k_itimer *timer) { + struct cpu_timer *ctmr = &timer->it.cpu; + if ((timer->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE) { /* * User don't want any signal. */ - timer->it.cpu.expires = 0; + cpu_timer_setexpires(ctmr, 0); } else if (unlikely(timer->sigq == NULL)) { /* * This a special case for clock_nanosleep, * not a normal timer from sys_timer_create. */ wake_up_process(timer->it_process); - timer->it.cpu.expires = 0; + cpu_timer_setexpires(ctmr, 0); } else if (!timer->it_interval) { /* * One-shot timer. Clear it as soon as it's fired. */ posix_timer_event(timer, 0); - timer->it.cpu.expires = 0; + cpu_timer_setexpires(ctmr, 0); } else if (posix_timer_event(timer, ++timer->it_requeue_pending)) { /* * The signal did not get queued because the signal @@ -539,33 +553,6 @@ static void cpu_timer_fire(struct k_itimer *timer) } /* - * Sample a process (thread group) timer for the given group_leader task. - * Must be called with task sighand lock held for safe while_each_thread() - * traversal. - */ -static int cpu_timer_sample_group(const clockid_t which_clock, - struct task_struct *p, u64 *sample) -{ - struct task_cputime cputime; - - thread_group_cputimer(p, &cputime); - switch (CPUCLOCK_WHICH(which_clock)) { - default: - return -EINVAL; - case CPUCLOCK_PROF: - *sample = cputime.utime + cputime.stime; - break; - case CPUCLOCK_VIRT: - *sample = cputime.utime; - break; - case CPUCLOCK_SCHED: - *sample = cputime.sum_exec_runtime; - break; - } - return 0; -} - -/* * Guts of sys_timer_settime for CPU timers. * This is called with the timer locked and interrupts disabled. * If we return TIMER_RETRY, it's necessary to release the timer's lock @@ -574,13 +561,16 @@ static int cpu_timer_sample_group(const clockid_t which_clock, static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags, struct itimerspec64 *new, struct itimerspec64 *old) { - unsigned long flags; - struct sighand_struct *sighand; - struct task_struct *p = timer->it.cpu.task; + clockid_t clkid = CPUCLOCK_WHICH(timer->it_clock); u64 old_expires, new_expires, old_incr, val; - int ret; + struct cpu_timer *ctmr = &timer->it.cpu; + struct task_struct *p = ctmr->task; + struct sighand_struct *sighand; + unsigned long flags; + int ret = 0; - WARN_ON_ONCE(p == NULL); + if (WARN_ON_ONCE(!p)) + return -EINVAL; /* * Use the to_ktime conversion because that clamps the maximum @@ -597,22 +587,21 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags, * If p has just been reaped, we can no * longer get any information about it at all. */ - if (unlikely(sighand == NULL)) { + if (unlikely(sighand == NULL)) return -ESRCH; - } /* * Disarm any old timer after extracting its expiry time. */ - - ret = 0; old_incr = timer->it_interval; - old_expires = timer->it.cpu.expires; + old_expires = cpu_timer_getexpires(ctmr); + if (unlikely(timer->it.cpu.firing)) { timer->it.cpu.firing = -1; ret = TIMER_RETRY; - } else - list_del_init(&timer->it.cpu.entry); + } else { + cpu_timer_dequeue(ctmr); + } /* * We need to sample the current value to convert the new @@ -622,11 +611,10 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags, * times (in arm_timer). With an absolute time, we must * check if it's already passed. In short, we need a sample. */ - if (CPUCLOCK_PERTHREAD(timer->it_clock)) { - cpu_clock_sample(timer->it_clock, p, &val); - } else { - cpu_timer_sample_group(timer->it_clock, p, &val); - } + if (CPUCLOCK_PERTHREAD(timer->it_clock)) + val = cpu_clock_sample(clkid, p); + else + val = cpu_clock_sample_group(clkid, p, true); if (old) { if (old_expires == 0) { @@ -634,18 +622,16 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags, old->it_value.tv_nsec = 0; } else { /* - * Update the timer in case it has - * overrun already. If it has, - * we'll report it as having overrun - * and with the next reloaded timer - * already ticking, though we are - * swallowing that pending - * notification here to install the - * new setting. + * Update the timer in case it has overrun already. + * If it has, we'll report it as having overrun and + * with the next reloaded timer already ticking, + * though we are swallowing that pending + * notification here to install the new setting. */ - bump_cpu_timer(timer, val); - if (val < timer->it.cpu.expires) { - old_expires = timer->it.cpu.expires - val; + u64 exp = bump_cpu_timer(timer, val); + + if (val < exp) { + old_expires = exp - val; old->it_value = ns_to_timespec64(old_expires); } else { old->it_value.tv_nsec = 1; @@ -674,7 +660,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags, * For a timer with no notification action, we don't actually * arm the timer (we'll just fake it for timer_gettime). */ - timer->it.cpu.expires = new_expires; + cpu_timer_setexpires(ctmr, new_expires); if (new_expires != 0 && val < new_expires) { arm_timer(timer); } @@ -715,24 +701,27 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags, static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec64 *itp) { - u64 now; - struct task_struct *p = timer->it.cpu.task; + clockid_t clkid = CPUCLOCK_WHICH(timer->it_clock); + struct cpu_timer *ctmr = &timer->it.cpu; + u64 now, expires = cpu_timer_getexpires(ctmr); + struct task_struct *p = ctmr->task; - WARN_ON_ONCE(p == NULL); + if (WARN_ON_ONCE(!p)) + return; /* * Easy part: convert the reload time. */ itp->it_interval = ktime_to_timespec64(timer->it_interval); - if (!timer->it.cpu.expires) + if (!expires) return; /* * Sample the clock to take the difference with the expiry time. */ if (CPUCLOCK_PERTHREAD(timer->it_clock)) { - cpu_clock_sample(timer->it_clock, p, &now); + now = cpu_clock_sample(clkid, p); } else { struct sighand_struct *sighand; unsigned long flags; @@ -747,18 +736,18 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec64 *itp /* * The process has been reaped. * We can't even collect a sample any more. - * Call the timer disarmed, nothing else to do. + * Disarm the timer, nothing else to do. */ - timer->it.cpu.expires = 0; + cpu_timer_setexpires(ctmr, 0); return; } else { - cpu_timer_sample_group(timer->it_clock, p, &now); + now = cpu_clock_sample_group(clkid, p, false); unlock_task_sighand(p, &flags); } } - if (now < timer->it.cpu.expires) { - itp->it_value = ns_to_timespec64(timer->it.cpu.expires - now); + if (now < expires) { + itp->it_value = ns_to_timespec64(expires - now); } else { /* * The timer should have expired already, but the firing @@ -769,26 +758,42 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec64 *itp } } -static unsigned long long -check_timers_list(struct list_head *timers, - struct list_head *firing, - unsigned long long curr) -{ - int maxfire = 20; +#define MAX_COLLECTED 20 - while (!list_empty(timers)) { - struct cpu_timer_list *t; +static u64 collect_timerqueue(struct timerqueue_head *head, + struct list_head *firing, u64 now) +{ + struct timerqueue_node *next; + int i = 0; + + while ((next = timerqueue_getnext(head))) { + struct cpu_timer *ctmr; + u64 expires; + + ctmr = container_of(next, struct cpu_timer, node); + expires = cpu_timer_getexpires(ctmr); + /* Limit the number of timers to expire at once */ + if (++i == MAX_COLLECTED || now < expires) + return expires; + + ctmr->firing = 1; + cpu_timer_dequeue(ctmr); + list_add_tail(&ctmr->elist, firing); + } - t = list_first_entry(timers, struct cpu_timer_list, entry); + return U64_MAX; +} - if (!--maxfire || curr < t->expires) - return t->expires; +static void collect_posix_cputimers(struct posix_cputimers *pct, u64 *samples, + struct list_head *firing) +{ + struct posix_cputimer_base *base = pct->bases; + int i; - t->firing = 1; - list_move_tail(&t->entry, firing); + for (i = 0; i < CPUCLOCK_MAX; i++, base++) { + base->nextevt = collect_timerqueue(&base->tqhead, firing, + samples[i]); } - - return 0; } static inline void check_dl_overrun(struct task_struct *tsk) @@ -799,6 +804,20 @@ static inline void check_dl_overrun(struct task_struct *tsk) } } +static bool check_rlimit(u64 time, u64 limit, int signo, bool rt, bool hard) +{ + if (time < limit) + return false; + + if (print_fatal_signals) { + pr_info("%s Watchdog Timeout (%s): %s[%d]\n", + rt ? "RT" : "CPU", hard ? "hard" : "soft", + current->comm, task_pid_nr(current)); + } + __group_send_sig_info(signo, SEND_SIG_PRIV, current); + return true; +} + /* * Check for any per-thread CPU timers that have fired and move them off * the tsk->cpu_timers[N] list onto the firing list. Here we update the @@ -807,76 +826,50 @@ static inline void check_dl_overrun(struct task_struct *tsk) static void check_thread_timers(struct task_struct *tsk, struct list_head *firing) { - struct list_head *timers = tsk->cpu_timers; - struct task_cputime *tsk_expires = &tsk->cputime_expires; - u64 expires; + struct posix_cputimers *pct = &tsk->posix_cputimers; + u64 samples[CPUCLOCK_MAX]; unsigned long soft; if (dl_task(tsk)) check_dl_overrun(tsk); - /* - * If cputime_expires is zero, then there are no active - * per thread CPU timers. - */ - if (task_cputime_zero(&tsk->cputime_expires)) + if (expiry_cache_is_inactive(pct)) return; - expires = check_timers_list(timers, firing, prof_ticks(tsk)); - tsk_expires->prof_exp = expires; - - expires = check_timers_list(++timers, firing, virt_ticks(tsk)); - tsk_expires->virt_exp = expires; - - tsk_expires->sched_exp = check_timers_list(++timers, firing, - tsk->se.sum_exec_runtime); + task_sample_cputime(tsk, samples); + collect_posix_cputimers(pct, samples, firing); /* * Check for the special case thread timers. */ soft = task_rlimit(tsk, RLIMIT_RTTIME); if (soft != RLIM_INFINITY) { + /* Task RT timeout is accounted in jiffies. RTTIME is usec */ + unsigned long rttime = tsk->rt.timeout * (USEC_PER_SEC / HZ); unsigned long hard = task_rlimit_max(tsk, RLIMIT_RTTIME); + /* At the hard limit, send SIGKILL. No further action. */ if (hard != RLIM_INFINITY && - tsk->rt.timeout > DIV_ROUND_UP(hard, USEC_PER_SEC/HZ)) { - /* - * At the hard limit, we just die. - * No need to calculate anything else now. - */ - if (print_fatal_signals) { - pr_info("CPU Watchdog Timeout (hard): %s[%d]\n", - tsk->comm, task_pid_nr(tsk)); - } - __group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk); + check_rlimit(rttime, hard, SIGKILL, true, true)) return; - } - if (tsk->rt.timeout > DIV_ROUND_UP(soft, USEC_PER_SEC/HZ)) { - /* - * At the soft limit, send a SIGXCPU every second. - */ - if (soft < hard) { - soft += USEC_PER_SEC; - tsk->signal->rlim[RLIMIT_RTTIME].rlim_cur = - soft; - } - if (print_fatal_signals) { - pr_info("RT Watchdog Timeout (soft): %s[%d]\n", - tsk->comm, task_pid_nr(tsk)); - } - __group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk); + + /* At the soft limit, send a SIGXCPU every second */ + if (check_rlimit(rttime, soft, SIGXCPU, true, false)) { + soft += USEC_PER_SEC; + tsk->signal->rlim[RLIMIT_RTTIME].rlim_cur = soft; } } - if (task_cputime_zero(tsk_expires)) + + if (expiry_cache_is_inactive(pct)) tick_dep_clear_task(tsk, TICK_DEP_BIT_POSIX_TIMER); } static inline void stop_process_timers(struct signal_struct *sig) { - struct thread_group_cputimer *cputimer = &sig->cputimer; + struct posix_cputimers *pct = &sig->posix_cputimers; - /* Turn off cputimer->running. This is done without locking. */ - WRITE_ONCE(cputimer->running, false); + /* Turn off the active flag. This is done without locking. */ + WRITE_ONCE(pct->timers_active, false); tick_dep_clear_signal(sig, TICK_DEP_BIT_POSIX_TIMER); } @@ -898,7 +891,7 @@ static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it, __group_send_sig_info(signo, SEND_SIG_PRIV, tsk); } - if (it->expires && (!*expires || it->expires < *expires)) + if (it->expires && it->expires < *expires) *expires = it->expires; } @@ -911,87 +904,69 @@ static void check_process_timers(struct task_struct *tsk, struct list_head *firing) { struct signal_struct *const sig = tsk->signal; - u64 utime, ptime, virt_expires, prof_expires; - u64 sum_sched_runtime, sched_expires; - struct list_head *timers = sig->cpu_timers; - struct task_cputime cputime; + struct posix_cputimers *pct = &sig->posix_cputimers; + u64 samples[CPUCLOCK_MAX]; unsigned long soft; /* - * If cputimer is not running, then there are no active - * process wide timers (POSIX 1.b, itimers, RLIMIT_CPU). + * If there are no active process wide timers (POSIX 1.b, itimers, + * RLIMIT_CPU) nothing to check. Also skip the process wide timer + * processing when there is already another task handling them. */ - if (!READ_ONCE(tsk->signal->cputimer.running)) + if (!READ_ONCE(pct->timers_active) || pct->expiry_active) return; - /* + /* * Signify that a thread is checking for process timers. * Write access to this field is protected by the sighand lock. */ - sig->cputimer.checking_timer = true; + pct->expiry_active = true; /* - * Collect the current process totals. + * Collect the current process totals. Group accounting is active + * so the sample can be taken directly. */ - thread_group_cputimer(tsk, &cputime); - utime = cputime.utime; - ptime = utime + cputime.stime; - sum_sched_runtime = cputime.sum_exec_runtime; - - prof_expires = check_timers_list(timers, firing, ptime); - virt_expires = check_timers_list(++timers, firing, utime); - sched_expires = check_timers_list(++timers, firing, sum_sched_runtime); + proc_sample_cputime_atomic(&sig->cputimer.cputime_atomic, samples); + collect_posix_cputimers(pct, samples, firing); /* * Check for the special case process timers. */ - check_cpu_itimer(tsk, &sig->it[CPUCLOCK_PROF], &prof_expires, ptime, - SIGPROF); - check_cpu_itimer(tsk, &sig->it[CPUCLOCK_VIRT], &virt_expires, utime, - SIGVTALRM); + check_cpu_itimer(tsk, &sig->it[CPUCLOCK_PROF], + &pct->bases[CPUCLOCK_PROF].nextevt, + samples[CPUCLOCK_PROF], SIGPROF); + check_cpu_itimer(tsk, &sig->it[CPUCLOCK_VIRT], + &pct->bases[CPUCLOCK_VIRT].nextevt, + samples[CPUCLOCK_VIRT], SIGVTALRM); + soft = task_rlimit(tsk, RLIMIT_CPU); if (soft != RLIM_INFINITY) { - unsigned long psecs = div_u64(ptime, NSEC_PER_SEC); + /* RLIMIT_CPU is in seconds. Samples are nanoseconds */ unsigned long hard = task_rlimit_max(tsk, RLIMIT_CPU); - u64 x; - if (psecs >= hard) { - /* - * At the hard limit, we just die. - * No need to calculate anything else now. - */ - if (print_fatal_signals) { - pr_info("RT Watchdog Timeout (hard): %s[%d]\n", - tsk->comm, task_pid_nr(tsk)); - } - __group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk); + u64 ptime = samples[CPUCLOCK_PROF]; + u64 softns = (u64)soft * NSEC_PER_SEC; + u64 hardns = (u64)hard * NSEC_PER_SEC; + + /* At the hard limit, send SIGKILL. No further action. */ + if (hard != RLIM_INFINITY && + check_rlimit(ptime, hardns, SIGKILL, false, true)) return; + + /* At the soft limit, send a SIGXCPU every second */ + if (check_rlimit(ptime, softns, SIGXCPU, false, false)) { + sig->rlim[RLIMIT_CPU].rlim_cur = soft + 1; + softns += NSEC_PER_SEC; } - if (psecs >= soft) { - /* - * At the soft limit, send a SIGXCPU every second. - */ - if (print_fatal_signals) { - pr_info("CPU Watchdog Timeout (soft): %s[%d]\n", - tsk->comm, task_pid_nr(tsk)); - } - __group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk); - if (soft < hard) { - soft++; - sig->rlim[RLIMIT_CPU].rlim_cur = soft; - } - } - x = soft * NSEC_PER_SEC; - if (!prof_expires || x < prof_expires) - prof_expires = x; + + /* Update the expiry cache */ + if (softns < pct->bases[CPUCLOCK_PROF].nextevt) + pct->bases[CPUCLOCK_PROF].nextevt = softns; } - sig->cputime_expires.prof_exp = prof_expires; - sig->cputime_expires.virt_exp = virt_expires; - sig->cputime_expires.sched_exp = sched_expires; - if (task_cputime_zero(&sig->cputime_expires)) + if (expiry_cache_is_inactive(pct)) stop_process_timers(sig); - sig->cputimer.checking_timer = false; + pct->expiry_active = false; } /* @@ -1000,18 +975,21 @@ static void check_process_timers(struct task_struct *tsk, */ static void posix_cpu_timer_rearm(struct k_itimer *timer) { + clockid_t clkid = CPUCLOCK_WHICH(timer->it_clock); + struct cpu_timer *ctmr = &timer->it.cpu; + struct task_struct *p = ctmr->task; struct sighand_struct *sighand; unsigned long flags; - struct task_struct *p = timer->it.cpu.task; u64 now; - WARN_ON_ONCE(p == NULL); + if (WARN_ON_ONCE(!p)) + return; /* * Fetch the current sample and update the timer's expiry time. */ if (CPUCLOCK_PERTHREAD(timer->it_clock)) { - cpu_clock_sample(timer->it_clock, p, &now); + now = cpu_clock_sample(clkid, p); bump_cpu_timer(timer, now); if (unlikely(p->exit_state)) return; @@ -1031,13 +1009,13 @@ static void posix_cpu_timer_rearm(struct k_itimer *timer) * The process has been reaped. * We can't even collect a sample any more. */ - timer->it.cpu.expires = 0; + cpu_timer_setexpires(ctmr, 0); return; } else if (unlikely(p->exit_state) && thread_group_empty(p)) { /* If the process is dying, no need to rearm */ goto unlock; } - cpu_timer_sample_group(timer->it_clock, p, &now); + now = cpu_clock_sample_group(clkid, p, true); bump_cpu_timer(timer, now); /* Leave the sighand locked for the call below. */ } @@ -1051,26 +1029,24 @@ unlock: } /** - * task_cputime_expired - Compare two task_cputime entities. + * task_cputimers_expired - Check whether posix CPU timers are expired * - * @sample: The task_cputime structure to be checked for expiration. - * @expires: Expiration times, against which @sample will be checked. + * @samples: Array of current samples for the CPUCLOCK clocks + * @pct: Pointer to a posix_cputimers container * - * Checks @sample against @expires to see if any field of @sample has expired. - * Returns true if any field of the former is greater than the corresponding - * field of the latter if the latter field is set. Otherwise returns false. + * Returns true if any member of @samples is greater than the corresponding + * member of @pct->bases[CLK].nextevt. False otherwise */ -static inline int task_cputime_expired(const struct task_cputime *sample, - const struct task_cputime *expires) +static inline bool +task_cputimers_expired(const u64 *samples, struct posix_cputimers *pct) { - if (expires->utime && sample->utime >= expires->utime) - return 1; - if (expires->stime && sample->utime + sample->stime >= expires->stime) - return 1; - if (expires->sum_exec_runtime != 0 && - sample->sum_exec_runtime >= expires->sum_exec_runtime) - return 1; - return 0; + int i; + + for (i = 0; i < CPUCLOCK_MAX; i++) { + if (samples[i] >= pct->bases[i].nextevt) + return true; + } + return false; } /** @@ -1083,48 +1059,50 @@ static inline int task_cputime_expired(const struct task_cputime *sample, * timers and compare them with the corresponding expiration times. Return * true if a timer has expired, else return false. */ -static inline int fastpath_timer_check(struct task_struct *tsk) +static inline bool fastpath_timer_check(struct task_struct *tsk) { + struct posix_cputimers *pct = &tsk->posix_cputimers; struct signal_struct *sig; - if (!task_cputime_zero(&tsk->cputime_expires)) { - struct task_cputime task_sample; + if (!expiry_cache_is_inactive(pct)) { + u64 samples[CPUCLOCK_MAX]; - task_cputime(tsk, &task_sample.utime, &task_sample.stime); - task_sample.sum_exec_runtime = tsk->se.sum_exec_runtime; - if (task_cputime_expired(&task_sample, &tsk->cputime_expires)) - return 1; + task_sample_cputime(tsk, samples); + if (task_cputimers_expired(samples, pct)) + return true; } sig = tsk->signal; + pct = &sig->posix_cputimers; /* - * Check if thread group timers expired when the cputimer is - * running and no other thread in the group is already checking - * for thread group cputimers. These fields are read without the - * sighand lock. However, this is fine because this is meant to - * be a fastpath heuristic to determine whether we should try to - * acquire the sighand lock to check/handle timers. + * Check if thread group timers expired when timers are active and + * no other thread in the group is already handling expiry for + * thread group cputimers. These fields are read without the + * sighand lock. However, this is fine because this is meant to be + * a fastpath heuristic to determine whether we should try to + * acquire the sighand lock to handle timer expiry. * - * In the worst case scenario, if 'running' or 'checking_timer' gets - * set but the current thread doesn't see the change yet, we'll wait - * until the next thread in the group gets a scheduler interrupt to - * handle the timer. This isn't an issue in practice because these - * types of delays with signals actually getting sent are expected. + * In the worst case scenario, if concurrently timers_active is set + * or expiry_active is cleared, but the current thread doesn't see + * the change yet, the timer checks are delayed until the next + * thread in the group gets a scheduler interrupt to handle the + * timer. This isn't an issue in practice because these types of + * delays with signals actually getting sent are expected. */ - if (READ_ONCE(sig->cputimer.running) && - !READ_ONCE(sig->cputimer.checking_timer)) { - struct task_cputime group_sample; + if (READ_ONCE(pct->timers_active) && !READ_ONCE(pct->expiry_active)) { + u64 samples[CPUCLOCK_MAX]; - sample_cputime_atomic(&group_sample, &sig->cputimer.cputime_atomic); + proc_sample_cputime_atomic(&sig->cputimer.cputime_atomic, + samples); - if (task_cputime_expired(&group_sample, &sig->cputime_expires)) - return 1; + if (task_cputimers_expired(samples, pct)) + return true; } if (dl_task(tsk) && tsk->dl.dl_overrun) - return 1; + return true; - return 0; + return false; } /* @@ -1132,11 +1110,12 @@ static inline int fastpath_timer_check(struct task_struct *tsk) * already updated our counts. We need to check if any timers fire now. * Interrupts are disabled. */ -void run_posix_cpu_timers(struct task_struct *tsk) +void run_posix_cpu_timers(void) { - LIST_HEAD(firing); + struct task_struct *tsk = current; struct k_itimer *timer, *next; unsigned long flags; + LIST_HEAD(firing); lockdep_assert_irqs_disabled(); @@ -1174,11 +1153,11 @@ void run_posix_cpu_timers(struct task_struct *tsk) * each timer's lock before clearing its firing flag, so no * timer call will interfere. */ - list_for_each_entry_safe(timer, next, &firing, it.cpu.entry) { + list_for_each_entry_safe(timer, next, &firing, it.cpu.elist) { int cpu_firing; spin_lock(&timer->it_lock); - list_del_init(&timer->it.cpu.entry); + list_del_init(&timer->it.cpu.elist); cpu_firing = timer->it.cpu.firing; timer->it.cpu.firing = 0; /* @@ -1196,16 +1175,18 @@ void run_posix_cpu_timers(struct task_struct *tsk) * Set one of the process-wide special case CPU timers or RLIMIT_CPU. * The tsk->sighand->siglock must be held by the caller. */ -void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx, +void set_process_cpu_timer(struct task_struct *tsk, unsigned int clkid, u64 *newval, u64 *oldval) { - u64 now; - int ret; + u64 now, *nextevt; + + if (WARN_ON_ONCE(clkid >= CPUCLOCK_SCHED)) + return; - WARN_ON_ONCE(clock_idx == CPUCLOCK_SCHED); - ret = cpu_timer_sample_group(clock_idx, tsk, &now); + nextevt = &tsk->signal->posix_cputimers.bases[clkid].nextevt; + now = cpu_clock_sample_group(clkid, tsk, true); - if (oldval && ret != -EINVAL) { + if (oldval) { /* * We are setting itimer. The *oldval is absolute and we update * it to be relative, *newval argument is relative and we update @@ -1226,19 +1207,11 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx, } /* - * Update expiration cache if we are the earliest timer, or eventually - * RLIMIT_CPU limit is earlier than prof_exp cpu timer expire. + * Update expiration cache if this is the earliest timer. CPUCLOCK_PROF + * expiry cache is also used by RLIMIT_CPU!. */ - switch (clock_idx) { - case CPUCLOCK_PROF: - if (expires_gt(tsk->signal->cputime_expires.prof_exp, *newval)) - tsk->signal->cputime_expires.prof_exp = *newval; - break; - case CPUCLOCK_VIRT: - if (expires_gt(tsk->signal->cputime_expires.virt_exp, *newval)) - tsk->signal->cputime_expires.virt_exp = *newval; - break; - } + if (*newval < *nextevt) + *nextevt = *newval; tick_dep_set_signal(tsk->signal, TICK_DEP_BIT_POSIX_TIMER); } @@ -1260,6 +1233,7 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags, timer.it_overrun = -1; error = posix_cpu_timer_create(&timer); timer.it_process = current; + if (!error) { static struct itimerspec64 zero_it; struct restart_block *restart; @@ -1275,7 +1249,7 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags, } while (!signal_pending(current)) { - if (timer.it.cpu.expires == 0) { + if (!cpu_timer_getexpires(&timer.it.cpu)) { /* * Our timer fired and was reset, below * deletion can not fail. @@ -1297,7 +1271,7 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags, /* * We were interrupted by a signal. */ - expires = timer.it.cpu.expires; + expires = cpu_timer_getexpires(&timer.it.cpu); error = posix_cpu_timer_set(&timer, 0, &zero_it, &it); if (!error) { /* @@ -1417,26 +1391,26 @@ static int thread_cpu_timer_create(struct k_itimer *timer) } const struct k_clock clock_posix_cpu = { - .clock_getres = posix_cpu_clock_getres, - .clock_set = posix_cpu_clock_set, - .clock_get = posix_cpu_clock_get, - .timer_create = posix_cpu_timer_create, - .nsleep = posix_cpu_nsleep, - .timer_set = posix_cpu_timer_set, - .timer_del = posix_cpu_timer_del, - .timer_get = posix_cpu_timer_get, - .timer_rearm = posix_cpu_timer_rearm, + .clock_getres = posix_cpu_clock_getres, + .clock_set = posix_cpu_clock_set, + .clock_get_timespec = posix_cpu_clock_get, + .timer_create = posix_cpu_timer_create, + .nsleep = posix_cpu_nsleep, + .timer_set = posix_cpu_timer_set, + .timer_del = posix_cpu_timer_del, + .timer_get = posix_cpu_timer_get, + .timer_rearm = posix_cpu_timer_rearm, }; const struct k_clock clock_process = { - .clock_getres = process_cpu_clock_getres, - .clock_get = process_cpu_clock_get, - .timer_create = process_cpu_timer_create, - .nsleep = process_cpu_nsleep, + .clock_getres = process_cpu_clock_getres, + .clock_get_timespec = process_cpu_clock_get, + .timer_create = process_cpu_timer_create, + .nsleep = process_cpu_nsleep, }; const struct k_clock clock_thread = { - .clock_getres = thread_cpu_clock_getres, - .clock_get = thread_cpu_clock_get, - .timer_create = thread_cpu_timer_create, + .clock_getres = thread_cpu_clock_getres, + .clock_get_timespec = thread_cpu_clock_get, + .timer_create = thread_cpu_timer_create, }; diff --git a/kernel/time/posix-stubs.c b/kernel/time/posix-stubs.c index 67df65f887ac..fcb3b21d8bdc 100644 --- a/kernel/time/posix-stubs.c +++ b/kernel/time/posix-stubs.c @@ -14,6 +14,7 @@ #include <linux/ktime.h> #include <linux/timekeeping.h> #include <linux/posix-timers.h> +#include <linux/time_namespace.h> #include <linux/compat.h> #ifdef CONFIG_ARCH_HAS_SYSCALL_WRAPPER @@ -77,9 +78,11 @@ int do_clock_gettime(clockid_t which_clock, struct timespec64 *tp) break; case CLOCK_MONOTONIC: ktime_get_ts64(tp); + timens_add_monotonic(tp); break; case CLOCK_BOOTTIME: ktime_get_boottime_ts64(tp); + timens_add_boottime(tp); break; default: return -EINVAL; @@ -126,6 +129,7 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags, struct __kernel_timespec __user *, rmtp) { struct timespec64 t; + ktime_t texp; switch (which_clock) { case CLOCK_REALTIME: @@ -144,13 +148,19 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags, rmtp = NULL; current->restart_block.nanosleep.type = rmtp ? TT_NATIVE : TT_NONE; current->restart_block.nanosleep.rmtp = rmtp; - return hrtimer_nanosleep(&t, flags & TIMER_ABSTIME ? + texp = timespec64_to_ktime(t); + if (flags & TIMER_ABSTIME) + texp = timens_ktime_to_host(which_clock, texp); + return hrtimer_nanosleep(texp, flags & TIMER_ABSTIME ? HRTIMER_MODE_ABS : HRTIMER_MODE_REL, which_clock); } #ifdef CONFIG_COMPAT COMPAT_SYS_NI(timer_create); +#endif + +#if defined(CONFIG_COMPAT) || defined(CONFIG_ALPHA) COMPAT_SYS_NI(getitimer); COMPAT_SYS_NI(setitimer); #endif @@ -212,6 +222,7 @@ SYSCALL_DEFINE4(clock_nanosleep_time32, clockid_t, which_clock, int, flags, struct old_timespec32 __user *, rmtp) { struct timespec64 t; + ktime_t texp; switch (which_clock) { case CLOCK_REALTIME: @@ -230,7 +241,10 @@ SYSCALL_DEFINE4(clock_nanosleep_time32, clockid_t, which_clock, int, flags, rmtp = NULL; current->restart_block.nanosleep.type = rmtp ? TT_COMPAT : TT_NONE; current->restart_block.nanosleep.compat_rmtp = rmtp; - return hrtimer_nanosleep(&t, flags & TIMER_ABSTIME ? + texp = timespec64_to_ktime(t); + if (flags & TIMER_ABSTIME) + texp = timens_ktime_to_host(which_clock, texp); + return hrtimer_nanosleep(texp, flags & TIMER_ABSTIME ? HRTIMER_MODE_ABS : HRTIMER_MODE_REL, which_clock); } diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index d7f2d91acdac..ff0eb30de346 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -30,6 +30,7 @@ #include <linux/hashtable.h> #include <linux/compat.h> #include <linux/nospec.h> +#include <linux/time_namespace.h> #include "timekeeping.h" #include "posix-timers.h" @@ -165,12 +166,17 @@ static inline void unlock_timer(struct k_itimer *timr, unsigned long flags) } /* Get clock_realtime */ -static int posix_clock_realtime_get(clockid_t which_clock, struct timespec64 *tp) +static int posix_get_realtime_timespec(clockid_t which_clock, struct timespec64 *tp) { ktime_get_real_ts64(tp); return 0; } +static ktime_t posix_get_realtime_ktime(clockid_t which_clock) +{ + return ktime_get_real(); +} + /* Set clock_realtime */ static int posix_clock_realtime_set(const clockid_t which_clock, const struct timespec64 *tp) @@ -187,18 +193,25 @@ static int posix_clock_realtime_adj(const clockid_t which_clock, /* * Get monotonic time for posix timers */ -static int posix_ktime_get_ts(clockid_t which_clock, struct timespec64 *tp) +static int posix_get_monotonic_timespec(clockid_t which_clock, struct timespec64 *tp) { ktime_get_ts64(tp); + timens_add_monotonic(tp); return 0; } +static ktime_t posix_get_monotonic_ktime(clockid_t which_clock) +{ + return ktime_get(); +} + /* * Get monotonic-raw time for posix timers */ static int posix_get_monotonic_raw(clockid_t which_clock, struct timespec64 *tp) { ktime_get_raw_ts64(tp); + timens_add_monotonic(tp); return 0; } @@ -213,6 +226,7 @@ static int posix_get_monotonic_coarse(clockid_t which_clock, struct timespec64 *tp) { ktime_get_coarse_ts64(tp); + timens_add_monotonic(tp); return 0; } @@ -222,18 +236,29 @@ static int posix_get_coarse_res(const clockid_t which_clock, struct timespec64 * return 0; } -static int posix_get_boottime(const clockid_t which_clock, struct timespec64 *tp) +static int posix_get_boottime_timespec(const clockid_t which_clock, struct timespec64 *tp) { ktime_get_boottime_ts64(tp); + timens_add_boottime(tp); return 0; } -static int posix_get_tai(clockid_t which_clock, struct timespec64 *tp) +static ktime_t posix_get_boottime_ktime(const clockid_t which_clock) +{ + return ktime_get_boottime(); +} + +static int posix_get_tai_timespec(clockid_t which_clock, struct timespec64 *tp) { ktime_get_clocktai_ts64(tp); return 0; } +static ktime_t posix_get_tai_ktime(clockid_t which_clock) +{ + return ktime_get_clocktai(); +} + static int posix_get_hrtimer_res(clockid_t which_clock, struct timespec64 *tp) { tp->tv_sec = 0; @@ -442,7 +467,7 @@ static struct k_itimer * alloc_posix_timer(void) static void k_itimer_rcu_free(struct rcu_head *head) { - struct k_itimer *tmr = container_of(head, struct k_itimer, it.rcu); + struct k_itimer *tmr = container_of(head, struct k_itimer, rcu); kmem_cache_free(posix_timers_cache, tmr); } @@ -459,7 +484,7 @@ static void release_posix_timer(struct k_itimer *tmr, int it_id_set) } put_pid(tmr->it_pid); sigqueue_free(tmr->sigq); - call_rcu(&tmr->it.rcu, k_itimer_rcu_free); + call_rcu(&tmr->rcu, k_itimer_rcu_free); } static int common_timer_create(struct k_itimer *new_timer) @@ -645,7 +670,6 @@ void common_timer_get(struct k_itimer *timr, struct itimerspec64 *cur_setting) { const struct k_clock *kc = timr->kclock; ktime_t now, remaining, iv; - struct timespec64 ts64; bool sig_none; sig_none = timr->it_sigev_notify == SIGEV_NONE; @@ -663,12 +687,7 @@ void common_timer_get(struct k_itimer *timr, struct itimerspec64 *cur_setting) return; } - /* - * The timespec64 based conversion is suboptimal, but it's not - * worth to implement yet another callback. - */ - kc->clock_get(timr->it_clock, &ts64); - now = timespec64_to_ktime(ts64); + now = kc->clock_get_ktime(timr->it_clock); /* * When a requeue is pending or this is a SIGEV_NONE timer move the @@ -781,7 +800,7 @@ static void common_hrtimer_arm(struct k_itimer *timr, ktime_t expires, * Posix magic: Relative CLOCK_REALTIME timers are not affected by * clock modifications, so they become CLOCK_MONOTONIC based under the * hood. See hrtimer_init(). Update timr->kclock, so the generic - * functions which use timr->kclock->clock_get() work. + * functions which use timr->kclock->clock_get_*() work. * * Note: it_clock stays unmodified, because the next timer_set() might * use ABSTIME, so it needs to switch back. @@ -805,6 +824,35 @@ static int common_hrtimer_try_to_cancel(struct k_itimer *timr) return hrtimer_try_to_cancel(&timr->it.real.timer); } +static void common_timer_wait_running(struct k_itimer *timer) +{ + hrtimer_cancel_wait_running(&timer->it.real.timer); +} + +/* + * On PREEMPT_RT this prevent priority inversion against softirq kthread in + * case it gets preempted while executing a timer callback. See comments in + * hrtimer_cancel_wait_running. For PREEMPT_RT=n this just results in a + * cpu_relax(). + */ +static struct k_itimer *timer_wait_running(struct k_itimer *timer, + unsigned long *flags) +{ + const struct k_clock *kc = READ_ONCE(timer->kclock); + timer_t timer_id = READ_ONCE(timer->it_id); + + /* Prevent kfree(timer) after dropping the lock */ + rcu_read_lock(); + unlock_timer(timer, *flags); + + if (!WARN_ON_ONCE(!kc->timer_wait_running)) + kc->timer_wait_running(timer); + + rcu_read_unlock(); + /* Relock the timer. It might be not longer hashed. */ + return lock_timer(timer_id, flags); +} + /* Set a POSIX.1b interval timer. */ int common_timer_set(struct k_itimer *timr, int flags, struct itimerspec64 *new_setting, @@ -837,6 +885,8 @@ int common_timer_set(struct k_itimer *timr, int flags, timr->it_interval = timespec64_to_ktime(new_setting->it_interval); expires = timespec64_to_ktime(new_setting->it_value); + if (flags & TIMER_ABSTIME) + expires = timens_ktime_to_host(timr->it_clock, expires); sigev_none = timr->it_sigev_notify == SIGEV_NONE; kc->timer_arm(timr, expires, flags & TIMER_ABSTIME, sigev_none); @@ -844,13 +894,13 @@ int common_timer_set(struct k_itimer *timr, int flags, return 0; } -static int do_timer_settime(timer_t timer_id, int flags, +static int do_timer_settime(timer_t timer_id, int tmr_flags, struct itimerspec64 *new_spec64, struct itimerspec64 *old_spec64) { const struct k_clock *kc; struct k_itimer *timr; - unsigned long flag; + unsigned long flags; int error = 0; if (!timespec64_valid(&new_spec64->it_interval) || @@ -859,8 +909,9 @@ static int do_timer_settime(timer_t timer_id, int flags, if (old_spec64) memset(old_spec64, 0, sizeof(*old_spec64)); + + timr = lock_timer(timer_id, &flags); retry: - timr = lock_timer(timer_id, &flag); if (!timr) return -EINVAL; @@ -868,13 +919,16 @@ retry: if (WARN_ON_ONCE(!kc || !kc->timer_set)) error = -EINVAL; else - error = kc->timer_set(timr, flags, new_spec64, old_spec64); + error = kc->timer_set(timr, tmr_flags, new_spec64, old_spec64); - unlock_timer(timr, flag); if (error == TIMER_RETRY) { - old_spec64 = NULL; // We already got the old time... + // We already got the old time... + old_spec64 = NULL; + /* Unlocks and relocks the timer if it still exists */ + timr = timer_wait_running(timr, &flags); goto retry; } + unlock_timer(timr, flags); return error; } @@ -951,13 +1005,15 @@ SYSCALL_DEFINE1(timer_delete, timer_t, timer_id) struct k_itimer *timer; unsigned long flags; -retry_delete: timer = lock_timer(timer_id, &flags); + +retry_delete: if (!timer) return -EINVAL; - if (timer_delete_hook(timer) == TIMER_RETRY) { - unlock_timer(timer, flags); + if (unlikely(timer_delete_hook(timer) == TIMER_RETRY)) { + /* Unlocks and relocks the timer if it still exists */ + timer = timer_wait_running(timer, &flags); goto retry_delete; } @@ -1032,7 +1088,7 @@ SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock, if (!kc) return -EINVAL; - error = kc->clock_get(which_clock, &kernel_tp); + error = kc->clock_get_timespec(which_clock, &kernel_tp); if (!error && put_timespec64(&kernel_tp, tp)) error = -EFAULT; @@ -1114,7 +1170,7 @@ SYSCALL_DEFINE2(clock_gettime32, clockid_t, which_clock, if (!kc) return -EINVAL; - err = kc->clock_get(which_clock, &ts); + err = kc->clock_get_timespec(which_clock, &ts); if (!err && put_old_timespec32(&ts, tp)) err = -EFAULT; @@ -1165,7 +1221,22 @@ SYSCALL_DEFINE2(clock_getres_time32, clockid_t, which_clock, static int common_nsleep(const clockid_t which_clock, int flags, const struct timespec64 *rqtp) { - return hrtimer_nanosleep(rqtp, flags & TIMER_ABSTIME ? + ktime_t texp = timespec64_to_ktime(*rqtp); + + return hrtimer_nanosleep(texp, flags & TIMER_ABSTIME ? + HRTIMER_MODE_ABS : HRTIMER_MODE_REL, + which_clock); +} + +static int common_nsleep_timens(const clockid_t which_clock, int flags, + const struct timespec64 *rqtp) +{ + ktime_t texp = timespec64_to_ktime(*rqtp); + + if (flags & TIMER_ABSTIME) + texp = timens_ktime_to_host(which_clock, texp); + + return hrtimer_nanosleep(texp, flags & TIMER_ABSTIME ? HRTIMER_MODE_ABS : HRTIMER_MODE_REL, which_clock); } @@ -1226,7 +1297,8 @@ SYSCALL_DEFINE4(clock_nanosleep_time32, clockid_t, which_clock, int, flags, static const struct k_clock clock_realtime = { .clock_getres = posix_get_hrtimer_res, - .clock_get = posix_clock_realtime_get, + .clock_get_timespec = posix_get_realtime_timespec, + .clock_get_ktime = posix_get_realtime_ktime, .clock_set = posix_clock_realtime_set, .clock_adj = posix_clock_realtime_adj, .nsleep = common_nsleep, @@ -1238,13 +1310,15 @@ static const struct k_clock clock_realtime = { .timer_forward = common_hrtimer_forward, .timer_remaining = common_hrtimer_remaining, .timer_try_to_cancel = common_hrtimer_try_to_cancel, + .timer_wait_running = common_timer_wait_running, .timer_arm = common_hrtimer_arm, }; static const struct k_clock clock_monotonic = { .clock_getres = posix_get_hrtimer_res, - .clock_get = posix_ktime_get_ts, - .nsleep = common_nsleep, + .clock_get_timespec = posix_get_monotonic_timespec, + .clock_get_ktime = posix_get_monotonic_ktime, + .nsleep = common_nsleep_timens, .timer_create = common_timer_create, .timer_set = common_timer_set, .timer_get = common_timer_get, @@ -1253,27 +1327,29 @@ static const struct k_clock clock_monotonic = { .timer_forward = common_hrtimer_forward, .timer_remaining = common_hrtimer_remaining, .timer_try_to_cancel = common_hrtimer_try_to_cancel, + .timer_wait_running = common_timer_wait_running, .timer_arm = common_hrtimer_arm, }; static const struct k_clock clock_monotonic_raw = { .clock_getres = posix_get_hrtimer_res, - .clock_get = posix_get_monotonic_raw, + .clock_get_timespec = posix_get_monotonic_raw, }; static const struct k_clock clock_realtime_coarse = { .clock_getres = posix_get_coarse_res, - .clock_get = posix_get_realtime_coarse, + .clock_get_timespec = posix_get_realtime_coarse, }; static const struct k_clock clock_monotonic_coarse = { .clock_getres = posix_get_coarse_res, - .clock_get = posix_get_monotonic_coarse, + .clock_get_timespec = posix_get_monotonic_coarse, }; static const struct k_clock clock_tai = { .clock_getres = posix_get_hrtimer_res, - .clock_get = posix_get_tai, + .clock_get_ktime = posix_get_tai_ktime, + .clock_get_timespec = posix_get_tai_timespec, .nsleep = common_nsleep, .timer_create = common_timer_create, .timer_set = common_timer_set, @@ -1283,13 +1359,15 @@ static const struct k_clock clock_tai = { .timer_forward = common_hrtimer_forward, .timer_remaining = common_hrtimer_remaining, .timer_try_to_cancel = common_hrtimer_try_to_cancel, + .timer_wait_running = common_timer_wait_running, .timer_arm = common_hrtimer_arm, }; static const struct k_clock clock_boottime = { .clock_getres = posix_get_hrtimer_res, - .clock_get = posix_get_boottime, - .nsleep = common_nsleep, + .clock_get_ktime = posix_get_boottime_ktime, + .clock_get_timespec = posix_get_boottime_timespec, + .nsleep = common_nsleep_timens, .timer_create = common_timer_create, .timer_set = common_timer_set, .timer_get = common_timer_get, @@ -1298,6 +1376,7 @@ static const struct k_clock clock_boottime = { .timer_forward = common_hrtimer_forward, .timer_remaining = common_hrtimer_remaining, .timer_try_to_cancel = common_hrtimer_try_to_cancel, + .timer_wait_running = common_timer_wait_running, .timer_arm = common_hrtimer_arm, }; diff --git a/kernel/time/posix-timers.h b/kernel/time/posix-timers.h index de5daa6d975a..f32a2ebba9b8 100644 --- a/kernel/time/posix-timers.h +++ b/kernel/time/posix-timers.h @@ -6,8 +6,11 @@ struct k_clock { struct timespec64 *tp); int (*clock_set)(const clockid_t which_clock, const struct timespec64 *tp); - int (*clock_get)(const clockid_t which_clock, - struct timespec64 *tp); + /* Returns the clock value in the current time namespace. */ + int (*clock_get_timespec)(const clockid_t which_clock, + struct timespec64 *tp); + /* Returns the clock value in the root time namespace. */ + ktime_t (*clock_get_ktime)(const clockid_t which_clock); int (*clock_adj)(const clockid_t which_clock, struct __kernel_timex *tx); int (*timer_create)(struct k_itimer *timer); int (*nsleep)(const clockid_t which_clock, int flags, @@ -24,6 +27,7 @@ struct k_clock { int (*timer_try_to_cancel)(struct k_itimer *timr); void (*timer_arm)(struct k_itimer *timr, ktime_t expires, bool absolute, bool sigev_none); + void (*timer_wait_running)(struct k_itimer *timr); }; extern const struct k_clock clock_posix_cpu; diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c index 142b07619918..e4332e3e2d56 100644 --- a/kernel/time/sched_clock.c +++ b/kernel/time/sched_clock.c @@ -17,6 +17,8 @@ #include <linux/seqlock.h> #include <linux/bitops.h> +#include "timekeeping.h" + /** * struct clock_read_data - data required to read from sched_clock() * @@ -167,14 +169,15 @@ sched_clock_register(u64 (*read)(void), int bits, unsigned long rate) { u64 res, wrap, new_mask, new_epoch, cyc, ns; u32 new_mult, new_shift; - unsigned long r; + unsigned long r, flags; char r_unit; struct clock_read_data rd; if (cd.rate > rate) return; - WARN_ON(!irqs_disabled()); + /* Cannot register a sched_clock with interrupts on */ + local_irq_save(flags); /* Calculate the mult/shift to convert counter ticks to ns. */ clocks_calc_mult_shift(&new_mult, &new_shift, rate, NSEC_PER_SEC, 3600); @@ -231,6 +234,8 @@ sched_clock_register(u64 (*read)(void), int bits, unsigned long rate) if (irqtime > 0 || (irqtime == -1 && rate >= 1000000)) enable_sched_clock_irqtime(); + local_irq_restore(flags); + pr_debug("Registered %pS as sched_clock source\n", read); } diff --git a/kernel/time/tick-broadcast-hrtimer.c b/kernel/time/tick-broadcast-hrtimer.c index 5be6154e2fd2..b5a65e212df2 100644 --- a/kernel/time/tick-broadcast-hrtimer.c +++ b/kernel/time/tick-broadcast-hrtimer.c @@ -42,34 +42,39 @@ static int bc_shutdown(struct clock_event_device *evt) */ static int bc_set_next(ktime_t expires, struct clock_event_device *bc) { - int bc_moved; /* - * We try to cancel the timer first. If the callback is on - * flight on some other cpu then we let it handle it. If we - * were able to cancel the timer nothing can rearm it as we - * own broadcast_lock. + * This is called either from enter/exit idle code or from the + * broadcast handler. In all cases tick_broadcast_lock is held. * - * However we can also be called from the event handler of - * ce_broadcast_hrtimer itself when it expires. We cannot - * restart the timer because we are in the callback, but we - * can set the expiry time and let the callback return - * HRTIMER_RESTART. + * hrtimer_cancel() cannot be called here neither from the + * broadcast handler nor from the enter/exit idle code. The idle + * code can run into the problem described in bc_shutdown() and the + * broadcast handler cannot wait for itself to complete for obvious + * reasons. * - * Since we are in the idle loop at this point and because - * hrtimer_{start/cancel} functions call into tracing, - * calls to these functions must be bound within RCU_NONIDLE. + * Each caller tries to arm the hrtimer on its own CPU, but if the + * hrtimer callbback function is currently running, then + * hrtimer_start() cannot move it and the timer stays on the CPU on + * which it is assigned at the moment. + * + * As this can be called from idle code, the hrtimer_start() + * invocation has to be wrapped with RCU_NONIDLE() as + * hrtimer_start() can call into tracing. */ - RCU_NONIDLE({ - bc_moved = hrtimer_try_to_cancel(&bctimer) >= 0; - if (bc_moved) - hrtimer_start(&bctimer, expires, - HRTIMER_MODE_ABS_PINNED);}); - if (bc_moved) { - /* Bind the "device" to the cpu */ - bc->bound_on = smp_processor_id(); - } else if (bc->bound_on == smp_processor_id()) { - hrtimer_set_expires(&bctimer, expires); - } + RCU_NONIDLE( { + hrtimer_start(&bctimer, expires, HRTIMER_MODE_ABS_PINNED_HARD); + /* + * The core tick broadcast mode expects bc->bound_on to be set + * correctly to prevent a CPU which has the broadcast hrtimer + * armed from going deep idle. + * + * As tick_broadcast_lock is held, nothing can change the cpu + * base which was just established in hrtimer_start() above. So + * the below access is safe even without holding the hrtimer + * base lock. + */ + bc->bound_on = bctimer.base->cpu_base->cpu; + } ); return 0; } @@ -95,16 +100,12 @@ static enum hrtimer_restart bc_handler(struct hrtimer *t) { ce_broadcast_hrtimer.event_handler(&ce_broadcast_hrtimer); - if (clockevent_state_oneshot(&ce_broadcast_hrtimer)) - if (ce_broadcast_hrtimer.next_event != KTIME_MAX) - return HRTIMER_RESTART; - return HRTIMER_NORESTART; } void tick_setup_hrtimer_broadcast(void) { - hrtimer_init(&bctimer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); + hrtimer_init(&bctimer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD); bctimer.function = bc_handler; clockevents_register_device(&ce_broadcast_hrtimer); } diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index 59225b484e4e..7e5d3524e924 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -11,6 +11,7 @@ #include <linux/err.h> #include <linux/hrtimer.h> #include <linux/interrupt.h> +#include <linux/nmi.h> #include <linux/percpu.h> #include <linux/profile.h> #include <linux/sched.h> @@ -558,6 +559,7 @@ void tick_unfreeze(void) trace_suspend_resume(TPS("timekeeping_freeze"), smp_processor_id(), false); } else { + touch_softlockup_watchdog(); tick_resume_local(); } diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index be9707f68024..a792d21cac64 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -58,8 +58,9 @@ static void tick_do_update_jiffies64(ktime_t now) /* * Do a quick check without holding jiffies_lock: + * The READ_ONCE() pairs with two updates done later in this function. */ - delta = ktime_sub(now, last_jiffies_update); + delta = ktime_sub(now, READ_ONCE(last_jiffies_update)); if (delta < tick_period) return; @@ -70,8 +71,9 @@ static void tick_do_update_jiffies64(ktime_t now) if (delta >= tick_period) { delta = ktime_sub(delta, tick_period); - last_jiffies_update = ktime_add(last_jiffies_update, - tick_period); + /* Pairs with the lockless read in this function. */ + WRITE_ONCE(last_jiffies_update, + ktime_add(last_jiffies_update, tick_period)); /* Slow path for long timeouts */ if (unlikely(delta >= tick_period)) { @@ -79,8 +81,10 @@ static void tick_do_update_jiffies64(ktime_t now) ticks = ktime_divns(delta, incr); - last_jiffies_update = ktime_add_ns(last_jiffies_update, - incr * ticks); + /* Pairs with the lockless read in this function. */ + WRITE_ONCE(last_jiffies_update, + ktime_add_ns(last_jiffies_update, + incr * ticks)); } do_timer(++ticks); @@ -172,6 +176,7 @@ static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs) #ifdef CONFIG_NO_HZ_FULL cpumask_var_t tick_nohz_full_mask; bool tick_nohz_full_running; +EXPORT_SYMBOL_GPL(tick_nohz_full_running); static atomic_t tick_dep_mask; static bool check_tick_dependency(atomic_t *dep) @@ -198,6 +203,11 @@ static bool check_tick_dependency(atomic_t *dep) return true; } + if (val & TICK_DEP_MASK_RCU) { + trace_tick_stop(0, TICK_DEP_MASK_RCU); + return true; + } + return false; } @@ -324,6 +334,7 @@ void tick_nohz_dep_set_cpu(int cpu, enum tick_dep_bits bit) preempt_enable(); } } +EXPORT_SYMBOL_GPL(tick_nohz_dep_set_cpu); void tick_nohz_dep_clear_cpu(int cpu, enum tick_dep_bits bit) { @@ -331,6 +342,7 @@ void tick_nohz_dep_clear_cpu(int cpu, enum tick_dep_bits bit) atomic_andnot(BIT(bit), &ts->tick_dep_mask); } +EXPORT_SYMBOL_GPL(tick_nohz_dep_clear_cpu); /* * Set a per-task tick dependency. Posix CPU timers need this in order to elapse @@ -344,11 +356,13 @@ void tick_nohz_dep_set_task(struct task_struct *tsk, enum tick_dep_bits bit) */ tick_nohz_dep_set_all(&tsk->tick_dep_mask, bit); } +EXPORT_SYMBOL_GPL(tick_nohz_dep_set_task); void tick_nohz_dep_clear_task(struct task_struct *tsk, enum tick_dep_bits bit) { atomic_andnot(BIT(bit), &tsk->tick_dep_mask); } +EXPORT_SYMBOL_GPL(tick_nohz_dep_clear_task); /* * Set a per-taskgroup tick dependency. Posix CPU timers need this in order to elapse @@ -397,6 +411,7 @@ void __init tick_nohz_full_setup(cpumask_var_t cpumask) cpumask_copy(tick_nohz_full_mask, cpumask); tick_nohz_full_running = true; } +EXPORT_SYMBOL_GPL(tick_nohz_full_setup); static int tick_nohz_cpu_down(unsigned int cpu) { @@ -634,10 +649,12 @@ static void tick_nohz_restart(struct tick_sched *ts, ktime_t now) /* Forward the time to expire in the future */ hrtimer_forward(&ts->sched_timer, now, tick_period); - if (ts->nohz_mode == NOHZ_MODE_HIGHRES) - hrtimer_start_expires(&ts->sched_timer, HRTIMER_MODE_ABS_PINNED); - else + if (ts->nohz_mode == NOHZ_MODE_HIGHRES) { + hrtimer_start_expires(&ts->sched_timer, + HRTIMER_MODE_ABS_PINNED_HARD); + } else { tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1); + } /* * Reset to make sure next tick stop doesn't get fooled by past @@ -802,7 +819,8 @@ static void tick_nohz_stop_tick(struct tick_sched *ts, int cpu) } if (ts->nohz_mode == NOHZ_MODE_HIGHRES) { - hrtimer_start(&ts->sched_timer, tick, HRTIMER_MODE_ABS_PINNED); + hrtimer_start(&ts->sched_timer, tick, + HRTIMER_MODE_ABS_PINNED_HARD); } else { hrtimer_set_expires(&ts->sched_timer, tick); tick_program_event(tick, 1); @@ -1116,7 +1134,7 @@ static void tick_nohz_account_idle_ticks(struct tick_sched *ts) #ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE unsigned long ticks; - if (vtime_accounting_cpu_enabled()) + if (vtime_accounting_enabled_this_cpu()) return; /* * We stopped the tick in idle. Update process times would miss the @@ -1230,7 +1248,7 @@ static void tick_nohz_switch_to_nohz(void) * Recycle the hrtimer in ts, so we can share the * hrtimer_forward with the highres code. */ - hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); + hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD); /* Get the next period */ next = tick_init_jiffy_update(); @@ -1327,7 +1345,7 @@ void tick_setup_sched_timer(void) /* * Emulate tick processing via per-CPU hrtimers: */ - hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); + hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD); ts->sched_timer.function = tick_sched_timer; /* Get the next period (per-CPU) */ @@ -1342,7 +1360,7 @@ void tick_setup_sched_timer(void) } hrtimer_forward(&ts->sched_timer, now, tick_period); - hrtimer_start_expires(&ts->sched_timer, HRTIMER_MODE_ABS_PINNED); + hrtimer_start_expires(&ts->sched_timer, HRTIMER_MODE_ABS_PINNED_HARD); tick_nohz_activate(ts, NOHZ_MODE_HIGHRES); } #endif /* HIGH_RES_TIMERS */ diff --git a/kernel/time/time.c b/kernel/time/time.c index 5c54ca632d08..cdd7386115ff 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -59,9 +59,9 @@ EXPORT_SYMBOL(sys_tz); * why not move it into the appropriate arch directory (for those * architectures that need it). */ -SYSCALL_DEFINE1(time, time_t __user *, tloc) +SYSCALL_DEFINE1(time, __kernel_old_time_t __user *, tloc) { - time_t i = (time_t)ktime_get_real_seconds(); + __kernel_old_time_t i = (__kernel_old_time_t)ktime_get_real_seconds(); if (tloc) { if (put_user(i,tloc)) @@ -78,7 +78,7 @@ SYSCALL_DEFINE1(time, time_t __user *, tloc) * architectures that need it). */ -SYSCALL_DEFINE1(stime, time_t __user *, tptr) +SYSCALL_DEFINE1(stime, __kernel_old_time_t __user *, tptr) { struct timespec64 tv; int err; @@ -137,7 +137,7 @@ SYSCALL_DEFINE1(stime32, old_time32_t __user *, tptr) #endif /* __ARCH_WANT_SYS_TIME32 */ #endif -SYSCALL_DEFINE2(gettimeofday, struct timeval __user *, tv, +SYSCALL_DEFINE2(gettimeofday, struct __kernel_old_timeval __user *, tv, struct timezone __user *, tz) { if (likely(tv != NULL)) { @@ -179,7 +179,7 @@ int do_sys_settimeofday64(const struct timespec64 *tv, const struct timezone *tz return error; if (tz) { - /* Verify we're witin the +-15 hrs range */ + /* Verify we're within the +-15 hrs range */ if (tz->tz_minuteswest > 15*60 || tz->tz_minuteswest < -15*60) return -EINVAL; @@ -196,22 +196,21 @@ int do_sys_settimeofday64(const struct timespec64 *tv, const struct timezone *tz return 0; } -SYSCALL_DEFINE2(settimeofday, struct timeval __user *, tv, +SYSCALL_DEFINE2(settimeofday, struct __kernel_old_timeval __user *, tv, struct timezone __user *, tz) { struct timespec64 new_ts; - struct timeval user_tv; struct timezone new_tz; if (tv) { - if (copy_from_user(&user_tv, tv, sizeof(*tv))) + if (get_user(new_ts.tv_sec, &tv->tv_sec) || + get_user(new_ts.tv_nsec, &tv->tv_usec)) return -EFAULT; - if (!timeval_valid(&user_tv)) + if (new_ts.tv_nsec > USEC_PER_SEC || new_ts.tv_nsec < 0) return -EINVAL; - new_ts.tv_sec = user_tv.tv_sec; - new_ts.tv_nsec = user_tv.tv_usec * NSEC_PER_USEC; + new_ts.tv_nsec *= NSEC_PER_USEC; } if (tz) { if (copy_from_user(&new_tz, tz, sizeof(*tz))) @@ -245,18 +244,17 @@ COMPAT_SYSCALL_DEFINE2(settimeofday, struct old_timeval32 __user *, tv, struct timezone __user *, tz) { struct timespec64 new_ts; - struct timeval user_tv; struct timezone new_tz; if (tv) { - if (compat_get_timeval(&user_tv, tv)) + if (get_user(new_ts.tv_sec, &tv->tv_sec) || + get_user(new_ts.tv_nsec, &tv->tv_usec)) return -EFAULT; - if (!timeval_valid(&user_tv)) + if (new_ts.tv_nsec > USEC_PER_SEC || new_ts.tv_nsec < 0) return -EINVAL; - new_ts.tv_sec = user_tv.tv_sec; - new_ts.tv_nsec = user_tv.tv_usec * NSEC_PER_USEC; + new_ts.tv_nsec *= NSEC_PER_USEC; } if (tz) { if (copy_from_user(&new_tz, tz, sizeof(*tz))) @@ -267,7 +265,7 @@ COMPAT_SYSCALL_DEFINE2(settimeofday, struct old_timeval32 __user *, tv, } #endif -#if !defined(CONFIG_64BIT_TIME) || defined(CONFIG_64BIT) +#ifdef CONFIG_64BIT SYSCALL_DEFINE1(adjtimex, struct __kernel_timex __user *, txc_p) { struct __kernel_timex txc; /* Local copy of parameter */ @@ -550,18 +548,21 @@ EXPORT_SYMBOL(set_normalized_timespec64); */ struct timespec64 ns_to_timespec64(const s64 nsec) { - struct timespec64 ts; + struct timespec64 ts = { 0, 0 }; s32 rem; - if (!nsec) - return (struct timespec64) {0, 0}; - - ts.tv_sec = div_s64_rem(nsec, NSEC_PER_SEC, &rem); - if (unlikely(rem < 0)) { - ts.tv_sec--; - rem += NSEC_PER_SEC; + if (likely(nsec > 0)) { + ts.tv_sec = div_u64_rem(nsec, NSEC_PER_SEC, &rem); + ts.tv_nsec = rem; + } else if (nsec < 0) { + /* + * With negative times, tv_sec points to the earlier + * second, and tv_nsec counts the nanoseconds since + * then, so tv_nsec is always a positive number. + */ + ts.tv_sec = -div_u64_rem(-nsec - 1, NSEC_PER_SEC, &rem) - 1; + ts.tv_nsec = NSEC_PER_SEC - rem - 1; } - ts.tv_nsec = rem; return ts; } @@ -625,10 +626,12 @@ EXPORT_SYMBOL(__usecs_to_jiffies); * The >> (NSEC_JIFFIE_SC - SEC_JIFFIE_SC) converts the scaled nsec * value to a scaled second value. */ -static unsigned long -__timespec64_to_jiffies(u64 sec, long nsec) + +unsigned long +timespec64_to_jiffies(const struct timespec64 *value) { - nsec = nsec + TICK_NSEC - 1; + u64 sec = value->tv_sec; + long nsec = value->tv_nsec + TICK_NSEC - 1; if (sec >= MAX_SEC_IN_JIFFIES){ sec = MAX_SEC_IN_JIFFIES; @@ -639,18 +642,6 @@ __timespec64_to_jiffies(u64 sec, long nsec) (NSEC_JIFFIE_SC - SEC_JIFFIE_SC))) >> SEC_JIFFIE_SC; } - -static unsigned long -__timespec_to_jiffies(unsigned long sec, long nsec) -{ - return __timespec64_to_jiffies((u64)sec, nsec); -} - -unsigned long -timespec64_to_jiffies(const struct timespec64 *value) -{ - return __timespec64_to_jiffies(value->tv_sec, value->tv_nsec); -} EXPORT_SYMBOL(timespec64_to_jiffies); void @@ -668,44 +659,6 @@ jiffies_to_timespec64(const unsigned long jiffies, struct timespec64 *value) EXPORT_SYMBOL(jiffies_to_timespec64); /* - * We could use a similar algorithm to timespec_to_jiffies (with a - * different multiplier for usec instead of nsec). But this has a - * problem with rounding: we can't exactly add TICK_NSEC - 1 to the - * usec value, since it's not necessarily integral. - * - * We could instead round in the intermediate scaled representation - * (i.e. in units of 1/2^(large scale) jiffies) but that's also - * perilous: the scaling introduces a small positive error, which - * combined with a division-rounding-upward (i.e. adding 2^(scale) - 1 - * units to the intermediate before shifting) leads to accidental - * overflow and overestimates. - * - * At the cost of one additional multiplication by a constant, just - * use the timespec implementation. - */ -unsigned long -timeval_to_jiffies(const struct timeval *value) -{ - return __timespec_to_jiffies(value->tv_sec, - value->tv_usec * NSEC_PER_USEC); -} -EXPORT_SYMBOL(timeval_to_jiffies); - -void jiffies_to_timeval(const unsigned long jiffies, struct timeval *value) -{ - /* - * Convert jiffies to nanoseconds and separate with - * one divide. - */ - u32 rem; - - value->tv_sec = div_u64_rem((u64)jiffies * TICK_NSEC, - NSEC_PER_SEC, &rem); - value->tv_usec = rem / NSEC_PER_USEC; -} -EXPORT_SYMBOL(jiffies_to_timeval); - -/* * Convert jiffies/jiffies_64 to clock_t and back. */ clock_t jiffies_to_clock_t(unsigned long x) @@ -880,10 +833,11 @@ int get_timespec64(struct timespec64 *ts, ts->tv_sec = kts.tv_sec; - /* Zero out the padding for 32 bit systems or in compat mode */ - if (IS_ENABLED(CONFIG_64BIT_TIME) && in_compat_syscall()) + /* Zero out the padding in compat mode */ + if (in_compat_syscall()) kts.tv_nsec &= 0xFFFFFFFFUL; + /* In 32-bit mode, this drops the padding */ ts->tv_nsec = kts.tv_nsec; return 0; diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index d911c8470149..ca69290bee2a 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -146,6 +146,11 @@ static void tk_set_wall_to_mono(struct timekeeper *tk, struct timespec64 wtm) static inline void tk_update_sleep_time(struct timekeeper *tk, ktime_t delta) { tk->offs_boot = ktime_add(tk->offs_boot, delta); + /* + * Timespec representation for VDSO update to avoid 64bit division + * on every update. + */ + tk->monotonic_to_boot = ktime_to_timespec64(tk->offs_boot); } /* diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 343c7ba33b1c..4820823515e9 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -196,6 +196,10 @@ EXPORT_SYMBOL(jiffies_64); struct timer_base { raw_spinlock_t lock; struct timer_list *running_timer; +#ifdef CONFIG_PREEMPT_RT + spinlock_t expiry_lock; + atomic_t timer_waiters; +#endif unsigned long clk; unsigned long next_expiry; unsigned int cpu; @@ -1227,7 +1231,78 @@ int try_to_del_timer_sync(struct timer_list *timer) } EXPORT_SYMBOL(try_to_del_timer_sync); -#ifdef CONFIG_SMP +#ifdef CONFIG_PREEMPT_RT +static __init void timer_base_init_expiry_lock(struct timer_base *base) +{ + spin_lock_init(&base->expiry_lock); +} + +static inline void timer_base_lock_expiry(struct timer_base *base) +{ + spin_lock(&base->expiry_lock); +} + +static inline void timer_base_unlock_expiry(struct timer_base *base) +{ + spin_unlock(&base->expiry_lock); +} + +/* + * The counterpart to del_timer_wait_running(). + * + * If there is a waiter for base->expiry_lock, then it was waiting for the + * timer callback to finish. Drop expiry_lock and reaquire it. That allows + * the waiter to acquire the lock and make progress. + */ +static void timer_sync_wait_running(struct timer_base *base) +{ + if (atomic_read(&base->timer_waiters)) { + spin_unlock(&base->expiry_lock); + spin_lock(&base->expiry_lock); + } +} + +/* + * This function is called on PREEMPT_RT kernels when the fast path + * deletion of a timer failed because the timer callback function was + * running. + * + * This prevents priority inversion, if the softirq thread on a remote CPU + * got preempted, and it prevents a life lock when the task which tries to + * delete a timer preempted the softirq thread running the timer callback + * function. + */ +static void del_timer_wait_running(struct timer_list *timer) +{ + u32 tf; + + tf = READ_ONCE(timer->flags); + if (!(tf & TIMER_MIGRATING)) { + struct timer_base *base = get_timer_base(tf); + + /* + * Mark the base as contended and grab the expiry lock, + * which is held by the softirq across the timer + * callback. Drop the lock immediately so the softirq can + * expire the next timer. In theory the timer could already + * be running again, but that's more than unlikely and just + * causes another wait loop. + */ + atomic_inc(&base->timer_waiters); + spin_lock_bh(&base->expiry_lock); + atomic_dec(&base->timer_waiters); + spin_unlock_bh(&base->expiry_lock); + } +} +#else +static inline void timer_base_init_expiry_lock(struct timer_base *base) { } +static inline void timer_base_lock_expiry(struct timer_base *base) { } +static inline void timer_base_unlock_expiry(struct timer_base *base) { } +static inline void timer_sync_wait_running(struct timer_base *base) { } +static inline void del_timer_wait_running(struct timer_list *timer) { } +#endif + +#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT) /** * del_timer_sync - deactivate a timer and wait for the handler to finish. * @timer: the timer to be deactivated @@ -1266,6 +1341,8 @@ EXPORT_SYMBOL(try_to_del_timer_sync); */ int del_timer_sync(struct timer_list *timer) { + int ret; + #ifdef CONFIG_LOCKDEP unsigned long flags; @@ -1283,12 +1360,17 @@ int del_timer_sync(struct timer_list *timer) * could lead to deadlock. */ WARN_ON(in_irq() && !(timer->flags & TIMER_IRQSAFE)); - for (;;) { - int ret = try_to_del_timer_sync(timer); - if (ret >= 0) - return ret; - cpu_relax(); - } + + do { + ret = try_to_del_timer_sync(timer); + + if (unlikely(ret < 0)) { + del_timer_wait_running(timer); + cpu_relax(); + } + } while (ret < 0); + + return ret; } EXPORT_SYMBOL(del_timer_sync); #endif @@ -1360,10 +1442,13 @@ static void expire_timers(struct timer_base *base, struct hlist_head *head) if (timer->flags & TIMER_IRQSAFE) { raw_spin_unlock(&base->lock); call_timer_fn(timer, fn, baseclk); + base->running_timer = NULL; raw_spin_lock(&base->lock); } else { raw_spin_unlock_irq(&base->lock); call_timer_fn(timer, fn, baseclk); + base->running_timer = NULL; + timer_sync_wait_running(base); raw_spin_lock_irq(&base->lock); } } @@ -1593,24 +1678,26 @@ void timer_clear_idle(void) static int collect_expired_timers(struct timer_base *base, struct hlist_head *heads) { + unsigned long now = READ_ONCE(jiffies); + /* * NOHZ optimization. After a long idle sleep we need to forward the * base to current jiffies. Avoid a loop by searching the bitfield for * the next expiring timer. */ - if ((long)(jiffies - base->clk) > 2) { + if ((long)(now - base->clk) > 2) { unsigned long next = __next_timer_interrupt(base); /* * If the next timer is ahead of time forward to current * jiffies, otherwise forward to the next expiry time: */ - if (time_after(next, jiffies)) { + if (time_after(next, now)) { /* * The call site will increment base->clk and then * terminate the expiry loop immediately. */ - base->clk = jiffies; + base->clk = now; return 0; } base->clk = next; @@ -1643,7 +1730,7 @@ void update_process_times(int user_tick) #endif scheduler_tick(); if (IS_ENABLED(CONFIG_POSIX_TIMERS)) - run_posix_cpu_timers(p); + run_posix_cpu_timers(); } /** @@ -1658,6 +1745,7 @@ static inline void __run_timers(struct timer_base *base) if (!time_after_eq(jiffies, base->clk)) return; + timer_base_lock_expiry(base); raw_spin_lock_irq(&base->lock); /* @@ -1684,8 +1772,8 @@ static inline void __run_timers(struct timer_base *base) while (levels--) expire_timers(base, heads + levels); } - base->running_timer = NULL; raw_spin_unlock_irq(&base->lock); + timer_base_unlock_expiry(base); } /* @@ -1930,6 +2018,7 @@ static void __init init_timer_cpu(int cpu) base->cpu = cpu; raw_spin_lock_init(&base->lock); base->clk = jiffies; + timer_base_init_expiry_lock(base); } } diff --git a/kernel/time/vsyscall.c b/kernel/time/vsyscall.c index 8cf3596a4ce6..9577c89179cd 100644 --- a/kernel/time/vsyscall.c +++ b/kernel/time/vsyscall.c @@ -17,7 +17,7 @@ static inline void update_vdso_data(struct vdso_data *vdata, struct timekeeper *tk) { struct vdso_timestamp *vdso_ts; - u64 nsec; + u64 nsec, sec; vdata[CS_HRES_COARSE].cycle_last = tk->tkr_mono.cycle_last; vdata[CS_HRES_COARSE].mask = tk->tkr_mono.mask; @@ -28,11 +28,6 @@ static inline void update_vdso_data(struct vdso_data *vdata, vdata[CS_RAW].mult = tk->tkr_raw.mult; vdata[CS_RAW].shift = tk->tkr_raw.shift; - /* CLOCK_REALTIME */ - vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_REALTIME]; - vdso_ts->sec = tk->xtime_sec; - vdso_ts->nsec = tk->tkr_mono.xtime_nsec; - /* CLOCK_MONOTONIC */ vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_MONOTONIC]; vdso_ts->sec = tk->xtime_sec + tk->wall_to_monotonic.tv_sec; @@ -45,33 +40,31 @@ static inline void update_vdso_data(struct vdso_data *vdata, } vdso_ts->nsec = nsec; - /* CLOCK_MONOTONIC_RAW */ - vdso_ts = &vdata[CS_RAW].basetime[CLOCK_MONOTONIC_RAW]; - vdso_ts->sec = tk->raw_sec; - vdso_ts->nsec = tk->tkr_raw.xtime_nsec; + /* Copy MONOTONIC time for BOOTTIME */ + sec = vdso_ts->sec; + /* Add the boot offset */ + sec += tk->monotonic_to_boot.tv_sec; + nsec += (u64)tk->monotonic_to_boot.tv_nsec << tk->tkr_mono.shift; /* CLOCK_BOOTTIME */ vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_BOOTTIME]; - vdso_ts->sec = tk->xtime_sec + tk->wall_to_monotonic.tv_sec; - nsec = tk->tkr_mono.xtime_nsec; - nsec += ((u64)(tk->wall_to_monotonic.tv_nsec + - ktime_to_ns(tk->offs_boot)) << tk->tkr_mono.shift); + vdso_ts->sec = sec; + while (nsec >= (((u64)NSEC_PER_SEC) << tk->tkr_mono.shift)) { nsec -= (((u64)NSEC_PER_SEC) << tk->tkr_mono.shift); vdso_ts->sec++; } vdso_ts->nsec = nsec; + /* CLOCK_MONOTONIC_RAW */ + vdso_ts = &vdata[CS_RAW].basetime[CLOCK_MONOTONIC_RAW]; + vdso_ts->sec = tk->raw_sec; + vdso_ts->nsec = tk->tkr_raw.xtime_nsec; + /* CLOCK_TAI */ vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_TAI]; vdso_ts->sec = tk->xtime_sec + (s64)tk->tai_offset; vdso_ts->nsec = tk->tkr_mono.xtime_nsec; - - /* - * Read without the seqlock held by clock_getres(). - * Note: No need to have a second copy. - */ - WRITE_ONCE(vdata[CS_HRES_COARSE].hrtimer_res, hrtimer_resolution); } void update_vsyscall(struct timekeeper *tk) @@ -80,20 +73,17 @@ void update_vsyscall(struct timekeeper *tk) struct vdso_timestamp *vdso_ts; u64 nsec; - if (__arch_update_vdso_data()) { - /* - * Some architectures might want to skip the update of the - * data page. - */ - return; - } - /* copy vsyscall data */ vdso_write_begin(vdata); vdata[CS_HRES_COARSE].clock_mode = __arch_get_clock_mode(tk); vdata[CS_RAW].clock_mode = __arch_get_clock_mode(tk); + /* CLOCK_REALTIME also required for time() */ + vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_REALTIME]; + vdso_ts->sec = tk->xtime_sec; + vdso_ts->nsec = tk->tkr_mono.xtime_nsec; + /* CLOCK_REALTIME_COARSE */ vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_REALTIME_COARSE]; vdso_ts->sec = tk->xtime_sec; @@ -106,7 +96,17 @@ void update_vsyscall(struct timekeeper *tk) nsec = nsec + tk->wall_to_monotonic.tv_nsec; vdso_ts->sec += __iter_div_u64_rem(nsec, NSEC_PER_SEC, &vdso_ts->nsec); - if (__arch_use_vsyscall(vdata)) + /* + * Read without the seqlock held by clock_getres(). + * Note: No need to have a second copy. + */ + WRITE_ONCE(vdata[CS_HRES_COARSE].hrtimer_res, hrtimer_resolution); + + /* + * Architectures can opt out of updating the high resolution part + * of the VDSO. + */ + if (__arch_update_vdso_data()) update_vdso_data(vdata, tk); __arch_update_vsyscall(vdata, tk); @@ -120,10 +120,8 @@ void update_vsyscall_tz(void) { struct vdso_data *vdata = __arch_get_k_vdso_data(); - if (__arch_use_vsyscall(vdata)) { - vdata[CS_HRES_COARSE].tz_minuteswest = sys_tz.tz_minuteswest; - vdata[CS_HRES_COARSE].tz_dsttime = sys_tz.tz_dsttime; - } + vdata[CS_HRES_COARSE].tz_minuteswest = sys_tz.tz_minuteswest; + vdata[CS_HRES_COARSE].tz_dsttime = sys_tz.tz_dsttime; __arch_sync_vdso_data(vdata); } |