diff options
Diffstat (limited to 'kernel')
37 files changed, 4401 insertions, 931 deletions
diff --git a/kernel/Makefile b/kernel/Makefile index 14f4d45e0ae9..ac6b27abb1ad 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -47,6 +47,7 @@ obj-$(CONFIG_GENERIC_HARDIRQS) += irq/ obj-$(CONFIG_SECCOMP) += seccomp.o obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o obj-$(CONFIG_RELAY) += relay.o +obj-$(CONFIG_SYSCTL) += utsname_sysctl.o obj-$(CONFIG_UTS_NS) += utsname.o obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o diff --git a/kernel/fork.c b/kernel/fork.c index 0b6293d94d96..d154cc786489 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -858,7 +858,7 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts init_sigpending(&sig->shared_pending); INIT_LIST_HEAD(&sig->posix_timers); - hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_REL); + hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); sig->it_real_incr.tv64 = 0; sig->real_timer.function = it_real_fn; sig->tsk = tsk; diff --git a/kernel/futex.c b/kernel/futex.c index 5a737de857d3..e749e7df14b1 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1134,7 +1134,7 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec, if (sec != MAX_SCHEDULE_TIMEOUT) { to = &timeout; - hrtimer_init(&to->timer, CLOCK_REALTIME, HRTIMER_ABS); + hrtimer_init(&to->timer, CLOCK_REALTIME, HRTIMER_MODE_ABS); hrtimer_init_sleeper(to, current); to->timer.expires = ktime_set(sec, nsec); } diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index f44e499e8fca..476cb0c0b4a4 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -1,8 +1,9 @@ /* * linux/kernel/hrtimer.c * - * Copyright(C) 2005, Thomas Gleixner <tglx@linutronix.de> - * Copyright(C) 2005, Red Hat, Inc., Ingo Molnar + * Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de> + * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar + * Copyright(C) 2006-2007 Timesys Corp., Thomas Gleixner * * High-resolution kernel timers * @@ -31,12 +32,17 @@ */ #include <linux/cpu.h> +#include <linux/irq.h> #include <linux/module.h> #include <linux/percpu.h> #include <linux/hrtimer.h> #include <linux/notifier.h> #include <linux/syscalls.h> +#include <linux/kallsyms.h> #include <linux/interrupt.h> +#include <linux/tick.h> +#include <linux/seq_file.h> +#include <linux/err.h> #include <asm/uaccess.h> @@ -45,7 +51,7 @@ * * returns the time in ktime_t format */ -static ktime_t ktime_get(void) +ktime_t ktime_get(void) { struct timespec now; @@ -59,7 +65,7 @@ static ktime_t ktime_get(void) * * returns the time in ktime_t format */ -static ktime_t ktime_get_real(void) +ktime_t ktime_get_real(void) { struct timespec now; @@ -79,21 +85,22 @@ EXPORT_SYMBOL_GPL(ktime_get_real); * This ensures that we capture erroneous accesses to these clock ids * rather than moving them into the range of valid clock id's. */ - -#define MAX_HRTIMER_BASES 2 - -static DEFINE_PER_CPU(struct hrtimer_base, hrtimer_bases[MAX_HRTIMER_BASES]) = +DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) = { + + .clock_base = { - .index = CLOCK_REALTIME, - .get_time = &ktime_get_real, - .resolution = KTIME_REALTIME_RES, - }, - { - .index = CLOCK_MONOTONIC, - .get_time = &ktime_get, - .resolution = KTIME_MONOTONIC_RES, - }, + { + .index = CLOCK_REALTIME, + .get_time = &ktime_get_real, + .resolution = KTIME_LOW_RES, + }, + { + .index = CLOCK_MONOTONIC, + .get_time = &ktime_get, + .resolution = KTIME_LOW_RES, + }, + } }; /** @@ -125,20 +132,35 @@ EXPORT_SYMBOL_GPL(ktime_get_ts); * Get the coarse grained time at the softirq based on xtime and * wall_to_monotonic. */ -static void hrtimer_get_softirq_time(struct hrtimer_base *base) +static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base) { ktime_t xtim, tomono; + struct timespec xts; unsigned long seq; do { seq = read_seqbegin(&xtime_lock); - xtim = timespec_to_ktime(xtime); - tomono = timespec_to_ktime(wall_to_monotonic); - +#ifdef CONFIG_NO_HZ + getnstimeofday(&xts); +#else + xts = xtime; +#endif } while (read_seqretry(&xtime_lock, seq)); - base[CLOCK_REALTIME].softirq_time = xtim; - base[CLOCK_MONOTONIC].softirq_time = ktime_add(xtim, tomono); + xtim = timespec_to_ktime(xts); + tomono = timespec_to_ktime(wall_to_monotonic); + base->clock_base[CLOCK_REALTIME].softirq_time = xtim; + base->clock_base[CLOCK_MONOTONIC].softirq_time = + ktime_add(xtim, tomono); +} + +/* + * Helper function to check, whether the timer is running the callback + * function + */ +static inline int hrtimer_callback_running(struct hrtimer *timer) +{ + return timer->state & HRTIMER_STATE_CALLBACK; } /* @@ -147,8 +169,6 @@ static void hrtimer_get_softirq_time(struct hrtimer_base *base) */ #ifdef CONFIG_SMP -#define set_curr_timer(b, t) do { (b)->curr_timer = (t); } while (0) - /* * We are using hashed locking: holding per_cpu(hrtimer_bases)[n].lock * means that all timers which are tied to this base via timer->base are @@ -161,19 +181,20 @@ static void hrtimer_get_softirq_time(struct hrtimer_base *base) * possible to set timer->base = NULL and drop the lock: the timer remains * locked. */ -static struct hrtimer_base *lock_hrtimer_base(const struct hrtimer *timer, - unsigned long *flags) +static +struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer, + unsigned long *flags) { - struct hrtimer_base *base; + struct hrtimer_clock_base *base; for (;;) { base = timer->base; if (likely(base != NULL)) { - spin_lock_irqsave(&base->lock, *flags); + spin_lock_irqsave(&base->cpu_base->lock, *flags); if (likely(base == timer->base)) return base; /* The timer has migrated to another CPU: */ - spin_unlock_irqrestore(&base->lock, *flags); + spin_unlock_irqrestore(&base->cpu_base->lock, *flags); } cpu_relax(); } @@ -182,12 +203,14 @@ static struct hrtimer_base *lock_hrtimer_base(const struct hrtimer *timer, /* * Switch the timer base to the current CPU when possible. */ -static inline struct hrtimer_base * -switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_base *base) +static inline struct hrtimer_clock_base * +switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base) { - struct hrtimer_base *new_base; + struct hrtimer_clock_base *new_base; + struct hrtimer_cpu_base *new_cpu_base; - new_base = &__get_cpu_var(hrtimer_bases)[base->index]; + new_cpu_base = &__get_cpu_var(hrtimer_bases); + new_base = &new_cpu_base->clock_base[base->index]; if (base != new_base) { /* @@ -199,13 +222,13 @@ switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_base *base) * completed. There is no conflict as we hold the lock until * the timer is enqueued. */ - if (unlikely(base->curr_timer == timer)) + if (unlikely(hrtimer_callback_running(timer))) return base; /* See the comment in lock_timer_base() */ timer->base = NULL; - spin_unlock(&base->lock); - spin_lock(&new_base->lock); + spin_unlock(&base->cpu_base->lock); + spin_lock(&new_base->cpu_base->lock); timer->base = new_base; } return new_base; @@ -213,19 +236,17 @@ switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_base *base) #else /* CONFIG_SMP */ -#define set_curr_timer(b, t) do { } while (0) - -static inline struct hrtimer_base * +static inline struct hrtimer_clock_base * lock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags) { - struct hrtimer_base *base = timer->base; + struct hrtimer_clock_base *base = timer->base; - spin_lock_irqsave(&base->lock, *flags); + spin_lock_irqsave(&base->cpu_base->lock, *flags); return base; } -#define switch_hrtimer_base(t, b) (b) +# define switch_hrtimer_base(t, b) (b) #endif /* !CONFIG_SMP */ @@ -256,15 +277,12 @@ ktime_t ktime_add_ns(const ktime_t kt, u64 nsec) return ktime_add(kt, tmp); } - -#else /* CONFIG_KTIME_SCALAR */ - # endif /* !CONFIG_KTIME_SCALAR */ /* * Divide a ktime value by a nanosecond value */ -static unsigned long ktime_divns(const ktime_t kt, s64 div) +unsigned long ktime_divns(const ktime_t kt, s64 div) { u64 dclc, inc, dns; int sft = 0; @@ -281,18 +299,311 @@ static unsigned long ktime_divns(const ktime_t kt, s64 div) return (unsigned long) dclc; } - -#else /* BITS_PER_LONG < 64 */ -# define ktime_divns(kt, div) (unsigned long)((kt).tv64 / (div)) #endif /* BITS_PER_LONG >= 64 */ +/* High resolution timer related functions */ +#ifdef CONFIG_HIGH_RES_TIMERS + +/* + * High resolution timer enabled ? + */ +static int hrtimer_hres_enabled __read_mostly = 1; + +/* + * Enable / Disable high resolution mode + */ +static int __init setup_hrtimer_hres(char *str) +{ + if (!strcmp(str, "off")) + hrtimer_hres_enabled = 0; + else if (!strcmp(str, "on")) + hrtimer_hres_enabled = 1; + else + return 0; + return 1; +} + +__setup("highres=", setup_hrtimer_hres); + +/* + * hrtimer_high_res_enabled - query, if the highres mode is enabled + */ +static inline int hrtimer_is_hres_enabled(void) +{ + return hrtimer_hres_enabled; +} + +/* + * Is the high resolution mode active ? + */ +static inline int hrtimer_hres_active(void) +{ + return __get_cpu_var(hrtimer_bases).hres_active; +} + +/* + * Reprogram the event source with checking both queues for the + * next event + * Called with interrupts disabled and base->lock held + */ +static void hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base) +{ + int i; + struct hrtimer_clock_base *base = cpu_base->clock_base; + ktime_t expires; + + cpu_base->expires_next.tv64 = KTIME_MAX; + + for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) { + struct hrtimer *timer; + + if (!base->first) + continue; + timer = rb_entry(base->first, struct hrtimer, node); + expires = ktime_sub(timer->expires, base->offset); + if (expires.tv64 < cpu_base->expires_next.tv64) + cpu_base->expires_next = expires; + } + + if (cpu_base->expires_next.tv64 != KTIME_MAX) + tick_program_event(cpu_base->expires_next, 1); +} + +/* + * Shared reprogramming for clock_realtime and clock_monotonic + * + * When a timer is enqueued and expires earlier than the already enqueued + * timers, we have to check, whether it expires earlier than the timer for + * which the clock event device was armed. + * + * Called with interrupts disabled and base->cpu_base.lock held + */ +static int hrtimer_reprogram(struct hrtimer *timer, + struct hrtimer_clock_base *base) +{ + ktime_t *expires_next = &__get_cpu_var(hrtimer_bases).expires_next; + ktime_t expires = ktime_sub(timer->expires, base->offset); + int res; + + /* + * When the callback is running, we do not reprogram the clock event + * device. The timer callback is either running on a different CPU or + * the callback is executed in the hrtimer_interupt context. The + * reprogramming is handled either by the softirq, which called the + * callback or at the end of the hrtimer_interrupt. + */ + if (hrtimer_callback_running(timer)) + return 0; + + if (expires.tv64 >= expires_next->tv64) + return 0; + + /* + * Clockevents returns -ETIME, when the event was in the past. + */ + res = tick_program_event(expires, 0); + if (!IS_ERR_VALUE(res)) + *expires_next = expires; + return res; +} + + +/* + * Retrigger next event is called after clock was set + * + * Called with interrupts disabled via on_each_cpu() + */ +static void retrigger_next_event(void *arg) +{ + struct hrtimer_cpu_base *base; + struct timespec realtime_offset; + unsigned long seq; + + if (!hrtimer_hres_active()) + return; + + do { + seq = read_seqbegin(&xtime_lock); + set_normalized_timespec(&realtime_offset, + -wall_to_monotonic.tv_sec, + -wall_to_monotonic.tv_nsec); + } while (read_seqretry(&xtime_lock, seq)); + + base = &__get_cpu_var(hrtimer_bases); + + /* Adjust CLOCK_REALTIME offset */ + spin_lock(&base->lock); + base->clock_base[CLOCK_REALTIME].offset = + timespec_to_ktime(realtime_offset); + + hrtimer_force_reprogram(base); + spin_unlock(&base->lock); +} + +/* + * Clock realtime was set + * + * Change the offset of the realtime clock vs. the monotonic + * clock. + * + * We might have to reprogram the high resolution timer interrupt. On + * SMP we call the architecture specific code to retrigger _all_ high + * resolution timer interrupts. On UP we just disable interrupts and + * call the high resolution interrupt code. + */ +void clock_was_set(void) +{ + /* Retrigger the CPU local events everywhere */ + on_each_cpu(retrigger_next_event, NULL, 0, 1); +} + +/* + * Check, whether the timer is on the callback pending list + */ +static inline int hrtimer_cb_pending(const struct hrtimer *timer) +{ + return timer->state & HRTIMER_STATE_PENDING; +} + +/* + * Remove a timer from the callback pending list + */ +static inline void hrtimer_remove_cb_pending(struct hrtimer *timer) +{ + list_del_init(&timer->cb_entry); +} + +/* + * Initialize the high resolution related parts of cpu_base + */ +static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) +{ + base->expires_next.tv64 = KTIME_MAX; + base->hres_active = 0; + INIT_LIST_HEAD(&base->cb_pending); +} + +/* + * Initialize the high resolution related parts of a hrtimer + */ +static inline void hrtimer_init_timer_hres(struct hrtimer *timer) +{ + INIT_LIST_HEAD(&timer->cb_entry); +} + +/* + * When High resolution timers are active, try to reprogram. Note, that in case + * the state has HRTIMER_STATE_CALLBACK set, no reprogramming and no expiry + * check happens. The timer gets enqueued into the rbtree. The reprogramming + * and expiry check is done in the hrtimer_interrupt or in the softirq. + */ +static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, + struct hrtimer_clock_base *base) +{ + if (base->cpu_base->hres_active && hrtimer_reprogram(timer, base)) { + + /* Timer is expired, act upon the callback mode */ + switch(timer->cb_mode) { + case HRTIMER_CB_IRQSAFE_NO_RESTART: + /* + * We can call the callback from here. No restart + * happens, so no danger of recursion + */ + BUG_ON(timer->function(timer) != HRTIMER_NORESTART); + return 1; + case HRTIMER_CB_IRQSAFE_NO_SOFTIRQ: + /* + * This is solely for the sched tick emulation with + * dynamic tick support to ensure that we do not + * restart the tick right on the edge and end up with + * the tick timer in the softirq ! The calling site + * takes care of this. + */ + return 1; + case HRTIMER_CB_IRQSAFE: + case HRTIMER_CB_SOFTIRQ: + /* + * Move everything else into the softirq pending list ! + */ + list_add_tail(&timer->cb_entry, + &base->cpu_base->cb_pending); + timer->state = HRTIMER_STATE_PENDING; + raise_softirq(HRTIMER_SOFTIRQ); + return 1; + default: + BUG(); + } + } + return 0; +} + +/* + * Switch to high resolution mode + */ +static void hrtimer_switch_to_hres(void) +{ + struct hrtimer_cpu_base *base = &__get_cpu_var(hrtimer_bases); + unsigned long flags; + + if (base->hres_active) + return; + + local_irq_save(flags); + + if (tick_init_highres()) { + local_irq_restore(flags); + return; + } + base->hres_active = 1; + base->clock_base[CLOCK_REALTIME].resolution = KTIME_HIGH_RES; + base->clock_base[CLOCK_MONOTONIC].resolution = KTIME_HIGH_RES; + + tick_setup_sched_timer(); + + /* "Retrigger" the interrupt to get things going */ + retrigger_next_event(NULL); + local_irq_restore(flags); + printk(KERN_INFO "Switched to high resolution mode on CPU %d\n", + smp_processor_id()); +} + +#else + +static inline int hrtimer_hres_active(void) { return 0; } +static inline int hrtimer_is_hres_enabled(void) { return 0; } +static inline void hrtimer_switch_to_hres(void) { } +static inline void hrtimer_force_reprogram(struct hrtimer_cpu_base *base) { } +static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, + struct hrtimer_clock_base *base) +{ + return 0; +} +static inline int hrtimer_cb_pending(struct hrtimer *timer) { return 0; } +static inline void hrtimer_remove_cb_pending(struct hrtimer *timer) { } +static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) { } +static inline void hrtimer_init_timer_hres(struct hrtimer *timer) { } + +#endif /* CONFIG_HIGH_RES_TIMERS */ + +#ifdef CONFIG_TIMER_STATS +void __timer_stats_hrtimer_set_start_info(struct hrtimer *timer, void *addr) +{ + if (timer->start_site) + return; + + timer->start_site = addr; + memcpy(timer->start_comm, current->comm, TASK_COMM_LEN); + timer->start_pid = current->pid; +} +#endif + /* * Counterpart to lock_timer_base above: */ static inline void unlock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags) { - spin_unlock_irqrestore(&timer->base->lock, *flags); + spin_unlock_irqrestore(&timer->base->cpu_base->lock, *flags); } /** @@ -342,7 +653,8 @@ hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval) * The timer is inserted in expiry order. Insertion into the * red black tree is O(log(n)). Must hold the base lock. */ -static void enqueue_hrtimer(struct hrtimer *timer, struct hrtimer_base *base) +static void enqueue_hrtimer(struct hrtimer *timer, + struct hrtimer_clock_base *base, int reprogram) { struct rb_node **link = &base->active.rb_node; struct rb_node *parent = NULL; @@ -368,39 +680,85 @@ static void enqueue_hrtimer(struct hrtimer *timer, struct hrtimer_base *base) * Insert the timer to the rbtree and check whether it * replaces the first pending timer */ - rb_link_node(&timer->node, parent, link); - rb_insert_color(&timer->node, &base->active); - if (!base->first || timer->expires.tv64 < - rb_entry(base->first, struct hrtimer, node)->expires.tv64) + rb_entry(base->first, struct hrtimer, node)->expires.tv64) { + /* + * Reprogram the clock event device. When the timer is already + * expired hrtimer_enqueue_reprogram has either called the + * callback or added it to the pending list and raised the + * softirq. + * + * This is a NOP for !HIGHRES + */ + if (reprogram && hrtimer_enqueue_reprogram(timer, base)) + return; + base->first = &timer->node; + } + + rb_link_node(&timer->node, parent, link); + rb_insert_color(&timer->node, &base->active); + /* + * HRTIMER_STATE_ENQUEUED is or'ed to the current state to preserve the + * state of a possibly running callback. + */ + timer->state |= HRTIMER_STATE_ENQUEUED; } /* * __remove_hrtimer - internal function to remove a timer * * Caller must hold the base lock. + * + * High resolution timer mode reprograms the clock event device when the + * timer is the one which expires next. The caller can disable this by setting + * reprogram to zero. This is useful, when the context does a reprogramming + * anyway (e.g. timer interrupt) */ -static void __remove_hrtimer(struct hrtimer *timer, struct hrtimer_base *base) +static void __remove_hrtimer(struct hrtimer *timer, + struct hrtimer_clock_base *base, + unsigned long newstate, int reprogram) { - /* - * Remove the timer from the rbtree and replace the - * first entry pointer if necessary. - */ - if (base->first == &timer->node) - base->first = rb_next(&timer->node); - rb_erase(&timer->node, &base->active); - rb_set_parent(&timer->node, &timer->node); + /* High res. callback list. NOP for !HIGHRES */ + if (hrtimer_cb_pending(timer)) + hrtimer_remove_cb_pending(timer); + else { + /* + * Remove the timer from the rbtree and replace the + * first entry pointer if necessary. + */ + if (base->first == &timer->node) { + base->first = rb_next(&timer->node); + /* Reprogram the clock event device. if enabled */ + if (reprogram && hrtimer_hres_active()) + hrtimer_force_reprogram(base->cpu_base); + } + rb_erase(&timer->node, &base->active); + } + timer->state = newstate; } /* * remove hrtimer, called with base lock held */ static inline int -remove_hrtimer(struct hrtimer *timer, struct hrtimer_base *base) +remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base) { - if (hrtimer_active(timer)) { - __remove_hrtimer(timer, base); + if (hrtimer_is_queued(timer)) { + int reprogram; + + /* + * Remove the timer and force reprogramming when high + * resolution mode is active and the timer is on the current + * CPU. If we remove a timer on another CPU, reprogramming is + * skipped. The interrupt event on this CPU is fired and + * reprogramming happens in the interrupt handler. This is a + * rare case and less expensive than a smp call. + */ + timer_stats_hrtimer_clear_start_info(timer); + reprogram = base->cpu_base == &__get_cpu_var(hrtimer_bases); + __remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE, + reprogram); return 1; } return 0; @@ -419,7 +777,7 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_base *base) int hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode) { - struct hrtimer_base *base, *new_base; + struct hrtimer_clock_base *base, *new_base; unsigned long flags; int ret; @@ -431,7 +789,7 @@ hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode) /* Switch the timer base, if necessary: */ new_base = switch_hrtimer_base(timer, base); - if (mode == HRTIMER_REL) { + if (mode == HRTIMER_MODE_REL) { tim = ktime_add(tim, new_base->get_time()); /* * CONFIG_TIME_LOW_RES is a temporary way for architectures @@ -446,7 +804,9 @@ hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode) } timer->expires = tim; - enqueue_hrtimer(timer, new_base); + timer_stats_hrtimer_set_start_info(timer); + + enqueue_hrtimer(timer, new_base, base == new_base); unlock_hrtimer_base(timer, &flags); @@ -466,13 +826,13 @@ EXPORT_SYMBOL_GPL(hrtimer_start); */ int hrtimer_try_to_cancel(struct hrtimer *timer) { - struct hrtimer_base *base; + struct hrtimer_clock_base *base; unsigned long flags; int ret = -1; base = lock_hrtimer_base(timer, &flags); - if (base->curr_timer != timer) + if (!hrtimer_callback_running(timer)) ret = remove_hrtimer(timer, base); unlock_hrtimer_base(timer, &flags); @@ -508,19 +868,19 @@ EXPORT_SYMBOL_GPL(hrtimer_cancel); */ ktime_t hrtimer_get_remaining(const struct hrtimer *timer) { - struct hrtimer_base *base; + struct hrtimer_clock_base *base; unsigned long flags; ktime_t rem; base = lock_hrtimer_base(timer, &flags); - rem = ktime_sub(timer->expires, timer->base->get_time()); + rem = ktime_sub(timer->expires, base->get_time()); unlock_hrtimer_base(timer, &flags); return rem; } EXPORT_SYMBOL_GPL(hrtimer_get_remaining); -#ifdef CONFIG_NO_IDLE_HZ +#if defined(CONFIG_NO_IDLE_HZ) || defined(CONFIG_NO_HZ) /** * hrtimer_get_next_event - get the time until next expiry event * @@ -529,26 +889,31 @@ EXPORT_SYMBOL_GPL(hrtimer_get_remaining); */ ktime_t hrtimer_get_next_event(void) { - struct hrtimer_base *base = __get_cpu_var(hrtimer_bases); + struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); + struct hrtimer_clock_base *base = cpu_base->clock_base; ktime_t delta, mindelta = { .tv64 = KTIME_MAX }; unsigned long flags; int i; - for (i = 0; i < MAX_HRTIMER_BASES; i++, base++) { - struct hrtimer *timer; + spin_lock_irqsave(&cpu_base->lock, flags); - spin_lock_irqsave(&base->lock, flags); - if (!base->first) { - spin_unlock_irqrestore(&base->lock, flags); - continue; + if (!hrtimer_hres_active()) { + for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) { + struct hrtimer *timer; + + if (!base->first) + continue; + + timer = rb_entry(base->first, struct hrtimer, node); + delta.tv64 = timer->expires.tv64; + delta = ktime_sub(delta, base->get_time()); + if (delta.tv64 < mindelta.tv64) + mindelta.tv64 = delta.tv64; } - timer = rb_entry(base->first, struct hrtimer, node); - delta.tv64 = timer->expires.tv64; - spin_unlock_irqrestore(&base->lock, flags); - delta = ktime_sub(delta, base->get_time()); - if (delta.tv64 < mindelta.tv64) - mindelta.tv64 = delta.tv64; } + + spin_unlock_irqrestore(&cpu_base->lock, flags); + if (mindelta.tv64 < 0) mindelta.tv64 = 0; return mindelta; @@ -564,17 +929,23 @@ ktime_t hrtimer_get_next_event(void) void hrtimer_init(struct hrtimer *timer, clockid_t clock_id, enum hrtimer_mode mode) { - struct hrtimer_base *bases; + struct hrtimer_cpu_base *cpu_base; memset(timer, 0, sizeof(struct hrtimer)); - bases = __raw_get_cpu_var(hrtimer_bases); + cpu_base = &__raw_get_cpu_var(hrtimer_bases); - if (clock_id == CLOCK_REALTIME && mode != HRTIMER_ABS) + if (clock_id == CLOCK_REALTIME && mode != HRTIMER_MODE_ABS) clock_id = CLOCK_MONOTONIC; - timer->base = &bases[clock_id]; - rb_set_parent(&timer->node, &timer->node); + timer->base = &cpu_base->clock_base[clock_id]; + hrtimer_init_timer_hres(timer); + +#ifdef CONFIG_TIMER_STATS + timer->start_site = NULL; + timer->start_pid = -1; + memset(timer->start_comm, 0, TASK_COMM_LEN); +#endif } EXPORT_SYMBOL_GPL(hrtimer_init); @@ -588,21 +959,159 @@ EXPORT_SYMBOL_GPL(hrtimer_init); */ int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp) { - struct hrtimer_base *bases; + struct hrtimer_cpu_base *cpu_base; - bases = __raw_get_cpu_var(hrtimer_bases); - *tp = ktime_to_timespec(bases[which_clock].resolution); + cpu_base = &__raw_get_cpu_var(hrtimer_bases); + *tp = ktime_to_timespec(cpu_base->clock_base[which_clock].resolution); return 0; } EXPORT_SYMBOL_GPL(hrtimer_get_res); +#ifdef CONFIG_HIGH_RES_TIMERS + +/* + * High resolution timer interrupt + * Called with interrupts disabled + */ +void hrtimer_interrupt(struct clock_event_device *dev) +{ + struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); + struct hrtimer_clock_base *base; + ktime_t expires_next, now; + int i, raise = 0; + + BUG_ON(!cpu_base->hres_active); + cpu_base->nr_events++; + dev->next_event.tv64 = KTIME_MAX; + + retry: + now = ktime_get(); + + expires_next.tv64 = KTIME_MAX; + + base = cpu_base->clock_base; + + for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { + ktime_t basenow; + struct rb_node *node; + + spin_lock(&cpu_base->lock); + + basenow = ktime_add(now, base->offset); + + while ((node = base->first)) { + struct hrtimer *timer; + + timer = rb_entry(node, struct hrtimer, node); + + if (basenow.tv64 < timer->expires.tv64) { + ktime_t expires; + + expires = ktime_sub(timer->expires, + base->offset); + if (expires.tv64 < expires_next.tv64) + expires_next = expires; + break; + } + + /* Move softirq callbacks to the pending list */ + if (timer->cb_mode == HRTIMER_CB_SOFTIRQ) { + __remove_hrtimer(timer, base, + HRTIMER_STATE_PENDING, 0); + list_add_tail(&timer->cb_entry, + &base->cpu_base->cb_pending); + raise = 1; + continue; + } + + __remove_hrtimer(timer, base, + HRTIMER_STATE_CALLBACK, 0); + timer_stats_account_hrtimer(timer); + + /* + * Note: We clear the CALLBACK bit after + * enqueue_hrtimer to avoid reprogramming of + * the event hardware. This happens at the end + * of this function anyway. + */ + if (timer->function(timer) != HRTIMER_NORESTART) { + BUG_ON(timer->state != HRTIMER_STATE_CALLBACK); + enqueue_hrtimer(timer, base, 0); + } + timer->state &= ~HRTIMER_STATE_CALLBACK; + } + spin_unlock(&cpu_base->lock); + base++; + } + + cpu_base->expires_next = expires_next; + + /* Reprogramming necessary ? */ + if (expires_next.tv64 != KTIME_MAX) { + if (tick_program_event(expires_next, 0)) + goto retry; + } + + /* Raise softirq ? */ + if (raise) + raise_softirq(HRTIMER_SOFTIRQ); +} + +static void run_hrtimer_softirq(struct softirq_action *h) +{ + struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); + + spin_lock_irq(&cpu_base->lock); + + while (!list_empty(&cpu_base->cb_pending)) { + enum hrtimer_restart (*fn)(struct hrtimer *); + struct hrtimer *timer; + int restart; + + timer = list_entry(cpu_base->cb_pending.next, + struct hrtimer, cb_entry); + + timer_stats_account_hrtimer(timer); + + fn = timer->function; + __remove_hrtimer(timer, timer->base, HRTIMER_STATE_CALLBACK, 0); + spin_unlock_irq(&cpu_base->lock); + + restart = fn(timer); + + spin_lock_irq(&cpu_base->lock); + + timer->state &= ~HRTIMER_STATE_CALLBACK; + if (restart == HRTIMER_RESTART) { + BUG_ON(hrtimer_active(timer)); + /* + * Enqueue the timer, allow reprogramming of the event + * device + */ + enqueue_hrtimer(timer, timer->base, 1); + } else if (hrtimer_active(timer)) { + /* + * If the timer was rearmed on another CPU, reprogram + * the event device. + */ + if (timer->base->first == &timer->node) + hrtimer_reprogram(timer, timer->base); + } + } + spin_unlock_irq(&cpu_base->lock); +} + +#endif /* CONFIG_HIGH_RES_TIMERS */ + /* * Expire the per base hrtimer-queue: */ -static inline void run_hrtimer_queue(struct hrtimer_base *base) +static inline void run_hrtimer_queue(struct hrtimer_cpu_base *cpu_base, + int index) { struct rb_node *node; + struct hrtimer_clock_base *base = &cpu_base->clock_base[index]; if (!base->first) return; @@ -610,53 +1119,72 @@ static inline void run_hrtimer_queue(struct hrtimer_base *base) if (base->get_softirq_time) base->softirq_time = base->get_softirq_time(); - spin_lock_irq(&base->lock); + spin_lock_irq(&cpu_base->lock); while ((node = base->first)) { struct hrtimer *timer; - int (*fn)(struct hrtimer *); + enum hrtimer_restart (*fn)(struct hrtimer *); int restart; timer = rb_entry(node, struct hrtimer, node); if (base->softirq_time.tv64 <= timer->expires.tv64) break; + timer_stats_account_hrtimer(timer); + fn = timer->function; - set_curr_timer(base, timer); - __remove_hrtimer(timer, base); - spin_unlock_irq(&base->lock); + __remove_hrtimer(timer, base, HRTIMER_STATE_CALLBACK, 0); + spin_unlock_irq(&cpu_base->lock); restart = fn(timer); - spin_lock_irq(&base->lock); + spin_lock_irq(&cpu_base->lock); + timer->state &= ~HRTIMER_STATE_CALLBACK; if (restart != HRTIMER_NORESTART) { BUG_ON(hrtimer_active(timer)); - enqueue_hrtimer(timer, base); + enqueue_hrtimer(timer, base, 0); } } - set_curr_timer(base, NULL); - spin_unlock_irq(&base->lock); + spin_unlock_irq(&cpu_base->lock); } /* * Called from timer softirq every jiffy, expire hrtimers: + * + * For HRT its the fall back code to run the softirq in the timer + * softirq context in case the hrtimer initialization failed or has + * not been done yet. */ void hrtimer_run_queues(void) { - struct hrtimer_base *base = __get_cpu_var(hrtimer_bases); + struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); int i; - hrtimer_get_softirq_time(base); + if (hrtimer_hres_active()) + return; + + /* + * This _is_ ugly: We have to check in the softirq context, + * whether we can switch to highres and / or nohz mode. The + * clocksource switch happens in the timer interrupt with + * xtime_lock held. Notification from there only sets the + * check bit in the tick_oneshot code, otherwise we might + * deadlock vs. xtime_lock. + */ + if (tick_check_oneshot_change(!hrtimer_is_hres_enabled())) + hrtimer_switch_to_hres(); - for (i = 0; i < MAX_HRTIMER_BASES; i++) - run_hrtimer_queue(&base[i]); + hrtimer_get_softirq_time(cpu_base); + + for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) + run_hrtimer_queue(cpu_base, i); } /* * Sleep related functions: */ -static int hrtimer_wakeup(struct hrtimer *timer) +static enum hrtimer_restart hrtimer_wakeup(struct hrtimer *timer) { struct hrtimer_sleeper *t = container_of(timer, struct hrtimer_sleeper, timer); @@ -673,6 +1201,9 @@ void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, struct task_struct *task) { sl->timer.function = hrtimer_wakeup; sl->task = task; +#ifdef CONFIG_HIGH_RES_TIMERS + sl->timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_RESTART; +#endif } static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mode) @@ -683,10 +1214,11 @@ static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mod set_current_state(TASK_INTERRUPTIBLE); hrtimer_start(&t->timer, t->timer.expires, mode); - schedule(); + if (likely(t->task)) + schedule(); hrtimer_cancel(&t->timer); - mode = HRTIMER_ABS; + mode = HRTIMER_MODE_ABS; } while (t->task && !signal_pending(current)); @@ -702,10 +1234,10 @@ long __sched hrtimer_nanosleep_restart(struct restart_block *restart) restart->fn = do_no_restart_syscall; - hrtimer_init(&t.timer, restart->arg0, HRTIMER_ABS); + hrtimer_init(&t.timer, restart->arg0, HRTIMER_MODE_ABS); t.timer.expires.tv64 = ((u64)restart->arg3 << 32) | (u64) restart->arg2; - if (do_nanosleep(&t, HRTIMER_ABS)) + if (do_nanosleep(&t, HRTIMER_MODE_ABS)) return 0; rmtp = (struct timespec __user *) restart->arg1; @@ -738,7 +1270,7 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp, return 0; /* Absolute timers do not update the rmtp value and restart: */ - if (mode == HRTIMER_ABS) + if (mode == HRTIMER_MODE_ABS) return -ERESTARTNOHAND; if (rmtp) { @@ -771,7 +1303,7 @@ sys_nanosleep(struct timespec __user *rqtp, struct timespec __user *rmtp) if (!timespec_valid(&tu)) return -EINVAL; - return hrtimer_nanosleep(&tu, rmtp, HRTIMER_REL, CLOCK_MONOTONIC); + return hrtimer_nanosleep(&tu, rmtp, HRTIMER_MODE_REL, CLOCK_MONOTONIC); } /* @@ -779,56 +1311,60 @@ sys_nanosleep(struct timespec __user *rqtp, struct timespec __user *rmtp) */ static void __devinit init_hrtimers_cpu(int cpu) { - struct hrtimer_base *base = per_cpu(hrtimer_bases, cpu); + struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu); int i; - for (i = 0; i < MAX_HRTIMER_BASES; i++, base++) { - spin_lock_init(&base->lock); - lockdep_set_class(&base->lock, &base->lock_key); - } + spin_lock_init(&cpu_base->lock); + lockdep_set_class(&cpu_base->lock, &cpu_base->lock_key); + + for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) + cpu_base->clock_base[i].cpu_base = cpu_base; + + hrtimer_init_hres(cpu_base); } #ifdef CONFIG_HOTPLUG_CPU -static void migrate_hrtimer_list(struct hrtimer_base *old_base, - struct hrtimer_base *new_base) +static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base, + struct hrtimer_clock_base *new_base) { struct hrtimer *timer; struct rb_node *node; while ((node = rb_first(&old_base->active))) { timer = rb_entry(node, struct hrtimer, node); - __remove_hrtimer(timer, old_base); + BUG_ON(hrtimer_callback_running(timer)); + __remove_hrtimer(timer, old_base, HRTIMER_STATE_INACTIVE, 0); timer->base = new_base; - enqueue_hrtimer(timer, new_base); + /* + * Enqueue the timer. Allow reprogramming of the event device + */ + enqueue_hrtimer(timer, new_base, 1); } } static void migrate_hrtimers(int cpu) { - struct hrtimer_base *old_base, *new_base; + struct hrtimer_cpu_base *old_base, *new_base; int i; BUG_ON(cpu_online(cpu)); - old_base = per_cpu(hrtimer_bases, cpu); - new_base = get_cpu_var(hrtimer_bases); - - local_irq_disable(); + old_base = &per_cpu(hrtimer_bases, cpu); + new_base = &get_cpu_var(hrtimer_bases); - for (i = 0; i < MAX_HRTIMER_BASES; i++) { + tick_cancel_sched_timer(cpu); - spin_lock(&new_base->lock); - spin_lock(&old_base->lock); - - BUG_ON(old_base->curr_timer); + local_irq_disable(); - migrate_hrtimer_list(old_base, new_base); + spin_lock(&new_base->lock); + spin_lock(&old_base->lock); - spin_unlock(&old_base->lock); - spin_unlock(&new_base->lock); - old_base++; - new_base++; + for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { + migrate_hrtimer_list(&old_base->clock_base[i], + &new_base->clock_base[i]); } + spin_unlock(&old_base->lock); + spin_unlock(&new_base->lock); local_irq_enable(); put_cpu_var(hrtimer_bases); @@ -848,6 +1384,7 @@ static int __cpuinit hrtimer_cpu_notify(struct notifier_block *self, #ifdef CONFIG_HOTPLUG_CPU case CPU_DEAD: + clockevents_notify(CLOCK_EVT_NOTIFY_CPU_DEAD, &cpu); migrate_hrtimers(cpu); break; #endif @@ -868,5 +1405,8 @@ void __init hrtimers_init(void) hrtimer_cpu_notify(&hrtimers_nb, (unsigned long)CPU_UP_PREPARE, (void *)(long)smp_processor_id()); register_cpu_notifier(&hrtimers_nb); +#ifdef CONFIG_HIGH_RES_TIMERS + open_softirq(HRTIMER_SOFTIRQ, run_hrtimer_softirq, NULL); +#endif } diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 475e8a71bcdc..0133f4f9e9f0 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -168,7 +168,7 @@ EXPORT_SYMBOL(set_irq_data); /** * set_irq_data - set irq type data for an irq * @irq: Interrupt number - * @data: Pointer to interrupt specific data + * @entry: Pointer to MSI descriptor data * * Set the hardware irq controller data for an irq */ @@ -230,10 +230,6 @@ static void default_enable(unsigned int irq) */ static void default_disable(unsigned int irq) { - struct irq_desc *desc = irq_desc + irq; - - if (!(desc->status & IRQ_DELAYED_DISABLE)) - desc->chip->mask(irq); } /* @@ -298,13 +294,18 @@ handle_simple_irq(unsigned int irq, struct irq_desc *desc) if (unlikely(desc->status & IRQ_INPROGRESS)) goto out_unlock; - desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); kstat_cpu(cpu).irqs[irq]++; action = desc->action; - if (unlikely(!action || (desc->status & IRQ_DISABLED))) + if (unlikely(!action || (desc->status & IRQ_DISABLED))) { + if (desc->chip->mask) + desc->chip->mask(irq); + desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); + desc->status |= IRQ_PENDING; goto out_unlock; + } + desc->status &= ~(IRQ_REPLAY | IRQ_WAITING | IRQ_PENDING); desc->status |= IRQ_INPROGRESS; spin_unlock(&desc->lock); @@ -396,11 +397,13 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc) /* * If its disabled or no action available - * keep it masked and get out of here + * then mask it and get out of here: */ action = desc->action; if (unlikely(!action || (desc->status & IRQ_DISABLED))) { desc->status |= IRQ_PENDING; + if (desc->chip->mask) + desc->chip->mask(irq); goto out; } @@ -562,10 +565,8 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, /* Uninstall? */ if (handle == handle_bad_irq) { - if (desc->chip != &no_irq_chip) { - desc->chip->mask(irq); - desc->chip->ack(irq); - } + if (desc->chip != &no_irq_chip) + mask_ack_irq(desc, irq); desc->status |= IRQ_DISABLED; desc->depth = 1; } diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 7c85d69188ef..5597c157442a 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -38,6 +38,46 @@ void synchronize_irq(unsigned int irq) } EXPORT_SYMBOL(synchronize_irq); +/** + * irq_can_set_affinity - Check if the affinity of a given irq can be set + * @irq: Interrupt to check + * + */ +int irq_can_set_affinity(unsigned int irq) +{ + struct irq_desc *desc = irq_desc + irq; + + if (CHECK_IRQ_PER_CPU(desc->status) || !desc->chip || + !desc->chip->set_affinity) + return 0; + + return 1; +} + +/** + * irq_set_affinity - Set the irq affinity of a given irq + * @irq: Interrupt to set affinity + * @cpumask: cpumask + * + */ +int irq_set_affinity(unsigned int irq, cpumask_t cpumask) +{ + struct irq_desc *desc = irq_desc + irq; + + if (!desc->chip->set_affinity) + return -EINVAL; + + set_balance_irq_affinity(irq, cpumask); + +#ifdef CONFIG_GENERIC_PENDING_IRQ + set_pending_irq(irq, cpumask); +#else + desc->affinity = cpumask; + desc->chip->set_affinity(irq, cpumask); +#endif + return 0; +} + #endif /** @@ -281,6 +321,10 @@ int setup_irq(unsigned int irq, struct irqaction *new) if (new->flags & IRQF_PERCPU) desc->status |= IRQ_PER_CPU; #endif + /* Exclude IRQ from balancing */ + if (new->flags & IRQF_NOBALANCING) + desc->status |= IRQ_NO_BALANCING; + if (!shared) { irq_chip_set_defaults(desc->chip); @@ -461,7 +505,7 @@ int request_irq(unsigned int irq, irq_handler_t handler, /* * Lockdep wants atomic interrupt handlers: */ - irqflags |= SA_INTERRUPT; + irqflags |= IRQF_DISABLED; #endif /* * Sanity-check: shared interrupts must pass in a real dev-ID, diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index 6d3be06e8ce6..2db91eb54ad8 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -16,26 +16,6 @@ static struct proc_dir_entry *root_irq_dir; #ifdef CONFIG_SMP -#ifdef CONFIG_GENERIC_PENDING_IRQ -void proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val) -{ - set_balance_irq_affinity(irq, mask_val); - - /* - * Save these away for later use. Re-progam when the - * interrupt is pending - */ - set_pending_irq(irq, mask_val); -} -#else -void proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val) -{ - set_balance_irq_affinity(irq, mask_val); - irq_desc[irq].affinity = mask_val; - irq_desc[irq].chip->set_affinity(irq, mask_val); -} -#endif - static int irq_affinity_read_proc(char *page, char **start, off_t off, int count, int *eof, void *data) { @@ -55,7 +35,7 @@ static int irq_affinity_write_proc(struct file *file, const char __user *buffer, cpumask_t new_value, tmp; if (!irq_desc[irq].chip->set_affinity || no_irq_affinity || - CHECK_IRQ_PER_CPU(irq_desc[irq].status)) + irq_balancing_disabled(irq)) return -EIO; err = cpumask_parse_user(buffer, count, new_value); @@ -73,7 +53,7 @@ static int irq_affinity_write_proc(struct file *file, const char __user *buffer, code to set default SMP affinity. */ return select_smp_affinity(irq) ? -EINVAL : full_count; - proc_set_irq_affinity(irq, new_value); + irq_set_affinity(irq, new_value); return full_count; } diff --git a/kernel/itimer.c b/kernel/itimer.c index 204ed7939e75..307c6a632ef6 100644 --- a/kernel/itimer.c +++ b/kernel/itimer.c @@ -128,18 +128,13 @@ asmlinkage long sys_getitimer(int which, struct itimerval __user *value) /* * The timer is automagically restarted, when interval != 0 */ -int it_real_fn(struct hrtimer *timer) +enum hrtimer_restart it_real_fn(struct hrtimer *timer) { struct signal_struct *sig = container_of(timer, struct signal_struct, real_timer); send_group_sig_info(SIGALRM, SEND_SIG_PRIV, sig->tsk); - if (sig->it_real_incr.tv64 != 0) { - hrtimer_forward(timer, timer->base->softirq_time, - sig->it_real_incr); - return HRTIMER_RESTART; - } return HRTIMER_NORESTART; } @@ -231,11 +226,14 @@ again: spin_unlock_irq(&tsk->sighand->siglock); goto again; } - tsk->signal->it_real_incr = - timeval_to_ktime(value->it_interval); expires = timeval_to_ktime(value->it_value); - if (expires.tv64 != 0) - hrtimer_start(timer, expires, HRTIMER_REL); + if (expires.tv64 != 0) { + tsk->signal->it_real_incr = + timeval_to_ktime(value->it_interval); + hrtimer_start(timer, expires, HRTIMER_MODE_REL); + } else + tsk->signal->it_real_incr.tv64 = 0; + spin_unlock_irq(&tsk->sighand->siglock); break; case ITIMER_VIRTUAL: diff --git a/kernel/kmod.c b/kernel/kmod.c index 3a7379aa31ca..796276141e51 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -217,7 +217,10 @@ static int wait_for_helper(void *data) sub_info->retval = ret; } - complete(sub_info->complete); + if (sub_info->wait < 0) + kfree(sub_info); + else + complete(sub_info->complete); return 0; } @@ -239,6 +242,9 @@ static void __call_usermodehelper(struct work_struct *work) pid = kernel_thread(____call_usermodehelper, sub_info, CLONE_VFORK | SIGCHLD); + if (wait < 0) + return; + if (pid < 0) { sub_info->retval = pid; complete(sub_info->complete); @@ -253,6 +259,9 @@ static void __call_usermodehelper(struct work_struct *work) * @envp: null-terminated environment list * @session_keyring: session keyring for process (NULL for an empty keyring) * @wait: wait for the application to finish and return status. + * when -1 don't wait at all, but you get no useful error back when + * the program couldn't be exec'ed. This makes it safe to call + * from interrupt context. * * Runs a user-space application. The application is started * asynchronously if wait is not set, and runs as a child of keventd. @@ -265,17 +274,8 @@ int call_usermodehelper_keys(char *path, char **argv, char **envp, struct key *session_keyring, int wait) { DECLARE_COMPLETION_ONSTACK(done); - struct subprocess_info sub_info = { - .work = __WORK_INITIALIZER(sub_info.work, - __call_usermodehelper), - .complete = &done, - .path = path, - .argv = argv, - .envp = envp, - .ring = session_keyring, - .wait = wait, - .retval = 0, - }; + struct subprocess_info *sub_info; + int retval; if (!khelper_wq) return -EBUSY; @@ -283,9 +283,25 @@ int call_usermodehelper_keys(char *path, char **argv, char **envp, if (path[0] == '\0') return 0; - queue_work(khelper_wq, &sub_info.work); + sub_info = kzalloc(sizeof(struct subprocess_info), GFP_ATOMIC); + if (!sub_info) + return -ENOMEM; + + INIT_WORK(&sub_info->work, __call_usermodehelper); + sub_info->complete = &done; + sub_info->path = path; + sub_info->argv = argv; + sub_info->envp = envp; + sub_info->ring = session_keyring; + sub_info->wait = wait; + + queue_work(khelper_wq, &sub_info->work); + if (wait < 0) /* task has freed sub_info */ + return 0; wait_for_completion(&done); - return sub_info.retval; + retval = sub_info->retval; + kfree(sub_info); + return retval; } EXPORT_SYMBOL(call_usermodehelper_keys); diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c index 88fc611b3ae9..58f35e586ee3 100644 --- a/kernel/lockdep_proc.c +++ b/kernel/lockdep_proc.c @@ -10,7 +10,6 @@ * Code for /proc/lockdep and /proc/lockdep_stats: * */ -#include <linux/sched.h> #include <linux/module.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> diff --git a/kernel/mutex-debug.c b/kernel/mutex-debug.c index 841539d72c55..d17436cdea1b 100644 --- a/kernel/mutex-debug.c +++ b/kernel/mutex-debug.c @@ -13,7 +13,6 @@ * Released under the General Public License (GPL). */ #include <linux/mutex.h> -#include <linux/sched.h> #include <linux/delay.h> #include <linux/module.h> #include <linux/poison.h> diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 7c3e1e6dfb5b..657f77697415 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -304,7 +304,7 @@ int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp) * should be able to see it. */ struct task_struct *p; - read_lock(&tasklist_lock); + rcu_read_lock(); p = find_task_by_pid(pid); if (p) { if (CPUCLOCK_PERTHREAD(which_clock)) { @@ -312,12 +312,17 @@ int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp) error = cpu_clock_sample(which_clock, p, &rtn); } - } else if (p->tgid == pid && p->signal) { - error = cpu_clock_sample_group(which_clock, - p, &rtn); + } else { + read_lock(&tasklist_lock); + if (p->tgid == pid && p->signal) { + error = + cpu_clock_sample_group(which_clock, + p, &rtn); + } + read_unlock(&tasklist_lock); } } - read_unlock(&tasklist_lock); + rcu_read_unlock(); } if (error) diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index a1bf61617839..44318ca71978 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -145,7 +145,7 @@ static int common_timer_set(struct k_itimer *, int, struct itimerspec *, struct itimerspec *); static int common_timer_del(struct k_itimer *timer); -static int posix_timer_fn(struct hrtimer *data); +static enum hrtimer_restart posix_timer_fn(struct hrtimer *data); static struct k_itimer *lock_timer(timer_t timer_id, unsigned long *flags); @@ -334,12 +334,12 @@ EXPORT_SYMBOL_GPL(posix_timer_event); * This code is for CLOCK_REALTIME* and CLOCK_MONOTONIC* timers. */ -static int posix_timer_fn(struct hrtimer *timer) +static enum hrtimer_restart posix_timer_fn(struct hrtimer *timer) { struct k_itimer *timr; unsigned long flags; int si_private = 0; - int ret = HRTIMER_NORESTART; + enum hrtimer_restart ret = HRTIMER_NORESTART; timr = container_of(timer, struct k_itimer, it.real.timer); spin_lock_irqsave(&timr->it_lock, flags); @@ -356,7 +356,7 @@ static int posix_timer_fn(struct hrtimer *timer) if (timr->it.real.interval.tv64 != 0) { timr->it_overrun += hrtimer_forward(timer, - timer->base->softirq_time, + hrtimer_cb_get_time(timer), timr->it.real.interval); ret = HRTIMER_RESTART; ++timr->it_requeue_pending; @@ -722,7 +722,7 @@ common_timer_set(struct k_itimer *timr, int flags, if (!new_setting->it_value.tv_sec && !new_setting->it_value.tv_nsec) return 0; - mode = flags & TIMER_ABSTIME ? HRTIMER_ABS : HRTIMER_REL; + mode = flags & TIMER_ABSTIME ? HRTIMER_MODE_ABS : HRTIMER_MODE_REL; hrtimer_init(&timr->it.real.timer, timr->it_clock, mode); timr->it.real.timer.function = posix_timer_fn; @@ -734,7 +734,7 @@ common_timer_set(struct k_itimer *timr, int flags, /* SIGEV_NONE timers are not queued ! See common_timer_get */ if (((timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE)) { /* Setup correct expiry time for relative timers */ - if (mode == HRTIMER_REL) + if (mode == HRTIMER_MODE_REL) timer->expires = ktime_add(timer->expires, timer->base->get_time()); return 0; @@ -950,7 +950,8 @@ static int common_nsleep(const clockid_t which_clock, int flags, struct timespec *tsave, struct timespec __user *rmtp) { return hrtimer_nanosleep(tsave, rmtp, flags & TIMER_ABSTIME ? - HRTIMER_ABS : HRTIMER_REL, which_clock); + HRTIMER_MODE_ABS : HRTIMER_MODE_REL, + which_clock); } asmlinkage long diff --git a/kernel/resource.c b/kernel/resource.c index 2a3f88636580..bdb55a33f969 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -8,7 +8,6 @@ */ #include <linux/module.h> -#include <linux/sched.h> #include <linux/errno.h> #include <linux/ioport.h> #include <linux/init.h> diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c index 4ab17da46fd8..180978cb2f75 100644 --- a/kernel/rtmutex.c +++ b/kernel/rtmutex.c @@ -625,7 +625,7 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state, /* Setup the timer, when timeout != NULL */ if (unlikely(timeout)) hrtimer_start(&timeout->timer, timeout->timer.expires, - HRTIMER_ABS); + HRTIMER_MODE_ABS); for (;;) { /* Try to acquire the lock: */ diff --git a/kernel/sched.c b/kernel/sched.c index 08f86178aa34..0dc757246d89 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1853,6 +1853,13 @@ context_switch(struct rq *rq, struct task_struct *prev, struct mm_struct *mm = next->mm; struct mm_struct *oldmm = prev->active_mm; + /* + * For paravirt, this is coupled with an exit in switch_to to + * combine the page table reload and the switch backend into + * one hypercall. + */ + arch_enter_lazy_cpu_mode(); + if (!mm) { next->active_mm = oldmm; atomic_inc(&oldmm->mm_count); diff --git a/kernel/signal.c b/kernel/signal.c index 8072e568bbe0..e2a7d4bf7d57 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -456,26 +456,50 @@ static int __dequeue_signal(struct sigpending *pending, sigset_t *mask, int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info) { int signr = __dequeue_signal(&tsk->pending, mask, info); - if (!signr) + if (!signr) { signr = __dequeue_signal(&tsk->signal->shared_pending, mask, info); + /* + * itimer signal ? + * + * itimers are process shared and we restart periodic + * itimers in the signal delivery path to prevent DoS + * attacks in the high resolution timer case. This is + * compliant with the old way of self restarting + * itimers, as the SIGALRM is a legacy signal and only + * queued once. Changing the restart behaviour to + * restart the timer in the signal dequeue path is + * reducing the timer noise on heavy loaded !highres + * systems too. + */ + if (unlikely(signr == SIGALRM)) { + struct hrtimer *tmr = &tsk->signal->real_timer; + + if (!hrtimer_is_queued(tmr) && + tsk->signal->it_real_incr.tv64 != 0) { + hrtimer_forward(tmr, tmr->base->get_time(), + tsk->signal->it_real_incr); + hrtimer_restart(tmr); + } + } + } recalc_sigpending_tsk(tsk); - if (signr && unlikely(sig_kernel_stop(signr))) { - /* - * Set a marker that we have dequeued a stop signal. Our - * caller might release the siglock and then the pending - * stop signal it is about to process is no longer in the - * pending bitmasks, but must still be cleared by a SIGCONT - * (and overruled by a SIGKILL). So those cases clear this - * shared flag after we've set it. Note that this flag may - * remain set after the signal we return is ignored or - * handled. That doesn't matter because its only purpose - * is to alert stop-signal processing code when another - * processor has come along and cleared the flag. - */ - if (!(tsk->signal->flags & SIGNAL_GROUP_EXIT)) - tsk->signal->flags |= SIGNAL_STOP_DEQUEUED; - } + if (signr && unlikely(sig_kernel_stop(signr))) { + /* + * Set a marker that we have dequeued a stop signal. Our + * caller might release the siglock and then the pending + * stop signal it is about to process is no longer in the + * pending bitmasks, but must still be cleared by a SIGCONT + * (and overruled by a SIGKILL). So those cases clear this + * shared flag after we've set it. Note that this flag may + * remain set after the signal we return is ignored or + * handled. That doesn't matter because its only purpose + * is to alert stop-signal processing code when another + * processor has come along and cleared the flag. + */ + if (!(tsk->signal->flags & SIGNAL_GROUP_EXIT)) + tsk->signal->flags |= SIGNAL_STOP_DEQUEUED; + } if ( signr && ((info->si_code & __SI_MASK) == __SI_TIMER) && info->si_sys_private){ diff --git a/kernel/softirq.c b/kernel/softirq.c index 918e52df090e..8b75008e2bd8 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -17,6 +17,7 @@ #include <linux/kthread.h> #include <linux/rcupdate.h> #include <linux/smp.h> +#include <linux/tick.h> #include <asm/irq.h> /* @@ -273,6 +274,18 @@ EXPORT_SYMBOL(do_softirq); #endif +/* + * Enter an interrupt context. + */ +void irq_enter(void) +{ + __irq_enter(); +#ifdef CONFIG_NO_HZ + if (idle_cpu(smp_processor_id())) + tick_nohz_update_jiffies(); +#endif +} + #ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED # define invoke_softirq() __do_softirq() #else @@ -289,6 +302,12 @@ void irq_exit(void) sub_preempt_count(IRQ_EXIT_OFFSET); if (!in_interrupt() && local_softirq_pending()) invoke_softirq(); + +#ifdef CONFIG_NO_HZ + /* Make sure that timer wheel updates are propagated */ + if (!in_interrupt() && idle_cpu(smp_processor_id()) && !need_resched()) + tick_nohz_stop_sched_tick(); +#endif preempt_enable_no_resched(); } diff --git a/kernel/sysctl.c b/kernel/sysctl.c index e0ac6cd79fcf..3ca1d5ff0319 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -90,12 +90,6 @@ extern char modprobe_path[]; #ifdef CONFIG_CHR_DEV_SG extern int sg_big_buff; #endif -#ifdef CONFIG_SYSVIPC -static int proc_ipc_dointvec(ctl_table *table, int write, struct file *filp, - void __user *buffer, size_t *lenp, loff_t *ppos); -static int proc_ipc_doulongvec_minmax(ctl_table *table, int write, struct file *filp, - void __user *buffer, size_t *lenp, loff_t *ppos); -#endif #ifdef __sparc__ extern char reboot_command []; @@ -135,18 +129,6 @@ static int parse_table(int __user *, int, void __user *, size_t __user *, void __user *, size_t, ctl_table *); #endif -static int proc_do_uts_string(ctl_table *table, int write, struct file *filp, - void __user *buffer, size_t *lenp, loff_t *ppos); - -static int sysctl_uts_string(ctl_table *table, int __user *name, int nlen, - void __user *oldval, size_t __user *oldlenp, - void __user *newval, size_t newlen); - -#ifdef CONFIG_SYSVIPC -static int sysctl_ipc_data(ctl_table *table, int __user *name, int nlen, - void __user *oldval, size_t __user *oldlenp, - void __user *newval, size_t newlen); -#endif #ifdef CONFIG_PROC_SYSCTL static int proc_do_cad_pid(ctl_table *table, int write, struct file *filp, @@ -177,60 +159,6 @@ int sysctl_legacy_va_layout; #endif -static void *get_uts(ctl_table *table, int write) -{ - char *which = table->data; -#ifdef CONFIG_UTS_NS - struct uts_namespace *uts_ns = current->nsproxy->uts_ns; - which = (which - (char *)&init_uts_ns) + (char *)uts_ns; -#endif - if (!write) - down_read(&uts_sem); - else - down_write(&uts_sem); - return which; -} - -static void put_uts(ctl_table *table, int write, void *which) -{ - if (!write) - up_read(&uts_sem); - else - up_write(&uts_sem); -} - -#ifdef CONFIG_SYSVIPC -static void *get_ipc(ctl_table *table, int write) -{ - char *which = table->data; - struct ipc_namespace *ipc_ns = current->nsproxy->ipc_ns; - which = (which - (char *)&init_ipc_ns) + (char *)ipc_ns; - return which; -} -#else -#define get_ipc(T,W) ((T)->data) -#endif - -/* /proc declarations: */ - -#ifdef CONFIG_PROC_SYSCTL - -static ssize_t proc_readsys(struct file *, char __user *, size_t, loff_t *); -static ssize_t proc_writesys(struct file *, const char __user *, size_t, loff_t *); -static int proc_opensys(struct inode *, struct file *); - -const struct file_operations proc_sys_file_operations = { - .open = proc_opensys, - .read = proc_readsys, - .write = proc_writesys, -}; - -extern struct proc_dir_entry *proc_sys_root; - -static void register_proc_table(ctl_table *, struct proc_dir_entry *, void *); -static void unregister_proc_table(ctl_table *, struct proc_dir_entry *); -#endif - /* The default sysctl tables: */ static ctl_table root_table[] = { @@ -278,51 +206,6 @@ static ctl_table root_table[] = { static ctl_table kern_table[] = { { - .ctl_name = KERN_OSTYPE, - .procname = "ostype", - .data = init_uts_ns.name.sysname, - .maxlen = sizeof(init_uts_ns.name.sysname), - .mode = 0444, - .proc_handler = &proc_do_uts_string, - .strategy = &sysctl_uts_string, - }, - { - .ctl_name = KERN_OSRELEASE, - .procname = "osrelease", - .data = init_uts_ns.name.release, - .maxlen = sizeof(init_uts_ns.name.release), - .mode = 0444, - .proc_handler = &proc_do_uts_string, - .strategy = &sysctl_uts_string, - }, - { - .ctl_name = KERN_VERSION, - .procname = "version", - .data = init_uts_ns.name.version, - .maxlen = sizeof(init_uts_ns.name.version), - .mode = 0444, - .proc_handler = &proc_do_uts_string, - .strategy = &sysctl_uts_string, - }, - { - .ctl_name = KERN_NODENAME, - .procname = "hostname", - .data = init_uts_ns.name.nodename, - .maxlen = sizeof(init_uts_ns.name.nodename), - .mode = 0644, - .proc_handler = &proc_do_uts_string, - .strategy = &sysctl_uts_string, - }, - { - .ctl_name = KERN_DOMAINNAME, - .procname = "domainname", - .data = init_uts_ns.name.domainname, - .maxlen = sizeof(init_uts_ns.name.domainname), - .mode = 0644, - .proc_handler = &proc_do_uts_string, - .strategy = &sysctl_uts_string, - }, - { .ctl_name = KERN_PANIC, .procname = "panic", .data = &panic_timeout, @@ -478,71 +361,6 @@ static ctl_table kern_table[] = { .proc_handler = &proc_dointvec, }, #endif -#ifdef CONFIG_SYSVIPC - { - .ctl_name = KERN_SHMMAX, - .procname = "shmmax", - .data = &init_ipc_ns.shm_ctlmax, - .maxlen = sizeof (init_ipc_ns.shm_ctlmax), - .mode = 0644, - .proc_handler = &proc_ipc_doulongvec_minmax, - .strategy = sysctl_ipc_data, - }, - { - .ctl_name = KERN_SHMALL, - .procname = "shmall", - .data = &init_ipc_ns.shm_ctlall, - .maxlen = sizeof (init_ipc_ns.shm_ctlall), - .mode = 0644, - .proc_handler = &proc_ipc_doulongvec_minmax, - .strategy = sysctl_ipc_data, - }, - { - .ctl_name = KERN_SHMMNI, - .procname = "shmmni", - .data = &init_ipc_ns.shm_ctlmni, - .maxlen = sizeof (init_ipc_ns.shm_ctlmni), - .mode = 0644, - .proc_handler = &proc_ipc_dointvec, - .strategy = sysctl_ipc_data, - }, - { - .ctl_name = KERN_MSGMAX, - .procname = "msgmax", - .data = &init_ipc_ns.msg_ctlmax, - .maxlen = sizeof (init_ipc_ns.msg_ctlmax), - .mode = 0644, - .proc_handler = &proc_ipc_dointvec, - .strategy = sysctl_ipc_data, - }, - { - .ctl_name = KERN_MSGMNI, - .procname = "msgmni", - .data = &init_ipc_ns.msg_ctlmni, - .maxlen = sizeof (init_ipc_ns.msg_ctlmni), - .mode = 0644, - .proc_handler = &proc_ipc_dointvec, - .strategy = sysctl_ipc_data, - }, - { - .ctl_name = KERN_MSGMNB, - .procname = "msgmnb", - .data = &init_ipc_ns.msg_ctlmnb, - .maxlen = sizeof (init_ipc_ns.msg_ctlmnb), - .mode = 0644, - .proc_handler = &proc_ipc_dointvec, - .strategy = sysctl_ipc_data, - }, - { - .ctl_name = KERN_SEM, - .procname = "sem", - .data = &init_ipc_ns.sem_ctls, - .maxlen = 4*sizeof (int), - .mode = 0644, - .proc_handler = &proc_ipc_dointvec, - .strategy = sysctl_ipc_data, - }, -#endif #ifdef CONFIG_MAGIC_SYSRQ { .ctl_name = KERN_SYSRQ, @@ -1043,6 +861,12 @@ static ctl_table vm_table[] = { { .ctl_name = 0 } }; +#if defined(CONFIG_BINFMT_MISC) || defined(CONFIG_BINFMT_MISC_MODULE) +static ctl_table binfmt_misc_table[] = { + { .ctl_name = 0 } +}; +#endif + static ctl_table fs_table[] = { { .ctl_name = FS_NRINODE, @@ -1166,6 +990,14 @@ static ctl_table fs_table[] = { .mode = 0644, .proc_handler = &proc_dointvec, }, +#if defined(CONFIG_BINFMT_MISC) || defined(CONFIG_BINFMT_MISC_MODULE) + { + .ctl_name = CTL_UNNUMBERED, + .procname = "binfmt_misc", + .mode = 0555, + .child = binfmt_misc_table, + }, +#endif { .ctl_name = 0 } }; @@ -1177,8 +1009,6 @@ static ctl_table dev_table[] = { { .ctl_name = 0 } }; -extern void init_irq_proc (void); - static DEFINE_SPINLOCK(sysctl_lock); /* called under sysctl_lock */ @@ -1220,19 +1050,47 @@ static void start_unregistering(struct ctl_table_header *p) list_del_init(&p->ctl_entry); } -void __init sysctl_init(void) +void sysctl_head_finish(struct ctl_table_header *head) { -#ifdef CONFIG_PROC_SYSCTL - register_proc_table(root_table, proc_sys_root, &root_table_header); - init_irq_proc(); -#endif + if (!head) + return; + spin_lock(&sysctl_lock); + unuse_table(head); + spin_unlock(&sysctl_lock); +} + +struct ctl_table_header *sysctl_head_next(struct ctl_table_header *prev) +{ + struct ctl_table_header *head; + struct list_head *tmp; + spin_lock(&sysctl_lock); + if (prev) { + tmp = &prev->ctl_entry; + unuse_table(prev); + goto next; + } + tmp = &root_table_header.ctl_entry; + for (;;) { + head = list_entry(tmp, struct ctl_table_header, ctl_entry); + + if (!use_table(head)) + goto next; + spin_unlock(&sysctl_lock); + return head; + next: + tmp = tmp->next; + if (tmp == &root_table_header.ctl_entry) + break; + } + spin_unlock(&sysctl_lock); + return NULL; } #ifdef CONFIG_SYSCTL_SYSCALL int do_sysctl(int __user *name, int nlen, void __user *oldval, size_t __user *oldlenp, void __user *newval, size_t newlen) { - struct list_head *tmp; + struct ctl_table_header *head; int error = -ENOTDIR; if (nlen <= 0 || nlen >= CTL_MAXNAME) @@ -1242,26 +1100,16 @@ int do_sysctl(int __user *name, int nlen, void __user *oldval, size_t __user *ol if (!oldlenp || get_user(old_len, oldlenp)) return -EFAULT; } - spin_lock(&sysctl_lock); - tmp = &root_table_header.ctl_entry; - do { - struct ctl_table_header *head = - list_entry(tmp, struct ctl_table_header, ctl_entry); - - if (!use_table(head)) - continue; - - spin_unlock(&sysctl_lock); + for (head = sysctl_head_next(NULL); head; + head = sysctl_head_next(head)) { error = parse_table(name, nlen, oldval, oldlenp, newval, newlen, head->ctl_table); - - spin_lock(&sysctl_lock); - unuse_table(head); - if (error != -ENOTDIR) + if (error != -ENOTDIR) { + sysctl_head_finish(head); break; - } while ((tmp = tmp->next) != &root_table_header.ctl_entry); - spin_unlock(&sysctl_lock); + } + } return error; } @@ -1282,7 +1130,7 @@ asmlinkage long sys_sysctl(struct __sysctl_args __user *args) #endif /* CONFIG_SYSCTL_SYSCALL */ /* - * ctl_perm does NOT grant the superuser all rights automatically, because + * sysctl_perm does NOT grant the superuser all rights automatically, because * some sysctl variables are readonly even to root. */ @@ -1297,7 +1145,7 @@ static int test_perm(int mode, int op) return -EACCES; } -static inline int ctl_perm(ctl_table *table, int op) +int sysctl_perm(ctl_table *table, int op) { int error; error = security_sysctl(table, op); @@ -1321,19 +1169,11 @@ repeat: for ( ; table->ctl_name || table->procname; table++) { if (!table->ctl_name) continue; - if (n == table->ctl_name || table->ctl_name == CTL_ANY) { + if (n == table->ctl_name) { int error; if (table->child) { - if (ctl_perm(table, 001)) + if (sysctl_perm(table, 001)) return -EPERM; - if (table->strategy) { - error = table->strategy( - table, name, nlen, - oldval, oldlenp, - newval, newlen); - if (error) - return error; - } name++; nlen--; table = table->child; @@ -1361,7 +1201,7 @@ int do_sysctl_strategy (ctl_table *table, op |= 004; if (newval) op |= 002; - if (ctl_perm(table, op)) + if (sysctl_perm(table, op)) return -EPERM; if (table->strategy) { @@ -1400,10 +1240,26 @@ int do_sysctl_strategy (ctl_table *table, } #endif /* CONFIG_SYSCTL_SYSCALL */ +static void sysctl_set_parent(struct ctl_table *parent, struct ctl_table *table) +{ + for (; table->ctl_name || table->procname; table++) { + table->parent = parent; + if (table->child) + sysctl_set_parent(table, table->child); + } +} + +static __init int sysctl_init(void) +{ + sysctl_set_parent(NULL, root_table); + return 0; +} + +core_initcall(sysctl_init); + /** * register_sysctl_table - register a sysctl hierarchy * @table: the top-level table structure - * @insert_at_head: whether the entry should be inserted in front or at the end * * Register a sysctl table hierarchy. @table should be a filled in ctl_table * array. An entry with a ctl_name of 0 terminates the table. @@ -1469,8 +1325,7 @@ int do_sysctl_strategy (ctl_table *table, * This routine returns %NULL on a failure to register, and a pointer * to the table header on success. */ -struct ctl_table_header *register_sysctl_table(ctl_table * table, - int insert_at_head) +struct ctl_table_header *register_sysctl_table(ctl_table * table) { struct ctl_table_header *tmp; tmp = kmalloc(sizeof(struct ctl_table_header), GFP_KERNEL); @@ -1480,15 +1335,10 @@ struct ctl_table_header *register_sysctl_table(ctl_table * table, INIT_LIST_HEAD(&tmp->ctl_entry); tmp->used = 0; tmp->unregistering = NULL; + sysctl_set_parent(NULL, table); spin_lock(&sysctl_lock); - if (insert_at_head) - list_add(&tmp->ctl_entry, &root_table_header.ctl_entry); - else - list_add_tail(&tmp->ctl_entry, &root_table_header.ctl_entry); + list_add_tail(&tmp->ctl_entry, &root_table_header.ctl_entry); spin_unlock(&sysctl_lock); -#ifdef CONFIG_PROC_SYSCTL - register_proc_table(table, proc_sys_root, tmp); -#endif return tmp; } @@ -1504,9 +1354,6 @@ void unregister_sysctl_table(struct ctl_table_header * header) might_sleep(); spin_lock(&sysctl_lock); start_unregistering(header); -#ifdef CONFIG_PROC_SYSCTL - unregister_proc_table(header->ctl_table, proc_sys_root); -#endif spin_unlock(&sysctl_lock); kfree(header); } @@ -1530,155 +1377,6 @@ void unregister_sysctl_table(struct ctl_table_header * table) #ifdef CONFIG_PROC_SYSCTL -/* Scan the sysctl entries in table and add them all into /proc */ -static void register_proc_table(ctl_table * table, struct proc_dir_entry *root, void *set) -{ - struct proc_dir_entry *de; - int len; - mode_t mode; - - for (; table->ctl_name || table->procname; table++) { - /* Can't do anything without a proc name. */ - if (!table->procname) - continue; - /* Maybe we can't do anything with it... */ - if (!table->proc_handler && !table->child) { - printk(KERN_WARNING "SYSCTL: Can't register %s\n", - table->procname); - continue; - } - - len = strlen(table->procname); - mode = table->mode; - - de = NULL; - if (table->proc_handler) - mode |= S_IFREG; - else { - mode |= S_IFDIR; - for (de = root->subdir; de; de = de->next) { - if (proc_match(len, table->procname, de)) - break; - } - /* If the subdir exists already, de is non-NULL */ - } - - if (!de) { - de = create_proc_entry(table->procname, mode, root); - if (!de) - continue; - de->set = set; - de->data = (void *) table; - if (table->proc_handler) - de->proc_fops = &proc_sys_file_operations; - } - table->de = de; - if (de->mode & S_IFDIR) - register_proc_table(table->child, de, set); - } -} - -/* - * Unregister a /proc sysctl table and any subdirectories. - */ -static void unregister_proc_table(ctl_table * table, struct proc_dir_entry *root) -{ - struct proc_dir_entry *de; - for (; table->ctl_name || table->procname; table++) { - if (!(de = table->de)) - continue; - if (de->mode & S_IFDIR) { - if (!table->child) { - printk (KERN_ALERT "Help - malformed sysctl tree on free\n"); - continue; - } - unregister_proc_table(table->child, de); - - /* Don't unregister directories which still have entries.. */ - if (de->subdir) - continue; - } - - /* - * In any case, mark the entry as goner; we'll keep it - * around if it's busy, but we'll know to do nothing with - * its fields. We are under sysctl_lock here. - */ - de->data = NULL; - - /* Don't unregister proc entries that are still being used.. */ - if (atomic_read(&de->count)) - continue; - - table->de = NULL; - remove_proc_entry(table->procname, root); - } -} - -static ssize_t do_rw_proc(int write, struct file * file, char __user * buf, - size_t count, loff_t *ppos) -{ - int op; - struct proc_dir_entry *de = PDE(file->f_path.dentry->d_inode); - struct ctl_table *table; - size_t res; - ssize_t error = -ENOTDIR; - - spin_lock(&sysctl_lock); - if (de && de->data && use_table(de->set)) { - /* - * at that point we know that sysctl was not unregistered - * and won't be until we finish - */ - spin_unlock(&sysctl_lock); - table = (struct ctl_table *) de->data; - if (!table || !table->proc_handler) - goto out; - error = -EPERM; - op = (write ? 002 : 004); - if (ctl_perm(table, op)) - goto out; - - /* careful: calling conventions are nasty here */ - res = count; - error = (*table->proc_handler)(table, write, file, - buf, &res, ppos); - if (!error) - error = res; - out: - spin_lock(&sysctl_lock); - unuse_table(de->set); - } - spin_unlock(&sysctl_lock); - return error; -} - -static int proc_opensys(struct inode *inode, struct file *file) -{ - if (file->f_mode & FMODE_WRITE) { - /* - * sysctl entries that are not writable, - * are _NOT_ writable, capabilities or not. - */ - if (!(inode->i_mode & S_IWUSR)) - return -EPERM; - } - - return 0; -} - -static ssize_t proc_readsys(struct file * file, char __user * buf, - size_t count, loff_t *ppos) -{ - return do_rw_proc(0, file, buf, count, ppos); -} - -static ssize_t proc_writesys(struct file * file, const char __user * buf, - size_t count, loff_t *ppos) -{ - return do_rw_proc(1, file, (char __user *) buf, count, ppos); -} - static int _proc_do_string(void* data, int maxlen, int write, struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos) @@ -1762,21 +1460,6 @@ int proc_dostring(ctl_table *table, int write, struct file *filp, buffer, lenp, ppos); } -/* - * Special case of dostring for the UTS structure. This has locks - * to observe. Should this be in kernel/sys.c ???? - */ - -static int proc_do_uts_string(ctl_table *table, int write, struct file *filp, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - int r; - void *which; - which = get_uts(table, write); - r = _proc_do_string(which, table->maxlen,write,filp,buffer,lenp, ppos); - put_uts(table, write, which); - return r; -} static int do_proc_dointvec_conv(int *negp, unsigned long *lvalp, int *valp, @@ -2362,27 +2045,6 @@ int proc_dointvec_ms_jiffies(ctl_table *table, int write, struct file *filp, do_proc_dointvec_ms_jiffies_conv, NULL); } -#ifdef CONFIG_SYSVIPC -static int proc_ipc_dointvec(ctl_table *table, int write, struct file *filp, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - void *which; - which = get_ipc(table, write); - return __do_proc_dointvec(which, table, write, filp, buffer, - lenp, ppos, NULL, NULL); -} - -static int proc_ipc_doulongvec_minmax(ctl_table *table, int write, - struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos) -{ - void *which; - which = get_ipc(table, write); - return __do_proc_doulongvec_minmax(which, table, write, filp, buffer, - lenp, ppos, 1l, 1l); -} - -#endif - static int proc_do_cad_pid(ctl_table *table, int write, struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos) { @@ -2413,31 +2075,6 @@ int proc_dostring(ctl_table *table, int write, struct file *filp, return -ENOSYS; } -static int proc_do_uts_string(ctl_table *table, int write, struct file *filp, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - return -ENOSYS; -} - -#ifdef CONFIG_SYSVIPC -static int proc_do_ipc_string(ctl_table *table, int write, struct file *filp, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - return -ENOSYS; -} -static int proc_ipc_dointvec(ctl_table *table, int write, struct file *filp, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - return -ENOSYS; -} -static int proc_ipc_doulongvec_minmax(ctl_table *table, int write, - struct file *filp, void __user *buffer, - size_t *lenp, loff_t *ppos) -{ - return -ENOSYS; -} -#endif - int proc_dointvec(ctl_table *table, int write, struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos) { @@ -2648,62 +2285,6 @@ int sysctl_ms_jiffies(ctl_table *table, int __user *name, int nlen, } -/* The generic string strategy routine: */ -static int sysctl_uts_string(ctl_table *table, int __user *name, int nlen, - void __user *oldval, size_t __user *oldlenp, - void __user *newval, size_t newlen) -{ - struct ctl_table uts_table; - int r, write; - write = newval && newlen; - memcpy(&uts_table, table, sizeof(uts_table)); - uts_table.data = get_uts(table, write); - r = sysctl_string(&uts_table, name, nlen, - oldval, oldlenp, newval, newlen); - put_uts(table, write, uts_table.data); - return r; -} - -#ifdef CONFIG_SYSVIPC -/* The generic sysctl ipc data routine. */ -static int sysctl_ipc_data(ctl_table *table, int __user *name, int nlen, - void __user *oldval, size_t __user *oldlenp, - void __user *newval, size_t newlen) -{ - size_t len; - void *data; - - /* Get out of I don't have a variable */ - if (!table->data || !table->maxlen) - return -ENOTDIR; - - data = get_ipc(table, 1); - if (!data) - return -ENOTDIR; - - if (oldval && oldlenp) { - if (get_user(len, oldlenp)) - return -EFAULT; - if (len) { - if (len > table->maxlen) - len = table->maxlen; - if (copy_to_user(oldval, data, len)) - return -EFAULT; - if (put_user(len, oldlenp)) - return -EFAULT; - } - } - - if (newval && newlen) { - if (newlen > table->maxlen) - newlen = table->maxlen; - - if (copy_from_user(data, newval, newlen)) - return -EFAULT; - } - return 1; -} -#endif #else /* CONFIG_SYSCTL_SYSCALL */ @@ -2769,20 +2350,6 @@ int sysctl_ms_jiffies(ctl_table *table, int __user *name, int nlen, return -ENOSYS; } -static int sysctl_uts_string(ctl_table *table, int __user *name, int nlen, - void __user *oldval, size_t __user *oldlenp, - void __user *newval, size_t newlen) -{ - return -ENOSYS; -} -#ifdef CONFIG_SYSVIPC -static int sysctl_ipc_data(ctl_table *table, int __user *name, int nlen, - void __user *oldval, size_t __user *oldlenp, - void __user *newval, size_t newlen) -{ - return -ENOSYS; -} -#endif #endif /* CONFIG_SYSCTL_SYSCALL */ /* diff --git a/kernel/time.c b/kernel/time.c index 0e017bff4c19..c6c80ea5d0ea 100644 --- a/kernel/time.c +++ b/kernel/time.c @@ -470,6 +470,260 @@ struct timeval ns_to_timeval(const s64 nsec) return tv; } +/* + * Convert jiffies to milliseconds and back. + * + * Avoid unnecessary multiplications/divisions in the + * two most common HZ cases: + */ +unsigned int jiffies_to_msecs(const unsigned long j) +{ +#if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ) + return (MSEC_PER_SEC / HZ) * j; +#elif HZ > MSEC_PER_SEC && !(HZ % MSEC_PER_SEC) + return (j + (HZ / MSEC_PER_SEC) - 1)/(HZ / MSEC_PER_SEC); +#else + return (j * MSEC_PER_SEC) / HZ; +#endif +} +EXPORT_SYMBOL(jiffies_to_msecs); + +unsigned int jiffies_to_usecs(const unsigned long j) +{ +#if HZ <= USEC_PER_SEC && !(USEC_PER_SEC % HZ) + return (USEC_PER_SEC / HZ) * j; +#elif HZ > USEC_PER_SEC && !(HZ % USEC_PER_SEC) + return (j + (HZ / USEC_PER_SEC) - 1)/(HZ / USEC_PER_SEC); +#else + return (j * USEC_PER_SEC) / HZ; +#endif +} +EXPORT_SYMBOL(jiffies_to_usecs); + +/* + * When we convert to jiffies then we interpret incoming values + * the following way: + * + * - negative values mean 'infinite timeout' (MAX_JIFFY_OFFSET) + * + * - 'too large' values [that would result in larger than + * MAX_JIFFY_OFFSET values] mean 'infinite timeout' too. + * + * - all other values are converted to jiffies by either multiplying + * the input value by a factor or dividing it with a factor + * + * We must also be careful about 32-bit overflows. + */ +unsigned long msecs_to_jiffies(const unsigned int m) +{ + /* + * Negative value, means infinite timeout: + */ + if ((int)m < 0) + return MAX_JIFFY_OFFSET; + +#if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ) + /* + * HZ is equal to or smaller than 1000, and 1000 is a nice + * round multiple of HZ, divide with the factor between them, + * but round upwards: + */ + return (m + (MSEC_PER_SEC / HZ) - 1) / (MSEC_PER_SEC / HZ); +#elif HZ > MSEC_PER_SEC && !(HZ % MSEC_PER_SEC) + /* + * HZ is larger than 1000, and HZ is a nice round multiple of + * 1000 - simply multiply with the factor between them. + * + * But first make sure the multiplication result cannot + * overflow: + */ + if (m > jiffies_to_msecs(MAX_JIFFY_OFFSET)) + return MAX_JIFFY_OFFSET; + + return m * (HZ / MSEC_PER_SEC); +#else + /* + * Generic case - multiply, round and divide. But first + * check that if we are doing a net multiplication, that + * we wouldnt overflow: + */ + if (HZ > MSEC_PER_SEC && m > jiffies_to_msecs(MAX_JIFFY_OFFSET)) + return MAX_JIFFY_OFFSET; + + return (m * HZ + MSEC_PER_SEC - 1) / MSEC_PER_SEC; +#endif +} +EXPORT_SYMBOL(msecs_to_jiffies); + +unsigned long usecs_to_jiffies(const unsigned int u) +{ + if (u > jiffies_to_usecs(MAX_JIFFY_OFFSET)) + return MAX_JIFFY_OFFSET; +#if HZ <= USEC_PER_SEC && !(USEC_PER_SEC % HZ) + return (u + (USEC_PER_SEC / HZ) - 1) / (USEC_PER_SEC / HZ); +#elif HZ > USEC_PER_SEC && !(HZ % USEC_PER_SEC) + return u * (HZ / USEC_PER_SEC); +#else + return (u * HZ + USEC_PER_SEC - 1) / USEC_PER_SEC; +#endif +} +EXPORT_SYMBOL(usecs_to_jiffies); + +/* + * The TICK_NSEC - 1 rounds up the value to the next resolution. Note + * that a remainder subtract here would not do the right thing as the + * resolution values don't fall on second boundries. I.e. the line: + * nsec -= nsec % TICK_NSEC; is NOT a correct resolution rounding. + * + * Rather, we just shift the bits off the right. + * + * The >> (NSEC_JIFFIE_SC - SEC_JIFFIE_SC) converts the scaled nsec + * value to a scaled second value. + */ +unsigned long +timespec_to_jiffies(const struct timespec *value) +{ + unsigned long sec = value->tv_sec; + long nsec = value->tv_nsec + TICK_NSEC - 1; + + if (sec >= MAX_SEC_IN_JIFFIES){ + sec = MAX_SEC_IN_JIFFIES; + nsec = 0; + } + return (((u64)sec * SEC_CONVERSION) + + (((u64)nsec * NSEC_CONVERSION) >> + (NSEC_JIFFIE_SC - SEC_JIFFIE_SC))) >> SEC_JIFFIE_SC; + +} +EXPORT_SYMBOL(timespec_to_jiffies); + +void +jiffies_to_timespec(const unsigned long jiffies, struct timespec *value) +{ + /* + * Convert jiffies to nanoseconds and separate with + * one divide. + */ + u64 nsec = (u64)jiffies * TICK_NSEC; + value->tv_sec = div_long_long_rem(nsec, NSEC_PER_SEC, &value->tv_nsec); +} +EXPORT_SYMBOL(jiffies_to_timespec); + +/* Same for "timeval" + * + * Well, almost. The problem here is that the real system resolution is + * in nanoseconds and the value being converted is in micro seconds. + * Also for some machines (those that use HZ = 1024, in-particular), + * there is a LARGE error in the tick size in microseconds. + + * The solution we use is to do the rounding AFTER we convert the + * microsecond part. Thus the USEC_ROUND, the bits to be shifted off. + * Instruction wise, this should cost only an additional add with carry + * instruction above the way it was done above. + */ +unsigned long +timeval_to_jiffies(const struct timeval *value) +{ + unsigned long sec = value->tv_sec; + long usec = value->tv_usec; + + if (sec >= MAX_SEC_IN_JIFFIES){ + sec = MAX_SEC_IN_JIFFIES; + usec = 0; + } + return (((u64)sec * SEC_CONVERSION) + + (((u64)usec * USEC_CONVERSION + USEC_ROUND) >> + (USEC_JIFFIE_SC - SEC_JIFFIE_SC))) >> SEC_JIFFIE_SC; +} + +void jiffies_to_timeval(const unsigned long jiffies, struct timeval *value) +{ + /* + * Convert jiffies to nanoseconds and separate with + * one divide. + */ + u64 nsec = (u64)jiffies * TICK_NSEC; + long tv_usec; + + value->tv_sec = div_long_long_rem(nsec, NSEC_PER_SEC, &tv_usec); + tv_usec /= NSEC_PER_USEC; + value->tv_usec = tv_usec; +} + +/* + * Convert jiffies/jiffies_64 to clock_t and back. + */ +clock_t jiffies_to_clock_t(long x) +{ +#if (TICK_NSEC % (NSEC_PER_SEC / USER_HZ)) == 0 + return x / (HZ / USER_HZ); +#else + u64 tmp = (u64)x * TICK_NSEC; + do_div(tmp, (NSEC_PER_SEC / USER_HZ)); + return (long)tmp; +#endif +} +EXPORT_SYMBOL(jiffies_to_clock_t); + +unsigned long clock_t_to_jiffies(unsigned long x) +{ +#if (HZ % USER_HZ)==0 + if (x >= ~0UL / (HZ / USER_HZ)) + return ~0UL; + return x * (HZ / USER_HZ); +#else + u64 jif; + + /* Don't worry about loss of precision here .. */ + if (x >= ~0UL / HZ * USER_HZ) + return ~0UL; + + /* .. but do try to contain it here */ + jif = x * (u64) HZ; + do_div(jif, USER_HZ); + return jif; +#endif +} +EXPORT_SYMBOL(clock_t_to_jiffies); + +u64 jiffies_64_to_clock_t(u64 x) +{ +#if (TICK_NSEC % (NSEC_PER_SEC / USER_HZ)) == 0 + do_div(x, HZ / USER_HZ); +#else + /* + * There are better ways that don't overflow early, + * but even this doesn't overflow in hundreds of years + * in 64 bits, so.. + */ + x *= TICK_NSEC; + do_div(x, (NSEC_PER_SEC / USER_HZ)); +#endif + return x; +} + +EXPORT_SYMBOL(jiffies_64_to_clock_t); + +u64 nsec_to_clock_t(u64 x) +{ +#if (NSEC_PER_SEC % USER_HZ) == 0 + do_div(x, (NSEC_PER_SEC / USER_HZ)); +#elif (USER_HZ % 512) == 0 + x *= USER_HZ/512; + do_div(x, (NSEC_PER_SEC / 512)); +#else + /* + * max relative error 5.7e-8 (1.8s per year) for USER_HZ <= 1024, + * overflow after 64.99 years. + * exact for HZ=60, 72, 90, 120, 144, 180, 300, 600, 900, ... + */ + x *= 9; + do_div(x, (unsigned long)((9ull * NSEC_PER_SEC + (USER_HZ/2)) / + USER_HZ)); +#endif + return x; +} + #if (BITS_PER_LONG < 64) u64 get_jiffies_64(void) { diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig new file mode 100644 index 000000000000..f66351126544 --- /dev/null +++ b/kernel/time/Kconfig @@ -0,0 +1,25 @@ +# +# Timer subsystem related configuration options +# +config TICK_ONESHOT + bool + default n + +config NO_HZ + bool "Tickless System (Dynamic Ticks)" + depends on GENERIC_TIME && GENERIC_CLOCKEVENTS + select TICK_ONESHOT + help + This option enables a tickless system: timer interrupts will + only trigger on an as-needed basis both when the system is + busy and when the system is idle. + +config HIGH_RES_TIMERS + bool "High Resolution Timer Support" + depends on GENERIC_TIME && GENERIC_CLOCKEVENTS + select TICK_ONESHOT + help + This option enables high resolution timer support. If your + hardware is not capable then this option only increases + the size of the kernel image. + diff --git a/kernel/time/Makefile b/kernel/time/Makefile index 61a3907d16fb..93bccba1f265 100644 --- a/kernel/time/Makefile +++ b/kernel/time/Makefile @@ -1 +1,8 @@ -obj-y += ntp.o clocksource.o jiffies.o +obj-y += ntp.o clocksource.o jiffies.o timer_list.o + +obj-$(CONFIG_GENERIC_CLOCKEVENTS) += clockevents.o +obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o +obj-$(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST) += tick-broadcast.o +obj-$(CONFIG_TICK_ONESHOT) += tick-oneshot.o +obj-$(CONFIG_TICK_ONESHOT) += tick-sched.o +obj-$(CONFIG_TIMER_STATS) += timer_stats.o diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c new file mode 100644 index 000000000000..67932ea78c17 --- /dev/null +++ b/kernel/time/clockevents.c @@ -0,0 +1,345 @@ +/* + * linux/kernel/time/clockevents.c + * + * This file contains functions which manage clock event devices. + * + * Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de> + * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar + * Copyright(C) 2006-2007, Timesys Corp., Thomas Gleixner + * + * This code is licenced under the GPL version 2. For details see + * kernel-base/COPYING. + */ + +#include <linux/clockchips.h> +#include <linux/hrtimer.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/notifier.h> +#include <linux/smp.h> +#include <linux/sysdev.h> + +/* The registered clock event devices */ +static LIST_HEAD(clockevent_devices); +static LIST_HEAD(clockevents_released); + +/* Notification for clock events */ +static RAW_NOTIFIER_HEAD(clockevents_chain); + +/* Protection for the above */ +static DEFINE_SPINLOCK(clockevents_lock); + +/** + * clockevents_delta2ns - Convert a latch value (device ticks) to nanoseconds + * @latch: value to convert + * @evt: pointer to clock event device descriptor + * + * Math helper, returns latch value converted to nanoseconds (bound checked) + */ +unsigned long clockevent_delta2ns(unsigned long latch, + struct clock_event_device *evt) +{ + u64 clc = ((u64) latch << evt->shift); + + do_div(clc, evt->mult); + if (clc < 1000) + clc = 1000; + if (clc > LONG_MAX) + clc = LONG_MAX; + + return (unsigned long) clc; +} + +/** + * clockevents_set_mode - set the operating mode of a clock event device + * @dev: device to modify + * @mode: new mode + * + * Must be called with interrupts disabled ! + */ +void clockevents_set_mode(struct clock_event_device *dev, + enum clock_event_mode mode) +{ + if (dev->mode != mode) { + dev->set_mode(mode, dev); + dev->mode = mode; + } +} + +/** + * clockevents_program_event - Reprogram the clock event device. + * @expires: absolute expiry time (monotonic clock) + * + * Returns 0 on success, -ETIME when the event is in the past. + */ +int clockevents_program_event(struct clock_event_device *dev, ktime_t expires, + ktime_t now) +{ + unsigned long long clc; + int64_t delta; + + delta = ktime_to_ns(ktime_sub(expires, now)); + + if (delta <= 0) + return -ETIME; + + dev->next_event = expires; + + if (dev->mode == CLOCK_EVT_MODE_SHUTDOWN) + return 0; + + if (delta > dev->max_delta_ns) + delta = dev->max_delta_ns; + if (delta < dev->min_delta_ns) + delta = dev->min_delta_ns; + + clc = delta * dev->mult; + clc >>= dev->shift; + + return dev->set_next_event((unsigned long) clc, dev); +} + +/** + * clockevents_register_notifier - register a clock events change listener + */ +int clockevents_register_notifier(struct notifier_block *nb) +{ + int ret; + + spin_lock(&clockevents_lock); + ret = raw_notifier_chain_register(&clockevents_chain, nb); + spin_unlock(&clockevents_lock); + + return ret; +} + +/** + * clockevents_unregister_notifier - unregister a clock events change listener + */ +void clockevents_unregister_notifier(struct notifier_block *nb) +{ + spin_lock(&clockevents_lock); + raw_notifier_chain_unregister(&clockevents_chain, nb); + spin_unlock(&clockevents_lock); +} + +/* + * Notify about a clock event change. Called with clockevents_lock + * held. + */ +static void clockevents_do_notify(unsigned long reason, void *dev) +{ + raw_notifier_call_chain(&clockevents_chain, reason, dev); +} + +/* + * Called after a notify add to make devices availble which were + * released from the notifier call. + */ +static void clockevents_notify_released(void) +{ + struct clock_event_device *dev; + + while (!list_empty(&clockevents_released)) { + dev = list_entry(clockevents_released.next, + struct clock_event_device, list); + list_del(&dev->list); + list_add(&dev->list, &clockevent_devices); + clockevents_do_notify(CLOCK_EVT_NOTIFY_ADD, dev); + } +} + +/** + * clockevents_register_device - register a clock event device + * @dev: device to register + */ +void clockevents_register_device(struct clock_event_device *dev) +{ + BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED); + + spin_lock(&clockevents_lock); + + list_add(&dev->list, &clockevent_devices); + clockevents_do_notify(CLOCK_EVT_NOTIFY_ADD, dev); + clockevents_notify_released(); + + spin_unlock(&clockevents_lock); +} + +/* + * Noop handler when we shut down an event device + */ +static void clockevents_handle_noop(struct clock_event_device *dev) +{ +} + +/** + * clockevents_exchange_device - release and request clock devices + * @old: device to release (can be NULL) + * @new: device to request (can be NULL) + * + * Called from the notifier chain. clockevents_lock is held already + */ +void clockevents_exchange_device(struct clock_event_device *old, + struct clock_event_device *new) +{ + unsigned long flags; + + local_irq_save(flags); + /* + * Caller releases a clock event device. We queue it into the + * released list and do a notify add later. + */ + if (old) { + old->event_handler = clockevents_handle_noop; + clockevents_set_mode(old, CLOCK_EVT_MODE_UNUSED); + list_del(&old->list); + list_add(&old->list, &clockevents_released); + } + + if (new) { + BUG_ON(new->mode != CLOCK_EVT_MODE_UNUSED); + clockevents_set_mode(new, CLOCK_EVT_MODE_SHUTDOWN); + } + local_irq_restore(flags); +} + +/** + * clockevents_request_device + */ +struct clock_event_device *clockevents_request_device(unsigned int features, + cpumask_t cpumask) +{ + struct clock_event_device *cur, *dev = NULL; + struct list_head *tmp; + + spin_lock(&clockevents_lock); + + list_for_each(tmp, &clockevent_devices) { + cur = list_entry(tmp, struct clock_event_device, list); + + if ((cur->features & features) == features && + cpus_equal(cpumask, cur->cpumask)) { + if (!dev || dev->rating < cur->rating) + dev = cur; + } + } + + clockevents_exchange_device(NULL, dev); + + spin_unlock(&clockevents_lock); + + return dev; +} + +/** + * clockevents_release_device + */ +void clockevents_release_device(struct clock_event_device *dev) +{ + spin_lock(&clockevents_lock); + + clockevents_exchange_device(dev, NULL); + clockevents_notify_released(); + + spin_unlock(&clockevents_lock); +} + +/** + * clockevents_notify - notification about relevant events + */ +void clockevents_notify(unsigned long reason, void *arg) +{ + spin_lock(&clockevents_lock); + clockevents_do_notify(reason, arg); + + switch (reason) { + case CLOCK_EVT_NOTIFY_CPU_DEAD: + /* + * Unregister the clock event devices which were + * released from the users in the notify chain. + */ + while (!list_empty(&clockevents_released)) { + struct clock_event_device *dev; + + dev = list_entry(clockevents_released.next, + struct clock_event_device, list); + list_del(&dev->list); + } + break; + default: + break; + } + spin_unlock(&clockevents_lock); +} +EXPORT_SYMBOL_GPL(clockevents_notify); + +#ifdef CONFIG_SYSFS + +/** + * clockevents_show_registered - sysfs interface for listing clockevents + * @dev: unused + * @buf: char buffer to be filled with clock events list + * + * Provides sysfs interface for listing registered clock event devices + */ +static ssize_t clockevents_show_registered(struct sys_device *dev, char *buf) +{ + struct list_head *tmp; + char *p = buf; + int cpu; + + spin_lock(&clockevents_lock); + + list_for_each(tmp, &clockevent_devices) { + struct clock_event_device *ce; + + ce = list_entry(tmp, struct clock_event_device, list); + p += sprintf(p, "%-20s F:%04x M:%d", ce->name, + ce->features, ce->mode); + p += sprintf(p, " C:"); + if (!cpus_equal(ce->cpumask, cpu_possible_map)) { + for_each_cpu_mask(cpu, ce->cpumask) + p += sprintf(p, " %d", cpu); + } else { + /* + * FIXME: Add the cpu which is handling this sucker + */ + } + p += sprintf(p, "\n"); + } + + spin_unlock(&clockevents_lock); + + return p - buf; +} + +/* + * Sysfs setup bits: + */ +static SYSDEV_ATTR(registered, 0600, + clockevents_show_registered, NULL); + +static struct sysdev_class clockevents_sysclass = { + set_kset_name("clockevents"), +}; + +static struct sys_device clockevents_sys_device = { + .id = 0, + .cls = &clockevents_sysclass, +}; + +static int __init clockevents_sysfs_init(void) +{ + int error = sysdev_class_register(&clockevents_sysclass); + + if (!error) + error = sysdev_register(&clockevents_sys_device); + if (!error) + error = sysdev_create_file( + &clockevents_sys_device, + &attr_registered); + return error; +} +device_initcall(clockevents_sysfs_init); +#endif diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index d9ef176c4e09..193a0793af95 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -29,6 +29,7 @@ #include <linux/init.h> #include <linux/module.h> #include <linux/sched.h> /* for spin_unlock_irq() using preempt_count() m68k */ +#include <linux/tick.h> /* XXX - Would like a better way for initializing curr_clocksource */ extern struct clocksource clocksource_jiffies; @@ -48,6 +49,7 @@ extern struct clocksource clocksource_jiffies; */ static struct clocksource *curr_clocksource = &clocksource_jiffies; static struct clocksource *next_clocksource; +static struct clocksource *clocksource_override; static LIST_HEAD(clocksource_list); static DEFINE_SPINLOCK(clocksource_lock); static char override_name[32]; @@ -62,9 +64,123 @@ static int __init clocksource_done_booting(void) finished_booting = 1; return 0; } - late_initcall(clocksource_done_booting); +#ifdef CONFIG_CLOCKSOURCE_WATCHDOG +static LIST_HEAD(watchdog_list); +static struct clocksource *watchdog; +static struct timer_list watchdog_timer; +static DEFINE_SPINLOCK(watchdog_lock); +static cycle_t watchdog_last; +/* + * Interval: 0.5sec Treshold: 0.0625s + */ +#define WATCHDOG_INTERVAL (HZ >> 1) +#define WATCHDOG_TRESHOLD (NSEC_PER_SEC >> 4) + +static void clocksource_ratewd(struct clocksource *cs, int64_t delta) +{ + if (delta > -WATCHDOG_TRESHOLD && delta < WATCHDOG_TRESHOLD) + return; + + printk(KERN_WARNING "Clocksource %s unstable (delta = %Ld ns)\n", + cs->name, delta); + cs->flags &= ~(CLOCK_SOURCE_VALID_FOR_HRES | CLOCK_SOURCE_WATCHDOG); + clocksource_change_rating(cs, 0); + cs->flags &= ~CLOCK_SOURCE_WATCHDOG; + list_del(&cs->wd_list); +} + +static void clocksource_watchdog(unsigned long data) +{ + struct clocksource *cs, *tmp; + cycle_t csnow, wdnow; + int64_t wd_nsec, cs_nsec; + + spin_lock(&watchdog_lock); + + wdnow = watchdog->read(); + wd_nsec = cyc2ns(watchdog, (wdnow - watchdog_last) & watchdog->mask); + watchdog_last = wdnow; + + list_for_each_entry_safe(cs, tmp, &watchdog_list, wd_list) { + csnow = cs->read(); + /* Initialized ? */ + if (!(cs->flags & CLOCK_SOURCE_WATCHDOG)) { + if ((cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) && + (watchdog->flags & CLOCK_SOURCE_IS_CONTINUOUS)) { + cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES; + /* + * We just marked the clocksource as + * highres-capable, notify the rest of the + * system as well so that we transition + * into high-res mode: + */ + tick_clock_notify(); + } + cs->flags |= CLOCK_SOURCE_WATCHDOG; + cs->wd_last = csnow; + } else { + cs_nsec = cyc2ns(cs, (csnow - cs->wd_last) & cs->mask); + cs->wd_last = csnow; + /* Check the delta. Might remove from the list ! */ + clocksource_ratewd(cs, cs_nsec - wd_nsec); + } + } + + if (!list_empty(&watchdog_list)) { + __mod_timer(&watchdog_timer, + watchdog_timer.expires + WATCHDOG_INTERVAL); + } + spin_unlock(&watchdog_lock); +} +static void clocksource_check_watchdog(struct clocksource *cs) +{ + struct clocksource *cse; + unsigned long flags; + + spin_lock_irqsave(&watchdog_lock, flags); + if (cs->flags & CLOCK_SOURCE_MUST_VERIFY) { + int started = !list_empty(&watchdog_list); + + list_add(&cs->wd_list, &watchdog_list); + if (!started && watchdog) { + watchdog_last = watchdog->read(); + watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL; + add_timer(&watchdog_timer); + } + } else if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) { + cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES; + + if (!watchdog || cs->rating > watchdog->rating) { + if (watchdog) + del_timer(&watchdog_timer); + watchdog = cs; + init_timer(&watchdog_timer); + watchdog_timer.function = clocksource_watchdog; + + /* Reset watchdog cycles */ + list_for_each_entry(cse, &watchdog_list, wd_list) + cse->flags &= ~CLOCK_SOURCE_WATCHDOG; + /* Start if list is not empty */ + if (!list_empty(&watchdog_list)) { + watchdog_last = watchdog->read(); + watchdog_timer.expires = + jiffies + WATCHDOG_INTERVAL; + add_timer(&watchdog_timer); + } + } + } + spin_unlock_irqrestore(&watchdog_lock, flags); +} +#else +static void clocksource_check_watchdog(struct clocksource *cs) +{ + if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) + cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES; +} +#endif + /** * clocksource_get_next - Returns the selected clocksource * @@ -84,60 +200,54 @@ struct clocksource *clocksource_get_next(void) } /** - * select_clocksource - Finds the best registered clocksource. + * select_clocksource - Selects the best registered clocksource. * * Private function. Must hold clocksource_lock when called. * - * Looks through the list of registered clocksources, returning - * the one with the highest rating value. If there is a clocksource - * name that matches the override string, it returns that clocksource. + * Select the clocksource with the best rating, or the clocksource, + * which is selected by userspace override. */ static struct clocksource *select_clocksource(void) { - struct clocksource *best = NULL; - struct list_head *tmp; + struct clocksource *next; - list_for_each(tmp, &clocksource_list) { - struct clocksource *src; + if (list_empty(&clocksource_list)) + return NULL; - src = list_entry(tmp, struct clocksource, list); - if (!best) - best = src; - - /* check for override: */ - if (strlen(src->name) == strlen(override_name) && - !strcmp(src->name, override_name)) { - best = src; - break; - } - /* pick the highest rating: */ - if (src->rating > best->rating) - best = src; - } + if (clocksource_override) + next = clocksource_override; + else + next = list_entry(clocksource_list.next, struct clocksource, + list); + + if (next == curr_clocksource) + return NULL; - return best; + return next; } -/** - * is_registered_source - Checks if clocksource is registered - * @c: pointer to a clocksource - * - * Private helper function. Must hold clocksource_lock when called. - * - * Returns one if the clocksource is already registered, zero otherwise. +/* + * Enqueue the clocksource sorted by rating */ -static int is_registered_source(struct clocksource *c) +static int clocksource_enqueue(struct clocksource *c) { - int len = strlen(c->name); - struct list_head *tmp; + struct list_head *tmp, *entry = &clocksource_list; list_for_each(tmp, &clocksource_list) { - struct clocksource *src; - - src = list_entry(tmp, struct clocksource, list); - if (strlen(src->name) == len && !strcmp(src->name, c->name)) - return 1; + struct clocksource *cs; + + cs = list_entry(tmp, struct clocksource, list); + if (cs == c) + return -EBUSY; + /* Keep track of the place, where to insert */ + if (cs->rating >= c->rating) + entry = tmp; } + list_add(&c->list, entry); + + if (strlen(c->name) == strlen(override_name) && + !strcmp(c->name, override_name)) + clocksource_override = c; return 0; } @@ -150,42 +260,35 @@ static int is_registered_source(struct clocksource *c) */ int clocksource_register(struct clocksource *c) { - int ret = 0; unsigned long flags; + int ret; spin_lock_irqsave(&clocksource_lock, flags); - /* check if clocksource is already registered */ - if (is_registered_source(c)) { - printk("register_clocksource: Cannot register %s. " - "Already registered!", c->name); - ret = -EBUSY; - } else { - /* register it */ - list_add(&c->list, &clocksource_list); - /* scan the registered clocksources, and pick the best one */ + ret = clocksource_enqueue(c); + if (!ret) next_clocksource = select_clocksource(); - } spin_unlock_irqrestore(&clocksource_lock, flags); + if (!ret) + clocksource_check_watchdog(c); return ret; } EXPORT_SYMBOL(clocksource_register); /** - * clocksource_reselect - Rescan list for next clocksource + * clocksource_change_rating - Change the rating of a registered clocksource * - * A quick helper function to be used if a clocksource changes its - * rating. Forces the clocksource list to be re-scanned for the best - * clocksource. */ -void clocksource_reselect(void) +void clocksource_change_rating(struct clocksource *cs, int rating) { unsigned long flags; spin_lock_irqsave(&clocksource_lock, flags); + list_del(&cs->list); + cs->rating = rating; + clocksource_enqueue(cs); next_clocksource = select_clocksource(); spin_unlock_irqrestore(&clocksource_lock, flags); } -EXPORT_SYMBOL(clocksource_reselect); #ifdef CONFIG_SYSFS /** @@ -221,7 +324,11 @@ sysfs_show_current_clocksources(struct sys_device *dev, char *buf) static ssize_t sysfs_override_clocksource(struct sys_device *dev, const char *buf, size_t count) { + struct clocksource *ovr = NULL; + struct list_head *tmp; size_t ret = count; + int len; + /* strings from sysfs write are not 0 terminated! */ if (count >= sizeof(override_name)) return -EINVAL; @@ -229,17 +336,32 @@ static ssize_t sysfs_override_clocksource(struct sys_device *dev, /* strip of \n: */ if (buf[count-1] == '\n') count--; - if (count < 1) - return -EINVAL; spin_lock_irq(&clocksource_lock); - /* copy the name given: */ - memcpy(override_name, buf, count); + if (count > 0) + memcpy(override_name, buf, count); override_name[count] = 0; - /* try to select it: */ - next_clocksource = select_clocksource(); + len = strlen(override_name); + if (len) { + ovr = clocksource_override; + /* try to select it: */ + list_for_each(tmp, &clocksource_list) { + struct clocksource *cs; + + cs = list_entry(tmp, struct clocksource, list); + if (strlen(cs->name) == len && + !strcmp(cs->name, override_name)) + ovr = cs; + } + } + + /* Reselect, when the override name has changed */ + if (ovr != clocksource_override) { + clocksource_override = ovr; + next_clocksource = select_clocksource(); + } spin_unlock_irq(&clocksource_lock); diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c index a99b2a6e6a07..3be8da8fed7e 100644 --- a/kernel/time/jiffies.c +++ b/kernel/time/jiffies.c @@ -62,7 +62,6 @@ struct clocksource clocksource_jiffies = { .mask = 0xffffffff, /*32bits*/ .mult = NSEC_PER_JIFFY << JIFFIES_SHIFT, /* details above */ .shift = JIFFIES_SHIFT, - .is_continuous = 0, /* tick based, not free running */ }; static int __init init_jiffies_clocksource(void) diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 3afeaa3a73f9..eb12509e00bd 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -24,7 +24,7 @@ static u64 tick_length, tick_length_base; #define MAX_TICKADJ 500 /* microsecs */ #define MAX_TICKADJ_SCALED (((u64)(MAX_TICKADJ * NSEC_PER_USEC) << \ - TICK_LENGTH_SHIFT) / HZ) + TICK_LENGTH_SHIFT) / NTP_INTERVAL_FREQ) /* * phase-lock loop variables @@ -46,13 +46,17 @@ long time_adjust; static void ntp_update_frequency(void) { - tick_length_base = (u64)(tick_usec * NSEC_PER_USEC * USER_HZ) << TICK_LENGTH_SHIFT; - tick_length_base += (s64)CLOCK_TICK_ADJUST << TICK_LENGTH_SHIFT; - tick_length_base += (s64)time_freq << (TICK_LENGTH_SHIFT - SHIFT_NSEC); + u64 second_length = (u64)(tick_usec * NSEC_PER_USEC * USER_HZ) + << TICK_LENGTH_SHIFT; + second_length += (s64)CLOCK_TICK_ADJUST << TICK_LENGTH_SHIFT; + second_length += (s64)time_freq << (TICK_LENGTH_SHIFT - SHIFT_NSEC); - do_div(tick_length_base, HZ); + tick_length_base = second_length; - tick_nsec = tick_length_base >> TICK_LENGTH_SHIFT; + do_div(second_length, HZ); + tick_nsec = second_length >> TICK_LENGTH_SHIFT; + + do_div(tick_length_base, NTP_INTERVAL_FREQ); } /** @@ -162,7 +166,7 @@ void second_overflow(void) tick_length -= MAX_TICKADJ_SCALED; } else { tick_length += (s64)(time_adjust * NSEC_PER_USEC / - HZ) << TICK_LENGTH_SHIFT; + NTP_INTERVAL_FREQ) << TICK_LENGTH_SHIFT; time_adjust = 0; } } @@ -239,7 +243,8 @@ int do_adjtimex(struct timex *txc) result = -EINVAL; goto leave; } - time_freq = ((s64)txc->freq * NSEC_PER_USEC) >> (SHIFT_USEC - SHIFT_NSEC); + time_freq = ((s64)txc->freq * NSEC_PER_USEC) + >> (SHIFT_USEC - SHIFT_NSEC); } if (txc->modes & ADJ_MAXERROR) { @@ -309,7 +314,8 @@ int do_adjtimex(struct timex *txc) freq_adj += time_freq; freq_adj = min(freq_adj, (s64)MAXFREQ_NSEC); time_freq = max(freq_adj, (s64)-MAXFREQ_NSEC); - time_offset = (time_offset / HZ) << SHIFT_UPDATE; + time_offset = (time_offset / NTP_INTERVAL_FREQ) + << SHIFT_UPDATE; } /* STA_PLL */ } /* txc->modes & ADJ_OFFSET */ if (txc->modes & ADJ_TICK) @@ -324,8 +330,10 @@ leave: if ((time_status & (STA_UNSYNC|STA_CLOCKERR)) != 0) if ((txc->modes & ADJ_OFFSET_SINGLESHOT) == ADJ_OFFSET_SINGLESHOT) txc->offset = save_adjust; else - txc->offset = shift_right(time_offset, SHIFT_UPDATE) * HZ / 1000; - txc->freq = (time_freq / NSEC_PER_USEC) << (SHIFT_USEC - SHIFT_NSEC); + txc->offset = shift_right(time_offset, SHIFT_UPDATE) + * NTP_INTERVAL_FREQ / 1000; + txc->freq = (time_freq / NSEC_PER_USEC) + << (SHIFT_USEC - SHIFT_NSEC); txc->maxerror = time_maxerror; txc->esterror = time_esterror; txc->status = time_status; diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c new file mode 100644 index 000000000000..12b3efeb9f6f --- /dev/null +++ b/kernel/time/tick-broadcast.c @@ -0,0 +1,480 @@ +/* + * linux/kernel/time/tick-broadcast.c + * + * This file contains functions which emulate a local clock-event + * device via a broadcast event source. + * + * Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de> + * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar + * Copyright(C) 2006-2007, Timesys Corp., Thomas Gleixner + * + * This code is licenced under the GPL version 2. For details see + * kernel-base/COPYING. + */ +#include <linux/cpu.h> +#include <linux/err.h> +#include <linux/hrtimer.h> +#include <linux/irq.h> +#include <linux/percpu.h> +#include <linux/profile.h> +#include <linux/sched.h> +#include <linux/tick.h> + +#include "tick-internal.h" + +/* + * Broadcast support for broken x86 hardware, where the local apic + * timer stops in C3 state. + */ + +struct tick_device tick_broadcast_device; +static cpumask_t tick_broadcast_mask; +static DEFINE_SPINLOCK(tick_broadcast_lock); + +/* + * Debugging: see timer_list.c + */ +struct tick_device *tick_get_broadcast_device(void) +{ + return &tick_broadcast_device; +} + +cpumask_t *tick_get_broadcast_mask(void) +{ + return &tick_broadcast_mask; +} + +/* + * Start the device in periodic mode + */ +static void tick_broadcast_start_periodic(struct clock_event_device *bc) +{ + if (bc && bc->mode == CLOCK_EVT_MODE_SHUTDOWN) + tick_setup_periodic(bc, 1); +} + +/* + * Check, if the device can be utilized as broadcast device: + */ +int tick_check_broadcast_device(struct clock_event_device *dev) +{ + if (tick_broadcast_device.evtdev || + (dev->features & CLOCK_EVT_FEAT_C3STOP)) + return 0; + + clockevents_exchange_device(NULL, dev); + tick_broadcast_device.evtdev = dev; + if (!cpus_empty(tick_broadcast_mask)) + tick_broadcast_start_periodic(dev); + return 1; +} + +/* + * Check, if the device is the broadcast device + */ +int tick_is_broadcast_device(struct clock_event_device *dev) +{ + return (dev && tick_broadcast_device.evtdev == dev); +} + +/* + * Check, if the device is disfunctional and a place holder, which + * needs to be handled by the broadcast device. + */ +int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu) +{ + unsigned long flags; + int ret = 0; + + spin_lock_irqsave(&tick_broadcast_lock, flags); + + /* + * Devices might be registered with both periodic and oneshot + * mode disabled. This signals, that the device needs to be + * operated from the broadcast device and is a placeholder for + * the cpu local device. + */ + if (!tick_device_is_functional(dev)) { + dev->event_handler = tick_handle_periodic; + cpu_set(cpu, tick_broadcast_mask); + tick_broadcast_start_periodic(tick_broadcast_device.evtdev); + ret = 1; + } + + spin_unlock_irqrestore(&tick_broadcast_lock, flags); + return ret; +} + +/* + * Broadcast the event to the cpus, which are set in the mask + */ +int tick_do_broadcast(cpumask_t mask) +{ + int ret = 0, cpu = smp_processor_id(); + struct tick_device *td; + + /* + * Check, if the current cpu is in the mask + */ + if (cpu_isset(cpu, mask)) { + cpu_clear(cpu, mask); + td = &per_cpu(tick_cpu_device, cpu); + td->evtdev->event_handler(td->evtdev); + ret = 1; + } + + if (!cpus_empty(mask)) { + /* + * It might be necessary to actually check whether the devices + * have different broadcast functions. For now, just use the + * one of the first device. This works as long as we have this + * misfeature only on x86 (lapic) + */ + cpu = first_cpu(mask); + td = &per_cpu(tick_cpu_device, cpu); + td->evtdev->broadcast(mask); + ret = 1; + } + return ret; +} + +/* + * Periodic broadcast: + * - invoke the broadcast handlers + */ +static void tick_do_periodic_broadcast(void) +{ + cpumask_t mask; + + spin_lock(&tick_broadcast_lock); + + cpus_and(mask, cpu_online_map, tick_broadcast_mask); + tick_do_broadcast(mask); + + spin_unlock(&tick_broadcast_lock); +} + +/* + * Event handler for periodic broadcast ticks + */ +static void tick_handle_periodic_broadcast(struct clock_event_device *dev) +{ + dev->next_event.tv64 = KTIME_MAX; + + tick_do_periodic_broadcast(); + + /* + * The device is in periodic mode. No reprogramming necessary: + */ + if (dev->mode == CLOCK_EVT_MODE_PERIODIC) + return; + + /* + * Setup the next period for devices, which do not have + * periodic mode: + */ + for (;;) { + ktime_t next = ktime_add(dev->next_event, tick_period); + + if (!clockevents_program_event(dev, next, ktime_get())) + return; + tick_do_periodic_broadcast(); + } +} + +/* + * Powerstate information: The system enters/leaves a state, where + * affected devices might stop + */ +static void tick_do_broadcast_on_off(void *why) +{ + struct clock_event_device *bc, *dev; + struct tick_device *td; + unsigned long flags, *reason = why; + int cpu; + + spin_lock_irqsave(&tick_broadcast_lock, flags); + + cpu = smp_processor_id(); + td = &per_cpu(tick_cpu_device, cpu); + dev = td->evtdev; + bc = tick_broadcast_device.evtdev; + + /* + * Is the device in broadcast mode forever or is it not + * affected by the powerstate ? + */ + if (!dev || !tick_device_is_functional(dev) || + !(dev->features & CLOCK_EVT_FEAT_C3STOP)) + goto out; + + if (*reason == CLOCK_EVT_NOTIFY_BROADCAST_ON) { + if (!cpu_isset(cpu, tick_broadcast_mask)) { + cpu_set(cpu, tick_broadcast_mask); + if (td->mode == TICKDEV_MODE_PERIODIC) + clockevents_set_mode(dev, + CLOCK_EVT_MODE_SHUTDOWN); + } + } else { + if (cpu_isset(cpu, tick_broadcast_mask)) { + cpu_clear(cpu, tick_broadcast_mask); + if (td->mode == TICKDEV_MODE_PERIODIC) + tick_setup_periodic(dev, 0); + } + } + + if (cpus_empty(tick_broadcast_mask)) + clockevents_set_mode(bc, CLOCK_EVT_MODE_SHUTDOWN); + else { + if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) + tick_broadcast_start_periodic(bc); + else + tick_broadcast_setup_oneshot(bc); + } +out: + spin_unlock_irqrestore(&tick_broadcast_lock, flags); +} + +/* + * Powerstate information: The system enters/leaves a state, where + * affected devices might stop. + */ +void tick_broadcast_on_off(unsigned long reason, int *oncpu) +{ + int cpu = get_cpu(); + + if (cpu == *oncpu) + tick_do_broadcast_on_off(&reason); + else + smp_call_function_single(*oncpu, tick_do_broadcast_on_off, + &reason, 1, 1); + put_cpu(); +} + +/* + * Set the periodic handler depending on broadcast on/off + */ +void tick_set_periodic_handler(struct clock_event_device *dev, int broadcast) +{ + if (!broadcast) + dev->event_handler = tick_handle_periodic; + else + dev->event_handler = tick_handle_periodic_broadcast; +} + +/* + * Remove a CPU from broadcasting + */ +void tick_shutdown_broadcast(unsigned int *cpup) +{ + struct clock_event_device *bc; + unsigned long flags; + unsigned int cpu = *cpup; + + spin_lock_irqsave(&tick_broadcast_lock, flags); + + bc = tick_broadcast_device.evtdev; + cpu_clear(cpu, tick_broadcast_mask); + + if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) { + if (bc && cpus_empty(tick_broadcast_mask)) + clockevents_set_mode(bc, CLOCK_EVT_MODE_SHUTDOWN); + } + + spin_unlock_irqrestore(&tick_broadcast_lock, flags); +} + +#ifdef CONFIG_TICK_ONESHOT + +static cpumask_t tick_broadcast_oneshot_mask; + +/* + * Debugging: see timer_list.c + */ +cpumask_t *tick_get_broadcast_oneshot_mask(void) +{ + return &tick_broadcast_oneshot_mask; +} + +static int tick_broadcast_set_event(ktime_t expires, int force) +{ + struct clock_event_device *bc = tick_broadcast_device.evtdev; + ktime_t now = ktime_get(); + int res; + + for(;;) { + res = clockevents_program_event(bc, expires, now); + if (!res || !force) + return res; + now = ktime_get(); + expires = ktime_add(now, ktime_set(0, bc->min_delta_ns)); + } +} + +/* + * Reprogram the broadcast device: + * + * Called with tick_broadcast_lock held and interrupts disabled. + */ +static int tick_broadcast_reprogram(void) +{ + ktime_t expires = { .tv64 = KTIME_MAX }; + struct tick_device *td; + int cpu; + + /* + * Find the event which expires next: + */ + for (cpu = first_cpu(tick_broadcast_oneshot_mask); cpu != NR_CPUS; + cpu = next_cpu(cpu, tick_broadcast_oneshot_mask)) { + td = &per_cpu(tick_cpu_device, cpu); + if (td->evtdev->next_event.tv64 < expires.tv64) + expires = td->evtdev->next_event; + } + + if (expires.tv64 == KTIME_MAX) + return 0; + + return tick_broadcast_set_event(expires, 0); +} + +/* + * Handle oneshot mode broadcasting + */ +static void tick_handle_oneshot_broadcast(struct clock_event_device *dev) +{ + struct tick_device *td; + cpumask_t mask; + ktime_t now; + int cpu; + + spin_lock(&tick_broadcast_lock); +again: + dev->next_event.tv64 = KTIME_MAX; + mask = CPU_MASK_NONE; + now = ktime_get(); + /* Find all expired events */ + for (cpu = first_cpu(tick_broadcast_oneshot_mask); cpu != NR_CPUS; + cpu = next_cpu(cpu, tick_broadcast_oneshot_mask)) { + td = &per_cpu(tick_cpu_device, cpu); + if (td->evtdev->next_event.tv64 <= now.tv64) + cpu_set(cpu, mask); + } + + /* + * Wakeup the cpus which have an expired event. The broadcast + * device is reprogrammed in the return from idle code. + */ + if (!tick_do_broadcast(mask)) { + /* + * The global event did not expire any CPU local + * events. This happens in dyntick mode, as the + * maximum PIT delta is quite small. + */ + if (tick_broadcast_reprogram()) + goto again; + } + spin_unlock(&tick_broadcast_lock); +} + +/* + * Powerstate information: The system enters/leaves a state, where + * affected devices might stop + */ +void tick_broadcast_oneshot_control(unsigned long reason) +{ + struct clock_event_device *bc, *dev; + struct tick_device *td; + unsigned long flags; + int cpu; + + spin_lock_irqsave(&tick_broadcast_lock, flags); + + /* + * Periodic mode does not care about the enter/exit of power + * states + */ + if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) + goto out; + + bc = tick_broadcast_device.evtdev; + cpu = smp_processor_id(); + td = &per_cpu(tick_cpu_device, cpu); + dev = td->evtdev; + + if (!(dev->features & CLOCK_EVT_FEAT_C3STOP)) + goto out; + + if (reason == CLOCK_EVT_NOTIFY_BROADCAST_ENTER) { + if (!cpu_isset(cpu, tick_broadcast_oneshot_mask)) { + cpu_set(cpu, tick_broadcast_oneshot_mask); + clockevents_set_mode(dev, CLOCK_EVT_MODE_SHUTDOWN); + if (dev->next_event.tv64 < bc->next_event.tv64) + tick_broadcast_set_event(dev->next_event, 1); + } + } else { + if (cpu_isset(cpu, tick_broadcast_oneshot_mask)) { + cpu_clear(cpu, tick_broadcast_oneshot_mask); + clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT); + if (dev->next_event.tv64 != KTIME_MAX) + tick_program_event(dev->next_event, 1); + } + } + +out: + spin_unlock_irqrestore(&tick_broadcast_lock, flags); +} + +/** + * tick_broadcast_setup_highres - setup the broadcast device for highres + */ +void tick_broadcast_setup_oneshot(struct clock_event_device *bc) +{ + if (bc->mode != CLOCK_EVT_MODE_ONESHOT) { + bc->event_handler = tick_handle_oneshot_broadcast; + clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT); + bc->next_event.tv64 = KTIME_MAX; + } +} + +/* + * Select oneshot operating mode for the broadcast device + */ +void tick_broadcast_switch_to_oneshot(void) +{ + struct clock_event_device *bc; + unsigned long flags; + + spin_lock_irqsave(&tick_broadcast_lock, flags); + + tick_broadcast_device.mode = TICKDEV_MODE_ONESHOT; + bc = tick_broadcast_device.evtdev; + if (bc) + tick_broadcast_setup_oneshot(bc); + spin_unlock_irqrestore(&tick_broadcast_lock, flags); +} + + +/* + * Remove a dead CPU from broadcasting + */ +void tick_shutdown_broadcast_oneshot(unsigned int *cpup) +{ + struct clock_event_device *bc; + unsigned long flags; + unsigned int cpu = *cpup; + + spin_lock_irqsave(&tick_broadcast_lock, flags); + + bc = tick_broadcast_device.evtdev; + cpu_clear(cpu, tick_broadcast_oneshot_mask); + + if (tick_broadcast_device.mode == TICKDEV_MODE_ONESHOT) { + if (bc && cpus_empty(tick_broadcast_oneshot_mask)) + clockevents_set_mode(bc, CLOCK_EVT_MODE_SHUTDOWN); + } + + spin_unlock_irqrestore(&tick_broadcast_lock, flags); +} + +#endif diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c new file mode 100644 index 000000000000..4500e347f1bb --- /dev/null +++ b/kernel/time/tick-common.c @@ -0,0 +1,346 @@ +/* + * linux/kernel/time/tick-common.c + * + * This file contains the base functions to manage periodic tick + * related events. + * + * Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de> + * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar + * Copyright(C) 2006-2007, Timesys Corp., Thomas Gleixner + * + * This code is licenced under the GPL version 2. For details see + * kernel-base/COPYING. + */ +#include <linux/cpu.h> +#include <linux/err.h> +#include <linux/hrtimer.h> +#include <linux/irq.h> +#include <linux/percpu.h> +#include <linux/profile.h> +#include <linux/sched.h> +#include <linux/tick.h> + +#include "tick-internal.h" + +/* + * Tick devices + */ +DEFINE_PER_CPU(struct tick_device, tick_cpu_device); +/* + * Tick next event: keeps track of the tick time + */ +ktime_t tick_next_period; +ktime_t tick_period; +static int tick_do_timer_cpu = -1; +DEFINE_SPINLOCK(tick_device_lock); + +/* + * Debugging: see timer_list.c + */ +struct tick_device *tick_get_device(int cpu) +{ + return &per_cpu(tick_cpu_device, cpu); +} + +/** + * tick_is_oneshot_available - check for a oneshot capable event device + */ +int tick_is_oneshot_available(void) +{ + struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; + + return dev && (dev->features & CLOCK_EVT_FEAT_ONESHOT); +} + +/* + * Periodic tick + */ +static void tick_periodic(int cpu) +{ + if (tick_do_timer_cpu == cpu) { + write_seqlock(&xtime_lock); + + /* Keep track of the next tick event */ + tick_next_period = ktime_add(tick_next_period, tick_period); + + do_timer(1); + write_sequnlock(&xtime_lock); + } + + update_process_times(user_mode(get_irq_regs())); + profile_tick(CPU_PROFILING); +} + +/* + * Event handler for periodic ticks + */ +void tick_handle_periodic(struct clock_event_device *dev) +{ + int cpu = smp_processor_id(); + + tick_periodic(cpu); + + if (dev->mode != CLOCK_EVT_MODE_ONESHOT) + return; + /* + * Setup the next period for devices, which do not have + * periodic mode: + */ + for (;;) { + ktime_t next = ktime_add(dev->next_event, tick_period); + + if (!clockevents_program_event(dev, next, ktime_get())) + return; + tick_periodic(cpu); + } +} + +/* + * Setup the device for a periodic tick + */ +void tick_setup_periodic(struct clock_event_device *dev, int broadcast) +{ + tick_set_periodic_handler(dev, broadcast); + + /* Broadcast setup ? */ + if (!tick_device_is_functional(dev)) + return; + + if (dev->features & CLOCK_EVT_FEAT_PERIODIC) { + clockevents_set_mode(dev, CLOCK_EVT_MODE_PERIODIC); + } else { + unsigned long seq; + ktime_t next; + + do { + seq = read_seqbegin(&xtime_lock); + next = tick_next_period; + } while (read_seqretry(&xtime_lock, seq)); + + clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT); + + for (;;) { + if (!clockevents_program_event(dev, next, ktime_get())) + return; + next = ktime_add(next, tick_period); + } + } +} + +/* + * Setup the tick device + */ +static void tick_setup_device(struct tick_device *td, + struct clock_event_device *newdev, int cpu, + cpumask_t cpumask) +{ + ktime_t next_event; + void (*handler)(struct clock_event_device *) = NULL; + + /* + * First device setup ? + */ + if (!td->evtdev) { + /* + * If no cpu took the do_timer update, assign it to + * this cpu: + */ + if (tick_do_timer_cpu == -1) { + tick_do_timer_cpu = cpu; + tick_next_period = ktime_get(); + tick_period = ktime_set(0, NSEC_PER_SEC / HZ); + } + + /* + * Startup in periodic mode first. + */ + td->mode = TICKDEV_MODE_PERIODIC; + } else { + handler = td->evtdev->event_handler; + next_event = td->evtdev->next_event; + } + + td->evtdev = newdev; + + /* + * When the device is not per cpu, pin the interrupt to the + * current cpu: + */ + if (!cpus_equal(newdev->cpumask, cpumask)) + irq_set_affinity(newdev->irq, cpumask); + + /* + * When global broadcasting is active, check if the current + * device is registered as a placeholder for broadcast mode. + * This allows us to handle this x86 misfeature in a generic + * way. + */ + if (tick_device_uses_broadcast(newdev, cpu)) + return; + + if (td->mode == TICKDEV_MODE_PERIODIC) + tick_setup_periodic(newdev, 0); + else + tick_setup_oneshot(newdev, handler, next_event); +} + +/* + * Check, if the new registered device should be used. + */ +static int tick_check_new_device(struct clock_event_device *newdev) +{ + struct clock_event_device *curdev; + struct tick_device *td; + int cpu, ret = NOTIFY_OK; + unsigned long flags; + cpumask_t cpumask; + + spin_lock_irqsave(&tick_device_lock, flags); + + cpu = smp_processor_id(); + if (!cpu_isset(cpu, newdev->cpumask)) + goto out; + + td = &per_cpu(tick_cpu_device, cpu); + curdev = td->evtdev; + cpumask = cpumask_of_cpu(cpu); + + /* cpu local device ? */ + if (!cpus_equal(newdev->cpumask, cpumask)) { + + /* + * If the cpu affinity of the device interrupt can not + * be set, ignore it. + */ + if (!irq_can_set_affinity(newdev->irq)) + goto out_bc; + + /* + * If we have a cpu local device already, do not replace it + * by a non cpu local device + */ + if (curdev && cpus_equal(curdev->cpumask, cpumask)) + goto out_bc; + } + + /* + * If we have an active device, then check the rating and the oneshot + * feature. + */ + if (curdev) { + /* + * Prefer one shot capable devices ! + */ + if ((curdev->features & CLOCK_EVT_FEAT_ONESHOT) && + !(newdev->features & CLOCK_EVT_FEAT_ONESHOT)) + goto out_bc; + /* + * Check the rating + */ + if (curdev->rating >= newdev->rating) + goto out_bc; + } + + /* + * Replace the eventually existing device by the new + * device. If the current device is the broadcast device, do + * not give it back to the clockevents layer ! + */ + if (tick_is_broadcast_device(curdev)) { + clockevents_set_mode(curdev, CLOCK_EVT_MODE_SHUTDOWN); + curdev = NULL; + } + clockevents_exchange_device(curdev, newdev); + tick_setup_device(td, newdev, cpu, cpumask); + if (newdev->features & CLOCK_EVT_FEAT_ONESHOT) + tick_oneshot_notify(); + + spin_unlock_irqrestore(&tick_device_lock, flags); + return NOTIFY_STOP; + +out_bc: + /* + * Can the new device be used as a broadcast device ? + */ + if (tick_check_broadcast_device(newdev)) + ret = NOTIFY_STOP; +out: + spin_unlock_irqrestore(&tick_device_lock, flags); + + return ret; +} + +/* + * Shutdown an event device on a given cpu: + * + * This is called on a life CPU, when a CPU is dead. So we cannot + * access the hardware device itself. + * We just set the mode and remove it from the lists. + */ +static void tick_shutdown(unsigned int *cpup) +{ + struct tick_device *td = &per_cpu(tick_cpu_device, *cpup); + struct clock_event_device *dev = td->evtdev; + unsigned long flags; + + spin_lock_irqsave(&tick_device_lock, flags); + td->mode = TICKDEV_MODE_PERIODIC; + if (dev) { + /* + * Prevent that the clock events layer tries to call + * the set mode function! + */ + dev->mode = CLOCK_EVT_MODE_UNUSED; + clockevents_exchange_device(dev, NULL); + td->evtdev = NULL; + } + spin_unlock_irqrestore(&tick_device_lock, flags); +} + +/* + * Notification about clock event devices + */ +static int tick_notify(struct notifier_block *nb, unsigned long reason, + void *dev) +{ + switch (reason) { + + case CLOCK_EVT_NOTIFY_ADD: + return tick_check_new_device(dev); + + case CLOCK_EVT_NOTIFY_BROADCAST_ON: + case CLOCK_EVT_NOTIFY_BROADCAST_OFF: + tick_broadcast_on_off(reason, dev); + break; + + case CLOCK_EVT_NOTIFY_BROADCAST_ENTER: + case CLOCK_EVT_NOTIFY_BROADCAST_EXIT: + tick_broadcast_oneshot_control(reason); + break; + + case CLOCK_EVT_NOTIFY_CPU_DEAD: + tick_shutdown_broadcast_oneshot(dev); + tick_shutdown_broadcast(dev); + tick_shutdown(dev); + break; + + default: + break; + } + + return NOTIFY_OK; +} + +static struct notifier_block tick_notifier = { + .notifier_call = tick_notify, +}; + +/** + * tick_init - initialize the tick control + * + * Register the notifier with the clockevents framework + */ +void __init tick_init(void) +{ + clockevents_register_notifier(&tick_notifier); +} diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h new file mode 100644 index 000000000000..54861a0f29ff --- /dev/null +++ b/kernel/time/tick-internal.h @@ -0,0 +1,110 @@ +/* + * tick internal variable and functions used by low/high res code + */ +DECLARE_PER_CPU(struct tick_device, tick_cpu_device); +extern spinlock_t tick_device_lock; +extern ktime_t tick_next_period; +extern ktime_t tick_period; + +extern void tick_setup_periodic(struct clock_event_device *dev, int broadcast); +extern void tick_handle_periodic(struct clock_event_device *dev); + +/* + * NO_HZ / high resolution timer shared code + */ +#ifdef CONFIG_TICK_ONESHOT +extern void tick_setup_oneshot(struct clock_event_device *newdev, + void (*handler)(struct clock_event_device *), + ktime_t nextevt); +extern int tick_program_event(ktime_t expires, int force); +extern void tick_oneshot_notify(void); +extern int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *)); + +# ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST +extern void tick_broadcast_setup_oneshot(struct clock_event_device *bc); +extern void tick_broadcast_oneshot_control(unsigned long reason); +extern void tick_broadcast_switch_to_oneshot(void); +extern void tick_shutdown_broadcast_oneshot(unsigned int *cpup); +# else /* BROADCAST */ +static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc) +{ + BUG(); +} +static inline void tick_broadcast_oneshot_control(unsigned long reason) { } +static inline void tick_broadcast_switch_to_oneshot(void) { } +static inline void tick_shutdown_broadcast_oneshot(unsigned int *cpup) { } +# endif /* !BROADCAST */ + +#else /* !ONESHOT */ +static inline +void tick_setup_oneshot(struct clock_event_device *newdev, + void (*handler)(struct clock_event_device *), + ktime_t nextevt) +{ + BUG(); +} +static inline int tick_program_event(ktime_t expires, int force) +{ + return 0; +} +static inline void tick_oneshot_notify(void) { } +static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc) +{ + BUG(); +} +static inline void tick_broadcast_oneshot_control(unsigned long reason) { } +static inline void tick_shutdown_broadcast_oneshot(unsigned int *cpup) { } +#endif /* !TICK_ONESHOT */ + +/* + * Broadcasting support + */ +#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST +extern int tick_do_broadcast(cpumask_t mask); + +extern int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu); +extern int tick_check_broadcast_device(struct clock_event_device *dev); +extern int tick_is_broadcast_device(struct clock_event_device *dev); +extern void tick_broadcast_on_off(unsigned long reason, int *oncpu); +extern void tick_shutdown_broadcast(unsigned int *cpup); + +extern void +tick_set_periodic_handler(struct clock_event_device *dev, int broadcast); + +#else /* !BROADCAST */ + +static inline int tick_check_broadcast_device(struct clock_event_device *dev) +{ + return 0; +} + +static inline int tick_is_broadcast_device(struct clock_event_device *dev) +{ + return 0; +} +static inline int tick_device_uses_broadcast(struct clock_event_device *dev, + int cpu) +{ + return 0; +} +static inline void tick_do_periodic_broadcast(struct clock_event_device *d) { } +static inline void tick_broadcast_on_off(unsigned long reason, int *oncpu) { } +static inline void tick_shutdown_broadcast(unsigned int *cpup) { } + +/* + * Set the periodic handler in non broadcast mode + */ +static inline void tick_set_periodic_handler(struct clock_event_device *dev, + int broadcast) +{ + dev->event_handler = tick_handle_periodic; +} +#endif /* !BROADCAST */ + +/* + * Check, if the device is functional or a dummy for broadcast + */ +static inline int tick_device_is_functional(struct clock_event_device *dev) +{ + return !(dev->features & CLOCK_EVT_FEAT_DUMMY); +} diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c new file mode 100644 index 000000000000..2e8b7ff863cc --- /dev/null +++ b/kernel/time/tick-oneshot.c @@ -0,0 +1,84 @@ +/* + * linux/kernel/time/tick-oneshot.c + * + * This file contains functions which manage high resolution tick + * related events. + * + * Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de> + * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar + * Copyright(C) 2006-2007, Timesys Corp., Thomas Gleixner + * + * This code is licenced under the GPL version 2. For details see + * kernel-base/COPYING. + */ +#include <linux/cpu.h> +#include <linux/err.h> +#include <linux/hrtimer.h> +#include <linux/irq.h> +#include <linux/percpu.h> +#include <linux/profile.h> +#include <linux/sched.h> +#include <linux/tick.h> + +#include "tick-internal.h" + +/** + * tick_program_event + */ +int tick_program_event(ktime_t expires, int force) +{ + struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; + ktime_t now = ktime_get(); + + while (1) { + int ret = clockevents_program_event(dev, expires, now); + + if (!ret || !force) + return ret; + now = ktime_get(); + expires = ktime_add(now, ktime_set(0, dev->min_delta_ns)); + } +} + +/** + * tick_setup_oneshot - setup the event device for oneshot mode (hres or nohz) + */ +void tick_setup_oneshot(struct clock_event_device *newdev, + void (*handler)(struct clock_event_device *), + ktime_t next_event) +{ + newdev->event_handler = handler; + clockevents_set_mode(newdev, CLOCK_EVT_MODE_ONESHOT); + clockevents_program_event(newdev, next_event, ktime_get()); +} + +/** + * tick_switch_to_oneshot - switch to oneshot mode + */ +int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *)) +{ + struct tick_device *td = &__get_cpu_var(tick_cpu_device); + struct clock_event_device *dev = td->evtdev; + + if (!dev || !(dev->features & CLOCK_EVT_FEAT_ONESHOT) || + !tick_device_is_functional(dev)) + return -EINVAL; + + td->mode = TICKDEV_MODE_ONESHOT; + dev->event_handler = handler; + clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT); + tick_broadcast_switch_to_oneshot(); + return 0; +} + +#ifdef CONFIG_HIGH_RES_TIMERS +/** + * tick_init_highres - switch to high resolution mode + * + * Called with interrupts disabled. + */ +int tick_init_highres(void) +{ + return tick_switch_to_oneshot(hrtimer_interrupt); +} +#endif diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c new file mode 100644 index 000000000000..95e41f7f850b --- /dev/null +++ b/kernel/time/tick-sched.c @@ -0,0 +1,563 @@ +/* + * linux/kernel/time/tick-sched.c + * + * Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de> + * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar + * Copyright(C) 2006-2007 Timesys Corp., Thomas Gleixner + * + * No idle tick implementation for low and high resolution timers + * + * Started by: Thomas Gleixner and Ingo Molnar + * + * For licencing details see kernel-base/COPYING + */ +#include <linux/cpu.h> +#include <linux/err.h> +#include <linux/hrtimer.h> +#include <linux/interrupt.h> +#include <linux/kernel_stat.h> +#include <linux/percpu.h> +#include <linux/profile.h> +#include <linux/sched.h> +#include <linux/tick.h> + +#include "tick-internal.h" + +/* + * Per cpu nohz control structure + */ +static DEFINE_PER_CPU(struct tick_sched, tick_cpu_sched); + +/* + * The time, when the last jiffy update happened. Protected by xtime_lock. + */ +static ktime_t last_jiffies_update; + +struct tick_sched *tick_get_tick_sched(int cpu) +{ + return &per_cpu(tick_cpu_sched, cpu); +} + +/* + * Must be called with interrupts disabled ! + */ +static void tick_do_update_jiffies64(ktime_t now) +{ + unsigned long ticks = 0; + ktime_t delta; + + /* Reevalute with xtime_lock held */ + write_seqlock(&xtime_lock); + + delta = ktime_sub(now, last_jiffies_update); + if (delta.tv64 >= tick_period.tv64) { + + delta = ktime_sub(delta, tick_period); + last_jiffies_update = ktime_add(last_jiffies_update, + tick_period); + + /* Slow path for long timeouts */ + if (unlikely(delta.tv64 >= tick_period.tv64)) { + s64 incr = ktime_to_ns(tick_period); + + ticks = ktime_divns(delta, incr); + + last_jiffies_update = ktime_add_ns(last_jiffies_update, + incr * ticks); + } + do_timer(++ticks); + } + write_sequnlock(&xtime_lock); +} + +/* + * Initialize and return retrieve the jiffies update. + */ +static ktime_t tick_init_jiffy_update(void) +{ + ktime_t period; + + write_seqlock(&xtime_lock); + /* Did we start the jiffies update yet ? */ + if (last_jiffies_update.tv64 == 0) + last_jiffies_update = tick_next_period; + period = last_jiffies_update; + write_sequnlock(&xtime_lock); + return period; +} + +/* + * NOHZ - aka dynamic tick functionality + */ +#ifdef CONFIG_NO_HZ +/* + * NO HZ enabled ? + */ +static int tick_nohz_enabled __read_mostly = 1; + +/* + * Enable / Disable tickless mode + */ +static int __init setup_tick_nohz(char *str) +{ + if (!strcmp(str, "off")) + tick_nohz_enabled = 0; + else if (!strcmp(str, "on")) + tick_nohz_enabled = 1; + else + return 0; + return 1; +} + +__setup("nohz=", setup_tick_nohz); + +/** + * tick_nohz_update_jiffies - update jiffies when idle was interrupted + * + * Called from interrupt entry when the CPU was idle + * + * In case the sched_tick was stopped on this CPU, we have to check if jiffies + * must be updated. Otherwise an interrupt handler could use a stale jiffy + * value. We do this unconditionally on any cpu, as we don't know whether the + * cpu, which has the update task assigned is in a long sleep. + */ +void tick_nohz_update_jiffies(void) +{ + int cpu = smp_processor_id(); + struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); + unsigned long flags; + ktime_t now; + + if (!ts->tick_stopped) + return; + + cpu_clear(cpu, nohz_cpu_mask); + now = ktime_get(); + + local_irq_save(flags); + tick_do_update_jiffies64(now); + local_irq_restore(flags); +} + +/** + * tick_nohz_stop_sched_tick - stop the idle tick from the idle task + * + * When the next event is more than a tick into the future, stop the idle tick + * Called either from the idle loop or from irq_exit() when an idle period was + * just interrupted by an interrupt which did not cause a reschedule. + */ +void tick_nohz_stop_sched_tick(void) +{ + unsigned long seq, last_jiffies, next_jiffies, delta_jiffies, flags; + struct tick_sched *ts; + ktime_t last_update, expires, now, delta; + int cpu; + + local_irq_save(flags); + + cpu = smp_processor_id(); + ts = &per_cpu(tick_cpu_sched, cpu); + + if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) + goto end; + + if (need_resched()) + goto end; + + cpu = smp_processor_id(); + BUG_ON(local_softirq_pending()); + + now = ktime_get(); + /* + * When called from irq_exit we need to account the idle sleep time + * correctly. + */ + if (ts->tick_stopped) { + delta = ktime_sub(now, ts->idle_entrytime); + ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta); + } + + ts->idle_entrytime = now; + ts->idle_calls++; + + /* Read jiffies and the time when jiffies were updated last */ + do { + seq = read_seqbegin(&xtime_lock); + last_update = last_jiffies_update; + last_jiffies = jiffies; + } while (read_seqretry(&xtime_lock, seq)); + + /* Get the next timer wheel timer */ + next_jiffies = get_next_timer_interrupt(last_jiffies); + delta_jiffies = next_jiffies - last_jiffies; + + /* + * Do not stop the tick, if we are only one off + * or if the cpu is required for rcu + */ + if (!ts->tick_stopped && (delta_jiffies == 1 || rcu_needs_cpu(cpu))) + goto out; + + /* Schedule the tick, if we are at least one jiffie off */ + if ((long)delta_jiffies >= 1) { + + if (rcu_needs_cpu(cpu)) + delta_jiffies = 1; + else + cpu_set(cpu, nohz_cpu_mask); + /* + * nohz_stop_sched_tick can be called several times before + * the nohz_restart_sched_tick is called. This happens when + * interrupts arrive which do not cause a reschedule. In the + * first call we save the current tick time, so we can restart + * the scheduler tick in nohz_restart_sched_tick. + */ + if (!ts->tick_stopped) { + ts->idle_tick = ts->sched_timer.expires; + ts->tick_stopped = 1; + ts->idle_jiffies = last_jiffies; + } + /* + * calculate the expiry time for the next timer wheel + * timer + */ + expires = ktime_add_ns(last_update, tick_period.tv64 * + delta_jiffies); + ts->idle_expires = expires; + ts->idle_sleeps++; + + if (ts->nohz_mode == NOHZ_MODE_HIGHRES) { + hrtimer_start(&ts->sched_timer, expires, + HRTIMER_MODE_ABS); + /* Check, if the timer was already in the past */ + if (hrtimer_active(&ts->sched_timer)) + goto out; + } else if(!tick_program_event(expires, 0)) + goto out; + /* + * We are past the event already. So we crossed a + * jiffie boundary. Update jiffies and raise the + * softirq. + */ + tick_do_update_jiffies64(ktime_get()); + cpu_clear(cpu, nohz_cpu_mask); + } + raise_softirq_irqoff(TIMER_SOFTIRQ); +out: + ts->next_jiffies = next_jiffies; + ts->last_jiffies = last_jiffies; +end: + local_irq_restore(flags); +} + +/** + * nohz_restart_sched_tick - restart the idle tick from the idle task + * + * Restart the idle tick when the CPU is woken up from idle + */ +void tick_nohz_restart_sched_tick(void) +{ + int cpu = smp_processor_id(); + struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); + unsigned long ticks; + ktime_t now, delta; + + if (!ts->tick_stopped) + return; + + /* Update jiffies first */ + now = ktime_get(); + + local_irq_disable(); + tick_do_update_jiffies64(now); + cpu_clear(cpu, nohz_cpu_mask); + + /* Account the idle time */ + delta = ktime_sub(now, ts->idle_entrytime); + ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta); + + /* + * We stopped the tick in idle. Update process times would miss the + * time we slept as update_process_times does only a 1 tick + * accounting. Enforce that this is accounted to idle ! + */ + ticks = jiffies - ts->idle_jiffies; + /* + * We might be one off. Do not randomly account a huge number of ticks! + */ + if (ticks && ticks < LONG_MAX) { + add_preempt_count(HARDIRQ_OFFSET); + account_system_time(current, HARDIRQ_OFFSET, + jiffies_to_cputime(ticks)); + sub_preempt_count(HARDIRQ_OFFSET); + } + + /* + * Cancel the scheduled timer and restore the tick + */ + ts->tick_stopped = 0; + hrtimer_cancel(&ts->sched_timer); + ts->sched_timer.expires = ts->idle_tick; + + while (1) { + /* Forward the time to expire in the future */ + hrtimer_forward(&ts->sched_timer, now, tick_period); + + if (ts->nohz_mode == NOHZ_MODE_HIGHRES) { + hrtimer_start(&ts->sched_timer, + ts->sched_timer.expires, + HRTIMER_MODE_ABS); + /* Check, if the timer was already in the past */ + if (hrtimer_active(&ts->sched_timer)) + break; + } else { + if (!tick_program_event(ts->sched_timer.expires, 0)) + break; + } + /* Update jiffies and reread time */ + tick_do_update_jiffies64(now); + now = ktime_get(); + } + local_irq_enable(); +} + +static int tick_nohz_reprogram(struct tick_sched *ts, ktime_t now) +{ + hrtimer_forward(&ts->sched_timer, now, tick_period); + return tick_program_event(ts->sched_timer.expires, 0); +} + +/* + * The nohz low res interrupt handler + */ +static void tick_nohz_handler(struct clock_event_device *dev) +{ + struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); + struct pt_regs *regs = get_irq_regs(); + ktime_t now = ktime_get(); + + dev->next_event.tv64 = KTIME_MAX; + + /* Check, if the jiffies need an update */ + tick_do_update_jiffies64(now); + + /* + * When we are idle and the tick is stopped, we have to touch + * the watchdog as we might not schedule for a really long + * time. This happens on complete idle SMP systems while + * waiting on the login prompt. We also increment the "start + * of idle" jiffy stamp so the idle accounting adjustment we + * do when we go busy again does not account too much ticks. + */ + if (ts->tick_stopped) { + touch_softlockup_watchdog(); + ts->idle_jiffies++; + } + + update_process_times(user_mode(regs)); + profile_tick(CPU_PROFILING); + + /* Do not restart, when we are in the idle loop */ + if (ts->tick_stopped) + return; + + while (tick_nohz_reprogram(ts, now)) { + now = ktime_get(); + tick_do_update_jiffies64(now); + } +} + +/** + * tick_nohz_switch_to_nohz - switch to nohz mode + */ +static void tick_nohz_switch_to_nohz(void) +{ + struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); + ktime_t next; + + if (!tick_nohz_enabled) + return; + + local_irq_disable(); + if (tick_switch_to_oneshot(tick_nohz_handler)) { + local_irq_enable(); + return; + } + + ts->nohz_mode = NOHZ_MODE_LOWRES; + + /* + * Recycle the hrtimer in ts, so we can share the + * hrtimer_forward with the highres code. + */ + hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); + /* Get the next period */ + next = tick_init_jiffy_update(); + + for (;;) { + ts->sched_timer.expires = next; + if (!tick_program_event(next, 0)) + break; + next = ktime_add(next, tick_period); + } + local_irq_enable(); + + printk(KERN_INFO "Switched to NOHz mode on CPU #%d\n", + smp_processor_id()); +} + +#else + +static inline void tick_nohz_switch_to_nohz(void) { } + +#endif /* NO_HZ */ + +/* + * High resolution timer specific code + */ +#ifdef CONFIG_HIGH_RES_TIMERS +/* + * We rearm the timer until we get disabled by the idle code + * Called with interrupts disabled and timer->base->cpu_base->lock held. + */ +static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer) +{ + struct tick_sched *ts = + container_of(timer, struct tick_sched, sched_timer); + struct hrtimer_cpu_base *base = timer->base->cpu_base; + struct pt_regs *regs = get_irq_regs(); + ktime_t now = ktime_get(); + + /* Check, if the jiffies need an update */ + tick_do_update_jiffies64(now); + + /* + * Do not call, when we are not in irq context and have + * no valid regs pointer + */ + if (regs) { + /* + * When we are idle and the tick is stopped, we have to touch + * the watchdog as we might not schedule for a really long + * time. This happens on complete idle SMP systems while + * waiting on the login prompt. We also increment the "start of + * idle" jiffy stamp so the idle accounting adjustment we do + * when we go busy again does not account too much ticks. + */ + if (ts->tick_stopped) { + touch_softlockup_watchdog(); + ts->idle_jiffies++; + } + /* + * update_process_times() might take tasklist_lock, hence + * drop the base lock. sched-tick hrtimers are per-CPU and + * never accessible by userspace APIs, so this is safe to do. + */ + spin_unlock(&base->lock); + update_process_times(user_mode(regs)); + profile_tick(CPU_PROFILING); + spin_lock(&base->lock); + } + + /* Do not restart, when we are in the idle loop */ + if (ts->tick_stopped) + return HRTIMER_NORESTART; + + hrtimer_forward(timer, now, tick_period); + + return HRTIMER_RESTART; +} + +/** + * tick_setup_sched_timer - setup the tick emulation timer + */ +void tick_setup_sched_timer(void) +{ + struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); + ktime_t now = ktime_get(); + + /* + * Emulate tick processing via per-CPU hrtimers: + */ + hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); + ts->sched_timer.function = tick_sched_timer; + ts->sched_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ; + + /* Get the next period */ + ts->sched_timer.expires = tick_init_jiffy_update(); + + for (;;) { + hrtimer_forward(&ts->sched_timer, now, tick_period); + hrtimer_start(&ts->sched_timer, ts->sched_timer.expires, + HRTIMER_MODE_ABS); + /* Check, if the timer was already in the past */ + if (hrtimer_active(&ts->sched_timer)) + break; + now = ktime_get(); + } + +#ifdef CONFIG_NO_HZ + if (tick_nohz_enabled) + ts->nohz_mode = NOHZ_MODE_HIGHRES; +#endif +} + +void tick_cancel_sched_timer(int cpu) +{ + struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); + + if (ts->sched_timer.base) + hrtimer_cancel(&ts->sched_timer); + ts->tick_stopped = 0; + ts->nohz_mode = NOHZ_MODE_INACTIVE; +} +#endif /* HIGH_RES_TIMERS */ + +/** + * Async notification about clocksource changes + */ +void tick_clock_notify(void) +{ + int cpu; + + for_each_possible_cpu(cpu) + set_bit(0, &per_cpu(tick_cpu_sched, cpu).check_clocks); +} + +/* + * Async notification about clock event changes + */ +void tick_oneshot_notify(void) +{ + struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); + + set_bit(0, &ts->check_clocks); +} + +/** + * Check, if a change happened, which makes oneshot possible. + * + * Called cyclic from the hrtimer softirq (driven by the timer + * softirq) allow_nohz signals, that we can switch into low-res nohz + * mode, because high resolution timers are disabled (either compile + * or runtime). + */ +int tick_check_oneshot_change(int allow_nohz) +{ + struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); + + if (!test_and_clear_bit(0, &ts->check_clocks)) + return 0; + + if (ts->nohz_mode != NOHZ_MODE_INACTIVE) + return 0; + + if (!timekeeping_is_continuous() || !tick_is_oneshot_available()) + return 0; + + if (!allow_nohz) + return 1; + + tick_nohz_switch_to_nohz(); + return 0; +} diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c new file mode 100644 index 000000000000..f82c635c3d5c --- /dev/null +++ b/kernel/time/timer_list.c @@ -0,0 +1,287 @@ +/* + * kernel/time/timer_list.c + * + * List pending timers + * + * Copyright(C) 2006, Red Hat, Inc., Ingo Molnar + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/proc_fs.h> +#include <linux/module.h> +#include <linux/spinlock.h> +#include <linux/sched.h> +#include <linux/seq_file.h> +#include <linux/kallsyms.h> +#include <linux/tick.h> + +#include <asm/uaccess.h> + +typedef void (*print_fn_t)(struct seq_file *m, unsigned int *classes); + +DECLARE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases); + +/* + * This allows printing both to /proc/timer_list and + * to the console (on SysRq-Q): + */ +#define SEQ_printf(m, x...) \ + do { \ + if (m) \ + seq_printf(m, x); \ + else \ + printk(x); \ + } while (0) + +static void print_name_offset(struct seq_file *m, void *sym) +{ + unsigned long addr = (unsigned long)sym; + char namebuf[KSYM_NAME_LEN+1]; + unsigned long size, offset; + const char *sym_name; + char *modname; + + sym_name = kallsyms_lookup(addr, &size, &offset, &modname, namebuf); + if (sym_name) + SEQ_printf(m, "%s", sym_name); + else + SEQ_printf(m, "<%p>", sym); +} + +static void +print_timer(struct seq_file *m, struct hrtimer *timer, int idx, u64 now) +{ +#ifdef CONFIG_TIMER_STATS + char tmp[TASK_COMM_LEN + 1]; +#endif + SEQ_printf(m, " #%d: ", idx); + print_name_offset(m, timer); + SEQ_printf(m, ", "); + print_name_offset(m, timer->function); + SEQ_printf(m, ", S:%02lx", timer->state); +#ifdef CONFIG_TIMER_STATS + SEQ_printf(m, ", "); + print_name_offset(m, timer->start_site); + memcpy(tmp, timer->start_comm, TASK_COMM_LEN); + tmp[TASK_COMM_LEN] = 0; + SEQ_printf(m, ", %s/%d", tmp, timer->start_pid); +#endif + SEQ_printf(m, "\n"); + SEQ_printf(m, " # expires at %Ld nsecs [in %Ld nsecs]\n", + (unsigned long long)ktime_to_ns(timer->expires), + (unsigned long long)(ktime_to_ns(timer->expires) - now)); +} + +static void +print_active_timers(struct seq_file *m, struct hrtimer_clock_base *base, + u64 now) +{ + struct hrtimer *timer, tmp; + unsigned long next = 0, i; + struct rb_node *curr; + unsigned long flags; + +next_one: + i = 0; + spin_lock_irqsave(&base->cpu_base->lock, flags); + + curr = base->first; + /* + * Crude but we have to do this O(N*N) thing, because + * we have to unlock the base when printing: + */ + while (curr && i < next) { + curr = rb_next(curr); + i++; + } + + if (curr) { + + timer = rb_entry(curr, struct hrtimer, node); + tmp = *timer; + spin_unlock_irqrestore(&base->cpu_base->lock, flags); + + print_timer(m, &tmp, i, now); + next++; + goto next_one; + } + spin_unlock_irqrestore(&base->cpu_base->lock, flags); +} + +static void +print_base(struct seq_file *m, struct hrtimer_clock_base *base, u64 now) +{ + SEQ_printf(m, " .index: %d\n", + base->index); + SEQ_printf(m, " .resolution: %Ld nsecs\n", + (unsigned long long)ktime_to_ns(base->resolution)); + SEQ_printf(m, " .get_time: "); + print_name_offset(m, base->get_time); + SEQ_printf(m, "\n"); +#ifdef CONFIG_HIGH_RES_TIMERS + SEQ_printf(m, " .offset: %Ld nsecs\n", + ktime_to_ns(base->offset)); +#endif + SEQ_printf(m, "active timers:\n"); + print_active_timers(m, base, now); +} + +static void print_cpu(struct seq_file *m, int cpu, u64 now) +{ + struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu); + int i; + + SEQ_printf(m, "\ncpu: %d\n", cpu); + for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { + SEQ_printf(m, " clock %d:\n", i); + print_base(m, cpu_base->clock_base + i, now); + } +#define P(x) \ + SEQ_printf(m, " .%-15s: %Ld\n", #x, (u64)(cpu_base->x)) +#define P_ns(x) \ + SEQ_printf(m, " .%-15s: %Ld nsecs\n", #x, \ + (u64)(ktime_to_ns(cpu_base->x))) + +#ifdef CONFIG_HIGH_RES_TIMERS + P_ns(expires_next); + P(hres_active); + P(nr_events); +#endif +#undef P +#undef P_ns + +#ifdef CONFIG_TICK_ONESHOT +# define P(x) \ + SEQ_printf(m, " .%-15s: %Ld\n", #x, (u64)(ts->x)) +# define P_ns(x) \ + SEQ_printf(m, " .%-15s: %Ld nsecs\n", #x, \ + (u64)(ktime_to_ns(ts->x))) + { + struct tick_sched *ts = tick_get_tick_sched(cpu); + P(nohz_mode); + P_ns(idle_tick); + P(tick_stopped); + P(idle_jiffies); + P(idle_calls); + P(idle_sleeps); + P_ns(idle_entrytime); + P_ns(idle_sleeptime); + P(last_jiffies); + P(next_jiffies); + P_ns(idle_expires); + SEQ_printf(m, "jiffies: %Ld\n", (u64)jiffies); + } +#endif + +#undef P +#undef P_ns +} + +#ifdef CONFIG_GENERIC_CLOCKEVENTS +static void +print_tickdevice(struct seq_file *m, struct tick_device *td) +{ + struct clock_event_device *dev = td->evtdev; + + SEQ_printf(m, "\nTick Device: mode: %d\n", td->mode); + + SEQ_printf(m, "Clock Event Device: "); + if (!dev) { + SEQ_printf(m, "<NULL>\n"); + return; + } + SEQ_printf(m, "%s\n", dev->name); + SEQ_printf(m, " max_delta_ns: %ld\n", dev->max_delta_ns); + SEQ_printf(m, " min_delta_ns: %ld\n", dev->min_delta_ns); + SEQ_printf(m, " mult: %ld\n", dev->mult); + SEQ_printf(m, " shift: %d\n", dev->shift); + SEQ_printf(m, " mode: %d\n", dev->mode); + SEQ_printf(m, " next_event: %Ld nsecs\n", + (unsigned long long) ktime_to_ns(dev->next_event)); + + SEQ_printf(m, " set_next_event: "); + print_name_offset(m, dev->set_next_event); + SEQ_printf(m, "\n"); + + SEQ_printf(m, " set_mode: "); + print_name_offset(m, dev->set_mode); + SEQ_printf(m, "\n"); + + SEQ_printf(m, " event_handler: "); + print_name_offset(m, dev->event_handler); + SEQ_printf(m, "\n"); +} + +static void timer_list_show_tickdevices(struct seq_file *m) +{ + int cpu; + +#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST + print_tickdevice(m, tick_get_broadcast_device()); + SEQ_printf(m, "tick_broadcast_mask: %08lx\n", + tick_get_broadcast_mask()->bits[0]); +#ifdef CONFIG_TICK_ONESHOT + SEQ_printf(m, "tick_broadcast_oneshot_mask: %08lx\n", + tick_get_broadcast_oneshot_mask()->bits[0]); +#endif + SEQ_printf(m, "\n"); +#endif + for_each_online_cpu(cpu) + print_tickdevice(m, tick_get_device(cpu)); + SEQ_printf(m, "\n"); +} +#else +static void timer_list_show_tickdevices(struct seq_file *m) { } +#endif + +static int timer_list_show(struct seq_file *m, void *v) +{ + u64 now = ktime_to_ns(ktime_get()); + int cpu; + + SEQ_printf(m, "Timer List Version: v0.3\n"); + SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES); + SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now); + + for_each_online_cpu(cpu) + print_cpu(m, cpu, now); + + SEQ_printf(m, "\n"); + timer_list_show_tickdevices(m); + + return 0; +} + +void sysrq_timer_list_show(void) +{ + timer_list_show(NULL, NULL); +} + +static int timer_list_open(struct inode *inode, struct file *filp) +{ + return single_open(filp, timer_list_show, NULL); +} + +static struct file_operations timer_list_fops = { + .open = timer_list_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static int __init init_timer_list_procfs(void) +{ + struct proc_dir_entry *pe; + + pe = create_proc_entry("timer_list", 0644, NULL); + if (!pe) + return -ENOMEM; + + pe->proc_fops = &timer_list_fops; + + return 0; +} +__initcall(init_timer_list_procfs); diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c new file mode 100644 index 000000000000..1bc4882e28e0 --- /dev/null +++ b/kernel/time/timer_stats.c @@ -0,0 +1,411 @@ +/* + * kernel/time/timer_stats.c + * + * Collect timer usage statistics. + * + * Copyright(C) 2006, Red Hat, Inc., Ingo Molnar + * Copyright(C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com> + * + * timer_stats is based on timer_top, a similar functionality which was part of + * Con Kolivas dyntick patch set. It was developed by Daniel Petrini at the + * Instituto Nokia de Tecnologia - INdT - Manaus. timer_top's design was based + * on dynamic allocation of the statistics entries and linear search based + * lookup combined with a global lock, rather than the static array, hash + * and per-CPU locking which is used by timer_stats. It was written for the + * pre hrtimer kernel code and therefore did not take hrtimers into account. + * Nevertheless it provided the base for the timer_stats implementation and + * was a helpful source of inspiration. Kudos to Daniel and the Nokia folks + * for this effort. + * + * timer_top.c is + * Copyright (C) 2005 Instituto Nokia de Tecnologia - INdT - Manaus + * Written by Daniel Petrini <d.pensator@gmail.com> + * timer_top.c was released under the GNU General Public License version 2 + * + * We export the addresses and counting of timer functions being called, + * the pid and cmdline from the owner process if applicable. + * + * Start/stop data collection: + * # echo 1[0] >/proc/timer_stats + * + * Display the information collected so far: + * # cat /proc/timer_stats + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/proc_fs.h> +#include <linux/module.h> +#include <linux/spinlock.h> +#include <linux/sched.h> +#include <linux/seq_file.h> +#include <linux/kallsyms.h> + +#include <asm/uaccess.h> + +/* + * This is our basic unit of interest: a timer expiry event identified + * by the timer, its start/expire functions and the PID of the task that + * started the timer. We count the number of times an event happens: + */ +struct entry { + /* + * Hash list: + */ + struct entry *next; + + /* + * Hash keys: + */ + void *timer; + void *start_func; + void *expire_func; + pid_t pid; + + /* + * Number of timeout events: + */ + unsigned long count; + + /* + * We save the command-line string to preserve + * this information past task exit: + */ + char comm[TASK_COMM_LEN + 1]; + +} ____cacheline_aligned_in_smp; + +/* + * Spinlock protecting the tables - not taken during lookup: + */ +static DEFINE_SPINLOCK(table_lock); + +/* + * Per-CPU lookup locks for fast hash lookup: + */ +static DEFINE_PER_CPU(spinlock_t, lookup_lock); + +/* + * Mutex to serialize state changes with show-stats activities: + */ +static DEFINE_MUTEX(show_mutex); + +/* + * Collection status, active/inactive: + */ +static int __read_mostly active; + +/* + * Beginning/end timestamps of measurement: + */ +static ktime_t time_start, time_stop; + +/* + * tstat entry structs only get allocated while collection is + * active and never freed during that time - this simplifies + * things quite a bit. + * + * They get freed when a new collection period is started. + */ +#define MAX_ENTRIES_BITS 10 +#define MAX_ENTRIES (1UL << MAX_ENTRIES_BITS) + +static unsigned long nr_entries; +static struct entry entries[MAX_ENTRIES]; + +static atomic_t overflow_count; + +static void reset_entries(void) +{ + nr_entries = 0; + memset(entries, 0, sizeof(entries)); + atomic_set(&overflow_count, 0); +} + +static struct entry *alloc_entry(void) +{ + if (nr_entries >= MAX_ENTRIES) + return NULL; + + return entries + nr_entries++; +} + +/* + * The entries are in a hash-table, for fast lookup: + */ +#define TSTAT_HASH_BITS (MAX_ENTRIES_BITS - 1) +#define TSTAT_HASH_SIZE (1UL << TSTAT_HASH_BITS) +#define TSTAT_HASH_MASK (TSTAT_HASH_SIZE - 1) + +#define __tstat_hashfn(entry) \ + (((unsigned long)(entry)->timer ^ \ + (unsigned long)(entry)->start_func ^ \ + (unsigned long)(entry)->expire_func ^ \ + (unsigned long)(entry)->pid ) & TSTAT_HASH_MASK) + +#define tstat_hashentry(entry) (tstat_hash_table + __tstat_hashfn(entry)) + +static struct entry *tstat_hash_table[TSTAT_HASH_SIZE] __read_mostly; + +static int match_entries(struct entry *entry1, struct entry *entry2) +{ + return entry1->timer == entry2->timer && + entry1->start_func == entry2->start_func && + entry1->expire_func == entry2->expire_func && + entry1->pid == entry2->pid; +} + +/* + * Look up whether an entry matching this item is present + * in the hash already. Must be called with irqs off and the + * lookup lock held: + */ +static struct entry *tstat_lookup(struct entry *entry, char *comm) +{ + struct entry **head, *curr, *prev; + + head = tstat_hashentry(entry); + curr = *head; + + /* + * The fastpath is when the entry is already hashed, + * we do this with the lookup lock held, but with the + * table lock not held: + */ + while (curr) { + if (match_entries(curr, entry)) + return curr; + + curr = curr->next; + } + /* + * Slowpath: allocate, set up and link a new hash entry: + */ + prev = NULL; + curr = *head; + + spin_lock(&table_lock); + /* + * Make sure we have not raced with another CPU: + */ + while (curr) { + if (match_entries(curr, entry)) + goto out_unlock; + + prev = curr; + curr = curr->next; + } + + curr = alloc_entry(); + if (curr) { + *curr = *entry; + curr->count = 0; + memcpy(curr->comm, comm, TASK_COMM_LEN); + if (prev) + prev->next = curr; + else + *head = curr; + curr->next = NULL; + } + out_unlock: + spin_unlock(&table_lock); + + return curr; +} + +/** + * timer_stats_update_stats - Update the statistics for a timer. + * @timer: pointer to either a timer_list or a hrtimer + * @pid: the pid of the task which set up the timer + * @startf: pointer to the function which did the timer setup + * @timerf: pointer to the timer callback function of the timer + * @comm: name of the process which set up the timer + * + * When the timer is already registered, then the event counter is + * incremented. Otherwise the timer is registered in a free slot. + */ +void timer_stats_update_stats(void *timer, pid_t pid, void *startf, + void *timerf, char * comm) +{ + /* + * It doesnt matter which lock we take: + */ + spinlock_t *lock = &per_cpu(lookup_lock, raw_smp_processor_id()); + struct entry *entry, input; + unsigned long flags; + + input.timer = timer; + input.start_func = startf; + input.expire_func = timerf; + input.pid = pid; + + spin_lock_irqsave(lock, flags); + if (!active) + goto out_unlock; + + entry = tstat_lookup(&input, comm); + if (likely(entry)) + entry->count++; + else + atomic_inc(&overflow_count); + + out_unlock: + spin_unlock_irqrestore(lock, flags); +} + +static void print_name_offset(struct seq_file *m, unsigned long addr) +{ + char namebuf[KSYM_NAME_LEN+1]; + unsigned long size, offset; + const char *sym_name; + char *modname; + + sym_name = kallsyms_lookup(addr, &size, &offset, &modname, namebuf); + if (sym_name) + seq_printf(m, "%s", sym_name); + else + seq_printf(m, "<%p>", (void *)addr); +} + +static int tstats_show(struct seq_file *m, void *v) +{ + struct timespec period; + struct entry *entry; + unsigned long ms; + long events = 0; + ktime_t time; + int i; + + mutex_lock(&show_mutex); + /* + * If still active then calculate up to now: + */ + if (active) + time_stop = ktime_get(); + + time = ktime_sub(time_stop, time_start); + + period = ktime_to_timespec(time); + ms = period.tv_nsec / 1000000; + + seq_puts(m, "Timer Stats Version: v0.1\n"); + seq_printf(m, "Sample period: %ld.%03ld s\n", period.tv_sec, ms); + if (atomic_read(&overflow_count)) + seq_printf(m, "Overflow: %d entries\n", + atomic_read(&overflow_count)); + + for (i = 0; i < nr_entries; i++) { + entry = entries + i; + seq_printf(m, "%4lu, %5d %-16s ", + entry->count, entry->pid, entry->comm); + + print_name_offset(m, (unsigned long)entry->start_func); + seq_puts(m, " ("); + print_name_offset(m, (unsigned long)entry->expire_func); + seq_puts(m, ")\n"); + + events += entry->count; + } + + ms += period.tv_sec * 1000; + if (!ms) + ms = 1; + + if (events && period.tv_sec) + seq_printf(m, "%ld total events, %ld.%ld events/sec\n", events, + events / period.tv_sec, events * 1000 / ms); + else + seq_printf(m, "%ld total events\n", events); + + mutex_unlock(&show_mutex); + + return 0; +} + +/* + * After a state change, make sure all concurrent lookup/update + * activities have stopped: + */ +static void sync_access(void) +{ + unsigned long flags; + int cpu; + + for_each_online_cpu(cpu) { + spin_lock_irqsave(&per_cpu(lookup_lock, cpu), flags); + /* nothing */ + spin_unlock_irqrestore(&per_cpu(lookup_lock, cpu), flags); + } +} + +static ssize_t tstats_write(struct file *file, const char __user *buf, + size_t count, loff_t *offs) +{ + char ctl[2]; + + if (count != 2 || *offs) + return -EINVAL; + + if (copy_from_user(ctl, buf, count)) + return -EFAULT; + + mutex_lock(&show_mutex); + switch (ctl[0]) { + case '0': + if (active) { + active = 0; + time_stop = ktime_get(); + sync_access(); + } + break; + case '1': + if (!active) { + reset_entries(); + time_start = ktime_get(); + active = 1; + } + break; + default: + count = -EINVAL; + } + mutex_unlock(&show_mutex); + + return count; +} + +static int tstats_open(struct inode *inode, struct file *filp) +{ + return single_open(filp, tstats_show, NULL); +} + +static struct file_operations tstats_fops = { + .open = tstats_open, + .read = seq_read, + .write = tstats_write, + .llseek = seq_lseek, + .release = seq_release, +}; + +void __init init_timer_stats(void) +{ + int cpu; + + for_each_possible_cpu(cpu) + spin_lock_init(&per_cpu(lookup_lock, cpu)); +} + +static int __init init_tstats_procfs(void) +{ + struct proc_dir_entry *pe; + + pe = create_proc_entry("timer_stats", 0644, NULL); + if (!pe) + return -ENOMEM; + + pe->proc_fops = &tstats_fops; + + return 0; +} +__initcall(init_tstats_procfs); diff --git a/kernel/timer.c b/kernel/timer.c index 8533c3796082..cb1b86a9c52f 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -34,6 +34,8 @@ #include <linux/cpu.h> #include <linux/syscalls.h> #include <linux/delay.h> +#include <linux/tick.h> +#include <linux/kallsyms.h> #include <asm/uaccess.h> #include <asm/unistd.h> @@ -262,6 +264,18 @@ static void internal_add_timer(tvec_base_t *base, struct timer_list *timer) list_add_tail(&timer->entry, vec); } +#ifdef CONFIG_TIMER_STATS +void __timer_stats_timer_set_start_info(struct timer_list *timer, void *addr) +{ + if (timer->start_site) + return; + + timer->start_site = addr; + memcpy(timer->start_comm, current->comm, TASK_COMM_LEN); + timer->start_pid = current->pid; +} +#endif + /** * init_timer - initialize a timer. * @timer: the timer to be initialized @@ -273,11 +287,16 @@ void fastcall init_timer(struct timer_list *timer) { timer->entry.next = NULL; timer->base = __raw_get_cpu_var(tvec_bases); +#ifdef CONFIG_TIMER_STATS + timer->start_site = NULL; + timer->start_pid = -1; + memset(timer->start_comm, 0, TASK_COMM_LEN); +#endif } EXPORT_SYMBOL(init_timer); static inline void detach_timer(struct timer_list *timer, - int clear_pending) + int clear_pending) { struct list_head *entry = &timer->entry; @@ -324,6 +343,7 @@ int __mod_timer(struct timer_list *timer, unsigned long expires) unsigned long flags; int ret = 0; + timer_stats_timer_set_start_info(timer); BUG_ON(!timer->function); base = lock_timer_base(timer, &flags); @@ -374,6 +394,7 @@ void add_timer_on(struct timer_list *timer, int cpu) tvec_base_t *base = per_cpu(tvec_bases, cpu); unsigned long flags; + timer_stats_timer_set_start_info(timer); BUG_ON(timer_pending(timer) || !timer->function); spin_lock_irqsave(&base->lock, flags); timer->base = base; @@ -406,6 +427,7 @@ int mod_timer(struct timer_list *timer, unsigned long expires) { BUG_ON(!timer->function); + timer_stats_timer_set_start_info(timer); /* * This is a common optimization triggered by the * networking code - if the timer is re-modified @@ -436,6 +458,7 @@ int del_timer(struct timer_list *timer) unsigned long flags; int ret = 0; + timer_stats_timer_clear_start_info(timer); if (timer_pending(timer)) { base = lock_timer_base(timer, &flags); if (timer_pending(timer)) { @@ -569,6 +592,8 @@ static inline void __run_timers(tvec_base_t *base) fn = timer->function; data = timer->data; + timer_stats_account_timer(timer); + set_running_timer(base, timer); detach_timer(timer, 1); spin_unlock_irq(&base->lock); @@ -591,105 +616,124 @@ static inline void __run_timers(tvec_base_t *base) spin_unlock_irq(&base->lock); } -#ifdef CONFIG_NO_IDLE_HZ +#if defined(CONFIG_NO_IDLE_HZ) || defined(CONFIG_NO_HZ) /* * Find out when the next timer event is due to happen. This * is used on S/390 to stop all activity when a cpus is idle. * This functions needs to be called disabled. */ -unsigned long next_timer_interrupt(void) +static unsigned long __next_timer_interrupt(tvec_base_t *base) { - tvec_base_t *base; - struct list_head *list; + unsigned long timer_jiffies = base->timer_jiffies; + unsigned long expires = timer_jiffies + (LONG_MAX >> 1); + int index, slot, array, found = 0; struct timer_list *nte; - unsigned long expires; - unsigned long hr_expires = MAX_JIFFY_OFFSET; - ktime_t hr_delta; tvec_t *varray[4]; - int i, j; - - hr_delta = hrtimer_get_next_event(); - if (hr_delta.tv64 != KTIME_MAX) { - struct timespec tsdelta; - tsdelta = ktime_to_timespec(hr_delta); - hr_expires = timespec_to_jiffies(&tsdelta); - if (hr_expires < 3) - return hr_expires + jiffies; - } - hr_expires += jiffies; - - base = __get_cpu_var(tvec_bases); - spin_lock(&base->lock); - expires = base->timer_jiffies + (LONG_MAX >> 1); - list = NULL; /* Look for timer events in tv1. */ - j = base->timer_jiffies & TVR_MASK; + index = slot = timer_jiffies & TVR_MASK; do { - list_for_each_entry(nte, base->tv1.vec + j, entry) { + list_for_each_entry(nte, base->tv1.vec + slot, entry) { + found = 1; expires = nte->expires; - if (j < (base->timer_jiffies & TVR_MASK)) - list = base->tv2.vec + (INDEX(0)); - goto found; + /* Look at the cascade bucket(s)? */ + if (!index || slot < index) + goto cascade; + return expires; } - j = (j + 1) & TVR_MASK; - } while (j != (base->timer_jiffies & TVR_MASK)); + slot = (slot + 1) & TVR_MASK; + } while (slot != index); + +cascade: + /* Calculate the next cascade event */ + if (index) + timer_jiffies += TVR_SIZE - index; + timer_jiffies >>= TVR_BITS; /* Check tv2-tv5. */ varray[0] = &base->tv2; varray[1] = &base->tv3; varray[2] = &base->tv4; varray[3] = &base->tv5; - for (i = 0; i < 4; i++) { - j = INDEX(i); + + for (array = 0; array < 4; array++) { + tvec_t *varp = varray[array]; + + index = slot = timer_jiffies & TVN_MASK; do { - if (list_empty(varray[i]->vec + j)) { - j = (j + 1) & TVN_MASK; - continue; - } - list_for_each_entry(nte, varray[i]->vec + j, entry) + list_for_each_entry(nte, varp->vec + slot, entry) { + found = 1; if (time_before(nte->expires, expires)) expires = nte->expires; - if (j < (INDEX(i)) && i < 3) - list = varray[i + 1]->vec + (INDEX(i + 1)); - goto found; - } while (j != (INDEX(i))); - } -found: - if (list) { - /* - * The search wrapped. We need to look at the next list - * from next tv element that would cascade into tv element - * where we found the timer element. - */ - list_for_each_entry(nte, list, entry) { - if (time_before(nte->expires, expires)) - expires = nte->expires; - } + } + /* + * Do we still search for the first timer or are + * we looking up the cascade buckets ? + */ + if (found) { + /* Look at the cascade bucket(s)? */ + if (!index || slot < index) + break; + return expires; + } + slot = (slot + 1) & TVN_MASK; + } while (slot != index); + + if (index) + timer_jiffies += TVN_SIZE - index; + timer_jiffies >>= TVN_BITS; } - spin_unlock(&base->lock); + return expires; +} - /* - * It can happen that other CPUs service timer IRQs and increment - * jiffies, but we have not yet got a local timer tick to process - * the timer wheels. In that case, the expiry time can be before - * jiffies, but since the high-resolution timer here is relative to - * jiffies, the default expression when high-resolution timers are - * not active, - * - * time_before(MAX_JIFFY_OFFSET + jiffies, expires) - * - * would falsely evaluate to true. If that is the case, just - * return jiffies so that we can immediately fire the local timer - */ - if (time_before(expires, jiffies)) - return jiffies; +/* + * Check, if the next hrtimer event is before the next timer wheel + * event: + */ +static unsigned long cmp_next_hrtimer_event(unsigned long now, + unsigned long expires) +{ + ktime_t hr_delta = hrtimer_get_next_event(); + struct timespec tsdelta; + + if (hr_delta.tv64 == KTIME_MAX) + return expires; - if (time_before(hr_expires, expires)) - return hr_expires; + if (hr_delta.tv64 <= TICK_NSEC) + return now; + tsdelta = ktime_to_timespec(hr_delta); + now += timespec_to_jiffies(&tsdelta); + if (time_before(now, expires)) + return now; return expires; } + +/** + * next_timer_interrupt - return the jiffy of the next pending timer + */ +unsigned long get_next_timer_interrupt(unsigned long now) +{ + tvec_base_t *base = __get_cpu_var(tvec_bases); + unsigned long expires; + + spin_lock(&base->lock); + expires = __next_timer_interrupt(base); + spin_unlock(&base->lock); + + if (time_before_eq(expires, now)) + return now; + + return cmp_next_hrtimer_event(now, expires); +} + +#ifdef CONFIG_NO_IDLE_HZ +unsigned long next_timer_interrupt(void) +{ + return get_next_timer_interrupt(jiffies); +} +#endif + #endif /******************************************************************/ @@ -832,32 +876,35 @@ EXPORT_SYMBOL(do_settimeofday); * * Accumulates current time interval and initializes new clocksource */ -static int change_clocksource(void) +static void change_clocksource(void) { struct clocksource *new; cycle_t now; u64 nsec; + new = clocksource_get_next(); - if (clock != new) { - now = clocksource_read(new); - nsec = __get_nsec_offset(); - timespec_add_ns(&xtime, nsec); - - clock = new; - clock->cycle_last = now; - printk(KERN_INFO "Time: %s clocksource has been installed.\n", - clock->name); - return 1; - } else if (clock->update_callback) { - return clock->update_callback(); - } - return 0; + + if (clock == new) + return; + + now = clocksource_read(new); + nsec = __get_nsec_offset(); + timespec_add_ns(&xtime, nsec); + + clock = new; + clock->cycle_last = now; + + clock->error = 0; + clock->xtime_nsec = 0; + clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH); + + tick_clock_notify(); + + printk(KERN_INFO "Time: %s clocksource has been installed.\n", + clock->name); } #else -static inline int change_clocksource(void) -{ - return 0; -} +static inline void change_clocksource(void) { } #endif /** @@ -871,33 +918,56 @@ int timekeeping_is_continuous(void) do { seq = read_seqbegin(&xtime_lock); - ret = clock->is_continuous; + ret = clock->flags & CLOCK_SOURCE_VALID_FOR_HRES; } while (read_seqretry(&xtime_lock, seq)); return ret; } +/** + * read_persistent_clock - Return time in seconds from the persistent clock. + * + * Weak dummy function for arches that do not yet support it. + * Returns seconds from epoch using the battery backed persistent clock. + * Returns zero if unsupported. + * + * XXX - Do be sure to remove it once all arches implement it. + */ +unsigned long __attribute__((weak)) read_persistent_clock(void) +{ + return 0; +} + /* * timekeeping_init - Initializes the clocksource and common timekeeping values */ void __init timekeeping_init(void) { unsigned long flags; + unsigned long sec = read_persistent_clock(); write_seqlock_irqsave(&xtime_lock, flags); ntp_clear(); clock = clocksource_get_next(); - clocksource_calculate_interval(clock, tick_nsec); + clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH); clock->cycle_last = clocksource_read(clock); + xtime.tv_sec = sec; + xtime.tv_nsec = 0; + set_normalized_timespec(&wall_to_monotonic, + -xtime.tv_sec, -xtime.tv_nsec); + write_sequnlock_irqrestore(&xtime_lock, flags); } - +/* flag for if timekeeping is suspended */ static int timekeeping_suspended; +/* time in seconds when suspend began */ +static unsigned long timekeeping_suspend_time; + /** * timekeeping_resume - Resumes the generic timekeeping subsystem. * @dev: unused @@ -909,13 +979,26 @@ static int timekeeping_suspended; static int timekeeping_resume(struct sys_device *dev) { unsigned long flags; + unsigned long now = read_persistent_clock(); write_seqlock_irqsave(&xtime_lock, flags); - /* restart the last cycle value */ + + if (now && (now > timekeeping_suspend_time)) { + unsigned long sleep_length = now - timekeeping_suspend_time; + + xtime.tv_sec += sleep_length; + wall_to_monotonic.tv_sec -= sleep_length; + } + /* re-base the last cycle value */ clock->cycle_last = clocksource_read(clock); clock->error = 0; timekeeping_suspended = 0; write_sequnlock_irqrestore(&xtime_lock, flags); + + touch_softlockup_watchdog(); + /* Resume hrtimers */ + clock_was_set(); + return 0; } @@ -925,6 +1008,7 @@ static int timekeeping_suspend(struct sys_device *dev, pm_message_t state) write_seqlock_irqsave(&xtime_lock, flags); timekeeping_suspended = 1; + timekeeping_suspend_time = read_persistent_clock(); write_sequnlock_irqrestore(&xtime_lock, flags); return 0; } @@ -1089,11 +1173,8 @@ static void update_wall_time(void) clock->xtime_nsec -= (s64)xtime.tv_nsec << clock->shift; /* check to see if there is a new clocksource to use */ - if (change_clocksource()) { - clock->error = 0; - clock->xtime_nsec = 0; - clocksource_calculate_interval(clock, tick_nsec); - } + change_clocksource(); + update_vsyscall(&xtime, clock); } /* @@ -1162,11 +1243,9 @@ static inline void calc_load(unsigned long ticks) * This read-write spinlock protects us from races in SMP while * playing with xtime and avenrun. */ -#ifndef ARCH_HAVE_XTIME_LOCK -__cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock); +__attribute__((weak)) __cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock); EXPORT_SYMBOL(xtime_lock); -#endif /* * This function runs timers and the timer-tq in bottom half context. @@ -1175,7 +1254,8 @@ static void run_timer_softirq(struct softirq_action *h) { tvec_base_t *base = __get_cpu_var(tvec_bases); - hrtimer_run_queues(); + hrtimer_run_queues(); + if (time_after_eq(jiffies, base->timer_jiffies)) __run_timers(base); } @@ -1621,6 +1701,8 @@ void __init init_timers(void) int err = timer_cpu_notify(&timers_nb, (unsigned long)CPU_UP_PREPARE, (void *)(long)smp_processor_id()); + init_timer_stats(); + BUG_ON(err == NOTIFY_BAD); register_cpu_notifier(&timers_nb); open_softirq(TIMER_SOFTIRQ, run_timer_softirq, NULL); diff --git a/kernel/tsacct.c b/kernel/tsacct.c index baacc3691415..658f638c402c 100644 --- a/kernel/tsacct.c +++ b/kernel/tsacct.c @@ -22,8 +22,6 @@ #include <linux/acct.h> #include <linux/jiffies.h> - -#define USEC_PER_TICK (USEC_PER_SEC/HZ) /* * fill in basic accounting fields */ diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c new file mode 100644 index 000000000000..f22b9dbd2a9c --- /dev/null +++ b/kernel/utsname_sysctl.c @@ -0,0 +1,146 @@ +/* + * Copyright (C) 2007 + * + * Author: Eric Biederman <ebiederm@xmision.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation, version 2 of the + * License. + */ + +#include <linux/module.h> +#include <linux/uts.h> +#include <linux/utsname.h> +#include <linux/version.h> +#include <linux/sysctl.h> + +static void *get_uts(ctl_table *table, int write) +{ + char *which = table->data; +#ifdef CONFIG_UTS_NS + struct uts_namespace *uts_ns = current->nsproxy->uts_ns; + which = (which - (char *)&init_uts_ns) + (char *)uts_ns; +#endif + if (!write) + down_read(&uts_sem); + else + down_write(&uts_sem); + return which; +} + +static void put_uts(ctl_table *table, int write, void *which) +{ + if (!write) + up_read(&uts_sem); + else + up_write(&uts_sem); +} + +#ifdef CONFIG_PROC_FS +/* + * Special case of dostring for the UTS structure. This has locks + * to observe. Should this be in kernel/sys.c ???? + */ +static int proc_do_uts_string(ctl_table *table, int write, struct file *filp, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + struct ctl_table uts_table; + int r; + memcpy(&uts_table, table, sizeof(uts_table)); + uts_table.data = get_uts(table, write); + r = proc_dostring(&uts_table,write,filp,buffer,lenp, ppos); + put_uts(table, write, uts_table.data); + return r; +} +#else +#define proc_do_uts_string NULL +#endif + + +#ifdef CONFIG_SYSCTL_SYSCALL +/* The generic string strategy routine: */ +static int sysctl_uts_string(ctl_table *table, int __user *name, int nlen, + void __user *oldval, size_t __user *oldlenp, + void __user *newval, size_t newlen) +{ + struct ctl_table uts_table; + int r, write; + write = newval && newlen; + memcpy(&uts_table, table, sizeof(uts_table)); + uts_table.data = get_uts(table, write); + r = sysctl_string(&uts_table, name, nlen, + oldval, oldlenp, newval, newlen); + put_uts(table, write, uts_table.data); + return r; +} +#else +#define sysctl_uts_string NULL +#endif + +static struct ctl_table uts_kern_table[] = { + { + .ctl_name = KERN_OSTYPE, + .procname = "ostype", + .data = init_uts_ns.name.sysname, + .maxlen = sizeof(init_uts_ns.name.sysname), + .mode = 0444, + .proc_handler = proc_do_uts_string, + .strategy = sysctl_uts_string, + }, + { + .ctl_name = KERN_OSRELEASE, + .procname = "osrelease", + .data = init_uts_ns.name.release, + .maxlen = sizeof(init_uts_ns.name.release), + .mode = 0444, + .proc_handler = proc_do_uts_string, + .strategy = sysctl_uts_string, + }, + { + .ctl_name = KERN_VERSION, + .procname = "version", + .data = init_uts_ns.name.version, + .maxlen = sizeof(init_uts_ns.name.version), + .mode = 0444, + .proc_handler = proc_do_uts_string, + .strategy = sysctl_uts_string, + }, + { + .ctl_name = KERN_NODENAME, + .procname = "hostname", + .data = init_uts_ns.name.nodename, + .maxlen = sizeof(init_uts_ns.name.nodename), + .mode = 0644, + .proc_handler = proc_do_uts_string, + .strategy = sysctl_uts_string, + }, + { + .ctl_name = KERN_DOMAINNAME, + .procname = "domainname", + .data = init_uts_ns.name.domainname, + .maxlen = sizeof(init_uts_ns.name.domainname), + .mode = 0644, + .proc_handler = proc_do_uts_string, + .strategy = sysctl_uts_string, + }, + {} +}; + +static struct ctl_table uts_root_table[] = { + { + .ctl_name = CTL_KERN, + .procname = "kernel", + .mode = 0555, + .child = uts_kern_table, + }, + {} +}; + +static int __init utsname_sysctl_init(void) +{ + register_sysctl_table(uts_root_table); + return 0; +} + +__initcall(utsname_sysctl_init); diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 020d1fff57dc..b6fa5e63085d 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -218,7 +218,7 @@ int fastcall queue_work(struct workqueue_struct *wq, struct work_struct *work) } EXPORT_SYMBOL_GPL(queue_work); -static void delayed_work_timer_fn(unsigned long __data) +void delayed_work_timer_fn(unsigned long __data) { struct delayed_work *dwork = (struct delayed_work *)__data; struct workqueue_struct *wq = get_wq_data(&dwork->work); @@ -245,6 +245,7 @@ int fastcall queue_delayed_work(struct workqueue_struct *wq, struct timer_list *timer = &dwork->timer; struct work_struct *work = &dwork->work; + timer_stats_timer_set_start_info(timer); if (delay == 0) return queue_work(wq, work); @@ -593,8 +594,10 @@ EXPORT_SYMBOL(schedule_work); * After waiting for a given time this puts a job in the kernel-global * workqueue. */ -int fastcall schedule_delayed_work(struct delayed_work *dwork, unsigned long delay) +int fastcall schedule_delayed_work(struct delayed_work *dwork, + unsigned long delay) { + timer_stats_timer_set_start_info(&dwork->timer); return queue_delayed_work(keventd_wq, dwork, delay); } EXPORT_SYMBOL(schedule_delayed_work); |