diff options
Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/acct.c | 12 | ||||
-rw-r--r-- | kernel/audit.c | 2 | ||||
-rw-r--r-- | kernel/cpuset.c | 69 | ||||
-rw-r--r-- | kernel/exit.c | 11 | ||||
-rw-r--r-- | kernel/fork.c | 22 | ||||
-rw-r--r-- | kernel/futex.c | 4 | ||||
-rw-r--r-- | kernel/futex_compat.c | 4 | ||||
-rw-r--r-- | kernel/hrtimer.c | 50 | ||||
-rw-r--r-- | kernel/irq/Makefile | 3 | ||||
-rw-r--r-- | kernel/irq/migration.c | 5 | ||||
-rw-r--r-- | kernel/module.c | 1 | ||||
-rw-r--r-- | kernel/panic.c | 1 | ||||
-rw-r--r-- | kernel/pid.c | 212 | ||||
-rw-r--r-- | kernel/power/Kconfig | 2 | ||||
-rw-r--r-- | kernel/power/pm.c | 20 | ||||
-rw-r--r-- | kernel/power/process.c | 3 | ||||
-rw-r--r-- | kernel/printk.c | 6 | ||||
-rw-r--r-- | kernel/ptrace.c | 10 | ||||
-rw-r--r-- | kernel/sched.c | 144 | ||||
-rw-r--r-- | kernel/signal.c | 12 | ||||
-rw-r--r-- | kernel/sys.c | 19 | ||||
-rw-r--r-- | kernel/sys_ni.c | 12 | ||||
-rw-r--r-- | kernel/time.c | 8 | ||||
-rw-r--r-- | kernel/timer.c | 126 |
24 files changed, 468 insertions, 290 deletions
diff --git a/kernel/acct.c b/kernel/acct.c index 065d8b4e51ef..b327f4d20104 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -449,8 +449,8 @@ static void do_acct_process(long exitcode, struct file *file) /* calculate run_time in nsec*/ do_posix_clock_monotonic_gettime(&uptime); run_time = (u64)uptime.tv_sec*NSEC_PER_SEC + uptime.tv_nsec; - run_time -= (u64)current->start_time.tv_sec*NSEC_PER_SEC - + current->start_time.tv_nsec; + run_time -= (u64)current->group_leader->start_time.tv_sec * NSEC_PER_SEC + + current->group_leader->start_time.tv_nsec; /* convert nsec -> AHZ */ elapsed = nsec_to_AHZ(run_time); #if ACCT_VERSION==3 @@ -469,10 +469,10 @@ static void do_acct_process(long exitcode, struct file *file) #endif do_div(elapsed, AHZ); ac.ac_btime = xtime.tv_sec - elapsed; - jiffies = cputime_to_jiffies(cputime_add(current->group_leader->utime, + jiffies = cputime_to_jiffies(cputime_add(current->utime, current->signal->utime)); ac.ac_utime = encode_comp_t(jiffies_to_AHZ(jiffies)); - jiffies = cputime_to_jiffies(cputime_add(current->group_leader->stime, + jiffies = cputime_to_jiffies(cputime_add(current->stime, current->signal->stime)); ac.ac_stime = encode_comp_t(jiffies_to_AHZ(jiffies)); /* we really need to bite the bullet and change layout */ @@ -522,9 +522,9 @@ static void do_acct_process(long exitcode, struct file *file) ac.ac_io = encode_comp_t(0 /* current->io_usage */); /* %% */ ac.ac_rw = encode_comp_t(ac.ac_io / 1024); ac.ac_minflt = encode_comp_t(current->signal->min_flt + - current->group_leader->min_flt); + current->min_flt); ac.ac_majflt = encode_comp_t(current->signal->maj_flt + - current->group_leader->maj_flt); + current->maj_flt); ac.ac_swaps = encode_comp_t(0); ac.ac_exitcode = exitcode; diff --git a/kernel/audit.c b/kernel/audit.c index 04fe2e301b61..c8ccbd09048f 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -578,7 +578,7 @@ static int __init audit_enable(char *str) audit_initialized ? "" : " (after initialization)"); if (audit_initialized) audit_enabled = audit_default; - return 0; + return 1; } __setup("audit=", audit_enable); diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 18aea1bd1284..72248d1b9e3f 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -616,12 +616,10 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask) * current->cpuset if a task has its memory placement changed. * Do not call this routine if in_interrupt(). * - * Call without callback_mutex or task_lock() held. May be called - * with or without manage_mutex held. Doesn't need task_lock to guard - * against another task changing a non-NULL cpuset pointer to NULL, - * as that is only done by a task on itself, and if the current task - * is here, it is not simultaneously in the exit code NULL'ing its - * cpuset pointer. This routine also might acquire callback_mutex and + * Call without callback_mutex or task_lock() held. May be + * called with or without manage_mutex held. Thanks in part to + * 'the_top_cpuset_hack', the tasks cpuset pointer will never + * be NULL. This routine also might acquire callback_mutex and * current->mm->mmap_sem during call. * * Reading current->cpuset->mems_generation doesn't need task_lock @@ -836,6 +834,55 @@ static int update_cpumask(struct cpuset *cs, char *buf) } /* + * cpuset_migrate_mm + * + * Migrate memory region from one set of nodes to another. + * + * Temporarilly set tasks mems_allowed to target nodes of migration, + * so that the migration code can allocate pages on these nodes. + * + * Call holding manage_mutex, so our current->cpuset won't change + * during this call, as manage_mutex holds off any attach_task() + * calls. Therefore we don't need to take task_lock around the + * call to guarantee_online_mems(), as we know no one is changing + * our tasks cpuset. + * + * Hold callback_mutex around the two modifications of our tasks + * mems_allowed to synchronize with cpuset_mems_allowed(). + * + * While the mm_struct we are migrating is typically from some + * other task, the task_struct mems_allowed that we are hacking + * is for our current task, which must allocate new pages for that + * migrating memory region. + * + * We call cpuset_update_task_memory_state() before hacking + * our tasks mems_allowed, so that we are assured of being in + * sync with our tasks cpuset, and in particular, callbacks to + * cpuset_update_task_memory_state() from nested page allocations + * won't see any mismatch of our cpuset and task mems_generation + * values, so won't overwrite our hacked tasks mems_allowed + * nodemask. + */ + +static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from, + const nodemask_t *to) +{ + struct task_struct *tsk = current; + + cpuset_update_task_memory_state(); + + mutex_lock(&callback_mutex); + tsk->mems_allowed = *to; + mutex_unlock(&callback_mutex); + + do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL); + + mutex_lock(&callback_mutex); + guarantee_online_mems(tsk->cpuset, &tsk->mems_allowed); + mutex_unlock(&callback_mutex); +} + +/* * Handle user request to change the 'mems' memory placement * of a cpuset. Needs to validate the request, update the * cpusets mems_allowed and mems_generation, and for each @@ -947,10 +994,8 @@ static int update_nodemask(struct cpuset *cs, char *buf) struct mm_struct *mm = mmarray[i]; mpol_rebind_mm(mm, &cs->mems_allowed); - if (migrate) { - do_migrate_pages(mm, &oldmem, &cs->mems_allowed, - MPOL_MF_MOVE_ALL); - } + if (migrate) + cpuset_migrate_mm(mm, &oldmem, &cs->mems_allowed); mmput(mm); } @@ -1185,11 +1230,11 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf) mm = get_task_mm(tsk); if (mm) { mpol_rebind_mm(mm, &to); + if (is_memory_migrate(cs)) + cpuset_migrate_mm(mm, &from, &to); mmput(mm); } - if (is_memory_migrate(cs)) - do_migrate_pages(tsk->mm, &from, &to, MPOL_MF_MOVE_ALL); put_task_struct(tsk); synchronize_rcu(); if (atomic_dec_and_test(&oldcs->count)) diff --git a/kernel/exit.c b/kernel/exit.c index bc0ec674d3f4..1a9787ac6173 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -34,6 +34,7 @@ #include <linux/mutex.h> #include <linux/futex.h> #include <linux/compat.h> +#include <linux/pipe_fs_i.h> #include <asm/uaccess.h> #include <asm/unistd.h> @@ -127,6 +128,11 @@ static void __exit_signal(struct task_struct *tsk) } } +static void delayed_put_task_struct(struct rcu_head *rhp) +{ + put_task_struct(container_of(rhp, struct task_struct, rcu)); +} + void release_task(struct task_struct * p) { int zap_leader; @@ -168,7 +174,7 @@ repeat: spin_unlock(&p->proc_lock); proc_pid_flush(proc_dentry); release_thread(p); - put_task_struct(p); + call_rcu(&p->rcu, delayed_put_task_struct); p = leader; if (unlikely(zap_leader)) @@ -936,6 +942,9 @@ fastcall NORET_TYPE void do_exit(long code) if (tsk->io_context) exit_io_context(); + if (tsk->splice_pipe) + __free_pipe_info(tsk->splice_pipe); + /* PF_DEAD causes final put_task_struct after we schedule. */ preempt_disable(); BUG_ON(tsk->flags & PF_DEAD); diff --git a/kernel/fork.c b/kernel/fork.c index b3f7a1bb5e55..54b15f8cda53 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -108,10 +108,8 @@ void free_task(struct task_struct *tsk) } EXPORT_SYMBOL(free_task); -void __put_task_struct_cb(struct rcu_head *rhp) +void __put_task_struct(struct task_struct *tsk) { - struct task_struct *tsk = container_of(rhp, struct task_struct, rcu); - WARN_ON(!(tsk->exit_state & (EXIT_DEAD | EXIT_ZOMBIE))); WARN_ON(atomic_read(&tsk->usage)); WARN_ON(tsk == current); @@ -721,7 +719,7 @@ out_release: free_fdset (new_fdt->open_fds, new_fdt->max_fdset); free_fd_array(new_fdt->fd, new_fdt->max_fds); kmem_cache_free(files_cachep, newf); - goto out; + return NULL; } static int copy_files(unsigned long clone_flags, struct task_struct * tsk) @@ -1311,17 +1309,19 @@ long do_fork(unsigned long clone_flags, { struct task_struct *p; int trace = 0; - long pid = alloc_pidmap(); + struct pid *pid = alloc_pid(); + long nr; - if (pid < 0) + if (!pid) return -EAGAIN; + nr = pid->nr; if (unlikely(current->ptrace)) { trace = fork_traceflag (clone_flags); if (trace) clone_flags |= CLONE_PTRACE; } - p = copy_process(clone_flags, stack_start, regs, stack_size, parent_tidptr, child_tidptr, pid); + p = copy_process(clone_flags, stack_start, regs, stack_size, parent_tidptr, child_tidptr, nr); /* * Do this prior waking up the new thread - the thread pointer * might get invalid after that point, if the thread exits quickly. @@ -1348,7 +1348,7 @@ long do_fork(unsigned long clone_flags, p->state = TASK_STOPPED; if (unlikely (trace)) { - current->ptrace_message = pid; + current->ptrace_message = nr; ptrace_notify ((trace << 8) | SIGTRAP); } @@ -1358,10 +1358,10 @@ long do_fork(unsigned long clone_flags, ptrace_notify ((PTRACE_EVENT_VFORK_DONE << 8) | SIGTRAP); } } else { - free_pidmap(pid); - pid = PTR_ERR(p); + free_pid(pid); + nr = PTR_ERR(p); } - return pid; + return nr; } #ifndef ARCH_MIN_MMSTRUCT_ALIGN diff --git a/kernel/futex.c b/kernel/futex.c index 9c9b2b6b22dd..5699c512057b 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1039,9 +1039,11 @@ asmlinkage long sys_futex(u32 __user *uaddr, int op, int val, unsigned long timeout = MAX_SCHEDULE_TIMEOUT; int val2 = 0; - if ((op == FUTEX_WAIT) && utime) { + if (utime && (op == FUTEX_WAIT)) { if (copy_from_user(&t, utime, sizeof(t)) != 0) return -EFAULT; + if (!timespec_valid(&t)) + return -EINVAL; timeout = timespec_to_jiffies(&t) + 1; } /* diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c index 54274fc85321..1ab6a0ea3d14 100644 --- a/kernel/futex_compat.c +++ b/kernel/futex_compat.c @@ -129,9 +129,11 @@ asmlinkage long compat_sys_futex(u32 __user *uaddr, int op, u32 val, unsigned long timeout = MAX_SCHEDULE_TIMEOUT; int val2 = 0; - if ((op == FUTEX_WAIT) && utime) { + if (utime && (op == FUTEX_WAIT)) { if (get_compat_timespec(&t, utime)) return -EFAULT; + if (!timespec_valid(&t)) + return -EINVAL; timeout = timespec_to_jiffies(&t) + 1; } if (op >= FUTEX_REQUEUE) diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 0237a556eb1f..d2a7296c8251 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -501,6 +501,7 @@ int hrtimer_cancel(struct hrtimer *timer) if (ret >= 0) return ret; + cpu_relax(); } } @@ -606,6 +607,9 @@ static inline void run_hrtimer_queue(struct hrtimer_base *base) { struct rb_node *node; + if (!base->first) + return; + if (base->get_softirq_time) base->softirq_time = base->get_softirq_time(); @@ -655,29 +659,28 @@ void hrtimer_run_queues(void) /* * Sleep related functions: */ - -struct sleep_hrtimer { - struct hrtimer timer; - struct task_struct *task; - int expired; -}; - -static int nanosleep_wakeup(struct hrtimer *timer) +static int hrtimer_wakeup(struct hrtimer *timer) { - struct sleep_hrtimer *t = - container_of(timer, struct sleep_hrtimer, timer); + struct hrtimer_sleeper *t = + container_of(timer, struct hrtimer_sleeper, timer); + struct task_struct *task = t->task; - t->expired = 1; - wake_up_process(t->task); + t->task = NULL; + if (task) + wake_up_process(task); return HRTIMER_NORESTART; } -static int __sched do_nanosleep(struct sleep_hrtimer *t, enum hrtimer_mode mode) +void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, task_t *task) { - t->timer.function = nanosleep_wakeup; - t->task = current; - t->expired = 0; + sl->timer.function = hrtimer_wakeup; + sl->task = task; +} + +static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mode) +{ + hrtimer_init_sleeper(t, current); do { set_current_state(TASK_INTERRUPTIBLE); @@ -685,18 +688,17 @@ static int __sched do_nanosleep(struct sleep_hrtimer *t, enum hrtimer_mode mode) schedule(); - if (unlikely(!t->expired)) { - hrtimer_cancel(&t->timer); - mode = HRTIMER_ABS; - } - } while (!t->expired && !signal_pending(current)); + hrtimer_cancel(&t->timer); + mode = HRTIMER_ABS; + + } while (t->task && !signal_pending(current)); - return t->expired; + return t->task == NULL; } static long __sched nanosleep_restart(struct restart_block *restart) { - struct sleep_hrtimer t; + struct hrtimer_sleeper t; struct timespec __user *rmtp; struct timespec tu; ktime_t time; @@ -729,7 +731,7 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp, const enum hrtimer_mode mode, const clockid_t clockid) { struct restart_block *restart; - struct sleep_hrtimer t; + struct hrtimer_sleeper t; struct timespec tu; ktime_t rem; diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile index 2b33f852be3e..9f77f50d8143 100644 --- a/kernel/irq/Makefile +++ b/kernel/irq/Makefile @@ -1,4 +1,5 @@ -obj-y := handle.o manage.o spurious.o migration.o +obj-y := handle.o manage.o spurious.o obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o obj-$(CONFIG_PROC_FS) += proc.o +obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c index 52a8655fa080..134f9f2e0e39 100644 --- a/kernel/irq/migration.c +++ b/kernel/irq/migration.c @@ -1,6 +1,5 @@ -#include <linux/irq.h> -#if defined(CONFIG_GENERIC_PENDING_IRQ) +#include <linux/irq.h> void set_pending_irq(unsigned int irq, cpumask_t mask) { @@ -61,5 +60,3 @@ void move_native_irq(int irq) } cpus_clear(pending_irq_cpumask[irq]); } - -#endif diff --git a/kernel/module.c b/kernel/module.c index bd088a7c1499..d24deb0dbbc9 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1254,6 +1254,7 @@ static inline int license_is_gpl_compatible(const char *license) || strcmp(license, "GPL v2") == 0 || strcmp(license, "GPL and additional rights") == 0 || strcmp(license, "Dual BSD/GPL") == 0 + || strcmp(license, "Dual MIT/GPL") == 0 || strcmp(license, "Dual MPL/GPL") == 0); } diff --git a/kernel/panic.c b/kernel/panic.c index f895c7c01d5b..cc2a4c9c36ac 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -27,7 +27,6 @@ static int pause_on_oops_flag; static DEFINE_SPINLOCK(pause_on_oops_lock); int panic_timeout; -EXPORT_SYMBOL(panic_timeout); ATOMIC_NOTIFIER_HEAD(panic_notifier_list); diff --git a/kernel/pid.c b/kernel/pid.c index a9f2dfd006d2..eeb836b65ca4 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -28,8 +28,9 @@ #include <linux/hash.h> #define pid_hashfn(nr) hash_long((unsigned long)nr, pidhash_shift) -static struct hlist_head *pid_hash[PIDTYPE_MAX]; +static struct hlist_head *pid_hash; static int pidhash_shift; +static kmem_cache_t *pid_cachep; int pid_max = PID_MAX_DEFAULT; int last_pid; @@ -60,9 +61,22 @@ typedef struct pidmap { static pidmap_t pidmap_array[PIDMAP_ENTRIES] = { [ 0 ... PIDMAP_ENTRIES-1 ] = { ATOMIC_INIT(BITS_PER_PAGE), NULL } }; +/* + * Note: disable interrupts while the pidmap_lock is held as an + * interrupt might come in and do read_lock(&tasklist_lock). + * + * If we don't disable interrupts there is a nasty deadlock between + * detach_pid()->free_pid() and another cpu that does + * spin_lock(&pidmap_lock) followed by an interrupt routine that does + * read_lock(&tasklist_lock); + * + * After we clean up the tasklist_lock and know there are no + * irq handlers that take it we can leave the interrupts enabled. + * For now it is easier to be safe than to prove it can't happen. + */ static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock); -fastcall void free_pidmap(int pid) +static fastcall void free_pidmap(int pid) { pidmap_t *map = pidmap_array + pid / BITS_PER_PAGE; int offset = pid & BITS_PER_PAGE_MASK; @@ -71,7 +85,7 @@ fastcall void free_pidmap(int pid) atomic_inc(&map->nr_free); } -int alloc_pidmap(void) +static int alloc_pidmap(void) { int i, offset, max_scan, pid, last = last_pid; pidmap_t *map; @@ -89,12 +103,12 @@ int alloc_pidmap(void) * Free the page if someone raced with us * installing it: */ - spin_lock(&pidmap_lock); + spin_lock_irq(&pidmap_lock); if (map->page) free_page(page); else map->page = (void *)page; - spin_unlock(&pidmap_lock); + spin_unlock_irq(&pidmap_lock); if (unlikely(!map->page)) break; } @@ -131,13 +145,73 @@ int alloc_pidmap(void) return -1; } -struct pid * fastcall find_pid(enum pid_type type, int nr) +fastcall void put_pid(struct pid *pid) +{ + if (!pid) + return; + if ((atomic_read(&pid->count) == 1) || + atomic_dec_and_test(&pid->count)) + kmem_cache_free(pid_cachep, pid); +} + +static void delayed_put_pid(struct rcu_head *rhp) +{ + struct pid *pid = container_of(rhp, struct pid, rcu); + put_pid(pid); +} + +fastcall void free_pid(struct pid *pid) +{ + /* We can be called with write_lock_irq(&tasklist_lock) held */ + unsigned long flags; + + spin_lock_irqsave(&pidmap_lock, flags); + hlist_del_rcu(&pid->pid_chain); + spin_unlock_irqrestore(&pidmap_lock, flags); + + free_pidmap(pid->nr); + call_rcu(&pid->rcu, delayed_put_pid); +} + +struct pid *alloc_pid(void) +{ + struct pid *pid; + enum pid_type type; + int nr = -1; + + pid = kmem_cache_alloc(pid_cachep, GFP_KERNEL); + if (!pid) + goto out; + + nr = alloc_pidmap(); + if (nr < 0) + goto out_free; + + atomic_set(&pid->count, 1); + pid->nr = nr; + for (type = 0; type < PIDTYPE_MAX; ++type) + INIT_HLIST_HEAD(&pid->tasks[type]); + + spin_lock_irq(&pidmap_lock); + hlist_add_head_rcu(&pid->pid_chain, &pid_hash[pid_hashfn(pid->nr)]); + spin_unlock_irq(&pidmap_lock); + +out: + return pid; + +out_free: + kmem_cache_free(pid_cachep, pid); + pid = NULL; + goto out; +} + +struct pid * fastcall find_pid(int nr) { struct hlist_node *elem; struct pid *pid; hlist_for_each_entry_rcu(pid, elem, - &pid_hash[type][pid_hashfn(nr)], pid_chain) { + &pid_hash[pid_hashfn(nr)], pid_chain) { if (pid->nr == nr) return pid; } @@ -146,77 +220,82 @@ struct pid * fastcall find_pid(enum pid_type type, int nr) int fastcall attach_pid(task_t *task, enum pid_type type, int nr) { - struct pid *pid, *task_pid; - - task_pid = &task->pids[type]; - pid = find_pid(type, nr); - task_pid->nr = nr; - if (pid == NULL) { - INIT_LIST_HEAD(&task_pid->pid_list); - hlist_add_head_rcu(&task_pid->pid_chain, - &pid_hash[type][pid_hashfn(nr)]); - } else { - INIT_HLIST_NODE(&task_pid->pid_chain); - list_add_tail_rcu(&task_pid->pid_list, &pid->pid_list); - } + struct pid_link *link; + struct pid *pid; + + WARN_ON(!task->pid); /* to be removed soon */ + WARN_ON(!nr); /* to be removed soon */ + + link = &task->pids[type]; + link->pid = pid = find_pid(nr); + hlist_add_head_rcu(&link->node, &pid->tasks[type]); return 0; } -static fastcall int __detach_pid(task_t *task, enum pid_type type) +void fastcall detach_pid(task_t *task, enum pid_type type) { - struct pid *pid, *pid_next; - int nr = 0; + struct pid_link *link; + struct pid *pid; + int tmp; - pid = &task->pids[type]; - if (!hlist_unhashed(&pid->pid_chain)) { + link = &task->pids[type]; + pid = link->pid; - if (list_empty(&pid->pid_list)) { - nr = pid->nr; - hlist_del_rcu(&pid->pid_chain); - } else { - pid_next = list_entry(pid->pid_list.next, - struct pid, pid_list); - /* insert next pid from pid_list to hash */ - hlist_replace_rcu(&pid->pid_chain, - &pid_next->pid_chain); - } - } + hlist_del_rcu(&link->node); + link->pid = NULL; - list_del_rcu(&pid->pid_list); - pid->nr = 0; + for (tmp = PIDTYPE_MAX; --tmp >= 0; ) + if (!hlist_empty(&pid->tasks[tmp])) + return; - return nr; + free_pid(pid); } -void fastcall detach_pid(task_t *task, enum pid_type type) +struct task_struct * fastcall pid_task(struct pid *pid, enum pid_type type) { - int tmp, nr; + struct task_struct *result = NULL; + if (pid) { + struct hlist_node *first; + first = rcu_dereference(pid->tasks[type].first); + if (first) + result = hlist_entry(first, struct task_struct, pids[(type)].node); + } + return result; +} - nr = __detach_pid(task, type); - if (!nr) - return; +/* + * Must be called under rcu_read_lock() or with tasklist_lock read-held. + */ +task_t *find_task_by_pid_type(int type, int nr) +{ + return pid_task(find_pid(nr), type); +} - for (tmp = PIDTYPE_MAX; --tmp >= 0; ) - if (tmp != type && find_pid(tmp, nr)) - return; +EXPORT_SYMBOL(find_task_by_pid_type); - free_pidmap(nr); +struct task_struct *fastcall get_pid_task(struct pid *pid, enum pid_type type) +{ + struct task_struct *result; + rcu_read_lock(); + result = pid_task(pid, type); + if (result) + get_task_struct(result); + rcu_read_unlock(); + return result; } -task_t *find_task_by_pid_type(int type, int nr) +struct pid *find_get_pid(pid_t nr) { struct pid *pid; - pid = find_pid(type, nr); - if (!pid) - return NULL; + rcu_read_lock(); + pid = get_pid(find_pid(nr)); + rcu_read_unlock(); - return pid_task(&pid->pid_list, type); + return pid; } -EXPORT_SYMBOL(find_task_by_pid_type); - /* * The pid hash table is scaled according to the amount of memory in the * machine. From a minimum of 16 slots up to 4096 slots at one gigabyte or @@ -224,7 +303,7 @@ EXPORT_SYMBOL(find_task_by_pid_type); */ void __init pidhash_init(void) { - int i, j, pidhash_size; + int i, pidhash_size; unsigned long megabytes = nr_kernel_pages >> (20 - PAGE_SHIFT); pidhash_shift = max(4, fls(megabytes * 4)); @@ -233,16 +312,13 @@ void __init pidhash_init(void) printk("PID hash table entries: %d (order: %d, %Zd bytes)\n", pidhash_size, pidhash_shift, - PIDTYPE_MAX * pidhash_size * sizeof(struct hlist_head)); - - for (i = 0; i < PIDTYPE_MAX; i++) { - pid_hash[i] = alloc_bootmem(pidhash_size * - sizeof(*(pid_hash[i]))); - if (!pid_hash[i]) - panic("Could not alloc pidhash!\n"); - for (j = 0; j < pidhash_size; j++) - INIT_HLIST_HEAD(&pid_hash[i][j]); - } + pidhash_size * sizeof(struct hlist_head)); + + pid_hash = alloc_bootmem(pidhash_size * sizeof(*(pid_hash))); + if (!pid_hash) + panic("Could not alloc pidhash!\n"); + for (i = 0; i < pidhash_size; i++) + INIT_HLIST_HEAD(&pid_hash[i]); } void __init pidmap_init(void) @@ -251,4 +327,8 @@ void __init pidmap_init(void) /* Reserve PID 0. We never call free_pidmap(0) */ set_bit(0, pidmap_array->page); atomic_dec(&pidmap_array->nr_free); + + pid_cachep = kmem_cache_create("pid", sizeof(struct pid), + __alignof__(struct pid), + SLAB_PANIC, NULL, NULL); } diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 9fd8d4f03595..ce0dfb8f4a4e 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -41,7 +41,7 @@ config SOFTWARE_SUSPEND depends on PM && SWAP && (X86 && (!SMP || SUSPEND_SMP)) || ((FRV || PPC32) && !SMP) ---help--- Enable the possibility of suspending the machine. - It doesn't need APM. + It doesn't need ACPI or APM. You may suspend your machine by 'swsusp' or 'shutdown -z <time>' (patch for sysvinit needed). diff --git a/kernel/power/pm.c b/kernel/power/pm.c index 0f6908cce1dd..84063ac8fcfc 100644 --- a/kernel/power/pm.c +++ b/kernel/power/pm.c @@ -75,25 +75,6 @@ struct pm_dev *pm_register(pm_dev_t type, return dev; } -/** - * pm_unregister - unregister a device with power management - * @dev: device to unregister - * - * Remove a device from the power management notification lists. The - * dev passed must be a handle previously returned by pm_register. - */ - -void pm_unregister(struct pm_dev *dev) -{ - if (dev) { - mutex_lock(&pm_devs_lock); - list_del(&dev->entry); - mutex_unlock(&pm_devs_lock); - - kfree(dev); - } -} - static void __pm_unregister(struct pm_dev *dev) { if (dev) { @@ -258,7 +239,6 @@ int pm_send_all(pm_request_t rqst, void *data) } EXPORT_SYMBOL(pm_register); -EXPORT_SYMBOL(pm_unregister); EXPORT_SYMBOL(pm_unregister_all); EXPORT_SYMBOL(pm_send_all); EXPORT_SYMBOL(pm_active); diff --git a/kernel/power/process.c b/kernel/power/process.c index 8ac7c35fad77..b2a5f671d6cd 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -26,8 +26,7 @@ static inline int freezeable(struct task_struct * p) (p->flags & PF_NOFREEZE) || (p->exit_state == EXIT_ZOMBIE) || (p->exit_state == EXIT_DEAD) || - (p->state == TASK_STOPPED) || - (p->state == TASK_TRACED)) + (p->state == TASK_STOPPED)) return 0; return 1; } diff --git a/kernel/printk.c b/kernel/printk.c index 8cc19431e74b..c056f3324432 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -360,8 +360,7 @@ static void call_console_drivers(unsigned long start, unsigned long end) unsigned long cur_index, start_print; static int msg_level = -1; - if (((long)(start - end)) > 0) - BUG(); + BUG_ON(((long)(start - end)) > 0); cur_index = start; start_print = start; @@ -708,8 +707,7 @@ int __init add_preferred_console(char *name, int idx, char *options) */ void acquire_console_sem(void) { - if (in_interrupt()) - BUG(); + BUG_ON(in_interrupt()); down(&console_sem); console_locked = 1; console_may_schedule = 1; diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 86a7f6c60cb2..4e0f0ec003f7 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -30,8 +30,7 @@ */ void __ptrace_link(task_t *child, task_t *new_parent) { - if (!list_empty(&child->ptrace_list)) - BUG(); + BUG_ON(!list_empty(&child->ptrace_list)); if (child->parent == new_parent) return; list_add(&child->ptrace_list, &child->parent->ptrace_children); @@ -57,10 +56,6 @@ void ptrace_untrace(task_t *child) signal_wake_up(child, 1); } } - if (child->signal->flags & SIGNAL_GROUP_EXIT) { - sigaddset(&child->pending.signal, SIGKILL); - signal_wake_up(child, 1); - } spin_unlock(&child->sighand->siglock); } @@ -82,7 +77,8 @@ void __ptrace_unlink(task_t *child) add_parent(child); } - ptrace_untrace(child); + if (child->state == TASK_TRACED) + ptrace_untrace(child); } /* diff --git a/kernel/sched.c b/kernel/sched.c index a9ecac398bb9..365f0b90b4de 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -665,11 +665,57 @@ static int effective_prio(task_t *p) } /* + * We place interactive tasks back into the active array, if possible. + * + * To guarantee that this does not starve expired tasks we ignore the + * interactivity of a task if the first expired task had to wait more + * than a 'reasonable' amount of time. This deadline timeout is + * load-dependent, as the frequency of array switched decreases with + * increasing number of running tasks. We also ignore the interactivity + * if a better static_prio task has expired, and switch periodically + * regardless, to ensure that highly interactive tasks do not starve + * the less fortunate for unreasonably long periods. + */ +static inline int expired_starving(runqueue_t *rq) +{ + int limit; + + /* + * Arrays were recently switched, all is well + */ + if (!rq->expired_timestamp) + return 0; + + limit = STARVATION_LIMIT * rq->nr_running; + + /* + * It's time to switch arrays + */ + if (jiffies - rq->expired_timestamp >= limit) + return 1; + + /* + * There's a better selection in the expired array + */ + if (rq->curr->static_prio > rq->best_expired_prio) + return 1; + + /* + * All is well + */ + return 0; +} + +/* * __activate_task - move a task to the runqueue. */ -static inline void __activate_task(task_t *p, runqueue_t *rq) +static void __activate_task(task_t *p, runqueue_t *rq) { - enqueue_task(p, rq->active); + prio_array_t *target = rq->active; + + if (unlikely(batch_task(p) || (expired_starving(rq) && !rt_task(p)))) + target = rq->expired; + enqueue_task(p, target); rq->nr_running++; } @@ -688,7 +734,7 @@ static int recalc_task_prio(task_t *p, unsigned long long now) unsigned long long __sleep_time = now - p->timestamp; unsigned long sleep_time; - if (unlikely(p->policy == SCHED_BATCH)) + if (batch_task(p)) sleep_time = 0; else { if (__sleep_time > NS_MAX_SLEEP_AVG) @@ -700,21 +746,25 @@ static int recalc_task_prio(task_t *p, unsigned long long now) if (likely(sleep_time > 0)) { /* * User tasks that sleep a long time are categorised as - * idle and will get just interactive status to stay active & - * prevent them suddenly becoming cpu hogs and starving - * other processes. + * idle. They will only have their sleep_avg increased to a + * level that makes them just interactive priority to stay + * active yet prevent them suddenly becoming cpu hogs and + * starving other processes. */ - if (p->mm && p->activated != -1 && - sleep_time > INTERACTIVE_SLEEP(p)) { - p->sleep_avg = JIFFIES_TO_NS(MAX_SLEEP_AVG - - DEF_TIMESLICE); + if (p->mm && sleep_time > INTERACTIVE_SLEEP(p)) { + unsigned long ceiling; + + ceiling = JIFFIES_TO_NS(MAX_SLEEP_AVG - + DEF_TIMESLICE); + if (p->sleep_avg < ceiling) + p->sleep_avg = ceiling; } else { /* * Tasks waking from uninterruptible sleep are * limited in their sleep_avg rise as they * are likely to be waiting on I/O */ - if (p->activated == -1 && p->mm) { + if (p->sleep_type == SLEEP_NONINTERACTIVE && p->mm) { if (p->sleep_avg >= INTERACTIVE_SLEEP(p)) sleep_time = 0; else if (p->sleep_avg + sleep_time >= @@ -769,7 +819,7 @@ static void activate_task(task_t *p, runqueue_t *rq, int local) * This checks to make sure it's not an uninterruptible task * that is now waking up. */ - if (!p->activated) { + if (p->sleep_type == SLEEP_NORMAL) { /* * Tasks which were woken up by interrupts (ie. hw events) * are most likely of interactive nature. So we give them @@ -778,13 +828,13 @@ static void activate_task(task_t *p, runqueue_t *rq, int local) * on a CPU, first time around: */ if (in_interrupt()) - p->activated = 2; + p->sleep_type = SLEEP_INTERRUPTED; else { /* * Normal first-time wakeups get a credit too for * on-runqueue time, but it will be weighted down: */ - p->activated = 1; + p->sleep_type = SLEEP_INTERACTIVE; } } p->timestamp = now; @@ -1272,19 +1322,19 @@ out_activate: * Tasks on involuntary sleep don't earn * sleep_avg beyond just interactive state. */ - p->activated = -1; - } + p->sleep_type = SLEEP_NONINTERACTIVE; + } else /* * Tasks that have marked their sleep as noninteractive get - * woken up without updating their sleep average. (i.e. their - * sleep is handled in a priority-neutral manner, no priority - * boost and no penalty.) + * woken up with their sleep average not weighted in an + * interactive way. */ - if (old_state & TASK_NONINTERACTIVE) - __activate_task(p, rq); - else - activate_task(p, rq, cpu == this_cpu); + if (old_state & TASK_NONINTERACTIVE) + p->sleep_type = SLEEP_NONINTERACTIVE; + + + activate_task(p, rq, cpu == this_cpu); /* * Sync wakeups (i.e. those types of wakeups where the waker * has indicated that it will leave the CPU in short order) @@ -1658,6 +1708,21 @@ unsigned long nr_iowait(void) return sum; } +unsigned long nr_active(void) +{ + unsigned long i, running = 0, uninterruptible = 0; + + for_each_online_cpu(i) { + running += cpu_rq(i)->nr_running; + uninterruptible += cpu_rq(i)->nr_uninterruptible; + } + + if (unlikely((long)uninterruptible < 0)) + uninterruptible = 0; + + return running + uninterruptible; +} + #ifdef CONFIG_SMP /* @@ -2467,22 +2532,6 @@ unsigned long long current_sched_time(const task_t *tsk) } /* - * We place interactive tasks back into the active array, if possible. - * - * To guarantee that this does not starve expired tasks we ignore the - * interactivity of a task if the first expired task had to wait more - * than a 'reasonable' amount of time. This deadline timeout is - * load-dependent, as the frequency of array switched decreases with - * increasing number of running tasks. We also ignore the interactivity - * if a better static_prio task has expired: - */ -#define EXPIRED_STARVING(rq) \ - ((STARVATION_LIMIT && ((rq)->expired_timestamp && \ - (jiffies - (rq)->expired_timestamp >= \ - STARVATION_LIMIT * ((rq)->nr_running) + 1))) || \ - ((rq)->curr->static_prio > (rq)->best_expired_prio)) - -/* * Account user cpu time to a process. * @p: the process that the cpu time gets accounted to * @hardirq_offset: the offset to subtract from hardirq_count() @@ -2617,7 +2666,7 @@ void scheduler_tick(void) if (!rq->expired_timestamp) rq->expired_timestamp = jiffies; - if (!TASK_INTERACTIVE(p) || EXPIRED_STARVING(rq)) { + if (!TASK_INTERACTIVE(p) || expired_starving(rq)) { enqueue_task(p, rq->expired); if (p->static_prio < rq->best_expired_prio) rq->best_expired_prio = p->static_prio; @@ -2860,6 +2909,12 @@ EXPORT_SYMBOL(sub_preempt_count); #endif +static inline int interactive_sleep(enum sleep_type sleep_type) +{ + return (sleep_type == SLEEP_INTERACTIVE || + sleep_type == SLEEP_INTERRUPTED); +} + /* * schedule() is the main scheduler function. */ @@ -2983,12 +3038,12 @@ go_idle: queue = array->queue + idx; next = list_entry(queue->next, task_t, run_list); - if (!rt_task(next) && next->activated > 0) { + if (!rt_task(next) && interactive_sleep(next->sleep_type)) { unsigned long long delta = now - next->timestamp; if (unlikely((long long)(now - next->timestamp) < 0)) delta = 0; - if (next->activated == 1) + if (next->sleep_type == SLEEP_INTERACTIVE) delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128; array = next->array; @@ -2998,10 +3053,9 @@ go_idle: dequeue_task(next, array); next->prio = new_prio; enqueue_task(next, array); - } else - requeue_task(next, array); + } } - next->activated = 0; + next->sleep_type = SLEEP_NORMAL; switch_tasks: if (next == rq->idle) schedstat_inc(rq, sched_goidle); diff --git a/kernel/signal.c b/kernel/signal.c index 4922928d91f6..e5f8aea78ffe 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -769,8 +769,7 @@ specific_send_sig_info(int sig, struct siginfo *info, struct task_struct *t) { int ret = 0; - if (!irqs_disabled()) - BUG(); + BUG_ON(!irqs_disabled()); assert_spin_locked(&t->sighand->siglock); /* Short-circuit ignored signals. */ @@ -869,7 +868,6 @@ __group_complete_signal(int sig, struct task_struct *p) if (t == NULL) /* restart balancing at this thread */ t = p->signal->curr_target = p; - BUG_ON(t->tgid != p->tgid); while (!wants_signal(sig, t)) { t = next_thread(t); @@ -1384,8 +1382,7 @@ send_group_sigqueue(int sig, struct sigqueue *q, struct task_struct *p) * the overrun count. Other uses should not try to * send the signal multiple times. */ - if (q->info.si_code != SI_TIMER) - BUG(); + BUG_ON(q->info.si_code != SI_TIMER); q->info.si_overrun++; goto out; } @@ -1560,6 +1557,7 @@ static void ptrace_stop(int exit_code, int nostop_code, siginfo_t *info) /* Let the debugger run. */ set_current_state(TASK_TRACED); spin_unlock_irq(¤t->sighand->siglock); + try_to_freeze(); read_lock(&tasklist_lock); if (likely(current->ptrace & PT_PTRACED) && likely(current->parent != current->real_parent || @@ -1756,9 +1754,9 @@ relock: /* Let the debugger run. */ ptrace_stop(signr, signr, info); - /* We're back. Did the debugger cancel the sig or group_exit? */ + /* We're back. Did the debugger cancel the sig? */ signr = current->exit_code; - if (signr == 0 || current->signal->flags & SIGNAL_GROUP_EXIT) + if (signr == 0) continue; current->exit_code = 0; diff --git a/kernel/sys.c b/kernel/sys.c index 7ef7f6054c28..0b6ec0e7936f 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1372,18 +1372,29 @@ asmlinkage long sys_getsid(pid_t pid) asmlinkage long sys_setsid(void) { struct task_struct *group_leader = current->group_leader; - struct pid *pid; + pid_t session; int err = -EPERM; mutex_lock(&tty_mutex); write_lock_irq(&tasklist_lock); - pid = find_pid(PIDTYPE_PGID, group_leader->pid); - if (pid) + /* Fail if I am already a session leader */ + if (group_leader->signal->leader) + goto out; + + session = group_leader->pid; + /* Fail if a process group id already exists that equals the + * proposed session id. + * + * Don't check if session id == 1 because kernel threads use this + * session id and so the check will always fail and make it so + * init cannot successfully call setsid. + */ + if (session > 1 && find_task_by_pid_type(PIDTYPE_PGID, session)) goto out; group_leader->signal->leader = 1; - __set_special_pids(group_leader->pid, group_leader->pid); + __set_special_pids(session, session); group_leader->signal->tty = NULL; group_leader->signal->tty_old_pgrp = 0; err = process_group(group_leader); diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index d82864c4a617..5433195040f1 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -120,3 +120,15 @@ cond_syscall(sys32_sysctl); cond_syscall(ppc_rtas); cond_syscall(sys_spu_run); cond_syscall(sys_spu_create); + +/* mmu depending weak syscall entries */ +cond_syscall(sys_mprotect); +cond_syscall(sys_msync); +cond_syscall(sys_mlock); +cond_syscall(sys_munlock); +cond_syscall(sys_mlockall); +cond_syscall(sys_munlockall); +cond_syscall(sys_mincore); +cond_syscall(sys_madvise); +cond_syscall(sys_mremap); +cond_syscall(sys_remap_file_pages); diff --git a/kernel/time.c b/kernel/time.c index ff8e7019c4c4..b00ddc71cedb 100644 --- a/kernel/time.c +++ b/kernel/time.c @@ -410,7 +410,7 @@ EXPORT_SYMBOL(current_kernel_time); * current_fs_time - Return FS time * @sb: Superblock. * - * Return the current time truncated to the time granuality supported by + * Return the current time truncated to the time granularity supported by * the fs. */ struct timespec current_fs_time(struct super_block *sb) @@ -421,11 +421,11 @@ struct timespec current_fs_time(struct super_block *sb) EXPORT_SYMBOL(current_fs_time); /** - * timespec_trunc - Truncate timespec to a granuality + * timespec_trunc - Truncate timespec to a granularity * @t: Timespec - * @gran: Granuality in ns. + * @gran: Granularity in ns. * - * Truncate a timespec to a granuality. gran must be smaller than a second. + * Truncate a timespec to a granularity. gran must be smaller than a second. * Always rounds down. * * This function should be only used for timestamps returned by diff --git a/kernel/timer.c b/kernel/timer.c index ab189dd187cb..883773788836 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -54,7 +54,6 @@ EXPORT_SYMBOL(jiffies_64); /* * per-CPU timer vector definitions: */ - #define TVN_BITS (CONFIG_BASE_SMALL ? 4 : 6) #define TVR_BITS (CONFIG_BASE_SMALL ? 6 : 8) #define TVN_SIZE (1 << TVN_BITS) @@ -62,11 +61,6 @@ EXPORT_SYMBOL(jiffies_64); #define TVN_MASK (TVN_SIZE - 1) #define TVR_MASK (TVR_SIZE - 1) -struct timer_base_s { - spinlock_t lock; - struct timer_list *running_timer; -}; - typedef struct tvec_s { struct list_head vec[TVN_SIZE]; } tvec_t; @@ -76,7 +70,8 @@ typedef struct tvec_root_s { } tvec_root_t; struct tvec_t_base_s { - struct timer_base_s t_base; + spinlock_t lock; + struct timer_list *running_timer; unsigned long timer_jiffies; tvec_root_t tv1; tvec_t tv2; @@ -86,14 +81,16 @@ struct tvec_t_base_s { } ____cacheline_aligned_in_smp; typedef struct tvec_t_base_s tvec_base_t; -static DEFINE_PER_CPU(tvec_base_t *, tvec_bases); -static tvec_base_t boot_tvec_bases; + +tvec_base_t boot_tvec_bases; +EXPORT_SYMBOL(boot_tvec_bases); +static DEFINE_PER_CPU(tvec_base_t *, tvec_bases) = { &boot_tvec_bases }; static inline void set_running_timer(tvec_base_t *base, struct timer_list *timer) { #ifdef CONFIG_SMP - base->t_base.running_timer = timer; + base->running_timer = timer; #endif } @@ -139,15 +136,6 @@ static void internal_add_timer(tvec_base_t *base, struct timer_list *timer) list_add_tail(&timer->entry, vec); } -typedef struct timer_base_s timer_base_t; -/* - * Used by TIMER_INITIALIZER, we can't use per_cpu(tvec_bases) - * at compile time, and we need timer->base to lock the timer. - */ -timer_base_t __init_timer_base - ____cacheline_aligned_in_smp = { .lock = SPIN_LOCK_UNLOCKED }; -EXPORT_SYMBOL(__init_timer_base); - /*** * init_timer - initialize a timer. * @timer: the timer to be initialized @@ -158,7 +146,7 @@ EXPORT_SYMBOL(__init_timer_base); void fastcall init_timer(struct timer_list *timer) { timer->entry.next = NULL; - timer->base = &per_cpu(tvec_bases, raw_smp_processor_id())->t_base; + timer->base = per_cpu(tvec_bases, raw_smp_processor_id()); } EXPORT_SYMBOL(init_timer); @@ -174,7 +162,7 @@ static inline void detach_timer(struct timer_list *timer, } /* - * We are using hashed locking: holding per_cpu(tvec_bases).t_base.lock + * We are using hashed locking: holding per_cpu(tvec_bases).lock * means that all timers which are tied to this base via timer->base are * locked, and the base itself is locked too. * @@ -185,10 +173,10 @@ static inline void detach_timer(struct timer_list *timer, * possible to set timer->base = NULL and drop the lock: the timer remains * locked. */ -static timer_base_t *lock_timer_base(struct timer_list *timer, +static tvec_base_t *lock_timer_base(struct timer_list *timer, unsigned long *flags) { - timer_base_t *base; + tvec_base_t *base; for (;;) { base = timer->base; @@ -205,8 +193,7 @@ static timer_base_t *lock_timer_base(struct timer_list *timer, int __mod_timer(struct timer_list *timer, unsigned long expires) { - timer_base_t *base; - tvec_base_t *new_base; + tvec_base_t *base, *new_base; unsigned long flags; int ret = 0; @@ -221,7 +208,7 @@ int __mod_timer(struct timer_list *timer, unsigned long expires) new_base = __get_cpu_var(tvec_bases); - if (base != &new_base->t_base) { + if (base != new_base) { /* * We are trying to schedule the timer on the local CPU. * However we can't change timer's base while it is running, @@ -229,21 +216,19 @@ int __mod_timer(struct timer_list *timer, unsigned long expires) * handler yet has not finished. This also guarantees that * the timer is serialized wrt itself. */ - if (unlikely(base->running_timer == timer)) { - /* The timer remains on a former base */ - new_base = container_of(base, tvec_base_t, t_base); - } else { + if (likely(base->running_timer != timer)) { /* See the comment in lock_timer_base() */ timer->base = NULL; spin_unlock(&base->lock); - spin_lock(&new_base->t_base.lock); - timer->base = &new_base->t_base; + base = new_base; + spin_lock(&base->lock); + timer->base = base; } } timer->expires = expires; - internal_add_timer(new_base, timer); - spin_unlock_irqrestore(&new_base->t_base.lock, flags); + internal_add_timer(base, timer); + spin_unlock_irqrestore(&base->lock, flags); return ret; } @@ -263,10 +248,10 @@ void add_timer_on(struct timer_list *timer, int cpu) unsigned long flags; BUG_ON(timer_pending(timer) || !timer->function); - spin_lock_irqsave(&base->t_base.lock, flags); - timer->base = &base->t_base; + spin_lock_irqsave(&base->lock, flags); + timer->base = base; internal_add_timer(base, timer); - spin_unlock_irqrestore(&base->t_base.lock, flags); + spin_unlock_irqrestore(&base->lock, flags); } @@ -319,7 +304,7 @@ EXPORT_SYMBOL(mod_timer); */ int del_timer(struct timer_list *timer) { - timer_base_t *base; + tvec_base_t *base; unsigned long flags; int ret = 0; @@ -346,7 +331,7 @@ EXPORT_SYMBOL(del_timer); */ int try_to_del_timer_sync(struct timer_list *timer) { - timer_base_t *base; + tvec_base_t *base; unsigned long flags; int ret = -1; @@ -410,7 +395,7 @@ static int cascade(tvec_base_t *base, tvec_t *tv, int index) struct timer_list *tmp; tmp = list_entry(curr, struct timer_list, entry); - BUG_ON(tmp->base != &base->t_base); + BUG_ON(tmp->base != base); curr = curr->next; internal_add_timer(base, tmp); } @@ -432,7 +417,7 @@ static inline void __run_timers(tvec_base_t *base) { struct timer_list *timer; - spin_lock_irq(&base->t_base.lock); + spin_lock_irq(&base->lock); while (time_after_eq(jiffies, base->timer_jiffies)) { struct list_head work_list = LIST_HEAD_INIT(work_list); struct list_head *head = &work_list; @@ -458,7 +443,7 @@ static inline void __run_timers(tvec_base_t *base) set_running_timer(base, timer); detach_timer(timer, 1); - spin_unlock_irq(&base->t_base.lock); + spin_unlock_irq(&base->lock); { int preempt_count = preempt_count(); fn(data); @@ -471,11 +456,11 @@ static inline void __run_timers(tvec_base_t *base) BUG(); } } - spin_lock_irq(&base->t_base.lock); + spin_lock_irq(&base->lock); } } set_running_timer(base, NULL); - spin_unlock_irq(&base->t_base.lock); + spin_unlock_irq(&base->lock); } #ifdef CONFIG_NO_IDLE_HZ @@ -506,7 +491,7 @@ unsigned long next_timer_interrupt(void) hr_expires += jiffies; base = __get_cpu_var(tvec_bases); - spin_lock(&base->t_base.lock); + spin_lock(&base->lock); expires = base->timer_jiffies + (LONG_MAX >> 1); list = NULL; @@ -554,7 +539,7 @@ found: expires = nte->expires; } } - spin_unlock(&base->t_base.lock); + spin_unlock(&base->lock); if (time_before(hr_expires, expires)) return hr_expires; @@ -841,7 +826,7 @@ void update_process_times(int user_tick) */ static unsigned long count_active_tasks(void) { - return (nr_running() + nr_uninterruptible()) * FIXED_1; + return nr_active() * FIXED_1; } /* @@ -1240,29 +1225,37 @@ static int __devinit init_timers_cpu(int cpu) { int j; tvec_base_t *base; + static char __devinitdata tvec_base_done[NR_CPUS]; - base = per_cpu(tvec_bases, cpu); - if (!base) { + if (!tvec_base_done[cpu]) { static char boot_done; - /* - * Cannot do allocation in init_timers as that runs before the - * allocator initializes (and would waste memory if there are - * more possible CPUs than will ever be installed/brought up). - */ if (boot_done) { + /* + * The APs use this path later in boot + */ base = kmalloc_node(sizeof(*base), GFP_KERNEL, cpu_to_node(cpu)); if (!base) return -ENOMEM; memset(base, 0, sizeof(*base)); + per_cpu(tvec_bases, cpu) = base; } else { - base = &boot_tvec_bases; + /* + * This is for the boot CPU - we use compile-time + * static initialisation because per-cpu memory isn't + * ready yet and because the memory allocators are not + * initialised either. + */ boot_done = 1; + base = &boot_tvec_bases; } - per_cpu(tvec_bases, cpu) = base; + tvec_base_done[cpu] = 1; + } else { + base = per_cpu(tvec_bases, cpu); } - spin_lock_init(&base->t_base.lock); + + spin_lock_init(&base->lock); for (j = 0; j < TVN_SIZE; j++) { INIT_LIST_HEAD(base->tv5.vec + j); INIT_LIST_HEAD(base->tv4.vec + j); @@ -1284,7 +1277,7 @@ static void migrate_timer_list(tvec_base_t *new_base, struct list_head *head) while (!list_empty(head)) { timer = list_entry(head->next, struct timer_list, entry); detach_timer(timer, 0); - timer->base = &new_base->t_base; + timer->base = new_base; internal_add_timer(new_base, timer); } } @@ -1300,11 +1293,11 @@ static void __devinit migrate_timers(int cpu) new_base = get_cpu_var(tvec_bases); local_irq_disable(); - spin_lock(&new_base->t_base.lock); - spin_lock(&old_base->t_base.lock); + spin_lock(&new_base->lock); + spin_lock(&old_base->lock); + + BUG_ON(old_base->running_timer); - if (old_base->t_base.running_timer) - BUG(); for (i = 0; i < TVR_SIZE; i++) migrate_timer_list(new_base, old_base->tv1.vec + i); for (i = 0; i < TVN_SIZE; i++) { @@ -1314,8 +1307,8 @@ static void __devinit migrate_timers(int cpu) migrate_timer_list(new_base, old_base->tv5.vec + i); } - spin_unlock(&old_base->t_base.lock); - spin_unlock(&new_base->t_base.lock); + spin_unlock(&old_base->lock); + spin_unlock(&new_base->lock); local_irq_enable(); put_cpu_var(tvec_bases); } @@ -1471,7 +1464,7 @@ static void time_interpolator_update(long delta_nsec) */ if (jiffies % INTERPOLATOR_ADJUST == 0) { - if (time_interpolator->skips == 0 && time_interpolator->offset > TICK_NSEC) + if (time_interpolator->skips == 0 && time_interpolator->offset > tick_nsec) time_interpolator->nsec_per_cyc--; if (time_interpolator->ns_skipped > INTERPOLATOR_MAX_SKIP && time_interpolator->offset == 0) time_interpolator->nsec_per_cyc++; @@ -1495,8 +1488,7 @@ register_time_interpolator(struct time_interpolator *ti) unsigned long flags; /* Sanity check */ - if (ti->frequency == 0 || ti->mask == 0) - BUG(); + BUG_ON(ti->frequency == 0 || ti->mask == 0); ti->nsec_per_cyc = ((u64)NSEC_PER_SEC << ti->shift) / ti->frequency; spin_lock(&time_interpolator_lock); |