diff options
Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/Makefile | 2 | ||||
-rw-r--r-- | kernel/cpu.c | 1 | ||||
-rw-r--r-- | kernel/cpuset.c | 466 | ||||
-rw-r--r-- | kernel/exit.c | 29 | ||||
-rw-r--r-- | kernel/irq/handle.c | 6 | ||||
-rw-r--r-- | kernel/kallsyms.c | 1 | ||||
-rw-r--r-- | kernel/kmod.c | 6 | ||||
-rw-r--r-- | kernel/kprobes.c | 1 | ||||
-rw-r--r-- | kernel/kthread.c | 13 | ||||
-rw-r--r-- | kernel/params.c | 1 | ||||
-rw-r--r-- | kernel/posix-cpu-timers.c | 16 | ||||
-rw-r--r-- | kernel/posix-timers.c | 19 | ||||
-rw-r--r-- | kernel/power/Makefile | 2 | ||||
-rw-r--r-- | kernel/power/disk.c | 22 | ||||
-rw-r--r-- | kernel/power/main.c | 5 | ||||
-rw-r--r-- | kernel/power/power.h | 17 | ||||
-rw-r--r-- | kernel/power/snapshot.c | 435 | ||||
-rw-r--r-- | kernel/power/swsusp.c | 576 | ||||
-rw-r--r-- | kernel/printk.c | 78 | ||||
-rw-r--r-- | kernel/ptrace.c | 7 | ||||
-rw-r--r-- | kernel/rcupdate.c | 10 | ||||
-rw-r--r-- | kernel/rcutorture.c | 492 | ||||
-rw-r--r-- | kernel/sched.c | 3 | ||||
-rw-r--r-- | kernel/signal.c | 148 | ||||
-rw-r--r-- | kernel/time.c | 25 | ||||
-rw-r--r-- | kernel/timer.c | 342 | ||||
-rw-r--r-- | kernel/workqueue.c | 33 |
27 files changed, 1631 insertions, 1125 deletions
diff --git a/kernel/Makefile b/kernel/Makefile index ff4dc02ce170..4f5a1453093a 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -22,7 +22,6 @@ obj-$(CONFIG_KEXEC) += kexec.o obj-$(CONFIG_COMPAT) += compat.o obj-$(CONFIG_CPUSETS) += cpuset.o obj-$(CONFIG_IKCONFIG) += configs.o -obj-$(CONFIG_IKCONFIG_PROC) += configs.o obj-$(CONFIG_STOP_MACHINE) += stop_machine.o obj-$(CONFIG_AUDIT) += audit.o obj-$(CONFIG_AUDITSYSCALL) += auditsc.o @@ -32,6 +31,7 @@ obj-$(CONFIG_DETECT_SOFTLOCKUP) += softlockup.o obj-$(CONFIG_GENERIC_HARDIRQS) += irq/ obj-$(CONFIG_CRASH_DUMP) += crash_dump.o obj-$(CONFIG_SECCOMP) += seccomp.o +obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y) # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is diff --git a/kernel/cpu.c b/kernel/cpu.c index 53d8263ae12e..3619e939182e 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -17,6 +17,7 @@ /* This protects CPUs going up and down... */ DECLARE_MUTEX(cpucontrol); +EXPORT_SYMBOL_GPL(cpucontrol); static struct notifier_block *cpu_chain; diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 28176d083f7b..5a737ed9dac7 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -32,6 +32,7 @@ #include <linux/kernel.h> #include <linux/kmod.h> #include <linux/list.h> +#include <linux/mempolicy.h> #include <linux/mm.h> #include <linux/module.h> #include <linux/mount.h> @@ -60,6 +61,9 @@ struct cpuset { cpumask_t cpus_allowed; /* CPUs allowed to tasks in cpuset */ nodemask_t mems_allowed; /* Memory Nodes allowed to tasks */ + /* + * Count is atomic so can incr (fork) or decr (exit) without a lock. + */ atomic_t count; /* count tasks using this cpuset */ /* @@ -142,80 +146,91 @@ static struct vfsmount *cpuset_mount; static struct super_block *cpuset_sb = NULL; /* - * cpuset_sem should be held by anyone who is depending on the children - * or sibling lists of any cpuset, or performing non-atomic operations - * on the flags or *_allowed values of a cpuset, such as raising the - * CS_REMOVED flag bit iff it is not already raised, or reading and - * conditionally modifying the *_allowed values. One kernel global - * cpuset semaphore should be sufficient - these things don't change - * that much. - * - * The code that modifies cpusets holds cpuset_sem across the entire - * operation, from cpuset_common_file_write() down, single threading - * all cpuset modifications (except for counter manipulations from - * fork and exit) across the system. This presumes that cpuset - * modifications are rare - better kept simple and safe, even if slow. - * - * The code that reads cpusets, such as in cpuset_common_file_read() - * and below, only holds cpuset_sem across small pieces of code, such - * as when reading out possibly multi-word cpumasks and nodemasks, as - * the risks are less, and the desire for performance a little greater. - * The proc_cpuset_show() routine needs to hold cpuset_sem to insure - * that no cs->dentry is NULL, as it walks up the cpuset tree to root. - * - * The hooks from fork and exit, cpuset_fork() and cpuset_exit(), don't - * (usually) grab cpuset_sem. These are the two most performance - * critical pieces of code here. The exception occurs on exit(), - * when a task in a notify_on_release cpuset exits. Then cpuset_sem + * We have two global cpuset semaphores below. They can nest. + * It is ok to first take manage_sem, then nest callback_sem. We also + * require taking task_lock() when dereferencing a tasks cpuset pointer. + * See "The task_lock() exception", at the end of this comment. + * + * A task must hold both semaphores to modify cpusets. If a task + * holds manage_sem, then it blocks others wanting that semaphore, + * ensuring that it is the only task able to also acquire callback_sem + * and be able to modify cpusets. It can perform various checks on + * the cpuset structure first, knowing nothing will change. It can + * also allocate memory while just holding manage_sem. While it is + * performing these checks, various callback routines can briefly + * acquire callback_sem to query cpusets. Once it is ready to make + * the changes, it takes callback_sem, blocking everyone else. + * + * Calls to the kernel memory allocator can not be made while holding + * callback_sem, as that would risk double tripping on callback_sem + * from one of the callbacks into the cpuset code from within + * __alloc_pages(). + * + * If a task is only holding callback_sem, then it has read-only + * access to cpusets. + * + * The task_struct fields mems_allowed and mems_generation may only + * be accessed in the context of that task, so require no locks. + * + * Any task can increment and decrement the count field without lock. + * So in general, code holding manage_sem or callback_sem can't rely + * on the count field not changing. However, if the count goes to + * zero, then only attach_task(), which holds both semaphores, can + * increment it again. Because a count of zero means that no tasks + * are currently attached, therefore there is no way a task attached + * to that cpuset can fork (the other way to increment the count). + * So code holding manage_sem or callback_sem can safely assume that + * if the count is zero, it will stay zero. Similarly, if a task + * holds manage_sem or callback_sem on a cpuset with zero count, it + * knows that the cpuset won't be removed, as cpuset_rmdir() needs + * both of those semaphores. + * + * A possible optimization to improve parallelism would be to make + * callback_sem a R/W semaphore (rwsem), allowing the callback routines + * to proceed in parallel, with read access, until the holder of + * manage_sem needed to take this rwsem for exclusive write access + * and modify some cpusets. + * + * The cpuset_common_file_write handler for operations that modify + * the cpuset hierarchy holds manage_sem across the entire operation, + * single threading all such cpuset modifications across the system. + * + * The cpuset_common_file_read() handlers only hold callback_sem across + * small pieces of code, such as when reading out possibly multi-word + * cpumasks and nodemasks. + * + * The fork and exit callbacks cpuset_fork() and cpuset_exit(), don't + * (usually) take either semaphore. These are the two most performance + * critical pieces of code here. The exception occurs on cpuset_exit(), + * when a task in a notify_on_release cpuset exits. Then manage_sem * is taken, and if the cpuset count is zero, a usermode call made * to /sbin/cpuset_release_agent with the name of the cpuset (path * relative to the root of cpuset file system) as the argument. * - * A cpuset can only be deleted if both its 'count' of using tasks is - * zero, and its list of 'children' cpusets is empty. Since all tasks - * in the system use _some_ cpuset, and since there is always at least - * one task in the system (init, pid == 1), therefore, top_cpuset - * always has either children cpusets and/or using tasks. So no need - * for any special hack to ensure that top_cpuset cannot be deleted. + * A cpuset can only be deleted if both its 'count' of using tasks + * is zero, and its list of 'children' cpusets is empty. Since all + * tasks in the system use _some_ cpuset, and since there is always at + * least one task in the system (init, pid == 1), therefore, top_cpuset + * always has either children cpusets and/or using tasks. So we don't + * need a special hack to ensure that top_cpuset cannot be deleted. + * + * The above "Tale of Two Semaphores" would be complete, but for: + * + * The task_lock() exception + * + * The need for this exception arises from the action of attach_task(), + * which overwrites one tasks cpuset pointer with another. It does + * so using both semaphores, however there are several performance + * critical places that need to reference task->cpuset without the + * expense of grabbing a system global semaphore. Therefore except as + * noted below, when dereferencing or, as in attach_task(), modifying + * a tasks cpuset pointer we use task_lock(), which acts on a spinlock + * (task->alloc_lock) already in the task_struct routinely used for + * such matters. */ -static DECLARE_MUTEX(cpuset_sem); -static struct task_struct *cpuset_sem_owner; -static int cpuset_sem_depth; - -/* - * The global cpuset semaphore cpuset_sem can be needed by the - * memory allocator to update a tasks mems_allowed (see the calls - * to cpuset_update_current_mems_allowed()) or to walk up the - * cpuset hierarchy to find a mem_exclusive cpuset see the calls - * to cpuset_excl_nodes_overlap()). - * - * But if the memory allocation is being done by cpuset.c code, it - * usually already holds cpuset_sem. Double tripping on a kernel - * semaphore deadlocks the current task, and any other task that - * subsequently tries to obtain the lock. - * - * Run all up's and down's on cpuset_sem through the following - * wrappers, which will detect this nested locking, and avoid - * deadlocking. - */ - -static inline void cpuset_down(struct semaphore *psem) -{ - if (cpuset_sem_owner != current) { - down(psem); - cpuset_sem_owner = current; - } - cpuset_sem_depth++; -} - -static inline void cpuset_up(struct semaphore *psem) -{ - if (--cpuset_sem_depth == 0) { - cpuset_sem_owner = NULL; - up(psem); - } -} +static DECLARE_MUTEX(manage_sem); +static DECLARE_MUTEX(callback_sem); /* * A couple of forward declarations required, due to cyclic reference loop: @@ -390,7 +405,7 @@ static inline struct cftype *__d_cft(struct dentry *dentry) } /* - * Call with cpuset_sem held. Writes path of cpuset into buf. + * Call with manage_sem held. Writes path of cpuset into buf. * Returns 0 on success, -errno on error. */ @@ -442,10 +457,11 @@ static int cpuset_path(const struct cpuset *cs, char *buf, int buflen) * status of the /sbin/cpuset_release_agent task, so no sense holding * our caller up for that. * - * The simple act of forking that task might require more memory, - * which might need cpuset_sem. So this routine must be called while - * cpuset_sem is not held, to avoid a possible deadlock. See also - * comments for check_for_release(), below. + * When we had only one cpuset semaphore, we had to call this + * without holding it, to avoid deadlock when call_usermodehelper() + * allocated memory. With two locks, we could now call this while + * holding manage_sem, but we still don't, so as to minimize + * the time manage_sem is held. */ static void cpuset_release_agent(const char *pathbuf) @@ -477,15 +493,15 @@ static void cpuset_release_agent(const char *pathbuf) * cs is notify_on_release() and now both the user count is zero and * the list of children is empty, prepare cpuset path in a kmalloc'd * buffer, to be returned via ppathbuf, so that the caller can invoke - * cpuset_release_agent() with it later on, once cpuset_sem is dropped. - * Call here with cpuset_sem held. + * cpuset_release_agent() with it later on, once manage_sem is dropped. + * Call here with manage_sem held. * * This check_for_release() routine is responsible for kmalloc'ing * pathbuf. The above cpuset_release_agent() is responsible for * kfree'ing pathbuf. The caller of these routines is responsible * for providing a pathbuf pointer, initialized to NULL, then - * calling check_for_release() with cpuset_sem held and the address - * of the pathbuf pointer, then dropping cpuset_sem, then calling + * calling check_for_release() with manage_sem held and the address + * of the pathbuf pointer, then dropping manage_sem, then calling * cpuset_release_agent() with pathbuf, as set by check_for_release(). */ @@ -516,7 +532,7 @@ static void check_for_release(struct cpuset *cs, char **ppathbuf) * One way or another, we guarantee to return some non-empty subset * of cpu_online_map. * - * Call with cpuset_sem held. + * Call with callback_sem held. */ static void guarantee_online_cpus(const struct cpuset *cs, cpumask_t *pmask) @@ -540,7 +556,7 @@ static void guarantee_online_cpus(const struct cpuset *cs, cpumask_t *pmask) * One way or another, we guarantee to return some non-empty subset * of node_online_map. * - * Call with cpuset_sem held. + * Call with callback_sem held. */ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask) @@ -555,22 +571,47 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask) } /* - * Refresh current tasks mems_allowed and mems_generation from - * current tasks cpuset. Call with cpuset_sem held. + * Refresh current tasks mems_allowed and mems_generation from current + * tasks cpuset. * - * This routine is needed to update the per-task mems_allowed - * data, within the tasks context, when it is trying to allocate - * memory (in various mm/mempolicy.c routines) and notices - * that some other task has been modifying its cpuset. + * Call without callback_sem or task_lock() held. May be called with + * or without manage_sem held. Will acquire task_lock() and might + * acquire callback_sem during call. + * + * The task_lock() is required to dereference current->cpuset safely. + * Without it, we could pick up the pointer value of current->cpuset + * in one instruction, and then attach_task could give us a different + * cpuset, and then the cpuset we had could be removed and freed, + * and then on our next instruction, we could dereference a no longer + * valid cpuset pointer to get its mems_generation field. + * + * This routine is needed to update the per-task mems_allowed data, + * within the tasks context, when it is trying to allocate memory + * (in various mm/mempolicy.c routines) and notices that some other + * task has been modifying its cpuset. */ static void refresh_mems(void) { - struct cpuset *cs = current->cpuset; + int my_cpusets_mem_gen; + + task_lock(current); + my_cpusets_mem_gen = current->cpuset->mems_generation; + task_unlock(current); - if (current->cpuset_mems_generation != cs->mems_generation) { + if (current->cpuset_mems_generation != my_cpusets_mem_gen) { + struct cpuset *cs; + nodemask_t oldmem = current->mems_allowed; + + down(&callback_sem); + task_lock(current); + cs = current->cpuset; guarantee_online_mems(cs, ¤t->mems_allowed); current->cpuset_mems_generation = cs->mems_generation; + task_unlock(current); + up(&callback_sem); + if (!nodes_equal(oldmem, current->mems_allowed)) + numa_policy_rebind(&oldmem, ¤t->mems_allowed); } } @@ -579,7 +620,7 @@ static void refresh_mems(void) * * One cpuset is a subset of another if all its allowed CPUs and * Memory Nodes are a subset of the other, and its exclusive flags - * are only set if the other's are set. + * are only set if the other's are set. Call holding manage_sem. */ static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q) @@ -597,7 +638,7 @@ static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q) * If we replaced the flag and mask values of the current cpuset * (cur) with those values in the trial cpuset (trial), would * our various subset and exclusive rules still be valid? Presumes - * cpuset_sem held. + * manage_sem held. * * 'cur' is the address of an actual, in-use cpuset. Operations * such as list traversal that depend on the actual address of the @@ -651,7 +692,7 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial) * exclusive child cpusets * Build these two partitions by calling partition_sched_domains * - * Call with cpuset_sem held. May nest a call to the + * Call with manage_sem held. May nest a call to the * lock_cpu_hotplug()/unlock_cpu_hotplug() pair. */ @@ -696,6 +737,10 @@ static void update_cpu_domains(struct cpuset *cur) unlock_cpu_hotplug(); } +/* + * Call with manage_sem held. May take callback_sem during call. + */ + static int update_cpumask(struct cpuset *cs, char *buf) { struct cpuset trialcs; @@ -712,12 +757,18 @@ static int update_cpumask(struct cpuset *cs, char *buf) if (retval < 0) return retval; cpus_unchanged = cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed); + down(&callback_sem); cs->cpus_allowed = trialcs.cpus_allowed; + up(&callback_sem); if (is_cpu_exclusive(cs) && !cpus_unchanged) update_cpu_domains(cs); return 0; } +/* + * Call with manage_sem held. May take callback_sem during call. + */ + static int update_nodemask(struct cpuset *cs, char *buf) { struct cpuset trialcs; @@ -732,9 +783,11 @@ static int update_nodemask(struct cpuset *cs, char *buf) return -ENOSPC; retval = validate_change(cs, &trialcs); if (retval == 0) { + down(&callback_sem); cs->mems_allowed = trialcs.mems_allowed; atomic_inc(&cpuset_mems_generation); cs->mems_generation = atomic_read(&cpuset_mems_generation); + up(&callback_sem); } return retval; } @@ -745,6 +798,8 @@ static int update_nodemask(struct cpuset *cs, char *buf) * CS_NOTIFY_ON_RELEASE) * cs: the cpuset to update * buf: the buffer where we read the 0 or 1 + * + * Call with manage_sem held. */ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf) @@ -766,16 +821,27 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf) return err; cpu_exclusive_changed = (is_cpu_exclusive(cs) != is_cpu_exclusive(&trialcs)); + down(&callback_sem); if (turning_on) set_bit(bit, &cs->flags); else clear_bit(bit, &cs->flags); + up(&callback_sem); if (cpu_exclusive_changed) update_cpu_domains(cs); return 0; } +/* + * Attack task specified by pid in 'pidbuf' to cpuset 'cs', possibly + * writing the path of the old cpuset in 'ppathbuf' if it needs to be + * notified on release. + * + * Call holding manage_sem. May take callback_sem and task_lock of + * the task 'pid' during call. + */ + static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf) { pid_t pid; @@ -792,7 +858,7 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf) read_lock(&tasklist_lock); tsk = find_task_by_pid(pid); - if (!tsk) { + if (!tsk || tsk->flags & PF_EXITING) { read_unlock(&tasklist_lock); return -ESRCH; } @@ -810,10 +876,13 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf) get_task_struct(tsk); } + down(&callback_sem); + task_lock(tsk); oldcs = tsk->cpuset; if (!oldcs) { task_unlock(tsk); + up(&callback_sem); put_task_struct(tsk); return -ESRCH; } @@ -824,6 +893,7 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf) guarantee_online_cpus(cs, &cpus); set_cpus_allowed(tsk, cpus); + up(&callback_sem); put_task_struct(tsk); if (atomic_dec_and_test(&oldcs->count)) check_for_release(oldcs, ppathbuf); @@ -867,7 +937,7 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us } buffer[nbytes] = 0; /* nul-terminate */ - cpuset_down(&cpuset_sem); + down(&manage_sem); if (is_removed(cs)) { retval = -ENODEV; @@ -901,7 +971,7 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us if (retval == 0) retval = nbytes; out2: - cpuset_up(&cpuset_sem); + up(&manage_sem); cpuset_release_agent(pathbuf); out1: kfree(buffer); @@ -941,9 +1011,9 @@ static int cpuset_sprintf_cpulist(char *page, struct cpuset *cs) { cpumask_t mask; - cpuset_down(&cpuset_sem); + down(&callback_sem); mask = cs->cpus_allowed; - cpuset_up(&cpuset_sem); + up(&callback_sem); return cpulist_scnprintf(page, PAGE_SIZE, mask); } @@ -952,9 +1022,9 @@ static int cpuset_sprintf_memlist(char *page, struct cpuset *cs) { nodemask_t mask; - cpuset_down(&cpuset_sem); + down(&callback_sem); mask = cs->mems_allowed; - cpuset_up(&cpuset_sem); + up(&callback_sem); return nodelist_scnprintf(page, PAGE_SIZE, mask); } @@ -995,7 +1065,6 @@ static ssize_t cpuset_common_file_read(struct file *file, char __user *buf, goto out; } *s++ = '\n'; - *s = '\0'; retval = simple_read_from_buffer(buf, nbytes, ppos, page, s - page); out: @@ -1048,6 +1117,21 @@ static int cpuset_file_release(struct inode *inode, struct file *file) return 0; } +/* + * cpuset_rename - Only allow simple rename of directories in place. + */ +static int cpuset_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + if (!S_ISDIR(old_dentry->d_inode->i_mode)) + return -ENOTDIR; + if (new_dentry->d_inode) + return -EEXIST; + if (old_dir != new_dir) + return -EIO; + return simple_rename(old_dir, old_dentry, new_dir, new_dentry); +} + static struct file_operations cpuset_file_operations = { .read = cpuset_file_read, .write = cpuset_file_write, @@ -1060,6 +1144,7 @@ static struct inode_operations cpuset_dir_inode_operations = { .lookup = simple_lookup, .mkdir = cpuset_mkdir, .rmdir = cpuset_rmdir, + .rename = cpuset_rename, }; static int cpuset_create_file(struct dentry *dentry, int mode) @@ -1163,7 +1248,9 @@ struct ctr_struct { /* * Load into 'pidarray' up to 'npids' of the tasks using cpuset 'cs'. - * Return actual number of pids loaded. + * Return actual number of pids loaded. No need to task_lock(p) + * when reading out p->cpuset, as we don't really care if it changes + * on the next cycle, and we are not going to try to dereference it. */ static inline int pid_array_load(pid_t *pidarray, int npids, struct cpuset *cs) { @@ -1205,6 +1292,12 @@ static int pid_array_to_buf(char *buf, int sz, pid_t *a, int npids) return cnt; } +/* + * Handle an open on 'tasks' file. Prepare a buffer listing the + * process id's of tasks currently attached to the cpuset being opened. + * + * Does not require any specific cpuset semaphores, and does not take any. + */ static int cpuset_tasks_open(struct inode *unused, struct file *file) { struct cpuset *cs = __d_cs(file->f_dentry->d_parent); @@ -1352,7 +1445,8 @@ static long cpuset_create(struct cpuset *parent, const char *name, int mode) if (!cs) return -ENOMEM; - cpuset_down(&cpuset_sem); + down(&manage_sem); + refresh_mems(); cs->flags = 0; if (notify_on_release(parent)) set_bit(CS_NOTIFY_ON_RELEASE, &cs->flags); @@ -1366,25 +1460,27 @@ static long cpuset_create(struct cpuset *parent, const char *name, int mode) cs->parent = parent; + down(&callback_sem); list_add(&cs->sibling, &cs->parent->children); + up(&callback_sem); err = cpuset_create_dir(cs, name, mode); if (err < 0) goto err; /* - * Release cpuset_sem before cpuset_populate_dir() because it + * Release manage_sem before cpuset_populate_dir() because it * will down() this new directory's i_sem and if we race with * another mkdir, we might deadlock. */ - cpuset_up(&cpuset_sem); + up(&manage_sem); err = cpuset_populate_dir(cs->dentry); /* If err < 0, we have a half-filled directory - oh well ;) */ return 0; err: list_del(&cs->sibling); - cpuset_up(&cpuset_sem); + up(&manage_sem); kfree(cs); return err; } @@ -1406,29 +1502,32 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry) /* the vfs holds both inode->i_sem already */ - cpuset_down(&cpuset_sem); + down(&manage_sem); + refresh_mems(); if (atomic_read(&cs->count) > 0) { - cpuset_up(&cpuset_sem); + up(&manage_sem); return -EBUSY; } if (!list_empty(&cs->children)) { - cpuset_up(&cpuset_sem); + up(&manage_sem); return -EBUSY; } parent = cs->parent; + down(&callback_sem); set_bit(CS_REMOVED, &cs->flags); if (is_cpu_exclusive(cs)) update_cpu_domains(cs); list_del(&cs->sibling); /* delete my sibling from parent->children */ - if (list_empty(&parent->children)) - check_for_release(parent, &pathbuf); spin_lock(&cs->dentry->d_lock); d = dget(cs->dentry); cs->dentry = NULL; spin_unlock(&d->d_lock); cpuset_d_remove_dir(d); dput(d); - cpuset_up(&cpuset_sem); + up(&callback_sem); + if (list_empty(&parent->children)) + check_for_release(parent, &pathbuf); + up(&manage_sem); cpuset_release_agent(pathbuf); return 0; } @@ -1488,16 +1587,26 @@ void __init cpuset_init_smp(void) * cpuset_fork - attach newly forked task to its parents cpuset. * @tsk: pointer to task_struct of forking parent process. * - * Description: By default, on fork, a task inherits its - * parent's cpuset. The pointer to the shared cpuset is - * automatically copied in fork.c by dup_task_struct(). - * This cpuset_fork() routine need only increment the usage - * counter in that cpuset. + * Description: A task inherits its parent's cpuset at fork(). + * + * A pointer to the shared cpuset was automatically copied in fork.c + * by dup_task_struct(). However, we ignore that copy, since it was + * not made under the protection of task_lock(), so might no longer be + * a valid cpuset pointer. attach_task() might have already changed + * current->cpuset, allowing the previously referenced cpuset to + * be removed and freed. Instead, we task_lock(current) and copy + * its present value of current->cpuset for our freshly forked child. + * + * At the point that cpuset_fork() is called, 'current' is the parent + * task, and the passed argument 'child' points to the child task. **/ -void cpuset_fork(struct task_struct *tsk) +void cpuset_fork(struct task_struct *child) { - atomic_inc(&tsk->cpuset->count); + task_lock(current); + child->cpuset = current->cpuset; + atomic_inc(&child->cpuset->count); + task_unlock(current); } /** @@ -1506,35 +1615,42 @@ void cpuset_fork(struct task_struct *tsk) * * Description: Detach cpuset from @tsk and release it. * - * Note that cpusets marked notify_on_release force every task - * in them to take the global cpuset_sem semaphore when exiting. - * This could impact scaling on very large systems. Be reluctant - * to use notify_on_release cpusets where very high task exit - * scaling is required on large systems. - * - * Don't even think about derefencing 'cs' after the cpuset use - * count goes to zero, except inside a critical section guarded - * by the cpuset_sem semaphore. If you don't hold cpuset_sem, - * then a zero cpuset use count is a license to any other task to - * nuke the cpuset immediately. + * Note that cpusets marked notify_on_release force every task in + * them to take the global manage_sem semaphore when exiting. + * This could impact scaling on very large systems. Be reluctant to + * use notify_on_release cpusets where very high task exit scaling + * is required on large systems. + * + * Don't even think about derefencing 'cs' after the cpuset use count + * goes to zero, except inside a critical section guarded by manage_sem + * or callback_sem. Otherwise a zero cpuset use count is a license to + * any other task to nuke the cpuset immediately, via cpuset_rmdir(). + * + * This routine has to take manage_sem, not callback_sem, because + * it is holding that semaphore while calling check_for_release(), + * which calls kmalloc(), so can't be called holding callback__sem(). + * + * We don't need to task_lock() this reference to tsk->cpuset, + * because tsk is already marked PF_EXITING, so attach_task() won't + * mess with it. **/ void cpuset_exit(struct task_struct *tsk) { struct cpuset *cs; - task_lock(tsk); + BUG_ON(!(tsk->flags & PF_EXITING)); + cs = tsk->cpuset; tsk->cpuset = NULL; - task_unlock(tsk); if (notify_on_release(cs)) { char *pathbuf = NULL; - cpuset_down(&cpuset_sem); + down(&manage_sem); if (atomic_dec_and_test(&cs->count)) check_for_release(cs, &pathbuf); - cpuset_up(&cpuset_sem); + up(&manage_sem); cpuset_release_agent(pathbuf); } else { atomic_dec(&cs->count); @@ -1555,11 +1671,11 @@ cpumask_t cpuset_cpus_allowed(const struct task_struct *tsk) { cpumask_t mask; - cpuset_down(&cpuset_sem); + down(&callback_sem); task_lock((struct task_struct *)tsk); guarantee_online_cpus(tsk->cpuset, &mask); task_unlock((struct task_struct *)tsk); - cpuset_up(&cpuset_sem); + up(&callback_sem); return mask; } @@ -1575,19 +1691,28 @@ void cpuset_init_current_mems_allowed(void) * If the current tasks cpusets mems_allowed changed behind our backs, * update current->mems_allowed and mems_generation to the new value. * Do not call this routine if in_interrupt(). + * + * Call without callback_sem or task_lock() held. May be called + * with or without manage_sem held. Unless exiting, it will acquire + * task_lock(). Also might acquire callback_sem during call to + * refresh_mems(). */ void cpuset_update_current_mems_allowed(void) { - struct cpuset *cs = current->cpuset; + struct cpuset *cs; + int need_to_refresh = 0; + task_lock(current); + cs = current->cpuset; if (!cs) - return; /* task is exiting */ - if (current->cpuset_mems_generation != cs->mems_generation) { - cpuset_down(&cpuset_sem); + goto done; + if (current->cpuset_mems_generation != cs->mems_generation) + need_to_refresh = 1; +done: + task_unlock(current); + if (need_to_refresh) refresh_mems(); - cpuset_up(&cpuset_sem); - } } /** @@ -1621,7 +1746,7 @@ int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl) /* * nearest_exclusive_ancestor() - Returns the nearest mem_exclusive - * ancestor to the specified cpuset. Call while holding cpuset_sem. + * ancestor to the specified cpuset. Call holding callback_sem. * If no ancestor is mem_exclusive (an unusual configuration), then * returns the root cpuset. */ @@ -1648,12 +1773,12 @@ static const struct cpuset *nearest_exclusive_ancestor(const struct cpuset *cs) * GFP_KERNEL allocations are not so marked, so can escape to the * nearest mem_exclusive ancestor cpuset. * - * Scanning up parent cpusets requires cpuset_sem. The __alloc_pages() + * Scanning up parent cpusets requires callback_sem. The __alloc_pages() * routine only calls here with __GFP_HARDWALL bit _not_ set if * it's a GFP_KERNEL allocation, and all nodes in the current tasks * mems_allowed came up empty on the first pass over the zonelist. * So only GFP_KERNEL allocations, if all nodes in the cpuset are - * short of memory, might require taking the cpuset_sem semaphore. + * short of memory, might require taking the callback_sem semaphore. * * The first loop over the zonelist in mm/page_alloc.c:__alloc_pages() * calls here with __GFP_HARDWALL always set in gfp_mask, enforcing @@ -1685,14 +1810,16 @@ int cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask) return 0; /* Not hardwall and node outside mems_allowed: scan up cpusets */ - cpuset_down(&cpuset_sem); - cs = current->cpuset; - if (!cs) - goto done; /* current task exiting */ - cs = nearest_exclusive_ancestor(cs); + down(&callback_sem); + + if (current->flags & PF_EXITING) /* Let dying task have memory */ + return 1; + task_lock(current); + cs = nearest_exclusive_ancestor(current->cpuset); + task_unlock(current); + allowed = node_isset(node, cs->mems_allowed); -done: - cpuset_up(&cpuset_sem); + up(&callback_sem); return allowed; } @@ -1705,7 +1832,7 @@ done: * determine if task @p's memory usage might impact the memory * available to the current task. * - * Acquires cpuset_sem - not suitable for calling from a fast path. + * Acquires callback_sem - not suitable for calling from a fast path. **/ int cpuset_excl_nodes_overlap(const struct task_struct *p) @@ -1713,18 +1840,27 @@ int cpuset_excl_nodes_overlap(const struct task_struct *p) const struct cpuset *cs1, *cs2; /* my and p's cpuset ancestors */ int overlap = 0; /* do cpusets overlap? */ - cpuset_down(&cpuset_sem); - cs1 = current->cpuset; - if (!cs1) - goto done; /* current task exiting */ - cs2 = p->cpuset; - if (!cs2) - goto done; /* task p is exiting */ - cs1 = nearest_exclusive_ancestor(cs1); - cs2 = nearest_exclusive_ancestor(cs2); + down(&callback_sem); + + task_lock(current); + if (current->flags & PF_EXITING) { + task_unlock(current); + goto done; + } + cs1 = nearest_exclusive_ancestor(current->cpuset); + task_unlock(current); + + task_lock((struct task_struct *)p); + if (p->flags & PF_EXITING) { + task_unlock((struct task_struct *)p); + goto done; + } + cs2 = nearest_exclusive_ancestor(p->cpuset); + task_unlock((struct task_struct *)p); + overlap = nodes_intersects(cs1->mems_allowed, cs2->mems_allowed); done: - cpuset_up(&cpuset_sem); + up(&callback_sem); return overlap; } @@ -1733,6 +1869,10 @@ done: * proc_cpuset_show() * - Print tasks cpuset path into seq_file. * - Used for /proc/<pid>/cpuset. + * - No need to task_lock(tsk) on this tsk->cpuset reference, as it + * doesn't really matter if tsk->cpuset changes after we read it, + * and we take manage_sem, keeping attach_task() from changing it + * anyway. */ static int proc_cpuset_show(struct seq_file *m, void *v) @@ -1747,10 +1887,8 @@ static int proc_cpuset_show(struct seq_file *m, void *v) return -ENOMEM; tsk = m->private; - cpuset_down(&cpuset_sem); - task_lock(tsk); + down(&manage_sem); cs = tsk->cpuset; - task_unlock(tsk); if (!cs) { retval = -EINVAL; goto out; @@ -1762,7 +1900,7 @@ static int proc_cpuset_show(struct seq_file *m, void *v) seq_puts(m, buf); seq_putc(m, '\n'); out: - cpuset_up(&cpuset_sem); + up(&manage_sem); kfree(buf); return retval; } diff --git a/kernel/exit.c b/kernel/exit.c index 79f52b85d6ed..537394b25e8d 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -547,7 +547,7 @@ static inline void reparent_thread(task_t *p, task_t *father, int traced) if (p->pdeath_signal) /* We already hold the tasklist_lock here. */ - group_send_sig_info(p->pdeath_signal, (void *) 0, p); + group_send_sig_info(p->pdeath_signal, SEND_SIG_NOINFO, p); /* Move the child from its dying parent to the new one. */ if (unlikely(traced)) { @@ -591,8 +591,8 @@ static inline void reparent_thread(task_t *p, task_t *father, int traced) int pgrp = process_group(p); if (will_become_orphaned_pgrp(pgrp, NULL) && has_stopped_jobs(pgrp)) { - __kill_pg_info(SIGHUP, (void *)1, pgrp); - __kill_pg_info(SIGCONT, (void *)1, pgrp); + __kill_pg_info(SIGHUP, SEND_SIG_PRIV, pgrp); + __kill_pg_info(SIGCONT, SEND_SIG_PRIV, pgrp); } } } @@ -727,8 +727,8 @@ static void exit_notify(struct task_struct *tsk) (t->signal->session == tsk->signal->session) && will_become_orphaned_pgrp(process_group(tsk), tsk) && has_stopped_jobs(process_group(tsk))) { - __kill_pg_info(SIGHUP, (void *)1, process_group(tsk)); - __kill_pg_info(SIGCONT, (void *)1, process_group(tsk)); + __kill_pg_info(SIGHUP, SEND_SIG_PRIV, process_group(tsk)); + __kill_pg_info(SIGCONT, SEND_SIG_PRIV, process_group(tsk)); } /* Let father know we died @@ -783,10 +783,6 @@ static void exit_notify(struct task_struct *tsk) /* If the process is dead, release it - nobody will wait for it */ if (state == EXIT_DEAD) release_task(tsk); - - /* PF_DEAD causes final put_task_struct after we schedule. */ - preempt_disable(); - tsk->flags |= PF_DEAD; } fastcall NORET_TYPE void do_exit(long code) @@ -873,7 +869,11 @@ fastcall NORET_TYPE void do_exit(long code) tsk->mempolicy = NULL; #endif - BUG_ON(!(current->flags & PF_DEAD)); + /* PF_DEAD causes final put_task_struct after we schedule. */ + preempt_disable(); + BUG_ON(tsk->flags & PF_DEAD); + tsk->flags |= PF_DEAD; + schedule(); BUG(); /* Avoid "noreturn function does return". */ @@ -1383,6 +1383,15 @@ repeat: switch (p->state) { case TASK_TRACED: + /* + * When we hit the race with PTRACE_ATTACH, + * we will not report this child. But the + * race means it has not yet been moved to + * our ptrace_children list, so we need to + * set the flag here to avoid a spurious ECHILD + * when the race happens with the only child. + */ + flag = 1; if (!my_ptrace_child(p)) continue; /*FALLTHROUGH*/ diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 3ff7b925c387..51df337b37db 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -117,14 +117,16 @@ fastcall unsigned int __do_IRQ(unsigned int irq, struct pt_regs *regs) /* * No locking required for CPU-local interrupts: */ - desc->handler->ack(irq); + if (desc->handler->ack) + desc->handler->ack(irq); action_ret = handle_IRQ_event(irq, regs, desc->action); desc->handler->end(irq); return 1; } spin_lock(&desc->lock); - desc->handler->ack(irq); + if (desc->handler->ack) + desc->handler->ack(irq); /* * REPLAY is when Linux resends an IRQ that was dropped earlier * WAITING is used by probe to mark irqs that are being tested diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 13bcec151b57..39277dd6bf90 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -18,6 +18,7 @@ #include <linux/fs.h> #include <linux/err.h> #include <linux/proc_fs.h> +#include <linux/sched.h> /* for cond_resched */ #include <linux/mm.h> #include <asm/sections.h> diff --git a/kernel/kmod.c b/kernel/kmod.c index 44166e3bb8af..51a892063aaa 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -131,14 +131,14 @@ struct subprocess_info { static int ____call_usermodehelper(void *data) { struct subprocess_info *sub_info = data; - struct key *old_session; + struct key *new_session, *old_session; int retval; /* Unblock all signals and set the session keyring. */ - key_get(sub_info->ring); + new_session = key_get(sub_info->ring); flush_signals(current); spin_lock_irq(¤t->sighand->siglock); - old_session = __install_session_keyring(current, sub_info->ring); + old_session = __install_session_keyring(current, new_session); flush_signal_handlers(current, 1); sigemptyset(¤t->blocked); recalc_sigpending(); diff --git a/kernel/kprobes.c b/kernel/kprobes.c index f3ea492ab44d..ce4915dd683a 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -35,6 +35,7 @@ #include <linux/spinlock.h> #include <linux/hash.h> #include <linux/init.h> +#include <linux/slab.h> #include <linux/module.h> #include <linux/moduleloader.h> #include <asm-generic/sections.h> diff --git a/kernel/kthread.c b/kernel/kthread.c index f50f174e92da..e75950a1092c 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -165,6 +165,12 @@ EXPORT_SYMBOL(kthread_bind); int kthread_stop(struct task_struct *k) { + return kthread_stop_sem(k, NULL); +} +EXPORT_SYMBOL(kthread_stop); + +int kthread_stop_sem(struct task_struct *k, struct semaphore *s) +{ int ret; down(&kthread_stop_lock); @@ -178,7 +184,10 @@ int kthread_stop(struct task_struct *k) /* Now set kthread_should_stop() to true, and wake it up. */ kthread_stop_info.k = k; - wake_up_process(k); + if (s) + up(s); + else + wake_up_process(k); put_task_struct(k); /* Once it dies, reset stop ptr, gather result and we're done. */ @@ -189,7 +198,7 @@ int kthread_stop(struct task_struct *k) return ret; } -EXPORT_SYMBOL(kthread_stop); +EXPORT_SYMBOL(kthread_stop_sem); static __init int helper_init(void) { diff --git a/kernel/params.c b/kernel/params.c index 1a8614bac5d5..47ba69547945 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -23,6 +23,7 @@ #include <linux/module.h> #include <linux/device.h> #include <linux/err.h> +#include <linux/slab.h> #if 0 #define DEBUGP printk diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index bf374fceb39c..91a894264941 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -1225,7 +1225,7 @@ void posix_cpu_timer_schedule(struct k_itimer *timer) /* * The task was cleaned up already, no future firings. */ - return; + goto out; /* * Fetch the current sample and update the timer's expiry time. @@ -1235,7 +1235,7 @@ void posix_cpu_timer_schedule(struct k_itimer *timer) bump_cpu_timer(timer, now); if (unlikely(p->exit_state)) { clear_dead_task(timer, now); - return; + goto out; } read_lock(&tasklist_lock); /* arm_timer needs it. */ } else { @@ -1248,8 +1248,7 @@ void posix_cpu_timer_schedule(struct k_itimer *timer) put_task_struct(p); timer->it.cpu.task = p = NULL; timer->it.cpu.expires.sched = 0; - read_unlock(&tasklist_lock); - return; + goto out_unlock; } else if (unlikely(p->exit_state) && thread_group_empty(p)) { /* * We've noticed that the thread is dead, but @@ -1257,8 +1256,7 @@ void posix_cpu_timer_schedule(struct k_itimer *timer) * drop our task ref. */ clear_dead_task(timer, now); - read_unlock(&tasklist_lock); - return; + goto out_unlock; } cpu_clock_sample_group(timer->it_clock, p, &now); bump_cpu_timer(timer, now); @@ -1270,7 +1268,13 @@ void posix_cpu_timer_schedule(struct k_itimer *timer) */ arm_timer(timer, now); +out_unlock: read_unlock(&tasklist_lock); + +out: + timer->it_overrun_last = timer->it_overrun; + timer->it_overrun = -1; + ++timer->it_requeue_pending; } /* diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index dda3cda73c77..ea55c7a1cd75 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -1295,13 +1295,6 @@ sys_clock_getres(clockid_t which_clock, struct timespec __user *tp) return error; } -static void nanosleep_wake_up(unsigned long __data) -{ - struct task_struct *p = (struct task_struct *) __data; - - wake_up_process(p); -} - /* * The standard says that an absolute nanosleep call MUST wake up at * the requested time in spite of clock settings. Here is what we do: @@ -1442,7 +1435,6 @@ static int common_nsleep(clockid_t which_clock, int flags, struct timespec *tsave) { struct timespec t, dum; - struct timer_list new_timer; DECLARE_WAITQUEUE(abs_wqueue, current); u64 rq_time = (u64)0; s64 left; @@ -1451,10 +1443,6 @@ static int common_nsleep(clockid_t which_clock, ¤t_thread_info()->restart_block; abs_wqueue.flags = 0; - init_timer(&new_timer); - new_timer.expires = 0; - new_timer.data = (unsigned long) current; - new_timer.function = nanosleep_wake_up; abs = flags & TIMER_ABSTIME; if (restart_block->fn == clock_nanosleep_restart) { @@ -1490,13 +1478,8 @@ static int common_nsleep(clockid_t which_clock, if (left < (s64)0) break; - new_timer.expires = jiffies + left; - __set_current_state(TASK_INTERRUPTIBLE); - add_timer(&new_timer); - - schedule(); + schedule_timeout_interruptible(left); - del_timer_sync(&new_timer); left = rq_time - get_jiffies_64(); } while (left > (s64)0 && !test_thread_flag(TIF_SIGPENDING)); diff --git a/kernel/power/Makefile b/kernel/power/Makefile index 2f438d0eaa13..c71eb4579c07 100644 --- a/kernel/power/Makefile +++ b/kernel/power/Makefile @@ -4,7 +4,7 @@ EXTRA_CFLAGS += -DDEBUG endif obj-y := main.o process.o console.o pm.o -obj-$(CONFIG_SOFTWARE_SUSPEND) += swsusp.o disk.o +obj-$(CONFIG_SOFTWARE_SUSPEND) += swsusp.o disk.o snapshot.o obj-$(CONFIG_SUSPEND_SMP) += smp.o diff --git a/kernel/power/disk.c b/kernel/power/disk.c index 761956e813f5..027322a564f4 100644 --- a/kernel/power/disk.c +++ b/kernel/power/disk.c @@ -30,7 +30,6 @@ extern int swsusp_check(void); extern int swsusp_read(void); extern void swsusp_close(void); extern int swsusp_resume(void); -extern int swsusp_free(void); static int noresume = 0; @@ -93,10 +92,7 @@ static void free_some_memory(void) printk("Freeing memory... "); while ((tmp = shrink_all_memory(10000))) { pages += tmp; - printk("\b%c", p[i]); - i++; - if (i > 3) - i = 0; + printk("\b%c", p[i++ % 4]); } printk("\bdone (%li pages freed)\n", pages); } @@ -178,13 +174,12 @@ int pm_suspend_disk(void) goto Done; if (in_suspend) { + device_resume(); pr_debug("PM: writing image.\n"); error = swsusp_write(); if (!error) power_down(pm_disk_mode); else { - /* swsusp_write can not fail in device_resume, - no need to do second device_resume */ swsusp_free(); unprepare_processes(); return error; @@ -252,14 +247,17 @@ static int software_resume(void) pr_debug("PM: Reading swsusp image.\n"); - if ((error = swsusp_read())) - goto Cleanup; + if ((error = swsusp_read())) { + swsusp_free(); + goto Thaw; + } pr_debug("PM: Preparing devices for restore.\n"); if ((error = device_suspend(PMSG_FREEZE))) { printk("Some devices failed to suspend\n"); - goto Free; + swsusp_free(); + goto Thaw; } mb(); @@ -268,9 +266,7 @@ static int software_resume(void) swsusp_resume(); pr_debug("PM: Restore failed, recovering.n"); device_resume(); - Free: - swsusp_free(); - Cleanup: + Thaw: unprepare_processes(); Done: /* For success case, the suspend path will release the lock */ diff --git a/kernel/power/main.c b/kernel/power/main.c index 22bdc93cc038..18d7d693fbba 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -167,6 +167,8 @@ static int enter_state(suspend_state_t state) { int error; + if (pm_ops->valid && !pm_ops->valid(state)) + return -ENODEV; if (down_trylock(&pm_sem)) return -EBUSY; @@ -236,7 +238,8 @@ static ssize_t state_show(struct subsystem * subsys, char * buf) char * s = buf; for (i = 0; i < PM_SUSPEND_MAX; i++) { - if (pm_states[i]) + if (pm_states[i] && pm_ops && (!pm_ops->valid + ||(pm_ops->valid && pm_ops->valid(i)))) s += sprintf(s,"%s ",pm_states[i]); } s += sprintf(s,"\n"); diff --git a/kernel/power/power.h b/kernel/power/power.h index 6748de23e83c..d4fd96a135ab 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -53,3 +53,20 @@ extern void thaw_processes(void); extern int pm_prepare_console(void); extern void pm_restore_console(void); + + +/* References to section boundaries */ +extern const void __nosave_begin, __nosave_end; + +extern unsigned int nr_copy_pages; +extern suspend_pagedir_t *pagedir_nosave; +extern suspend_pagedir_t *pagedir_save; + +extern asmlinkage int swsusp_arch_suspend(void); +extern asmlinkage int swsusp_arch_resume(void); + +extern int restore_highmem(void); +extern struct pbe * alloc_pagedir(unsigned nr_pages); +extern void create_pbe_list(struct pbe *pblist, unsigned nr_pages); +extern void swsusp_free(void); +extern int enough_swap(unsigned nr_pages); diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c new file mode 100644 index 000000000000..42a628704398 --- /dev/null +++ b/kernel/power/snapshot.c @@ -0,0 +1,435 @@ +/* + * linux/kernel/power/snapshot.c + * + * This file provide system snapshot/restore functionality. + * + * Copyright (C) 1998-2005 Pavel Machek <pavel@suse.cz> + * + * This file is released under the GPLv2, and is based on swsusp.c. + * + */ + + +#include <linux/module.h> +#include <linux/mm.h> +#include <linux/suspend.h> +#include <linux/smp_lock.h> +#include <linux/delay.h> +#include <linux/bitops.h> +#include <linux/spinlock.h> +#include <linux/kernel.h> +#include <linux/pm.h> +#include <linux/device.h> +#include <linux/bootmem.h> +#include <linux/syscalls.h> +#include <linux/console.h> +#include <linux/highmem.h> + +#include <asm/uaccess.h> +#include <asm/mmu_context.h> +#include <asm/pgtable.h> +#include <asm/tlbflush.h> +#include <asm/io.h> + +#include "power.h" + +#ifdef CONFIG_HIGHMEM +struct highmem_page { + char *data; + struct page *page; + struct highmem_page *next; +}; + +static struct highmem_page *highmem_copy; + +static int save_highmem_zone(struct zone *zone) +{ + unsigned long zone_pfn; + mark_free_pages(zone); + for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) { + struct page *page; + struct highmem_page *save; + void *kaddr; + unsigned long pfn = zone_pfn + zone->zone_start_pfn; + + if (!(pfn%1000)) + printk("."); + if (!pfn_valid(pfn)) + continue; + page = pfn_to_page(pfn); + /* + * This condition results from rvmalloc() sans vmalloc_32() + * and architectural memory reservations. This should be + * corrected eventually when the cases giving rise to this + * are better understood. + */ + if (PageReserved(page)) { + printk("highmem reserved page?!\n"); + continue; + } + BUG_ON(PageNosave(page)); + if (PageNosaveFree(page)) + continue; + save = kmalloc(sizeof(struct highmem_page), GFP_ATOMIC); + if (!save) + return -ENOMEM; + save->next = highmem_copy; + save->page = page; + save->data = (void *) get_zeroed_page(GFP_ATOMIC); + if (!save->data) { + kfree(save); + return -ENOMEM; + } + kaddr = kmap_atomic(page, KM_USER0); + memcpy(save->data, kaddr, PAGE_SIZE); + kunmap_atomic(kaddr, KM_USER0); + highmem_copy = save; + } + return 0; +} + + +static int save_highmem(void) +{ + struct zone *zone; + int res = 0; + + pr_debug("swsusp: Saving Highmem\n"); + for_each_zone (zone) { + if (is_highmem(zone)) + res = save_highmem_zone(zone); + if (res) + return res; + } + return 0; +} + +int restore_highmem(void) +{ + printk("swsusp: Restoring Highmem\n"); + while (highmem_copy) { + struct highmem_page *save = highmem_copy; + void *kaddr; + highmem_copy = save->next; + + kaddr = kmap_atomic(save->page, KM_USER0); + memcpy(kaddr, save->data, PAGE_SIZE); + kunmap_atomic(kaddr, KM_USER0); + free_page((long) save->data); + kfree(save); + } + return 0; +} +#else +static int save_highmem(void) { return 0; } +int restore_highmem(void) { return 0; } +#endif /* CONFIG_HIGHMEM */ + + +static int pfn_is_nosave(unsigned long pfn) +{ + unsigned long nosave_begin_pfn = __pa(&__nosave_begin) >> PAGE_SHIFT; + unsigned long nosave_end_pfn = PAGE_ALIGN(__pa(&__nosave_end)) >> PAGE_SHIFT; + return (pfn >= nosave_begin_pfn) && (pfn < nosave_end_pfn); +} + +/** + * saveable - Determine whether a page should be cloned or not. + * @pfn: The page + * + * We save a page if it's Reserved, and not in the range of pages + * statically defined as 'unsaveable', or if it isn't reserved, and + * isn't part of a free chunk of pages. + */ + +static int saveable(struct zone *zone, unsigned long *zone_pfn) +{ + unsigned long pfn = *zone_pfn + zone->zone_start_pfn; + struct page *page; + + if (!pfn_valid(pfn)) + return 0; + + page = pfn_to_page(pfn); + BUG_ON(PageReserved(page) && PageNosave(page)); + if (PageNosave(page)) + return 0; + if (PageReserved(page) && pfn_is_nosave(pfn)) { + pr_debug("[nosave pfn 0x%lx]", pfn); + return 0; + } + if (PageNosaveFree(page)) + return 0; + + return 1; +} + +static unsigned count_data_pages(void) +{ + struct zone *zone; + unsigned long zone_pfn; + unsigned n; + + n = 0; + for_each_zone (zone) { + if (is_highmem(zone)) + continue; + mark_free_pages(zone); + for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) + n += saveable(zone, &zone_pfn); + } + return n; +} + +static void copy_data_pages(struct pbe *pblist) +{ + struct zone *zone; + unsigned long zone_pfn; + struct pbe *pbe, *p; + + pbe = pblist; + for_each_zone (zone) { + if (is_highmem(zone)) + continue; + mark_free_pages(zone); + /* This is necessary for swsusp_free() */ + for_each_pb_page (p, pblist) + SetPageNosaveFree(virt_to_page(p)); + for_each_pbe (p, pblist) + SetPageNosaveFree(virt_to_page(p->address)); + for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) { + if (saveable(zone, &zone_pfn)) { + struct page *page; + page = pfn_to_page(zone_pfn + zone->zone_start_pfn); + BUG_ON(!pbe); + pbe->orig_address = (unsigned long)page_address(page); + /* copy_page is not usable for copying task structs. */ + memcpy((void *)pbe->address, (void *)pbe->orig_address, PAGE_SIZE); + pbe = pbe->next; + } + } + } + BUG_ON(pbe); +} + + +/** + * free_pagedir - free pages allocated with alloc_pagedir() + */ + +static void free_pagedir(struct pbe *pblist) +{ + struct pbe *pbe; + + while (pblist) { + pbe = (pblist + PB_PAGE_SKIP)->next; + ClearPageNosave(virt_to_page(pblist)); + ClearPageNosaveFree(virt_to_page(pblist)); + free_page((unsigned long)pblist); + pblist = pbe; + } +} + +/** + * fill_pb_page - Create a list of PBEs on a given memory page + */ + +static inline void fill_pb_page(struct pbe *pbpage) +{ + struct pbe *p; + + p = pbpage; + pbpage += PB_PAGE_SKIP; + do + p->next = p + 1; + while (++p < pbpage); +} + +/** + * create_pbe_list - Create a list of PBEs on top of a given chain + * of memory pages allocated with alloc_pagedir() + */ + +void create_pbe_list(struct pbe *pblist, unsigned nr_pages) +{ + struct pbe *pbpage, *p; + unsigned num = PBES_PER_PAGE; + + for_each_pb_page (pbpage, pblist) { + if (num >= nr_pages) + break; + + fill_pb_page(pbpage); + num += PBES_PER_PAGE; + } + if (pbpage) { + for (num -= PBES_PER_PAGE - 1, p = pbpage; num < nr_pages; p++, num++) + p->next = p + 1; + p->next = NULL; + } + pr_debug("create_pbe_list(): initialized %d PBEs\n", num); +} + +static void *alloc_image_page(void) +{ + void *res = (void *)get_zeroed_page(GFP_ATOMIC | __GFP_COLD); + if (res) { + SetPageNosave(virt_to_page(res)); + SetPageNosaveFree(virt_to_page(res)); + } + return res; +} + +/** + * alloc_pagedir - Allocate the page directory. + * + * First, determine exactly how many pages we need and + * allocate them. + * + * We arrange the pages in a chain: each page is an array of PBES_PER_PAGE + * struct pbe elements (pbes) and the last element in the page points + * to the next page. + * + * On each page we set up a list of struct_pbe elements. + */ + +struct pbe *alloc_pagedir(unsigned nr_pages) +{ + unsigned num; + struct pbe *pblist, *pbe; + + if (!nr_pages) + return NULL; + + pr_debug("alloc_pagedir(): nr_pages = %d\n", nr_pages); + pblist = alloc_image_page(); + /* FIXME: rewrite this ugly loop */ + for (pbe = pblist, num = PBES_PER_PAGE; pbe && num < nr_pages; + pbe = pbe->next, num += PBES_PER_PAGE) { + pbe += PB_PAGE_SKIP; + pbe->next = alloc_image_page(); + } + if (!pbe) { /* get_zeroed_page() failed */ + free_pagedir(pblist); + pblist = NULL; + } + return pblist; +} + +/** + * Free pages we allocated for suspend. Suspend pages are alocated + * before atomic copy, so we need to free them after resume. + */ + +void swsusp_free(void) +{ + struct zone *zone; + unsigned long zone_pfn; + + for_each_zone(zone) { + for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) + if (pfn_valid(zone_pfn + zone->zone_start_pfn)) { + struct page * page; + page = pfn_to_page(zone_pfn + zone->zone_start_pfn); + if (PageNosave(page) && PageNosaveFree(page)) { + ClearPageNosave(page); + ClearPageNosaveFree(page); + free_page((long) page_address(page)); + } + } + } +} + + +/** + * enough_free_mem - Make sure we enough free memory to snapshot. + * + * Returns TRUE or FALSE after checking the number of available + * free pages. + */ + +static int enough_free_mem(unsigned nr_pages) +{ + pr_debug("swsusp: available memory: %u pages\n", nr_free_pages()); + return nr_free_pages() > (nr_pages + PAGES_FOR_IO + + (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE); +} + + +static struct pbe *swsusp_alloc(unsigned nr_pages) +{ + struct pbe *pblist, *p; + + if (!(pblist = alloc_pagedir(nr_pages))) { + printk(KERN_ERR "suspend: Allocating pagedir failed.\n"); + return NULL; + } + create_pbe_list(pblist, nr_pages); + + for_each_pbe (p, pblist) { + p->address = (unsigned long)alloc_image_page(); + if (!p->address) { + printk(KERN_ERR "suspend: Allocating image pages failed.\n"); + swsusp_free(); + return NULL; + } + } + + return pblist; +} + +asmlinkage int swsusp_save(void) +{ + unsigned nr_pages; + + pr_debug("swsusp: critical section: \n"); + if (save_highmem()) { + printk(KERN_CRIT "swsusp: Not enough free pages for highmem\n"); + restore_highmem(); + return -ENOMEM; + } + + drain_local_pages(); + nr_pages = count_data_pages(); + printk("swsusp: Need to copy %u pages\n", nr_pages); + + pr_debug("swsusp: pages needed: %u + %lu + %u, free: %u\n", + nr_pages, + (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE, + PAGES_FOR_IO, nr_free_pages()); + + /* This is needed because of the fixed size of swsusp_info */ + if (MAX_PBES < (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE) + return -ENOSPC; + + if (!enough_free_mem(nr_pages)) { + printk(KERN_ERR "swsusp: Not enough free memory\n"); + return -ENOMEM; + } + + if (!enough_swap(nr_pages)) { + printk(KERN_ERR "swsusp: Not enough free swap\n"); + return -ENOSPC; + } + + pagedir_nosave = swsusp_alloc(nr_pages); + if (!pagedir_nosave) + return -ENOMEM; + + /* During allocating of suspend pagedir, new cold pages may appear. + * Kill them. + */ + drain_local_pages(); + copy_data_pages(pagedir_nosave); + + /* + * End of critical section. From now on, we can write to memory, + * but we should not touch disk. This specially means we must _not_ + * touch swap space! Except we must write out our image of course. + */ + + nr_copy_pages = nr_pages; + + printk("swsusp: critical section/: done (%d pages copied)\n", nr_pages); + return 0; +} diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c index 016504ccfccf..12db1d2ad61f 100644 --- a/kernel/power/swsusp.c +++ b/kernel/power/swsusp.c @@ -1,11 +1,10 @@ /* * linux/kernel/power/swsusp.c * - * This file is to realize architecture-independent - * machine suspend feature using pretty near only high-level routines + * This file provides code to write suspend image to swap and read it back. * * Copyright (C) 1998-2001 Gabor Kuti <seasons@fornax.hu> - * Copyright (C) 1998,2001-2004 Pavel Machek <pavel@suse.cz> + * Copyright (C) 1998,2001-2005 Pavel Machek <pavel@suse.cz> * * This file is released under the GPLv2. * @@ -47,11 +46,7 @@ #include <linux/utsname.h> #include <linux/version.h> #include <linux/delay.h> -#include <linux/reboot.h> #include <linux/bitops.h> -#include <linux/vt_kern.h> -#include <linux/kbd_kern.h> -#include <linux/keyboard.h> #include <linux/spinlock.h> #include <linux/genhd.h> #include <linux/kernel.h> @@ -63,10 +58,8 @@ #include <linux/swapops.h> #include <linux/bootmem.h> #include <linux/syscalls.h> -#include <linux/console.h> #include <linux/highmem.h> #include <linux/bio.h> -#include <linux/mount.h> #include <asm/uaccess.h> #include <asm/mmu_context.h> @@ -84,16 +77,10 @@ #define MAXKEY 32 #define MAXIV 32 -/* References to section boundaries */ -extern const void __nosave_begin, __nosave_end; - -/* Variables to be preserved over suspend */ -static int nr_copy_pages_check; - extern char resume_file[]; /* Local variables that should not be affected by save */ -static unsigned int nr_copy_pages __nosavedata = 0; +unsigned int nr_copy_pages __nosavedata = 0; /* Suspend pagedir is allocated before final copy, therefore it must be freed after resume @@ -109,7 +96,7 @@ static unsigned int nr_copy_pages __nosavedata = 0; MMU hardware. */ suspend_pagedir_t *pagedir_nosave __nosavedata = NULL; -static suspend_pagedir_t *pagedir_save; +suspend_pagedir_t *pagedir_save; #define SWSUSP_SIG "S1SUSPEND" @@ -124,12 +111,6 @@ static struct swsusp_header { static struct swsusp_info swsusp_info; /* - * XXX: We try to keep some more pages free so that I/O operations succeed - * without paging. Might this be more? - */ -#define PAGES_FOR_IO 512 - -/* * Saving part... */ @@ -552,353 +533,6 @@ static int write_suspend_image(void) goto Done; } - -#ifdef CONFIG_HIGHMEM -struct highmem_page { - char *data; - struct page *page; - struct highmem_page *next; -}; - -static struct highmem_page *highmem_copy; - -static int save_highmem_zone(struct zone *zone) -{ - unsigned long zone_pfn; - mark_free_pages(zone); - for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) { - struct page *page; - struct highmem_page *save; - void *kaddr; - unsigned long pfn = zone_pfn + zone->zone_start_pfn; - - if (!(pfn%1000)) - printk("."); - if (!pfn_valid(pfn)) - continue; - page = pfn_to_page(pfn); - /* - * PageReserved results from rvmalloc() sans vmalloc_32() - * and architectural memory reservations. - * - * rvmalloc should not cause this, because all implementations - * appear to always be using vmalloc_32 on architectures with - * highmem. This is a good thing, because we would like to save - * rvmalloc pages. - * - * It appears to be triggered by pages which do not point to - * valid memory (see arch/i386/mm/init.c:one_highpage_init(), - * which sets PageReserved if the page does not point to valid - * RAM. - * - * XXX: must remove usage of PageReserved! - */ - if (PageReserved(page)) - continue; - BUG_ON(PageNosave(page)); - if (PageNosaveFree(page)) - continue; - save = kmalloc(sizeof(struct highmem_page), GFP_ATOMIC); - if (!save) - return -ENOMEM; - save->next = highmem_copy; - save->page = page; - save->data = (void *) get_zeroed_page(GFP_ATOMIC); - if (!save->data) { - kfree(save); - return -ENOMEM; - } - kaddr = kmap_atomic(page, KM_USER0); - memcpy(save->data, kaddr, PAGE_SIZE); - kunmap_atomic(kaddr, KM_USER0); - highmem_copy = save; - } - return 0; -} -#endif /* CONFIG_HIGHMEM */ - - -static int save_highmem(void) -{ -#ifdef CONFIG_HIGHMEM - struct zone *zone; - int res = 0; - - pr_debug("swsusp: Saving Highmem\n"); - for_each_zone (zone) { - if (is_highmem(zone)) - res = save_highmem_zone(zone); - if (res) - return res; - } -#endif - return 0; -} - -static int restore_highmem(void) -{ -#ifdef CONFIG_HIGHMEM - printk("swsusp: Restoring Highmem\n"); - while (highmem_copy) { - struct highmem_page *save = highmem_copy; - void *kaddr; - highmem_copy = save->next; - - kaddr = kmap_atomic(save->page, KM_USER0); - memcpy(kaddr, save->data, PAGE_SIZE); - kunmap_atomic(kaddr, KM_USER0); - free_page((long) save->data); - kfree(save); - } -#endif - return 0; -} - - -static int pfn_is_nosave(unsigned long pfn) -{ - unsigned long nosave_begin_pfn = __pa(&__nosave_begin) >> PAGE_SHIFT; - unsigned long nosave_end_pfn = PAGE_ALIGN(__pa(&__nosave_end)) >> PAGE_SHIFT; - return (pfn >= nosave_begin_pfn) && (pfn < nosave_end_pfn); -} - -/** - * saveable - Determine whether a page should be cloned or not. - * @pfn: The page - * - * We save a page if it's Reserved, and not in the range of pages - * statically defined as 'unsaveable', or if it isn't reserved, and - * isn't part of a free chunk of pages. - */ - -static int saveable(struct zone * zone, unsigned long * zone_pfn) -{ - unsigned long pfn = *zone_pfn + zone->zone_start_pfn; - struct page * page; - - if (!pfn_valid(pfn)) - return 0; - - page = pfn_to_page(pfn); - if (PageNosave(page)) - return 0; - if (pfn_is_nosave(pfn)) { - pr_debug("[nosave pfn 0x%lx]", pfn); - return 0; - } - if (PageNosaveFree(page)) - return 0; - - return 1; -} - -static void count_data_pages(void) -{ - struct zone *zone; - unsigned long zone_pfn; - - nr_copy_pages = 0; - - for_each_zone (zone) { - if (is_highmem(zone)) - continue; - mark_free_pages(zone); - for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) - nr_copy_pages += saveable(zone, &zone_pfn); - } -} - - -static void copy_data_pages(void) -{ - struct zone *zone; - unsigned long zone_pfn; - struct pbe * pbe = pagedir_nosave; - - pr_debug("copy_data_pages(): pages to copy: %d\n", nr_copy_pages); - for_each_zone (zone) { - if (is_highmem(zone)) - continue; - mark_free_pages(zone); - for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) { - if (saveable(zone, &zone_pfn)) { - struct page * page; - page = pfn_to_page(zone_pfn + zone->zone_start_pfn); - BUG_ON(!pbe); - pbe->orig_address = (long) page_address(page); - /* copy_page is not usable for copying task structs. */ - memcpy((void *)pbe->address, (void *)pbe->orig_address, PAGE_SIZE); - pbe = pbe->next; - } - } - } - BUG_ON(pbe); -} - - -/** - * calc_nr - Determine the number of pages needed for a pbe list. - */ - -static int calc_nr(int nr_copy) -{ - return nr_copy + (nr_copy+PBES_PER_PAGE-2)/(PBES_PER_PAGE-1); -} - -/** - * free_pagedir - free pages allocated with alloc_pagedir() - */ - -static inline void free_pagedir(struct pbe *pblist) -{ - struct pbe *pbe; - - while (pblist) { - pbe = (pblist + PB_PAGE_SKIP)->next; - free_page((unsigned long)pblist); - pblist = pbe; - } -} - -/** - * fill_pb_page - Create a list of PBEs on a given memory page - */ - -static inline void fill_pb_page(struct pbe *pbpage) -{ - struct pbe *p; - - p = pbpage; - pbpage += PB_PAGE_SKIP; - do - p->next = p + 1; - while (++p < pbpage); -} - -/** - * create_pbe_list - Create a list of PBEs on top of a given chain - * of memory pages allocated with alloc_pagedir() - */ - -static void create_pbe_list(struct pbe *pblist, unsigned nr_pages) -{ - struct pbe *pbpage, *p; - unsigned num = PBES_PER_PAGE; - - for_each_pb_page (pbpage, pblist) { - if (num >= nr_pages) - break; - - fill_pb_page(pbpage); - num += PBES_PER_PAGE; - } - if (pbpage) { - for (num -= PBES_PER_PAGE - 1, p = pbpage; num < nr_pages; p++, num++) - p->next = p + 1; - p->next = NULL; - } - pr_debug("create_pbe_list(): initialized %d PBEs\n", num); -} - -/** - * alloc_pagedir - Allocate the page directory. - * - * First, determine exactly how many pages we need and - * allocate them. - * - * We arrange the pages in a chain: each page is an array of PBES_PER_PAGE - * struct pbe elements (pbes) and the last element in the page points - * to the next page. - * - * On each page we set up a list of struct_pbe elements. - */ - -static struct pbe * alloc_pagedir(unsigned nr_pages) -{ - unsigned num; - struct pbe *pblist, *pbe; - - if (!nr_pages) - return NULL; - - pr_debug("alloc_pagedir(): nr_pages = %d\n", nr_pages); - pblist = (struct pbe *)get_zeroed_page(GFP_ATOMIC | __GFP_COLD); - for (pbe = pblist, num = PBES_PER_PAGE; pbe && num < nr_pages; - pbe = pbe->next, num += PBES_PER_PAGE) { - pbe += PB_PAGE_SKIP; - pbe->next = (struct pbe *)get_zeroed_page(GFP_ATOMIC | __GFP_COLD); - } - if (!pbe) { /* get_zeroed_page() failed */ - free_pagedir(pblist); - pblist = NULL; - } - return pblist; -} - -/** - * free_image_pages - Free pages allocated for snapshot - */ - -static void free_image_pages(void) -{ - struct pbe * p; - - for_each_pbe (p, pagedir_save) { - if (p->address) { - ClearPageNosave(virt_to_page(p->address)); - free_page(p->address); - p->address = 0; - } - } -} - -/** - * alloc_image_pages - Allocate pages for the snapshot. - */ - -static int alloc_image_pages(void) -{ - struct pbe * p; - - for_each_pbe (p, pagedir_save) { - p->address = get_zeroed_page(GFP_ATOMIC | __GFP_COLD); - if (!p->address) - return -ENOMEM; - SetPageNosave(virt_to_page(p->address)); - } - return 0; -} - -/* Free pages we allocated for suspend. Suspend pages are alocated - * before atomic copy, so we need to free them after resume. - */ -void swsusp_free(void) -{ - BUG_ON(PageNosave(virt_to_page(pagedir_save))); - BUG_ON(PageNosaveFree(virt_to_page(pagedir_save))); - free_image_pages(); - free_pagedir(pagedir_save); -} - - -/** - * enough_free_mem - Make sure we enough free memory to snapshot. - * - * Returns TRUE or FALSE after checking the number of available - * free pages. - */ - -static int enough_free_mem(void) -{ - if (nr_free_pages() < (nr_copy_pages + PAGES_FOR_IO)) { - pr_debug("swsusp: Not enough free pages: Have %d\n", - nr_free_pages()); - return 0; - } - return 1; -} - - /** * enough_swap - Make sure we have enough swap to save the image. * @@ -909,87 +543,14 @@ static int enough_free_mem(void) * We should only consider resume_device. */ -static int enough_swap(void) +int enough_swap(unsigned nr_pages) { struct sysinfo i; si_swapinfo(&i); - if (i.freeswap < (nr_copy_pages + PAGES_FOR_IO)) { - pr_debug("swsusp: Not enough swap. Need %ld\n",i.freeswap); - return 0; - } - return 1; -} - -static int swsusp_alloc(void) -{ - int error; - - pagedir_nosave = NULL; - nr_copy_pages = calc_nr(nr_copy_pages); - nr_copy_pages_check = nr_copy_pages; - - pr_debug("suspend: (pages needed: %d + %d free: %d)\n", - nr_copy_pages, PAGES_FOR_IO, nr_free_pages()); - - if (!enough_free_mem()) - return -ENOMEM; - - if (!enough_swap()) - return -ENOSPC; - - if (MAX_PBES < nr_copy_pages / PBES_PER_PAGE + - !!(nr_copy_pages % PBES_PER_PAGE)) - return -ENOSPC; - - if (!(pagedir_save = alloc_pagedir(nr_copy_pages))) { - printk(KERN_ERR "suspend: Allocating pagedir failed.\n"); - return -ENOMEM; - } - create_pbe_list(pagedir_save, nr_copy_pages); - pagedir_nosave = pagedir_save; - if ((error = alloc_image_pages())) { - printk(KERN_ERR "suspend: Allocating image pages failed.\n"); - swsusp_free(); - return error; - } - - return 0; -} - -static int suspend_prepare_image(void) -{ - int error; - - pr_debug("swsusp: critical section: \n"); - if (save_highmem()) { - printk(KERN_CRIT "Suspend machine: Not enough free pages for highmem\n"); - restore_highmem(); - return -ENOMEM; - } - - drain_local_pages(); - count_data_pages(); - printk("swsusp: Need to copy %u pages\n", nr_copy_pages); - - error = swsusp_alloc(); - if (error) - return error; - - /* During allocating of suspend pagedir, new cold pages may appear. - * Kill them. - */ - drain_local_pages(); - copy_data_pages(); - - /* - * End of critical section. From now on, we can write to memory, - * but we should not touch disk. This specially means we must _not_ - * touch swap space! Except we must write out our image of course. - */ - - printk("swsusp: critical section/: done (%d pages copied)\n", nr_copy_pages ); - return 0; + pr_debug("swsusp: available swap: %lu pages\n", i.freeswap); + return i.freeswap > (nr_pages + PAGES_FOR_IO + + (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE); } @@ -1001,7 +562,7 @@ static int suspend_prepare_image(void) int swsusp_write(void) { int error; - device_resume(); + lock_swapdevices(); error = write_suspend_image(); /* This will unlock ignored swap devices since writing is finished */ @@ -1011,14 +572,6 @@ int swsusp_write(void) } -extern asmlinkage int swsusp_arch_suspend(void); -extern asmlinkage int swsusp_arch_resume(void); - - -asmlinkage int swsusp_save(void) -{ - return suspend_prepare_image(); -} int swsusp_suspend(void) { @@ -1050,7 +603,6 @@ int swsusp_suspend(void) printk(KERN_ERR "Error %d suspending\n", error); /* Restore control flow magically appears here */ restore_processor_state(); - BUG_ON (nr_copy_pages_check != nr_copy_pages); restore_highmem(); device_power_up(); local_irq_enable(); @@ -1070,6 +622,11 @@ int swsusp_resume(void) * execution continues at place where swsusp_arch_suspend was called */ BUG_ON(!error); + /* The only reason why swsusp_arch_resume() can fail is memory being + * very tight, so we have to free it as soon as we can to avoid + * subsequent failures + */ + swsusp_free(); restore_processor_state(); restore_highmem(); touch_softlockup_watchdog(); @@ -1085,54 +642,28 @@ int swsusp_resume(void) * * We don't know which pages are usable until we allocate them. * - * Allocated but unusable (ie eaten) memory pages are linked together - * to create a list, so that we can free them easily - * - * We could have used a type other than (void *) - * for this purpose, but ... + * Allocated but unusable (ie eaten) memory pages are marked so that + * swsusp_free() can release them */ -static void **eaten_memory = NULL; - -static inline void eat_page(void *page) -{ - void **c; - - c = eaten_memory; - eaten_memory = page; - *eaten_memory = c; -} -unsigned long get_usable_page(gfp_t gfp_mask) +unsigned long get_safe_page(gfp_t gfp_mask) { unsigned long m; - m = get_zeroed_page(gfp_mask); - while (!PageNosaveFree(virt_to_page(m))) { - eat_page((void *)m); + do { m = get_zeroed_page(gfp_mask); - if (!m) - break; + if (m && PageNosaveFree(virt_to_page(m))) + /* This is for swsusp_free() */ + SetPageNosave(virt_to_page(m)); + } while (m && PageNosaveFree(virt_to_page(m))); + if (m) { + /* This is for swsusp_free() */ + SetPageNosave(virt_to_page(m)); + SetPageNosaveFree(virt_to_page(m)); } return m; } -void free_eaten_memory(void) -{ - unsigned long m; - void **c; - int i = 0; - - c = eaten_memory; - while (c) { - m = (unsigned long)c; - c = *c; - free_page(m); - i++; - } - eaten_memory = NULL; - pr_debug("swsusp: %d unused pages freed\n", i); -} - /** * check_pagedir - We ensure here that pages that the PBEs point to * won't collide with pages where we're going to restore from the loaded @@ -1150,7 +681,7 @@ static int check_pagedir(struct pbe *pblist) p->address = 0UL; for_each_pbe (p, pblist) { - p->address = get_usable_page(GFP_ATOMIC); + p->address = get_safe_page(GFP_ATOMIC); if (!p->address) return -ENOMEM; } @@ -1169,7 +700,7 @@ static struct pbe * swsusp_pagedir_relocate(struct pbe *pblist) unsigned long zone_pfn; struct pbe *pbpage, *tail, *p; void *m; - int rel = 0, error = 0; + int rel = 0; if (!pblist) /* a sanity check */ return NULL; @@ -1177,41 +708,37 @@ static struct pbe * swsusp_pagedir_relocate(struct pbe *pblist) pr_debug("swsusp: Relocating pagedir (%lu pages to check)\n", swsusp_info.pagedir_pages); - /* Set page flags */ + /* Clear page flags */ for_each_zone (zone) { for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) - SetPageNosaveFree(pfn_to_page(zone_pfn + + if (pfn_valid(zone_pfn + zone->zone_start_pfn)) + ClearPageNosaveFree(pfn_to_page(zone_pfn + zone->zone_start_pfn)); } - /* Clear orig addresses */ + /* Mark orig addresses */ for_each_pbe (p, pblist) - ClearPageNosaveFree(virt_to_page(p->orig_address)); + SetPageNosaveFree(virt_to_page(p->orig_address)); tail = pblist + PB_PAGE_SKIP; /* Relocate colliding pages */ for_each_pb_page (pbpage, pblist) { - if (!PageNosaveFree(virt_to_page((unsigned long)pbpage))) { - m = (void *)get_usable_page(GFP_ATOMIC | __GFP_COLD); - if (!m) { - error = -ENOMEM; - break; - } + if (PageNosaveFree(virt_to_page((unsigned long)pbpage))) { + m = (void *)get_safe_page(GFP_ATOMIC | __GFP_COLD); + if (!m) + return NULL; memcpy(m, (void *)pbpage, PAGE_SIZE); if (pbpage == pblist) pblist = (struct pbe *)m; else tail->next = (struct pbe *)m; - - eat_page((void *)pbpage); pbpage = (struct pbe *)m; /* We have to link the PBEs again */ - for (p = pbpage; p < pbpage + PB_PAGE_SKIP; p++) if (p->next) /* needed to save the end */ p->next = p + 1; @@ -1221,15 +748,13 @@ static struct pbe * swsusp_pagedir_relocate(struct pbe *pblist) tail = pbpage + PB_PAGE_SKIP; } - if (error) { - printk("\nswsusp: Out of memory\n\n"); - free_pagedir(pblist); - free_eaten_memory(); - pblist = NULL; - /* Is this even worth handling? It should never ever happen, and we - have just lost user's state, anyway... */ - } else - printk("swsusp: Relocated %d pages\n", rel); + /* This is for swsusp_free() */ + for_each_pb_page (pbpage, pblist) { + SetPageNosave(virt_to_page(pbpage)); + SetPageNosaveFree(virt_to_page(pbpage)); + } + + printk("swsusp: Relocated %d pages\n", rel); return pblist; } @@ -1447,9 +972,7 @@ static int read_pagedir(struct pbe *pblist) break; } - if (error) - free_pagedir(pblist); - else + if (!error) BUG_ON(i != swsusp_info.pagedir_pages); return error; @@ -1492,15 +1015,6 @@ static int read_suspend_image(void) if (!error) error = data_read(pagedir_nosave); - if (error) { /* We fail cleanly */ - free_eaten_memory(); - for_each_pbe (p, pagedir_nosave) - if (p->address) { - free_page(p->address); - p->address = 0UL; - } - free_pagedir(pagedir_nosave); - } return error; } diff --git a/kernel/printk.c b/kernel/printk.c index 4b8f0f9230a4..3cb9708209bc 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -10,7 +10,7 @@ * elsewhere, in preparation for a serial line console (someday). * Ted Ts'o, 2/11/93. * Modified for sysctl support, 1/8/97, Chris Horn. - * Fixed SMP synchronization, 08/08/99, Manfred Spraul + * Fixed SMP synchronization, 08/08/99, Manfred Spraul * manfreds@colorfullife.com * Rewrote bits to get rid of console_lock * 01Mar01 Andrew Morton <andrewm@uow.edu.au> @@ -148,7 +148,7 @@ static int __init console_setup(char *str) if (!strcmp(str, "ttyb")) strcpy(name, "ttyS1"); #endif - for(s = name; *s; s++) + for (s = name; *s; s++) if ((*s >= '0' && *s <= '9') || *s == ',') break; idx = simple_strtoul(s, NULL, 10); @@ -169,11 +169,11 @@ static int __init log_buf_len_setup(char *str) size = roundup_pow_of_two(size); if (size > log_buf_len) { unsigned long start, dest_idx, offset; - char * new_log_buf; + char *new_log_buf; new_log_buf = alloc_bootmem(size); if (!new_log_buf) { - printk("log_buf_len: allocation failed\n"); + printk(KERN_WARNING "log_buf_len: allocation failed\n"); goto out; } @@ -193,10 +193,9 @@ static int __init log_buf_len_setup(char *str) log_end -= offset; spin_unlock_irqrestore(&logbuf_lock, flags); - printk("log_buf_len: %d\n", log_buf_len); + printk(KERN_NOTICE "log_buf_len: %d\n", log_buf_len); } out: - return 1; } @@ -217,7 +216,7 @@ __setup("log_buf_len=", log_buf_len_setup); * 9 -- Return number of unread characters in the log buffer * 10 -- Return size of the log buffer */ -int do_syslog(int type, char __user * buf, int len) +int do_syslog(int type, char __user *buf, int len) { unsigned long i, j, limit, count; int do_clear = 0; @@ -244,7 +243,8 @@ int do_syslog(int type, char __user * buf, int len) error = -EFAULT; goto out; } - error = wait_event_interruptible(log_wait, (log_start - log_end)); + error = wait_event_interruptible(log_wait, + (log_start - log_end)); if (error) goto out; i = 0; @@ -264,7 +264,7 @@ int do_syslog(int type, char __user * buf, int len) error = i; break; case 4: /* Read/clear last kernel messages */ - do_clear = 1; + do_clear = 1; /* FALL THRU */ case 3: /* Read last kernel messages */ error = -EINVAL; @@ -288,11 +288,11 @@ int do_syslog(int type, char __user * buf, int len) limit = log_end; /* * __put_user() could sleep, and while we sleep - * printk() could overwrite the messages + * printk() could overwrite the messages * we try to copy to user space. Therefore * the messages are copied in reverse. <manfreds> */ - for(i = 0; i < count && !error; i++) { + for (i = 0; i < count && !error; i++) { j = limit-1-i; if (j + log_buf_len < log_end) break; @@ -306,10 +306,10 @@ int do_syslog(int type, char __user * buf, int len) if (error) break; error = i; - if(i != count) { + if (i != count) { int offset = count-error; /* buffer overflow during copy, correct user buffer. */ - for(i=0;i<error;i++) { + for (i = 0; i < error; i++) { if (__get_user(c,&buf[i+offset]) || __put_user(c,&buf[i])) { error = -EFAULT; @@ -351,7 +351,7 @@ out: return error; } -asmlinkage long sys_syslog(int type, char __user * buf, int len) +asmlinkage long sys_syslog(int type, char __user *buf, int len) { return do_syslog(type, buf, len); } @@ -404,21 +404,19 @@ static void call_console_drivers(unsigned long start, unsigned long end) cur_index = start; start_print = start; while (cur_index != end) { - if ( msg_level < 0 && - ((end - cur_index) > 2) && - LOG_BUF(cur_index + 0) == '<' && - LOG_BUF(cur_index + 1) >= '0' && - LOG_BUF(cur_index + 1) <= '7' && - LOG_BUF(cur_index + 2) == '>') - { + if (msg_level < 0 && ((end - cur_index) > 2) && + LOG_BUF(cur_index + 0) == '<' && + LOG_BUF(cur_index + 1) >= '0' && + LOG_BUF(cur_index + 1) <= '7' && + LOG_BUF(cur_index + 2) == '>') { msg_level = LOG_BUF(cur_index + 1) - '0'; cur_index += 3; start_print = cur_index; } while (cur_index != end) { char c = LOG_BUF(cur_index); - cur_index++; + cur_index++; if (c == '\n') { if (msg_level < 0) { /* @@ -461,7 +459,7 @@ static void zap_locks(void) static unsigned long oops_timestamp; if (time_after_eq(jiffies, oops_timestamp) && - !time_after(jiffies, oops_timestamp + 30*HZ)) + !time_after(jiffies, oops_timestamp + 30 * HZ)) return; oops_timestamp = jiffies; @@ -495,7 +493,7 @@ __attribute__((weak)) unsigned long long printk_clock(void) /* * This is printk. It can be called from any context. We want it to work. - * + * * We try to grab the console_sem. If we succeed, it's easy - we log the output and * call the console drivers. If we fail to get the semaphore we place the output * into the log buffer and return. The current holder of the console_sem will @@ -639,13 +637,19 @@ EXPORT_SYMBOL(vprintk); #else -asmlinkage long sys_syslog(int type, char __user * buf, int len) +asmlinkage long sys_syslog(int type, char __user *buf, int len) { return 0; } -int do_syslog(int type, char __user * buf, int len) { return 0; } -static void call_console_drivers(unsigned long start, unsigned long end) {} +int do_syslog(int type, char __user *buf, int len) +{ + return 0; +} + +static void call_console_drivers(unsigned long start, unsigned long end) +{ +} #endif @@ -851,9 +855,9 @@ EXPORT_SYMBOL(console_start); * print any messages that were printed by the kernel before the * console driver was initialized. */ -void register_console(struct console * console) +void register_console(struct console *console) { - int i; + int i; unsigned long flags; if (preferred_console < 0) @@ -878,7 +882,8 @@ void register_console(struct console * console) * See if this console matches one we selected on * the command line. */ - for(i = 0; i < MAX_CMDLINECONSOLES && console_cmdline[i].name[0]; i++) { + for (i = 0; i < MAX_CMDLINECONSOLES && console_cmdline[i].name[0]; + i++) { if (strcmp(console_cmdline[i].name, console->name) != 0) continue; if (console->index >= 0 && @@ -933,9 +938,9 @@ void register_console(struct console * console) } EXPORT_SYMBOL(register_console); -int unregister_console(struct console * console) +int unregister_console(struct console *console) { - struct console *a,*b; + struct console *a, *b; int res = 1; acquire_console_sem(); @@ -949,10 +954,10 @@ int unregister_console(struct console * console) b->next = a->next; res = 0; break; - } + } } } - + /* If last console is removed, we re-enable picking the first * one that gets registered. Without that, pmac early boot console * would prevent fbcon from taking over. @@ -994,7 +999,7 @@ void tty_write_message(struct tty_struct *tty, char *msg) int __printk_ratelimit(int ratelimit_jiffies, int ratelimit_burst) { static DEFINE_SPINLOCK(ratelimit_lock); - static unsigned long toks = 10*5*HZ; + static unsigned long toks = 10 * 5 * HZ; static unsigned long last_msg; static int missed; unsigned long flags; @@ -1007,6 +1012,7 @@ int __printk_ratelimit(int ratelimit_jiffies, int ratelimit_burst) toks = ratelimit_burst * ratelimit_jiffies; if (toks >= ratelimit_jiffies) { int lost = missed; + missed = 0; toks -= ratelimit_jiffies; spin_unlock_irqrestore(&ratelimit_lock, flags); @@ -1021,7 +1027,7 @@ int __printk_ratelimit(int ratelimit_jiffies, int ratelimit_burst) EXPORT_SYMBOL(__printk_ratelimit); /* minimum time in jiffies between messages */ -int printk_ratelimit_jiffies = 5*HZ; +int printk_ratelimit_jiffies = 5 * HZ; /* number of messages we send before ratelimiting */ int printk_ratelimit_burst = 10; diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 019e04ec065a..863eee8bff47 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -56,6 +56,10 @@ void ptrace_untrace(task_t *child) signal_wake_up(child, 1); } } + if (child->signal->flags & SIGNAL_GROUP_EXIT) { + sigaddset(&child->pending.signal, SIGKILL); + signal_wake_up(child, 1); + } spin_unlock(&child->sighand->siglock); } @@ -77,8 +81,7 @@ void __ptrace_unlink(task_t *child) SET_LINKS(child); } - if (child->state == TASK_TRACED) - ptrace_untrace(child); + ptrace_untrace(child); } /* diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index 2559d4b8f23f..c4d159a21e04 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -154,6 +154,15 @@ void fastcall call_rcu_bh(struct rcu_head *head, } /* + * Return the number of RCU batches processed thus far. Useful + * for debug and statistics. + */ +long rcu_batches_completed(void) +{ + return rcu_ctrlblk.completed; +} + +/* * Invoke the completed RCU callbacks. They are expected to be in * a per-cpu list. */ @@ -501,6 +510,7 @@ void synchronize_kernel(void) } module_param(maxbatch, int, 0); +EXPORT_SYMBOL_GPL(rcu_batches_completed); EXPORT_SYMBOL(call_rcu); /* WARNING: GPL-only in April 2006. */ EXPORT_SYMBOL(call_rcu_bh); /* WARNING: GPL-only in April 2006. */ EXPORT_SYMBOL_GPL(synchronize_rcu); diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c new file mode 100644 index 000000000000..9b58f1eff3ca --- /dev/null +++ b/kernel/rcutorture.c @@ -0,0 +1,492 @@ +/* + * Read-Copy Update /proc-based torture test facility + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright (C) IBM Corporation, 2005 + * + * Authors: Paul E. McKenney <paulmck@us.ibm.com> + * + * See also: Documentation/RCU/torture.txt + */ +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/kthread.h> +#include <linux/err.h> +#include <linux/spinlock.h> +#include <linux/smp.h> +#include <linux/rcupdate.h> +#include <linux/interrupt.h> +#include <linux/sched.h> +#include <asm/atomic.h> +#include <linux/bitops.h> +#include <linux/module.h> +#include <linux/completion.h> +#include <linux/moduleparam.h> +#include <linux/percpu.h> +#include <linux/notifier.h> +#include <linux/rcuref.h> +#include <linux/cpu.h> +#include <linux/random.h> +#include <linux/delay.h> +#include <linux/byteorder/swabb.h> +#include <linux/stat.h> + +MODULE_LICENSE("GPL"); + +static int nreaders = -1; /* # reader threads, defaults to 4*ncpus */ +static int stat_interval = 0; /* Interval between stats, in seconds. */ + /* Defaults to "only at end of test". */ +static int verbose = 0; /* Print more debug info. */ + +MODULE_PARM(nreaders, "i"); +MODULE_PARM_DESC(nreaders, "Number of RCU reader threads"); +MODULE_PARM(stat_interval, "i"); +MODULE_PARM_DESC(stat_interval, "Number of seconds between stats printk()s"); +MODULE_PARM(verbose, "i"); +MODULE_PARM_DESC(verbose, "Enable verbose debugging printk()s"); +#define TORTURE_FLAG "rcutorture: " +#define PRINTK_STRING(s) \ + do { printk(KERN_ALERT TORTURE_FLAG s "\n"); } while (0) +#define VERBOSE_PRINTK_STRING(s) \ + do { if (verbose) printk(KERN_ALERT TORTURE_FLAG s "\n"); } while (0) +#define VERBOSE_PRINTK_ERRSTRING(s) \ + do { if (verbose) printk(KERN_ALERT TORTURE_FLAG "!!! " s "\n"); } while (0) + +static char printk_buf[4096]; + +static int nrealreaders; +static struct task_struct *writer_task; +static struct task_struct **reader_tasks; +static struct task_struct *stats_task; + +#define RCU_TORTURE_PIPE_LEN 10 + +struct rcu_torture { + struct rcu_head rtort_rcu; + int rtort_pipe_count; + struct list_head rtort_free; +}; + +static int fullstop = 0; /* stop generating callbacks at test end. */ +static LIST_HEAD(rcu_torture_freelist); +static struct rcu_torture *rcu_torture_current = NULL; +static long rcu_torture_current_version = 0; +static struct rcu_torture rcu_tortures[10 * RCU_TORTURE_PIPE_LEN]; +static DEFINE_SPINLOCK(rcu_torture_lock); +static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_count) = + { 0 }; +static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_batch) = + { 0 }; +static atomic_t rcu_torture_wcount[RCU_TORTURE_PIPE_LEN + 1]; +atomic_t n_rcu_torture_alloc; +atomic_t n_rcu_torture_alloc_fail; +atomic_t n_rcu_torture_free; + +/* + * Allocate an element from the rcu_tortures pool. + */ +struct rcu_torture * +rcu_torture_alloc(void) +{ + struct list_head *p; + + spin_lock(&rcu_torture_lock); + if (list_empty(&rcu_torture_freelist)) { + atomic_inc(&n_rcu_torture_alloc_fail); + spin_unlock(&rcu_torture_lock); + return NULL; + } + atomic_inc(&n_rcu_torture_alloc); + p = rcu_torture_freelist.next; + list_del_init(p); + spin_unlock(&rcu_torture_lock); + return container_of(p, struct rcu_torture, rtort_free); +} + +/* + * Free an element to the rcu_tortures pool. + */ +static void +rcu_torture_free(struct rcu_torture *p) +{ + atomic_inc(&n_rcu_torture_free); + spin_lock(&rcu_torture_lock); + list_add_tail(&p->rtort_free, &rcu_torture_freelist); + spin_unlock(&rcu_torture_lock); +} + +static void +rcu_torture_cb(struct rcu_head *p) +{ + int i; + struct rcu_torture *rp = container_of(p, struct rcu_torture, rtort_rcu); + + if (fullstop) { + /* Test is ending, just drop callbacks on the floor. */ + /* The next initialization will pick up the pieces. */ + return; + } + i = rp->rtort_pipe_count; + if (i > RCU_TORTURE_PIPE_LEN) + i = RCU_TORTURE_PIPE_LEN; + atomic_inc(&rcu_torture_wcount[i]); + if (++rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) + rcu_torture_free(rp); + else + call_rcu(p, rcu_torture_cb); +} + +struct rcu_random_state { + unsigned long rrs_state; + unsigned long rrs_count; +}; + +#define RCU_RANDOM_MULT 39916801 /* prime */ +#define RCU_RANDOM_ADD 479001701 /* prime */ +#define RCU_RANDOM_REFRESH 10000 + +#define DEFINE_RCU_RANDOM(name) struct rcu_random_state name = { 0, 0 } + +/* + * Crude but fast random-number generator. Uses a linear congruential + * generator, with occasional help from get_random_bytes(). + */ +static long +rcu_random(struct rcu_random_state *rrsp) +{ + long refresh; + + if (--rrsp->rrs_count < 0) { + get_random_bytes(&refresh, sizeof(refresh)); + rrsp->rrs_state += refresh; + rrsp->rrs_count = RCU_RANDOM_REFRESH; + } + rrsp->rrs_state = rrsp->rrs_state * RCU_RANDOM_MULT + RCU_RANDOM_ADD; + return swahw32(rrsp->rrs_state); +} + +/* + * RCU torture writer kthread. Repeatedly substitutes a new structure + * for that pointed to by rcu_torture_current, freeing the old structure + * after a series of grace periods (the "pipeline"). + */ +static int +rcu_torture_writer(void *arg) +{ + int i; + long oldbatch = rcu_batches_completed(); + struct rcu_torture *rp; + struct rcu_torture *old_rp; + static DEFINE_RCU_RANDOM(rand); + + VERBOSE_PRINTK_STRING("rcu_torture_writer task started"); + do { + schedule_timeout_uninterruptible(1); + if (rcu_batches_completed() == oldbatch) + continue; + if ((rp = rcu_torture_alloc()) == NULL) + continue; + rp->rtort_pipe_count = 0; + udelay(rcu_random(&rand) & 0x3ff); + old_rp = rcu_torture_current; + rcu_assign_pointer(rcu_torture_current, rp); + smp_wmb(); + if (old_rp != NULL) { + i = old_rp->rtort_pipe_count; + if (i > RCU_TORTURE_PIPE_LEN) + i = RCU_TORTURE_PIPE_LEN; + atomic_inc(&rcu_torture_wcount[i]); + old_rp->rtort_pipe_count++; + call_rcu(&old_rp->rtort_rcu, rcu_torture_cb); + } + rcu_torture_current_version++; + oldbatch = rcu_batches_completed(); + } while (!kthread_should_stop() && !fullstop); + VERBOSE_PRINTK_STRING("rcu_torture_writer task stopping"); + while (!kthread_should_stop()) + schedule_timeout_uninterruptible(1); + return 0; +} + +/* + * RCU torture reader kthread. Repeatedly dereferences rcu_torture_current, + * incrementing the corresponding element of the pipeline array. The + * counter in the element should never be greater than 1, otherwise, the + * RCU implementation is broken. + */ +static int +rcu_torture_reader(void *arg) +{ + int completed; + DEFINE_RCU_RANDOM(rand); + struct rcu_torture *p; + int pipe_count; + + VERBOSE_PRINTK_STRING("rcu_torture_reader task started"); + do { + rcu_read_lock(); + completed = rcu_batches_completed(); + p = rcu_dereference(rcu_torture_current); + if (p == NULL) { + /* Wait for rcu_torture_writer to get underway */ + rcu_read_unlock(); + schedule_timeout_interruptible(HZ); + continue; + } + udelay(rcu_random(&rand) & 0x7f); + preempt_disable(); + pipe_count = p->rtort_pipe_count; + if (pipe_count > RCU_TORTURE_PIPE_LEN) { + /* Should not happen, but... */ + pipe_count = RCU_TORTURE_PIPE_LEN; + } + ++__get_cpu_var(rcu_torture_count)[pipe_count]; + completed = rcu_batches_completed() - completed; + if (completed > RCU_TORTURE_PIPE_LEN) { + /* Should not happen, but... */ + completed = RCU_TORTURE_PIPE_LEN; + } + ++__get_cpu_var(rcu_torture_batch)[completed]; + preempt_enable(); + rcu_read_unlock(); + schedule(); + } while (!kthread_should_stop() && !fullstop); + VERBOSE_PRINTK_STRING("rcu_torture_reader task stopping"); + while (!kthread_should_stop()) + schedule_timeout_uninterruptible(1); + return 0; +} + +/* + * Create an RCU-torture statistics message in the specified buffer. + */ +static int +rcu_torture_printk(char *page) +{ + int cnt = 0; + int cpu; + int i; + long pipesummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 }; + long batchsummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 }; + + for_each_cpu(cpu) { + for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) { + pipesummary[i] += per_cpu(rcu_torture_count, cpu)[i]; + batchsummary[i] += per_cpu(rcu_torture_batch, cpu)[i]; + } + } + for (i = RCU_TORTURE_PIPE_LEN - 1; i >= 0; i--) { + if (pipesummary[i] != 0) + break; + } + cnt += sprintf(&page[cnt], "rcutorture: "); + cnt += sprintf(&page[cnt], + "rtc: %p ver: %ld tfle: %d rta: %d rtaf: %d rtf: %d", + rcu_torture_current, + rcu_torture_current_version, + list_empty(&rcu_torture_freelist), + atomic_read(&n_rcu_torture_alloc), + atomic_read(&n_rcu_torture_alloc_fail), + atomic_read(&n_rcu_torture_free)); + cnt += sprintf(&page[cnt], "\nrcutorture: "); + if (i > 1) + cnt += sprintf(&page[cnt], "!!! "); + cnt += sprintf(&page[cnt], "Reader Pipe: "); + for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) + cnt += sprintf(&page[cnt], " %ld", pipesummary[i]); + cnt += sprintf(&page[cnt], "\nrcutorture: "); + cnt += sprintf(&page[cnt], "Reader Batch: "); + for (i = 0; i < RCU_TORTURE_PIPE_LEN; i++) + cnt += sprintf(&page[cnt], " %ld", batchsummary[i]); + cnt += sprintf(&page[cnt], "\nrcutorture: "); + cnt += sprintf(&page[cnt], "Free-Block Circulation: "); + for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) { + cnt += sprintf(&page[cnt], " %d", + atomic_read(&rcu_torture_wcount[i])); + } + cnt += sprintf(&page[cnt], "\n"); + return cnt; +} + +/* + * Print torture statistics. Caller must ensure that there is only + * one call to this function at a given time!!! This is normally + * accomplished by relying on the module system to only have one copy + * of the module loaded, and then by giving the rcu_torture_stats + * kthread full control (or the init/cleanup functions when rcu_torture_stats + * thread is not running). + */ +static void +rcu_torture_stats_print(void) +{ + int cnt; + + cnt = rcu_torture_printk(printk_buf); + printk(KERN_ALERT "%s", printk_buf); +} + +/* + * Periodically prints torture statistics, if periodic statistics printing + * was specified via the stat_interval module parameter. + * + * No need to worry about fullstop here, since this one doesn't reference + * volatile state or register callbacks. + */ +static int +rcu_torture_stats(void *arg) +{ + VERBOSE_PRINTK_STRING("rcu_torture_stats task started"); + do { + schedule_timeout_interruptible(stat_interval * HZ); + rcu_torture_stats_print(); + } while (!kthread_should_stop()); + VERBOSE_PRINTK_STRING("rcu_torture_stats task stopping"); + return 0; +} + +static void +rcu_torture_cleanup(void) +{ + int i; + + fullstop = 1; + if (writer_task != NULL) { + VERBOSE_PRINTK_STRING("Stopping rcu_torture_writer task"); + kthread_stop(writer_task); + } + writer_task = NULL; + + if (reader_tasks != NULL) { + for (i = 0; i < nrealreaders; i++) { + if (reader_tasks[i] != NULL) { + VERBOSE_PRINTK_STRING( + "Stopping rcu_torture_reader task"); + kthread_stop(reader_tasks[i]); + } + reader_tasks[i] = NULL; + } + kfree(reader_tasks); + reader_tasks = NULL; + } + rcu_torture_current = NULL; + + if (stats_task != NULL) { + VERBOSE_PRINTK_STRING("Stopping rcu_torture_stats task"); + kthread_stop(stats_task); + } + stats_task = NULL; + + /* Wait for all RCU callbacks to fire. */ + + for (i = 0; i < RCU_TORTURE_PIPE_LEN; i++) + synchronize_rcu(); + rcu_torture_stats_print(); /* -After- the stats thread is stopped! */ + PRINTK_STRING("--- End of test"); +} + +static int +rcu_torture_init(void) +{ + int i; + int cpu; + int firsterr = 0; + + /* Process args and tell the world that the torturer is on the job. */ + + if (nreaders >= 0) + nrealreaders = nreaders; + else + nrealreaders = 2 * num_online_cpus(); + printk(KERN_ALERT TORTURE_FLAG + "--- Start of test: nreaders=%d stat_interval=%d verbose=%d\n", + nrealreaders, stat_interval, verbose); + fullstop = 0; + + /* Set up the freelist. */ + + INIT_LIST_HEAD(&rcu_torture_freelist); + for (i = 0; i < sizeof(rcu_tortures) / sizeof(rcu_tortures[0]); i++) { + list_add_tail(&rcu_tortures[i].rtort_free, + &rcu_torture_freelist); + } + + /* Initialize the statistics so that each run gets its own numbers. */ + + rcu_torture_current = NULL; + rcu_torture_current_version = 0; + atomic_set(&n_rcu_torture_alloc, 0); + atomic_set(&n_rcu_torture_alloc_fail, 0); + atomic_set(&n_rcu_torture_free, 0); + for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) + atomic_set(&rcu_torture_wcount[i], 0); + for_each_cpu(cpu) { + for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) { + per_cpu(rcu_torture_count, cpu)[i] = 0; + per_cpu(rcu_torture_batch, cpu)[i] = 0; + } + } + + /* Start up the kthreads. */ + + VERBOSE_PRINTK_STRING("Creating rcu_torture_writer task"); + writer_task = kthread_run(rcu_torture_writer, NULL, + "rcu_torture_writer"); + if (IS_ERR(writer_task)) { + firsterr = PTR_ERR(writer_task); + VERBOSE_PRINTK_ERRSTRING("Failed to create writer"); + writer_task = NULL; + goto unwind; + } + reader_tasks = kmalloc(nrealreaders * sizeof(reader_tasks[0]), + GFP_KERNEL); + if (reader_tasks == NULL) { + VERBOSE_PRINTK_ERRSTRING("out of memory"); + firsterr = -ENOMEM; + goto unwind; + } + for (i = 0; i < nrealreaders; i++) { + VERBOSE_PRINTK_STRING("Creating rcu_torture_reader task"); + reader_tasks[i] = kthread_run(rcu_torture_reader, NULL, + "rcu_torture_reader"); + if (IS_ERR(reader_tasks[i])) { + firsterr = PTR_ERR(reader_tasks[i]); + VERBOSE_PRINTK_ERRSTRING("Failed to create reader"); + reader_tasks[i] = NULL; + goto unwind; + } + } + if (stat_interval > 0) { + VERBOSE_PRINTK_STRING("Creating rcu_torture_stats task"); + stats_task = kthread_run(rcu_torture_stats, NULL, + "rcu_torture_stats"); + if (IS_ERR(stats_task)) { + firsterr = PTR_ERR(stats_task); + VERBOSE_PRINTK_ERRSTRING("Failed to create stats"); + stats_task = NULL; + goto unwind; + } + } + return 0; + +unwind: + rcu_torture_cleanup(); + return firsterr; +} + +module_init(rcu_torture_init); +module_exit(rcu_torture_cleanup); diff --git a/kernel/sched.c b/kernel/sched.c index 4f26c544d02c..b4f4eb613537 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1468,7 +1468,7 @@ void fastcall sched_exit(task_t *p) * the sleep_avg of the parent as well. */ rq = task_rq_lock(p->parent, &flags); - if (p->first_time_slice) { + if (p->first_time_slice && task_cpu(p) == task_cpu(p->parent)) { p->parent->time_slice += p->time_slice; if (unlikely(p->parent->time_slice > task_timeslice(p))) p->parent->time_slice = task_timeslice(p); @@ -3877,7 +3877,6 @@ EXPORT_SYMBOL(cpu_present_map); #ifndef CONFIG_SMP cpumask_t cpu_online_map = CPU_MASK_ALL; -EXPORT_SYMBOL_GPL(cpu_online_map); cpumask_t cpu_possible_map = CPU_MASK_ALL; #endif diff --git a/kernel/signal.c b/kernel/signal.c index 6904bbbfe116..1bf3c39d6109 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -277,7 +277,6 @@ static struct sigqueue *__sigqueue_alloc(struct task_struct *t, gfp_t flags, } else { INIT_LIST_HEAD(&q->list); q->flags = 0; - q->lock = NULL; q->user = get_uid(t->user); } return(q); @@ -652,8 +651,7 @@ static int check_kill_permission(int sig, struct siginfo *info, if (!valid_signal(sig)) return error; error = -EPERM; - if ((!info || ((unsigned long)info != 1 && - (unsigned long)info != 2 && SI_FROMUSER(info))) + if ((info == SEND_SIG_NOINFO || (!is_si_special(info) && SI_FROMUSER(info))) && ((sig != SIGCONT) || (current->signal->session != t->signal->session)) && (current->euid ^ t->suid) && (current->euid ^ t->uid) @@ -790,7 +788,7 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t, * fast-pathed signals for kernel-internal things like SIGSTOP * or SIGKILL. */ - if ((unsigned long)info == 2) + if (info == SEND_SIG_FORCED) goto out_set; /* Real-time signals must be queued if sent by sigqueue, or @@ -802,19 +800,19 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t, pass on the info struct. */ q = __sigqueue_alloc(t, GFP_ATOMIC, (sig < SIGRTMIN && - ((unsigned long) info < 2 || + (is_si_special(info) || info->si_code >= 0))); if (q) { list_add_tail(&q->list, &signals->list); switch ((unsigned long) info) { - case 0: + case (unsigned long) SEND_SIG_NOINFO: q->info.si_signo = sig; q->info.si_errno = 0; q->info.si_code = SI_USER; q->info.si_pid = current->pid; q->info.si_uid = current->uid; break; - case 1: + case (unsigned long) SEND_SIG_PRIV: q->info.si_signo = sig; q->info.si_errno = 0; q->info.si_code = SI_KERNEL; @@ -825,20 +823,13 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t, copy_siginfo(&q->info, info); break; } - } else { - if (sig >= SIGRTMIN && info && (unsigned long)info != 1 - && info->si_code != SI_USER) + } else if (!is_si_special(info)) { + if (sig >= SIGRTMIN && info->si_code != SI_USER) /* * Queue overflow, abort. We may abort if the signal was rt * and sent by user using something other than kill(). */ return -EAGAIN; - if (((unsigned long)info > 1) && (info->si_code == SI_TIMER)) - /* - * Set up a return to indicate that we dropped - * the signal. - */ - ret = info->si_sys_private; } out_set: @@ -859,12 +850,6 @@ specific_send_sig_info(int sig, struct siginfo *info, struct task_struct *t) BUG(); assert_spin_locked(&t->sighand->siglock); - if (((unsigned long)info > 2) && (info->si_code == SI_TIMER)) - /* - * Set up a return to indicate that we dropped the signal. - */ - ret = info->si_sys_private; - /* Short-circuit ignored signals. */ if (sig_ignored(t, sig)) goto out; @@ -894,11 +879,13 @@ force_sig_info(int sig, struct siginfo *info, struct task_struct *t) int ret; spin_lock_irqsave(&t->sighand->siglock, flags); - if (sigismember(&t->blocked, sig) || t->sighand->action[sig-1].sa.sa_handler == SIG_IGN) { + if (t->sighand->action[sig-1].sa.sa_handler == SIG_IGN) { t->sighand->action[sig-1].sa.sa_handler = SIG_DFL; + } + if (sigismember(&t->blocked, sig)) { sigdelset(&t->blocked, sig); - recalc_sigpending_tsk(t); } + recalc_sigpending_tsk(t); ret = specific_send_sig_info(sig, info, t); spin_unlock_irqrestore(&t->sighand->siglock, flags); @@ -908,15 +895,7 @@ force_sig_info(int sig, struct siginfo *info, struct task_struct *t) void force_sig_specific(int sig, struct task_struct *t) { - unsigned long int flags; - - spin_lock_irqsave(&t->sighand->siglock, flags); - if (t->sighand->action[sig-1].sa.sa_handler == SIG_IGN) - t->sighand->action[sig-1].sa.sa_handler = SIG_DFL; - sigdelset(&t->blocked, sig); - recalc_sigpending_tsk(t); - specific_send_sig_info(sig, (void *)2, t); - spin_unlock_irqrestore(&t->sighand->siglock, flags); + force_sig_info(sig, SEND_SIG_FORCED, t); } /* @@ -1051,12 +1030,6 @@ __group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p) assert_spin_locked(&p->sighand->siglock); handle_stop_signal(sig, p); - if (((unsigned long)info > 2) && (info->si_code == SI_TIMER)) - /* - * Set up a return to indicate that we dropped the signal. - */ - ret = info->si_sys_private; - /* Short-circuit ignored signals. */ if (sig_ignored(p, sig)) return ret; @@ -1109,8 +1082,8 @@ void zap_other_threads(struct task_struct *p) if (t != p->group_leader) t->exit_signal = -1; + /* SIGKILL will be handled before any pending SIGSTOP */ sigaddset(&t->pending.signal, SIGKILL); - rm_from_queue(SIG_KERNEL_STOP_MASK, &t->pending); signal_wake_up(t, 1); } } @@ -1286,10 +1259,13 @@ send_sig_info(int sig, struct siginfo *info, struct task_struct *p) return ret; } +#define __si_special(priv) \ + ((priv) ? SEND_SIG_PRIV : SEND_SIG_NOINFO) + int send_sig(int sig, struct task_struct *p, int priv) { - return send_sig_info(sig, (void*)(long)(priv != 0), p); + return send_sig_info(sig, __si_special(priv), p); } /* @@ -1309,7 +1285,7 @@ send_group_sig_info(int sig, struct siginfo *info, struct task_struct *p) void force_sig(int sig, struct task_struct *p) { - force_sig_info(sig, (void*)1L, p); + force_sig_info(sig, SEND_SIG_PRIV, p); } /* @@ -1334,13 +1310,13 @@ force_sigsegv(int sig, struct task_struct *p) int kill_pg(pid_t pgrp, int sig, int priv) { - return kill_pg_info(sig, (void *)(long)(priv != 0), pgrp); + return kill_pg_info(sig, __si_special(priv), pgrp); } int kill_proc(pid_t pid, int sig, int priv) { - return kill_proc_info(sig, (void *)(long)(priv != 0), pid); + return kill_proc_info(sig, __si_special(priv), pid); } /* @@ -1371,11 +1347,12 @@ void sigqueue_free(struct sigqueue *q) * pending queue. */ if (unlikely(!list_empty(&q->list))) { - read_lock(&tasklist_lock); - spin_lock_irqsave(q->lock, flags); + spinlock_t *lock = ¤t->sighand->siglock; + read_lock(&tasklist_lock); + spin_lock_irqsave(lock, flags); if (!list_empty(&q->list)) list_del_init(&q->list); - spin_unlock_irqrestore(q->lock, flags); + spin_unlock_irqrestore(lock, flags); read_unlock(&tasklist_lock); } q->flags &= ~SIGQUEUE_PREALLOC; @@ -1414,7 +1391,6 @@ send_sigqueue(int sig, struct sigqueue *q, struct task_struct *p) goto out; } - q->lock = &p->sighand->siglock; list_add_tail(&q->list, &p->pending.list); sigaddset(&p->pending.signal, sig); if (!sigismember(&p->blocked, sig)) @@ -1462,7 +1438,6 @@ send_group_sigqueue(int sig, struct sigqueue *q, struct task_struct *p) * We always use the shared queue for process-wide signals, * to avoid several races. */ - q->lock = &p->sighand->siglock; list_add_tail(&q->list, &p->signal->shared_pending.list); sigaddset(&p->signal->shared_pending.signal, sig); @@ -1881,9 +1856,9 @@ relock: /* Let the debugger run. */ ptrace_stop(signr, signr, info); - /* We're back. Did the debugger cancel the sig? */ + /* We're back. Did the debugger cancel the sig or group_exit? */ signr = current->exit_code; - if (signr == 0) + if (signr == 0 || current->signal->flags & SIGNAL_GROUP_EXIT) continue; current->exit_code = 0; @@ -2285,26 +2260,13 @@ sys_kill(int pid, int sig) return kill_something_info(sig, &info, pid); } -/** - * sys_tgkill - send signal to one specific thread - * @tgid: the thread group ID of the thread - * @pid: the PID of the thread - * @sig: signal to be sent - * - * This syscall also checks the tgid and returns -ESRCH even if the PID - * exists but it's not belonging to the target process anymore. This - * method solves the problem of threads exiting and PIDs getting reused. - */ -asmlinkage long sys_tgkill(int tgid, int pid, int sig) +static int do_tkill(int tgid, int pid, int sig) { - struct siginfo info; int error; + struct siginfo info; struct task_struct *p; - /* This is only valid for single tasks */ - if (pid <= 0 || tgid <= 0) - return -EINVAL; - + error = -ESRCH; info.si_signo = sig; info.si_errno = 0; info.si_code = SI_TKILL; @@ -2313,8 +2275,7 @@ asmlinkage long sys_tgkill(int tgid, int pid, int sig) read_lock(&tasklist_lock); p = find_task_by_pid(pid); - error = -ESRCH; - if (p && (p->tgid == tgid)) { + if (p && (tgid <= 0 || p->tgid == tgid)) { error = check_kill_permission(sig, &info, p); /* * The null signal is a permissions and process existence @@ -2328,47 +2289,40 @@ asmlinkage long sys_tgkill(int tgid, int pid, int sig) } } read_unlock(&tasklist_lock); + return error; } +/** + * sys_tgkill - send signal to one specific thread + * @tgid: the thread group ID of the thread + * @pid: the PID of the thread + * @sig: signal to be sent + * + * This syscall also checks the tgid and returns -ESRCH even if the PID + * exists but it's not belonging to the target process anymore. This + * method solves the problem of threads exiting and PIDs getting reused. + */ +asmlinkage long sys_tgkill(int tgid, int pid, int sig) +{ + /* This is only valid for single tasks */ + if (pid <= 0 || tgid <= 0) + return -EINVAL; + + return do_tkill(tgid, pid, sig); +} + /* * Send a signal to only one task, even if it's a CLONE_THREAD task. */ asmlinkage long sys_tkill(int pid, int sig) { - struct siginfo info; - int error; - struct task_struct *p; - /* This is only valid for single tasks */ if (pid <= 0) return -EINVAL; - info.si_signo = sig; - info.si_errno = 0; - info.si_code = SI_TKILL; - info.si_pid = current->tgid; - info.si_uid = current->uid; - - read_lock(&tasklist_lock); - p = find_task_by_pid(pid); - error = -ESRCH; - if (p) { - error = check_kill_permission(sig, &info, p); - /* - * The null signal is a permissions and process existence - * probe. No signal is actually delivered. - */ - if (!error && sig && p->sighand) { - spin_lock_irq(&p->sighand->siglock); - handle_stop_signal(sig, p); - error = specific_send_sig_info(sig, &info, p); - spin_unlock_irq(&p->sighand->siglock); - } - } - read_unlock(&tasklist_lock); - return error; + return do_tkill(0, pid, sig); } asmlinkage long diff --git a/kernel/time.c b/kernel/time.c index a3c2100470e1..245d595a13cb 100644 --- a/kernel/time.c +++ b/kernel/time.c @@ -338,30 +338,20 @@ int do_adjtimex(struct timex *txc) if (mtemp >= MINSEC) { ltemp = (time_offset / mtemp) << (SHIFT_USEC - SHIFT_UPDATE); - if (ltemp < 0) - time_freq -= -ltemp >> SHIFT_KH; - else - time_freq += ltemp >> SHIFT_KH; + time_freq += shift_right(ltemp, SHIFT_KH); } else /* calibration interval too short (p. 12) */ result = TIME_ERROR; } else { /* PLL mode */ if (mtemp < MAXSEC) { ltemp *= mtemp; - if (ltemp < 0) - time_freq -= -ltemp >> (time_constant + - time_constant + - SHIFT_KF - SHIFT_USEC); - else - time_freq += ltemp >> (time_constant + + time_freq += shift_right(ltemp,(time_constant + time_constant + - SHIFT_KF - SHIFT_USEC); + SHIFT_KF - SHIFT_USEC)); } else /* calibration interval too long (p. 12) */ result = TIME_ERROR; } - if (time_freq > time_tolerance) - time_freq = time_tolerance; - else if (time_freq < -time_tolerance) - time_freq = -time_tolerance; + time_freq = min(time_freq, time_tolerance); + time_freq = max(time_freq, -time_tolerance); } /* STA_PLL || STA_PPSTIME */ } /* txc->modes & ADJ_OFFSET */ if (txc->modes & ADJ_TICK) { @@ -384,10 +374,7 @@ leave: if ((time_status & (STA_UNSYNC|STA_CLOCKERR)) != 0 if ((txc->modes & ADJ_OFFSET_SINGLESHOT) == ADJ_OFFSET_SINGLESHOT) txc->offset = save_adjust; else { - if (time_offset < 0) - txc->offset = -(-time_offset >> SHIFT_UPDATE); - else - txc->offset = time_offset >> SHIFT_UPDATE; + txc->offset = shift_right(time_offset, SHIFT_UPDATE); } txc->freq = time_freq + pps_freq; txc->maxerror = time_maxerror; diff --git a/kernel/timer.c b/kernel/timer.c index 6a2e5f8dc725..fd74268d8663 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -46,6 +46,10 @@ static void time_interpolator_update(long delta_nsec); #define time_interpolator_update(x) #endif +u64 jiffies_64 __cacheline_aligned_in_smp = INITIAL_JIFFIES; + +EXPORT_SYMBOL(jiffies_64); + /* * per-CPU timer vector definitions: */ @@ -91,30 +95,6 @@ static inline void set_running_timer(tvec_base_t *base, #endif } -static void check_timer_failed(struct timer_list *timer) -{ - static int whine_count; - if (whine_count < 16) { - whine_count++; - printk("Uninitialised timer!\n"); - printk("This is just a warning. Your computer is OK\n"); - printk("function=0x%p, data=0x%lx\n", - timer->function, timer->data); - dump_stack(); - } - /* - * Now fix it up - */ - timer->magic = TIMER_MAGIC; -} - -static inline void check_timer(struct timer_list *timer) -{ - if (timer->magic != TIMER_MAGIC) - check_timer_failed(timer); -} - - static void internal_add_timer(tvec_base_t *base, struct timer_list *timer) { unsigned long expires = timer->expires; @@ -177,7 +157,6 @@ void fastcall init_timer(struct timer_list *timer) { timer->entry.next = NULL; timer->base = &per_cpu(tvec_bases, raw_smp_processor_id()).t_base; - timer->magic = TIMER_MAGIC; } EXPORT_SYMBOL(init_timer); @@ -230,7 +209,6 @@ int __mod_timer(struct timer_list *timer, unsigned long expires) int ret = 0; BUG_ON(!timer->function); - check_timer(timer); base = lock_timer_base(timer, &flags); @@ -283,9 +261,6 @@ void add_timer_on(struct timer_list *timer, int cpu) unsigned long flags; BUG_ON(timer_pending(timer) || !timer->function); - - check_timer(timer); - spin_lock_irqsave(&base->t_base.lock, flags); timer->base = &base->t_base; internal_add_timer(base, timer); @@ -316,8 +291,6 @@ int mod_timer(struct timer_list *timer, unsigned long expires) { BUG_ON(!timer->function); - check_timer(timer); - /* * This is a common optimization triggered by the * networking code - if the timer is re-modified @@ -348,8 +321,6 @@ int del_timer(struct timer_list *timer) unsigned long flags; int ret = 0; - check_timer(timer); - if (timer_pending(timer)) { base = lock_timer_base(timer, &flags); if (timer_pending(timer)) { @@ -412,8 +383,6 @@ out: */ int del_timer_sync(struct timer_list *timer) { - check_timer(timer); - for (;;) { int ret = try_to_del_timer_sync(timer); if (ret >= 0) @@ -632,143 +601,118 @@ long time_next_adjust; */ static void second_overflow(void) { - long ltemp; - - /* Bump the maxerror field */ - time_maxerror += time_tolerance >> SHIFT_USEC; - if ( time_maxerror > NTP_PHASE_LIMIT ) { - time_maxerror = NTP_PHASE_LIMIT; - time_status |= STA_UNSYNC; - } - - /* - * Leap second processing. If in leap-insert state at - * the end of the day, the system clock is set back one - * second; if in leap-delete state, the system clock is - * set ahead one second. The microtime() routine or - * external clock driver will insure that reported time - * is always monotonic. The ugly divides should be - * replaced. - */ - switch (time_state) { - - case TIME_OK: - if (time_status & STA_INS) - time_state = TIME_INS; - else if (time_status & STA_DEL) - time_state = TIME_DEL; - break; - - case TIME_INS: - if (xtime.tv_sec % 86400 == 0) { - xtime.tv_sec--; - wall_to_monotonic.tv_sec++; - /* The timer interpolator will make time change gradually instead - * of an immediate jump by one second. - */ - time_interpolator_update(-NSEC_PER_SEC); - time_state = TIME_OOP; - clock_was_set(); - printk(KERN_NOTICE "Clock: inserting leap second 23:59:60 UTC\n"); + long ltemp; + + /* Bump the maxerror field */ + time_maxerror += time_tolerance >> SHIFT_USEC; + if (time_maxerror > NTP_PHASE_LIMIT) { + time_maxerror = NTP_PHASE_LIMIT; + time_status |= STA_UNSYNC; } - break; - - case TIME_DEL: - if ((xtime.tv_sec + 1) % 86400 == 0) { - xtime.tv_sec++; - wall_to_monotonic.tv_sec--; - /* Use of time interpolator for a gradual change of time */ - time_interpolator_update(NSEC_PER_SEC); - time_state = TIME_WAIT; - clock_was_set(); - printk(KERN_NOTICE "Clock: deleting leap second 23:59:59 UTC\n"); + + /* + * Leap second processing. If in leap-insert state at the end of the + * day, the system clock is set back one second; if in leap-delete + * state, the system clock is set ahead one second. The microtime() + * routine or external clock driver will insure that reported time is + * always monotonic. The ugly divides should be replaced. + */ + switch (time_state) { + case TIME_OK: + if (time_status & STA_INS) + time_state = TIME_INS; + else if (time_status & STA_DEL) + time_state = TIME_DEL; + break; + case TIME_INS: + if (xtime.tv_sec % 86400 == 0) { + xtime.tv_sec--; + wall_to_monotonic.tv_sec++; + /* + * The timer interpolator will make time change + * gradually instead of an immediate jump by one second + */ + time_interpolator_update(-NSEC_PER_SEC); + time_state = TIME_OOP; + clock_was_set(); + printk(KERN_NOTICE "Clock: inserting leap second " + "23:59:60 UTC\n"); + } + break; + case TIME_DEL: + if ((xtime.tv_sec + 1) % 86400 == 0) { + xtime.tv_sec++; + wall_to_monotonic.tv_sec--; + /* + * Use of time interpolator for a gradual change of + * time + */ + time_interpolator_update(NSEC_PER_SEC); + time_state = TIME_WAIT; + clock_was_set(); + printk(KERN_NOTICE "Clock: deleting leap second " + "23:59:59 UTC\n"); + } + break; + case TIME_OOP: + time_state = TIME_WAIT; + break; + case TIME_WAIT: + if (!(time_status & (STA_INS | STA_DEL))) + time_state = TIME_OK; } - break; - - case TIME_OOP: - time_state = TIME_WAIT; - break; - - case TIME_WAIT: - if (!(time_status & (STA_INS | STA_DEL))) - time_state = TIME_OK; - } - - /* - * Compute the phase adjustment for the next second. In - * PLL mode, the offset is reduced by a fixed factor - * times the time constant. In FLL mode the offset is - * used directly. In either mode, the maximum phase - * adjustment for each second is clamped so as to spread - * the adjustment over not more than the number of - * seconds between updates. - */ - if (time_offset < 0) { - ltemp = -time_offset; - if (!(time_status & STA_FLL)) - ltemp >>= SHIFT_KG + time_constant; - if (ltemp > (MAXPHASE / MINSEC) << SHIFT_UPDATE) - ltemp = (MAXPHASE / MINSEC) << SHIFT_UPDATE; - time_offset += ltemp; - time_adj = -ltemp << (SHIFT_SCALE - SHIFT_HZ - SHIFT_UPDATE); - } else { + + /* + * Compute the phase adjustment for the next second. In PLL mode, the + * offset is reduced by a fixed factor times the time constant. In FLL + * mode the offset is used directly. In either mode, the maximum phase + * adjustment for each second is clamped so as to spread the adjustment + * over not more than the number of seconds between updates. + */ ltemp = time_offset; if (!(time_status & STA_FLL)) - ltemp >>= SHIFT_KG + time_constant; - if (ltemp > (MAXPHASE / MINSEC) << SHIFT_UPDATE) - ltemp = (MAXPHASE / MINSEC) << SHIFT_UPDATE; + ltemp = shift_right(ltemp, SHIFT_KG + time_constant); + ltemp = min(ltemp, (MAXPHASE / MINSEC) << SHIFT_UPDATE); + ltemp = max(ltemp, -(MAXPHASE / MINSEC) << SHIFT_UPDATE); time_offset -= ltemp; time_adj = ltemp << (SHIFT_SCALE - SHIFT_HZ - SHIFT_UPDATE); - } - - /* - * Compute the frequency estimate and additional phase - * adjustment due to frequency error for the next - * second. When the PPS signal is engaged, gnaw on the - * watchdog counter and update the frequency computed by - * the pll and the PPS signal. - */ - pps_valid++; - if (pps_valid == PPS_VALID) { /* PPS signal lost */ - pps_jitter = MAXTIME; - pps_stabil = MAXFREQ; - time_status &= ~(STA_PPSSIGNAL | STA_PPSJITTER | - STA_PPSWANDER | STA_PPSERROR); - } - ltemp = time_freq + pps_freq; - if (ltemp < 0) - time_adj -= -ltemp >> - (SHIFT_USEC + SHIFT_HZ - SHIFT_SCALE); - else - time_adj += ltemp >> - (SHIFT_USEC + SHIFT_HZ - SHIFT_SCALE); + + /* + * Compute the frequency estimate and additional phase adjustment due + * to frequency error for the next second. When the PPS signal is + * engaged, gnaw on the watchdog counter and update the frequency + * computed by the pll and the PPS signal. + */ + pps_valid++; + if (pps_valid == PPS_VALID) { /* PPS signal lost */ + pps_jitter = MAXTIME; + pps_stabil = MAXFREQ; + time_status &= ~(STA_PPSSIGNAL | STA_PPSJITTER | + STA_PPSWANDER | STA_PPSERROR); + } + ltemp = time_freq + pps_freq; + time_adj += shift_right(ltemp,(SHIFT_USEC + SHIFT_HZ - SHIFT_SCALE)); #if HZ == 100 - /* Compensate for (HZ==100) != (1 << SHIFT_HZ). - * Add 25% and 3.125% to get 128.125; => only 0.125% error (p. 14) - */ - if (time_adj < 0) - time_adj -= (-time_adj >> 2) + (-time_adj >> 5); - else - time_adj += (time_adj >> 2) + (time_adj >> 5); + /* + * Compensate for (HZ==100) != (1 << SHIFT_HZ). Add 25% and 3.125% to + * get 128.125; => only 0.125% error (p. 14) + */ + time_adj += shift_right(time_adj, 2) + shift_right(time_adj, 5); #endif #if HZ == 250 - /* Compensate for (HZ==250) != (1 << SHIFT_HZ). - * Add 1.5625% and 0.78125% to get 255.85938; => only 0.05% error (p. 14) - */ - if (time_adj < 0) - time_adj -= (-time_adj >> 6) + (-time_adj >> 7); - else - time_adj += (time_adj >> 6) + (time_adj >> 7); + /* + * Compensate for (HZ==250) != (1 << SHIFT_HZ). Add 1.5625% and + * 0.78125% to get 255.85938; => only 0.05% error (p. 14) + */ + time_adj += shift_right(time_adj, 6) + shift_right(time_adj, 7); #endif #if HZ == 1000 - /* Compensate for (HZ==1000) != (1 << SHIFT_HZ). - * Add 1.5625% and 0.78125% to get 1023.4375; => only 0.05% error (p. 14) - */ - if (time_adj < 0) - time_adj -= (-time_adj >> 6) + (-time_adj >> 7); - else - time_adj += (time_adj >> 6) + (time_adj >> 7); + /* + * Compensate for (HZ==1000) != (1 << SHIFT_HZ). Add 1.5625% and + * 0.78125% to get 1023.4375; => only 0.05% error (p. 14) + */ + time_adj += shift_right(time_adj, 6) + shift_right(time_adj, 7); #endif } @@ -777,23 +721,20 @@ static void update_wall_time_one_tick(void) { long time_adjust_step, delta_nsec; - if ( (time_adjust_step = time_adjust) != 0 ) { - /* We are doing an adjtime thing. - * - * Prepare time_adjust_step to be within bounds. - * Note that a positive time_adjust means we want the clock - * to run faster. - * - * Limit the amount of the step to be in the range - * -tickadj .. +tickadj - */ - if (time_adjust > tickadj) - time_adjust_step = tickadj; - else if (time_adjust < -tickadj) - time_adjust_step = -tickadj; - - /* Reduce by this step the amount of time left */ - time_adjust -= time_adjust_step; + if ((time_adjust_step = time_adjust) != 0 ) { + /* + * We are doing an adjtime thing. Prepare time_adjust_step to + * be within bounds. Note that a positive time_adjust means we + * want the clock to run faster. + * + * Limit the amount of the step to be in the range + * -tickadj .. +tickadj + */ + time_adjust_step = min(time_adjust_step, (long)tickadj); + time_adjust_step = max(time_adjust_step, (long)-tickadj); + + /* Reduce by this step the amount of time left */ + time_adjust -= time_adjust_step; } delta_nsec = tick_nsec + time_adjust_step * 1000; /* @@ -801,13 +742,8 @@ static void update_wall_time_one_tick(void) * advance the tick more. */ time_phase += time_adj; - if (time_phase <= -FINENSEC) { - long ltemp = -time_phase >> (SHIFT_SCALE - 10); - time_phase += ltemp << (SHIFT_SCALE - 10); - delta_nsec -= ltemp; - } - else if (time_phase >= FINENSEC) { - long ltemp = time_phase >> (SHIFT_SCALE - 10); + if ((time_phase >= FINENSEC) || (time_phase <= -FINENSEC)) { + long ltemp = shift_right(time_phase, (SHIFT_SCALE - 10)); time_phase -= ltemp << (SHIFT_SCALE - 10); delta_nsec += ltemp; } @@ -1137,8 +1073,8 @@ fastcall signed long __sched schedule_timeout(signed long timeout) if (timeout < 0) { printk(KERN_ERR "schedule_timeout: wrong timeout " - "value %lx from %p\n", timeout, - __builtin_return_address(0)); + "value %lx from %p\n", timeout, + __builtin_return_address(0)); current->state = TASK_RUNNING; goto out; } @@ -1146,12 +1082,8 @@ fastcall signed long __sched schedule_timeout(signed long timeout) expire = timeout + jiffies; - init_timer(&timer); - timer.expires = expire; - timer.data = (unsigned long) current; - timer.function = process_timeout; - - add_timer(&timer); + setup_timer(&timer, process_timeout, (unsigned long)current); + __mod_timer(&timer, expire); schedule(); del_singleshot_timer_sync(&timer); @@ -1168,15 +1100,15 @@ EXPORT_SYMBOL(schedule_timeout); */ signed long __sched schedule_timeout_interruptible(signed long timeout) { - __set_current_state(TASK_INTERRUPTIBLE); - return schedule_timeout(timeout); + __set_current_state(TASK_INTERRUPTIBLE); + return schedule_timeout(timeout); } EXPORT_SYMBOL(schedule_timeout_interruptible); signed long __sched schedule_timeout_uninterruptible(signed long timeout) { - __set_current_state(TASK_UNINTERRUPTIBLE); - return schedule_timeout(timeout); + __set_current_state(TASK_UNINTERRUPTIBLE); + return schedule_timeout(timeout); } EXPORT_SYMBOL(schedule_timeout_uninterruptible); @@ -1516,16 +1448,18 @@ static void time_interpolator_update(long delta_nsec) if (!time_interpolator) return; - /* The interpolator compensates for late ticks by accumulating - * the late time in time_interpolator->offset. A tick earlier than - * expected will lead to a reset of the offset and a corresponding - * jump of the clock forward. Again this only works if the - * interpolator clock is running slightly slower than the regular clock - * and the tuning logic insures that. - */ + /* + * The interpolator compensates for late ticks by accumulating the late + * time in time_interpolator->offset. A tick earlier than expected will + * lead to a reset of the offset and a corresponding jump of the clock + * forward. Again this only works if the interpolator clock is running + * slightly slower than the regular clock and the tuning logic insures + * that. + */ counter = time_interpolator_get_counter(1); - offset = time_interpolator->offset + GET_TI_NSECS(counter, time_interpolator); + offset = time_interpolator->offset + + GET_TI_NSECS(counter, time_interpolator); if (delta_nsec < 0 || (unsigned long) delta_nsec < offset) time_interpolator->offset = offset - delta_nsec; diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 91bacb13a7e2..7cee222231bc 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -12,6 +12,8 @@ * Andrew Morton <andrewm@uow.edu.au> * Kai Petzke <wpp@marie.physik.tu-berlin.de> * Theodore Ts'o <tytso@mit.edu> + * + * Made to use alloc_percpu by Christoph Lameter <clameter@sgi.com>. */ #include <linux/module.h> @@ -57,7 +59,7 @@ struct cpu_workqueue_struct { * per-CPU workqueues: */ struct workqueue_struct { - struct cpu_workqueue_struct cpu_wq[NR_CPUS]; + struct cpu_workqueue_struct *cpu_wq; const char *name; struct list_head list; /* Empty if single thread */ }; @@ -102,7 +104,7 @@ int fastcall queue_work(struct workqueue_struct *wq, struct work_struct *work) if (unlikely(is_single_threaded(wq))) cpu = 0; BUG_ON(!list_empty(&work->entry)); - __queue_work(wq->cpu_wq + cpu, work); + __queue_work(per_cpu_ptr(wq->cpu_wq, cpu), work); ret = 1; } put_cpu(); @@ -118,7 +120,7 @@ static void delayed_work_timer_fn(unsigned long __data) if (unlikely(is_single_threaded(wq))) cpu = 0; - __queue_work(wq->cpu_wq + cpu, work); + __queue_work(per_cpu_ptr(wq->cpu_wq, cpu), work); } int fastcall queue_delayed_work(struct workqueue_struct *wq, @@ -265,13 +267,13 @@ void fastcall flush_workqueue(struct workqueue_struct *wq) if (is_single_threaded(wq)) { /* Always use cpu 0's area. */ - flush_cpu_workqueue(wq->cpu_wq + 0); + flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, 0)); } else { int cpu; lock_cpu_hotplug(); for_each_online_cpu(cpu) - flush_cpu_workqueue(wq->cpu_wq + cpu); + flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, cpu)); unlock_cpu_hotplug(); } } @@ -279,7 +281,7 @@ void fastcall flush_workqueue(struct workqueue_struct *wq) static struct task_struct *create_workqueue_thread(struct workqueue_struct *wq, int cpu) { - struct cpu_workqueue_struct *cwq = wq->cpu_wq + cpu; + struct cpu_workqueue_struct *cwq = per_cpu_ptr(wq->cpu_wq, cpu); struct task_struct *p; spin_lock_init(&cwq->lock); @@ -312,6 +314,7 @@ struct workqueue_struct *__create_workqueue(const char *name, if (!wq) return NULL; + wq->cpu_wq = alloc_percpu(struct cpu_workqueue_struct); wq->name = name; /* We don't need the distraction of CPUs appearing and vanishing. */ lock_cpu_hotplug(); @@ -353,7 +356,7 @@ static void cleanup_workqueue_thread(struct workqueue_struct *wq, int cpu) unsigned long flags; struct task_struct *p; - cwq = wq->cpu_wq + cpu; + cwq = per_cpu_ptr(wq->cpu_wq, cpu); spin_lock_irqsave(&cwq->lock, flags); p = cwq->thread; cwq->thread = NULL; @@ -380,6 +383,7 @@ void destroy_workqueue(struct workqueue_struct *wq) spin_unlock(&workqueue_lock); } unlock_cpu_hotplug(); + free_percpu(wq->cpu_wq); kfree(wq); } @@ -458,7 +462,7 @@ int current_is_keventd(void) BUG_ON(!keventd_wq); - cwq = keventd_wq->cpu_wq + cpu; + cwq = per_cpu_ptr(keventd_wq->cpu_wq, cpu); if (current == cwq->thread) ret = 1; @@ -470,7 +474,7 @@ int current_is_keventd(void) /* Take the work from this (downed) CPU. */ static void take_over_work(struct workqueue_struct *wq, unsigned int cpu) { - struct cpu_workqueue_struct *cwq = wq->cpu_wq + cpu; + struct cpu_workqueue_struct *cwq = per_cpu_ptr(wq->cpu_wq, cpu); LIST_HEAD(list); struct work_struct *work; @@ -481,7 +485,7 @@ static void take_over_work(struct workqueue_struct *wq, unsigned int cpu) printk("Taking work for %s\n", wq->name); work = list_entry(list.next,struct work_struct,entry); list_del(&work->entry); - __queue_work(wq->cpu_wq + smp_processor_id(), work); + __queue_work(per_cpu_ptr(wq->cpu_wq, smp_processor_id()), work); } spin_unlock_irq(&cwq->lock); } @@ -508,15 +512,18 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, case CPU_ONLINE: /* Kick off worker threads. */ list_for_each_entry(wq, &workqueues, list) { - kthread_bind(wq->cpu_wq[hotcpu].thread, hotcpu); - wake_up_process(wq->cpu_wq[hotcpu].thread); + struct cpu_workqueue_struct *cwq; + + cwq = per_cpu_ptr(wq->cpu_wq, hotcpu); + kthread_bind(cwq->thread, hotcpu); + wake_up_process(cwq->thread); } break; case CPU_UP_CANCELED: list_for_each_entry(wq, &workqueues, list) { /* Unbind so it can run. */ - kthread_bind(wq->cpu_wq[hotcpu].thread, + kthread_bind(per_cpu_ptr(wq->cpu_wq, hotcpu)->thread, smp_processor_id()); cleanup_workqueue_thread(wq, hotcpu); } |