diff options
author | Russell King <rmk+kernel@arm.linux.org.uk> | 2012-01-13 15:00:22 +0000 |
---|---|---|
committer | Russell King <rmk+kernel@arm.linux.org.uk> | 2012-01-13 15:00:22 +0000 |
commit | 4de3a8e101150feaefa1139611a50ff37467f33e (patch) | |
tree | daada742542518b02d7db7c5d32e715eaa5f166d /kernel | |
parent | 294064f58953f9964e5945424b09c51800330a83 (diff) | |
parent | 099469502f62fbe0d7e4f0b83a2f22538367f734 (diff) | |
download | blackbird-op-linux-4de3a8e101150feaefa1139611a50ff37467f33e.tar.gz blackbird-op-linux-4de3a8e101150feaefa1139611a50ff37467f33e.zip |
Merge branch 'master' into fixes
Diffstat (limited to 'kernel')
45 files changed, 1085 insertions, 790 deletions
diff --git a/kernel/acct.c b/kernel/acct.c index 203dfead2e06..02e6167a53b0 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -84,11 +84,10 @@ static void do_acct_process(struct bsd_acct_struct *acct, * the cache line to have the data after getting the lock. */ struct bsd_acct_struct { - volatile int active; - volatile int needcheck; + int active; + unsigned long needcheck; struct file *file; struct pid_namespace *ns; - struct timer_list timer; struct list_head list; }; @@ -96,15 +95,6 @@ static DEFINE_SPINLOCK(acct_lock); static LIST_HEAD(acct_list); /* - * Called whenever the timer says to check the free space. - */ -static void acct_timeout(unsigned long x) -{ - struct bsd_acct_struct *acct = (struct bsd_acct_struct *)x; - acct->needcheck = 1; -} - -/* * Check the amount of free space and suspend/resume accordingly. */ static int check_free_space(struct bsd_acct_struct *acct, struct file *file) @@ -112,12 +102,12 @@ static int check_free_space(struct bsd_acct_struct *acct, struct file *file) struct kstatfs sbuf; int res; int act; - sector_t resume; - sector_t suspend; + u64 resume; + u64 suspend; spin_lock(&acct_lock); res = acct->active; - if (!file || !acct->needcheck) + if (!file || time_is_before_jiffies(acct->needcheck)) goto out; spin_unlock(&acct_lock); @@ -127,8 +117,8 @@ static int check_free_space(struct bsd_acct_struct *acct, struct file *file) suspend = sbuf.f_blocks * SUSPEND; resume = sbuf.f_blocks * RESUME; - sector_div(suspend, 100); - sector_div(resume, 100); + do_div(suspend, 100); + do_div(resume, 100); if (sbuf.f_bavail <= suspend) act = -1; @@ -160,10 +150,7 @@ static int check_free_space(struct bsd_acct_struct *acct, struct file *file) } } - del_timer(&acct->timer); - acct->needcheck = 0; - acct->timer.expires = jiffies + ACCT_TIMEOUT*HZ; - add_timer(&acct->timer); + acct->needcheck = jiffies + ACCT_TIMEOUT*HZ; res = acct->active; out: spin_unlock(&acct_lock); @@ -185,9 +172,7 @@ static void acct_file_reopen(struct bsd_acct_struct *acct, struct file *file, if (acct->file) { old_acct = acct->file; old_ns = acct->ns; - del_timer(&acct->timer); acct->active = 0; - acct->needcheck = 0; acct->file = NULL; acct->ns = NULL; list_del(&acct->list); @@ -195,13 +180,9 @@ static void acct_file_reopen(struct bsd_acct_struct *acct, struct file *file, if (file) { acct->file = file; acct->ns = ns; - acct->needcheck = 0; + acct->needcheck = jiffies + ACCT_TIMEOUT*HZ; acct->active = 1; list_add(&acct->list, &acct_list); - /* It's been deleted if it was used before so this is safe */ - setup_timer(&acct->timer, acct_timeout, (unsigned long)acct); - acct->timer.expires = jiffies + ACCT_TIMEOUT*HZ; - add_timer(&acct->timer); } if (old_acct) { mnt_unpin(old_acct->f_path.mnt); @@ -334,7 +315,7 @@ void acct_auto_close(struct super_block *sb) spin_lock(&acct_lock); restart: list_for_each_entry(acct, &acct_list, list) - if (acct->file && acct->file->f_path.mnt->mnt_sb == sb) { + if (acct->file && acct->file->f_path.dentry->d_sb == sb) { acct_file_reopen(acct, NULL, NULL); goto restart; } @@ -348,7 +329,6 @@ void acct_exit_ns(struct pid_namespace *ns) if (acct == NULL) return; - del_timer_sync(&acct->timer); spin_lock(&acct_lock); if (acct->file != NULL) acct_file_reopen(acct, NULL, NULL); @@ -498,7 +478,7 @@ static void do_acct_process(struct bsd_acct_struct *acct, * Fill the accounting struct with the needed info as recorded * by the different kernel functions. */ - memset((caddr_t)&ac, 0, sizeof(acct_t)); + memset(&ac, 0, sizeof(acct_t)); ac.ac_version = ACCT_VERSION | ACCT_BYTEORDER; strlcpy(ac.ac_comm, current->comm, sizeof(ac.ac_comm)); diff --git a/kernel/audit.c b/kernel/audit.c index 09fae2677a45..2c1d6ab7106e 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1260,12 +1260,13 @@ static void audit_log_vformat(struct audit_buffer *ab, const char *fmt, avail = audit_expand(ab, max_t(unsigned, AUDIT_BUFSIZ, 1+len-avail)); if (!avail) - goto out; + goto out_va_end; len = vsnprintf(skb_tail_pointer(skb), avail, fmt, args2); } - va_end(args2); if (len > 0) skb_put(skb, len); +out_va_end: + va_end(args2); out: return; } diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 47b7fc1ea893..e7fe2b0d29b3 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -210,12 +210,12 @@ struct audit_context { struct { uid_t uid; gid_t gid; - mode_t mode; + umode_t mode; u32 osid; int has_perm; uid_t perm_uid; gid_t perm_gid; - mode_t perm_mode; + umode_t perm_mode; unsigned long qbytes; } ipc; struct { @@ -234,7 +234,7 @@ struct audit_context { } mq_sendrecv; struct { int oflag; - mode_t mode; + umode_t mode; struct mq_attr attr; } mq_open; struct { @@ -308,7 +308,7 @@ static int audit_match_perm(struct audit_context *ctx, int mask) static int audit_match_filetype(struct audit_context *ctx, int which) { unsigned index = which & ~S_IFMT; - mode_t mode = which & S_IFMT; + umode_t mode = which & S_IFMT; if (unlikely(!ctx)) return 0; @@ -1249,7 +1249,7 @@ static void show_special(struct audit_context *context, int *call_panic) case AUDIT_IPC: { u32 osid = context->ipc.osid; - audit_log_format(ab, "ouid=%u ogid=%u mode=%#o", + audit_log_format(ab, "ouid=%u ogid=%u mode=%#ho", context->ipc.uid, context->ipc.gid, context->ipc.mode); if (osid) { char *ctx = NULL; @@ -1267,7 +1267,7 @@ static void show_special(struct audit_context *context, int *call_panic) ab = audit_log_start(context, GFP_KERNEL, AUDIT_IPC_SET_PERM); audit_log_format(ab, - "qbytes=%lx ouid=%u ogid=%u mode=%#o", + "qbytes=%lx ouid=%u ogid=%u mode=%#ho", context->ipc.qbytes, context->ipc.perm_uid, context->ipc.perm_gid, @@ -1278,7 +1278,7 @@ static void show_special(struct audit_context *context, int *call_panic) break; } case AUDIT_MQ_OPEN: { audit_log_format(ab, - "oflag=0x%x mode=%#o mq_flags=0x%lx mq_maxmsg=%ld " + "oflag=0x%x mode=%#ho mq_flags=0x%lx mq_maxmsg=%ld " "mq_msgsize=%ld mq_curmsgs=%ld", context->mq_open.oflag, context->mq_open.mode, context->mq_open.attr.mq_flags, @@ -1502,7 +1502,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts if (n->ino != (unsigned long)-1) { audit_log_format(ab, " inode=%lu" - " dev=%02x:%02x mode=%#o" + " dev=%02x:%02x mode=%#ho" " ouid=%u ogid=%u rdev=%02x:%02x", n->ino, MAJOR(n->dev), @@ -2160,7 +2160,7 @@ int audit_set_loginuid(struct task_struct *task, uid_t loginuid) * @attr: queue attributes * */ -void __audit_mq_open(int oflag, mode_t mode, struct mq_attr *attr) +void __audit_mq_open(int oflag, umode_t mode, struct mq_attr *attr) { struct audit_context *context = current->audit_context; @@ -2260,7 +2260,7 @@ void __audit_ipc_obj(struct kern_ipc_perm *ipcp) * * Called only after audit_ipc_obj(). */ -void __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode) +void __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, umode_t mode) { struct audit_context *context = current->audit_context; diff --git a/kernel/cgroup.c b/kernel/cgroup.c index a184470cf9b5..a5d3b5325f77 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -63,7 +63,24 @@ #include <linux/atomic.h> +/* + * cgroup_mutex is the master lock. Any modification to cgroup or its + * hierarchy must be performed while holding it. + * + * cgroup_root_mutex nests inside cgroup_mutex and should be held to modify + * cgroupfs_root of any cgroup hierarchy - subsys list, flags, + * release_agent_path and so on. Modifying requires both cgroup_mutex and + * cgroup_root_mutex. Readers can acquire either of the two. This is to + * break the following locking order cycle. + * + * A. cgroup_mutex -> cred_guard_mutex -> s_type->i_mutex_key -> namespace_sem + * B. namespace_sem -> cgroup_mutex + * + * B happens only through cgroup_show_options() and using cgroup_root_mutex + * breaks it. + */ static DEFINE_MUTEX(cgroup_mutex); +static DEFINE_MUTEX(cgroup_root_mutex); /* * Generate an array of cgroup subsystem pointers. At boot time, this is @@ -760,7 +777,7 @@ EXPORT_SYMBOL_GPL(cgroup_unlock); * -> cgroup_mkdir. */ -static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode); +static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode); static struct dentry *cgroup_lookup(struct inode *, struct dentry *, struct nameidata *); static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry); static int cgroup_populate_dir(struct cgroup *cgrp); @@ -775,7 +792,7 @@ static struct backing_dev_info cgroup_backing_dev_info = { static int alloc_css_id(struct cgroup_subsys *ss, struct cgroup *parent, struct cgroup *child); -static struct inode *cgroup_new_inode(mode_t mode, struct super_block *sb) +static struct inode *cgroup_new_inode(umode_t mode, struct super_block *sb) { struct inode *inode = new_inode(sb); @@ -921,7 +938,7 @@ static void cgroup_d_remove_dir(struct dentry *dentry) * * CGRP_WAIT_ON_RMDIR flag is set under cgroup's inode->i_mutex; */ -DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq); +static DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq); static void cgroup_wakeup_rmdir_waiter(struct cgroup *cgrp) { @@ -953,6 +970,7 @@ static int rebind_subsystems(struct cgroupfs_root *root, int i; BUG_ON(!mutex_is_locked(&cgroup_mutex)); + BUG_ON(!mutex_is_locked(&cgroup_root_mutex)); removed_bits = root->actual_subsys_bits & ~final_bits; added_bits = final_bits & ~root->actual_subsys_bits; @@ -1038,12 +1056,12 @@ static int rebind_subsystems(struct cgroupfs_root *root, return 0; } -static int cgroup_show_options(struct seq_file *seq, struct vfsmount *vfs) +static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry) { - struct cgroupfs_root *root = vfs->mnt_sb->s_fs_info; + struct cgroupfs_root *root = dentry->d_sb->s_fs_info; struct cgroup_subsys *ss; - mutex_lock(&cgroup_mutex); + mutex_lock(&cgroup_root_mutex); for_each_subsys(root, ss) seq_printf(seq, ",%s", ss->name); if (test_bit(ROOT_NOPREFIX, &root->flags)) @@ -1054,7 +1072,7 @@ static int cgroup_show_options(struct seq_file *seq, struct vfsmount *vfs) seq_puts(seq, ",clone_children"); if (strlen(root->name)) seq_printf(seq, ",name=%s", root->name); - mutex_unlock(&cgroup_mutex); + mutex_unlock(&cgroup_root_mutex); return 0; } @@ -1175,10 +1193,10 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) /* * If the 'all' option was specified select all the subsystems, - * otherwise 'all, 'none' and a subsystem name options were not - * specified, let's default to 'all' + * otherwise if 'none', 'name=' and a subsystem name options + * were not specified, let's default to 'all' */ - if (all_ss || (!all_ss && !one_ss && !opts->none)) { + if (all_ss || (!one_ss && !opts->none && !opts->name)) { for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { struct cgroup_subsys *ss = subsys[i]; if (ss == NULL) @@ -1269,6 +1287,7 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data) mutex_lock(&cgrp->dentry->d_inode->i_mutex); mutex_lock(&cgroup_mutex); + mutex_lock(&cgroup_root_mutex); /* See what subsystems are wanted */ ret = parse_cgroupfs_options(data, &opts); @@ -1297,6 +1316,7 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data) out_unlock: kfree(opts.release_agent); kfree(opts.name); + mutex_unlock(&cgroup_root_mutex); mutex_unlock(&cgroup_mutex); mutex_unlock(&cgrp->dentry->d_inode->i_mutex); return ret; @@ -1481,6 +1501,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, int ret = 0; struct super_block *sb; struct cgroupfs_root *new_root; + struct inode *inode; /* First find the desired set of subsystems */ mutex_lock(&cgroup_mutex); @@ -1514,7 +1535,6 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, /* We used the new root structure, so this is a new hierarchy */ struct list_head tmp_cg_links; struct cgroup *root_cgrp = &root->top_cgroup; - struct inode *inode; struct cgroupfs_root *existing_root; const struct cred *cred; int i; @@ -1528,18 +1548,14 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, mutex_lock(&inode->i_mutex); mutex_lock(&cgroup_mutex); + mutex_lock(&cgroup_root_mutex); - if (strlen(root->name)) { - /* Check for name clashes with existing mounts */ - for_each_active_root(existing_root) { - if (!strcmp(existing_root->name, root->name)) { - ret = -EBUSY; - mutex_unlock(&cgroup_mutex); - mutex_unlock(&inode->i_mutex); - goto drop_new_super; - } - } - } + /* Check for name clashes with existing mounts */ + ret = -EBUSY; + if (strlen(root->name)) + for_each_active_root(existing_root) + if (!strcmp(existing_root->name, root->name)) + goto unlock_drop; /* * We're accessing css_set_count without locking @@ -1549,18 +1565,13 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, * have some link structures left over */ ret = allocate_cg_links(css_set_count, &tmp_cg_links); - if (ret) { - mutex_unlock(&cgroup_mutex); - mutex_unlock(&inode->i_mutex); - goto drop_new_super; - } + if (ret) + goto unlock_drop; ret = rebind_subsystems(root, root->subsys_bits); if (ret == -EBUSY) { - mutex_unlock(&cgroup_mutex); - mutex_unlock(&inode->i_mutex); free_cg_links(&tmp_cg_links); - goto drop_new_super; + goto unlock_drop; } /* * There must be no failure case after here, since rebinding @@ -1599,6 +1610,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, cred = override_creds(&init_cred); cgroup_populate_dir(root_cgrp); revert_creds(cred); + mutex_unlock(&cgroup_root_mutex); mutex_unlock(&cgroup_mutex); mutex_unlock(&inode->i_mutex); } else { @@ -1615,6 +1627,10 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, kfree(opts.name); return dget(sb->s_root); + unlock_drop: + mutex_unlock(&cgroup_root_mutex); + mutex_unlock(&cgroup_mutex); + mutex_unlock(&inode->i_mutex); drop_new_super: deactivate_locked_super(sb); drop_modules: @@ -1639,6 +1655,7 @@ static void cgroup_kill_sb(struct super_block *sb) { BUG_ON(!list_empty(&cgrp->sibling)); mutex_lock(&cgroup_mutex); + mutex_lock(&cgroup_root_mutex); /* Rebind all subsystems back to the default hierarchy */ ret = rebind_subsystems(root, 0); @@ -1664,6 +1681,7 @@ static void cgroup_kill_sb(struct super_block *sb) { root_count--; } + mutex_unlock(&cgroup_root_mutex); mutex_unlock(&cgroup_mutex); kill_litter_super(sb); @@ -1740,11 +1758,90 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen) EXPORT_SYMBOL_GPL(cgroup_path); /* + * Control Group taskset + */ +struct task_and_cgroup { + struct task_struct *task; + struct cgroup *cgrp; +}; + +struct cgroup_taskset { + struct task_and_cgroup single; + struct flex_array *tc_array; + int tc_array_len; + int idx; + struct cgroup *cur_cgrp; +}; + +/** + * cgroup_taskset_first - reset taskset and return the first task + * @tset: taskset of interest + * + * @tset iteration is initialized and the first task is returned. + */ +struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset) +{ + if (tset->tc_array) { + tset->idx = 0; + return cgroup_taskset_next(tset); + } else { + tset->cur_cgrp = tset->single.cgrp; + return tset->single.task; + } +} +EXPORT_SYMBOL_GPL(cgroup_taskset_first); + +/** + * cgroup_taskset_next - iterate to the next task in taskset + * @tset: taskset of interest + * + * Return the next task in @tset. Iteration must have been initialized + * with cgroup_taskset_first(). + */ +struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset) +{ + struct task_and_cgroup *tc; + + if (!tset->tc_array || tset->idx >= tset->tc_array_len) + return NULL; + + tc = flex_array_get(tset->tc_array, tset->idx++); + tset->cur_cgrp = tc->cgrp; + return tc->task; +} +EXPORT_SYMBOL_GPL(cgroup_taskset_next); + +/** + * cgroup_taskset_cur_cgroup - return the matching cgroup for the current task + * @tset: taskset of interest + * + * Return the cgroup for the current (last returned) task of @tset. This + * function must be preceded by either cgroup_taskset_first() or + * cgroup_taskset_next(). + */ +struct cgroup *cgroup_taskset_cur_cgroup(struct cgroup_taskset *tset) +{ + return tset->cur_cgrp; +} +EXPORT_SYMBOL_GPL(cgroup_taskset_cur_cgroup); + +/** + * cgroup_taskset_size - return the number of tasks in taskset + * @tset: taskset of interest + */ +int cgroup_taskset_size(struct cgroup_taskset *tset) +{ + return tset->tc_array ? tset->tc_array_len : 1; +} +EXPORT_SYMBOL_GPL(cgroup_taskset_size); + + +/* * cgroup_task_migrate - move a task from one cgroup to another. * * 'guarantee' is set if the caller promises that a new css_set for the task * will already exist. If not set, this function might sleep, and can fail with - * -ENOMEM. Otherwise, it can only fail with -ESRCH. + * -ENOMEM. Must be called with cgroup_mutex and threadgroup locked. */ static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp, struct task_struct *tsk, bool guarantee) @@ -1753,14 +1850,12 @@ static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp, struct css_set *newcg; /* - * get old css_set. we need to take task_lock and refcount it, because - * an exiting task can change its css_set to init_css_set and drop its - * old one without taking cgroup_mutex. + * We are synchronized through threadgroup_lock() against PF_EXITING + * setting such that we can't race against cgroup_exit() changing the + * css_set to init_css_set and dropping the old one. */ - task_lock(tsk); + WARN_ON_ONCE(tsk->flags & PF_EXITING); oldcg = tsk->cgroups; - get_css_set(oldcg); - task_unlock(tsk); /* locate or allocate a new css_set for this task. */ if (guarantee) { @@ -1775,20 +1870,11 @@ static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp, might_sleep(); /* find_css_set will give us newcg already referenced. */ newcg = find_css_set(oldcg, cgrp); - if (!newcg) { - put_css_set(oldcg); + if (!newcg) return -ENOMEM; - } } - put_css_set(oldcg); - /* if PF_EXITING is set, the tsk->cgroups pointer is no longer safe. */ task_lock(tsk); - if (tsk->flags & PF_EXITING) { - task_unlock(tsk); - put_css_set(newcg); - return -ESRCH; - } rcu_assign_pointer(tsk->cgroups, newcg); task_unlock(tsk); @@ -1814,8 +1900,8 @@ static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp, * @cgrp: the cgroup the task is attaching to * @tsk: the task to be attached * - * Call holding cgroup_mutex. May take task_lock of - * the task 'tsk' during call. + * Call with cgroup_mutex and threadgroup locked. May take task_lock of + * @tsk during call. */ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) { @@ -1823,15 +1909,23 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) struct cgroup_subsys *ss, *failed_ss = NULL; struct cgroup *oldcgrp; struct cgroupfs_root *root = cgrp->root; + struct cgroup_taskset tset = { }; + + /* @tsk either already exited or can't exit until the end */ + if (tsk->flags & PF_EXITING) + return -ESRCH; /* Nothing to do if the task is already in that cgroup */ oldcgrp = task_cgroup_from_root(tsk, root); if (cgrp == oldcgrp) return 0; + tset.single.task = tsk; + tset.single.cgrp = oldcgrp; + for_each_subsys(root, ss) { if (ss->can_attach) { - retval = ss->can_attach(ss, cgrp, tsk); + retval = ss->can_attach(ss, cgrp, &tset); if (retval) { /* * Remember on which subsystem the can_attach() @@ -1843,13 +1937,6 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) goto out; } } - if (ss->can_attach_task) { - retval = ss->can_attach_task(cgrp, tsk); - if (retval) { - failed_ss = ss; - goto out; - } - } } retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, false); @@ -1857,12 +1944,8 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) goto out; for_each_subsys(root, ss) { - if (ss->pre_attach) - ss->pre_attach(cgrp); - if (ss->attach_task) - ss->attach_task(cgrp, tsk); if (ss->attach) - ss->attach(ss, cgrp, oldcgrp, tsk); + ss->attach(ss, cgrp, &tset); } synchronize_rcu(); @@ -1884,7 +1967,7 @@ out: */ break; if (ss->cancel_attach) - ss->cancel_attach(ss, cgrp, tsk); + ss->cancel_attach(ss, cgrp, &tset); } } return retval; @@ -1935,23 +2018,17 @@ static bool css_set_check_fetched(struct cgroup *cgrp, read_lock(&css_set_lock); newcg = find_existing_css_set(cg, cgrp, template); - if (newcg) - get_css_set(newcg); read_unlock(&css_set_lock); /* doesn't exist at all? */ if (!newcg) return false; /* see if it's already in the list */ - list_for_each_entry(cg_entry, newcg_list, links) { - if (cg_entry->cg == newcg) { - put_css_set(newcg); + list_for_each_entry(cg_entry, newcg_list, links) + if (cg_entry->cg == newcg) return true; - } - } /* not found */ - put_css_set(newcg); return false; } @@ -1985,21 +2062,21 @@ static int css_set_prefetch(struct cgroup *cgrp, struct css_set *cg, * @cgrp: the cgroup to attach to * @leader: the threadgroup leader task_struct of the group to be attached * - * Call holding cgroup_mutex and the threadgroup_fork_lock of the leader. Will - * take task_lock of each thread in leader's threadgroup individually in turn. + * Call holding cgroup_mutex and the group_rwsem of the leader. Will take + * task_lock of each thread in leader's threadgroup individually in turn. */ -int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) +static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) { int retval, i, group_size; struct cgroup_subsys *ss, *failed_ss = NULL; - bool cancel_failed_ss = false; /* guaranteed to be initialized later, but the compiler needs this */ - struct cgroup *oldcgrp = NULL; struct css_set *oldcg; struct cgroupfs_root *root = cgrp->root; /* threadgroup list cursor and array */ struct task_struct *tsk; + struct task_and_cgroup *tc; struct flex_array *group; + struct cgroup_taskset tset = { }; /* * we need to make sure we have css_sets for all the tasks we're * going to move -before- we actually start moving them, so that in @@ -2012,13 +2089,12 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) * step 0: in order to do expensive, possibly blocking operations for * every thread, we cannot iterate the thread group list, since it needs * rcu or tasklist locked. instead, build an array of all threads in the - * group - threadgroup_fork_lock prevents new threads from appearing, - * and if threads exit, this will just be an over-estimate. + * group - group_rwsem prevents new threads from appearing, and if + * threads exit, this will just be an over-estimate. */ group_size = get_nr_threads(leader); /* flex_array supports very large thread-groups better than kmalloc. */ - group = flex_array_alloc(sizeof(struct task_struct *), group_size, - GFP_KERNEL); + group = flex_array_alloc(sizeof(*tc), group_size, GFP_KERNEL); if (!group) return -ENOMEM; /* pre-allocate to guarantee space while iterating in rcu read-side. */ @@ -2040,49 +2116,53 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) retval = -EAGAIN; goto out_free_group_list; } - /* take a reference on each task in the group to go in the array. */ + tsk = leader; i = 0; do { + struct task_and_cgroup ent; + + /* @tsk either already exited or can't exit until the end */ + if (tsk->flags & PF_EXITING) + continue; + /* as per above, nr_threads may decrease, but not increase. */ BUG_ON(i >= group_size); - get_task_struct(tsk); /* * saying GFP_ATOMIC has no effect here because we did prealloc * earlier, but it's good form to communicate our expectations. */ - retval = flex_array_put_ptr(group, i, tsk, GFP_ATOMIC); + ent.task = tsk; + ent.cgrp = task_cgroup_from_root(tsk, root); + /* nothing to do if this task is already in the cgroup */ + if (ent.cgrp == cgrp) + continue; + retval = flex_array_put(group, i, &ent, GFP_ATOMIC); BUG_ON(retval != 0); i++; } while_each_thread(leader, tsk); /* remember the number of threads in the array for later. */ group_size = i; + tset.tc_array = group; + tset.tc_array_len = group_size; read_unlock(&tasklist_lock); + /* methods shouldn't be called if no task is actually migrating */ + retval = 0; + if (!group_size) + goto out_free_group_list; + /* * step 1: check that we can legitimately attach to the cgroup. */ for_each_subsys(root, ss) { if (ss->can_attach) { - retval = ss->can_attach(ss, cgrp, leader); + retval = ss->can_attach(ss, cgrp, &tset); if (retval) { failed_ss = ss; goto out_cancel_attach; } } - /* a callback to be run on every thread in the threadgroup. */ - if (ss->can_attach_task) { - /* run on each task in the threadgroup. */ - for (i = 0; i < group_size; i++) { - tsk = flex_array_get_ptr(group, i); - retval = ss->can_attach_task(cgrp, tsk); - if (retval) { - failed_ss = ss; - cancel_failed_ss = true; - goto out_cancel_attach; - } - } - } } /* @@ -2091,67 +2171,36 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) */ INIT_LIST_HEAD(&newcg_list); for (i = 0; i < group_size; i++) { - tsk = flex_array_get_ptr(group, i); - /* nothing to do if this task is already in the cgroup */ - oldcgrp = task_cgroup_from_root(tsk, root); - if (cgrp == oldcgrp) - continue; - /* get old css_set pointer */ - task_lock(tsk); - oldcg = tsk->cgroups; - get_css_set(oldcg); - task_unlock(tsk); - /* see if the new one for us is already in the list? */ - if (css_set_check_fetched(cgrp, tsk, oldcg, &newcg_list)) { - /* was already there, nothing to do. */ - put_css_set(oldcg); - } else { - /* we don't already have it. get new one. */ + tc = flex_array_get(group, i); + oldcg = tc->task->cgroups; + + /* if we don't already have it in the list get a new one */ + if (!css_set_check_fetched(cgrp, tc->task, oldcg, + &newcg_list)) { retval = css_set_prefetch(cgrp, oldcg, &newcg_list); - put_css_set(oldcg); if (retval) goto out_list_teardown; } } /* - * step 3: now that we're guaranteed success wrt the css_sets, proceed - * to move all tasks to the new cgroup, calling ss->attach_task for each - * one along the way. there are no failure cases after here, so this is - * the commit point. + * step 3: now that we're guaranteed success wrt the css_sets, + * proceed to move all tasks to the new cgroup. There are no + * failure cases after here, so this is the commit point. */ - for_each_subsys(root, ss) { - if (ss->pre_attach) - ss->pre_attach(cgrp); - } for (i = 0; i < group_size; i++) { - tsk = flex_array_get_ptr(group, i); - /* leave current thread as it is if it's already there */ - oldcgrp = task_cgroup_from_root(tsk, root); - if (cgrp == oldcgrp) - continue; - /* if the thread is PF_EXITING, it can just get skipped. */ - retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, true); - if (retval == 0) { - /* attach each task to each subsystem */ - for_each_subsys(root, ss) { - if (ss->attach_task) - ss->attach_task(cgrp, tsk); - } - } else { - BUG_ON(retval != -ESRCH); - } + tc = flex_array_get(group, i); + retval = cgroup_task_migrate(cgrp, tc->cgrp, tc->task, true); + BUG_ON(retval); } /* nothing is sensitive to fork() after this point. */ /* - * step 4: do expensive, non-thread-specific subsystem callbacks. - * TODO: if ever a subsystem needs to know the oldcgrp for each task - * being moved, this call will need to be reworked to communicate that. + * step 4: do subsystem attach callbacks. */ for_each_subsys(root, ss) { if (ss->attach) - ss->attach(ss, cgrp, oldcgrp, leader); + ss->attach(ss, cgrp, &tset); } /* @@ -2171,20 +2220,12 @@ out_cancel_attach: /* same deal as in cgroup_attach_task */ if (retval) { for_each_subsys(root, ss) { - if (ss == failed_ss) { - if (cancel_failed_ss && ss->cancel_attach) - ss->cancel_attach(ss, cgrp, leader); + if (ss == failed_ss) break; - } if (ss->cancel_attach) - ss->cancel_attach(ss, cgrp, leader); + ss->cancel_attach(ss, cgrp, &tset); } } - /* clean up the array of referenced threads in the group. */ - for (i = 0; i < group_size; i++) { - tsk = flex_array_get_ptr(group, i); - put_task_struct(tsk); - } out_free_group_list: flex_array_free(group); return retval; @@ -2192,8 +2233,8 @@ out_free_group_list: /* * Find the task_struct of the task to attach by vpid and pass it along to the - * function to attach either it or all tasks in its threadgroup. Will take - * cgroup_mutex; may take task_lock of task. + * function to attach either it or all tasks in its threadgroup. Will lock + * cgroup_mutex and threadgroup; may take task_lock of task. */ static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup) { @@ -2220,13 +2261,7 @@ static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup) * detect it later. */ tsk = tsk->group_leader; - } else if (tsk->flags & PF_EXITING) { - /* optimization for the single-task-only case */ - rcu_read_unlock(); - cgroup_unlock(); - return -ESRCH; } - /* * even if we're attaching all tasks in the thread group, we * only need to check permissions on one of them. @@ -2249,13 +2284,15 @@ static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup) get_task_struct(tsk); } - if (threadgroup) { - threadgroup_fork_write_lock(tsk); + threadgroup_lock(tsk); + + if (threadgroup) ret = cgroup_attach_proc(cgrp, tsk); - threadgroup_fork_write_unlock(tsk); - } else { + else ret = cgroup_attach_task(cgrp, tsk); - } + + threadgroup_unlock(tsk); + put_task_struct(tsk); cgroup_unlock(); return ret; @@ -2306,7 +2343,9 @@ static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft, return -EINVAL; if (!cgroup_lock_live_group(cgrp)) return -ENODEV; + mutex_lock(&cgroup_root_mutex); strcpy(cgrp->root->release_agent_path, buffer); + mutex_unlock(&cgroup_root_mutex); cgroup_unlock(); return 0; } @@ -2585,7 +2624,7 @@ static inline struct cftype *__file_cft(struct file *file) return __d_cft(file->f_dentry); } -static int cgroup_create_file(struct dentry *dentry, mode_t mode, +static int cgroup_create_file(struct dentry *dentry, umode_t mode, struct super_block *sb) { struct inode *inode; @@ -2626,7 +2665,7 @@ static int cgroup_create_file(struct dentry *dentry, mode_t mode, * @mode: mode to set on new directory. */ static int cgroup_create_dir(struct cgroup *cgrp, struct dentry *dentry, - mode_t mode) + umode_t mode) { struct dentry *parent; int error = 0; @@ -2653,9 +2692,9 @@ static int cgroup_create_dir(struct cgroup *cgrp, struct dentry *dentry, * returns S_IRUGO if it has only a read handler * returns S_IWUSR if it has only a write hander */ -static mode_t cgroup_file_mode(const struct cftype *cft) +static umode_t cgroup_file_mode(const struct cftype *cft) { - mode_t mode = 0; + umode_t mode = 0; if (cft->mode) return cft->mode; @@ -2678,7 +2717,7 @@ int cgroup_add_file(struct cgroup *cgrp, struct dentry *dir = cgrp->dentry; struct dentry *dentry; int error; - mode_t mode; + umode_t mode; char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 }; if (subsys && !test_bit(ROOT_NOPREFIX, &cgrp->root->flags)) { @@ -2789,6 +2828,7 @@ static void cgroup_enable_task_cg_lists(void) } void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it) + __acquires(css_set_lock) { /* * The first time anyone tries to iterate across a cgroup, @@ -2828,6 +2868,7 @@ struct task_struct *cgroup_iter_next(struct cgroup *cgrp, } void cgroup_iter_end(struct cgroup *cgrp, struct cgroup_iter *it) + __releases(css_set_lock) { read_unlock(&css_set_lock); } @@ -3752,7 +3793,7 @@ static void cgroup_unlock_hierarchy(struct cgroupfs_root *root) * Must be called with the mutex on the parent inode held */ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, - mode_t mode) + umode_t mode) { struct cgroup *cgrp; struct cgroupfs_root *root = parent->root; @@ -3846,7 +3887,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, return err; } -static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode) +static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) { struct cgroup *c_parent = dentry->d_parent->d_fsdata; @@ -4491,20 +4532,31 @@ static const struct file_operations proc_cgroupstats_operations = { * * A pointer to the shared css_set was automatically copied in * fork.c by dup_task_struct(). However, we ignore that copy, since - * it was not made under the protection of RCU or cgroup_mutex, so - * might no longer be a valid cgroup pointer. cgroup_attach_task() might - * have already changed current->cgroups, allowing the previously - * referenced cgroup group to be removed and freed. + * it was not made under the protection of RCU, cgroup_mutex or + * threadgroup_change_begin(), so it might no longer be a valid + * cgroup pointer. cgroup_attach_task() might have already changed + * current->cgroups, allowing the previously referenced cgroup + * group to be removed and freed. + * + * Outside the pointer validity we also need to process the css_set + * inheritance between threadgoup_change_begin() and + * threadgoup_change_end(), this way there is no leak in any process + * wide migration performed by cgroup_attach_proc() that could otherwise + * miss a thread because it is too early or too late in the fork stage. * * At the point that cgroup_fork() is called, 'current' is the parent * task, and the passed argument 'child' points to the child task. */ void cgroup_fork(struct task_struct *child) { - task_lock(current); + /* + * We don't need to task_lock() current because current->cgroups + * can't be changed concurrently here. The parent obviously hasn't + * exited and called cgroup_exit(), and we are synchronized against + * cgroup migration through threadgroup_change_begin(). + */ child->cgroups = current->cgroups; get_css_set(child->cgroups); - task_unlock(current); INIT_LIST_HEAD(&child->cg_list); } @@ -4546,10 +4598,19 @@ void cgroup_post_fork(struct task_struct *child) { if (use_task_css_set_links) { write_lock(&css_set_lock); - task_lock(child); - if (list_empty(&child->cg_list)) + if (list_empty(&child->cg_list)) { + /* + * It's safe to use child->cgroups without task_lock() + * here because we are protected through + * threadgroup_change_begin() against concurrent + * css_set change in cgroup_task_migrate(). Also + * the task can't exit at that point until + * wake_up_new_task() is called, so we are protected + * against cgroup_exit() setting child->cgroup to + * init_css_set. + */ list_add(&child->cg_list, &child->cgroups->tasks); - task_unlock(child); + } write_unlock(&css_set_lock); } } diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index 213c0351dad8..fc0646b78a64 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -48,19 +48,17 @@ static inline struct freezer *task_freezer(struct task_struct *task) struct freezer, css); } -static inline int __cgroup_freezing_or_frozen(struct task_struct *task) +bool cgroup_freezing(struct task_struct *task) { - enum freezer_state state = task_freezer(task)->state; - return (state == CGROUP_FREEZING) || (state == CGROUP_FROZEN); -} + enum freezer_state state; + bool ret; -int cgroup_freezing_or_frozen(struct task_struct *task) -{ - int result; - task_lock(task); - result = __cgroup_freezing_or_frozen(task); - task_unlock(task); - return result; + rcu_read_lock(); + state = task_freezer(task)->state; + ret = state == CGROUP_FREEZING || state == CGROUP_FROZEN; + rcu_read_unlock(); + + return ret; } /* @@ -102,9 +100,6 @@ struct cgroup_subsys freezer_subsys; * freezer_can_attach(): * cgroup_mutex (held by caller of can_attach) * - * cgroup_freezing_or_frozen(): - * task->alloc_lock (to get task's cgroup) - * * freezer_fork() (preserving fork() performance means can't take cgroup_mutex): * freezer->lock * sighand->siglock (if the cgroup is freezing) @@ -130,7 +125,7 @@ struct cgroup_subsys freezer_subsys; * write_lock css_set_lock (cgroup iterator start) * task->alloc_lock * read_lock css_set_lock (cgroup iterator start) - * task->alloc_lock (inside thaw_process(), prevents race with refrigerator()) + * task->alloc_lock (inside __thaw_task(), prevents race with refrigerator()) * sighand->siglock */ static struct cgroup_subsys_state *freezer_create(struct cgroup_subsys *ss, @@ -150,7 +145,11 @@ static struct cgroup_subsys_state *freezer_create(struct cgroup_subsys *ss, static void freezer_destroy(struct cgroup_subsys *ss, struct cgroup *cgroup) { - kfree(cgroup_freezer(cgroup)); + struct freezer *freezer = cgroup_freezer(cgroup); + + if (freezer->state != CGROUP_THAWED) + atomic_dec(&system_freezing_cnt); + kfree(freezer); } /* task is frozen or will freeze immediately when next it gets woken */ @@ -167,13 +166,17 @@ static bool is_task_frozen_enough(struct task_struct *task) */ static int freezer_can_attach(struct cgroup_subsys *ss, struct cgroup *new_cgroup, - struct task_struct *task) + struct cgroup_taskset *tset) { struct freezer *freezer; + struct task_struct *task; /* * Anything frozen can't move or be moved to/from. */ + cgroup_taskset_for_each(task, new_cgroup, tset) + if (cgroup_freezing(task)) + return -EBUSY; freezer = cgroup_freezer(new_cgroup); if (freezer->state != CGROUP_THAWED) @@ -182,17 +185,6 @@ static int freezer_can_attach(struct cgroup_subsys *ss, return 0; } -static int freezer_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk) -{ - rcu_read_lock(); - if (__cgroup_freezing_or_frozen(tsk)) { - rcu_read_unlock(); - return -EBUSY; - } - rcu_read_unlock(); - return 0; -} - static void freezer_fork(struct cgroup_subsys *ss, struct task_struct *task) { struct freezer *freezer; @@ -220,7 +212,7 @@ static void freezer_fork(struct cgroup_subsys *ss, struct task_struct *task) /* Locking avoids race with FREEZING -> THAWED transitions. */ if (freezer->state == CGROUP_FREEZING) - freeze_task(task, true); + freeze_task(task); spin_unlock_irq(&freezer->lock); } @@ -238,7 +230,7 @@ static void update_if_frozen(struct cgroup *cgroup, cgroup_iter_start(cgroup, &it); while ((task = cgroup_iter_next(cgroup, &it))) { ntotal++; - if (is_task_frozen_enough(task)) + if (freezing(task) && is_task_frozen_enough(task)) nfrozen++; } @@ -286,10 +278,9 @@ static int try_to_freeze_cgroup(struct cgroup *cgroup, struct freezer *freezer) struct task_struct *task; unsigned int num_cant_freeze_now = 0; - freezer->state = CGROUP_FREEZING; cgroup_iter_start(cgroup, &it); while ((task = cgroup_iter_next(cgroup, &it))) { - if (!freeze_task(task, true)) + if (!freeze_task(task)) continue; if (is_task_frozen_enough(task)) continue; @@ -307,12 +298,9 @@ static void unfreeze_cgroup(struct cgroup *cgroup, struct freezer *freezer) struct task_struct *task; cgroup_iter_start(cgroup, &it); - while ((task = cgroup_iter_next(cgroup, &it))) { - thaw_process(task); - } + while ((task = cgroup_iter_next(cgroup, &it))) + __thaw_task(task); cgroup_iter_end(cgroup, &it); - - freezer->state = CGROUP_THAWED; } static int freezer_change_state(struct cgroup *cgroup, @@ -326,20 +314,24 @@ static int freezer_change_state(struct cgroup *cgroup, spin_lock_irq(&freezer->lock); update_if_frozen(cgroup, freezer); - if (goal_state == freezer->state) - goto out; switch (goal_state) { case CGROUP_THAWED: + if (freezer->state != CGROUP_THAWED) + atomic_dec(&system_freezing_cnt); + freezer->state = CGROUP_THAWED; unfreeze_cgroup(cgroup, freezer); break; case CGROUP_FROZEN: + if (freezer->state == CGROUP_THAWED) + atomic_inc(&system_freezing_cnt); + freezer->state = CGROUP_FREEZING; retval = try_to_freeze_cgroup(cgroup, freezer); break; default: BUG(); } -out: + spin_unlock_irq(&freezer->lock); return retval; @@ -388,10 +380,5 @@ struct cgroup_subsys freezer_subsys = { .populate = freezer_populate, .subsys_id = freezer_subsys_id, .can_attach = freezer_can_attach, - .can_attach_task = freezer_can_attach_task, - .pre_attach = NULL, - .attach_task = NULL, - .attach = NULL, .fork = freezer_fork, - .exit = NULL, }; diff --git a/kernel/cpu.c b/kernel/cpu.c index 5ca38d5d238a..2060c6e57027 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -470,7 +470,7 @@ out: cpu_maps_update_done(); } -static int alloc_frozen_cpus(void) +static int __init alloc_frozen_cpus(void) { if (!alloc_cpumask_var(&frozen_cpus, GFP_KERNEL|__GFP_ZERO)) return -ENOMEM; @@ -543,7 +543,7 @@ cpu_hotplug_pm_callback(struct notifier_block *nb, } -int cpu_hotplug_pm_sync_init(void) +static int __init cpu_hotplug_pm_sync_init(void) { pm_notifier(cpu_hotplug_pm_callback, 0); return 0; diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 0b1712dba587..a09ac2b9a661 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1389,79 +1389,73 @@ static int fmeter_getrate(struct fmeter *fmp) return val; } -/* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */ -static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cont, - struct task_struct *tsk) -{ - struct cpuset *cs = cgroup_cs(cont); - - if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)) - return -ENOSPC; - - /* - * Kthreads bound to specific cpus cannot be moved to a new cpuset; we - * cannot change their cpu affinity and isolating such threads by their - * set of allowed nodes is unnecessary. Thus, cpusets are not - * applicable for such threads. This prevents checking for success of - * set_cpus_allowed_ptr() on all attached tasks before cpus_allowed may - * be changed. - */ - if (tsk->flags & PF_THREAD_BOUND) - return -EINVAL; - - return 0; -} - -static int cpuset_can_attach_task(struct cgroup *cgrp, struct task_struct *task) -{ - return security_task_setscheduler(task); -} - /* * Protected by cgroup_lock. The nodemasks must be stored globally because - * dynamically allocating them is not allowed in pre_attach, and they must - * persist among pre_attach, attach_task, and attach. + * dynamically allocating them is not allowed in can_attach, and they must + * persist until attach. */ static cpumask_var_t cpus_attach; static nodemask_t cpuset_attach_nodemask_from; static nodemask_t cpuset_attach_nodemask_to; -/* Set-up work for before attaching each task. */ -static void cpuset_pre_attach(struct cgroup *cont) +/* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */ +static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, + struct cgroup_taskset *tset) { - struct cpuset *cs = cgroup_cs(cont); + struct cpuset *cs = cgroup_cs(cgrp); + struct task_struct *task; + int ret; + + if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)) + return -ENOSPC; + + cgroup_taskset_for_each(task, cgrp, tset) { + /* + * Kthreads bound to specific cpus cannot be moved to a new + * cpuset; we cannot change their cpu affinity and + * isolating such threads by their set of allowed nodes is + * unnecessary. Thus, cpusets are not applicable for such + * threads. This prevents checking for success of + * set_cpus_allowed_ptr() on all attached tasks before + * cpus_allowed may be changed. + */ + if (task->flags & PF_THREAD_BOUND) + return -EINVAL; + if ((ret = security_task_setscheduler(task))) + return ret; + } + /* prepare for attach */ if (cs == &top_cpuset) cpumask_copy(cpus_attach, cpu_possible_mask); else guarantee_online_cpus(cs, cpus_attach); guarantee_online_mems(cs, &cpuset_attach_nodemask_to); -} - -/* Per-thread attachment work. */ -static void cpuset_attach_task(struct cgroup *cont, struct task_struct *tsk) -{ - int err; - struct cpuset *cs = cgroup_cs(cont); - /* - * can_attach beforehand should guarantee that this doesn't fail. - * TODO: have a better way to handle failure here - */ - err = set_cpus_allowed_ptr(tsk, cpus_attach); - WARN_ON_ONCE(err); - - cpuset_change_task_nodemask(tsk, &cpuset_attach_nodemask_to); - cpuset_update_task_spread_flag(cs, tsk); + return 0; } -static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cont, - struct cgroup *oldcont, struct task_struct *tsk) +static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, + struct cgroup_taskset *tset) { struct mm_struct *mm; - struct cpuset *cs = cgroup_cs(cont); - struct cpuset *oldcs = cgroup_cs(oldcont); + struct task_struct *task; + struct task_struct *leader = cgroup_taskset_first(tset); + struct cgroup *oldcgrp = cgroup_taskset_cur_cgroup(tset); + struct cpuset *cs = cgroup_cs(cgrp); + struct cpuset *oldcs = cgroup_cs(oldcgrp); + + cgroup_taskset_for_each(task, cgrp, tset) { + /* + * can_attach beforehand should guarantee that this doesn't + * fail. TODO: have a better way to handle failure here + */ + WARN_ON_ONCE(set_cpus_allowed_ptr(task, cpus_attach)); + + cpuset_change_task_nodemask(task, &cpuset_attach_nodemask_to); + cpuset_update_task_spread_flag(cs, task); + } /* * Change mm, possibly for multiple threads in a threadgroup. This is @@ -1469,7 +1463,7 @@ static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cont, */ cpuset_attach_nodemask_from = oldcs->mems_allowed; cpuset_attach_nodemask_to = cs->mems_allowed; - mm = get_task_mm(tsk); + mm = get_task_mm(leader); if (mm) { mpol_rebind_mm(mm, &cpuset_attach_nodemask_to); if (is_memory_migrate(cs)) @@ -1925,9 +1919,6 @@ struct cgroup_subsys cpuset_subsys = { .create = cpuset_create, .destroy = cpuset_destroy, .can_attach = cpuset_can_attach, - .can_attach_task = cpuset_can_attach_task, - .pre_attach = cpuset_pre_attach, - .attach_task = cpuset_attach_task, .attach = cpuset_attach, .populate = cpuset_populate, .post_clone = cpuset_post_clone, diff --git a/kernel/events/core.c b/kernel/events/core.c index 890eb02c2f21..a8f4ac001a00 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -4,7 +4,7 @@ * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de> * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> - * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com> + * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com> * * For licensing details see kernel-base/COPYING */ @@ -6941,10 +6941,13 @@ static int __perf_cgroup_move(void *info) return 0; } -static void -perf_cgroup_attach_task(struct cgroup *cgrp, struct task_struct *task) +static void perf_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, + struct cgroup_taskset *tset) { - task_function_call(task, __perf_cgroup_move, task); + struct task_struct *task; + + cgroup_taskset_for_each(task, cgrp, tset) + task_function_call(task, __perf_cgroup_move, task); } static void perf_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp, @@ -6958,7 +6961,7 @@ static void perf_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp, if (!(task->flags & PF_EXITING)) return; - perf_cgroup_attach_task(cgrp, task); + task_function_call(task, __perf_cgroup_move, task); } struct cgroup_subsys perf_subsys = { @@ -6967,6 +6970,6 @@ struct cgroup_subsys perf_subsys = { .create = perf_cgroup_create, .destroy = perf_cgroup_destroy, .exit = perf_cgroup_exit, - .attach_task = perf_cgroup_attach_task, + .attach = perf_cgroup_attach, }; #endif /* CONFIG_CGROUP_PERF */ diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index 7f3011c6b57f..6ddaba43fb7a 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -4,7 +4,7 @@ * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de> * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> - * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com> + * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com> * * For licensing details see kernel-base/COPYING */ diff --git a/kernel/exit.c b/kernel/exit.c index d579a459309d..c44738267be7 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -51,6 +51,7 @@ #include <trace/events/sched.h> #include <linux/hw_breakpoint.h> #include <linux/oom.h> +#include <linux/writeback.h> #include <asm/uaccess.h> #include <asm/unistd.h> @@ -679,8 +680,6 @@ static void exit_mm(struct task_struct * tsk) tsk->mm = NULL; up_read(&mm->mmap_sem); enter_lazy_tlb(mm, current); - /* We don't want this task to be frozen prematurely */ - clear_freeze_flag(tsk); task_unlock(tsk); mm_update_next_owner(mm); mmput(mm); @@ -888,7 +887,7 @@ static void check_stack_usage(void) static inline void check_stack_usage(void) {} #endif -NORET_TYPE void do_exit(long code) +void do_exit(long code) { struct task_struct *tsk = current; int group_dead; @@ -1037,9 +1036,12 @@ NORET_TYPE void do_exit(long code) validate_creds_for_do_exit(tsk); preempt_disable(); + if (tsk->nr_dirtied) + __this_cpu_add(dirty_throttle_leaks, tsk->nr_dirtied); exit_rcu(); /* causes final put_task_struct in finish_task_switch(). */ tsk->state = TASK_DEAD; + tsk->flags |= PF_NOFREEZE; /* tell freezer to ignore us */ schedule(); BUG(); /* Avoid "noreturn function does return". */ @@ -1049,7 +1051,7 @@ NORET_TYPE void do_exit(long code) EXPORT_SYMBOL_GPL(do_exit); -NORET_TYPE void complete_and_exit(struct completion *comp, long code) +void complete_and_exit(struct completion *comp, long code) { if (comp) complete(comp); @@ -1068,7 +1070,7 @@ SYSCALL_DEFINE1(exit, int, error_code) * Take down every thread in the group. This is called by fatal signals * as well as by sys_exit_group (below). */ -NORET_TYPE void +void do_group_exit(int exit_code) { struct signal_struct *sig = current->signal; diff --git a/kernel/fork.c b/kernel/fork.c index b058c5820ecd..443f5125f11e 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -76,6 +76,9 @@ #include <trace/events/sched.h> +#define CREATE_TRACE_POINTS +#include <trace/events/task.h> + /* * Protected counters by write_lock_irq(&tasklist_lock) */ @@ -972,7 +975,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) sched_autogroup_fork(sig); #ifdef CONFIG_CGROUPS - init_rwsem(&sig->threadgroup_fork_lock); + init_rwsem(&sig->group_rwsem); #endif sig->oom_adj = current->signal->oom_adj; @@ -992,7 +995,6 @@ static void copy_flags(unsigned long clone_flags, struct task_struct *p) new_flags |= PF_FORKNOEXEC; new_flags |= PF_STARTING; p->flags = new_flags; - clear_freeze_flag(p); } SYSCALL_DEFINE1(set_tid_address, int __user *, tidptr) @@ -1154,7 +1156,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->io_context = NULL; p->audit_context = NULL; if (clone_flags & CLONE_THREAD) - threadgroup_fork_read_lock(current); + threadgroup_change_begin(current); cgroup_fork(p); #ifdef CONFIG_NUMA p->mempolicy = mpol_dup(p->mempolicy); @@ -1292,6 +1294,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->nr_dirtied = 0; p->nr_dirtied_pause = 128 >> (PAGE_SHIFT - 10); + p->dirty_paused_when = 0; /* * Ok, make it visible to the rest of the system. @@ -1369,8 +1372,11 @@ static struct task_struct *copy_process(unsigned long clone_flags, proc_fork_connector(p); cgroup_post_fork(p); if (clone_flags & CLONE_THREAD) - threadgroup_fork_read_unlock(current); + threadgroup_change_end(current); perf_event_fork(p); + + trace_task_newtask(p, clone_flags); + return p; bad_fork_free_pid: @@ -1404,7 +1410,7 @@ bad_fork_cleanup_policy: bad_fork_cleanup_cgroup: #endif if (clone_flags & CLONE_THREAD) - threadgroup_fork_read_unlock(current); + threadgroup_change_end(current); cgroup_exit(p, cgroup_callbacks_done); delayacct_tsk_free(p); module_put(task_thread_info(p)->exec_domain->module); diff --git a/kernel/freezer.c b/kernel/freezer.c index 7be56c534397..9815b8d1eed5 100644 --- a/kernel/freezer.c +++ b/kernel/freezer.c @@ -9,101 +9,114 @@ #include <linux/export.h> #include <linux/syscalls.h> #include <linux/freezer.h> +#include <linux/kthread.h> -/* - * freezing is complete, mark current process as frozen +/* total number of freezing conditions in effect */ +atomic_t system_freezing_cnt = ATOMIC_INIT(0); +EXPORT_SYMBOL(system_freezing_cnt); + +/* indicate whether PM freezing is in effect, protected by pm_mutex */ +bool pm_freezing; +bool pm_nosig_freezing; + +/* protects freezing and frozen transitions */ +static DEFINE_SPINLOCK(freezer_lock); + +/** + * freezing_slow_path - slow path for testing whether a task needs to be frozen + * @p: task to be tested + * + * This function is called by freezing() if system_freezing_cnt isn't zero + * and tests whether @p needs to enter and stay in frozen state. Can be + * called under any context. The freezers are responsible for ensuring the + * target tasks see the updated state. */ -static inline void frozen_process(void) +bool freezing_slow_path(struct task_struct *p) { - if (!unlikely(current->flags & PF_NOFREEZE)) { - current->flags |= PF_FROZEN; - smp_wmb(); - } - clear_freeze_flag(current); + if (p->flags & PF_NOFREEZE) + return false; + + if (pm_nosig_freezing || cgroup_freezing(p)) + return true; + + if (pm_freezing && !(p->flags & PF_KTHREAD)) + return true; + + return false; } +EXPORT_SYMBOL(freezing_slow_path); /* Refrigerator is place where frozen processes are stored :-). */ -void refrigerator(void) +bool __refrigerator(bool check_kthr_stop) { /* Hmm, should we be allowed to suspend when there are realtime processes around? */ - long save; + bool was_frozen = false; + long save = current->state; - task_lock(current); - if (freezing(current)) { - frozen_process(); - task_unlock(current); - } else { - task_unlock(current); - return; - } - save = current->state; pr_debug("%s entered refrigerator\n", current->comm); - spin_lock_irq(¤t->sighand->siglock); - recalc_sigpending(); /* We sent fake signal, clean it up */ - spin_unlock_irq(¤t->sighand->siglock); - - /* prevent accounting of that task to load */ - current->flags |= PF_FREEZING; - for (;;) { set_current_state(TASK_UNINTERRUPTIBLE); - if (!frozen(current)) + + spin_lock_irq(&freezer_lock); + current->flags |= PF_FROZEN; + if (!freezing(current) || + (check_kthr_stop && kthread_should_stop())) + current->flags &= ~PF_FROZEN; + spin_unlock_irq(&freezer_lock); + + if (!(current->flags & PF_FROZEN)) break; + was_frozen = true; schedule(); } - /* Remove the accounting blocker */ - current->flags &= ~PF_FREEZING; - pr_debug("%s left refrigerator\n", current->comm); - __set_current_state(save); + + /* + * Restore saved task state before returning. The mb'd version + * needs to be used; otherwise, it might silently break + * synchronization which depends on ordered task state change. + */ + set_current_state(save); + + return was_frozen; } -EXPORT_SYMBOL(refrigerator); +EXPORT_SYMBOL(__refrigerator); static void fake_signal_wake_up(struct task_struct *p) { unsigned long flags; - spin_lock_irqsave(&p->sighand->siglock, flags); - signal_wake_up(p, 0); - spin_unlock_irqrestore(&p->sighand->siglock, flags); + if (lock_task_sighand(p, &flags)) { + signal_wake_up(p, 0); + unlock_task_sighand(p, &flags); + } } /** - * freeze_task - send a freeze request to given task - * @p: task to send the request to - * @sig_only: if set, the request will only be sent if the task has the - * PF_FREEZER_NOSIG flag unset - * Return value: 'false', if @sig_only is set and the task has - * PF_FREEZER_NOSIG set or the task is frozen, 'true', otherwise + * freeze_task - send a freeze request to given task + * @p: task to send the request to + * + * If @p is freezing, the freeze request is sent by setting %TIF_FREEZE + * flag and either sending a fake signal to it or waking it up, depending + * on whether it has %PF_FREEZER_NOSIG set. * - * The freeze request is sent by setting the tasks's TIF_FREEZE flag and - * either sending a fake signal to it or waking it up, depending on whether - * or not it has PF_FREEZER_NOSIG set. If @sig_only is set and the task - * has PF_FREEZER_NOSIG set (ie. it is a typical kernel thread), its - * TIF_FREEZE flag will not be set. + * RETURNS: + * %false, if @p is not freezing or already frozen; %true, otherwise */ -bool freeze_task(struct task_struct *p, bool sig_only) +bool freeze_task(struct task_struct *p) { - /* - * We first check if the task is freezing and next if it has already - * been frozen to avoid the race with frozen_process() which first marks - * the task as frozen and next clears its TIF_FREEZE. - */ - if (!freezing(p)) { - smp_rmb(); - if (frozen(p)) - return false; - - if (!sig_only || should_send_signal(p)) - set_freeze_flag(p); - else - return false; + unsigned long flags; + + spin_lock_irqsave(&freezer_lock, flags); + if (!freezing(p) || frozen(p)) { + spin_unlock_irqrestore(&freezer_lock, flags); + return false; } - if (should_send_signal(p)) { + if (!(p->flags & PF_KTHREAD)) { fake_signal_wake_up(p); /* * fake_signal_wake_up() goes through p's scheduler @@ -111,56 +124,48 @@ bool freeze_task(struct task_struct *p, bool sig_only) * TASK_RUNNING transition can't race with task state * testing in try_to_freeze_tasks(). */ - } else if (sig_only) { - return false; } else { wake_up_state(p, TASK_INTERRUPTIBLE); } + spin_unlock_irqrestore(&freezer_lock, flags); return true; } -void cancel_freezing(struct task_struct *p) +void __thaw_task(struct task_struct *p) { unsigned long flags; - if (freezing(p)) { - pr_debug(" clean up: %s\n", p->comm); - clear_freeze_flag(p); - spin_lock_irqsave(&p->sighand->siglock, flags); - recalc_sigpending_and_wake(p); - spin_unlock_irqrestore(&p->sighand->siglock, flags); - } -} - -static int __thaw_process(struct task_struct *p) -{ - if (frozen(p)) { - p->flags &= ~PF_FROZEN; - return 1; - } - clear_freeze_flag(p); - return 0; + /* + * Clear freezing and kick @p if FROZEN. Clearing is guaranteed to + * be visible to @p as waking up implies wmb. Waking up inside + * freezer_lock also prevents wakeups from leaking outside + * refrigerator. + */ + spin_lock_irqsave(&freezer_lock, flags); + if (frozen(p)) + wake_up_process(p); + spin_unlock_irqrestore(&freezer_lock, flags); } -/* - * Wake up a frozen process +/** + * set_freezable - make %current freezable * - * task_lock() is needed to prevent the race with refrigerator() which may - * occur if the freezing of tasks fails. Namely, without the lock, if the - * freezing of tasks failed, thaw_tasks() might have run before a task in - * refrigerator() could call frozen_process(), in which case the task would be - * frozen and no one would thaw it. + * Mark %current freezable and enter refrigerator if necessary. */ -int thaw_process(struct task_struct *p) +bool set_freezable(void) { - task_lock(p); - if (__thaw_process(p) == 1) { - task_unlock(p); - wake_up_process(p); - return 1; - } - task_unlock(p); - return 0; + might_sleep(); + + /* + * Modify flags while holding freezer_lock. This ensures the + * freezer notices that we aren't frozen yet or the freezing + * condition is visible to try_to_freeze() below. + */ + spin_lock_irq(&freezer_lock); + current->flags &= ~PF_NOFREEZE; + spin_unlock_irq(&freezer_lock); + + return try_to_freeze(); } -EXPORT_SYMBOL(thaw_process); +EXPORT_SYMBOL(set_freezable); diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 7ca523b249ef..1f9e26526b69 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -135,6 +135,9 @@ int irq_domain_simple_dt_translate(struct irq_domain *d, return -EINVAL; if (intsize < 1) return -EINVAL; + if (d->nr_irq && ((intspec[0] < d->hwirq_base) || + (intspec[0] >= d->hwirq_base + d->nr_irq))) + return -EINVAL; *out_hwirq = intspec[0]; *out_type = IRQ_TYPE_NONE; diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 1da999f5e746..a9a9dbe49fea 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -1292,7 +1292,7 @@ EXPORT_SYMBOL(free_irq); * and to set up the interrupt handler in the right order. * * If you want to set up a threaded irq handler for your device - * then you need to supply @handler and @thread_fn. @handler ist + * then you need to supply @handler and @thread_fn. @handler is * still called in hard interrupt context and has to check * whether the interrupt originates from the device. If yes it * needs to disable the interrupt on the device and return diff --git a/kernel/jump_label.c b/kernel/jump_label.c index 30c3c7708132..01d3b70fc98a 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -71,6 +71,7 @@ void jump_label_inc(struct jump_label_key *key) atomic_inc(&key->enabled); jump_label_unlock(); } +EXPORT_SYMBOL_GPL(jump_label_inc); static void __jump_label_dec(struct jump_label_key *key, unsigned long rate_limit, struct delayed_work *work) @@ -86,6 +87,7 @@ static void __jump_label_dec(struct jump_label_key *key, jump_label_unlock(); } +EXPORT_SYMBOL_GPL(jump_label_dec); static void jump_label_update_timeout(struct work_struct *work) { diff --git a/kernel/kexec.c b/kernel/kexec.c index dc7bc0829286..7b0886786701 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -32,7 +32,6 @@ #include <linux/console.h> #include <linux/vmalloc.h> #include <linux/swap.h> -#include <linux/kmsg_dump.h> #include <linux/syscore_ops.h> #include <asm/page.h> @@ -1094,8 +1093,6 @@ void crash_kexec(struct pt_regs *regs) if (kexec_crash_image) { struct pt_regs fixed_regs; - kmsg_dump(KMSG_DUMP_KEXEC); - crash_setup_regs(&fixed_regs, regs); crash_save_vmcoreinfo(); machine_crash_shutdown(&fixed_regs); @@ -1132,6 +1129,8 @@ int crash_shrink_memory(unsigned long new_size) { int ret = 0; unsigned long start, end; + unsigned long old_size; + struct resource *ram_res; mutex_lock(&kexec_mutex); @@ -1141,11 +1140,15 @@ int crash_shrink_memory(unsigned long new_size) } start = crashk_res.start; end = crashk_res.end; + old_size = (end == 0) ? 0 : end - start + 1; + if (new_size >= old_size) { + ret = (new_size == old_size) ? 0 : -EINVAL; + goto unlock; + } - if (new_size >= end - start + 1) { - ret = -EINVAL; - if (new_size == end - start + 1) - ret = 0; + ram_res = kzalloc(sizeof(*ram_res), GFP_KERNEL); + if (!ram_res) { + ret = -ENOMEM; goto unlock; } @@ -1157,7 +1160,15 @@ int crash_shrink_memory(unsigned long new_size) if ((start == end) && (crashk_res.parent != NULL)) release_resource(&crashk_res); + + ram_res->start = end; + ram_res->end = crashk_res.end; + ram_res->flags = IORESOURCE_BUSY | IORESOURCE_MEM; + ram_res->name = "System RAM"; + crashk_res.end = end - 1; + + insert_resource(&iomem_resource, ram_res); crash_unmap_reserved_pages(); unlock: @@ -1523,7 +1534,7 @@ int kernel_kexec(void) #ifdef CONFIG_KEXEC_JUMP if (kexec_image->preserve_context) { - mutex_lock(&pm_mutex); + lock_system_sleep(); pm_prepare_console(); error = freeze_processes(); if (error) { @@ -1576,7 +1587,7 @@ int kernel_kexec(void) thaw_processes(); Restore_console: pm_restore_console(); - mutex_unlock(&pm_mutex); + unlock_system_sleep(); } #endif diff --git a/kernel/kmod.c b/kernel/kmod.c index a4bea97c75b6..a0a88543934e 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -36,6 +36,7 @@ #include <linux/resource.h> #include <linux/notifier.h> #include <linux/suspend.h> +#include <linux/rwsem.h> #include <asm/uaccess.h> #include <trace/events/module.h> @@ -50,6 +51,7 @@ static struct workqueue_struct *khelper_wq; static kernel_cap_t usermodehelper_bset = CAP_FULL_SET; static kernel_cap_t usermodehelper_inheritable = CAP_FULL_SET; static DEFINE_SPINLOCK(umh_sysctl_lock); +static DECLARE_RWSEM(umhelper_sem); #ifdef CONFIG_MODULES @@ -275,6 +277,7 @@ static void __call_usermodehelper(struct work_struct *work) * If set, call_usermodehelper_exec() will exit immediately returning -EBUSY * (used for preventing user land processes from being created after the user * land has been frozen during a system-wide hibernation or suspend operation). + * Should always be manipulated under umhelper_sem acquired for write. */ static int usermodehelper_disabled = 1; @@ -282,17 +285,29 @@ static int usermodehelper_disabled = 1; static atomic_t running_helpers = ATOMIC_INIT(0); /* - * Wait queue head used by usermodehelper_pm_callback() to wait for all running + * Wait queue head used by usermodehelper_disable() to wait for all running * helpers to finish. */ static DECLARE_WAIT_QUEUE_HEAD(running_helpers_waitq); /* * Time to wait for running_helpers to become zero before the setting of - * usermodehelper_disabled in usermodehelper_pm_callback() fails + * usermodehelper_disabled in usermodehelper_disable() fails */ #define RUNNING_HELPERS_TIMEOUT (5 * HZ) +void read_lock_usermodehelper(void) +{ + down_read(&umhelper_sem); +} +EXPORT_SYMBOL_GPL(read_lock_usermodehelper); + +void read_unlock_usermodehelper(void) +{ + up_read(&umhelper_sem); +} +EXPORT_SYMBOL_GPL(read_unlock_usermodehelper); + /** * usermodehelper_disable - prevent new helpers from being started */ @@ -300,8 +315,10 @@ int usermodehelper_disable(void) { long retval; + down_write(&umhelper_sem); usermodehelper_disabled = 1; - smp_mb(); + up_write(&umhelper_sem); + /* * From now on call_usermodehelper_exec() won't start any new * helpers, so it is sufficient if running_helpers turns out to @@ -314,7 +331,9 @@ int usermodehelper_disable(void) if (retval) return 0; + down_write(&umhelper_sem); usermodehelper_disabled = 0; + up_write(&umhelper_sem); return -EAGAIN; } @@ -323,7 +342,9 @@ int usermodehelper_disable(void) */ void usermodehelper_enable(void) { + down_write(&umhelper_sem); usermodehelper_disabled = 0; + up_write(&umhelper_sem); } /** diff --git a/kernel/kprobes.c b/kernel/kprobes.c index e5d84644823b..95dd7212e610 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -2198,7 +2198,7 @@ static ssize_t write_enabled_file_bool(struct file *file, const char __user *user_buf, size_t count, loff_t *ppos) { char buf[32]; - int buf_size; + size_t buf_size; buf_size = min(count, (sizeof(buf)-1)); if (copy_from_user(buf, user_buf, buf_size)) diff --git a/kernel/kthread.c b/kernel/kthread.c index b6d216a92639..3d3de633702e 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -59,6 +59,31 @@ int kthread_should_stop(void) EXPORT_SYMBOL(kthread_should_stop); /** + * kthread_freezable_should_stop - should this freezable kthread return now? + * @was_frozen: optional out parameter, indicates whether %current was frozen + * + * kthread_should_stop() for freezable kthreads, which will enter + * refrigerator if necessary. This function is safe from kthread_stop() / + * freezer deadlock and freezable kthreads should use this function instead + * of calling try_to_freeze() directly. + */ +bool kthread_freezable_should_stop(bool *was_frozen) +{ + bool frozen = false; + + might_sleep(); + + if (unlikely(freezing(current))) + frozen = __refrigerator(true); + + if (was_frozen) + *was_frozen = frozen; + + return kthread_should_stop(); +} +EXPORT_SYMBOL_GPL(kthread_freezable_should_stop); + +/** * kthread_data - return data value specified on kthread creation * @task: kthread task in question * @@ -257,7 +282,7 @@ int kthreadd(void *unused) set_cpus_allowed_ptr(tsk, cpu_all_mask); set_mems_allowed(node_states[N_HIGH_MEMORY]); - current->flags |= PF_NOFREEZE | PF_FREEZER_NOSIG; + current->flags |= PF_NOFREEZE; for (;;) { set_current_state(TASK_INTERRUPTIBLE); diff --git a/kernel/panic.c b/kernel/panic.c index 3458469eb7c3..80aed44e345a 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -49,6 +49,15 @@ static long no_blink(int state) long (*panic_blink)(int state); EXPORT_SYMBOL(panic_blink); +/* + * Stop ourself in panic -- architecture code may override this + */ +void __weak panic_smp_self_stop(void) +{ + while (1) + cpu_relax(); +} + /** * panic - halt the system * @fmt: The text string to print @@ -57,8 +66,9 @@ EXPORT_SYMBOL(panic_blink); * * This function never returns. */ -NORET_TYPE void panic(const char * fmt, ...) +void panic(const char *fmt, ...) { + static DEFINE_SPINLOCK(panic_lock); static char buf[1024]; va_list args; long i, i_next = 0; @@ -68,8 +78,14 @@ NORET_TYPE void panic(const char * fmt, ...) * It's possible to come here directly from a panic-assertion and * not have preempt disabled. Some functions called from here want * preempt to be disabled. No point enabling it later though... + * + * Only one CPU is allowed to execute the panic code from here. For + * multiple parallel invocations of panic, all other CPUs either + * stop themself or will wait until they are stopped by the 1st CPU + * with smp_send_stop(). */ - preempt_disable(); + if (!spin_trylock(&panic_lock)) + panic_smp_self_stop(); console_verbose(); bust_spinlocks(1); @@ -78,7 +94,11 @@ NORET_TYPE void panic(const char * fmt, ...) va_end(args); printk(KERN_EMERG "Kernel panic - not syncing: %s\n",buf); #ifdef CONFIG_DEBUG_BUGVERBOSE - dump_stack(); + /* + * Avoid nested stack-dumping if a panic occurs during oops processing + */ + if (!oops_in_progress) + dump_stack(); #endif /* diff --git a/kernel/pid.c b/kernel/pid.c index fa5f72227e5f..ce8e00deaccb 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -137,7 +137,9 @@ static int pid_before(int base, int a, int b) } /* - * We might be racing with someone else trying to set pid_ns->last_pid. + * We might be racing with someone else trying to set pid_ns->last_pid + * at the pid allocation time (there's also a sysctl for this, but racing + * with this one is OK, see comment in kernel/pid_namespace.c about it). * We want the winner to have the "later" value, because if the * "earlier" value prevails, then a pid may get reused immediately. * diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index e9c9adc84ca6..a8968396046d 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -191,9 +191,40 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) return; } +static int pid_ns_ctl_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + struct ctl_table tmp = *table; + + if (write && !capable(CAP_SYS_ADMIN)) + return -EPERM; + + /* + * Writing directly to ns' last_pid field is OK, since this field + * is volatile in a living namespace anyway and a code writing to + * it should synchronize its usage with external means. + */ + + tmp.data = ¤t->nsproxy->pid_ns->last_pid; + return proc_dointvec(&tmp, write, buffer, lenp, ppos); +} + +static struct ctl_table pid_ns_ctl_table[] = { + { + .procname = "ns_last_pid", + .maxlen = sizeof(int), + .mode = 0666, /* permissions are checked in the handler */ + .proc_handler = pid_ns_ctl_handler, + }, + { } +}; + +static struct ctl_path kern_path[] = { { .procname = "kernel", }, { } }; + static __init int pid_namespaces_init(void) { pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC); + register_sysctl_paths(kern_path, pid_ns_ctl_table); return 0; } diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index a6b0503574ee..6d6d28870335 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -43,8 +43,6 @@ int in_suspend __nosavedata; enum { HIBERNATION_INVALID, HIBERNATION_PLATFORM, - HIBERNATION_TEST, - HIBERNATION_TESTPROC, HIBERNATION_SHUTDOWN, HIBERNATION_REBOOT, /* keep last */ @@ -55,7 +53,7 @@ enum { static int hibernation_mode = HIBERNATION_SHUTDOWN; -static bool freezer_test_done; +bool freezer_test_done; static const struct platform_hibernation_ops *hibernation_ops; @@ -71,14 +69,14 @@ void hibernation_set_ops(const struct platform_hibernation_ops *ops) WARN_ON(1); return; } - mutex_lock(&pm_mutex); + lock_system_sleep(); hibernation_ops = ops; if (ops) hibernation_mode = HIBERNATION_PLATFORM; else if (hibernation_mode == HIBERNATION_PLATFORM) hibernation_mode = HIBERNATION_SHUTDOWN; - mutex_unlock(&pm_mutex); + unlock_system_sleep(); } static bool entering_platform_hibernation; @@ -96,15 +94,6 @@ static void hibernation_debug_sleep(void) mdelay(5000); } -static int hibernation_testmode(int mode) -{ - if (hibernation_mode == mode) { - hibernation_debug_sleep(); - return 1; - } - return 0; -} - static int hibernation_test(int level) { if (pm_test_level == level) { @@ -114,7 +103,6 @@ static int hibernation_test(int level) return 0; } #else /* !CONFIG_PM_DEBUG */ -static int hibernation_testmode(int mode) { return 0; } static int hibernation_test(int level) { return 0; } #endif /* !CONFIG_PM_DEBUG */ @@ -278,8 +266,7 @@ static int create_image(int platform_mode) goto Platform_finish; error = disable_nonboot_cpus(); - if (error || hibernation_test(TEST_CPUS) - || hibernation_testmode(HIBERNATION_TEST)) + if (error || hibernation_test(TEST_CPUS)) goto Enable_cpus; local_irq_disable(); @@ -333,7 +320,7 @@ static int create_image(int platform_mode) */ int hibernation_snapshot(int platform_mode) { - pm_message_t msg = PMSG_RECOVER; + pm_message_t msg; int error; error = platform_begin(platform_mode); @@ -349,8 +336,7 @@ int hibernation_snapshot(int platform_mode) if (error) goto Cleanup; - if (hibernation_test(TEST_FREEZER) || - hibernation_testmode(HIBERNATION_TESTPROC)) { + if (hibernation_test(TEST_FREEZER)) { /* * Indicate to the caller that we are returning due to a @@ -362,26 +348,26 @@ int hibernation_snapshot(int platform_mode) error = dpm_prepare(PMSG_FREEZE); if (error) { - dpm_complete(msg); + dpm_complete(PMSG_RECOVER); goto Cleanup; } suspend_console(); pm_restrict_gfp_mask(); + error = dpm_suspend(PMSG_FREEZE); - if (error) - goto Recover_platform; - if (hibernation_test(TEST_DEVICES)) - goto Recover_platform; + if (error || hibernation_test(TEST_DEVICES)) + platform_recover(platform_mode); + else + error = create_image(platform_mode); - error = create_image(platform_mode); /* - * Control returns here (1) after the image has been created or the + * In the case that we call create_image() above, the control + * returns here (1) after the image has been created or the * image creation has failed and (2) after a successful restore. */ - Resume_devices: /* We may need to release the preallocated image pages here. */ if (error || !in_suspend) swsusp_free(); @@ -399,10 +385,6 @@ int hibernation_snapshot(int platform_mode) platform_end(platform_mode); return error; - Recover_platform: - platform_recover(platform_mode); - goto Resume_devices; - Cleanup: swsusp_free(); goto Close; @@ -590,9 +572,6 @@ int hibernation_platform_enter(void) static void power_down(void) { switch (hibernation_mode) { - case HIBERNATION_TEST: - case HIBERNATION_TESTPROC: - break; case HIBERNATION_REBOOT: kernel_restart(NULL); break; @@ -611,17 +590,6 @@ static void power_down(void) while(1); } -static int prepare_processes(void) -{ - int error = 0; - - if (freeze_processes()) { - error = -EBUSY; - thaw_processes(); - } - return error; -} - /** * hibernate - Carry out system hibernation, including saving the image. */ @@ -629,7 +597,7 @@ int hibernate(void) { int error; - mutex_lock(&pm_mutex); + lock_system_sleep(); /* The snapshot device should not be opened while we're running */ if (!atomic_add_unless(&snapshot_device_available, -1, 0)) { error = -EBUSY; @@ -654,7 +622,7 @@ int hibernate(void) sys_sync(); printk("done.\n"); - error = prepare_processes(); + error = freeze_processes(); if (error) goto Finish; @@ -697,7 +665,7 @@ int hibernate(void) pm_restore_console(); atomic_inc(&snapshot_device_available); Unlock: - mutex_unlock(&pm_mutex); + unlock_system_sleep(); return error; } @@ -811,11 +779,13 @@ static int software_resume(void) goto close_finish; error = create_basic_memory_bitmaps(); - if (error) + if (error) { + usermodehelper_enable(); goto close_finish; + } pr_debug("PM: Preparing processes for restore.\n"); - error = prepare_processes(); + error = freeze_processes(); if (error) { swsusp_close(FMODE_READ); goto Done; @@ -855,8 +825,6 @@ static const char * const hibernation_modes[] = { [HIBERNATION_PLATFORM] = "platform", [HIBERNATION_SHUTDOWN] = "shutdown", [HIBERNATION_REBOOT] = "reboot", - [HIBERNATION_TEST] = "test", - [HIBERNATION_TESTPROC] = "testproc", }; /* @@ -865,17 +833,15 @@ static const char * const hibernation_modes[] = { * Hibernation can be handled in several ways. There are a few different ways * to put the system into the sleep state: using the platform driver (e.g. ACPI * or other hibernation_ops), powering it off or rebooting it (for testing - * mostly), or using one of the two available test modes. + * mostly). * * The sysfs file /sys/power/disk provides an interface for selecting the * hibernation mode to use. Reading from this file causes the available modes - * to be printed. There are 5 modes that can be supported: + * to be printed. There are 3 modes that can be supported: * * 'platform' * 'shutdown' * 'reboot' - * 'test' - * 'testproc' * * If a platform hibernation driver is in use, 'platform' will be supported * and will be used by default. Otherwise, 'shutdown' will be used by default. @@ -899,8 +865,6 @@ static ssize_t disk_show(struct kobject *kobj, struct kobj_attribute *attr, switch (i) { case HIBERNATION_SHUTDOWN: case HIBERNATION_REBOOT: - case HIBERNATION_TEST: - case HIBERNATION_TESTPROC: break; case HIBERNATION_PLATFORM: if (hibernation_ops) @@ -929,7 +893,7 @@ static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr, p = memchr(buf, '\n', n); len = p ? p - buf : n; - mutex_lock(&pm_mutex); + lock_system_sleep(); for (i = HIBERNATION_FIRST; i <= HIBERNATION_MAX; i++) { if (len == strlen(hibernation_modes[i]) && !strncmp(buf, hibernation_modes[i], len)) { @@ -941,8 +905,6 @@ static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr, switch (mode) { case HIBERNATION_SHUTDOWN: case HIBERNATION_REBOOT: - case HIBERNATION_TEST: - case HIBERNATION_TESTPROC: hibernation_mode = mode; break; case HIBERNATION_PLATFORM: @@ -957,7 +919,7 @@ static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr, if (!error) pr_debug("PM: Hibernation mode set to '%s'\n", hibernation_modes[mode]); - mutex_unlock(&pm_mutex); + unlock_system_sleep(); return error ? error : n; } @@ -984,9 +946,9 @@ static ssize_t resume_store(struct kobject *kobj, struct kobj_attribute *attr, if (maj != MAJOR(res) || min != MINOR(res)) goto out; - mutex_lock(&pm_mutex); + lock_system_sleep(); swsusp_resume_device = res; - mutex_unlock(&pm_mutex); + unlock_system_sleep(); printk(KERN_INFO "PM: Starting manual resume from disk\n"); noresume = 0; software_resume(); diff --git a/kernel/power/main.c b/kernel/power/main.c index 36e0f0903c32..9824b41e5a18 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -3,7 +3,7 @@ * * Copyright (c) 2003 Patrick Mochel * Copyright (c) 2003 Open Source Development Lab - * + * * This file is released under the GPLv2 * */ @@ -116,7 +116,7 @@ static ssize_t pm_test_store(struct kobject *kobj, struct kobj_attribute *attr, p = memchr(buf, '\n', n); len = p ? p - buf : n; - mutex_lock(&pm_mutex); + lock_system_sleep(); level = TEST_FIRST; for (s = &pm_tests[level]; level <= TEST_MAX; s++, level++) @@ -126,7 +126,7 @@ static ssize_t pm_test_store(struct kobject *kobj, struct kobj_attribute *attr, break; } - mutex_unlock(&pm_mutex); + unlock_system_sleep(); return error ? error : n; } @@ -240,7 +240,7 @@ struct kobject *power_kobj; * 'standby' (Power-On Suspend), 'mem' (Suspend-to-RAM), and * 'disk' (Suspend-to-Disk). * - * store() accepts one of those strings, translates it into the + * store() accepts one of those strings, translates it into the * proper enumerated value, and initiates a suspend transition. */ static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr, @@ -282,7 +282,7 @@ static ssize_t state_store(struct kobject *kobj, struct kobj_attribute *attr, /* First, check if we are requested to hibernate */ if (len == 4 && !strncmp(buf, "disk", len)) { error = hibernate(); - goto Exit; + goto Exit; } #ifdef CONFIG_SUSPEND diff --git a/kernel/power/power.h b/kernel/power/power.h index 23a2db1ec442..0c4defe6d3b8 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -50,6 +50,8 @@ static inline char *check_image_kernel(struct swsusp_info *info) #define SPARE_PAGES ((1024 * 1024) >> PAGE_SHIFT) /* kernel/power/hibernate.c */ +extern bool freezer_test_done; + extern int hibernation_snapshot(int platform_mode); extern int hibernation_restore(int platform_mode); extern int hibernation_platform_enter(void); diff --git a/kernel/power/process.c b/kernel/power/process.c index addbbe5531bc..77274c9ba2f1 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -22,16 +22,7 @@ */ #define TIMEOUT (20 * HZ) -static inline int freezable(struct task_struct * p) -{ - if ((p == current) || - (p->flags & PF_NOFREEZE) || - (p->exit_state != 0)) - return 0; - return 1; -} - -static int try_to_freeze_tasks(bool sig_only) +static int try_to_freeze_tasks(bool user_only) { struct task_struct *g, *p; unsigned long end_time; @@ -46,17 +37,14 @@ static int try_to_freeze_tasks(bool sig_only) end_time = jiffies + TIMEOUT; - if (!sig_only) + if (!user_only) freeze_workqueues_begin(); while (true) { todo = 0; read_lock(&tasklist_lock); do_each_thread(g, p) { - if (frozen(p) || !freezable(p)) - continue; - - if (!freeze_task(p, sig_only)) + if (p == current || !freeze_task(p)) continue; /* @@ -77,7 +65,7 @@ static int try_to_freeze_tasks(bool sig_only) } while_each_thread(g, p); read_unlock(&tasklist_lock); - if (!sig_only) { + if (!user_only) { wq_busy = freeze_workqueues_busy(); todo += wq_busy; } @@ -103,11 +91,6 @@ static int try_to_freeze_tasks(bool sig_only) elapsed_csecs = elapsed_csecs64; if (todo) { - /* This does not unfreeze processes that are already frozen - * (we have slightly ugly calling convention in that respect, - * and caller must call thaw_processes() if something fails), - * but it cleans up leftover PF_FREEZE requests. - */ printk("\n"); printk(KERN_ERR "Freezing of tasks %s after %d.%02d seconds " "(%d tasks refusing to freeze, wq_busy=%d):\n", @@ -115,15 +98,11 @@ static int try_to_freeze_tasks(bool sig_only) elapsed_csecs / 100, elapsed_csecs % 100, todo - wq_busy, wq_busy); - thaw_workqueues(); - read_lock(&tasklist_lock); do_each_thread(g, p) { - task_lock(p); - if (!wakeup && freezing(p) && !freezer_should_skip(p)) + if (!wakeup && !freezer_should_skip(p) && + p != current && freezing(p) && !frozen(p)) sched_show_task(p); - cancel_freezing(p); - task_unlock(p); } while_each_thread(g, p); read_unlock(&tasklist_lock); } else { @@ -136,12 +115,18 @@ static int try_to_freeze_tasks(bool sig_only) /** * freeze_processes - Signal user space processes to enter the refrigerator. + * + * On success, returns 0. On failure, -errno and system is fully thawed. */ int freeze_processes(void) { int error; + if (!pm_freezing) + atomic_inc(&system_freezing_cnt); + printk("Freezing user space processes ... "); + pm_freezing = true; error = try_to_freeze_tasks(true); if (!error) { printk("done."); @@ -150,17 +135,22 @@ int freeze_processes(void) printk("\n"); BUG_ON(in_atomic()); + if (error) + thaw_processes(); return error; } /** * freeze_kernel_threads - Make freezable kernel threads go to the refrigerator. + * + * On success, returns 0. On failure, -errno and system is fully thawed. */ int freeze_kernel_threads(void) { int error; printk("Freezing remaining freezable tasks ... "); + pm_nosig_freezing = true; error = try_to_freeze_tasks(false); if (!error) printk("done."); @@ -168,37 +158,32 @@ int freeze_kernel_threads(void) printk("\n"); BUG_ON(in_atomic()); + if (error) + thaw_processes(); return error; } -static void thaw_tasks(bool nosig_only) +void thaw_processes(void) { struct task_struct *g, *p; - read_lock(&tasklist_lock); - do_each_thread(g, p) { - if (!freezable(p)) - continue; + if (pm_freezing) + atomic_dec(&system_freezing_cnt); + pm_freezing = false; + pm_nosig_freezing = false; - if (nosig_only && should_send_signal(p)) - continue; + oom_killer_enable(); + + printk("Restarting tasks ... "); - if (cgroup_freezing_or_frozen(p)) - continue; + thaw_workqueues(); - thaw_process(p); + read_lock(&tasklist_lock); + do_each_thread(g, p) { + __thaw_task(p); } while_each_thread(g, p); read_unlock(&tasklist_lock); -} -void thaw_processes(void) -{ - oom_killer_enable(); - - printk("Restarting tasks ... "); - thaw_workqueues(); - thaw_tasks(true); - thaw_tasks(false); schedule(); printk("done.\n"); } diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index cbe2c1441392..1cf88900ec4f 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -858,6 +858,9 @@ static struct page *saveable_highmem_page(struct zone *zone, unsigned long pfn) PageReserved(page)) return NULL; + if (page_is_guard(page)) + return NULL; + return page; } @@ -920,6 +923,9 @@ static struct page *saveable_page(struct zone *zone, unsigned long pfn) && (!kernel_page_present(page) || pfn_is_nosave(pfn))) return NULL; + if (page_is_guard(page)) + return NULL; + return page; } diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 4953dc054c53..4fd51beed879 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -42,9 +42,9 @@ static const struct platform_suspend_ops *suspend_ops; */ void suspend_set_ops(const struct platform_suspend_ops *ops) { - mutex_lock(&pm_mutex); + lock_system_sleep(); suspend_ops = ops; - mutex_unlock(&pm_mutex); + unlock_system_sleep(); } EXPORT_SYMBOL_GPL(suspend_set_ops); @@ -106,13 +106,11 @@ static int suspend_prepare(void) goto Finish; error = suspend_freeze_processes(); - if (error) { - suspend_stats.failed_freeze++; - dpm_save_failed_step(SUSPEND_FREEZE); - } else + if (!error) return 0; - suspend_thaw_processes(); + suspend_stats.failed_freeze++; + dpm_save_failed_step(SUSPEND_FREEZE); usermodehelper_enable(); Finish: pm_notifier_call_chain(PM_POST_SUSPEND); diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 11a594c4ba25..3739ecced085 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -18,7 +18,6 @@ #include <linux/bitops.h> #include <linux/genhd.h> #include <linux/device.h> -#include <linux/buffer_head.h> #include <linux/bio.h> #include <linux/blkdev.h> #include <linux/swap.h> diff --git a/kernel/power/user.c b/kernel/power/user.c index 6d8f535c2b88..6b1ab7a88522 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -21,6 +21,7 @@ #include <linux/swapops.h> #include <linux/pm.h> #include <linux/fs.h> +#include <linux/compat.h> #include <linux/console.h> #include <linux/cpu.h> #include <linux/freezer.h> @@ -30,28 +31,6 @@ #include "power.h" -/* - * NOTE: The SNAPSHOT_SET_SWAP_FILE and SNAPSHOT_PMOPS ioctls are obsolete and - * will be removed in the future. They are only preserved here for - * compatibility with existing userland utilities. - */ -#define SNAPSHOT_SET_SWAP_FILE _IOW(SNAPSHOT_IOC_MAGIC, 10, unsigned int) -#define SNAPSHOT_PMOPS _IOW(SNAPSHOT_IOC_MAGIC, 12, unsigned int) - -#define PMOPS_PREPARE 1 -#define PMOPS_ENTER 2 -#define PMOPS_FINISH 3 - -/* - * NOTE: The following ioctl definitions are wrong and have been replaced with - * correct ones. They are only preserved here for compatibility with existing - * userland utilities and will be removed in the future. - */ -#define SNAPSHOT_ATOMIC_SNAPSHOT _IOW(SNAPSHOT_IOC_MAGIC, 3, void *) -#define SNAPSHOT_SET_IMAGE_SIZE _IOW(SNAPSHOT_IOC_MAGIC, 6, unsigned long) -#define SNAPSHOT_AVAIL_SWAP _IOR(SNAPSHOT_IOC_MAGIC, 7, void *) -#define SNAPSHOT_GET_SWAP_PAGE _IOR(SNAPSHOT_IOC_MAGIC, 8, void *) - #define SNAPSHOT_MINOR 231 @@ -71,7 +50,7 @@ static int snapshot_open(struct inode *inode, struct file *filp) struct snapshot_data *data; int error; - mutex_lock(&pm_mutex); + lock_system_sleep(); if (!atomic_add_unless(&snapshot_device_available, -1, 0)) { error = -EBUSY; @@ -123,7 +102,7 @@ static int snapshot_open(struct inode *inode, struct file *filp) data->platform_support = 0; Unlock: - mutex_unlock(&pm_mutex); + unlock_system_sleep(); return error; } @@ -132,7 +111,7 @@ static int snapshot_release(struct inode *inode, struct file *filp) { struct snapshot_data *data; - mutex_lock(&pm_mutex); + lock_system_sleep(); swsusp_free(); free_basic_memory_bitmaps(); @@ -146,7 +125,7 @@ static int snapshot_release(struct inode *inode, struct file *filp) PM_POST_HIBERNATION : PM_POST_RESTORE); atomic_inc(&snapshot_device_available); - mutex_unlock(&pm_mutex); + unlock_system_sleep(); return 0; } @@ -158,7 +137,7 @@ static ssize_t snapshot_read(struct file *filp, char __user *buf, ssize_t res; loff_t pg_offp = *offp & ~PAGE_MASK; - mutex_lock(&pm_mutex); + lock_system_sleep(); data = filp->private_data; if (!data->ready) { @@ -179,7 +158,7 @@ static ssize_t snapshot_read(struct file *filp, char __user *buf, *offp += res; Unlock: - mutex_unlock(&pm_mutex); + unlock_system_sleep(); return res; } @@ -191,7 +170,7 @@ static ssize_t snapshot_write(struct file *filp, const char __user *buf, ssize_t res; loff_t pg_offp = *offp & ~PAGE_MASK; - mutex_lock(&pm_mutex); + lock_system_sleep(); data = filp->private_data; @@ -208,20 +187,11 @@ static ssize_t snapshot_write(struct file *filp, const char __user *buf, if (res > 0) *offp += res; unlock: - mutex_unlock(&pm_mutex); + unlock_system_sleep(); return res; } -static void snapshot_deprecated_ioctl(unsigned int cmd) -{ - if (printk_ratelimit()) - printk(KERN_NOTICE "%pf: ioctl '%.8x' is deprecated and will " - "be removed soon, update your suspend-to-disk " - "utilities\n", - __builtin_return_address(0), cmd); -} - static long snapshot_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { @@ -257,11 +227,9 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, break; error = freeze_processes(); - if (error) { - thaw_processes(); + if (error) usermodehelper_enable(); - } - if (!error) + else data->frozen = 1; break; @@ -274,8 +242,6 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, data->frozen = 0; break; - case SNAPSHOT_ATOMIC_SNAPSHOT: - snapshot_deprecated_ioctl(cmd); case SNAPSHOT_CREATE_IMAGE: if (data->mode != O_RDONLY || !data->frozen || data->ready) { error = -EPERM; @@ -283,10 +249,15 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, } pm_restore_gfp_mask(); error = hibernation_snapshot(data->platform_support); - if (!error) + if (!error) { error = put_user(in_suspend, (int __user *)arg); - if (!error) - data->ready = 1; + if (!error && !freezer_test_done) + data->ready = 1; + if (freezer_test_done) { + freezer_test_done = false; + thaw_processes(); + } + } break; case SNAPSHOT_ATOMIC_RESTORE: @@ -305,8 +276,6 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, data->ready = 0; break; - case SNAPSHOT_SET_IMAGE_SIZE: - snapshot_deprecated_ioctl(cmd); case SNAPSHOT_PREF_IMAGE_SIZE: image_size = arg; break; @@ -321,16 +290,12 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, error = put_user(size, (loff_t __user *)arg); break; - case SNAPSHOT_AVAIL_SWAP: - snapshot_deprecated_ioctl(cmd); case SNAPSHOT_AVAIL_SWAP_SIZE: size = count_swap_pages(data->swap, 1); size <<= PAGE_SHIFT; error = put_user(size, (loff_t __user *)arg); break; - case SNAPSHOT_GET_SWAP_PAGE: - snapshot_deprecated_ioctl(cmd); case SNAPSHOT_ALLOC_SWAP_PAGE: if (data->swap < 0 || data->swap >= MAX_SWAPFILES) { error = -ENODEV; @@ -353,27 +318,6 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, free_all_swap_pages(data->swap); break; - case SNAPSHOT_SET_SWAP_FILE: /* This ioctl is deprecated */ - snapshot_deprecated_ioctl(cmd); - if (!swsusp_swap_in_use()) { - /* - * User space encodes device types as two-byte values, - * so we need to recode them - */ - if (old_decode_dev(arg)) { - data->swap = swap_type_of(old_decode_dev(arg), - 0, NULL); - if (data->swap < 0) - error = -ENODEV; - } else { - data->swap = -1; - error = -EINVAL; - } - } else { - error = -EPERM; - } - break; - case SNAPSHOT_S2RAM: if (!data->frozen) { error = -EPERM; @@ -396,33 +340,6 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, error = hibernation_platform_enter(); break; - case SNAPSHOT_PMOPS: /* This ioctl is deprecated */ - snapshot_deprecated_ioctl(cmd); - error = -EINVAL; - - switch (arg) { - - case PMOPS_PREPARE: - data->platform_support = 1; - error = 0; - break; - - case PMOPS_ENTER: - if (data->platform_support) - error = hibernation_platform_enter(); - break; - - case PMOPS_FINISH: - if (data->platform_support) - error = 0; - break; - - default: - printk(KERN_ERR "SNAPSHOT_PMOPS: invalid argument %ld\n", arg); - - } - break; - case SNAPSHOT_SET_SWAP_AREA: if (swsusp_swap_in_use()) { error = -EPERM; @@ -464,6 +381,66 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, return error; } +#ifdef CONFIG_COMPAT + +struct compat_resume_swap_area { + compat_loff_t offset; + u32 dev; +} __packed; + +static long +snapshot_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + BUILD_BUG_ON(sizeof(loff_t) != sizeof(compat_loff_t)); + + switch (cmd) { + case SNAPSHOT_GET_IMAGE_SIZE: + case SNAPSHOT_AVAIL_SWAP_SIZE: + case SNAPSHOT_ALLOC_SWAP_PAGE: { + compat_loff_t __user *uoffset = compat_ptr(arg); + loff_t offset; + mm_segment_t old_fs; + int err; + + old_fs = get_fs(); + set_fs(KERNEL_DS); + err = snapshot_ioctl(file, cmd, (unsigned long) &offset); + set_fs(old_fs); + if (!err && put_user(offset, uoffset)) + err = -EFAULT; + return err; + } + + case SNAPSHOT_CREATE_IMAGE: + return snapshot_ioctl(file, cmd, + (unsigned long) compat_ptr(arg)); + + case SNAPSHOT_SET_SWAP_AREA: { + struct compat_resume_swap_area __user *u_swap_area = + compat_ptr(arg); + struct resume_swap_area swap_area; + mm_segment_t old_fs; + int err; + + err = get_user(swap_area.offset, &u_swap_area->offset); + err |= get_user(swap_area.dev, &u_swap_area->dev); + if (err) + return -EFAULT; + old_fs = get_fs(); + set_fs(KERNEL_DS); + err = snapshot_ioctl(file, SNAPSHOT_SET_SWAP_AREA, + (unsigned long) &swap_area); + set_fs(old_fs); + return err; + } + + default: + return snapshot_ioctl(file, cmd, arg); + } +} + +#endif /* CONFIG_COMPAT */ + static const struct file_operations snapshot_fops = { .open = snapshot_open, .release = snapshot_release, @@ -471,6 +448,9 @@ static const struct file_operations snapshot_fops = { .write = snapshot_write, .llseek = no_llseek, .unlocked_ioctl = snapshot_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = snapshot_compat_ioctl, +#endif }; static struct miscdevice snapshot_device = { diff --git a/kernel/relay.c b/kernel/relay.c index 226fade4d727..4335e1d7ee2d 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -302,7 +302,7 @@ static void buf_unmapped_default_callback(struct rchan_buf *buf, */ static struct dentry *create_buf_file_default_callback(const char *filename, struct dentry *parent, - int mode, + umode_t mode, struct rchan_buf *buf, int *is_global) { diff --git a/kernel/res_counter.c b/kernel/res_counter.c index 34683efa2cce..6d269cce7aa1 100644 --- a/kernel/res_counter.c +++ b/kernel/res_counter.c @@ -159,8 +159,7 @@ int res_counter_memparse_write_strategy(const char *buf, return 0; } - /* FIXME - make memparse() take const char* args */ - *res = memparse((char *)buf, &end); + *res = memparse(buf, &end); if (*end != '\0') return -EINVAL; diff --git a/kernel/rtmutex-tester.c b/kernel/rtmutex-tester.c index 3d9f31cd79e7..98ec49475460 100644 --- a/kernel/rtmutex-tester.c +++ b/kernel/rtmutex-tester.c @@ -6,11 +6,11 @@ * Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com> * */ +#include <linux/device.h> #include <linux/kthread.h> #include <linux/export.h> #include <linux/sched.h> #include <linux/spinlock.h> -#include <linux/sysdev.h> #include <linux/timer.h> #include <linux/freezer.h> @@ -27,7 +27,7 @@ struct test_thread_data { int opdata; int mutexes[MAX_RT_TEST_MUTEXES]; int event; - struct sys_device sysdev; + struct device dev; }; static struct test_thread_data thread_data[MAX_RT_TEST_THREADS]; @@ -271,7 +271,7 @@ static int test_func(void *data) * * opcode:data */ -static ssize_t sysfs_test_command(struct sys_device *dev, struct sysdev_attribute *attr, +static ssize_t sysfs_test_command(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct sched_param schedpar; @@ -279,8 +279,8 @@ static ssize_t sysfs_test_command(struct sys_device *dev, struct sysdev_attribut char cmdbuf[32]; int op, dat, tid, ret; - td = container_of(dev, struct test_thread_data, sysdev); - tid = td->sysdev.id; + td = container_of(dev, struct test_thread_data, dev); + tid = td->dev.id; /* strings from sysfs write are not 0 terminated! */ if (count >= sizeof(cmdbuf)) @@ -334,7 +334,7 @@ static ssize_t sysfs_test_command(struct sys_device *dev, struct sysdev_attribut * @dev: thread to query * @buf: char buffer to be filled with thread status info */ -static ssize_t sysfs_test_status(struct sys_device *dev, struct sysdev_attribute *attr, +static ssize_t sysfs_test_status(struct device *dev, struct device_attribute *attr, char *buf) { struct test_thread_data *td; @@ -342,8 +342,8 @@ static ssize_t sysfs_test_status(struct sys_device *dev, struct sysdev_attribute char *curr = buf; int i; - td = container_of(dev, struct test_thread_data, sysdev); - tsk = threads[td->sysdev.id]; + td = container_of(dev, struct test_thread_data, dev); + tsk = threads[td->dev.id]; spin_lock(&rttest_lock); @@ -360,28 +360,29 @@ static ssize_t sysfs_test_status(struct sys_device *dev, struct sysdev_attribute spin_unlock(&rttest_lock); curr += sprintf(curr, ", T: %p, R: %p\n", tsk, - mutexes[td->sysdev.id].owner); + mutexes[td->dev.id].owner); return curr - buf; } -static SYSDEV_ATTR(status, 0600, sysfs_test_status, NULL); -static SYSDEV_ATTR(command, 0600, NULL, sysfs_test_command); +static DEVICE_ATTR(status, 0600, sysfs_test_status, NULL); +static DEVICE_ATTR(command, 0600, NULL, sysfs_test_command); -static struct sysdev_class rttest_sysclass = { +static struct bus_type rttest_subsys = { .name = "rttest", + .dev_name = "rttest", }; static int init_test_thread(int id) { - thread_data[id].sysdev.cls = &rttest_sysclass; - thread_data[id].sysdev.id = id; + thread_data[id].dev.bus = &rttest_subsys; + thread_data[id].dev.id = id; threads[id] = kthread_run(test_func, &thread_data[id], "rt-test-%d", id); if (IS_ERR(threads[id])) return PTR_ERR(threads[id]); - return sysdev_register(&thread_data[id].sysdev); + return device_register(&thread_data[id].dev); } static int init_rttest(void) @@ -393,7 +394,7 @@ static int init_rttest(void) for (i = 0; i < MAX_RT_TEST_MUTEXES; i++) rt_mutex_init(&mutexes[i]); - ret = sysdev_class_register(&rttest_sysclass); + ret = subsys_system_register(&rttest_subsys, NULL); if (ret) return ret; @@ -401,10 +402,10 @@ static int init_rttest(void) ret = init_test_thread(i); if (ret) break; - ret = sysdev_create_file(&thread_data[i].sysdev, &attr_status); + ret = device_create_file(&thread_data[i].dev, &dev_attr_status); if (ret) break; - ret = sysdev_create_file(&thread_data[i].sysdev, &attr_command); + ret = device_create_file(&thread_data[i].dev, &dev_attr_command); if (ret) break; } diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 4dbfd04a2148..fd7b25e90079 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5176,7 +5176,7 @@ static void sd_free_ctl_entry(struct ctl_table **tablep) static void set_table_entry(struct ctl_table *entry, const char *procname, void *data, int maxlen, - mode_t mode, proc_handler *proc_handler) + umode_t mode, proc_handler *proc_handler) { entry->procname = procname; entry->data = data; @@ -6675,54 +6675,52 @@ static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt) } #ifdef CONFIG_SCHED_MC -static ssize_t sched_mc_power_savings_show(struct sysdev_class *class, - struct sysdev_class_attribute *attr, - char *page) +static ssize_t sched_mc_power_savings_show(struct device *dev, + struct device_attribute *attr, + char *buf) { - return sprintf(page, "%u\n", sched_mc_power_savings); + return sprintf(buf, "%u\n", sched_mc_power_savings); } -static ssize_t sched_mc_power_savings_store(struct sysdev_class *class, - struct sysdev_class_attribute *attr, +static ssize_t sched_mc_power_savings_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t count) { return sched_power_savings_store(buf, count, 0); } -static SYSDEV_CLASS_ATTR(sched_mc_power_savings, 0644, - sched_mc_power_savings_show, - sched_mc_power_savings_store); +static DEVICE_ATTR(sched_mc_power_savings, 0644, + sched_mc_power_savings_show, + sched_mc_power_savings_store); #endif #ifdef CONFIG_SCHED_SMT -static ssize_t sched_smt_power_savings_show(struct sysdev_class *dev, - struct sysdev_class_attribute *attr, - char *page) +static ssize_t sched_smt_power_savings_show(struct device *dev, + struct device_attribute *attr, + char *buf) { - return sprintf(page, "%u\n", sched_smt_power_savings); + return sprintf(buf, "%u\n", sched_smt_power_savings); } -static ssize_t sched_smt_power_savings_store(struct sysdev_class *dev, - struct sysdev_class_attribute *attr, +static ssize_t sched_smt_power_savings_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t count) { return sched_power_savings_store(buf, count, 1); } -static SYSDEV_CLASS_ATTR(sched_smt_power_savings, 0644, +static DEVICE_ATTR(sched_smt_power_savings, 0644, sched_smt_power_savings_show, sched_smt_power_savings_store); #endif -int __init sched_create_sysfs_power_savings_entries(struct sysdev_class *cls) +int __init sched_create_sysfs_power_savings_entries(struct device *dev) { int err = 0; #ifdef CONFIG_SCHED_SMT if (smt_capable()) - err = sysfs_create_file(&cls->kset.kobj, - &attr_sched_smt_power_savings.attr); + err = device_create_file(dev, &dev_attr_sched_smt_power_savings); #endif #ifdef CONFIG_SCHED_MC if (!err && mc_capable()) - err = sysfs_create_file(&cls->kset.kobj, - &attr_sched_mc_power_savings.attr); + err = device_create_file(dev, &dev_attr_sched_mc_power_savings); #endif return err; } @@ -7136,10 +7134,6 @@ void set_curr_task(int cpu, struct task_struct *p) #endif -#ifdef CONFIG_RT_GROUP_SCHED -#else /* !CONFIG_RT_GROUP_SCHED */ -#endif /* CONFIG_RT_GROUP_SCHED */ - #ifdef CONFIG_CGROUP_SCHED /* task_group_lock serializes the addition/removal of task groups */ static DEFINE_SPINLOCK(task_group_lock); @@ -7248,9 +7242,6 @@ void sched_move_task(struct task_struct *tsk) } #endif /* CONFIG_CGROUP_SCHED */ -#ifdef CONFIG_FAIR_GROUP_SCHED -#endif - #if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_CFS_BANDWIDTH) static unsigned long to_ratio(u64 period, u64 runtime) { @@ -7565,24 +7556,31 @@ cpu_cgroup_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp) sched_destroy_group(tg); } -static int -cpu_cgroup_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk) +static int cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, + struct cgroup_taskset *tset) { + struct task_struct *task; + + cgroup_taskset_for_each(task, cgrp, tset) { #ifdef CONFIG_RT_GROUP_SCHED - if (!sched_rt_can_attach(cgroup_tg(cgrp), tsk)) - return -EINVAL; + if (!sched_rt_can_attach(cgroup_tg(cgrp), task)) + return -EINVAL; #else - /* We don't support RT-tasks being in separate groups */ - if (tsk->sched_class != &fair_sched_class) - return -EINVAL; + /* We don't support RT-tasks being in separate groups */ + if (task->sched_class != &fair_sched_class) + return -EINVAL; #endif + } return 0; } -static void -cpu_cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) +static void cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, + struct cgroup_taskset *tset) { - sched_move_task(tsk); + struct task_struct *task; + + cgroup_taskset_for_each(task, cgrp, tset) + sched_move_task(task); } static void @@ -7917,8 +7915,8 @@ struct cgroup_subsys cpu_cgroup_subsys = { .name = "cpu", .create = cpu_cgroup_create, .destroy = cpu_cgroup_destroy, - .can_attach_task = cpu_cgroup_can_attach_task, - .attach_task = cpu_cgroup_attach_task, + .can_attach = cpu_cgroup_can_attach, + .attach = cpu_cgroup_attach, .exit = cpu_cgroup_exit, .populate = cpu_cgroup_populate, .subsys_id = cpu_cgroup_subsys_id, diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 8e42de9105f8..84adb2d66cbd 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3130,8 +3130,10 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) } #define LBF_ALL_PINNED 0x01 -#define LBF_NEED_BREAK 0x02 -#define LBF_ABORT 0x04 +#define LBF_NEED_BREAK 0x02 /* clears into HAD_BREAK */ +#define LBF_HAD_BREAK 0x04 +#define LBF_HAD_BREAKS 0x0C /* count HAD_BREAKs overflows into ABORT */ +#define LBF_ABORT 0x10 /* * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? @@ -4508,7 +4510,9 @@ redo: goto out_balanced; if (lb_flags & LBF_NEED_BREAK) { - lb_flags &= ~LBF_NEED_BREAK; + lb_flags += LBF_HAD_BREAK - LBF_NEED_BREAK; + if (lb_flags & LBF_ABORT) + goto out_balanced; goto redo; } diff --git a/kernel/signal.c b/kernel/signal.c index 56ce3a618b28..c73c4284160e 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -28,6 +28,7 @@ #include <linux/freezer.h> #include <linux/pid_namespace.h> #include <linux/nsproxy.h> +#include <linux/user_namespace.h> #define CREATE_TRACE_POINTS #include <trace/events/signal.h> @@ -1019,6 +1020,34 @@ static inline int legacy_queue(struct sigpending *signals, int sig) return (sig < SIGRTMIN) && sigismember(&signals->signal, sig); } +/* + * map the uid in struct cred into user namespace *ns + */ +static inline uid_t map_cred_ns(const struct cred *cred, + struct user_namespace *ns) +{ + return user_ns_map_uid(ns, cred, cred->uid); +} + +#ifdef CONFIG_USER_NS +static inline void userns_fixup_signal_uid(struct siginfo *info, struct task_struct *t) +{ + if (current_user_ns() == task_cred_xxx(t, user_ns)) + return; + + if (SI_FROMKERNEL(info)) + return; + + info->si_uid = user_ns_map_uid(task_cred_xxx(t, user_ns), + current_cred(), info->si_uid); +} +#else +static inline void userns_fixup_signal_uid(struct siginfo *info, struct task_struct *t) +{ + return; +} +#endif + static int __send_signal(int sig, struct siginfo *info, struct task_struct *t, int group, int from_ancestor_ns) { @@ -1088,6 +1117,9 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t, q->info.si_pid = 0; break; } + + userns_fixup_signal_uid(&q->info, t); + } else if (!is_si_special(info)) { if (sig >= SIGRTMIN && info->si_code != SI_USER) { /* @@ -1626,7 +1658,8 @@ bool do_notify_parent(struct task_struct *tsk, int sig) */ rcu_read_lock(); info.si_pid = task_pid_nr_ns(tsk, tsk->parent->nsproxy->pid_ns); - info.si_uid = __task_cred(tsk)->uid; + info.si_uid = map_cred_ns(__task_cred(tsk), + task_cred_xxx(tsk->parent, user_ns)); rcu_read_unlock(); info.si_utime = cputime_to_clock_t(tsk->utime + tsk->signal->utime); @@ -1709,7 +1742,8 @@ static void do_notify_parent_cldstop(struct task_struct *tsk, */ rcu_read_lock(); info.si_pid = task_pid_nr_ns(tsk, parent->nsproxy->pid_ns); - info.si_uid = __task_cred(tsk)->uid; + info.si_uid = map_cred_ns(__task_cred(tsk), + task_cred_xxx(parent, user_ns)); rcu_read_unlock(); info.si_utime = cputime_to_clock_t(tsk->utime); @@ -2125,8 +2159,11 @@ static int ptrace_signal(int signr, siginfo_t *info, info->si_signo = signr; info->si_errno = 0; info->si_code = SI_USER; + rcu_read_lock(); info->si_pid = task_pid_vnr(current->parent); - info->si_uid = task_uid(current->parent); + info->si_uid = map_cred_ns(__task_cred(current->parent), + current_user_ns()); + rcu_read_unlock(); } /* If the (new) signal is now blocked, requeue it. */ @@ -2318,6 +2355,27 @@ relock: return signr; } +/** + * block_sigmask - add @ka's signal mask to current->blocked + * @ka: action for @signr + * @signr: signal that has been successfully delivered + * + * This function should be called when a signal has succesfully been + * delivered. It adds the mask of signals for @ka to current->blocked + * so that they are blocked during the execution of the signal + * handler. In addition, @signr will be blocked unless %SA_NODEFER is + * set in @ka->sa.sa_flags. + */ +void block_sigmask(struct k_sigaction *ka, int signr) +{ + sigset_t blocked; + + sigorsets(&blocked, ¤t->blocked, &ka->sa.sa_mask); + if (!(ka->sa.sa_flags & SA_NODEFER)) + sigaddset(&blocked, signr); + set_current_blocked(&blocked); +} + /* * It could be that complete_signal() picked us to notify about the * group-wide signal. Other threads should be notified now to take @@ -2355,8 +2413,15 @@ void exit_signals(struct task_struct *tsk) int group_stop = 0; sigset_t unblocked; + /* + * @tsk is about to have PF_EXITING set - lock out users which + * expect stable threadgroup. + */ + threadgroup_change_begin(tsk); + if (thread_group_empty(tsk) || signal_group_exit(tsk->signal)) { tsk->flags |= PF_EXITING; + threadgroup_change_end(tsk); return; } @@ -2366,6 +2431,9 @@ void exit_signals(struct task_struct *tsk) * see wants_signal(), do_signal_stop(). */ tsk->flags |= PF_EXITING; + + threadgroup_change_end(tsk); + if (!signal_pending(tsk)) goto out; diff --git a/kernel/sys.c b/kernel/sys.c index ddf8155bf3f8..40701538fbd1 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1692,6 +1692,124 @@ SYSCALL_DEFINE1(umask, int, mask) return mask; } +#ifdef CONFIG_CHECKPOINT_RESTORE +static int prctl_set_mm(int opt, unsigned long addr, + unsigned long arg4, unsigned long arg5) +{ + unsigned long rlim = rlimit(RLIMIT_DATA); + unsigned long vm_req_flags; + unsigned long vm_bad_flags; + struct vm_area_struct *vma; + int error = 0; + struct mm_struct *mm = current->mm; + + if (arg4 | arg5) + return -EINVAL; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (addr >= TASK_SIZE) + return -EINVAL; + + down_read(&mm->mmap_sem); + vma = find_vma(mm, addr); + + if (opt != PR_SET_MM_START_BRK && opt != PR_SET_MM_BRK) { + /* It must be existing VMA */ + if (!vma || vma->vm_start > addr) + goto out; + } + + error = -EINVAL; + switch (opt) { + case PR_SET_MM_START_CODE: + case PR_SET_MM_END_CODE: + vm_req_flags = VM_READ | VM_EXEC; + vm_bad_flags = VM_WRITE | VM_MAYSHARE; + + if ((vma->vm_flags & vm_req_flags) != vm_req_flags || + (vma->vm_flags & vm_bad_flags)) + goto out; + + if (opt == PR_SET_MM_START_CODE) + mm->start_code = addr; + else + mm->end_code = addr; + break; + + case PR_SET_MM_START_DATA: + case PR_SET_MM_END_DATA: + vm_req_flags = VM_READ | VM_WRITE; + vm_bad_flags = VM_EXEC | VM_MAYSHARE; + + if ((vma->vm_flags & vm_req_flags) != vm_req_flags || + (vma->vm_flags & vm_bad_flags)) + goto out; + + if (opt == PR_SET_MM_START_DATA) + mm->start_data = addr; + else + mm->end_data = addr; + break; + + case PR_SET_MM_START_STACK: + +#ifdef CONFIG_STACK_GROWSUP + vm_req_flags = VM_READ | VM_WRITE | VM_GROWSUP; +#else + vm_req_flags = VM_READ | VM_WRITE | VM_GROWSDOWN; +#endif + if ((vma->vm_flags & vm_req_flags) != vm_req_flags) + goto out; + + mm->start_stack = addr; + break; + + case PR_SET_MM_START_BRK: + if (addr <= mm->end_data) + goto out; + + if (rlim < RLIM_INFINITY && + (mm->brk - addr) + + (mm->end_data - mm->start_data) > rlim) + goto out; + + mm->start_brk = addr; + break; + + case PR_SET_MM_BRK: + if (addr <= mm->end_data) + goto out; + + if (rlim < RLIM_INFINITY && + (addr - mm->start_brk) + + (mm->end_data - mm->start_data) > rlim) + goto out; + + mm->brk = addr; + break; + + default: + error = -EINVAL; + goto out; + } + + error = 0; + +out: + up_read(&mm->mmap_sem); + + return error; +} +#else /* CONFIG_CHECKPOINT_RESTORE */ +static int prctl_set_mm(int opt, unsigned long addr, + unsigned long arg4, unsigned long arg5) +{ + return -EINVAL; +} +#endif + SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, unsigned long, arg4, unsigned long, arg5) { @@ -1841,6 +1959,9 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, else error = PR_MCE_KILL_DEFAULT; break; + case PR_SET_MM: + error = prctl_set_mm(arg2, arg3, arg4, arg5); + break; default: error = -EINVAL; break; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index ae2719643854..f487f257e05e 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -803,6 +803,15 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, +#ifdef CONFIG_DEBUG_STACKOVERFLOW + { + .procname = "panic_on_stackoverflow", + .data = &sysctl_panic_on_stackoverflow, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif { .procname = "bootloader_type", .data = &bootloader_type, diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index b26c2228fe92..2cf9cc7aa103 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig @@ -25,7 +25,7 @@ config HIGH_RES_TIMERS config GENERIC_CLOCKEVENTS_BUILD bool default y - depends on GENERIC_CLOCKEVENTS || GENERIC_CLOCKEVENTS_MIGR + depends on GENERIC_CLOCKEVENTS config GENERIC_CLOCKEVENTS_MIN_ADJUST bool diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 1ecd6ba36d6c..9cd928f7a7c6 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -17,7 +17,6 @@ #include <linux/module.h> #include <linux/notifier.h> #include <linux/smp.h> -#include <linux/sysdev.h> #include "tick-internal.h" diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index d3ad022136e5..a45ca167ab24 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -23,8 +23,8 @@ * o Allow clocksource drivers to be unregistered */ +#include <linux/device.h> #include <linux/clocksource.h> -#include <linux/sysdev.h> #include <linux/init.h> #include <linux/module.h> #include <linux/sched.h> /* for spin_unlock_irq() using preempt_count() m68k */ @@ -796,8 +796,8 @@ EXPORT_SYMBOL(clocksource_unregister); * Provides sysfs interface for listing current clocksource. */ static ssize_t -sysfs_show_current_clocksources(struct sys_device *dev, - struct sysdev_attribute *attr, char *buf) +sysfs_show_current_clocksources(struct device *dev, + struct device_attribute *attr, char *buf) { ssize_t count = 0; @@ -818,8 +818,8 @@ sysfs_show_current_clocksources(struct sys_device *dev, * Takes input from sysfs interface for manually overriding the default * clocksource selection. */ -static ssize_t sysfs_override_clocksource(struct sys_device *dev, - struct sysdev_attribute *attr, +static ssize_t sysfs_override_clocksource(struct device *dev, + struct device_attribute *attr, const char *buf, size_t count) { size_t ret = count; @@ -853,8 +853,8 @@ static ssize_t sysfs_override_clocksource(struct sys_device *dev, * Provides sysfs interface for listing registered clocksources */ static ssize_t -sysfs_show_available_clocksources(struct sys_device *dev, - struct sysdev_attribute *attr, +sysfs_show_available_clocksources(struct device *dev, + struct device_attribute *attr, char *buf) { struct clocksource *src; @@ -883,35 +883,36 @@ sysfs_show_available_clocksources(struct sys_device *dev, /* * Sysfs setup bits: */ -static SYSDEV_ATTR(current_clocksource, 0644, sysfs_show_current_clocksources, +static DEVICE_ATTR(current_clocksource, 0644, sysfs_show_current_clocksources, sysfs_override_clocksource); -static SYSDEV_ATTR(available_clocksource, 0444, +static DEVICE_ATTR(available_clocksource, 0444, sysfs_show_available_clocksources, NULL); -static struct sysdev_class clocksource_sysclass = { +static struct bus_type clocksource_subsys = { .name = "clocksource", + .dev_name = "clocksource", }; -static struct sys_device device_clocksource = { +static struct device device_clocksource = { .id = 0, - .cls = &clocksource_sysclass, + .bus = &clocksource_subsys, }; static int __init init_clocksource_sysfs(void) { - int error = sysdev_class_register(&clocksource_sysclass); + int error = subsys_system_register(&clocksource_subsys, NULL); if (!error) - error = sysdev_register(&device_clocksource); + error = device_register(&device_clocksource); if (!error) - error = sysdev_create_file( + error = device_create_file( &device_clocksource, - &attr_current_clocksource); + &dev_attr_current_clocksource); if (!error) - error = sysdev_create_file( + error = device_create_file( &device_clocksource, - &attr_available_clocksource); + &dev_attr_available_clocksource); return error; } diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 16fc34a0806f..cdea7b56b0c9 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -402,7 +402,7 @@ static int blk_remove_buf_file_callback(struct dentry *dentry) static struct dentry *blk_create_buf_file_callback(const char *filename, struct dentry *parent, - int mode, + umode_t mode, struct rchan_buf *buf, int *is_global) { diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 91dc4bc8bf72..a3f1bc5d2a00 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4438,7 +4438,7 @@ static const struct file_operations trace_options_core_fops = { }; struct dentry *trace_create_file(const char *name, - mode_t mode, + umode_t mode, struct dentry *parent, void *data, const struct file_operations *fops) diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 2c2657462ac3..b93ecbadad6d 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -312,7 +312,7 @@ void tracing_reset_current(int cpu); void tracing_reset_current_online_cpus(void); int tracing_open_generic(struct inode *inode, struct file *filp); struct dentry *trace_create_file(const char *name, - mode_t mode, + umode_t mode, struct dentry *parent, void *data, const struct file_operations *fops); diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 42fa9ad0a810..bec7b5b53e03 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -242,10 +242,10 @@ struct workqueue_struct { int nr_drainers; /* W: drain in progress */ int saved_max_active; /* W: saved cwq max_active */ - const char *name; /* I: workqueue name */ #ifdef CONFIG_LOCKDEP struct lockdep_map lockdep_map; #endif + char name[]; /* I: workqueue name */ }; struct workqueue_struct *system_wq __read_mostly; @@ -2954,14 +2954,29 @@ static int wq_clamp_max_active(int max_active, unsigned int flags, return clamp_val(max_active, 1, lim); } -struct workqueue_struct *__alloc_workqueue_key(const char *name, +struct workqueue_struct *__alloc_workqueue_key(const char *fmt, unsigned int flags, int max_active, struct lock_class_key *key, - const char *lock_name) + const char *lock_name, ...) { + va_list args, args1; struct workqueue_struct *wq; unsigned int cpu; + size_t namelen; + + /* determine namelen, allocate wq and format name */ + va_start(args, lock_name); + va_copy(args1, args); + namelen = vsnprintf(NULL, 0, fmt, args) + 1; + + wq = kzalloc(sizeof(*wq) + namelen, GFP_KERNEL); + if (!wq) + goto err; + + vsnprintf(wq->name, namelen, fmt, args1); + va_end(args); + va_end(args1); /* * Workqueues which may be used during memory reclaim should @@ -2978,12 +2993,9 @@ struct workqueue_struct *__alloc_workqueue_key(const char *name, flags |= WQ_HIGHPRI; max_active = max_active ?: WQ_DFL_ACTIVE; - max_active = wq_clamp_max_active(max_active, flags, name); - - wq = kzalloc(sizeof(*wq), GFP_KERNEL); - if (!wq) - goto err; + max_active = wq_clamp_max_active(max_active, flags, wq->name); + /* init wq */ wq->flags = flags; wq->saved_max_active = max_active; mutex_init(&wq->flush_mutex); @@ -2991,7 +3003,6 @@ struct workqueue_struct *__alloc_workqueue_key(const char *name, INIT_LIST_HEAD(&wq->flusher_queue); INIT_LIST_HEAD(&wq->flusher_overflow); - wq->name = name; lockdep_init_map(&wq->lockdep_map, lock_name, key, 0); INIT_LIST_HEAD(&wq->list); @@ -3020,7 +3031,8 @@ struct workqueue_struct *__alloc_workqueue_key(const char *name, if (!rescuer) goto err; - rescuer->task = kthread_create(rescuer_thread, wq, "%s", name); + rescuer->task = kthread_create(rescuer_thread, wq, "%s", + wq->name); if (IS_ERR(rescuer->task)) goto err; |