Diffstat (limited to 'kernel')
51 files changed, 1968 insertions, 1647 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 6c072b6da239..05949c0510c5 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -127,11 +127,19 @@ $(obj)/config_data.h: $(obj)/config_data.gz FORCE
 $(obj)/time.o: $(obj)/timeconst.h
 
-quiet_cmd_timeconst = TIMEC   $@
-      cmd_timeconst = $(PERL) $< $(CONFIG_HZ) > $@
+quiet_cmd_hzfile = HZFILE  $@
+      cmd_hzfile = echo "hz=$(CONFIG_HZ)" > $@
+
+targets += hz.bc
+$(obj)/hz.bc: $(objtree)/include/config/hz.h FORCE
+	$(call if_changed,hzfile)
+
+quiet_cmd_bc = BC      $@
+      cmd_bc = bc -q $(filter-out FORCE,$^) > $@
+
 targets += timeconst.h
-$(obj)/timeconst.h: $(src)/timeconst.pl FORCE
-	$(call if_changed,timeconst)
+$(obj)/timeconst.h: $(obj)/hz.bc $(src)/timeconst.bc FORCE
+	$(call if_changed,bc)
 
 ifeq ($(CONFIG_MODULE_SIG),y)
 #
@@ -153,23 +161,7 @@ kernel/modsign_certificate.o: signing_key.x509 extra_certificates
 # fail and that the kernel may be used afterwards.
 #
 ###############################################################################
-sign_key_with_hash :=
-ifeq ($(CONFIG_MODULE_SIG_SHA1),y)
-sign_key_with_hash := -sha1
-endif
-ifeq ($(CONFIG_MODULE_SIG_SHA224),y)
-sign_key_with_hash := -sha224
-endif
-ifeq ($(CONFIG_MODULE_SIG_SHA256),y)
-sign_key_with_hash := -sha256
-endif
-ifeq ($(CONFIG_MODULE_SIG_SHA384),y)
-sign_key_with_hash := -sha384
-endif
-ifeq ($(CONFIG_MODULE_SIG_SHA512),y)
-sign_key_with_hash := -sha512
-endif
-ifeq ($(sign_key_with_hash),)
+ifndef CONFIG_MODULE_SIG_HASH
 $(error Could not determine digest type to use from kernel config)
 endif
@@ -182,8 +174,8 @@ signing_key.priv signing_key.x509: x509.genkey
 	@echo "### needs to be run as root, and uses a hardware random"
 	@echo "### number generator if one is available."
 	@echo "###"
-	openssl req -new -nodes -utf8 $(sign_key_with_hash) -days 36500 -batch \
-		-x509 -config x509.genkey \
+	openssl req -new -nodes -utf8 -$(CONFIG_MODULE_SIG_HASH) -days 36500 \
+		-batch -x509 -config x509.genkey \
 		-outform DER -out signing_key.x509 \
 		-keyout signing_key.priv
 	@echo "###"
diff --git a/kernel/acct.c b/kernel/acct.c
index e8b1627ab9c7..b9bd7f098ee5 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -205,7 +205,7 @@ static int acct_on(struct filename *pathname)
 	if (IS_ERR(file))
 		return PTR_ERR(file);
 
-	if (!S_ISREG(file->f_path.dentry->d_inode->i_mode)) {
+	if (!S_ISREG(file_inode(file)->i_mode)) {
 		filp_close(file, NULL);
 		return -EACCES;
 	}
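The acct.c hunk above, like several hunks in cgroup.c and compat.c further down, replaces the open-coded file->f_path.dentry->d_inode chain with the file_inode() accessor. As a rough sketch (not necessarily the kernel's exact definition), the helper simply centralizes that dereference behind one name:

/*
 * Sketch only: file_inode() hides how a struct file reaches its inode,
 * so callers no longer open-code the f_path.dentry->d_inode chain.
 */
static inline struct inode *file_inode(const struct file *f)
{
	return f->f_path.dentry->d_inode;	/* one plausible implementation */
}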
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 4855892798fd..fb2fb11fbb25 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -52,7 +52,7 @@
 #include <linux/module.h>
 #include <linux/delayacct.h>
 #include <linux/cgroupstats.h>
-#include <linux/hash.h>
+#include <linux/hashtable.h>
 #include <linux/namei.h>
 #include <linux/pid_namespace.h>
 #include <linux/idr.h>
@@ -376,22 +376,18 @@ static int css_set_count;
  * account cgroups in empty hierarchies.
 */
 #define CSS_SET_HASH_BITS	7
-#define CSS_SET_TABLE_SIZE	(1 << CSS_SET_HASH_BITS)
-static struct hlist_head css_set_table[CSS_SET_TABLE_SIZE];
+static DEFINE_HASHTABLE(css_set_table, CSS_SET_HASH_BITS);
 
-static struct hlist_head *css_set_hash(struct cgroup_subsys_state *css[])
+static unsigned long css_set_hash(struct cgroup_subsys_state *css[])
 {
 	int i;
-	int index;
-	unsigned long tmp = 0UL;
+	unsigned long key = 0UL;
 
 	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++)
-		tmp += (unsigned long)css[i];
-	tmp = (tmp >> 16) ^ tmp;
+		key += (unsigned long)css[i];
+	key = (key >> 16) ^ key;
 
-	index = hash_long(tmp, CSS_SET_HASH_BITS);
-
-	return &css_set_table[index];
+	return key;
 }
 
 /* We don't maintain the lists running through each css_set to its
@@ -418,7 +414,7 @@ static void __put_css_set(struct css_set *cg, int taskexit)
 	}
 
 	/* This css_set is dead. unlink it and release cgroup refcounts */
-	hlist_del(&cg->hlist);
+	hash_del(&cg->hlist);
 	css_set_count--;
 
 	list_for_each_entry_safe(link, saved_link, &cg->cg_links,
@@ -426,12 +422,20 @@ static void __put_css_set(struct css_set *cg, int taskexit)
 		struct cgroup *cgrp = link->cgrp;
 		list_del(&link->cg_link_list);
 		list_del(&link->cgrp_link_list);
+
+		/*
+		 * We may not be holding cgroup_mutex, and if cgrp->count is
+		 * dropped to 0 the cgroup can be destroyed at any time, hence
+		 * rcu_read_lock is used to keep it alive.
+		 */
+		rcu_read_lock();
 		if (atomic_dec_and_test(&cgrp->count) &&
 		    notify_on_release(cgrp)) {
 			if (taskexit)
 				set_bit(CGRP_RELEASABLE, &cgrp->flags);
 			check_for_release(cgrp);
 		}
+		rcu_read_unlock();
 
 		kfree(link);
 	}
@@ -550,9 +554,9 @@ static struct css_set *find_existing_css_set(
 {
 	int i;
 	struct cgroupfs_root *root = cgrp->root;
-	struct hlist_head *hhead;
 	struct hlist_node *node;
 	struct css_set *cg;
+	unsigned long key;
 
 	/*
 	 * Build the set of subsystem state objects that we want to see in the
@@ -572,8 +576,8 @@ static struct css_set *find_existing_css_set(
 		}
 	}
 
-	hhead = css_set_hash(template);
-	hlist_for_each_entry(cg, node, hhead, hlist) {
+	key = css_set_hash(template);
+	hash_for_each_possible(css_set_table, cg, node, hlist, key) {
 		if (!compare_css_sets(cg, oldcg, cgrp, template))
 			continue;
 
@@ -657,8 +661,8 @@ static struct css_set *find_css_set(
 	struct list_head tmp_cg_links;
 
-	struct hlist_head *hhead;
 	struct cg_cgroup_link *link;
+	unsigned long key;
 
 	/* First see if we already have a cgroup group that matches
 	 * the desired set */
@@ -704,8 +708,8 @@ static struct css_set *find_css_set(
 	css_set_count++;
 
 	/* Add this cgroup group to the hash table */
-	hhead = css_set_hash(res->subsys);
-	hlist_add_head(&res->hlist, hhead);
+	key = css_set_hash(res->subsys);
+	hash_add(css_set_table, &res->hlist, key);
 
 	write_unlock(&css_set_lock);
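The hunks above convert css_set_table from an open-coded bucket array to the hashtable.h API: css_set_hash() now returns only a key, and the table macros do the hashing and bucketing internally. A minimal sketch of that API as this era of the kernel used it (struct item is hypothetical; note the extra hlist_node cursor argument, which later kernels dropped):

#include <linux/hashtable.h>

struct item {
	unsigned long key;
	struct hlist_node node;
};

/* 2^7 = 128 buckets, mirroring CSS_SET_HASH_BITS above */
static DEFINE_HASHTABLE(table, 7);

static void item_insert(struct item *it)
{
	/* hash_add() hashes the key internally; callers just supply it */
	hash_add(table, &it->node, it->key);
}

static struct item *item_find(unsigned long key)
{
	struct item *it;
	struct hlist_node *pos;	/* cursor still required in this API vintage */

	hash_for_each_possible(table, it, pos, node, key)
		if (it->key == key)
			return it;
	return NULL;
}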
@@ -856,47 +860,54 @@ static struct inode *cgroup_new_inode(umode_t mode, struct super_block *sb)
 	return inode;
 }
 
-static void cgroup_diput(struct dentry *dentry, struct inode *inode)
+static void cgroup_free_fn(struct work_struct *work)
 {
-	/* is dentry a directory ? if so, kfree() associated cgroup */
-	if (S_ISDIR(inode->i_mode)) {
-		struct cgroup *cgrp = dentry->d_fsdata;
-		struct cgroup_subsys *ss;
-		BUG_ON(!(cgroup_is_removed(cgrp)));
-		/* It's possible for external users to be holding css
-		 * reference counts on a cgroup; css_put() needs to
-		 * be able to access the cgroup after decrementing
-		 * the reference count in order to know if it needs to
-		 * queue the cgroup to be handled by the release
-		 * agent */
-		synchronize_rcu();
+	struct cgroup *cgrp = container_of(work, struct cgroup, free_work);
+	struct cgroup_subsys *ss;
 
-		mutex_lock(&cgroup_mutex);
-		/*
-		 * Release the subsystem state objects.
-		 */
-		for_each_subsys(cgrp->root, ss)
-			ss->css_free(cgrp);
+	mutex_lock(&cgroup_mutex);
+	/*
+	 * Release the subsystem state objects.
+	 */
+	for_each_subsys(cgrp->root, ss)
+		ss->css_free(cgrp);
 
-		cgrp->root->number_of_cgroups--;
-		mutex_unlock(&cgroup_mutex);
+	cgrp->root->number_of_cgroups--;
+	mutex_unlock(&cgroup_mutex);
 
-		/*
-		 * Drop the active superblock reference that we took when we
-		 * created the cgroup
-		 */
-		deactivate_super(cgrp->root->sb);
+	/*
+	 * Drop the active superblock reference that we took when we
+	 * created the cgroup
+	 */
+	deactivate_super(cgrp->root->sb);
 
-		/*
-		 * if we're getting rid of the cgroup, refcount should ensure
-		 * that there are no pidlists left.
-		 */
-		BUG_ON(!list_empty(&cgrp->pidlists));
+	/*
+	 * if we're getting rid of the cgroup, refcount should ensure
+	 * that there are no pidlists left.
+	 */
+	BUG_ON(!list_empty(&cgrp->pidlists));
 
-		simple_xattrs_free(&cgrp->xattrs);
+	simple_xattrs_free(&cgrp->xattrs);
 
-		ida_simple_remove(&cgrp->root->cgroup_ida, cgrp->id);
-		kfree_rcu(cgrp, rcu_head);
+	ida_simple_remove(&cgrp->root->cgroup_ida, cgrp->id);
+	kfree(cgrp);
+}
+
+static void cgroup_free_rcu(struct rcu_head *head)
+{
+	struct cgroup *cgrp = container_of(head, struct cgroup, rcu_head);
+
+	schedule_work(&cgrp->free_work);
+}
+
+static void cgroup_diput(struct dentry *dentry, struct inode *inode)
+{
+	/* is dentry a directory ? if so, kfree() associated cgroup */
+	if (S_ISDIR(inode->i_mode)) {
+		struct cgroup *cgrp = dentry->d_fsdata;
+
+		BUG_ON(!(cgroup_is_removed(cgrp)));
+		call_rcu(&cgrp->rcu_head, cgroup_free_rcu);
 	} else {
 		struct cfent *cfe = __d_cfe(dentry);
 		struct cgroup *cgrp = dentry->d_parent->d_fsdata;
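The cgroup_diput() rework above replaces a blocking synchronize_rcu() in the dput path with a two-stage teardown: call_rcu() waits out the grace period, and the RCU callback, which runs in atomic context, bounces the sleeping cleanup to a workqueue. A minimal sketch of the pattern with a hypothetical object (free_work is assumed to have been set up with INIT_WORK() at creation time, as the diff does in init_cgroup_housekeeping()):

#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/workqueue.h>

/* hypothetical object embedding both deferral handles */
struct obj {
	struct rcu_head rcu_head;
	struct work_struct free_work;
};

static void obj_free_fn(struct work_struct *work)
{
	struct obj *o = container_of(work, struct obj, free_work);

	/* may sleep here: take mutexes, free xattrs, drop superblock refs */
	kfree(o);
}

static void obj_free_rcu(struct rcu_head *head)
{
	struct obj *o = container_of(head, struct obj, rcu_head);

	/* RCU callbacks run in softirq context; defer the sleeping part */
	schedule_work(&o->free_work);
}

static void obj_release(struct obj *o)
{
	/* returns immediately; grace period and cleanup happen later */
	call_rcu(&o->rcu_head, obj_free_rcu);
}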
@@ -925,13 +936,17 @@ static void remove_dir(struct dentry *d)
 	dput(parent);
 }
 
-static int cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
+static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
 {
 	struct cfent *cfe;
 
 	lockdep_assert_held(&cgrp->dentry->d_inode->i_mutex);
 	lockdep_assert_held(&cgroup_mutex);
 
+	/*
+	 * If we're doing cleanup due to failure of cgroup_create(),
+	 * the corresponding @cfe may not exist.
+	 */
 	list_for_each_entry(cfe, &cgrp->files, node) {
 		struct dentry *d = cfe->dentry;
 
@@ -944,9 +959,8 @@ static int cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
 		list_del_init(&cfe->node);
 		dput(d);
 
-		return 0;
+		break;
 	}
-	return -ENOENT;
 }
 
 /**
@@ -1083,7 +1097,6 @@ static int rebind_subsystems(struct cgroupfs_root *root,
 		}
 	}
 	root->subsys_mask = root->actual_subsys_mask = final_subsys_mask;
-	synchronize_rcu();
 
 	return 0;
 }
@@ -1393,6 +1406,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
 	INIT_LIST_HEAD(&cgrp->allcg_node);
 	INIT_LIST_HEAD(&cgrp->release_list);
 	INIT_LIST_HEAD(&cgrp->pidlists);
+	INIT_WORK(&cgrp->free_work, cgroup_free_fn);
 	mutex_init(&cgrp->pidlist_mutex);
 	INIT_LIST_HEAD(&cgrp->event_list);
 	spin_lock_init(&cgrp->event_list_lock);
@@ -1597,6 +1611,8 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
 		struct cgroupfs_root *existing_root;
 		const struct cred *cred;
 		int i;
+		struct hlist_node *node;
+		struct css_set *cg;
 
 		BUG_ON(sb->s_root != NULL);
 
@@ -1650,14 +1666,8 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
 		/* Link the top cgroup in this hierarchy into all
 		 * the css_set objects */
 		write_lock(&css_set_lock);
-		for (i = 0; i < CSS_SET_TABLE_SIZE; i++) {
-			struct hlist_head *hhead = &css_set_table[i];
-			struct hlist_node *node;
-			struct css_set *cg;
-
-			hlist_for_each_entry(cg, node, hhead, hlist)
-				link_css_set(&tmp_cg_links, cg, root_cgrp);
-		}
+		hash_for_each(css_set_table, i, node, cg, hlist)
+			link_css_set(&tmp_cg_links, cg, root_cgrp);
 		write_unlock(&css_set_lock);
 
 		free_cg_links(&tmp_cg_links);
@@ -1773,7 +1783,7 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
 	rcu_lockdep_assert(rcu_read_lock_held() || cgroup_lock_is_held(),
 			   "cgroup_path() called without proper locking");
 
-	if (!dentry || cgrp == dummytop) {
+	if (cgrp == dummytop) {
 		/*
 		 * Inactive subsystems have no dentry for their root
 		 * cgroup
@@ -1982,7 +1992,6 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
 			ss->attach(cgrp, &tset);
 	}
 
-	synchronize_rcu();
 out:
 	if (retval) {
 		for_each_subsys(root, ss) {
@@ -2151,7 +2160,6 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
 	/*
 	 * step 5: success! and cleanup
 	 */
-	synchronize_rcu();
 	retval = 0;
 out_put_css_set_refs:
 	if (retval) {
@@ -2637,7 +2645,7 @@ static struct dentry *cgroup_lookup(struct inode *dir, struct dentry *dentry, un
 */
 static inline struct cftype *__file_cft(struct file *file)
 {
-	if (file->f_dentry->d_inode->i_fop != &cgroup_file_operations)
+	if (file_inode(file)->i_fop != &cgroup_file_operations)
 		return ERR_PTR(-EINVAL);
 	return __d_cft(file->f_dentry);
 }
@@ -2769,14 +2777,14 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys,
 		if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgrp->parent)
 			continue;
 
-		if (is_add)
+		if (is_add) {
 			err = cgroup_add_file(cgrp, subsys, cft);
-		else
-			err = cgroup_rm_file(cgrp, cft);
-		if (err) {
-			pr_warning("cgroup_addrm_files: failed to %s %s, err=%d\n",
-				   is_add ? "add" : "remove", cft->name, err);
+			if (err)
+				pr_warn("cgroup_addrm_files: failed to add %s, err=%d\n",
+					cft->name, err);
 			ret = err;
+		} else {
+			cgroup_rm_file(cgrp, cft);
 		}
 	}
 	return ret;
@@ -3017,6 +3025,32 @@ struct cgroup *cgroup_next_descendant_pre(struct cgroup *pos,
 }
 EXPORT_SYMBOL_GPL(cgroup_next_descendant_pre);
 
+/**
+ * cgroup_rightmost_descendant - return the rightmost descendant of a cgroup
+ * @pos: cgroup of interest
+ *
+ * Return the rightmost descendant of @pos.  If there's no descendant,
+ * @pos is returned.  This can be used during pre-order traversal to skip
+ * subtree of @pos.
+ */
+struct cgroup *cgroup_rightmost_descendant(struct cgroup *pos)
+{
+	struct cgroup *last, *tmp;
+
+	WARN_ON_ONCE(!rcu_read_lock_held());
+
+	do {
+		last = pos;
+		/* ->prev isn't RCU safe, walk ->next till the end */
+		pos = NULL;
+		list_for_each_entry_rcu(tmp, &last->children, sibling)
+			pos = tmp;
+	} while (pos);
+
+	return last;
+}
+EXPORT_SYMBOL_GPL(cgroup_rightmost_descendant);
+
 static struct cgroup *cgroup_leftmost_descendant(struct cgroup *pos)
 {
 	struct cgroup *last;
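cgroup_rightmost_descendant() repeatedly scans each child list to its end because, as the comment notes, only forward traversal is safe under RCU; it cannot simply follow ->prev pointers. A runnable userspace analogue (toy tree with array children in place of RCU lists) showing the shape of the loop:

#include <stdio.h>

/* toy tree: each node has a NULL-terminated array of children,
 * standing in for the ->children list the kernel code walks */
struct cg {
	const char *name;
	struct cg *child[4];
};

static struct cg *rightmost_descendant(struct cg *pos)
{
	struct cg *last;

	do {
		last = pos;
		pos = NULL;
		/* walk forward through the siblings; the last one wins */
		for (int i = 0; i < 4 && last->child[i]; i++)
			pos = last->child[i];
	} while (pos);

	return last;
}

int main(void)
{
	struct cg d = { "d" }, c = { "c", { &d } };
	struct cg b = { "b" }, a = { "a", { &b, &c } };

	printf("%s\n", rightmost_descendant(&a)->name);	/* prints d */
	return 0;
}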
@@ -3752,8 +3786,13 @@ static void cgroup_event_remove(struct work_struct *work)
 					       remove);
 	struct cgroup *cgrp = event->cgrp;
 
+	remove_wait_queue(event->wqh, &event->wait);
+
 	event->cft->unregister_event(cgrp, event->cft, event->eventfd);
 
+	/* Notify userspace the event is going away. */
+	eventfd_signal(event->eventfd, 1);
+
 	eventfd_ctx_put(event->eventfd);
 	kfree(event);
 	dput(cgrp->dentry);
@@ -3773,15 +3812,25 @@ static int cgroup_event_wake(wait_queue_t *wait, unsigned mode,
 	unsigned long flags = (unsigned long)key;
 
 	if (flags & POLLHUP) {
-		__remove_wait_queue(event->wqh, &event->wait);
-		spin_lock(&cgrp->event_list_lock);
-		list_del_init(&event->list);
-		spin_unlock(&cgrp->event_list_lock);
 		/*
-		 * We are in atomic context, but cgroup_event_remove() may
-		 * sleep, so we have to call it in workqueue.
+		 * If the event has been detached at cgroup removal, we
+		 * can simply return knowing the other side will cleanup
+		 * for us.
+		 *
+		 * We can't race against event freeing since the other
+		 * side will require wqh->lock via remove_wait_queue(),
+		 * which we hold.
 		 */
-		schedule_work(&event->remove);
+		spin_lock(&cgrp->event_list_lock);
+		if (!list_empty(&event->list)) {
+			list_del_init(&event->list);
+			/*
+			 * We are in atomic context, but cgroup_event_remove()
+			 * may sleep, so we have to call it in workqueue.
+			 */
+			schedule_work(&event->remove);
+		}
+		spin_unlock(&cgrp->event_list_lock);
 	}
 
 	return 0;
@@ -3807,6 +3856,7 @@ static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft,
 				      const char *buffer)
 {
 	struct cgroup_event *event = NULL;
+	struct cgroup *cgrp_cfile;
 	unsigned int efd, cfd;
 	struct file *efile = NULL;
 	struct file *cfile = NULL;
@@ -3852,7 +3902,7 @@ static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft,
 
 	/* the process need read permission on control file */
 	/* AV: shouldn't we check that it's been opened for read instead? */
-	ret = inode_permission(cfile->f_path.dentry->d_inode, MAY_READ);
+	ret = inode_permission(file_inode(cfile), MAY_READ);
 	if (ret < 0)
 		goto fail;
 
@@ -3862,6 +3912,16 @@ static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft,
 		goto fail;
 	}
 
+	/*
+	 * The file to be monitored must be in the same cgroup as
+	 * cgroup.event_control is.
+	 */
+	cgrp_cfile = __d_cgrp(cfile->f_dentry->d_parent);
+	if (cgrp_cfile != cgrp) {
+		ret = -EINVAL;
+		goto fail;
+	}
+
 	if (!event->cft->register_event || !event->cft->unregister_event) {
 		ret = -EINVAL;
 		goto fail;
 	}
@@ -4135,6 +4195,9 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
 
 	init_cgroup_housekeeping(cgrp);
 
+	dentry->d_fsdata = cgrp;
+	cgrp->dentry = dentry;
+
 	cgrp->parent = parent;
 	cgrp->root = parent->root;
 	cgrp->top_cgroup = parent->top_cgroup;
@@ -4172,8 +4235,6 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
 	lockdep_assert_held(&dentry->d_inode->i_mutex);
 
 	/* allocation complete, commit to creation */
-	dentry->d_fsdata = cgrp;
-	cgrp->dentry = dentry;
 	list_add_tail(&cgrp->allcg_node, &root->allcg_list);
 	list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children);
 	root->number_of_cgroups++;
@@ -4340,20 +4401,14 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
 	/*
 	 * Unregister events and notify userspace.
 	 * Notify userspace about cgroup removing only after rmdir of cgroup
-	 * directory to avoid race between userspace and kernelspace. Use
-	 * a temporary list to avoid a deadlock with cgroup_event_wake(). Since
-	 * cgroup_event_wake() is called with the wait queue head locked,
-	 * remove_wait_queue() cannot be called while holding event_list_lock.
+	 * directory to avoid race between userspace and kernelspace.
 	 */
 	spin_lock(&cgrp->event_list_lock);
-	list_splice_init(&cgrp->event_list, &tmp_list);
-	spin_unlock(&cgrp->event_list_lock);
-	list_for_each_entry_safe(event, tmp, &tmp_list, list) {
+	list_for_each_entry_safe(event, tmp, &cgrp->event_list, list) {
 		list_del_init(&event->list);
-		remove_wait_queue(event->wqh, &event->wait);
-		eventfd_signal(event->eventfd, 1);
 		schedule_work(&event->remove);
 	}
+	spin_unlock(&cgrp->event_list_lock);
 
 	return 0;
 }
@@ -4438,6 +4493,9 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
 {
 	struct cgroup_subsys_state *css;
 	int i, ret;
+	struct hlist_node *node, *tmp;
+	struct css_set *cg;
+	unsigned long key;
 
 	/* check name and function validity */
 	if (ss->name == NULL || strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN ||
@@ -4503,23 +4561,17 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
 	 * this is all done under the css_set_lock.
 	 */
 	write_lock(&css_set_lock);
-	for (i = 0; i < CSS_SET_TABLE_SIZE; i++) {
-		struct css_set *cg;
-		struct hlist_node *node, *tmp;
-		struct hlist_head *bucket = &css_set_table[i], *new_bucket;
-
-		hlist_for_each_entry_safe(cg, node, tmp, bucket, hlist) {
-			/* skip entries that we already rehashed */
-			if (cg->subsys[ss->subsys_id])
-				continue;
-			/* remove existing entry */
-			hlist_del(&cg->hlist);
-			/* set new value */
-			cg->subsys[ss->subsys_id] = css;
-			/* recompute hash and restore entry */
-			new_bucket = css_set_hash(cg->subsys);
-			hlist_add_head(&cg->hlist, new_bucket);
-		}
+	hash_for_each_safe(css_set_table, i, node, tmp, cg, hlist) {
+		/* skip entries that we already rehashed */
+		if (cg->subsys[ss->subsys_id])
+			continue;
+		/* remove existing entry */
+		hash_del(&cg->hlist);
+		/* set new value */
+		cg->subsys[ss->subsys_id] = css;
+		/* recompute hash and restore entry */
+		key = css_set_hash(cg->subsys);
+		hash_add(css_set_table, node, key);
 	}
 	write_unlock(&css_set_lock);
 
@@ -4551,7 +4603,6 @@ EXPORT_SYMBOL_GPL(cgroup_load_subsys);
 void cgroup_unload_subsys(struct cgroup_subsys *ss)
 {
 	struct cg_cgroup_link *link;
-	struct hlist_head *hhead;
 
 	BUG_ON(ss->module == NULL);
 
@@ -4585,11 +4636,12 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss)
 	write_lock(&css_set_lock);
 	list_for_each_entry(link, &dummytop->css_sets, cgrp_link_list) {
 		struct css_set *cg = link->cg;
+		unsigned long key;
 
-		hlist_del(&cg->hlist);
+		hash_del(&cg->hlist);
 		cg->subsys[ss->subsys_id] = NULL;
-		hhead = css_set_hash(cg->subsys);
-		hlist_add_head(&cg->hlist, hhead);
+		key = css_set_hash(cg->subsys);
+		hash_add(css_set_table, &cg->hlist, key);
 	}
 	write_unlock(&css_set_lock);
 
@@ -4631,9 +4683,6 @@ int __init cgroup_init_early(void)
 	list_add(&init_css_set_link.cg_link_list,
 		 &init_css_set.cg_links);
 
-	for (i = 0; i < CSS_SET_TABLE_SIZE; i++)
-		INIT_HLIST_HEAD(&css_set_table[i]);
-
 	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
 		struct cgroup_subsys *ss = subsys[i];
 
@@ -4667,7 +4716,7 @@ int __init cgroup_init(void)
 {
 	int err;
 	int i;
-	struct hlist_head *hhead;
+	unsigned long key;
 
 	err = bdi_init(&cgroup_backing_dev_info);
 	if (err)
@@ -4686,8 +4735,8 @@ int __init cgroup_init(void)
 	}
 
 	/* Add init_css_set to the hash table */
-	hhead = css_set_hash(init_css_set.subsys);
-	hlist_add_head(&init_css_set.hlist, hhead);
+	key = css_set_hash(init_css_set.subsys);
+	hash_add(css_set_table, &init_css_set.hlist, key);
 	BUG_ON(!init_root_id(&rootnode));
 
 	cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj);
@@ -4982,8 +5031,7 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
 	}
 	task_unlock(tsk);
 
-	if (cg)
-		put_css_set_taskexit(cg);
+	put_css_set_taskexit(cg);
 }
 
 /**
@@ -5441,7 +5489,7 @@ struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id)
 	struct inode *inode;
 	struct cgroup_subsys_state *css;
 
-	inode = f->f_dentry->d_inode;
+	inode = file_inode(f);
 	/* check in cgroup filesystem dir */
 	if (inode->i_op != &cgroup_dir_inode_operations)
 		return ERR_PTR(-EBADF);
diff --git a/kernel/compat.c b/kernel/compat.c
index 36700e9e2be9..19971d8c7299 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -290,8 +290,8 @@ static inline long put_compat_itimerval(struct compat_itimerval __user *o,
 		 __put_user(i->it_value.tv_usec, &o->it_value.tv_usec)));
 }
 
-asmlinkage long compat_sys_getitimer(int which,
-		struct compat_itimerval __user *it)
+COMPAT_SYSCALL_DEFINE2(getitimer, int, which,
+		struct compat_itimerval __user *, it)
 {
 	struct itimerval kit;
 	int error;
@@ -302,9 +302,9 @@ asmlinkage long compat_sys_getitimer(int which,
 	return error;
 }
 
-asmlinkage long compat_sys_setitimer(int which,
-		struct compat_itimerval __user *in,
-		struct compat_itimerval __user *out)
+COMPAT_SYSCALL_DEFINE3(setitimer, int, which,
+		struct compat_itimerval __user *, in,
+		struct compat_itimerval __user *, out)
 {
 	struct itimerval kin, kout;
 	int error;
@@ -381,9 +381,9 @@ static inline void compat_sig_setmask(sigset_t *blocked, compat_sigset_word set)
 	memcpy(blocked->sig, &set, sizeof(set));
 }
 
-asmlinkage long compat_sys_sigprocmask(int how,
-				       compat_old_sigset_t __user *nset,
-				       compat_old_sigset_t __user *oset)
+COMPAT_SYSCALL_DEFINE3(sigprocmask, int, how,
+		       compat_old_sigset_t __user *, nset,
+		       compat_old_sigset_t __user *, oset)
 {
 	old_sigset_t old_set, new_set;
 	sigset_t new_blocked;
@@ -593,7 +593,7 @@ COMPAT_SYSCALL_DEFINE5(waitid,
 		else
 			ret = put_compat_rusage(&ru, uru);
 		if (ret)
-			return ret;
+			return -EFAULT;
 	}
 
 	BUG_ON(info.si_code & __SI_MASK);
@@ -971,7 +971,7 @@ long compat_put_bitmap(compat_ulong_t __user *umask, unsigned long *mask,
 }
 
 void
-sigset_from_compat (sigset_t *set, compat_sigset_t *compat)
+sigset_from_compat(sigset_t *set, const compat_sigset_t *compat)
 {
 	switch (_NSIG_WORDS) {
 	case 4: set->sig[3] = compat->sig[6] | (((long)compat->sig[7]) << 32 );
@@ -982,10 +982,20 @@ sigset_from_compat (sigset_t *set, compat_sigset_t *compat)
 }
 EXPORT_SYMBOL_GPL(sigset_from_compat);
 
-asmlinkage long
-compat_sys_rt_sigtimedwait (compat_sigset_t __user *uthese,
-		struct compat_siginfo __user *uinfo,
-		struct compat_timespec __user *uts, compat_size_t sigsetsize)
+void
+sigset_to_compat(compat_sigset_t *compat, const sigset_t *set)
+{
+	switch (_NSIG_WORDS) {
+	case 4: compat->sig[7] = (set->sig[3] >> 32); compat->sig[6] = set->sig[3];
+	case 3: compat->sig[5] = (set->sig[2] >> 32); compat->sig[4] = set->sig[2];
+	case 2: compat->sig[3] = (set->sig[1] >> 32); compat->sig[2] = set->sig[1];
+	case 1: compat->sig[1] = (set->sig[0] >> 32); compat->sig[0] = set->sig[0];
+	}
+}
+
+COMPAT_SYSCALL_DEFINE4(rt_sigtimedwait, compat_sigset_t __user *, uthese,
+		struct compat_siginfo __user *, uinfo,
+		struct compat_timespec __user *, uts, compat_size_t, sigsetsize)
 {
 	compat_sigset_t s32;
 	sigset_t s;
@@ -1013,18 +1023,6 @@ compat_sys_rt_sigtimedwait (compat_sigset_t __user *uthese,
 	}
 
 	return ret;
-
-}
-
-asmlinkage long
-compat_sys_rt_tgsigqueueinfo(compat_pid_t tgid, compat_pid_t pid, int sig,
-			     struct compat_siginfo __user *uinfo)
-{
-	siginfo_t info;
-
-	if (copy_siginfo_from_user32(&info, uinfo))
-		return -EFAULT;
-	return do_rt_tgsigqueueinfo(tgid, pid, sig, &info);
 }
 
 #ifdef __ARCH_WANT_COMPAT_SYS_TIME
@@ -1067,23 +1065,6 @@ asmlinkage long compat_sys_stime(compat_time_t __user *tptr)
 
 #endif /* __ARCH_WANT_COMPAT_SYS_TIME */
 
-#ifdef __ARCH_WANT_COMPAT_SYS_RT_SIGSUSPEND
-asmlinkage long compat_sys_rt_sigsuspend(compat_sigset_t __user *unewset, compat_size_t sigsetsize)
-{
-	sigset_t newset;
-	compat_sigset_t newset32;
-
-	/* XXX: Don't preclude handling different sized sigset_t's. */
-	if (sigsetsize != sizeof(sigset_t))
-		return -EINVAL;
-
-	if (copy_from_user(&newset32, unewset, sizeof(compat_sigset_t)))
-		return -EFAULT;
-	sigset_from_compat(&newset, &newset32);
-	return sigsuspend(&newset);
-}
-#endif /* __ARCH_WANT_COMPAT_SYS_RT_SIGSUSPEND */
-
 asmlinkage long compat_sys_adjtimex(struct compat_timex __user *utp)
 {
 	struct timex txc;
@@ -1222,9 +1203,9 @@ compat_sys_sysinfo(struct compat_sysinfo __user *info)
 	return 0;
 }
 
-#ifdef __ARCH_WANT_COMPAT_SYS_SCHED_RR_GET_INTERVAL
-asmlinkage long compat_sys_sched_rr_get_interval(compat_pid_t pid,
-		struct compat_timespec __user *interval)
+COMPAT_SYSCALL_DEFINE2(sched_rr_get_interval,
+		       compat_pid_t, pid,
+		       struct compat_timespec __user *, interval)
 {
 	struct timespec t;
 	int ret;
@@ -1237,7 +1218,6 @@ asmlinkage long compat_sys_sched_rr_get_interval(compat_pid_t pid,
 		return -EFAULT;
 	return ret;
 }
-#endif /* __ARCH_WANT_COMPAT_SYS_SCHED_RR_GET_INTERVAL */
 
 /*
  * Allocate user-space memory for the duration of a single system call,
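The new sigset_to_compat() above is the inverse of sigset_from_compat(): each native unsigned long in the signal set is split into two 32-bit compat words, low half first, and the deliberate case fall-through makes larger _NSIG_WORDS configurations convert every word. A runnable userspace sketch of the same splitting (WORDS and the fixed-width types are stand-ins for the kernel's configuration and typedefs):

#include <assert.h>
#include <stdint.h>

#define WORDS 1	/* plays the role of _NSIG_WORDS on a 64-bit arch */

/* split each 64-bit word into two 32-bit halves, low half first */
static void to_compat(uint32_t *compat, const uint64_t *set)
{
	switch (WORDS) {
	case 1:
		compat[1] = (uint32_t)(set[0] >> 32);
		compat[0] = (uint32_t)set[0];
	}
}

int main(void)
{
	uint64_t set[1] = { 0x1122334455667788ULL };
	uint32_t compat[2];

	to_compat(compat, set);
	assert(compat[0] == 0x55667788 && compat[1] == 0x11223344);
	return 0;
}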
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 7bb63eea6eb8..4f9dfe43ecbd 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -61,14 +61,6 @@
 #include <linux/cgroup.h>
 
 /*
- * Workqueue for cpuset related tasks.
- *
- * Using kevent workqueue may cause deadlock when memory_migrate
- * is set. So we create a separate workqueue thread for cpuset.
- */
-static struct workqueue_struct *cpuset_wq;
-
-/*
  * Tracks how many cpusets are currently defined in system.
  * When there is only one cpuset (the root cpuset) we can
  * short circuit some hooks.
@@ -95,18 +87,21 @@ struct cpuset {
 	cpumask_var_t cpus_allowed;	/* CPUs allowed to tasks in cpuset */
 	nodemask_t mems_allowed;	/* Memory Nodes allowed to tasks */
 
-	struct cpuset *parent;		/* my parent */
-
 	struct fmeter fmeter;		/* memory_pressure filter */
 
+	/*
+	 * Tasks are being attached to this cpuset.  Used to prevent
+	 * zeroing cpus/mems_allowed between ->can_attach() and ->attach().
+	 */
+	int attach_in_progress;
+
 	/* partition number for rebuild_sched_domains() */
 	int pn;
 
 	/* for custom sched domain */
 	int relax_domain_level;
 
-	/* used for walking a cpuset hierarchy */
-	struct list_head stack_list;
+	struct work_struct hotplug_work;
 };
 
 /* Retrieve the cpuset for a cgroup */
@@ -123,6 +118,15 @@ static inline struct cpuset *task_cs(struct task_struct *task)
 			    struct cpuset, css);
 }
 
+static inline struct cpuset *parent_cs(const struct cpuset *cs)
+{
+	struct cgroup *pcgrp = cs->css.cgroup->parent;
+
+	if (pcgrp)
+		return cgroup_cs(pcgrp);
+	return NULL;
+}
+
 #ifdef CONFIG_NUMA
 static inline bool task_has_mempolicy(struct task_struct *task)
 {
@@ -138,6 +142,7 @@ static inline bool task_has_mempolicy(struct task_struct *task)
 
 /* bits in struct cpuset flags field */
 typedef enum {
+	CS_ONLINE,
 	CS_CPU_EXCLUSIVE,
 	CS_MEM_EXCLUSIVE,
 	CS_MEM_HARDWALL,
@@ -147,13 +152,12 @@ typedef enum {
 	CS_SPREAD_SLAB,
 } cpuset_flagbits_t;
 
-/* the type of hotplug event */
-enum hotplug_event {
-	CPUSET_CPU_OFFLINE,
-	CPUSET_MEM_OFFLINE,
-};
-
 /* convenient tests for these bits */
+static inline bool is_cpuset_online(const struct cpuset *cs)
+{
+	return test_bit(CS_ONLINE, &cs->flags);
+}
+
 static inline int is_cpu_exclusive(const struct cpuset *cs)
 {
 	return test_bit(CS_CPU_EXCLUSIVE, &cs->flags);
@@ -190,27 +194,52 @@ static inline int is_spread_slab(const struct cpuset *cs)
 }
 
 static struct cpuset top_cpuset = {
-	.flags = ((1 << CS_CPU_EXCLUSIVE) | (1 << CS_MEM_EXCLUSIVE)),
+	.flags = ((1 << CS_ONLINE) | (1 << CS_CPU_EXCLUSIVE) |
+		  (1 << CS_MEM_EXCLUSIVE)),
 };
 
+/**
+ * cpuset_for_each_child - traverse online children of a cpuset
+ * @child_cs: loop cursor pointing to the current child
+ * @pos_cgrp: used for iteration
+ * @parent_cs: target cpuset to walk children of
+ *
+ * Walk @child_cs through the online children of @parent_cs.  Must be used
+ * with RCU read locked.
+ */
+#define cpuset_for_each_child(child_cs, pos_cgrp, parent_cs)		\
+	cgroup_for_each_child((pos_cgrp), (parent_cs)->css.cgroup)	\
+		if (is_cpuset_online(((child_cs) = cgroup_cs((pos_cgrp)))))
+
+/**
+ * cpuset_for_each_descendant_pre - pre-order walk of a cpuset's descendants
+ * @des_cs: loop cursor pointing to the current descendant
+ * @pos_cgrp: used for iteration
+ * @root_cs: target cpuset to walk ancestor of
+ *
+ * Walk @des_cs through the online descendants of @root_cs.  Must be used
+ * with RCU read locked.  The caller may modify @pos_cgrp by calling
+ * cgroup_rightmost_descendant() to skip subtree.
+ */
+#define cpuset_for_each_descendant_pre(des_cs, pos_cgrp, root_cs)	\
+	cgroup_for_each_descendant_pre((pos_cgrp), (root_cs)->css.cgroup) \
+		if (is_cpuset_online(((des_cs) = cgroup_cs((pos_cgrp)))))
+
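The two macros above layer an online-only filter onto cgroup's iterators by appending an if to the underlying for loop, so the whole macro still behaves like a single statement that callers follow with a body. A runnable userspace sketch of the pattern (the struct, macros and data are hypothetical):

#include <stdio.h>

struct item { int online; const char *name; };

#define for_each_item(pos, arr, n)					\
	for ((pos) = (arr); (pos) < (arr) + (n); (pos)++)

/* the trailing if filters entries without breaking loop syntax,
 * mirroring how cpuset_for_each_child() wraps cgroup_for_each_child() */
#define for_each_online_item(pos, arr, n)				\
	for_each_item(pos, arr, n)					\
		if ((pos)->online)

int main(void)
{
	struct item items[] = { { 1, "a" }, { 0, "b" }, { 1, "c" } };
	struct item *pos;

	for_each_online_item(pos, items, 3)
		printf("%s\n", pos->name);	/* prints a and c */
	return 0;
}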
It can - * also allocate memory while just holding cgroup_mutex. While it is - * performing these checks, various callback routines can briefly - * acquire callback_mutex to query cpusets. Once it is ready to make - * the changes, it takes callback_mutex, blocking everyone else. + * There are two global mutexes guarding cpuset structures - cpuset_mutex + * and callback_mutex. The latter may nest inside the former. We also + * require taking task_lock() when dereferencing a task's cpuset pointer. + * See "The task_lock() exception", at the end of this comment. + * + * A task must hold both mutexes to modify cpusets. If a task holds + * cpuset_mutex, then it blocks others wanting that mutex, ensuring that it + * is the only task able to also acquire callback_mutex and be able to + * modify cpusets. It can perform various checks on the cpuset structure + * first, knowing nothing will change. It can also allocate memory while + * just holding cpuset_mutex. While it is performing these checks, various + * callback routines can briefly acquire callback_mutex to query cpusets. + * Once it is ready to make the changes, it takes callback_mutex, blocking + * everyone else. * * Calls to the kernel memory allocator can not be made while holding * callback_mutex, as that would risk double tripping on callback_mutex @@ -232,6 +261,7 @@ static struct cpuset top_cpuset = { * guidelines for accessing subsystem state in kernel/cgroup.c */ +static DEFINE_MUTEX(cpuset_mutex); static DEFINE_MUTEX(callback_mutex); /* @@ -246,6 +276,17 @@ static char cpuset_nodelist[CPUSET_NODELIST_LEN]; static DEFINE_SPINLOCK(cpuset_buffer_lock); /* + * CPU / memory hotplug is handled asynchronously. + */ +static struct workqueue_struct *cpuset_propagate_hotplug_wq; + +static void cpuset_hotplug_workfn(struct work_struct *work); +static void cpuset_propagate_hotplug_workfn(struct work_struct *work); +static void schedule_cpuset_propagate_hotplug(struct cpuset *cs); + +static DECLARE_WORK(cpuset_hotplug_work, cpuset_hotplug_workfn); + +/* * This is ugly, but preserves the userspace API for existing cpuset * users. If someone tries to mount the "cpuset" filesystem, we * silently switch it to mount "cgroup" instead @@ -289,7 +330,7 @@ static void guarantee_online_cpus(const struct cpuset *cs, struct cpumask *pmask) { while (cs && !cpumask_intersects(cs->cpus_allowed, cpu_online_mask)) - cs = cs->parent; + cs = parent_cs(cs); if (cs) cpumask_and(pmask, cs->cpus_allowed, cpu_online_mask); else @@ -314,7 +355,7 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask) { while (cs && !nodes_intersects(cs->mems_allowed, node_states[N_MEMORY])) - cs = cs->parent; + cs = parent_cs(cs); if (cs) nodes_and(*pmask, cs->mems_allowed, node_states[N_MEMORY]); @@ -326,7 +367,7 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask) /* * update task's spread flag if cpuset's page/slab spread flag is set * - * Called with callback_mutex/cgroup_mutex held + * Called with callback_mutex/cpuset_mutex held */ static void cpuset_update_task_spread_flag(struct cpuset *cs, struct task_struct *tsk) @@ -346,7 +387,7 @@ static void cpuset_update_task_spread_flag(struct cpuset *cs, * * One cpuset is a subset of another if all its allowed CPUs and * Memory Nodes are a subset of the other, and its exclusive flags - * are only set if the other's are set. Call holding cgroup_mutex. + * are only set if the other's are set. Call holding cpuset_mutex. 
*/ static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q) @@ -395,7 +436,7 @@ static void free_trial_cpuset(struct cpuset *trial) * If we replaced the flag and mask values of the current cpuset * (cur) with those values in the trial cpuset (trial), would * our various subset and exclusive rules still be valid? Presumes - * cgroup_mutex held. + * cpuset_mutex held. * * 'cur' is the address of an actual, in-use cpuset. Operations * such as list traversal that depend on the actual address of the @@ -412,48 +453,58 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial) { struct cgroup *cont; struct cpuset *c, *par; + int ret; + + rcu_read_lock(); /* Each of our child cpusets must be a subset of us */ - list_for_each_entry(cont, &cur->css.cgroup->children, sibling) { - if (!is_cpuset_subset(cgroup_cs(cont), trial)) - return -EBUSY; - } + ret = -EBUSY; + cpuset_for_each_child(c, cont, cur) + if (!is_cpuset_subset(c, trial)) + goto out; /* Remaining checks don't apply to root cpuset */ + ret = 0; if (cur == &top_cpuset) - return 0; + goto out; - par = cur->parent; + par = parent_cs(cur); /* We must be a subset of our parent cpuset */ + ret = -EACCES; if (!is_cpuset_subset(trial, par)) - return -EACCES; + goto out; /* * If either I or some sibling (!= me) is exclusive, we can't * overlap */ - list_for_each_entry(cont, &par->css.cgroup->children, sibling) { - c = cgroup_cs(cont); + ret = -EINVAL; + cpuset_for_each_child(c, cont, par) { if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) && c != cur && cpumask_intersects(trial->cpus_allowed, c->cpus_allowed)) - return -EINVAL; + goto out; if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) && c != cur && nodes_intersects(trial->mems_allowed, c->mems_allowed)) - return -EINVAL; + goto out; } - /* Cpusets with tasks can't have empty cpus_allowed or mems_allowed */ - if (cgroup_task_count(cur->css.cgroup)) { - if (cpumask_empty(trial->cpus_allowed) || - nodes_empty(trial->mems_allowed)) { - return -ENOSPC; - } - } + /* + * Cpusets with tasks - existing or newly being attached - can't + * have empty cpus_allowed or mems_allowed. 
+ */ + ret = -ENOSPC; + if ((cgroup_task_count(cur->css.cgroup) || cur->attach_in_progress) && + (cpumask_empty(trial->cpus_allowed) || + nodes_empty(trial->mems_allowed))) + goto out; - return 0; + ret = 0; +out: + rcu_read_unlock(); + return ret; } #ifdef CONFIG_SMP @@ -474,31 +525,24 @@ update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c) return; } -static void -update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c) +static void update_domain_attr_tree(struct sched_domain_attr *dattr, + struct cpuset *root_cs) { - LIST_HEAD(q); - - list_add(&c->stack_list, &q); - while (!list_empty(&q)) { - struct cpuset *cp; - struct cgroup *cont; - struct cpuset *child; - - cp = list_first_entry(&q, struct cpuset, stack_list); - list_del(q.next); + struct cpuset *cp; + struct cgroup *pos_cgrp; - if (cpumask_empty(cp->cpus_allowed)) + rcu_read_lock(); + cpuset_for_each_descendant_pre(cp, pos_cgrp, root_cs) { + /* skip the whole subtree if @cp doesn't have any CPU */ + if (cpumask_empty(cp->cpus_allowed)) { + pos_cgrp = cgroup_rightmost_descendant(pos_cgrp); continue; + } if (is_sched_load_balance(cp)) update_domain_attr(dattr, cp); - - list_for_each_entry(cont, &cp->css.cgroup->children, sibling) { - child = cgroup_cs(cont); - list_add_tail(&child->stack_list, &q); - } } + rcu_read_unlock(); } /* @@ -520,7 +564,7 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c) * domains when operating in the severe memory shortage situations * that could cause allocation failures below. * - * Must be called with cgroup_lock held. + * Must be called with cpuset_mutex held. * * The three key local variables below are: * q - a linked-list queue of cpuset pointers, used to implement a @@ -558,7 +602,6 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c) static int generate_sched_domains(cpumask_var_t **domains, struct sched_domain_attr **attributes) { - LIST_HEAD(q); /* queue of cpusets to be scanned */ struct cpuset *cp; /* scans q */ struct cpuset **csa; /* array of all cpuset ptrs */ int csn; /* how many cpuset ptrs in csa so far */ @@ -567,6 +610,7 @@ static int generate_sched_domains(cpumask_var_t **domains, struct sched_domain_attr *dattr; /* attributes for custom domains */ int ndoms = 0; /* number of sched domains in result */ int nslot; /* next empty doms[] struct cpumask slot */ + struct cgroup *pos_cgrp; doms = NULL; dattr = NULL; @@ -594,33 +638,27 @@ static int generate_sched_domains(cpumask_var_t **domains, goto done; csn = 0; - list_add(&top_cpuset.stack_list, &q); - while (!list_empty(&q)) { - struct cgroup *cont; - struct cpuset *child; /* scans child cpusets of cp */ - - cp = list_first_entry(&q, struct cpuset, stack_list); - list_del(q.next); - - if (cpumask_empty(cp->cpus_allowed)) - continue; - + rcu_read_lock(); + cpuset_for_each_descendant_pre(cp, pos_cgrp, &top_cpuset) { /* - * All child cpusets contain a subset of the parent's cpus, so - * just skip them, and then we call update_domain_attr_tree() - * to calc relax_domain_level of the corresponding sched - * domain. + * Continue traversing beyond @cp iff @cp has some CPUs and + * isn't load balancing. The former is obvious. The + * latter: All child cpusets contain a subset of the + * parent's cpus, so just skip them, and then we call + * update_domain_attr_tree() to calc relax_domain_level of + * the corresponding sched domain. 
*/ - if (is_sched_load_balance(cp)) { - csa[csn++] = cp; + if (!cpumask_empty(cp->cpus_allowed) && + !is_sched_load_balance(cp)) continue; - } - list_for_each_entry(cont, &cp->css.cgroup->children, sibling) { - child = cgroup_cs(cont); - list_add_tail(&child->stack_list, &q); - } - } + if (is_sched_load_balance(cp)) + csa[csn++] = cp; + + /* skip @cp's subtree */ + pos_cgrp = cgroup_rightmost_descendant(pos_cgrp); + } + rcu_read_unlock(); for (i = 0; i < csn; i++) csa[i]->pn = i; @@ -725,25 +763,25 @@ done: /* * Rebuild scheduler domains. * - * Call with neither cgroup_mutex held nor within get_online_cpus(). - * Takes both cgroup_mutex and get_online_cpus(). + * If the flag 'sched_load_balance' of any cpuset with non-empty + * 'cpus' changes, or if the 'cpus' allowed changes in any cpuset + * which has that flag enabled, or if any cpuset with a non-empty + * 'cpus' is removed, then call this routine to rebuild the + * scheduler's dynamic sched domains. * - * Cannot be directly called from cpuset code handling changes - * to the cpuset pseudo-filesystem, because it cannot be called - * from code that already holds cgroup_mutex. + * Call with cpuset_mutex held. Takes get_online_cpus(). */ -static void do_rebuild_sched_domains(struct work_struct *unused) +static void rebuild_sched_domains_locked(void) { struct sched_domain_attr *attr; cpumask_var_t *doms; int ndoms; + lockdep_assert_held(&cpuset_mutex); get_online_cpus(); /* Generate domain masks and attrs */ - cgroup_lock(); ndoms = generate_sched_domains(&doms, &attr); - cgroup_unlock(); /* Have scheduler rebuild the domains */ partition_sched_domains(ndoms, doms, attr); @@ -751,7 +789,7 @@ static void do_rebuild_sched_domains(struct work_struct *unused) put_online_cpus(); } #else /* !CONFIG_SMP */ -static void do_rebuild_sched_domains(struct work_struct *unused) +static void rebuild_sched_domains_locked(void) { } @@ -763,44 +801,11 @@ static int generate_sched_domains(cpumask_var_t **domains, } #endif /* CONFIG_SMP */ -static DECLARE_WORK(rebuild_sched_domains_work, do_rebuild_sched_domains); - -/* - * Rebuild scheduler domains, asynchronously via workqueue. - * - * If the flag 'sched_load_balance' of any cpuset with non-empty - * 'cpus' changes, or if the 'cpus' allowed changes in any cpuset - * which has that flag enabled, or if any cpuset with a non-empty - * 'cpus' is removed, then call this routine to rebuild the - * scheduler's dynamic sched domains. - * - * The rebuild_sched_domains() and partition_sched_domains() - * routines must nest cgroup_lock() inside get_online_cpus(), - * but such cpuset changes as these must nest that locking the - * other way, holding cgroup_lock() for much of the code. - * - * So in order to avoid an ABBA deadlock, the cpuset code handling - * these user changes delegates the actual sched domain rebuilding - * to a separate workqueue thread, which ends up processing the - * above do_rebuild_sched_domains() function. - */ -static void async_rebuild_sched_domains(void) -{ - queue_work(cpuset_wq, &rebuild_sched_domains_work); -} - -/* - * Accomplishes the same scheduler domain rebuild as the above - * async_rebuild_sched_domains(), however it directly calls the - * rebuild routine synchronously rather than calling it via an - * asynchronous work thread. - * - * This can only be called from code that is not holding - * cgroup_mutex (not nested in a cgroup_lock() call.) 
- */ void rebuild_sched_domains(void) { - do_rebuild_sched_domains(NULL); + mutex_lock(&cpuset_mutex); + rebuild_sched_domains_locked(); + mutex_unlock(&cpuset_mutex); } /** @@ -808,7 +813,7 @@ void rebuild_sched_domains(void) * @tsk: task to test * @scan: struct cgroup_scanner contained in its struct cpuset_hotplug_scanner * - * Call with cgroup_mutex held. May take callback_mutex during call. + * Call with cpuset_mutex held. May take callback_mutex during call. * Called for each task in a cgroup by cgroup_scan_tasks(). * Return nonzero if this tasks's cpus_allowed mask should be changed (in other * words, if its mask is not equal to its cpuset's mask). @@ -829,7 +834,7 @@ static int cpuset_test_cpumask(struct task_struct *tsk, * cpus_allowed mask needs to be changed. * * We don't need to re-check for the cgroup/cpuset membership, since we're - * holding cgroup_lock() at this point. + * holding cpuset_mutex at this point. */ static void cpuset_change_cpumask(struct task_struct *tsk, struct cgroup_scanner *scan) @@ -842,7 +847,7 @@ static void cpuset_change_cpumask(struct task_struct *tsk, * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks() * - * Called with cgroup_mutex held + * Called with cpuset_mutex held * * The cgroup_scan_tasks() function will scan all the tasks in a cgroup, * calling callback functions for each. @@ -920,7 +925,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, heap_free(&heap); if (is_load_balanced) - async_rebuild_sched_domains(); + rebuild_sched_domains_locked(); return 0; } @@ -932,7 +937,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, * Temporarilly set tasks mems_allowed to target nodes of migration, * so that the migration code can allocate pages on these nodes. * - * Call holding cgroup_mutex, so current's cpuset won't change + * Call holding cpuset_mutex, so current's cpuset won't change * during this call, as manage_mutex holds off any cpuset_attach() * calls. Therefore we don't need to take task_lock around the * call to guarantee_online_mems(), as we know no one is changing @@ -1007,7 +1012,7 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk, /* * Update task's mems_allowed and rebind its mempolicy and vmas' mempolicy * of it to cpuset's new mems_allowed, and migrate pages to new nodes if - * memory_migrate flag is set. Called with cgroup_mutex held. + * memory_migrate flag is set. Called with cpuset_mutex held. */ static void cpuset_change_nodemask(struct task_struct *p, struct cgroup_scanner *scan) @@ -1016,7 +1021,7 @@ static void cpuset_change_nodemask(struct task_struct *p, struct cpuset *cs; int migrate; const nodemask_t *oldmem = scan->data; - static nodemask_t newmems; /* protected by cgroup_mutex */ + static nodemask_t newmems; /* protected by cpuset_mutex */ cs = cgroup_cs(scan->cg); guarantee_online_mems(cs, &newmems); @@ -1043,7 +1048,7 @@ static void *cpuset_being_rebound; * @oldmem: old mems_allowed of cpuset cs * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks() * - * Called with cgroup_mutex held + * Called with cpuset_mutex held * No return value. It's guaranteed that cgroup_scan_tasks() always returns 0 * if @heap != NULL. */ @@ -1065,7 +1070,7 @@ static void update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem, * take while holding tasklist_lock. 
Forks can happen - the * mpol_dup() cpuset_being_rebound check will catch such forks, * and rebind their vma mempolicies too. Because we still hold - * the global cgroup_mutex, we know that no other rebind effort + * the global cpuset_mutex, we know that no other rebind effort * will be contending for the global variable cpuset_being_rebound. * It's ok if we rebind the same mm twice; mpol_rebind_mm() * is idempotent. Also migrate pages in each mm to new nodes. @@ -1084,7 +1089,7 @@ static void update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem, * mempolicies and if the cpuset is marked 'memory_migrate', * migrate the tasks pages to the new memory. * - * Call with cgroup_mutex held. May take callback_mutex during call. + * Call with cpuset_mutex held. May take callback_mutex during call. * Will take tasklist_lock, scan tasklist for tasks in cpuset cs, * lock each such tasks mm->mmap_sem, scan its vma's and rebind * their mempolicies to the cpusets new mems_allowed. @@ -1168,7 +1173,7 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val) cs->relax_domain_level = val; if (!cpumask_empty(cs->cpus_allowed) && is_sched_load_balance(cs)) - async_rebuild_sched_domains(); + rebuild_sched_domains_locked(); } return 0; @@ -1182,7 +1187,7 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val) * Called by cgroup_scan_tasks() for each task in a cgroup. * * We don't need to re-check for the cgroup/cpuset membership, since we're - * holding cgroup_lock() at this point. + * holding cpuset_mutex at this point. */ static void cpuset_change_flag(struct task_struct *tsk, struct cgroup_scanner *scan) @@ -1195,7 +1200,7 @@ static void cpuset_change_flag(struct task_struct *tsk, * @cs: the cpuset in which each task's spread flags needs to be changed * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks() * - * Called with cgroup_mutex held + * Called with cpuset_mutex held * * The cgroup_scan_tasks() function will scan all the tasks in a cgroup, * calling callback functions for each. @@ -1220,7 +1225,7 @@ static void update_tasks_flags(struct cpuset *cs, struct ptr_heap *heap) * cs: the cpuset to update * turning_on: whether the flag is being set or cleared * - * Call with cgroup_mutex held. + * Call with cpuset_mutex held. */ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, @@ -1260,7 +1265,7 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, mutex_unlock(&callback_mutex); if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed) - async_rebuild_sched_domains(); + rebuild_sched_domains_locked(); if (spread_flag_changed) update_tasks_flags(cs, &heap); @@ -1368,24 +1373,18 @@ static int fmeter_getrate(struct fmeter *fmp) return val; } -/* - * Protected by cgroup_lock. The nodemasks must be stored globally because - * dynamically allocating them is not allowed in can_attach, and they must - * persist until attach. 
- */ -static cpumask_var_t cpus_attach; -static nodemask_t cpuset_attach_nodemask_from; -static nodemask_t cpuset_attach_nodemask_to; - -/* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */ +/* Called by cgroups to determine if a cpuset is usable; cpuset_mutex held */ static int cpuset_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) { struct cpuset *cs = cgroup_cs(cgrp); struct task_struct *task; int ret; + mutex_lock(&cpuset_mutex); + + ret = -ENOSPC; if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)) - return -ENOSPC; + goto out_unlock; cgroup_taskset_for_each(task, cgrp, tset) { /* @@ -1397,25 +1396,45 @@ static int cpuset_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) * set_cpus_allowed_ptr() on all attached tasks before * cpus_allowed may be changed. */ + ret = -EINVAL; if (task->flags & PF_THREAD_BOUND) - return -EINVAL; - if ((ret = security_task_setscheduler(task))) - return ret; + goto out_unlock; + ret = security_task_setscheduler(task); + if (ret) + goto out_unlock; } - /* prepare for attach */ - if (cs == &top_cpuset) - cpumask_copy(cpus_attach, cpu_possible_mask); - else - guarantee_online_cpus(cs, cpus_attach); - - guarantee_online_mems(cs, &cpuset_attach_nodemask_to); + /* + * Mark attach is in progress. This makes validate_change() fail + * changes which zero cpus/mems_allowed. + */ + cs->attach_in_progress++; + ret = 0; +out_unlock: + mutex_unlock(&cpuset_mutex); + return ret; +} - return 0; +static void cpuset_cancel_attach(struct cgroup *cgrp, + struct cgroup_taskset *tset) +{ + mutex_lock(&cpuset_mutex); + cgroup_cs(cgrp)->attach_in_progress--; + mutex_unlock(&cpuset_mutex); } +/* + * Protected by cpuset_mutex. cpus_attach is used only by cpuset_attach() + * but we can't allocate it dynamically there. Define it global and + * allocate from cpuset_init(). + */ +static cpumask_var_t cpus_attach; + static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) { + /* static bufs protected by cpuset_mutex */ + static nodemask_t cpuset_attach_nodemask_from; + static nodemask_t cpuset_attach_nodemask_to; struct mm_struct *mm; struct task_struct *task; struct task_struct *leader = cgroup_taskset_first(tset); @@ -1423,6 +1442,16 @@ static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) struct cpuset *cs = cgroup_cs(cgrp); struct cpuset *oldcs = cgroup_cs(oldcgrp); + mutex_lock(&cpuset_mutex); + + /* prepare for attach */ + if (cs == &top_cpuset) + cpumask_copy(cpus_attach, cpu_possible_mask); + else + guarantee_online_cpus(cs, cpus_attach); + + guarantee_online_mems(cs, &cpuset_attach_nodemask_to); + cgroup_taskset_for_each(task, cgrp, tset) { /* * can_attach beforehand should guarantee that this doesn't @@ -1448,6 +1477,18 @@ static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) &cpuset_attach_nodemask_to); mmput(mm); } + + cs->attach_in_progress--; + + /* + * We may have raced with CPU/memory hotunplug. Trigger hotplug + * propagation if @cs doesn't have any CPU or memory. It will move + * the newly added tasks to the nearest parent which can execute. 
+ */ + if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)) + schedule_cpuset_propagate_hotplug(cs); + + mutex_unlock(&cpuset_mutex); } /* The various types of files and directories in a cpuset file system */ @@ -1469,12 +1510,13 @@ typedef enum { static int cpuset_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val) { - int retval = 0; struct cpuset *cs = cgroup_cs(cgrp); cpuset_filetype_t type = cft->private; + int retval = -ENODEV; - if (!cgroup_lock_live_group(cgrp)) - return -ENODEV; + mutex_lock(&cpuset_mutex); + if (!is_cpuset_online(cs)) + goto out_unlock; switch (type) { case FILE_CPU_EXCLUSIVE: @@ -1508,18 +1550,20 @@ static int cpuset_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val) retval = -EINVAL; break; } - cgroup_unlock(); +out_unlock: + mutex_unlock(&cpuset_mutex); return retval; } static int cpuset_write_s64(struct cgroup *cgrp, struct cftype *cft, s64 val) { - int retval = 0; struct cpuset *cs = cgroup_cs(cgrp); cpuset_filetype_t type = cft->private; + int retval = -ENODEV; - if (!cgroup_lock_live_group(cgrp)) - return -ENODEV; + mutex_lock(&cpuset_mutex); + if (!is_cpuset_online(cs)) + goto out_unlock; switch (type) { case FILE_SCHED_RELAX_DOMAIN_LEVEL: @@ -1529,7 +1573,8 @@ static int cpuset_write_s64(struct cgroup *cgrp, struct cftype *cft, s64 val) retval = -EINVAL; break; } - cgroup_unlock(); +out_unlock: + mutex_unlock(&cpuset_mutex); return retval; } @@ -1539,17 +1584,36 @@ static int cpuset_write_s64(struct cgroup *cgrp, struct cftype *cft, s64 val) static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft, const char *buf) { - int retval = 0; struct cpuset *cs = cgroup_cs(cgrp); struct cpuset *trialcs; + int retval = -ENODEV; + + /* + * CPU or memory hotunplug may leave @cs w/o any execution + * resources, in which case the hotplug code asynchronously updates + * configuration and transfers all tasks to the nearest ancestor + * which can execute. + * + * As writes to "cpus" or "mems" may restore @cs's execution + * resources, wait for the previously scheduled operations before + * proceeding, so that we don't end up keep removing tasks added + * after execution capability is restored. + * + * Flushing cpuset_hotplug_work is enough to synchronize against + * hotplug hanlding; however, cpuset_attach() may schedule + * propagation work directly. Flush the workqueue too. 
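cpuset_can_attach(), cpuset_cancel_attach() and cpuset_attach() above form a simple "operation in flight" protocol: can_attach bumps attach_in_progress under cpuset_mutex so validate_change() refuses to empty the masks mid-attach, cancel_attach undoes the bump if another subsystem's can_attach fails, and attach drops it once the tasks are in place. A minimal sketch of the pattern (struct obj and the helpers are hypothetical; all of them are assumed to run under one mutex, as the cpuset code does with cpuset_mutex):

/* Sketch only; locking and error paths elided. */
struct obj {
	int attach_in_progress;
};

static int obj_can_attach(struct obj *o)	/* stage 1: may fail */
{
	o->attach_in_progress++;	/* publish: an attach is pending */
	return 0;
}

static void obj_cancel_attach(struct obj *o)	/* another stage failed */
{
	o->attach_in_progress--;
}

static void obj_attach(struct obj *o)		/* stage 2: must not fail */
{
	/* ... perform the attachment ... */
	o->attach_in_progress--;
}

static int obj_may_empty(struct obj *o)
{
	/* configuration changes must respect pending attaches */
	return o->attach_in_progress == 0;
}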
+ */ + flush_work(&cpuset_hotplug_work); + flush_workqueue(cpuset_propagate_hotplug_wq); - if (!cgroup_lock_live_group(cgrp)) - return -ENODEV; + mutex_lock(&cpuset_mutex); + if (!is_cpuset_online(cs)) + goto out_unlock; trialcs = alloc_trial_cpuset(cs); if (!trialcs) { retval = -ENOMEM; - goto out; + goto out_unlock; } switch (cft->private) { @@ -1565,8 +1629,8 @@ static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft, } free_trial_cpuset(trialcs); -out: - cgroup_unlock(); +out_unlock: + mutex_unlock(&cpuset_mutex); return retval; } @@ -1790,15 +1854,12 @@ static struct cftype files[] = { static struct cgroup_subsys_state *cpuset_css_alloc(struct cgroup *cont) { - struct cgroup *parent_cg = cont->parent; - struct cgroup *tmp_cg; - struct cpuset *parent, *cs; + struct cpuset *cs; - if (!parent_cg) + if (!cont->parent) return &top_cpuset.css; - parent = cgroup_cs(parent_cg); - cs = kmalloc(sizeof(*cs), GFP_KERNEL); + cs = kzalloc(sizeof(*cs), GFP_KERNEL); if (!cs) return ERR_PTR(-ENOMEM); if (!alloc_cpumask_var(&cs->cpus_allowed, GFP_KERNEL)) { @@ -1806,22 +1867,38 @@ static struct cgroup_subsys_state *cpuset_css_alloc(struct cgroup *cont) return ERR_PTR(-ENOMEM); } - cs->flags = 0; - if (is_spread_page(parent)) - set_bit(CS_SPREAD_PAGE, &cs->flags); - if (is_spread_slab(parent)) - set_bit(CS_SPREAD_SLAB, &cs->flags); set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); cpumask_clear(cs->cpus_allowed); nodes_clear(cs->mems_allowed); fmeter_init(&cs->fmeter); + INIT_WORK(&cs->hotplug_work, cpuset_propagate_hotplug_workfn); cs->relax_domain_level = -1; - cs->parent = parent; + return &cs->css; +} + +static int cpuset_css_online(struct cgroup *cgrp) +{ + struct cpuset *cs = cgroup_cs(cgrp); + struct cpuset *parent = parent_cs(cs); + struct cpuset *tmp_cs; + struct cgroup *pos_cg; + + if (!parent) + return 0; + + mutex_lock(&cpuset_mutex); + + set_bit(CS_ONLINE, &cs->flags); + if (is_spread_page(parent)) + set_bit(CS_SPREAD_PAGE, &cs->flags); + if (is_spread_slab(parent)) + set_bit(CS_SPREAD_SLAB, &cs->flags); + number_of_cpusets++; - if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &cont->flags)) - goto skip_clone; + if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags)) + goto out_unlock; /* * Clone @parent's configuration if CGRP_CPUSET_CLONE_CHILDREN is @@ -1836,35 +1913,49 @@ static struct cgroup_subsys_state *cpuset_css_alloc(struct cgroup *cont) * changed to grant parent->cpus_allowed-sibling_cpus_exclusive * (and likewise for mems) to the new cgroup. 
*/ - list_for_each_entry(tmp_cg, &parent_cg->children, sibling) { - struct cpuset *tmp_cs = cgroup_cs(tmp_cg); - - if (is_mem_exclusive(tmp_cs) || is_cpu_exclusive(tmp_cs)) - goto skip_clone; + rcu_read_lock(); + cpuset_for_each_child(tmp_cs, pos_cg, parent) { + if (is_mem_exclusive(tmp_cs) || is_cpu_exclusive(tmp_cs)) { + rcu_read_unlock(); + goto out_unlock; + } } + rcu_read_unlock(); mutex_lock(&callback_mutex); cs->mems_allowed = parent->mems_allowed; cpumask_copy(cs->cpus_allowed, parent->cpus_allowed); mutex_unlock(&callback_mutex); -skip_clone: - return &cs->css; +out_unlock: + mutex_unlock(&cpuset_mutex); + return 0; +} + +static void cpuset_css_offline(struct cgroup *cgrp) +{ + struct cpuset *cs = cgroup_cs(cgrp); + + mutex_lock(&cpuset_mutex); + + if (is_sched_load_balance(cs)) + update_flag(CS_SCHED_LOAD_BALANCE, cs, 0); + + number_of_cpusets--; + clear_bit(CS_ONLINE, &cs->flags); + + mutex_unlock(&cpuset_mutex); } /* * If the cpuset being removed has its flag 'sched_load_balance' * enabled, then simulate turning sched_load_balance off, which - * will call async_rebuild_sched_domains(). + * will call rebuild_sched_domains_locked(). */ static void cpuset_css_free(struct cgroup *cont) { struct cpuset *cs = cgroup_cs(cont); - if (is_sched_load_balance(cs)) - update_flag(CS_SCHED_LOAD_BALANCE, cs, 0); - - number_of_cpusets--; free_cpumask_var(cs->cpus_allowed); kfree(cs); } @@ -1872,8 +1963,11 @@ static void cpuset_css_free(struct cgroup *cont) struct cgroup_subsys cpuset_subsys = { .name = "cpuset", .css_alloc = cpuset_css_alloc, + .css_online = cpuset_css_online, + .css_offline = cpuset_css_offline, .css_free = cpuset_css_free, .can_attach = cpuset_can_attach, + .cancel_attach = cpuset_cancel_attach, .attach = cpuset_attach, .subsys_id = cpuset_subsys_id, .base_cftypes = files, @@ -1924,7 +2018,9 @@ static void cpuset_do_move_task(struct task_struct *tsk, { struct cgroup *new_cgroup = scan->data; + cgroup_lock(); cgroup_attach_task(new_cgroup, tsk); + cgroup_unlock(); } /** @@ -1932,7 +2028,7 @@ static void cpuset_do_move_task(struct task_struct *tsk, * @from: cpuset in which the tasks currently reside * @to: cpuset to which the tasks will be moved * - * Called with cgroup_mutex held + * Called with cpuset_mutex held * callback_mutex must not be held, as cpuset_attach() will take it. * * The cgroup_scan_tasks() function will scan all the tasks in a cgroup, @@ -1959,169 +2055,200 @@ static void move_member_tasks_to_cpuset(struct cpuset *from, struct cpuset *to) * removing that CPU or node from all cpusets. If this removes the * last CPU or node from a cpuset, then move the tasks in the empty * cpuset to its next-highest non-empty parent. - * - * Called with cgroup_mutex held - * callback_mutex must not be held, as cpuset_attach() will take it. */ static void remove_tasks_in_empty_cpuset(struct cpuset *cs) { struct cpuset *parent; /* - * The cgroup's css_sets list is in use if there are tasks - * in the cpuset; the list is empty if there are none; - * the cs->css.refcnt seems always 0. - */ - if (list_empty(&cs->css.cgroup->css_sets)) - return; - - /* * Find its next-highest non-empty parent, (top cpuset * has online cpus, so can't be empty). */ - parent = cs->parent; + parent = parent_cs(cs); while (cpumask_empty(parent->cpus_allowed) || nodes_empty(parent->mems_allowed)) - parent = parent->parent; + parent = parent_cs(parent); move_member_tasks_to_cpuset(cs, parent); } -/* - * Helper function to traverse cpusets. 
- * It can be used to walk the cpuset tree from top to bottom, completing - * one layer before dropping down to the next (thus always processing a - * node before any of its children). +/** + * cpuset_propagate_hotplug_workfn - propagate CPU/memory hotplug to a cpuset + * @cs: cpuset in interest + * + * Compare @cs's cpu and mem masks against top_cpuset and if some have gone + * offline, update @cs accordingly. If @cs ends up with no CPU or memory, + * all its tasks are moved to the nearest ancestor with both resources. */ -static struct cpuset *cpuset_next(struct list_head *queue) +static void cpuset_propagate_hotplug_workfn(struct work_struct *work) { - struct cpuset *cp; - struct cpuset *child; /* scans child cpusets of cp */ - struct cgroup *cont; + static cpumask_t off_cpus; + static nodemask_t off_mems, tmp_mems; + struct cpuset *cs = container_of(work, struct cpuset, hotplug_work); + bool is_empty; - if (list_empty(queue)) - return NULL; + mutex_lock(&cpuset_mutex); + + cpumask_andnot(&off_cpus, cs->cpus_allowed, top_cpuset.cpus_allowed); + nodes_andnot(off_mems, cs->mems_allowed, top_cpuset.mems_allowed); - cp = list_first_entry(queue, struct cpuset, stack_list); - list_del(queue->next); - list_for_each_entry(cont, &cp->css.cgroup->children, sibling) { - child = cgroup_cs(cont); - list_add_tail(&child->stack_list, queue); + /* remove offline cpus from @cs */ + if (!cpumask_empty(&off_cpus)) { + mutex_lock(&callback_mutex); + cpumask_andnot(cs->cpus_allowed, cs->cpus_allowed, &off_cpus); + mutex_unlock(&callback_mutex); + update_tasks_cpumask(cs, NULL); + } + + /* remove offline mems from @cs */ + if (!nodes_empty(off_mems)) { + tmp_mems = cs->mems_allowed; + mutex_lock(&callback_mutex); + nodes_andnot(cs->mems_allowed, cs->mems_allowed, off_mems); + mutex_unlock(&callback_mutex); + update_tasks_nodemask(cs, &tmp_mems, NULL); } - return cp; + is_empty = cpumask_empty(cs->cpus_allowed) || + nodes_empty(cs->mems_allowed); + + mutex_unlock(&cpuset_mutex); + + /* + * If @cs became empty, move tasks to the nearest ancestor with + * execution resources. This is full cgroup operation which will + * also call back into cpuset. Should be done outside any lock. + */ + if (is_empty) + remove_tasks_in_empty_cpuset(cs); + + /* the following may free @cs, should be the last operation */ + css_put(&cs->css); } +/** + * schedule_cpuset_propagate_hotplug - schedule hotplug propagation to a cpuset + * @cs: cpuset of interest + * + * Schedule cpuset_propagate_hotplug_workfn() which will update CPU and + * memory masks according to top_cpuset. + */ +static void schedule_cpuset_propagate_hotplug(struct cpuset *cs) +{ + /* + * Pin @cs. The refcnt will be released when the work item + * finishes executing. + */ + if (!css_tryget(&cs->css)) + return; -/* - * Walk the specified cpuset subtree upon a hotplug operation (CPU/Memory - * online/offline) and update the cpusets accordingly. - * For regular CPU/Mem hotplug, look for empty cpusets; the tasks of such - * cpuset must be moved to a parent cpuset. + /* + * Queue @cs->hotplug_work. If already pending, lose the css ref. + * cpuset_propagate_hotplug_wq is ordered and propagation will + * happen in the order this function is called. + */ + if (!queue_work(cpuset_propagate_hotplug_wq, &cs->hotplug_work)) + css_put(&cs->css); +} + +/** + * cpuset_hotplug_workfn - handle CPU/memory hotunplug for a cpuset * - * Called with cgroup_mutex held. We take callback_mutex to modify - * cpus_allowed and mems_allowed. 
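
schedule_cpuset_propagate_hotplug() above encodes a reference-ownership rule that is easy to miss: a successful queue_work() transfers one css reference to the work item, and cpuset_propagate_hotplug_workfn() drops it as its very last action, after which the cpuset may be freed. Condensed restatement (names from this diff; the body is abbreviated, not the full code):

static void propagate(struct cpuset *cs)
{
	if (!css_tryget(&cs->css))
		return;			/* cpuset already on its way out */

	if (!queue_work(cpuset_propagate_hotplug_wq, &cs->hotplug_work))
		css_put(&cs->css);	/* work already pending: drop our ref */

	/* on success, the ref is released by the css_put() at the end of
	 * cpuset_propagate_hotplug_workfn(), once the update is finished */
}
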
+ * This function is called after either CPU or memory configuration has + * changed and updates cpuset accordingly. The top_cpuset is always + * synchronized to cpu_active_mask and N_MEMORY, which is necessary in + * order to make cpusets transparent (of no effect) on systems that are + * actively using CPU hotplug but making no active use of cpusets. * - * This walk processes the tree from top to bottom, completing one layer - * before dropping down to the next. It always processes a node before - * any of its children. + * Non-root cpusets are only affected by offlining. If any CPUs or memory + * nodes have been taken down, cpuset_propagate_hotplug() is invoked on all + * descendants. * - * In the case of memory hot-unplug, it will remove nodes from N_MEMORY - * if all present pages from a node are offlined. + * Note that CPU offlining during suspend is ignored. We don't modify + * cpusets across suspend/resume cycles at all. */ -static void -scan_cpusets_upon_hotplug(struct cpuset *root, enum hotplug_event event) +static void cpuset_hotplug_workfn(struct work_struct *work) { - LIST_HEAD(queue); - struct cpuset *cp; /* scans cpusets being updated */ - static nodemask_t oldmems; /* protected by cgroup_mutex */ + static cpumask_t new_cpus, tmp_cpus; + static nodemask_t new_mems, tmp_mems; + bool cpus_updated, mems_updated; + bool cpus_offlined, mems_offlined; - list_add_tail((struct list_head *)&root->stack_list, &queue); + mutex_lock(&cpuset_mutex); - switch (event) { - case CPUSET_CPU_OFFLINE: - while ((cp = cpuset_next(&queue)) != NULL) { + /* fetch the available cpus/mems and find out which changed how */ + cpumask_copy(&new_cpus, cpu_active_mask); + new_mems = node_states[N_MEMORY]; - /* Continue past cpusets with all cpus online */ - if (cpumask_subset(cp->cpus_allowed, cpu_active_mask)) - continue; + cpus_updated = !cpumask_equal(top_cpuset.cpus_allowed, &new_cpus); + cpus_offlined = cpumask_andnot(&tmp_cpus, top_cpuset.cpus_allowed, + &new_cpus); - /* Remove offline cpus from this cpuset.
*/ - mutex_lock(&callback_mutex); - cpumask_and(cp->cpus_allowed, cp->cpus_allowed, - cpu_active_mask); - mutex_unlock(&callback_mutex); + mems_updated = !nodes_equal(top_cpuset.mems_allowed, new_mems); + nodes_andnot(tmp_mems, top_cpuset.mems_allowed, new_mems); + mems_offlined = !nodes_empty(tmp_mems); - /* Move tasks from the empty cpuset to a parent */ - if (cpumask_empty(cp->cpus_allowed)) - remove_tasks_in_empty_cpuset(cp); - else - update_tasks_cpumask(cp, NULL); - } - break; + /* synchronize cpus_allowed to cpu_active_mask */ + if (cpus_updated) { + mutex_lock(&callback_mutex); + cpumask_copy(top_cpuset.cpus_allowed, &new_cpus); + mutex_unlock(&callback_mutex); + /* we don't mess with cpumasks of tasks in top_cpuset */ + } - case CPUSET_MEM_OFFLINE: - while ((cp = cpuset_next(&queue)) != NULL) { + /* synchronize mems_allowed to N_MEMORY */ + if (mems_updated) { + tmp_mems = top_cpuset.mems_allowed; + mutex_lock(&callback_mutex); + top_cpuset.mems_allowed = new_mems; + mutex_unlock(&callback_mutex); + update_tasks_nodemask(&top_cpuset, &tmp_mems, NULL); + } - /* Continue past cpusets with all mems online */ - if (nodes_subset(cp->mems_allowed, - node_states[N_MEMORY])) - continue; + /* if cpus or mems went down, we need to propagate to descendants */ + if (cpus_offlined || mems_offlined) { + struct cpuset *cs; + struct cgroup *pos_cgrp; - oldmems = cp->mems_allowed; + rcu_read_lock(); + cpuset_for_each_descendant_pre(cs, pos_cgrp, &top_cpuset) + schedule_cpuset_propagate_hotplug(cs); + rcu_read_unlock(); + } - /* Remove offline mems from this cpuset. */ - mutex_lock(&callback_mutex); - nodes_and(cp->mems_allowed, cp->mems_allowed, - node_states[N_MEMORY]); - mutex_unlock(&callback_mutex); + mutex_unlock(&cpuset_mutex); - /* Move tasks from the empty cpuset to a parent */ - if (nodes_empty(cp->mems_allowed)) - remove_tasks_in_empty_cpuset(cp); - else - update_tasks_nodemask(cp, &oldmems, NULL); - } + /* wait for propagations to finish */ + flush_workqueue(cpuset_propagate_hotplug_wq); + + /* rebuild sched domains if cpus_allowed has changed */ + if (cpus_updated) { + struct sched_domain_attr *attr; + cpumask_var_t *doms; + int ndoms; + + mutex_lock(&cpuset_mutex); + ndoms = generate_sched_domains(&doms, &attr); + mutex_unlock(&cpuset_mutex); + + partition_sched_domains(ndoms, doms, attr); } } -/* - * The top_cpuset tracks what CPUs and Memory Nodes are online, - * period. This is necessary in order to make cpusets transparent - * (of no affect) on systems that are actively using CPU hotplug - * but making no active use of cpusets. - * - * The only exception to this is suspend/resume, where we don't - * modify cpusets at all. - * - * This routine ensures that top_cpuset.cpus_allowed tracks - * cpu_active_mask on each CPU hotplug (cpuhp) event. - * - * Called within get_online_cpus(). Needs to call cgroup_lock() - * before calling generate_sched_domains(). - * - * @cpu_online: Indicates whether this is a CPU online event (true) or - * a CPU offline event (false). 
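
One subtlety in the mask bookkeeping above: cpumask_andnot() returns whether the destination mask ended up non-empty, so its result doubles directly as the cpus_offlined flag, while the nodemask API returns nothing and needs the explicit !nodes_empty() test. Illustration with assumed values (not taken from the diff):

/* old allowed = {0,1,2,3}, new active = {0,1} */
cpus_offlined = cpumask_andnot(&tmp_cpus, old_allowed, new_active);
					/* tmp_cpus = {2,3}, returns true  */

nodes_andnot(tmp_mems, old_mems, new_mems);	/* returns void...          */
mems_offlined = !nodes_empty(tmp_mems);		/* ...so test explicitly    */
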
- */ void cpuset_update_active_cpus(bool cpu_online) { - struct sched_domain_attr *attr; - cpumask_var_t *doms; - int ndoms; - - cgroup_lock(); - mutex_lock(&callback_mutex); - cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask); - mutex_unlock(&callback_mutex); - - if (!cpu_online) - scan_cpusets_upon_hotplug(&top_cpuset, CPUSET_CPU_OFFLINE); - - ndoms = generate_sched_domains(&doms, &attr); - cgroup_unlock(); - - /* Have scheduler rebuild the domains */ - partition_sched_domains(ndoms, doms, attr); + /* + * We're inside cpu hotplug critical region which usually nests + * inside cgroup synchronization. Bounce actual hotplug processing + * to a work item to avoid reverse locking order. + * + * We still need to do partition_sched_domains() synchronously; + * otherwise, the scheduler will get confused and put tasks to the + * dead CPU. Fall back to the default single domain. + * cpuset_hotplug_workfn() will rebuild it as necessary. + */ + partition_sched_domains(1, NULL, NULL); + schedule_work(&cpuset_hotplug_work); } #ifdef CONFIG_MEMORY_HOTPLUG @@ -2133,29 +2260,7 @@ void cpuset_update_active_cpus(bool cpu_online) static int cpuset_track_online_nodes(struct notifier_block *self, unsigned long action, void *arg) { - static nodemask_t oldmems; /* protected by cgroup_mutex */ - - cgroup_lock(); - switch (action) { - case MEM_ONLINE: - oldmems = top_cpuset.mems_allowed; - mutex_lock(&callback_mutex); - top_cpuset.mems_allowed = node_states[N_MEMORY]; - mutex_unlock(&callback_mutex); - update_tasks_nodemask(&top_cpuset, &oldmems, NULL); - break; - case MEM_OFFLINE: - /* - * needn't update top_cpuset.mems_allowed explicitly because - * scan_cpusets_upon_hotplug() will update it. - */ - scan_cpusets_upon_hotplug(&top_cpuset, CPUSET_MEM_OFFLINE); - break; - default: - break; - } - cgroup_unlock(); - + schedule_work(&cpuset_hotplug_work); return NOTIFY_OK; } #endif @@ -2173,8 +2278,9 @@ void __init cpuset_init_smp(void) hotplug_memory_notifier(cpuset_track_online_nodes, 10); - cpuset_wq = create_singlethread_workqueue("cpuset"); - BUG_ON(!cpuset_wq); + cpuset_propagate_hotplug_wq = + alloc_ordered_workqueue("cpuset_hotplug", 0); + BUG_ON(!cpuset_propagate_hotplug_wq); } /** @@ -2273,8 +2379,8 @@ int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask) */ static const struct cpuset *nearest_hardwall_ancestor(const struct cpuset *cs) { - while (!(is_mem_exclusive(cs) || is_mem_hardwall(cs)) && cs->parent) - cs = cs->parent; + while (!(is_mem_exclusive(cs) || is_mem_hardwall(cs)) && parent_cs(cs)) + cs = parent_cs(cs); return cs; } @@ -2412,17 +2518,6 @@ int __cpuset_node_allowed_hardwall(int node, gfp_t gfp_mask) } /** - * cpuset_unlock - release lock on cpuset changes - * - * Undo the lock taken in a previous cpuset_lock() call. - */ - -void cpuset_unlock(void) -{ - mutex_unlock(&callback_mutex); -} - -/** * cpuset_mem_spread_node() - On which node to begin search for a file page * cpuset_slab_spread_node() - On which node to begin search for a slab page * @@ -2511,8 +2606,16 @@ void cpuset_print_task_mems_allowed(struct task_struct *tsk) dentry = task_cs(tsk)->css.cgroup->dentry; spin_lock(&cpuset_buffer_lock); - snprintf(cpuset_name, CPUSET_NAME_LEN, - dentry ? 
(const char *)dentry->d_name.name : "/"); + + if (!dentry) { + strcpy(cpuset_name, "/"); + } else { + spin_lock(&dentry->d_lock); + strlcpy(cpuset_name, (const char *)dentry->d_name.name, + CPUSET_NAME_LEN); + spin_unlock(&dentry->d_lock); + } + nodelist_scnprintf(cpuset_nodelist, CPUSET_NODELIST_LEN, tsk->mems_allowed); printk(KERN_INFO "%s cpuset=%s mems_allowed=%s\n", @@ -2560,7 +2663,7 @@ void __cpuset_memory_pressure_bump(void) * - Used for /proc/<pid>/cpuset. * - No need to task_lock(tsk) on this tsk->cpuset reference, as it * doesn't really matter if tsk->cpuset changes after we read it, - * and we take cgroup_mutex, keeping cpuset_attach() from changing it + * and we take cpuset_mutex, keeping cpuset_attach() from changing it * anyway. */ static int proc_cpuset_show(struct seq_file *m, void *unused_v) @@ -2582,16 +2685,15 @@ static int proc_cpuset_show(struct seq_file *m, void *unused_v) if (!tsk) goto out_free; - retval = -EINVAL; - cgroup_lock(); + rcu_read_lock(); css = task_subsys_state(tsk, cpuset_subsys_id); retval = cgroup_path(css->cgroup, buf, PAGE_SIZE); + rcu_read_unlock(); if (retval < 0) - goto out_unlock; + goto out_put_task; seq_puts(m, buf); seq_putc(m, '\n'); -out_unlock: - cgroup_unlock(); +out_put_task: put_task_struct(tsk); out_free: kfree(buf); diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index 9a61738cefc8..c26278fd4851 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c @@ -29,6 +29,7 @@ */ #include <linux/pid_namespace.h> #include <linux/clocksource.h> +#include <linux/serial_core.h> #include <linux/interrupt.h> #include <linux/spinlock.h> #include <linux/console.h> diff --git a/kernel/debug/gdbstub.c b/kernel/debug/gdbstub.c index ce615e064482..38573f35a5ad 100644 --- a/kernel/debug/gdbstub.c +++ b/kernel/debug/gdbstub.c @@ -31,6 +31,7 @@ #include <linux/kernel.h> #include <linux/kgdb.h> #include <linux/kdb.h> +#include <linux/serial_core.h> #include <linux/reboot.h> #include <linux/uaccess.h> #include <asm/cacheflush.h> diff --git a/kernel/events/core.c b/kernel/events/core.c index 5c75791d7269..ccc457e36354 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3691,7 +3691,7 @@ unlock: static int perf_fasync(int fd, struct file *filp, int on) { - struct inode *inode = filp->f_path.dentry->d_inode; + struct inode *inode = file_inode(filp); struct perf_event *event = filp->private_data; int retval; diff --git a/kernel/fork.c b/kernel/fork.c index 4133876d8cd2..8f62b2a0f120 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -413,7 +413,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) tmp->vm_next = tmp->vm_prev = NULL; file = tmp->vm_file; if (file) { - struct inode *inode = file->f_path.dentry->d_inode; + struct inode *inode = file_inode(file); struct address_space *mapping = file->f_mapping; get_file(file); diff --git a/kernel/futex.c b/kernel/futex.c index 9618b6e9fb36..fbc07a29ec53 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -2472,8 +2472,6 @@ SYSCALL_DEFINE3(get_robust_list, int, pid, if (!futex_cmpxchg_enabled) return -ENOSYS; - WARN_ONCE(1, "deprecated: get_robust_list will be deleted in 2013.\n"); - rcu_read_lock(); ret = -ESRCH; diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c index 83e368b005fc..f9f44fd4d34d 100644 --- a/kernel/futex_compat.c +++ b/kernel/futex_compat.c @@ -11,6 +11,7 @@ #include <linux/nsproxy.h> #include <linux/futex.h> #include <linux/ptrace.h> +#include <linux/syscalls.h> #include <asm/uaccess.h> @@ -116,9 +117,9 @@ void 
compat_exit_robust_list(struct task_struct *curr) } } -asmlinkage long -compat_sys_set_robust_list(struct compat_robust_list_head __user *head, - compat_size_t len) +COMPAT_SYSCALL_DEFINE2(set_robust_list, + struct compat_robust_list_head __user *, head, + compat_size_t, len) { if (!futex_cmpxchg_enabled) return -ENOSYS; @@ -131,9 +132,9 @@ compat_sys_set_robust_list(struct compat_robust_list_head __user *head, return 0; } -asmlinkage long -compat_sys_get_robust_list(int pid, compat_uptr_t __user *head_ptr, - compat_size_t __user *len_ptr) +COMPAT_SYSCALL_DEFINE3(get_robust_list, int, pid, + compat_uptr_t __user *, head_ptr, + compat_size_t __user *, len_ptr) { struct compat_robust_list_head __user *head; unsigned long ret; @@ -142,8 +143,6 @@ compat_sys_get_robust_list(int pid, compat_uptr_t __user *head_ptr, if (!futex_cmpxchg_enabled) return -ENOSYS; - WARN_ONCE(1, "deprecated: get_robust_list will be deleted in 2013.\n"); - rcu_read_lock(); ret = -ESRCH; @@ -172,9 +171,9 @@ err_unlock: return ret; } -asmlinkage long compat_sys_futex(u32 __user *uaddr, int op, u32 val, - struct compat_timespec __user *utime, u32 __user *uaddr2, - u32 val3) +COMPAT_SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, + struct compat_timespec __user *, utime, u32 __user *, uaddr2, + u32, val3) { struct timespec ts; ktime_t t, *tp = NULL; diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig index a92028196cc1..d4da55d1fb65 100644 --- a/kernel/gcov/Kconfig +++ b/kernel/gcov/Kconfig @@ -35,7 +35,7 @@ config GCOV_KERNEL config GCOV_PROFILE_ALL bool "Profile entire Kernel" depends on GCOV_KERNEL - depends on SUPERH || S390 || X86 || (PPC && EXPERIMENTAL) || MICROBLAZE + depends on SUPERH || S390 || X86 || PPC || MICROBLAZE default n ---help--- This options activates profiling for the entire kernel. diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index 4bd4faa6323a..397db02209ed 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -76,7 +76,7 @@ static int irq_affinity_list_proc_show(struct seq_file *m, void *v) static ssize_t write_irq_affinity(int type, struct file *file, const char __user *buffer, size_t count, loff_t *pos) { - unsigned int irq = (int)(long)PDE(file->f_path.dentry->d_inode)->data; + unsigned int irq = (int)(long)PDE(file_inode(file))->data; cpumask_var_t new_value; int err; diff --git a/kernel/kexec.c b/kernel/kexec.c index 5e4bd7864c5d..2436ffcec91f 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -54,6 +54,12 @@ struct resource crashk_res = { .end = 0, .flags = IORESOURCE_BUSY | IORESOURCE_MEM }; +struct resource crashk_low_res = { + .name = "Crash kernel low", + .start = 0, + .end = 0, + .flags = IORESOURCE_BUSY | IORESOURCE_MEM +}; int kexec_should_crash(struct task_struct *p) { @@ -1369,10 +1375,11 @@ static int __init parse_crashkernel_simple(char *cmdline, * That function is the entry point for command line parsing and should be * called from the arch-specific code. 
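
The refactor below folds the "crashkernel=" and new "crashkernel_low=" parsers into a single __parse_crashkernel() that takes the option name, keeping the strstr() loop that honors only the last occurrence on the command line. The same idiom in isolation, as a hypothetical standalone helper (not kernel code):

#include <string.h>

/* Return the value part of the LAST "name" option in cmdline, or NULL.
 * find_last_option("root=/dev/sda crashkernel=64M crashkernel=128M",
 *                  "crashkernel=") yields "128M", matching the
 * "use the last one if there are more" rule below. */
static char *find_last_option(char *cmdline, const char *name)
{
	char *p = cmdline, *hit = NULL;

	while ((p = strstr(p, name)) != NULL) {
		hit = p;
		p++;		/* keep scanning past this match */
	}
	return hit ? hit + strlen(name) : NULL;
}
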
*/ -int __init parse_crashkernel(char *cmdline, +static int __init __parse_crashkernel(char *cmdline, unsigned long long system_ram, unsigned long long *crash_size, - unsigned long long *crash_base) + unsigned long long *crash_base, + const char *name) { char *p = cmdline, *ck_cmdline = NULL; char *first_colon, *first_space; @@ -1382,16 +1389,16 @@ int __init parse_crashkernel(char *cmdline, *crash_base = 0; /* find crashkernel and use the last one if there are more */ - p = strstr(p, "crashkernel="); + p = strstr(p, name); while (p) { ck_cmdline = p; - p = strstr(p+1, "crashkernel="); + p = strstr(p+1, name); } if (!ck_cmdline) return -EINVAL; - ck_cmdline += 12; /* strlen("crashkernel=") */ + ck_cmdline += strlen(name); /* * if the commandline contains a ':', then that's the extended @@ -1409,6 +1416,23 @@ int __init parse_crashkernel(char *cmdline, return 0; } +int __init parse_crashkernel(char *cmdline, + unsigned long long system_ram, + unsigned long long *crash_size, + unsigned long long *crash_base) +{ + return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base, + "crashkernel="); +} + +int __init parse_crashkernel_low(char *cmdline, + unsigned long long system_ram, + unsigned long long *crash_size, + unsigned long long *crash_base) +{ + return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base, + "crashkernel_low="); +} static void update_vmcoreinfo_note(void) { diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 7981e5b2350d..8a0efac4f99d 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -3190,9 +3190,14 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, #endif if (unlikely(curr->lockdep_depth >= MAX_LOCK_DEPTH)) { debug_locks_off(); - printk("BUG: MAX_LOCK_DEPTH too low!\n"); + printk("BUG: MAX_LOCK_DEPTH too low, depth: %i max: %lu!\n", + curr->lockdep_depth, MAX_LOCK_DEPTH); printk("turning off the locking correctness validator.\n"); + + lockdep_print_held_locks(current); + debug_show_all_locks(); dump_stack(); + return 0; } @@ -3203,7 +3208,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, } static int -print_unlock_inbalance_bug(struct task_struct *curr, struct lockdep_map *lock, +print_unlock_imbalance_bug(struct task_struct *curr, struct lockdep_map *lock, unsigned long ip) { if (!debug_locks_off()) @@ -3246,7 +3251,7 @@ static int check_unlock(struct task_struct *curr, struct lockdep_map *lock, return 0; if (curr->lockdep_depth <= 0) - return print_unlock_inbalance_bug(curr, lock, ip); + return print_unlock_imbalance_bug(curr, lock, ip); return 1; } @@ -3317,7 +3322,7 @@ __lock_set_class(struct lockdep_map *lock, const char *name, goto found_it; prev_hlock = hlock; } - return print_unlock_inbalance_bug(curr, lock, ip); + return print_unlock_imbalance_bug(curr, lock, ip); found_it: lockdep_init_map(lock, name, key, 0); @@ -3384,7 +3389,7 @@ lock_release_non_nested(struct task_struct *curr, goto found_it; prev_hlock = hlock; } - return print_unlock_inbalance_bug(curr, lock, ip); + return print_unlock_imbalance_bug(curr, lock, ip); found_it: if (hlock->instance == lock) diff --git a/kernel/module.c b/kernel/module.c index eab08274ec9b..0925c9a71975 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -197,9 +197,10 @@ static inline int strong_try_module_get(struct module *mod) return -ENOENT; } -static inline void add_taint_module(struct module *mod, unsigned flag) +static inline void add_taint_module(struct module *mod, unsigned flag, + enum lockdep_ok lockdep_ok) { - 
add_taint(flag); + add_taint(flag, lockdep_ok); mod->taints |= (1U << flag); } @@ -727,7 +728,7 @@ static inline int try_force_unload(unsigned int flags) { int ret = (flags & O_TRUNC); if (ret) - add_taint(TAINT_FORCED_RMMOD); + add_taint(TAINT_FORCED_RMMOD, LOCKDEP_NOW_UNRELIABLE); return ret; } #else @@ -1138,7 +1139,7 @@ static int try_to_force_load(struct module *mod, const char *reason) if (!test_taint(TAINT_FORCED_MODULE)) printk(KERN_WARNING "%s: %s: kernel tainted.\n", mod->name, reason); - add_taint_module(mod, TAINT_FORCED_MODULE); + add_taint_module(mod, TAINT_FORCED_MODULE, LOCKDEP_NOW_UNRELIABLE); return 0; #else return -ENOEXEC; @@ -2147,7 +2148,8 @@ static void set_license(struct module *mod, const char *license) if (!test_taint(TAINT_PROPRIETARY_MODULE)) printk(KERN_WARNING "%s: module license '%s' taints " "kernel.\n", mod->name, license); - add_taint_module(mod, TAINT_PROPRIETARY_MODULE); + add_taint_module(mod, TAINT_PROPRIETARY_MODULE, + LOCKDEP_NOW_UNRELIABLE); } } @@ -2539,7 +2541,7 @@ static int copy_module_from_fd(int fd, struct load_info *info) if (err) goto out; - err = vfs_getattr(file->f_vfsmnt, file->f_dentry, &stat); + err = vfs_getattr(&file->f_path, &stat); if (err) goto out; @@ -2700,10 +2702,10 @@ static int check_modinfo(struct module *mod, struct load_info *info, int flags) } if (!get_modinfo(info, "intree")) - add_taint_module(mod, TAINT_OOT_MODULE); + add_taint_module(mod, TAINT_OOT_MODULE, LOCKDEP_STILL_OK); if (get_modinfo(info, "staging")) { - add_taint_module(mod, TAINT_CRAP); + add_taint_module(mod, TAINT_CRAP, LOCKDEP_STILL_OK); printk(KERN_WARNING "%s: module is from the staging directory," " the quality is unknown, you have been warned.\n", mod->name); @@ -2869,15 +2871,17 @@ static int check_module_license_and_versions(struct module *mod) * using GPL-only symbols it needs. */ if (strcmp(mod->name, "ndiswrapper") == 0) - add_taint(TAINT_PROPRIETARY_MODULE); + add_taint(TAINT_PROPRIETARY_MODULE, LOCKDEP_NOW_UNRELIABLE); /* driverloader was caught wrongly pretending to be under GPL */ if (strcmp(mod->name, "driverloader") == 0) - add_taint_module(mod, TAINT_PROPRIETARY_MODULE); + add_taint_module(mod, TAINT_PROPRIETARY_MODULE, + LOCKDEP_NOW_UNRELIABLE); /* lve claims to be GPL but upstream won't provide source */ if (strcmp(mod->name, "lve") == 0) - add_taint_module(mod, TAINT_PROPRIETARY_MODULE); + add_taint_module(mod, TAINT_PROPRIETARY_MODULE, + LOCKDEP_NOW_UNRELIABLE); #ifdef CONFIG_MODVERSIONS if ((mod->num_syms && !mod->crcs) @@ -3141,12 +3145,72 @@ static int may_init_module(void) return 0; } +/* + * We try to place it in the list now to make sure it's unique before + * we dedicate too many resources. In particular, temporary percpu + * memory exhaustion. + */ +static int add_unformed_module(struct module *mod) +{ + int err; + struct module *old; + + mod->state = MODULE_STATE_UNFORMED; + +again: + mutex_lock(&module_mutex); + if ((old = find_module_all(mod->name, true)) != NULL) { + if (old->state == MODULE_STATE_COMING + || old->state == MODULE_STATE_UNFORMED) { + /* Wait in case it fails to load. 
*/ + mutex_unlock(&module_mutex); + err = wait_event_interruptible(module_wq, + finished_loading(mod->name)); + if (err) + goto out_unlocked; + goto again; + } + err = -EEXIST; + goto out; + } + list_add_rcu(&mod->list, &modules); + err = 0; + +out: + mutex_unlock(&module_mutex); +out_unlocked: + return err; +} + +static int complete_formation(struct module *mod, struct load_info *info) +{ + int err; + + mutex_lock(&module_mutex); + + /* Find duplicate symbols (must be called under lock). */ + err = verify_export_symbols(mod); + if (err < 0) + goto out; + + /* This relies on module_mutex for list integrity. */ + module_bug_finalize(info->hdr, info->sechdrs, mod); + + /* Mark state as coming so strong_try_module_get() ignores us, + * but kallsyms etc. can see us. */ + mod->state = MODULE_STATE_COMING; + +out: + mutex_unlock(&module_mutex); + return err; +} + /* Allocate and load the module: note that size of section 0 is always zero, and we rely on this for optional sections. */ static int load_module(struct load_info *info, const char __user *uargs, int flags) { - struct module *mod, *old; + struct module *mod; long err; err = module_sig_check(info); @@ -3164,36 +3228,20 @@ static int load_module(struct load_info *info, const char __user *uargs, goto free_copy; } - /* - * We try to place it in the list now to make sure it's unique - * before we dedicate too many resources. In particular, - * temporary percpu memory exhaustion. - */ - mod->state = MODULE_STATE_UNFORMED; -again: - mutex_lock(&module_mutex); - if ((old = find_module_all(mod->name, true)) != NULL) { - if (old->state == MODULE_STATE_COMING - || old->state == MODULE_STATE_UNFORMED) { - /* Wait in case it fails to load. */ - mutex_unlock(&module_mutex); - err = wait_event_interruptible(module_wq, - finished_loading(mod->name)); - if (err) - goto free_module; - goto again; - } - err = -EEXIST; - mutex_unlock(&module_mutex); + /* Reserve our place in the list. */ + err = add_unformed_module(mod); + if (err) goto free_module; - } - list_add_rcu(&mod->list, &modules); - mutex_unlock(&module_mutex); #ifdef CONFIG_MODULE_SIG mod->sig_ok = info->sig_ok; - if (!mod->sig_ok) - add_taint_module(mod, TAINT_FORCED_MODULE); + if (!mod->sig_ok) { + printk_once(KERN_NOTICE + "%s: module verification failed: signature and/or" + " required key missing - tainting kernel\n", + mod->name); + add_taint_module(mod, TAINT_FORCED_MODULE, LOCKDEP_STILL_OK); + } #endif /* Now module is in final location, initialize linked lists, etc. */ @@ -3236,21 +3284,11 @@ again: dynamic_debug_setup(info->debug, info->num_debug); - mutex_lock(&module_mutex); - /* Find duplicate symbols (must be called under lock). */ - err = verify_export_symbols(mod); - if (err < 0) + /* Finally it's fully formed, ready to start executing. */ + err = complete_formation(mod, info); + if (err) goto ddebug_cleanup; - /* This relies on module_mutex for list integrity. */ - module_bug_finalize(info->hdr, info->sechdrs, mod); - - /* Mark state as coming so strong_try_module_get() ignores us, - * but kallsyms etc. can see us. */ - mod->state = MODULE_STATE_COMING; - - mutex_unlock(&module_mutex); - /* Module is ready to execute: parsing args may do that. 
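
Two invariants drive the load_module() reshuffle above: add_unformed_module() claims the module's name (in state MODULE_STATE_UNFORMED) before any heavyweight setup, and a racing loader of the same name sleeps on module_wq until finished_loading() reports that the earlier attempt either went live or vanished, then retries from scratch. The wait-and-retry core, condensed into one sketch (error handling abbreviated):

/* Condensed from add_unformed_module() above. */
for (;;) {
	mutex_lock(&module_mutex);
	old = find_module_all(mod->name, true);	/* true: UNFORMED visible */
	if (!old) {
		list_add_rcu(&mod->list, &modules);	/* name is now ours */
		mutex_unlock(&module_mutex);
		break;
	}
	if (old->state != MODULE_STATE_COMING &&
	    old->state != MODULE_STATE_UNFORMED) {
		mutex_unlock(&module_mutex);
		return -EEXIST;		/* a fully live duplicate exists */
	}
	mutex_unlock(&module_mutex);
	/* the other load may still fail: wait for its verdict, then retry */
	err = wait_event_interruptible(module_wq,
				       finished_loading(mod->name));
	if (err)
		return err;		/* interrupted by a signal */
}
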
*/ err = parse_args(mod->name, mod->args, mod->kp, mod->num_kp, -32768, 32767, &ddebug_dyndbg_module_param_cb); @@ -3274,8 +3312,8 @@ again: /* module_bug_cleanup needs module_mutex protection */ mutex_lock(&module_mutex); module_bug_cleanup(mod); - ddebug_cleanup: mutex_unlock(&module_mutex); + ddebug_cleanup: dynamic_debug_remove(info->debug); synchronize_sched(); kfree(mod->args); diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index 78e2ecb20165..afc0456f227a 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -153,8 +153,7 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk) goto out; } - new_ns = create_new_namespaces(flags, tsk, - task_cred_xxx(tsk, user_ns), tsk->fs); + new_ns = create_new_namespaces(flags, tsk, user_ns, tsk->fs); if (IS_ERR(new_ns)) { err = PTR_ERR(new_ns); goto out; @@ -251,7 +250,7 @@ SYSCALL_DEFINE2(setns, int, fd, int, nstype) return PTR_ERR(file); err = -EINVAL; - ei = PROC_I(file->f_dentry->d_inode); + ei = PROC_I(file_inode(file)); ops = ei->ns_ops; if (nstype && (ops->type != nstype)) goto out; diff --git a/kernel/panic.c b/kernel/panic.c index e1b2822fff97..7c57cc9eee2c 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -259,26 +259,19 @@ unsigned long get_taint(void) return tainted_mask; } -void add_taint(unsigned flag) +/** + * add_taint: add a taint flag if not already set. + * @flag: one of the TAINT_* constants. + * @lockdep_ok: whether lock debugging is still OK. + * + * If something bad has gone wrong, you'll want @lockdep_ok = false, but for + * some noteworthy-but-not-corrupting cases, it can be set to true. + */ +void add_taint(unsigned flag, enum lockdep_ok lockdep_ok) { - /* - * Can't trust the integrity of the kernel anymore. - * We don't call directly debug_locks_off() because the issue - * is not necessarily serious enough to set oops_in_progress to 1 - * Also we want to keep up lockdep for staging/out-of-tree - * development and post-warning case. - */ - switch (flag) { - case TAINT_CRAP: - case TAINT_OOT_MODULE: - case TAINT_WARN: - case TAINT_FIRMWARE_WORKAROUND: - break; - - default: - if (__debug_locks_off()) - printk(KERN_WARNING "Disabling lock debugging due to kernel taint\n"); - } + if (lockdep_ok == LOCKDEP_NOW_UNRELIABLE && __debug_locks_off()) + printk(KERN_WARNING + "Disabling lock debugging due to kernel taint\n"); set_bit(flag, &tainted_mask); } @@ -421,7 +414,8 @@ static void warn_slowpath_common(const char *file, int line, void *caller, print_modules(); dump_stack(); print_oops_end_marker(); - add_taint(taint); + /* Just a warning, don't kill lockdep. */ + add_taint(taint, LOCKDEP_STILL_OK); } void warn_slowpath_fmt(const char *file, int line, const char *fmt, ...) diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index 10349d5f2ec3..7edfe4b901e7 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -639,6 +639,13 @@ static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags) { struct k_itimer *timr; + /* + * timer_t could be any type >= int and we want to make sure any + * @timer_id outside positive int range fails lookup.
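
The range check that follows is a truncation guard: idr_find() takes an int, so a wide timer_t passed straight through as (int)timer_id would wrap. Worked example of the hole being closed (values illustrative):

/* On a 64-bit ABI where timer_t is long:
 *   timer_id = 0x100000001  ->  (int)timer_id == 1
 * i.e. a forged out-of-range id would alias valid timer 1 in the idr,
 * so anything above INT_MAX is rejected before the lookup: */
if ((unsigned long long)timer_id > INT_MAX)
	return NULL;
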
+ */ + if ((unsigned long long)timer_id > INT_MAX) + return NULL; + rcu_read_lock(); timr = idr_find(&posix_timers_id, (int)timer_id); if (timr) { diff --git a/kernel/power/autosleep.c b/kernel/power/autosleep.c index ca304046d9e2..c6422ffeda9a 100644 --- a/kernel/power/autosleep.c +++ b/kernel/power/autosleep.c @@ -66,7 +66,7 @@ static DECLARE_WORK(suspend_work, try_to_suspend); void queue_up_suspend_work(void) { - if (!work_pending(&suspend_work) && autosleep_state > PM_SUSPEND_ON) + if (autosleep_state > PM_SUSPEND_ON) queue_work(autosleep_wq, &suspend_work); } diff --git a/kernel/power/main.c b/kernel/power/main.c index 1c16f9167de1..d77663bfedeb 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -313,7 +313,7 @@ static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr, static suspend_state_t decode_state(const char *buf, size_t n) { #ifdef CONFIG_SUSPEND - suspend_state_t state = PM_SUSPEND_STANDBY; + suspend_state_t state = PM_SUSPEND_MIN; const char * const *s; #endif char *p; @@ -553,6 +553,30 @@ power_attr(pm_trace_dev_match); #endif /* CONFIG_PM_TRACE */ +#ifdef CONFIG_FREEZER +static ssize_t pm_freeze_timeout_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%u\n", freeze_timeout_msecs); +} + +static ssize_t pm_freeze_timeout_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t n) +{ + unsigned long val; + + if (kstrtoul(buf, 10, &val)) + return -EINVAL; + + freeze_timeout_msecs = val; + return n; +} + +power_attr(pm_freeze_timeout); + +#endif /* CONFIG_FREEZER*/ + static struct attribute * g[] = { &state_attr.attr, #ifdef CONFIG_PM_TRACE @@ -576,6 +600,9 @@ static struct attribute * g[] = { &pm_print_times_attr.attr, #endif #endif +#ifdef CONFIG_FREEZER + &pm_freeze_timeout_attr.attr, +#endif NULL, }; diff --git a/kernel/power/process.c b/kernel/power/process.c index d5a258b60c6f..98088e0e71e8 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -21,7 +21,7 @@ /* * Timeout for stopping processes */ -#define TIMEOUT (20 * HZ) +unsigned int __read_mostly freeze_timeout_msecs = 20 * MSEC_PER_SEC; static int try_to_freeze_tasks(bool user_only) { @@ -36,7 +36,7 @@ static int try_to_freeze_tasks(bool user_only) do_gettimeofday(&start); - end_time = jiffies + TIMEOUT; + end_time = jiffies + msecs_to_jiffies(freeze_timeout_msecs); if (!user_only) freeze_workqueues_begin(); diff --git a/kernel/power/qos.c b/kernel/power/qos.c index 9322ff7eaad6..587dddeebf15 100644 --- a/kernel/power/qos.c +++ b/kernel/power/qos.c @@ -359,8 +359,7 @@ void pm_qos_update_request(struct pm_qos_request *req, return; } - if (delayed_work_pending(&req->work)) - cancel_delayed_work_sync(&req->work); + cancel_delayed_work_sync(&req->work); if (new_value != req->node.prio) pm_qos_update_target( @@ -386,8 +385,7 @@ void pm_qos_update_request_timeout(struct pm_qos_request *req, s32 new_value, "%s called for unknown object.", __func__)) return; - if (delayed_work_pending(&req->work)) - cancel_delayed_work_sync(&req->work); + cancel_delayed_work_sync(&req->work); if (new_value != req->node.prio) pm_qos_update_target( @@ -416,8 +414,7 @@ void pm_qos_remove_request(struct pm_qos_request *req) return; } - if (delayed_work_pending(&req->work)) - cancel_delayed_work_sync(&req->work); + cancel_delayed_work_sync(&req->work); pm_qos_update_target(pm_qos_array[req->pm_qos_class]->constraints, &req->node, PM_QOS_REMOVE_REQ, diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 
c8b7446b27df..d4feda084a3a 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -30,12 +30,38 @@ #include "power.h" const char *const pm_states[PM_SUSPEND_MAX] = { + [PM_SUSPEND_FREEZE] = "freeze", [PM_SUSPEND_STANDBY] = "standby", [PM_SUSPEND_MEM] = "mem", }; static const struct platform_suspend_ops *suspend_ops; +static bool need_suspend_ops(suspend_state_t state) +{ + return !!(state > PM_SUSPEND_FREEZE); +} + +static DECLARE_WAIT_QUEUE_HEAD(suspend_freeze_wait_head); +static bool suspend_freeze_wake; + +static void freeze_begin(void) +{ + suspend_freeze_wake = false; +} + +static void freeze_enter(void) +{ + wait_event(suspend_freeze_wait_head, suspend_freeze_wake); +} + +void freeze_wake(void) +{ + suspend_freeze_wake = true; + wake_up(&suspend_freeze_wait_head); +} +EXPORT_SYMBOL_GPL(freeze_wake); + /** * suspend_set_ops - Set the global suspend method table. * @ops: Suspend operations to use. @@ -50,8 +76,11 @@ EXPORT_SYMBOL_GPL(suspend_set_ops); bool valid_state(suspend_state_t state) { + if (state == PM_SUSPEND_FREEZE) + return true; /* - * All states need lowlevel support and need to be valid to the lowlevel + * PM_SUSPEND_STANDBY and PM_SUSPEND_MEMORY states need lowlevel + * support and need to be valid to the lowlevel * implementation, no valid callback implies that none are valid. */ return suspend_ops && suspend_ops->valid && suspend_ops->valid(state); @@ -89,11 +118,11 @@ static int suspend_test(int level) * hibernation). Run suspend notifiers, allocate the "suspend" console and * freeze processes. */ -static int suspend_prepare(void) +static int suspend_prepare(suspend_state_t state) { int error; - if (!suspend_ops || !suspend_ops->enter) + if (need_suspend_ops(state) && (!suspend_ops || !suspend_ops->enter)) return -EPERM; pm_prepare_console(); @@ -137,7 +166,7 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) { int error; - if (suspend_ops->prepare) { + if (need_suspend_ops(state) && suspend_ops->prepare) { error = suspend_ops->prepare(); if (error) goto Platform_finish; @@ -149,12 +178,23 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) goto Platform_finish; } - if (suspend_ops->prepare_late) { + if (need_suspend_ops(state) && suspend_ops->prepare_late) { error = suspend_ops->prepare_late(); if (error) goto Platform_wake; } + /* + * PM_SUSPEND_FREEZE equals + * frozen processes + suspended devices + idle processors. + * Thus we should invoke freeze_enter() soon after + * all the devices are suspended. 
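
The three helpers above are the entire suspend-to-freeze wait machinery: freeze_begin() clears the flag before the transition starts, freeze_enter() blocks until freeze_wake() is called from a wakeup source, and because wait_event() re-checks its condition under the waitqueue lock, a wakeup that fires between the two is not lost. Intended calling sequence (ordering sketch only; error paths omitted):

freeze_begin();		/* suspend_freeze_wake = false              */
/* ... tasks frozen, devices suspended, per suspend_enter() below ... */
freeze_enter();		/* sleeps, unless a wakeup already arrived  */

/* meanwhile, from any wakeup source (e.g. an interrupt handler): */
freeze_wake();		/* set the flag and wake the waiter         */
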
+ */ + if (state == PM_SUSPEND_FREEZE) { + freeze_enter(); + goto Platform_wake; + } + if (suspend_test(TEST_PLATFORM)) goto Platform_wake; @@ -182,13 +222,13 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) enable_nonboot_cpus(); Platform_wake: - if (suspend_ops->wake) + if (need_suspend_ops(state) && suspend_ops->wake) suspend_ops->wake(); dpm_resume_start(PMSG_RESUME); Platform_finish: - if (suspend_ops->finish) + if (need_suspend_ops(state) && suspend_ops->finish) suspend_ops->finish(); return error; @@ -203,11 +243,11 @@ int suspend_devices_and_enter(suspend_state_t state) int error; bool wakeup = false; - if (!suspend_ops) + if (need_suspend_ops(state) && !suspend_ops) return -ENOSYS; trace_machine_suspend(state); - if (suspend_ops->begin) { + if (need_suspend_ops(state) && suspend_ops->begin) { error = suspend_ops->begin(state); if (error) goto Close; @@ -226,7 +266,7 @@ int suspend_devices_and_enter(suspend_state_t state) do { error = suspend_enter(state, &wakeup); - } while (!error && !wakeup + } while (!error && !wakeup && need_suspend_ops(state) && suspend_ops->suspend_again && suspend_ops->suspend_again()); Resume_devices: @@ -236,13 +276,13 @@ int suspend_devices_and_enter(suspend_state_t state) ftrace_start(); resume_console(); Close: - if (suspend_ops->end) + if (need_suspend_ops(state) && suspend_ops->end) suspend_ops->end(); trace_machine_suspend(PWR_EVENT_EXIT); return error; Recover_platform: - if (suspend_ops->recover) + if (need_suspend_ops(state) && suspend_ops->recover) suspend_ops->recover(); goto Resume_devices; } @@ -278,12 +318,15 @@ static int enter_state(suspend_state_t state) if (!mutex_trylock(&pm_mutex)) return -EBUSY; + if (state == PM_SUSPEND_FREEZE) + freeze_begin(); + printk(KERN_INFO "PM: Syncing filesystems ... "); sys_sync(); printk("done.\n"); pr_debug("PM: Preparing system for %s sleep\n", pm_states[state]); - error = suspend_prepare(); + error = suspend_prepare(state); if (error) goto Unlock; diff --git a/kernel/power/suspend_test.c b/kernel/power/suspend_test.c index 25596e450ac7..9b2a1d58558d 100644 --- a/kernel/power/suspend_test.c +++ b/kernel/power/suspend_test.c @@ -112,7 +112,7 @@ static void __init test_wakealarm(struct rtc_device *rtc, suspend_state_t state) rtc_set_alarm(rtc, &alm); } -static int __init has_wakealarm(struct device *dev, void *name_ptr) +static int __init has_wakealarm(struct device *dev, const void *data) { struct rtc_device *candidate = to_rtc_device(dev); @@ -121,7 +121,6 @@ static int __init has_wakealarm(struct device *dev, void *name_ptr) if (!device_may_wakeup(candidate->dev.parent)) return 0; - *(const char **)name_ptr = dev_name(dev); return 1; } @@ -159,8 +158,8 @@ static int __init test_suspend(void) static char warn_no_rtc[] __initdata = KERN_WARNING "PM: no wakealarm-capable RTC driver is ready\n"; - char *pony = NULL; struct rtc_device *rtc = NULL; + struct device *dev; /* PM is initialized by now; is that state testable? */ if (test_state == PM_SUSPEND_ON) @@ -171,9 +170,9 @@ static int __init test_suspend(void) } /* RTCs have initialized by now too ... can we use one? 
*/ - class_find_device(rtc_class, NULL, &pony, has_wakealarm); - if (pony) - rtc = rtc_class_open(pony); + dev = class_find_device(rtc_class, NULL, NULL, has_wakealarm); + if (dev) + rtc = rtc_class_open(dev_name(dev)); if (!rtc) { printk(warn_no_rtc); goto done; diff --git a/kernel/printk.c b/kernel/printk.c index f24633afa46a..0b31715f335a 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -88,6 +88,12 @@ static DEFINE_SEMAPHORE(console_sem); struct console *console_drivers; EXPORT_SYMBOL_GPL(console_drivers); +#ifdef CONFIG_LOCKDEP +static struct lockdep_map console_lock_dep_map = { + .name = "console_lock" +}; +#endif + /* * This is used for debugging the mess that is the VT code by * keeping track if we have the console semaphore held. It's @@ -1919,6 +1925,7 @@ void console_lock(void) return; console_locked = 1; console_may_schedule = 1; + mutex_acquire(&console_lock_dep_map, 0, 0, _RET_IP_); } EXPORT_SYMBOL(console_lock); @@ -1940,6 +1947,7 @@ int console_trylock(void) } console_locked = 1; console_may_schedule = 0; + mutex_acquire(&console_lock_dep_map, 0, 1, _RET_IP_); return 1; } EXPORT_SYMBOL(console_trylock); @@ -2102,6 +2110,7 @@ skip: local_irq_restore(flags); } console_locked = 0; + mutex_release(&console_lock_dep_map, 1, _RET_IP_); /* Release the exclusive_console once it is used */ if (unlikely(exclusive_console)) diff --git a/kernel/relay.c b/kernel/relay.c index e8cd2027abbd..01ab081ac53a 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -1139,7 +1139,7 @@ static ssize_t relay_file_read_subbufs(struct file *filp, loff_t *ppos, if (!desc->count) return 0; - mutex_lock(&filp->f_path.dentry->d_inode->i_mutex); + mutex_lock(&file_inode(filp)->i_mutex); do { if (!relay_file_read_avail(buf, *ppos)) break; @@ -1159,7 +1159,7 @@ static ssize_t relay_file_read_subbufs(struct file *filp, loff_t *ppos, *ppos = relay_file_read_end_pos(buf, read_start, ret); } } while (desc->count && ret); - mutex_unlock(&filp->f_path.dentry->d_inode->i_mutex); + mutex_unlock(&file_inode(filp)->i_mutex); return desc->written; } diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c index 0984a21076a3..64de5f8b0c9e 100644 --- a/kernel/sched/auto_group.c +++ b/kernel/sched/auto_group.c @@ -35,6 +35,7 @@ static inline void autogroup_destroy(struct kref *kref) ag->tg->rt_se = NULL; ag->tg->rt_rq = NULL; #endif + sched_offline_group(ag->tg); sched_destroy_group(ag->tg); } @@ -76,6 +77,8 @@ static inline struct autogroup *autogroup_create(void) if (IS_ERR(tg)) goto out_free; + sched_online_group(tg, &root_task_group); + kref_init(&ag->kref); init_rwsem(&ag->lock); ag->id = atomic_inc_return(&autogroup_seq_nr); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 03d7784b7bd2..2b5243176aba 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1132,18 +1132,28 @@ EXPORT_SYMBOL_GPL(kick_process); */ static int select_fallback_rq(int cpu, struct task_struct *p) { - const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(cpu)); + int nid = cpu_to_node(cpu); + const struct cpumask *nodemask = NULL; enum { cpuset, possible, fail } state = cpuset; int dest_cpu; - /* Look for allowed, online CPU in same node. */ - for_each_cpu(dest_cpu, nodemask) { - if (!cpu_online(dest_cpu)) - continue; - if (!cpu_active(dest_cpu)) - continue; - if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p))) - return dest_cpu; + /* + * If the node that the cpu is on has been offlined, cpu_to_node() + * will return -1. There is no cpu on the node, and we should + * select the cpu on the other node. 
+ */ + if (nid != -1) { + nodemask = cpumask_of_node(nid); + + /* Look for allowed, online CPU in same node. */ + for_each_cpu(dest_cpu, nodemask) { + if (!cpu_online(dest_cpu)) + continue; + if (!cpu_active(dest_cpu)) + continue; + if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p))) + return dest_cpu; + } } for (;;) { @@ -1969,11 +1979,10 @@ context_switch(struct rq *rq, struct task_struct *prev, } /* - * nr_running, nr_uninterruptible and nr_context_switches: + * nr_running and nr_context_switches: * * externally visible scheduler statistics: current number of runnable - * threads, current number of uninterruptible-sleeping threads, total - * number of context switches performed since bootup. + * threads, total number of context switches performed since bootup. */ unsigned long nr_running(void) { @@ -1985,23 +1994,6 @@ unsigned long nr_running(void) return sum; } -unsigned long nr_uninterruptible(void) -{ - unsigned long i, sum = 0; - - for_each_possible_cpu(i) - sum += cpu_rq(i)->nr_uninterruptible; - - /* - * Since we read the counters lockless, it might be slightly - * inaccurate. Do not allow it to go below zero though: - */ - if (unlikely((long)sum < 0)) - sum = 0; - - return sum; -} - unsigned long long nr_context_switches(void) { int i; @@ -2786,7 +2778,7 @@ static noinline void __schedule_bug(struct task_struct *prev) if (irqs_disabled()) print_irqtrace_events(prev); dump_stack(); - add_taint(TAINT_WARN); + add_taint(TAINT_WARN, LOCKDEP_STILL_OK); } /* @@ -4364,7 +4356,10 @@ EXPORT_SYMBOL(yield); * It's the caller's job to ensure that the target task struct * can't go away on us before we can do any checks. * - * Returns true if we indeed boosted the target task. + * Returns: + * true (>0) if we indeed boosted the target task. + * false (0) if we failed to boost the target. + * -ESRCH if there's no task to yield to. */ bool __sched yield_to(struct task_struct *p, bool preempt) { @@ -4378,6 +4373,15 @@ bool __sched yield_to(struct task_struct *p, bool preempt) again: p_rq = task_rq(p); + /* + * If we're the only runnable task on the rq and target rq also + * has only one task, there's absolutely no point in yielding. 
+ */ + if (rq->nr_running == 1 && p_rq->nr_running == 1) { + yielded = -ESRCH; + goto out_irq; + } + double_rq_lock(rq, p_rq); while (task_rq(p) != p_rq) { double_rq_unlock(rq, p_rq); @@ -4385,13 +4389,13 @@ again: } if (!curr->sched_class->yield_to_task) - goto out; + goto out_unlock; if (curr->sched_class != p->sched_class) - goto out; + goto out_unlock; if (task_running(p_rq, p) || p->state) - goto out; + goto out_unlock; yielded = curr->sched_class->yield_to_task(rq, p, preempt); if (yielded) { @@ -4404,11 +4408,12 @@ again: resched_task(p_rq->curr); } -out: +out_unlock: double_rq_unlock(rq, p_rq); +out_irq: local_irq_restore(flags); - if (yielded) + if (yielded > 0) schedule(); return yielded; @@ -7161,7 +7166,6 @@ static void free_sched_group(struct task_group *tg) struct task_group *sched_create_group(struct task_group *parent) { struct task_group *tg; - unsigned long flags; tg = kzalloc(sizeof(*tg), GFP_KERNEL); if (!tg) @@ -7173,6 +7177,17 @@ struct task_group *sched_create_group(struct task_group *parent) if (!alloc_rt_sched_group(tg, parent)) goto err; + return tg; + +err: + free_sched_group(tg); + return ERR_PTR(-ENOMEM); +} + +void sched_online_group(struct task_group *tg, struct task_group *parent) +{ + unsigned long flags; + spin_lock_irqsave(&task_group_lock, flags); list_add_rcu(&tg->list, &task_groups); @@ -7182,12 +7197,6 @@ struct task_group *sched_create_group(struct task_group *parent) INIT_LIST_HEAD(&tg->children); list_add_rcu(&tg->siblings, &parent->children); spin_unlock_irqrestore(&task_group_lock, flags); - - return tg; - -err: - free_sched_group(tg); - return ERR_PTR(-ENOMEM); } /* rcu callback to free various structures associated with a task group */ @@ -7200,6 +7209,12 @@ static void free_sched_group_rcu(struct rcu_head *rhp) /* Destroy runqueue etc associated with a task group */ void sched_destroy_group(struct task_group *tg) { + /* wait for possible concurrent references to cfs_rqs complete */ + call_rcu(&tg->rcu, free_sched_group_rcu); +} + +void sched_offline_group(struct task_group *tg) +{ unsigned long flags; int i; @@ -7211,9 +7226,6 @@ void sched_destroy_group(struct task_group *tg) list_del_rcu(&tg->list); list_del_rcu(&tg->siblings); spin_unlock_irqrestore(&task_group_lock, flags); - - /* wait for possible concurrent references to cfs_rqs complete */ - call_rcu(&tg->rcu, free_sched_group_rcu); } /* change task's runqueue when it moves between groups. 
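
sched_create_group()/sched_online_group() now mirror the css_alloc/css_online split used elsewhere in this series: creation only allocates, and the group joins the task_groups list (becoming visible to load balancing) only when onlined. Callers pair the two, as the autogroup hunk above already does:

struct task_group *tg;

tg = sched_create_group(&root_task_group);	/* allocate only */
if (IS_ERR(tg))
	return PTR_ERR(tg);
sched_online_group(tg, &root_task_group);	/* publish under task_group_lock */
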
@@ -7584,6 +7596,19 @@ static struct cgroup_subsys_state *cpu_cgroup_css_alloc(struct cgroup *cgrp) return &tg->css; } +static int cpu_cgroup_css_online(struct cgroup *cgrp) +{ + struct task_group *tg = cgroup_tg(cgrp); + struct task_group *parent; + + if (!cgrp->parent) + return 0; + + parent = cgroup_tg(cgrp->parent); + sched_online_group(tg, parent); + return 0; +} + static void cpu_cgroup_css_free(struct cgroup *cgrp) { struct task_group *tg = cgroup_tg(cgrp); @@ -7591,6 +7616,13 @@ static void cpu_cgroup_css_free(struct cgroup *cgrp) sched_destroy_group(tg); } +static void cpu_cgroup_css_offline(struct cgroup *cgrp) +{ + struct task_group *tg = cgroup_tg(cgrp); + + sched_offline_group(tg); +} + static int cpu_cgroup_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) { @@ -7946,6 +7978,8 @@ struct cgroup_subsys cpu_cgroup_subsys = { .name = "cpu", .css_alloc = cpu_cgroup_css_alloc, .css_free = cpu_cgroup_css_free, + .css_online = cpu_cgroup_css_online, + .css_offline = cpu_cgroup_css_offline, .can_attach = cpu_cgroup_can_attach, .attach = cpu_cgroup_attach, .exit = cpu_cgroup_exit, diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 9857329ed280..ed12cbb135f4 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -604,7 +604,7 @@ static unsigned long long vtime_delta(struct task_struct *tsk) { unsigned long long clock; - clock = sched_clock(); + clock = local_clock(); if (clock < tsk->vtime_snap) return 0; diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 7ae4c4c5420e..75024a673520 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -110,13 +110,6 @@ static char *task_group_path(struct task_group *tg) if (autogroup_path(tg, group_path, PATH_MAX)) return group_path; - /* - * May be NULL if the underlying cgroup isn't fully-created yet - */ - if (!tg->css.cgroup) { - group_path[0] = '\0'; - return group_path; - } cgroup_path(tg->css.cgroup, group_path, PATH_MAX); return group_path; } @@ -269,11 +262,11 @@ static void print_cpu(struct seq_file *m, int cpu) { unsigned int freq = cpu_khz ? 
: 1; - SEQ_printf(m, "\ncpu#%d, %u.%03u MHz\n", + SEQ_printf(m, "cpu#%d, %u.%03u MHz\n", cpu, freq / 1000, (freq % 1000)); } #else - SEQ_printf(m, "\ncpu#%d\n", cpu); + SEQ_printf(m, "cpu#%d\n", cpu); #endif #define P(x) \ @@ -330,6 +323,7 @@ do { \ print_rq(m, rq, cpu); rcu_read_unlock(); spin_unlock_irqrestore(&sched_debug_lock, flags); + SEQ_printf(m, "\n"); } static const char *sched_tunable_scaling_names[] = { @@ -338,11 +332,10 @@ static const char *sched_tunable_scaling_names[] = { "linear" }; -static int sched_debug_show(struct seq_file *m, void *v) +static void sched_debug_header(struct seq_file *m) { u64 ktime, sched_clk, cpu_clk; unsigned long flags; - int cpu; local_irq_save(flags); ktime = ktime_to_ns(ktime_get()); @@ -384,33 +377,101 @@ static int sched_debug_show(struct seq_file *m, void *v) #undef PN #undef P - SEQ_printf(m, " .%-40s: %d (%s)\n", "sysctl_sched_tunable_scaling", + SEQ_printf(m, " .%-40s: %d (%s)\n", + "sysctl_sched_tunable_scaling", sysctl_sched_tunable_scaling, sched_tunable_scaling_names[sysctl_sched_tunable_scaling]); + SEQ_printf(m, "\n"); +} - for_each_online_cpu(cpu) - print_cpu(m, cpu); +static int sched_debug_show(struct seq_file *m, void *v) +{ + int cpu = (unsigned long)(v - 2); - SEQ_printf(m, "\n"); + if (cpu != -1) + print_cpu(m, cpu); + else + sched_debug_header(m); return 0; } void sysrq_sched_debug_show(void) { - sched_debug_show(NULL, NULL); + int cpu; + + sched_debug_header(NULL); + for_each_online_cpu(cpu) + print_cpu(NULL, cpu); + +} + +/* + * This iterator needs some explanation. + * It returns 1 for the header position. + * This means 2 is cpu 0. + * In a hotplugged system some cpus, including cpu 0, may be missing so we have + * to use cpumask_* to iterate over the cpus. + */ +static void *sched_debug_start(struct seq_file *file, loff_t *offset) +{ + unsigned long n = *offset; + + if (n == 0) + return (void *) 1; + + n--; + + if (n > 0) + n = cpumask_next(n - 1, cpu_online_mask); + else + n = cpumask_first(cpu_online_mask); + + *offset = n + 1; + + if (n < nr_cpu_ids) + return (void *)(unsigned long)(n + 2); + return NULL; +} + +static void *sched_debug_next(struct seq_file *file, void *data, loff_t *offset) +{ + (*offset)++; + return sched_debug_start(file, offset); +} + +static void sched_debug_stop(struct seq_file *file, void *data) +{ +} + +static const struct seq_operations sched_debug_sops = { + .start = sched_debug_start, + .next = sched_debug_next, + .stop = sched_debug_stop, + .show = sched_debug_show, +}; + +static int sched_debug_release(struct inode *inode, struct file *file) +{ + seq_release(inode, file); + + return 0; } static int sched_debug_open(struct inode *inode, struct file *filp) { - return single_open(filp, sched_debug_show, NULL); + int ret = 0; + + ret = seq_open(filp, &sched_debug_sops); + + return ret; } static const struct file_operations sched_debug_fops = { .open = sched_debug_open, .read = seq_read, .llseek = seq_lseek, - .release = single_release, + .release = sched_debug_release, }; static int __init init_sched_debug_procfs(void) diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c index 903ffa9e8872..e036eda1a9c9 100644 --- a/kernel/sched/stats.c +++ b/kernel/sched/stats.c @@ -21,14 +21,17 @@ static int show_schedstat(struct seq_file *seq, void *v) if (mask_str == NULL) return -ENOMEM; - seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION); - seq_printf(seq, "timestamp %lu\n", jiffies); - for_each_online_cpu(cpu) { - struct rq *rq = cpu_rq(cpu); + if (v == (void *)1) { + seq_printf(seq, "version
%d\n", SCHEDSTAT_VERSION); + seq_printf(seq, "timestamp %lu\n", jiffies); + } else { + struct rq *rq; #ifdef CONFIG_SMP struct sched_domain *sd; int dcount = 0; #endif + cpu = (unsigned long)(v - 2); + rq = cpu_rq(cpu); /* runqueue-specific stats */ seq_printf(seq, @@ -77,30 +80,66 @@ static int show_schedstat(struct seq_file *seq, void *v) return 0; } -static int schedstat_open(struct inode *inode, struct file *file) +/* + * This iterator needs some explanation. + * It returns 1 for the header position. + * This means 2 is cpu 0. + * In a hotplugged system some cpus, including cpu 0, may be missing so we have + * to use cpumask_* to iterate over the cpus. + */ +static void *schedstat_start(struct seq_file *file, loff_t *offset) { - unsigned int size = PAGE_SIZE * (1 + num_online_cpus() / 32); - char *buf = kmalloc(size, GFP_KERNEL); - struct seq_file *m; - int res; + unsigned long n = *offset; - if (!buf) - return -ENOMEM; - res = single_open(file, show_schedstat, NULL); - if (!res) { - m = file->private_data; - m->buf = buf; - m->size = size; - } else - kfree(buf); - return res; + if (n == 0) + return (void *) 1; + + n--; + + if (n > 0) + n = cpumask_next(n - 1, cpu_online_mask); + else + n = cpumask_first(cpu_online_mask); + + *offset = n + 1; + + if (n < nr_cpu_ids) + return (void *)(unsigned long)(n + 2); + return NULL; +} + +static void *schedstat_next(struct seq_file *file, void *data, loff_t *offset) +{ + (*offset)++; + return schedstat_start(file, offset); +} + +static void schedstat_stop(struct seq_file *file, void *data) +{ +} + +static const struct seq_operations schedstat_sops = { + .start = schedstat_start, + .next = schedstat_next, + .stop = schedstat_stop, + .show = show_schedstat, +}; + +static int schedstat_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &schedstat_sops); } +static int schedstat_release(struct inode *inode, struct file *file) +{ + return 0; +}; + static const struct file_operations proc_schedstat_operations = { .open = schedstat_open, .read = seq_read, .llseek = seq_lseek, - .release = single_release, + .release = schedstat_release, }; static int __init proc_schedstat_init(void) diff --git a/kernel/signal.c b/kernel/signal.c index 7f82adbad480..2a7ae2963185 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2399,6 +2399,15 @@ void signal_delivered(int sig, siginfo_t *info, struct k_sigaction *ka, tracehook_signal_handler(sig, info, ka, regs, stepping); } +void signal_setup_done(int failed, struct ksignal *ksig, int stepping) +{ + if (failed) + force_sigsegv(ksig->sig, current); + else + signal_delivered(ksig->sig, &ksig->info, &ksig->ka, + signal_pt_regs(), stepping); +} + /* * It could be that complete_signal() picked us to notify about the * group-wide signal. Other threads should be notified now to take @@ -2616,28 +2625,58 @@ SYSCALL_DEFINE4(rt_sigprocmask, int, how, sigset_t __user *, nset, return 0; } -long do_sigpending(void __user *set, unsigned long sigsetsize) +#ifdef CONFIG_COMPAT +COMPAT_SYSCALL_DEFINE4(rt_sigprocmask, int, how, compat_sigset_t __user *, nset, + compat_sigset_t __user *, oset, compat_size_t, sigsetsize) { - long error = -EINVAL; - sigset_t pending; +#ifdef __BIG_ENDIAN + sigset_t old_set = current->blocked; + + /* XXX: Don't preclude handling different sized sigset_t's.
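
The #ifdef __BIG_ENDIAN forks above and below exist because a compat sigset is an array of 32-bit words while a native 64-bit sigset packs two of them into each long; on little-endian the two layouts coincide in memory, so the native syscall can simply be called. The essence of the conversion the big-endian path performs (a sketch of what sigset_from_compat() does on a 64-bit kernel with a single-word sigset):

static void fold_compat_sigset(sigset_t *set, const compat_sigset_t *c)
{
	/* low compat word supplies bits 0-31, high word bits 32-63 */
	set->sig[0] = c->sig[0] | ((unsigned long)c->sig[1] << 32);
}
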
*/ + if (sigsetsize != sizeof(sigset_t)) + return -EINVAL; + + if (nset) { + compat_sigset_t new32; + sigset_t new_set; + int error; + if (copy_from_user(&new32, nset, sizeof(compat_sigset_t))) + return -EFAULT; + + sigset_from_compat(&new_set, &new32); + sigdelsetmask(&new_set, sigmask(SIGKILL)|sigmask(SIGSTOP)); + + error = sigprocmask(how, &new_set, NULL); + if (error) + return error; + } + if (oset) { + compat_sigset_t old32; + sigset_to_compat(&old32, &old_set); + if (copy_to_user(oset, &old_set, sizeof(sigset_t))) + return -EFAULT; + } + return 0; +#else + return sys_rt_sigprocmask(how, (sigset_t __user *)nset, + (sigset_t __user *)oset, sigsetsize); +#endif +} +#endif +static int do_sigpending(void *set, unsigned long sigsetsize) +{ if (sigsetsize > sizeof(sigset_t)) - goto out; + return -EINVAL; spin_lock_irq(&current->sighand->siglock); - sigorsets(&pending, &current->pending.signal, + sigorsets(set, &current->pending.signal, &current->signal->shared_pending.signal); spin_unlock_irq(&current->sighand->siglock); /* Outside the lock because only this thread touches it. */ - sigandsets(&pending, &current->blocked, &pending); - - error = -EFAULT; - if (!copy_to_user(set, &pending, sigsetsize)) - error = 0; - -out: - return error; + sigandsets(set, &current->blocked, set); + return 0; } /** @@ -2646,10 +2685,35 @@ out: * @set: stores pending signals * @sigsetsize: size of sigset_t type or larger */ -SYSCALL_DEFINE2(rt_sigpending, sigset_t __user *, set, size_t, sigsetsize) +SYSCALL_DEFINE2(rt_sigpending, sigset_t __user *, uset, size_t, sigsetsize) { - return do_sigpending(set, sigsetsize); + sigset_t set; + int err = do_sigpending(&set, sigsetsize); + if (!err && copy_to_user(uset, &set, sigsetsize)) + err = -EFAULT; + return err; +} + +#ifdef CONFIG_COMPAT +COMPAT_SYSCALL_DEFINE2(rt_sigpending, compat_sigset_t __user *, uset, + compat_size_t, sigsetsize) +{ +#ifdef __BIG_ENDIAN + sigset_t set; + int err = do_sigpending(&set, sigsetsize); + if (!err) { + compat_sigset_t set32; + sigset_to_compat(&set32, &set); + /* we can get here only if sigsetsize <= sizeof(set) */ + if (copy_to_user(uset, &set32, sigsetsize)) + err = -EFAULT; + } + return err; +#else + return sys_rt_sigpending((sigset_t __user *)uset, sigsetsize); +#endif } +#endif #ifndef HAVE_ARCH_COPY_SIGINFO_TO_USER @@ -2927,6 +2991,22 @@ SYSCALL_DEFINE2(tkill, pid_t, pid, int, sig) return do_tkill(0, pid, sig); } +static int do_rt_sigqueueinfo(pid_t pid, int sig, siginfo_t *info) +{ + /* Not even root can pretend to send signals from the kernel. + * Nor can they impersonate a kill()/tgkill(), which adds source info. + */ + if (info->si_code >= 0 || info->si_code == SI_TKILL) { + /* We used to allow any < 0 si_code */ + WARN_ON_ONCE(info->si_code < 0); + return -EPERM; + } + info->si_signo = sig; + + /* POSIX.1b doesn't mention process groups. */ + return kill_proc_info(sig, info, pid); +} + /** * sys_rt_sigqueueinfo - send signal information to a signal * @pid: the PID of the thread * @sig: signal to be sent * @uinfo: signal info to be sent */ @@ -2937,25 +3017,26 @@ SYSCALL_DEFINE3(rt_sigqueueinfo, pid_t, pid, int, sig, siginfo_t __user *, uinfo) { siginfo_t info; - if (copy_from_user(&info, uinfo, sizeof(siginfo_t))) return -EFAULT; + return do_rt_sigqueueinfo(pid, sig, &info); +} - /* Not even root can pretend to send signals from the kernel. - * Nor can they impersonate a kill()/tgkill(), which adds source info. 
- */ - if (info.si_code >= 0 || info.si_code == SI_TKILL) { - /* We used to allow any < 0 si_code */ - WARN_ON_ONCE(info.si_code < 0); - return -EPERM; - } - info.si_signo = sig; - - /* POSIX.1b doesn't mention process groups. */ - return kill_proc_info(sig, &info, pid); +#ifdef CONFIG_COMPAT +COMPAT_SYSCALL_DEFINE3(rt_sigqueueinfo, + compat_pid_t, pid, + int, sig, + struct compat_siginfo __user *, uinfo) +{ + siginfo_t info; + int ret = copy_siginfo_from_user32(&info, uinfo); + if (unlikely(ret)) + return ret; + return do_rt_sigqueueinfo(pid, sig, &info); } +#endif -long do_rt_tgsigqueueinfo(pid_t tgid, pid_t pid, int sig, siginfo_t *info) +static int do_rt_tgsigqueueinfo(pid_t tgid, pid_t pid, int sig, siginfo_t *info) { /* This is only valid for single tasks */ if (pid <= 0 || tgid <= 0) @@ -2985,6 +3066,21 @@ SYSCALL_DEFINE4(rt_tgsigqueueinfo, pid_t, tgid, pid_t, pid, int, sig, return do_rt_tgsigqueueinfo(tgid, pid, sig, &info); } +#ifdef CONFIG_COMPAT +COMPAT_SYSCALL_DEFINE4(rt_tgsigqueueinfo, + compat_pid_t, tgid, + compat_pid_t, pid, + int, sig, + struct compat_siginfo __user *, uinfo) +{ + siginfo_t info; + + if (copy_siginfo_from_user32(&info, uinfo)) + return -EFAULT; + return do_rt_tgsigqueueinfo(tgid, pid, sig, &info); +} +#endif + int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact) { struct task_struct *t = current; @@ -3030,7 +3126,7 @@ int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact) return 0; } -int +static int do_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, unsigned long sp) { stack_t oss; @@ -3095,12 +3191,10 @@ do_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, unsigned long s out: return error; } -#ifdef CONFIG_GENERIC_SIGALTSTACK SYSCALL_DEFINE2(sigaltstack,const stack_t __user *,uss, stack_t __user *,uoss) { return do_sigaltstack(uss, uoss, current_user_stack_pointer()); } -#endif int restore_altstack(const stack_t __user *uss) { @@ -3118,7 +3212,6 @@ int __save_altstack(stack_t __user *uss, unsigned long sp) } #ifdef CONFIG_COMPAT -#ifdef CONFIG_GENERIC_SIGALTSTACK COMPAT_SYSCALL_DEFINE2(sigaltstack, const compat_stack_t __user *, uss_ptr, compat_stack_t __user *, uoss_ptr) @@ -3168,7 +3261,6 @@ int __compat_save_altstack(compat_stack_t __user *uss, unsigned long sp) __put_user(t->sas_ss_size, &uss->ss_size); } #endif -#endif #ifdef __ARCH_WANT_SYS_SIGPENDING @@ -3178,7 +3270,7 @@ int __compat_save_altstack(compat_stack_t __user *uss, unsigned long sp) */ SYSCALL_DEFINE1(sigpending, old_sigset_t __user *, set) { - return do_sigpending(set, sizeof(*set)); + return sys_rt_sigpending((sigset_t __user *)set, sizeof(old_sigset_t)); } #endif @@ -3234,7 +3326,7 @@ SYSCALL_DEFINE3(sigprocmask, int, how, old_sigset_t __user *, nset, } #endif /* __ARCH_WANT_SYS_SIGPROCMASK */ -#ifdef __ARCH_WANT_SYS_RT_SIGACTION +#ifndef CONFIG_ODD_RT_SIGACTION /** * sys_rt_sigaction - alter an action taken by a process * @sig: signal to be sent @@ -3268,7 +3360,132 @@ SYSCALL_DEFINE4(rt_sigaction, int, sig, out: return ret; } -#endif /* __ARCH_WANT_SYS_RT_SIGACTION */ +#ifdef CONFIG_COMPAT +COMPAT_SYSCALL_DEFINE4(rt_sigaction, int, sig, + const struct compat_sigaction __user *, act, + struct compat_sigaction __user *, oact, + compat_size_t, sigsetsize) +{ + struct k_sigaction new_ka, old_ka; + compat_sigset_t mask; +#ifdef __ARCH_HAS_SA_RESTORER + compat_uptr_t restorer; +#endif + int ret; + + /* XXX: Don't preclude handling different sized sigset_t's. 
*/ + if (sigsetsize != sizeof(compat_sigset_t)) + return -EINVAL; + + if (act) { + compat_uptr_t handler; + ret = get_user(handler, &act->sa_handler); + new_ka.sa.sa_handler = compat_ptr(handler); +#ifdef __ARCH_HAS_SA_RESTORER + ret |= get_user(restorer, &act->sa_restorer); + new_ka.sa.sa_restorer = compat_ptr(restorer); +#endif + ret |= copy_from_user(&mask, &act->sa_mask, sizeof(mask)); + ret |= __get_user(new_ka.sa.sa_flags, &act->sa_flags); + if (ret) + return -EFAULT; + sigset_from_compat(&new_ka.sa.sa_mask, &mask); + } + + ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL); + if (!ret && oact) { + sigset_to_compat(&mask, &old_ka.sa.sa_mask); + ret = put_user(ptr_to_compat(old_ka.sa.sa_handler), + &oact->sa_handler); + ret |= copy_to_user(&oact->sa_mask, &mask, sizeof(mask)); + ret |= __put_user(old_ka.sa.sa_flags, &oact->sa_flags); +#ifdef __ARCH_HAS_SA_RESTORER + ret |= put_user(ptr_to_compat(old_ka.sa.sa_restorer), + &oact->sa_restorer); +#endif + } + return ret; +} +#endif +#endif /* !CONFIG_ODD_RT_SIGACTION */ + +#ifdef CONFIG_OLD_SIGACTION +SYSCALL_DEFINE3(sigaction, int, sig, + const struct old_sigaction __user *, act, + struct old_sigaction __user *, oact) +{ + struct k_sigaction new_ka, old_ka; + int ret; + + if (act) { + old_sigset_t mask; + if (!access_ok(VERIFY_READ, act, sizeof(*act)) || + __get_user(new_ka.sa.sa_handler, &act->sa_handler) || + __get_user(new_ka.sa.sa_restorer, &act->sa_restorer) || + __get_user(new_ka.sa.sa_flags, &act->sa_flags) || + __get_user(mask, &act->sa_mask)) + return -EFAULT; +#ifdef __ARCH_HAS_KA_RESTORER + new_ka.ka_restorer = NULL; +#endif + siginitset(&new_ka.sa.sa_mask, mask); + } + + ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL); + + if (!ret && oact) { + if (!access_ok(VERIFY_WRITE, oact, sizeof(*oact)) || + __put_user(old_ka.sa.sa_handler, &oact->sa_handler) || + __put_user(old_ka.sa.sa_restorer, &oact->sa_restorer) || + __put_user(old_ka.sa.sa_flags, &oact->sa_flags) || + __put_user(old_ka.sa.sa_mask.sig[0], &oact->sa_mask)) + return -EFAULT; + } + + return ret; +} +#endif +#ifdef CONFIG_COMPAT_OLD_SIGACTION +COMPAT_SYSCALL_DEFINE3(sigaction, int, sig, + const struct compat_old_sigaction __user *, act, + struct compat_old_sigaction __user *, oact) +{ + struct k_sigaction new_ka, old_ka; + int ret; + compat_old_sigset_t mask; + compat_uptr_t handler, restorer; + + if (act) { + if (!access_ok(VERIFY_READ, act, sizeof(*act)) || + __get_user(handler, &act->sa_handler) || + __get_user(restorer, &act->sa_restorer) || + __get_user(new_ka.sa.sa_flags, &act->sa_flags) || + __get_user(mask, &act->sa_mask)) + return -EFAULT; + +#ifdef __ARCH_HAS_KA_RESTORER + new_ka.ka_restorer = NULL; +#endif + new_ka.sa.sa_handler = compat_ptr(handler); + new_ka.sa.sa_restorer = compat_ptr(restorer); + siginitset(&new_ka.sa.sa_mask, mask); + } + + ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? 
&old_ka : NULL); + + if (!ret && oact) { + if (!access_ok(VERIFY_WRITE, oact, sizeof(*oact)) || + __put_user(ptr_to_compat(old_ka.sa.sa_handler), + &oact->sa_handler) || + __put_user(ptr_to_compat(old_ka.sa.sa_restorer), + &oact->sa_restorer) || + __put_user(old_ka.sa.sa_flags, &oact->sa_flags) || + __put_user(old_ka.sa.sa_mask.sig[0], &oact->sa_mask)) + return -EFAULT; + } + return ret; +} +#endif #ifdef __ARCH_WANT_SYS_SGETMASK @@ -3336,7 +3553,6 @@ int sigsuspend(sigset_t *set) return -ERESTARTNOHAND; } -#ifdef __ARCH_WANT_SYS_RT_SIGSUSPEND /** * sys_rt_sigsuspend - replace the signal mask for a value with the * @unewset value until a signal is received @@ -3355,7 +3571,45 @@ SYSCALL_DEFINE2(rt_sigsuspend, sigset_t __user *, unewset, size_t, sigsetsize) return -EFAULT; return sigsuspend(&newset); } -#endif /* __ARCH_WANT_SYS_RT_SIGSUSPEND */ + +#ifdef CONFIG_COMPAT +COMPAT_SYSCALL_DEFINE2(rt_sigsuspend, compat_sigset_t __user *, unewset, compat_size_t, sigsetsize) +{ +#ifdef __BIG_ENDIAN + sigset_t newset; + compat_sigset_t newset32; + + /* XXX: Don't preclude handling different sized sigset_t's. */ + if (sigsetsize != sizeof(sigset_t)) + return -EINVAL; + + if (copy_from_user(&newset32, unewset, sizeof(compat_sigset_t))) + return -EFAULT; + sigset_from_compat(&newset, &newset32); + return sigsuspend(&newset); +#else + /* on little-endian bitmaps don't care about granularity */ + return sys_rt_sigsuspend((sigset_t __user *)unewset, sigsetsize); +#endif +} +#endif + +#ifdef CONFIG_OLD_SIGSUSPEND +SYSCALL_DEFINE1(sigsuspend, old_sigset_t, mask) +{ + sigset_t blocked; + siginitset(&blocked, mask); + return sigsuspend(&blocked); +} +#endif +#ifdef CONFIG_OLD_SIGSUSPEND3 +SYSCALL_DEFINE3(sigsuspend, int, unused1, int, unused2, old_sigset_t, mask) +{ + sigset_t blocked; + siginitset(&blocked, mask); + return sigsuspend(&blocked); +} +#endif __attribute__((weak)) const char *arch_vma_name(struct vm_area_struct *vma) { diff --git a/kernel/smp.c b/kernel/smp.c index 69f38bd98b42..8e451f3ff51b 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -16,22 +16,12 @@ #include "smpboot.h" #ifdef CONFIG_USE_GENERIC_SMP_HELPERS -static struct { - struct list_head queue; - raw_spinlock_t lock; -} call_function __cacheline_aligned_in_smp = - { - .queue = LIST_HEAD_INIT(call_function.queue), - .lock = __RAW_SPIN_LOCK_UNLOCKED(call_function.lock), - }; - enum { CSD_FLAG_LOCK = 0x01, }; struct call_function_data { - struct call_single_data csd; - atomic_t refs; + struct call_single_data __percpu *csd; cpumask_var_t cpumask; cpumask_var_t cpumask_ipi; }; @@ -60,6 +50,11 @@ hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu) if (!zalloc_cpumask_var_node(&cfd->cpumask_ipi, GFP_KERNEL, cpu_to_node(cpu))) return notifier_from_errno(-ENOMEM); + cfd->csd = alloc_percpu(struct call_single_data); + if (!cfd->csd) { + free_cpumask_var(cfd->cpumask); + return notifier_from_errno(-ENOMEM); + } break; #ifdef CONFIG_HOTPLUG_CPU @@ -70,6 +65,7 @@ hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu) case CPU_DEAD_FROZEN: free_cpumask_var(cfd->cpumask); free_cpumask_var(cfd->cpumask_ipi); + free_percpu(cfd->csd); break; #endif }; @@ -171,85 +167,6 @@ void generic_exec_single(int cpu, struct call_single_data *data, int wait) } /* - * Invoked by arch to handle an IPI for call function. Must be called with - * interrupts disabled. 
- */ -void generic_smp_call_function_interrupt(void) -{ - struct call_function_data *data; - int cpu = smp_processor_id(); - - /* - * Shouldn't receive this interrupt on a cpu that is not yet online. - */ - WARN_ON_ONCE(!cpu_online(cpu)); - - /* - * Ensure entry is visible on call_function_queue after we have - * entered the IPI. See comment in smp_call_function_many. - * If we don't have this, then we may miss an entry on the list - * and never get another IPI to process it. - */ - smp_mb(); - - /* - * It's ok to use list_for_each_rcu() here even though we may - * delete 'pos', since list_del_rcu() doesn't clear ->next - */ - list_for_each_entry_rcu(data, &call_function.queue, csd.list) { - int refs; - smp_call_func_t func; - - /* - * Since we walk the list without any locks, we might - * see an entry that was completed, removed from the - * list and is in the process of being reused. - * - * We must check that the cpu is in the cpumask before - * checking the refs, and both must be set before - * executing the callback on this cpu. - */ - - if (!cpumask_test_cpu(cpu, data->cpumask)) - continue; - - smp_rmb(); - - if (atomic_read(&data->refs) == 0) - continue; - - func = data->csd.func; /* save for later warn */ - func(data->csd.info); - - /* - * If the cpu mask is not still set then func enabled - * interrupts (BUG), and this cpu took another smp call - * function interrupt and executed func(info) twice - * on this cpu. That nested execution decremented refs. - */ - if (!cpumask_test_and_clear_cpu(cpu, data->cpumask)) { - WARN(1, "%pf enabled interrupts and double executed\n", func); - continue; - } - - refs = atomic_dec_return(&data->refs); - WARN_ON(refs < 0); - - if (refs) - continue; - - WARN_ON(!cpumask_empty(data->cpumask)); - - raw_spin_lock(&call_function.lock); - list_del_rcu(&data->csd.list); - raw_spin_unlock(&call_function.lock); - - csd_unlock(&data->csd); - } - -} - -/* * Invoked by arch to handle an IPI for call function single. Must be * called from the arch with interrupts disabled. */ @@ -453,8 +370,7 @@ void smp_call_function_many(const struct cpumask *mask, smp_call_func_t func, void *info, bool wait) { struct call_function_data *data; - unsigned long flags; - int refs, cpu, next_cpu, this_cpu = smp_processor_id(); + int cpu, next_cpu, this_cpu = smp_processor_id(); /* * Can deadlock when called with interrupts disabled. @@ -486,50 +402,13 @@ void smp_call_function_many(const struct cpumask *mask, } data = &__get_cpu_var(cfd_data); - csd_lock(&data->csd); - - /* This BUG_ON verifies our reuse assertions and can be removed */ - BUG_ON(atomic_read(&data->refs) || !cpumask_empty(data->cpumask)); - - /* - * The global call function queue list add and delete are protected - * by a lock, but the list is traversed without any lock, relying - * on the rcu list add and delete to allow safe concurrent traversal. - * We reuse the call function data without waiting for any grace - * period after some other cpu removes it from the global queue. - * This means a cpu might find our data block as it is being - * filled out. - * - * We hold off the interrupt handler on the other cpu by - * ordering our writes to the cpu mask vs our setting of the - * refs counter. We assert only the cpu owning the data block - * will set a bit in cpumask, and each bit will only be cleared - * by the subject cpu. Each cpu must first find its bit is - * set and then check that refs is set indicating the element is - * ready to be processed, otherwise it must skip the entry. 
- * - * On the previous iteration refs was set to 0 by another cpu. - * To avoid the use of transitivity, set the counter to 0 here - * so the wmb will pair with the rmb in the interrupt handler. - */ - atomic_set(&data->refs, 0); /* convert 3rd to 1st party write */ - - data->csd.func = func; - data->csd.info = info; - /* Ensure 0 refs is visible before mask. Also orders func and info */ - smp_wmb(); - - /* We rely on the "and" being processed before the store */ cpumask_and(data->cpumask, mask, cpu_online_mask); cpumask_clear_cpu(this_cpu, data->cpumask); - refs = cpumask_weight(data->cpumask); /* Some callers race with other cpus changing the passed mask */ - if (unlikely(!refs)) { - csd_unlock(&data->csd); + if (unlikely(!cpumask_weight(data->cpumask))) return; - } /* * After we put an entry into the list, data->cpumask @@ -537,34 +416,32 @@ void smp_call_function_many(const struct cpumask *mask, * a SMP function call, so data->cpumask will be zero. */ cpumask_copy(data->cpumask_ipi, data->cpumask); - raw_spin_lock_irqsave(&call_function.lock, flags); - /* - * Place entry at the _HEAD_ of the list, so that any cpu still - * observing the entry in generic_smp_call_function_interrupt() - * will not miss any other list entries: - */ - list_add_rcu(&data->csd.list, &call_function.queue); - /* - * We rely on the wmb() in list_add_rcu to complete our writes - * to the cpumask before this write to refs, which indicates - * data is on the list and is ready to be processed. - */ - atomic_set(&data->refs, refs); - raw_spin_unlock_irqrestore(&call_function.lock, flags); - /* - * Make the list addition visible before sending the ipi. - * (IPIs must obey or appear to obey normal Linux cache - * coherency rules -- see comment in generic_exec_single). - */ - smp_mb(); + for_each_cpu(cpu, data->cpumask) { + struct call_single_data *csd = per_cpu_ptr(data->csd, cpu); + struct call_single_queue *dst = + &per_cpu(call_single_queue, cpu); + unsigned long flags; + + csd_lock(csd); + csd->func = func; + csd->info = info; + + raw_spin_lock_irqsave(&dst->lock, flags); + list_add_tail(&csd->list, &dst->list); + raw_spin_unlock_irqrestore(&dst->lock, flags); + } /* Send a message to all CPUs in the map */ arch_send_call_function_ipi_mask(data->cpumask_ipi); - /* Optionally wait for the CPUs to complete */ - if (wait) - csd_lock_wait(&data->csd); + if (wait) { + for_each_cpu(cpu, data->cpumask) { + struct call_single_data *csd = + per_cpu_ptr(data->csd, cpu); + csd_lock_wait(csd); + } + } } EXPORT_SYMBOL(smp_call_function_many); diff --git a/kernel/softirq.c b/kernel/softirq.c index f5cc25f147a6..b4d252fd195b 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -195,21 +195,21 @@ void local_bh_enable_ip(unsigned long ip) EXPORT_SYMBOL(local_bh_enable_ip); /* - * We restart softirq processing MAX_SOFTIRQ_RESTART times, - * and we fall back to softirqd after that. + * We restart softirq processing for at most 2 ms, + * and if need_resched() is not set. * - * This number has been established via experimentation. + * These limits have been established via experimentation. * The two things to balance is latency against fairness - * we want to handle softirqs as soon as possible, but they * should not be able to lock up the box. 
*/ -#define MAX_SOFTIRQ_RESTART 10 +#define MAX_SOFTIRQ_TIME msecs_to_jiffies(2) asmlinkage void __do_softirq(void) { struct softirq_action *h; __u32 pending; - int max_restart = MAX_SOFTIRQ_RESTART; + unsigned long end = jiffies + MAX_SOFTIRQ_TIME; int cpu; unsigned long old_flags = current->flags; @@ -264,11 +264,12 @@ restart: local_irq_disable(); pending = local_softirq_pending(); - if (pending && --max_restart) - goto restart; + if (pending) { + if (time_before(jiffies, end) && !need_resched()) + goto restart; - if (pending) wakeup_softirqd(); + } lockdep_softirq_exit(); diff --git a/kernel/sys.c b/kernel/sys.c index 265b37690421..e10566bee399 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -47,6 +47,7 @@ #include <linux/syscalls.h> #include <linux/kprobes.h> #include <linux/user_namespace.h> +#include <linux/binfmts.h> #include <linux/kmsg_dump.h> /* Move somewhere else to avoid recompiling? */ @@ -433,11 +434,12 @@ static DEFINE_MUTEX(reboot_mutex); SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd, void __user *, arg) { + struct pid_namespace *pid_ns = task_active_pid_ns(current); char buffer[256]; int ret = 0; /* We only trust the superuser with rebooting the system. */ - if (!capable(CAP_SYS_BOOT)) + if (!ns_capable(pid_ns->user_ns, CAP_SYS_BOOT)) return -EPERM; /* For safety, we require "magic" arguments. */ @@ -453,7 +455,7 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd, * pid_namespace, the command is handled by reboot_pid_ns() which will * call do_exit(). */ - ret = reboot_pid_ns(task_active_pid_ns(current), cmd); + ret = reboot_pid_ns(pid_ns, cmd); if (ret) return ret; @@ -1792,14 +1794,14 @@ SYSCALL_DEFINE1(umask, int, mask) static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) { struct fd exe; - struct dentry *dentry; + struct inode *inode; int err; exe = fdget(fd); if (!exe.file) return -EBADF; - dentry = exe.file->f_path.dentry; + inode = file_inode(exe.file); /* * Because the original mm->exe_file points to executable file, make @@ -1807,11 +1809,11 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) * overall picture. 
*/ err = -EACCES; - if (!S_ISREG(dentry->d_inode->i_mode) || + if (!S_ISREG(inode->i_mode) || exe.file->f_path.mnt->mnt_flags & MNT_NOEXEC) goto exit; - err = inode_permission(dentry->d_inode, MAY_EXEC); + err = inode_permission(inode, MAY_EXEC); if (err) goto exit; @@ -2012,160 +2014,159 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, error = 0; switch (option) { - case PR_SET_PDEATHSIG: - if (!valid_signal(arg2)) { - error = -EINVAL; - break; - } - me->pdeath_signal = arg2; - break; - case PR_GET_PDEATHSIG: - error = put_user(me->pdeath_signal, (int __user *)arg2); - break; - case PR_GET_DUMPABLE: - error = get_dumpable(me->mm); + case PR_SET_PDEATHSIG: + if (!valid_signal(arg2)) { + error = -EINVAL; break; - case PR_SET_DUMPABLE: - if (arg2 < 0 || arg2 > 1) { - error = -EINVAL; - break; - } - set_dumpable(me->mm, arg2); + } + me->pdeath_signal = arg2; + break; + case PR_GET_PDEATHSIG: + error = put_user(me->pdeath_signal, (int __user *)arg2); + break; + case PR_GET_DUMPABLE: + error = get_dumpable(me->mm); + break; + case PR_SET_DUMPABLE: + if (arg2 != SUID_DUMP_DISABLE && arg2 != SUID_DUMP_USER) { + error = -EINVAL; break; + } + set_dumpable(me->mm, arg2); + break; - case PR_SET_UNALIGN: - error = SET_UNALIGN_CTL(me, arg2); - break; - case PR_GET_UNALIGN: - error = GET_UNALIGN_CTL(me, arg2); - break; - case PR_SET_FPEMU: - error = SET_FPEMU_CTL(me, arg2); - break; - case PR_GET_FPEMU: - error = GET_FPEMU_CTL(me, arg2); - break; - case PR_SET_FPEXC: - error = SET_FPEXC_CTL(me, arg2); - break; - case PR_GET_FPEXC: - error = GET_FPEXC_CTL(me, arg2); - break; - case PR_GET_TIMING: - error = PR_TIMING_STATISTICAL; - break; - case PR_SET_TIMING: - if (arg2 != PR_TIMING_STATISTICAL) - error = -EINVAL; - break; - case PR_SET_NAME: - comm[sizeof(me->comm)-1] = 0; - if (strncpy_from_user(comm, (char __user *)arg2, - sizeof(me->comm) - 1) < 0) - return -EFAULT; - set_task_comm(me, comm); - proc_comm_connector(me); - break; - case PR_GET_NAME: - get_task_comm(comm, me); - if (copy_to_user((char __user *)arg2, comm, - sizeof(comm))) - return -EFAULT; - break; - case PR_GET_ENDIAN: - error = GET_ENDIAN(me, arg2); - break; - case PR_SET_ENDIAN: - error = SET_ENDIAN(me, arg2); - break; - case PR_GET_SECCOMP: - error = prctl_get_seccomp(); - break; - case PR_SET_SECCOMP: - error = prctl_set_seccomp(arg2, (char __user *)arg3); - break; - case PR_GET_TSC: - error = GET_TSC_CTL(arg2); - break; - case PR_SET_TSC: - error = SET_TSC_CTL(arg2); - break; - case PR_TASK_PERF_EVENTS_DISABLE: - error = perf_event_task_disable(); - break; - case PR_TASK_PERF_EVENTS_ENABLE: - error = perf_event_task_enable(); - break; - case PR_GET_TIMERSLACK: - error = current->timer_slack_ns; - break; - case PR_SET_TIMERSLACK: - if (arg2 <= 0) - current->timer_slack_ns = + case PR_SET_UNALIGN: + error = SET_UNALIGN_CTL(me, arg2); + break; + case PR_GET_UNALIGN: + error = GET_UNALIGN_CTL(me, arg2); + break; + case PR_SET_FPEMU: + error = SET_FPEMU_CTL(me, arg2); + break; + case PR_GET_FPEMU: + error = GET_FPEMU_CTL(me, arg2); + break; + case PR_SET_FPEXC: + error = SET_FPEXC_CTL(me, arg2); + break; + case PR_GET_FPEXC: + error = GET_FPEXC_CTL(me, arg2); + break; + case PR_GET_TIMING: + error = PR_TIMING_STATISTICAL; + break; + case PR_SET_TIMING: + if (arg2 != PR_TIMING_STATISTICAL) + error = -EINVAL; + break; + case PR_SET_NAME: + comm[sizeof(me->comm) - 1] = 0; + if (strncpy_from_user(comm, (char __user *)arg2, + sizeof(me->comm) - 1) < 0) + return -EFAULT; + set_task_comm(me, comm); + 
proc_comm_connector(me); + break; + case PR_GET_NAME: + get_task_comm(comm, me); + if (copy_to_user((char __user *)arg2, comm, sizeof(comm))) + return -EFAULT; + break; + case PR_GET_ENDIAN: + error = GET_ENDIAN(me, arg2); + break; + case PR_SET_ENDIAN: + error = SET_ENDIAN(me, arg2); + break; + case PR_GET_SECCOMP: + error = prctl_get_seccomp(); + break; + case PR_SET_SECCOMP: + error = prctl_set_seccomp(arg2, (char __user *)arg3); + break; + case PR_GET_TSC: + error = GET_TSC_CTL(arg2); + break; + case PR_SET_TSC: + error = SET_TSC_CTL(arg2); + break; + case PR_TASK_PERF_EVENTS_DISABLE: + error = perf_event_task_disable(); + break; + case PR_TASK_PERF_EVENTS_ENABLE: + error = perf_event_task_enable(); + break; + case PR_GET_TIMERSLACK: + error = current->timer_slack_ns; + break; + case PR_SET_TIMERSLACK: + if (arg2 <= 0) + current->timer_slack_ns = current->default_timer_slack_ns; - else - current->timer_slack_ns = arg2; - break; - case PR_MCE_KILL: - if (arg4 | arg5) - return -EINVAL; - switch (arg2) { - case PR_MCE_KILL_CLEAR: - if (arg3 != 0) - return -EINVAL; - current->flags &= ~PF_MCE_PROCESS; - break; - case PR_MCE_KILL_SET: - current->flags |= PF_MCE_PROCESS; - if (arg3 == PR_MCE_KILL_EARLY) - current->flags |= PF_MCE_EARLY; - else if (arg3 == PR_MCE_KILL_LATE) - current->flags &= ~PF_MCE_EARLY; - else if (arg3 == PR_MCE_KILL_DEFAULT) - current->flags &= - ~(PF_MCE_EARLY|PF_MCE_PROCESS); - else - return -EINVAL; - break; - default: + else + current->timer_slack_ns = arg2; + break; + case PR_MCE_KILL: + if (arg4 | arg5) + return -EINVAL; + switch (arg2) { + case PR_MCE_KILL_CLEAR: + if (arg3 != 0) return -EINVAL; - } + current->flags &= ~PF_MCE_PROCESS; break; - case PR_MCE_KILL_GET: - if (arg2 | arg3 | arg4 | arg5) - return -EINVAL; - if (current->flags & PF_MCE_PROCESS) - error = (current->flags & PF_MCE_EARLY) ? - PR_MCE_KILL_EARLY : PR_MCE_KILL_LATE; + case PR_MCE_KILL_SET: + current->flags |= PF_MCE_PROCESS; + if (arg3 == PR_MCE_KILL_EARLY) + current->flags |= PF_MCE_EARLY; + else if (arg3 == PR_MCE_KILL_LATE) + current->flags &= ~PF_MCE_EARLY; + else if (arg3 == PR_MCE_KILL_DEFAULT) + current->flags &= + ~(PF_MCE_EARLY|PF_MCE_PROCESS); else - error = PR_MCE_KILL_DEFAULT; - break; - case PR_SET_MM: - error = prctl_set_mm(arg2, arg3, arg4, arg5); - break; - case PR_GET_TID_ADDRESS: - error = prctl_get_tid_address(me, (int __user **)arg2); - break; - case PR_SET_CHILD_SUBREAPER: - me->signal->is_child_subreaper = !!arg2; - break; - case PR_GET_CHILD_SUBREAPER: - error = put_user(me->signal->is_child_subreaper, - (int __user *) arg2); - break; - case PR_SET_NO_NEW_PRIVS: - if (arg2 != 1 || arg3 || arg4 || arg5) return -EINVAL; - - current->no_new_privs = 1; break; - case PR_GET_NO_NEW_PRIVS: - if (arg2 || arg3 || arg4 || arg5) - return -EINVAL; - return current->no_new_privs ? 1 : 0; default: - error = -EINVAL; - break; + return -EINVAL; + } + break; + case PR_MCE_KILL_GET: + if (arg2 | arg3 | arg4 | arg5) + return -EINVAL; + if (current->flags & PF_MCE_PROCESS) + error = (current->flags & PF_MCE_EARLY) ? 
+ PR_MCE_KILL_EARLY : PR_MCE_KILL_LATE; + else + error = PR_MCE_KILL_DEFAULT; + break; + case PR_SET_MM: + error = prctl_set_mm(arg2, arg3, arg4, arg5); + break; + case PR_GET_TID_ADDRESS: + error = prctl_get_tid_address(me, (int __user **)arg2); + break; + case PR_SET_CHILD_SUBREAPER: + me->signal->is_child_subreaper = !!arg2; + break; + case PR_GET_CHILD_SUBREAPER: + error = put_user(me->signal->is_child_subreaper, + (int __user *)arg2); + break; + case PR_SET_NO_NEW_PRIVS: + if (arg2 != 1 || arg3 || arg4 || arg5) + return -EINVAL; + + current->no_new_privs = 1; + break; + case PR_GET_NO_NEW_PRIVS: + if (arg2 || arg3 || arg4 || arg5) + return -EINVAL; + return current->no_new_privs ? 1 : 0; + default: + error = -EINVAL; + break; } return error; } diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 4fc9be955c71..d8df00e69c14 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -105,7 +105,6 @@ extern char core_pattern[]; extern unsigned int core_pipe_limit; #endif extern int pid_max; -extern int min_free_kbytes; extern int pid_max_min, pid_max_max; extern int sysctl_drop_caches; extern int percpu_pagelist_fraction; @@ -162,10 +161,13 @@ extern int unaligned_enabled; #endif #ifdef CONFIG_IA64 -extern int no_unaligned_warning; extern int unaligned_dump_stack; #endif +#ifdef CONFIG_SYSCTL_ARCH_UNALIGN_NO_WARN +extern int no_unaligned_warning; +#endif + #ifdef CONFIG_PROC_SYSCTL static int proc_do_cad_pid(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); @@ -919,7 +921,7 @@ static struct ctl_table kern_table[] = { .proc_handler = proc_doulongvec_minmax, }, #endif -#ifdef CONFIG_IA64 +#ifdef CONFIG_SYSCTL_ARCH_UNALIGN_NO_WARN { .procname = "ignore-unaligned-usertrap", .data = &no_unaligned_warning, @@ -927,6 +929,8 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, +#endif +#ifdef CONFIG_IA64 { .procname = "unaligned-dump-stack", .data = &unaligned_dump_stack, @@ -2014,7 +2018,7 @@ static int proc_taint(struct ctl_table *table, int write, int i; for (i = 0; i < BITS_PER_LONG && tmptaint >> i; i++) { if ((tmptaint >> i) & 1) - add_taint(i); + add_taint(i, LOCKDEP_STILL_OK); } } diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c index 5a6384450501..b25115e8c7f3 100644 --- a/kernel/sysctl_binary.c +++ b/kernel/sysctl_binary.c @@ -387,7 +387,6 @@ static const struct bin_table bin_net_ipv4_table[] = { { CTL_INT, NET_TCP_MODERATE_RCVBUF, "tcp_moderate_rcvbuf" }, { CTL_INT, NET_TCP_TSO_WIN_DIVISOR, "tcp_tso_win_divisor" }, { CTL_STR, NET_TCP_CONG_CONTROL, "tcp_congestion_control" }, - { CTL_INT, NET_TCP_ABC, "tcp_abc" }, { CTL_INT, NET_TCP_MTU_PROBING, "tcp_mtu_probing" }, { CTL_INT, NET_TCP_BASE_MSS, "tcp_base_mss" }, { CTL_INT, NET_IPV4_TCP_WORKAROUND_SIGNED_WINDOWS, "tcp_workaround_signed_windows" }, @@ -971,7 +970,6 @@ out: static ssize_t bin_intvec(struct file *file, void __user *oldval, size_t oldlen, void __user *newval, size_t newlen) { - mm_segment_t old_fs = get_fs(); ssize_t copied = 0; char *buffer; ssize_t result; @@ -984,13 +982,10 @@ static ssize_t bin_intvec(struct file *file, if (oldval && oldlen) { unsigned __user *vec = oldval; size_t length = oldlen / sizeof(*vec); - loff_t pos = 0; char *str, *end; int i; - set_fs(KERNEL_DS); - result = vfs_read(file, buffer, BUFSZ - 1, &pos); - set_fs(old_fs); + result = kernel_read(file, 0, buffer, BUFSZ - 1); if (result < 0) goto out_kfree; @@ -1017,7 +1012,6 @@ static ssize_t bin_intvec(struct file *file, if (newval && newlen) { unsigned __user *vec = 
newval; size_t length = newlen / sizeof(*vec); - loff_t pos = 0; char *str, *end; int i; @@ -1033,9 +1027,7 @@ static ssize_t bin_intvec(struct file *file, str += snprintf(str, end - str, "%lu\t", value); } - set_fs(KERNEL_DS); - result = vfs_write(file, buffer, str - buffer, &pos); - set_fs(old_fs); + result = kernel_write(file, buffer, str - buffer, 0); if (result < 0) goto out_kfree; } @@ -1049,7 +1041,6 @@ out: static ssize_t bin_ulongvec(struct file *file, void __user *oldval, size_t oldlen, void __user *newval, size_t newlen) { - mm_segment_t old_fs = get_fs(); ssize_t copied = 0; char *buffer; ssize_t result; @@ -1062,13 +1053,10 @@ static ssize_t bin_ulongvec(struct file *file, if (oldval && oldlen) { unsigned long __user *vec = oldval; size_t length = oldlen / sizeof(*vec); - loff_t pos = 0; char *str, *end; int i; - set_fs(KERNEL_DS); - result = vfs_read(file, buffer, BUFSZ - 1, &pos); - set_fs(old_fs); + result = kernel_read(file, 0, buffer, BUFSZ - 1); if (result < 0) goto out_kfree; @@ -1095,7 +1083,6 @@ static ssize_t bin_ulongvec(struct file *file, if (newval && newlen) { unsigned long __user *vec = newval; size_t length = newlen / sizeof(*vec); - loff_t pos = 0; char *str, *end; int i; @@ -1111,9 +1098,7 @@ static ssize_t bin_ulongvec(struct file *file, str += snprintf(str, end - str, "%lu\t", value); } - set_fs(KERNEL_DS); - result = vfs_write(file, buffer, str - buffer, &pos); - set_fs(old_fs); + result = kernel_write(file, buffer, str - buffer, 0); if (result < 0) goto out_kfree; } @@ -1127,19 +1112,15 @@ out: static ssize_t bin_uuid(struct file *file, void __user *oldval, size_t oldlen, void __user *newval, size_t newlen) { - mm_segment_t old_fs = get_fs(); ssize_t result, copied = 0; /* Only supports reads */ if (oldval && oldlen) { - loff_t pos = 0; char buf[40], *str = buf; unsigned char uuid[16]; int i; - set_fs(KERNEL_DS); - result = vfs_read(file, buf, sizeof(buf) - 1, &pos); - set_fs(old_fs); + result = kernel_read(file, 0, buf, sizeof(buf) - 1); if (result < 0) goto out; @@ -1175,18 +1156,14 @@ out: static ssize_t bin_dn_node_address(struct file *file, void __user *oldval, size_t oldlen, void __user *newval, size_t newlen) { - mm_segment_t old_fs = get_fs(); ssize_t result, copied = 0; if (oldval && oldlen) { - loff_t pos = 0; char buf[15], *nodep; unsigned long area, node; __le16 dnaddr; - set_fs(KERNEL_DS); - result = vfs_read(file, buf, sizeof(buf) - 1, &pos); - set_fs(old_fs); + result = kernel_read(file, 0, buf, sizeof(buf) - 1); if (result < 0) goto out; @@ -1215,7 +1192,6 @@ static ssize_t bin_dn_node_address(struct file *file, } if (newval && newlen) { - loff_t pos = 0; __le16 dnaddr; char buf[15]; int len; @@ -1232,9 +1208,7 @@ static ssize_t bin_dn_node_address(struct file *file, le16_to_cpu(dnaddr) >> 10, le16_to_cpu(dnaddr) & 0x3ff); - set_fs(KERNEL_DS); - result = vfs_write(file, buf, len, &pos); - set_fs(old_fs); + result = kernel_write(file, buf, len, 0); if (result < 0) goto out; } diff --git a/kernel/time.c b/kernel/time.c index c2a27dd93142..f8342a41efa6 100644 --- a/kernel/time.c +++ b/kernel/time.c @@ -240,7 +240,7 @@ EXPORT_SYMBOL(current_fs_time); * Avoid unnecessary multiplications/divisions in the * two most common HZ cases: */ -inline unsigned int jiffies_to_msecs(const unsigned long j) +unsigned int jiffies_to_msecs(const unsigned long j) { #if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ) return (MSEC_PER_SEC / HZ) * j; @@ -256,7 +256,7 @@ inline unsigned int jiffies_to_msecs(const unsigned long j) } EXPORT_SYMBOL(jiffies_to_msecs); 
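Only the exact fast path of jiffies_to_msecs() is visible in the hunk above: when HZ divides MSEC_PER_SEC evenly, the conversion is a single multiply. For other HZ values the function falls back to the MUL/ADJ/SHR constants generated into timeconst.h. A self-contained sketch of that reciprocal-multiply technique, using the HZ=300 triple (0xd5555556, 0x2aaaaaaa, 30) from the canned table of the timeconst.pl removed later in this diff; the loop bound and asserts are illustrative only, not kernel code:

#include <assert.h>
#include <stdint.h>

/* HZ=300: msecs = jiffies * 1000/300 = jiffies * 10/3.  Multiplying by
 * MUL and shifting by SHR replaces the division; MUL alone floors the
 * quotient, while MUL plus ADJ yields the ceiling needed for the
 * opposite (msec-to-jiffies) direction. */
#define HZ_TO_MSEC_MUL32        UINT64_C(0xd5555556)
#define HZ_TO_MSEC_ADJ32        UINT64_C(0x2aaaaaaa)
#define HZ_TO_MSEC_SHR32        30

int main(void)
{
        for (uint64_t j = 0; j < (UINT64_C(1) << 24); j++) {
                uint64_t fl = (j * HZ_TO_MSEC_MUL32) >> HZ_TO_MSEC_SHR32;
                uint64_t ce = (j * HZ_TO_MSEC_MUL32 + HZ_TO_MSEC_ADJ32)
                                >> HZ_TO_MSEC_SHR32;
                assert(fl == j * 10 / 3);       /* floored division */
                assert(ce == (j * 10 + 2) / 3); /* ceiling division */
        }
        return 0;
}

The generated constants are exact through the signed 32-bit range, which is what the shift search in the generator scripts guarantees.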
-inline unsigned int jiffies_to_usecs(const unsigned long j) +unsigned int jiffies_to_usecs(const unsigned long j) { #if HZ <= USEC_PER_SEC && !(USEC_PER_SEC % HZ) return (USEC_PER_SEC / HZ) * j; diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 30b6de0d977c..c6d6400ee137 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -339,6 +339,7 @@ void clockevents_config_and_register(struct clock_event_device *dev, clockevents_config(dev, freq); clockevents_register_device(dev); } +EXPORT_SYMBOL_GPL(clockevents_config_and_register); /** * clockevents_update_freq - Update frequency and reprogram a clock event device. diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index b10a42bb0165..072bb066bb7d 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -23,7 +23,7 @@ * NTP timekeeping variables: */ -DEFINE_SPINLOCK(ntp_lock); +DEFINE_RAW_SPINLOCK(ntp_lock); /* USER_HZ period (usecs): */ @@ -348,7 +348,7 @@ void ntp_clear(void) { unsigned long flags; - spin_lock_irqsave(&ntp_lock, flags); + raw_spin_lock_irqsave(&ntp_lock, flags); time_adjust = 0; /* stop active adjtime() */ time_status |= STA_UNSYNC; @@ -362,7 +362,7 @@ void ntp_clear(void) /* Clear PPS state variables */ pps_clear(); - spin_unlock_irqrestore(&ntp_lock, flags); + raw_spin_unlock_irqrestore(&ntp_lock, flags); } @@ -372,9 +372,9 @@ u64 ntp_tick_length(void) unsigned long flags; s64 ret; - spin_lock_irqsave(&ntp_lock, flags); + raw_spin_lock_irqsave(&ntp_lock, flags); ret = tick_length; - spin_unlock_irqrestore(&ntp_lock, flags); + raw_spin_unlock_irqrestore(&ntp_lock, flags); return ret; } @@ -395,7 +395,7 @@ int second_overflow(unsigned long secs) int leap = 0; unsigned long flags; - spin_lock_irqsave(&ntp_lock, flags); + raw_spin_lock_irqsave(&ntp_lock, flags); /* * Leap second processing. 
If in leap-insert state at the end of the @@ -479,7 +479,7 @@ int second_overflow(unsigned long secs) time_adjust = 0; out: - spin_unlock_irqrestore(&ntp_lock, flags); + raw_spin_unlock_irqrestore(&ntp_lock, flags); return leap; } @@ -672,7 +672,7 @@ int do_adjtimex(struct timex *txc) getnstimeofday(&ts); - spin_lock_irq(&ntp_lock); + raw_spin_lock_irq(&ntp_lock); if (txc->modes & ADJ_ADJTIME) { long save_adjust = time_adjust; @@ -714,7 +714,7 @@ int do_adjtimex(struct timex *txc) /* fill PPS status fields */ pps_fill_timex(txc); - spin_unlock_irq(&ntp_lock); + raw_spin_unlock_irq(&ntp_lock); txc->time.tv_sec = ts.tv_sec; txc->time.tv_usec = ts.tv_nsec; @@ -912,7 +912,7 @@ void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts) pts_norm = pps_normalize_ts(*phase_ts); - spin_lock_irqsave(&ntp_lock, flags); + raw_spin_lock_irqsave(&ntp_lock, flags); /* clear the error bits, they will be set again if needed */ time_status &= ~(STA_PPSJITTER | STA_PPSWANDER | STA_PPSERROR); @@ -925,7 +925,7 @@ void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts) * just start the frequency interval */ if (unlikely(pps_fbase.tv_sec == 0)) { pps_fbase = *raw_ts; - spin_unlock_irqrestore(&ntp_lock, flags); + raw_spin_unlock_irqrestore(&ntp_lock, flags); return; } @@ -940,7 +940,7 @@ void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts) time_status |= STA_PPSJITTER; /* restart the frequency calibration interval */ pps_fbase = *raw_ts; - spin_unlock_irqrestore(&ntp_lock, flags); + raw_spin_unlock_irqrestore(&ntp_lock, flags); pr_err("hardpps: PPSJITTER: bad pulse\n"); return; } @@ -957,7 +957,7 @@ void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts) hardpps_update_phase(pts_norm.nsec); - spin_unlock_irqrestore(&ntp_lock, flags); + raw_spin_unlock_irqrestore(&ntp_lock, flags); } EXPORT_SYMBOL(hardpps); diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 1e35515a875e..9a0bc98fbe1d 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -138,6 +138,20 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock) } /* Timekeeper helper functions. */ + +#ifdef CONFIG_ARCH_USES_GETTIMEOFFSET +u32 (*arch_gettimeoffset)(void); + +u32 get_arch_timeoffset(void) +{ + if (likely(arch_gettimeoffset)) + return arch_gettimeoffset(); + return 0; +} +#else +static inline u32 get_arch_timeoffset(void) { return 0; } +#endif + static inline s64 timekeeping_get_ns(struct timekeeper *tk) { cycle_t cycle_now, cycle_delta; @@ -154,8 +168,8 @@ static inline s64 timekeeping_get_ns(struct timekeeper *tk) nsec = cycle_delta * tk->mult + tk->xtime_nsec; nsec >>= tk->shift; - /* If arch requires, add in gettimeoffset() */ - return nsec + arch_gettimeoffset(); + /* If arch requires, add in get_arch_timeoffset() */ + return nsec + get_arch_timeoffset(); } static inline s64 timekeeping_get_ns_raw(struct timekeeper *tk) @@ -174,8 +188,8 @@ static inline s64 timekeeping_get_ns_raw(struct timekeeper *tk) /* convert delta to nanoseconds. 
*/ nsec = clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift); - /* If arch requires, add in gettimeoffset() */ - return nsec + arch_gettimeoffset(); + /* If arch requires, add in get_arch_timeoffset() */ + return nsec + get_arch_timeoffset(); } static RAW_NOTIFIER_HEAD(pvclock_gtod_chain); @@ -257,8 +271,8 @@ static void timekeeping_forward_now(struct timekeeper *tk) tk->xtime_nsec += cycle_delta * tk->mult; - /* If arch requires, add in gettimeoffset() */ - tk->xtime_nsec += (u64)arch_gettimeoffset() << tk->shift; + /* If arch requires, add in get_arch_timeoffset() */ + tk->xtime_nsec += (u64)get_arch_timeoffset() << tk->shift; tk_normalize_xtime(tk); diff --git a/kernel/timeconst.bc b/kernel/timeconst.bc new file mode 100644 index 000000000000..511bdf2cafda --- /dev/null +++ b/kernel/timeconst.bc @@ -0,0 +1,108 @@ +scale=0 + +define gcd(a,b) { + auto t; + while (b) { + t = b; + b = a % b; + a = t; + } + return a; +} + +/* Division by reciprocal multiplication. */ +define fmul(b,n,d) { + return (2^b*n+d-1)/d; +} + +/* Adjustment factor when a ceiling value is used. Use as: + (imul * n) + (fmulxx * n + fadjxx) >> xx) */ +define fadj(b,n,d) { + auto v; + d = d/gcd(n,d); + v = 2^b*(d-1)/d; + return v; +} + +/* Compute the appropriate mul/adj values as well as a shift count, + which brings the mul value into the range 2^b-1 <= x < 2^b. Such + a shift value will be correct in the signed integer range and off + by at most one in the upper half of the unsigned range. */ +define fmuls(b,n,d) { + auto s, m; + for (s = 0; 1; s++) { + m = fmul(s,n,d); + if (m >= 2^(b-1)) + return s; + } + return 0; +} + +define timeconst(hz) { + print "/* Automatically generated by kernel/timeconst.bc */\n" + print "/* Time conversion constants for HZ == ", hz, " */\n" + print "\n" + + print "#ifndef KERNEL_TIMECONST_H\n" + print "#define KERNEL_TIMECONST_H\n\n" + + print "#include <linux/param.h>\n" + print "#include <linux/types.h>\n\n" + + print "#if HZ != ", hz, "\n" + print "#error \qkernel/timeconst.h has the wrong HZ value!\q\n" + print "#endif\n\n" + + if (hz < 2) { + print "#error Totally bogus HZ value!\n" + } else { + s=fmuls(32,1000,hz) + obase=16 + print "#define HZ_TO_MSEC_MUL32\tU64_C(0x", fmul(s,1000,hz), ")\n" + print "#define HZ_TO_MSEC_ADJ32\tU64_C(0x", fadj(s,1000,hz), ")\n" + obase=10 + print "#define HZ_TO_MSEC_SHR32\t", s, "\n" + + s=fmuls(32,hz,1000) + obase=16 + print "#define MSEC_TO_HZ_MUL32\tU64_C(0x", fmul(s,hz,1000), ")\n" + print "#define MSEC_TO_HZ_ADJ32\tU64_C(0x", fadj(s,hz,1000), ")\n" + obase=10 + print "#define MSEC_TO_HZ_SHR32\t", s, "\n" + + obase=10 + cd=gcd(hz,1000) + print "#define HZ_TO_MSEC_NUM\t\t", 1000/cd, "\n" + print "#define HZ_TO_MSEC_DEN\t\t", hz/cd, "\n" + print "#define MSEC_TO_HZ_NUM\t\t", hz/cd, "\n" + print "#define MSEC_TO_HZ_DEN\t\t", 1000/cd, "\n" + print "\n" + + s=fmuls(32,1000000,hz) + obase=16 + print "#define HZ_TO_USEC_MUL32\tU64_C(0x", fmul(s,1000000,hz), ")\n" + print "#define HZ_TO_USEC_ADJ32\tU64_C(0x", fadj(s,1000000,hz), ")\n" + obase=10 + print "#define HZ_TO_USEC_SHR32\t", s, "\n" + + s=fmuls(32,hz,1000000) + obase=16 + print "#define USEC_TO_HZ_MUL32\tU64_C(0x", fmul(s,hz,1000000), ")\n" + print "#define USEC_TO_HZ_ADJ32\tU64_C(0x", fadj(s,hz,1000000), ")\n" + obase=10 + print "#define USEC_TO_HZ_SHR32\t", s, "\n" + + obase=10 + cd=gcd(hz,1000000) + print "#define HZ_TO_USEC_NUM\t\t", 1000000/cd, "\n" + print "#define HZ_TO_USEC_DEN\t\t", hz/cd, "\n" + print "#define USEC_TO_HZ_NUM\t\t", hz/cd, "\n" + print "#define 
USEC_TO_HZ_DEN\t\t", 1000000/cd, "\n" + print "\n" + + print "#endif /* KERNEL_TIMECONST_H */\n" + } + halt +} + +timeconst(hz) diff --git a/kernel/timeconst.pl b/kernel/timeconst.pl deleted file mode 100644 index 3f42652a6a37..000000000000 --- a/kernel/timeconst.pl +++ /dev/null @@ -1,376 +0,0 @@ -#!/usr/bin/perl -# ----------------------------------------------------------------------- -# -# Copyright 2007-2008 rPath, Inc. - All Rights Reserved -# -# This file is part of the Linux kernel, and is made available under -# the terms of the GNU General Public License version 2 or (at your -# option) any later version; incorporated herein by reference. -# -# ----------------------------------------------------------------------- -# - -# -# Usage: timeconst.pl HZ > timeconst.h -# - -# Precomputed values for systems without Math::BigInt -# Generated by: -# timeconst.pl --can 24 32 48 64 100 122 128 200 250 256 300 512 1000 1024 1200 -%canned_values = ( - 24 => [ - '0xa6aaaaab','0x2aaaaaa',26, - 125,3, - '0xc49ba5e4','0x1fbe76c8b4',37, - 3,125, - '0xa2c2aaab','0xaaaa',16, - 125000,3, - '0xc9539b89','0x7fffbce4217d',47, - 3,125000, - ], 32 => [ - '0xfa000000','0x6000000',27, - 125,4, - '0x83126e98','0xfdf3b645a',36, - 4,125, - '0xf4240000','0x0',17, - 31250,1, - '0x8637bd06','0x3fff79c842fa',46, - 1,31250, - ], 48 => [ - '0xa6aaaaab','0x6aaaaaa',27, - 125,6, - '0xc49ba5e4','0xfdf3b645a',36, - 6,125, - '0xa2c2aaab','0x15555',17, - 62500,3, - '0xc9539b89','0x3fffbce4217d',46, - 3,62500, - ], 64 => [ - '0xfa000000','0xe000000',28, - 125,8, - '0x83126e98','0x7ef9db22d',35, - 8,125, - '0xf4240000','0x0',18, - 15625,1, - '0x8637bd06','0x1fff79c842fa',45, - 1,15625, - ], 100 => [ - '0xa0000000','0x0',28, - 10,1, - '0xcccccccd','0x733333333',35, - 1,10, - '0x9c400000','0x0',18, - 10000,1, - '0xd1b71759','0x1fff2e48e8a7',45, - 1,10000, - ], 122 => [ - '0x8325c53f','0xfbcda3a',28, - 500,61, - '0xf9db22d1','0x7fbe76c8b',35, - 61,500, - '0x8012e2a0','0x3ef36',18, - 500000,61, - '0xffda4053','0x1ffffbce4217',45, - 61,500000, - ], 128 => [ - '0xfa000000','0x1e000000',29, - 125,16, - '0x83126e98','0x3f7ced916',34, - 16,125, - '0xf4240000','0x40000',19, - 15625,2, - '0x8637bd06','0xfffbce4217d',44, - 2,15625, - ], 200 => [ - '0xa0000000','0x0',29, - 5,1, - '0xcccccccd','0x333333333',34, - 1,5, - '0x9c400000','0x0',19, - 5000,1, - '0xd1b71759','0xfff2e48e8a7',44, - 1,5000, - ], 250 => [ - '0x80000000','0x0',29, - 4,1, - '0x80000000','0x180000000',33, - 1,4, - '0xfa000000','0x0',20, - 4000,1, - '0x83126e98','0x7ff7ced9168',43, - 1,4000, - ], 256 => [ - '0xfa000000','0x3e000000',30, - 125,32, - '0x83126e98','0x1fbe76c8b',33, - 32,125, - '0xf4240000','0xc0000',20, - 15625,4, - '0x8637bd06','0x7ffde7210be',43, - 4,15625, - ], 300 => [ - '0xd5555556','0x2aaaaaaa',30, - 10,3, - '0x9999999a','0x1cccccccc',33, - 3,10, - '0xd0555556','0xaaaaa',20, - 10000,3, - '0x9d495183','0x7ffcb923a29',43, - 3,10000, - ], 512 => [ - '0xfa000000','0x7e000000',31, - 125,64, - '0x83126e98','0xfdf3b645',32, - 64,125, - '0xf4240000','0x1c0000',21, - 15625,8, - '0x8637bd06','0x3ffef39085f',42, - 8,15625, - ], 1000 => [ - '0x80000000','0x0',31, - 1,1, - '0x80000000','0x0',31, - 1,1, - '0xfa000000','0x0',22, - 1000,1, - '0x83126e98','0x1ff7ced9168',41, - 1,1000, - ], 1024 => [ - '0xfa000000','0xfe000000',32, - 125,128, - '0x83126e98','0x7ef9db22',31, - 128,125, - '0xf4240000','0x3c0000',22, - 15625,16, - '0x8637bd06','0x1fff79c842f',41, - 16,15625, - ], 1200 => [ - '0xd5555556','0xd5555555',32, - 5,6, - '0x9999999a','0x66666666',31, - 6,5, - 
'0xd0555556','0x2aaaaa',22, - 2500,3, - '0x9d495183','0x1ffcb923a29',41, - 3,2500, - ] -); - -$has_bigint = eval 'use Math::BigInt qw(bgcd); 1;'; - -sub bint($) -{ - my($x) = @_; - return Math::BigInt->new($x); -} - -# -# Constants for division by reciprocal multiplication. -# (bits, numerator, denominator) -# -sub fmul($$$) -{ - my ($b,$n,$d) = @_; - - $n = bint($n); - $d = bint($d); - - return scalar (($n << $b)+$d-bint(1))/$d; -} - -sub fadj($$$) -{ - my($b,$n,$d) = @_; - - $n = bint($n); - $d = bint($d); - - $d = $d/bgcd($n, $d); - return scalar (($d-bint(1)) << $b)/$d; -} - -sub fmuls($$$) { - my($b,$n,$d) = @_; - my($s,$m); - my($thres) = bint(1) << ($b-1); - - $n = bint($n); - $d = bint($d); - - for ($s = 0; 1; $s++) { - $m = fmul($s,$n,$d); - return $s if ($m >= $thres); - } - return 0; -} - -# Generate a hex value if the result fits in 64 bits; -# otherwise skip. -sub bignum_hex($) { - my($x) = @_; - my $s = $x->as_hex(); - - return (length($s) > 18) ? undef : $s; -} - -# Provides mul, adj, and shr factors for a specific -# (bit, time, hz) combination -sub muladj($$$) { - my($b, $t, $hz) = @_; - my $s = fmuls($b, $t, $hz); - my $m = fmul($s, $t, $hz); - my $a = fadj($s, $t, $hz); - return (bignum_hex($m), bignum_hex($a), $s); -} - -# Provides numerator, denominator values -sub numden($$) { - my($n, $d) = @_; - my $g = bgcd($n, $d); - return ($n/$g, $d/$g); -} - -# All values for a specific (time, hz) combo -sub conversions($$) { - my ($t, $hz) = @_; - my @val = (); - - # HZ_TO_xx - push(@val, muladj(32, $t, $hz)); - push(@val, numden($t, $hz)); - - # xx_TO_HZ - push(@val, muladj(32, $hz, $t)); - push(@val, numden($hz, $t)); - - return @val; -} - -sub compute_values($) { - my($hz) = @_; - my @val = (); - my $s, $m, $a, $g; - - if (!$has_bigint) { - die "$0: HZ == $hz not canned and ". 
- "Math::BigInt not available\n"; - } - - # MSEC conversions - push(@val, conversions(1000, $hz)); - - # USEC conversions - push(@val, conversions(1000000, $hz)); - - return @val; -} - -sub outputval($$) -{ - my($name, $val) = @_; - my $csuf; - - if (defined($val)) { - if ($name !~ /SHR/) { - $val = "U64_C($val)"; - } - printf "#define %-23s %s\n", $name.$csuf, $val.$csuf; - } -} - -sub output($@) -{ - my($hz, @val) = @_; - my $pfx, $bit, $suf, $s, $m, $a; - - print "/* Automatically generated by kernel/timeconst.pl */\n"; - print "/* Conversion constants for HZ == $hz */\n"; - print "\n"; - print "#ifndef KERNEL_TIMECONST_H\n"; - print "#define KERNEL_TIMECONST_H\n"; - print "\n"; - - print "#include <linux/param.h>\n"; - print "#include <linux/types.h>\n"; - - print "\n"; - print "#if HZ != $hz\n"; - print "#error \"kernel/timeconst.h has the wrong HZ value!\"\n"; - print "#endif\n"; - print "\n"; - - foreach $pfx ('HZ_TO_MSEC','MSEC_TO_HZ', - 'HZ_TO_USEC','USEC_TO_HZ') { - foreach $bit (32) { - foreach $suf ('MUL', 'ADJ', 'SHR') { - outputval("${pfx}_$suf$bit", shift(@val)); - } - } - foreach $suf ('NUM', 'DEN') { - outputval("${pfx}_$suf", shift(@val)); - } - } - - print "\n"; - print "#endif /* KERNEL_TIMECONST_H */\n"; -} - -# Pretty-print Perl values -sub perlvals(@) { - my $v; - my @l = (); - - foreach $v (@_) { - if (!defined($v)) { - push(@l, 'undef'); - } elsif ($v =~ /^0x/) { - push(@l, "\'".$v."\'"); - } else { - push(@l, $v.''); - } - } - return join(',', @l); -} - -($hz) = @ARGV; - -# Use this to generate the %canned_values structure -if ($hz eq '--can') { - shift(@ARGV); - @hzlist = sort {$a <=> $b} (@ARGV); - - print "# Precomputed values for systems without Math::BigInt\n"; - print "# Generated by:\n"; - print "# timeconst.pl --can ", join(' ', @hzlist), "\n"; - print "\%canned_values = (\n"; - my $pf = "\t"; - foreach $hz (@hzlist) { - my @values = compute_values($hz); - print "$pf$hz => [\n"; - while (scalar(@values)) { - my $bit; - foreach $bit (32) { - my $m = shift(@values); - my $a = shift(@values); - my $s = shift(@values); - print "\t\t", perlvals($m,$a,$s), ",\n"; - } - my $n = shift(@values); - my $d = shift(@values); - print "\t\t", perlvals($n,$d), ",\n"; - } - print "\t]"; - $pf = ', '; - } - print "\n);\n"; -} else { - $hz += 0; # Force to number - if ($hz < 1) { - die "Usage: $0 HZ\n"; - } - - $cv = $canned_values{$hz}; - @val = defined($cv) ? @$cv : compute_values($hz); - output($hz, @val); -} -exit 0; diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 36567564e221..192473b22799 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -81,21 +81,6 @@ config EVENT_TRACING select CONTEXT_SWITCH_TRACER bool -config EVENT_POWER_TRACING_DEPRECATED - depends on EVENT_TRACING - bool "Deprecated power event trace API, to be removed" - default y - help - Provides old power event types: - C-state/idle accounting events: - power:power_start - power:power_end - and old cpufreq accounting event: - power:power_frequency - This is for userspace compatibility - and will vanish after 5 kernel iterations, - namely 3.1. 
- config CONTEXT_SWITCH_TRACER bool diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index ce8c3d68292f..98ca94a41819 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -3996,37 +3996,51 @@ static void ftrace_init_module(struct module *mod, ftrace_process_locs(mod, start, end); } -static int ftrace_module_notify(struct notifier_block *self, - unsigned long val, void *data) +static int ftrace_module_notify_enter(struct notifier_block *self, + unsigned long val, void *data) { struct module *mod = data; - switch (val) { - case MODULE_STATE_COMING: + if (val == MODULE_STATE_COMING) ftrace_init_module(mod, mod->ftrace_callsites, mod->ftrace_callsites + mod->num_ftrace_callsites); - break; - case MODULE_STATE_GOING: + return 0; +} + +static int ftrace_module_notify_exit(struct notifier_block *self, + unsigned long val, void *data) +{ + struct module *mod = data; + + if (val == MODULE_STATE_GOING) ftrace_release_mod(mod); - break; - } return 0; } #else -static int ftrace_module_notify(struct notifier_block *self, - unsigned long val, void *data) +static int ftrace_module_notify_enter(struct notifier_block *self, + unsigned long val, void *data) +{ + return 0; +} +static int ftrace_module_notify_exit(struct notifier_block *self, + unsigned long val, void *data) { return 0; } #endif /* CONFIG_MODULES */ -struct notifier_block ftrace_module_nb = { - .notifier_call = ftrace_module_notify, +struct notifier_block ftrace_module_enter_nb = { + .notifier_call = ftrace_module_notify_enter, .priority = INT_MAX, /* Run before anything that can use kprobes */ }; +struct notifier_block ftrace_module_exit_nb = { + .notifier_call = ftrace_module_notify_exit, + .priority = INT_MIN, /* Run after anything that can remove kprobes */ +}; + extern unsigned long __start_mcount_loc[]; extern unsigned long __stop_mcount_loc[]; @@ -4058,9 +4072,13 @@ void __init ftrace_init(void) __start_mcount_loc, __stop_mcount_loc); - ret = register_module_notifier(&ftrace_module_nb); + ret = register_module_notifier(&ftrace_module_enter_nb); + if (ret) + pr_warning("Failed to register trace ftrace module enter notifier\n"); + + ret = register_module_notifier(&ftrace_module_exit_nb); if (ret) - pr_warning("Failed to register trace ftrace module notifier\n"); + pr_warning("Failed to register trace ftrace module exit notifier\n"); set_ftrace_early_filters(); diff --git a/kernel/trace/power-traces.c b/kernel/trace/power-traces.c index f55fcf61b223..1c71382b283d 100644 --- a/kernel/trace/power-traces.c +++ b/kernel/trace/power-traces.c @@ -13,8 +13,5 @@ #define CREATE_TRACE_POINTS #include <trace/events/power.h> -#ifdef EVENT_POWER_TRACING_DEPRECATED -EXPORT_TRACEPOINT_SYMBOL_GPL(power_start); -#endif EXPORT_TRACEPOINT_SYMBOL_GPL(cpu_idle); diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 5329e13e74a1..7a809e321058 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -1,5 +1,6 @@ #include <trace/syscall.h> #include <trace/events/syscalls.h> +#include <linux/syscalls.h> #include <linux/slab.h> #include <linux/kernel.h> #include <linux/module.h> /* for MODULE_NAME_LEN via KSYM_SYMBOL_LEN */ @@ -47,6 +48,38 @@ static inline bool arch_syscall_match_sym_name(const char *sym, const char *name } #endif +#ifdef ARCH_TRACE_IGNORE_COMPAT_SYSCALLS +/* + * Some architectures that allow for 32bit applications + * to run on a 64bit kernel, do not map the syscalls for + * the 32bit tasks the same as they do for 64bit tasks. 
+ * + * *cough*x86*cough* + * + * In such a case, instead of reporting the wrong syscalls, + * simply ignore them. + * + * For an arch to ignore the compat syscalls it needs to + * define ARCH_TRACE_IGNORE_COMPAT_SYSCALLS as well as + * define the function arch_trace_is_compat_syscall() to let + * the tracing system know that it should ignore it. + */ +static int +trace_get_syscall_nr(struct task_struct *task, struct pt_regs *regs) +{ + if (unlikely(arch_trace_is_compat_syscall(regs))) + return -1; + + return syscall_get_nr(task, regs); +} +#else +static inline int +trace_get_syscall_nr(struct task_struct *task, struct pt_regs *regs) +{ + return syscall_get_nr(task, regs); +} +#endif /* ARCH_TRACE_IGNORE_COMPAT_SYSCALLS */ + static __init struct syscall_metadata * find_syscall_meta(unsigned long syscall) { @@ -276,10 +309,10 @@ static void ftrace_syscall_enter(void *ignore, struct pt_regs *regs, long id) struct syscall_metadata *sys_data; struct ring_buffer_event *event; struct ring_buffer *buffer; - int size; int syscall_nr; + int size; - syscall_nr = syscall_get_nr(current, regs); + syscall_nr = trace_get_syscall_nr(current, regs); if (syscall_nr < 0) return; if (!test_bit(syscall_nr, enabled_enter_syscalls)) @@ -313,7 +346,7 @@ static void ftrace_syscall_exit(void *ignore, struct pt_regs *regs, long ret) struct ring_buffer *buffer; int syscall_nr; - syscall_nr = syscall_get_nr(current, regs); + syscall_nr = trace_get_syscall_nr(current, regs); if (syscall_nr < 0) return; if (!test_bit(syscall_nr, enabled_exit_syscalls)) @@ -502,7 +535,7 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) int rctx; int size; - syscall_nr = syscall_get_nr(current, regs); + syscall_nr = trace_get_syscall_nr(current, regs); if (syscall_nr < 0) return; if (!test_bit(syscall_nr, enabled_perf_enter_syscalls)) @@ -578,7 +611,7 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret) int rctx; int size; - syscall_nr = syscall_get_nr(current, regs); + syscall_nr = trace_get_syscall_nr(current, regs); if (syscall_nr < 0) return; if (!test_bit(syscall_nr, enabled_perf_exit_syscalls)) diff --git a/kernel/user.c b/kernel/user.c index 33acb5e53a5f..57ebfd42023c 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -47,9 +47,7 @@ struct user_namespace init_user_ns = { .count = 4294967295U, }, }, - .kref = { - .refcount = ATOMIC_INIT(3), - }, + .count = ATOMIC_INIT(3), .owner = GLOBAL_ROOT_UID, .group = GLOBAL_ROOT_GID, .proc_inum = PROC_USER_INIT_INO, diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 2b042c42fbc4..8b650837083e 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -78,7 +78,7 @@ int create_user_ns(struct cred *new) return ret; } - kref_init(&ns->kref); + atomic_set(&ns->count, 1); /* Leave the new->user_ns reference with the new user namespace. 
diff --git a/kernel/user.c b/kernel/user.c
index 33acb5e53a5f..57ebfd42023c 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -47,9 +47,7 @@ struct user_namespace init_user_ns = {
 			.count = 4294967295U,
 		},
 	},
-	.kref = {
-		.refcount	= ATOMIC_INIT(3),
-	},
+	.count = ATOMIC_INIT(3),
 	.owner = GLOBAL_ROOT_UID,
 	.group = GLOBAL_ROOT_GID,
 	.proc_inum = PROC_USER_INIT_INO,
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 2b042c42fbc4..8b650837083e 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -78,7 +78,7 @@ int create_user_ns(struct cred *new)
 		return ret;
 	}
 
-	kref_init(&ns->kref);
+	atomic_set(&ns->count, 1);
 	/* Leave the new->user_ns reference with the new user namespace. */
 	ns->parent = parent_ns;
 	ns->owner = owner;
@@ -104,15 +104,16 @@ int unshare_userns(unsigned long unshare_flags, struct cred **new_cred)
 	return create_user_ns(cred);
 }
 
-void free_user_ns(struct kref *kref)
+void free_user_ns(struct user_namespace *ns)
 {
-	struct user_namespace *parent, *ns =
-		container_of(kref, struct user_namespace, kref);
+	struct user_namespace *parent;
 
-	parent = ns->parent;
-	proc_free_inum(ns->proc_inum);
-	kmem_cache_free(user_ns_cachep, ns);
-	put_user_ns(parent);
+	do {
+		parent = ns->parent;
+		proc_free_inum(ns->proc_inum);
+		kmem_cache_free(user_ns_cachep, ns);
+		ns = parent;
+	} while (atomic_dec_and_test(&parent->count));
 }
 EXPORT_SYMBOL(free_user_ns);
 
@@ -519,6 +520,42 @@ struct seq_operations proc_projid_seq_operations = {
 	.show = projid_m_show,
 };
 
+static bool mappings_overlap(struct uid_gid_map *new_map, struct uid_gid_extent *extent)
+{
+	u32 upper_first, lower_first, upper_last, lower_last;
+	unsigned idx;
+
+	upper_first = extent->first;
+	lower_first = extent->lower_first;
+	upper_last = upper_first + extent->count - 1;
+	lower_last = lower_first + extent->count - 1;
+
+	for (idx = 0; idx < new_map->nr_extents; idx++) {
+		u32 prev_upper_first, prev_lower_first;
+		u32 prev_upper_last, prev_lower_last;
+		struct uid_gid_extent *prev;
+
+		prev = &new_map->extent[idx];
+
+		prev_upper_first = prev->first;
+		prev_lower_first = prev->lower_first;
+		prev_upper_last = prev_upper_first + prev->count - 1;
+		prev_lower_last = prev_lower_first + prev->count - 1;
+
+		/* Does the upper range intersect a previous extent? */
+		if ((prev_upper_first <= upper_last) &&
+		    (prev_upper_last >= upper_first))
+			return true;
+
+		/* Does the lower range intersect a previous extent? */
+		if ((prev_lower_first <= lower_last) &&
+		    (prev_lower_last >= lower_first))
+			return true;
+	}
+	return false;
+}
+
+
 static DEFINE_MUTEX(id_map_mutex);
 
 static ssize_t map_write(struct file *file, const char __user *buf,
@@ -531,7 +568,7 @@ static ssize_t map_write(struct file *file, const char __user *buf,
 	struct user_namespace *ns = seq->private;
 	struct uid_gid_map new_map;
 	unsigned idx;
-	struct uid_gid_extent *extent, *last = NULL;
+	struct uid_gid_extent *extent = NULL;
 	unsigned long page = 0;
 	char *kbuf, *pos, *next_line;
 	ssize_t ret = -EINVAL;
@@ -634,14 +671,11 @@ static ssize_t map_write(struct file *file, const char __user *buf,
 		if ((extent->lower_first + extent->count) <= extent->lower_first)
 			goto out;
 
-		/* For now only accept extents that are strictly in order */
-		if (last &&
-		    (((last->first + last->count) > extent->first) ||
-		     ((last->lower_first + last->count) > extent->lower_first)))
+		/* Do the ranges in extent overlap any previous extents? */
+		if (mappings_overlap(&new_map, extent))
 			goto out;
 
 		new_map.nr_extents++;
-		last = extent;
 
 		/* Fail if the file contains too many extents */
 		if ((new_map.nr_extents == UID_GID_MAP_MAX_EXTENTS) &&
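mappings_overlap() is a plain closed-interval intersection test, run once per already-accepted extent, for both the upper (namespace-visible) and lower (parent) ID ranges: [first, last] intersects [prev_first, prev_last] iff prev_first <= last && prev_last >= first. Because map_write() has already rejected extents whose first + count wraps, last = first + count - 1 cannot overflow. A standalone sketch of the predicate with a few spot checks (illustrative only, not kernel code):

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

/* Same closed-interval test as mappings_overlap(), for one range pair. */
static bool ranges_overlap(uint32_t first, uint32_t count,
			   uint32_t prev_first, uint32_t prev_count)
{
	uint32_t last = first + count - 1;
	uint32_t prev_last = prev_first + prev_count - 1;

	return prev_first <= last && prev_last >= first;
}

int main(void)
{
	assert(ranges_overlap(0, 10, 5, 10));		/* 0-9 vs 5-14: overlap  */
	assert(!ranges_overlap(0, 10, 10, 5));		/* 0-9 vs 10-14: disjoint */
	assert(ranges_overlap(1000, 1, 0, 4294967295U));	/* inside 0-4294967294  */
	return 0;
}

Replacing the "strictly in order" rule with this pairwise scan lets extents be written in any order, at the cost of an O(n^2) pass that is bounded by UID_GID_MAP_MAX_EXTENTS and therefore cheap.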
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 27689422aa92..4a944676358e 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -113,9 +113,9 @@ static int get_softlockup_thresh(void)
  * resolution, and we don't need to waste time with a big divide when
  * 2^30ns == 1.074s.
  */
-static unsigned long get_timestamp(int this_cpu)
+static unsigned long get_timestamp(void)
 {
-	return cpu_clock(this_cpu) >> 30LL;  /* 2^30 ~= 10^9 */
+	return local_clock() >> 30LL;  /* 2^30 ~= 10^9 */
 }
 
 static void set_sample_period(void)
@@ -133,9 +133,7 @@ static void set_sample_period(void)
 /* Commands for resetting the watchdog */
 static void __touch_watchdog(void)
 {
-	int this_cpu = smp_processor_id();
-
-	__this_cpu_write(watchdog_touch_ts, get_timestamp(this_cpu));
+	__this_cpu_write(watchdog_touch_ts, get_timestamp());
 }
 
 void touch_softlockup_watchdog(void)
@@ -196,7 +194,7 @@ static int is_hardlockup(void)
 
 static int is_softlockup(unsigned long touch_ts)
 {
-	unsigned long now = get_timestamp(smp_processor_id());
+	unsigned long now = get_timestamp();
 
 	/* Warn about unreasonable delays: */
 	if (time_after(now, touch_ts + get_softlockup_thresh()))
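Both call sites passed smp_processor_id() only so cpu_clock() could select the local clock; local_clock() reads the calling CPU's clock directly and handles preemption internally, so the argument (and one smp_processor_id() call per touch) goes away. The >> 30 keeps the "seconds, approximately" contract: shifting nanoseconds right by 30 divides by 2^30 ~= 1.074e9 instead of 10^9, about 7% coarse, but free of a 64-bit division on the hot path. A standalone sketch of the arithmetic (illustrative, assumes nothing from the kernel):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t ns = 60ULL * 1000000000ULL;	/* 60 seconds in nanoseconds */

	/* exact: 60; shifted: 55 units of ~1.074 s each (~59.1 s) */
	printf("exact=%llu shifted=%llu\n",
	       (unsigned long long)(ns / 1000000000ULL),
	       (unsigned long long)(ns >> 30));
	return 0;
}

The error only stretches the effective softlockup threshold by that same ~7%, which is harmless for a "warn about unreasonable delays" heuristic.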