diff options
Diffstat (limited to 'kernel')
60 files changed, 3750 insertions, 3366 deletions
| diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks index 44511d100eaa..d2b32ac27a39 100644 --- a/kernel/Kconfig.locks +++ b/kernel/Kconfig.locks @@ -138,7 +138,7 @@ config INLINE_SPIN_UNLOCK_BH  config INLINE_SPIN_UNLOCK_IRQ  	def_bool y -	depends on !PREEMPT || ARCH_INLINE_SPIN_UNLOCK_BH +	depends on !PREEMPT || ARCH_INLINE_SPIN_UNLOCK_IRQ  config INLINE_SPIN_UNLOCK_IRQRESTORE  	def_bool y @@ -175,7 +175,7 @@ config INLINE_READ_UNLOCK_BH  config INLINE_READ_UNLOCK_IRQ  	def_bool y -	depends on !PREEMPT || ARCH_INLINE_READ_UNLOCK_BH +	depends on !PREEMPT || ARCH_INLINE_READ_UNLOCK_IRQ  config INLINE_READ_UNLOCK_IRQRESTORE  	def_bool y @@ -212,7 +212,7 @@ config INLINE_WRITE_UNLOCK_BH  config INLINE_WRITE_UNLOCK_IRQ  	def_bool y -	depends on !PREEMPT || ARCH_INLINE_WRITE_UNLOCK_BH +	depends on !PREEMPT || ARCH_INLINE_WRITE_UNLOCK_IRQ  config INLINE_WRITE_UNLOCK_IRQRESTORE  	def_bool y diff --git a/kernel/cgroup.c b/kernel/cgroup.c index a7c9e6ddb979..e5583d10a325 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -63,9 +63,6 @@  #include <linux/atomic.h> -/* css deactivation bias, makes css->refcnt negative to deny new trygets */ -#define CSS_DEACT_BIAS		INT_MIN -  /*   * cgroup_mutex is the master lock.  Any modification to cgroup or its   * hierarchy must be performed while holding it. @@ -99,16 +96,19 @@ static DEFINE_MUTEX(cgroup_root_mutex);   */  #define SUBSYS(_x) [_x ## _subsys_id] = &_x ## _subsys,  #define IS_SUBSYS_ENABLED(option) IS_BUILTIN(option) -static struct cgroup_subsys *subsys[CGROUP_SUBSYS_COUNT] = { +static struct cgroup_subsys *cgroup_subsys[CGROUP_SUBSYS_COUNT] = {  #include <linux/cgroup_subsys.h>  };  /* - * The "rootnode" hierarchy is the "dummy hierarchy", reserved for the - * subsystems that are otherwise unattached - it never has more than a - * single cgroup, and all tasks are part of that cgroup. 
+ * The dummy hierarchy, reserved for the subsystems that are otherwise + * unattached - it never has more than a single cgroup, and all tasks are + * part of that cgroup.   */ -static struct cgroupfs_root rootnode; +static struct cgroupfs_root cgroup_dummy_root; + +/* dummy_top is a shorthand for the dummy hierarchy's top cgroup */ +static struct cgroup * const cgroup_dummy_top = &cgroup_dummy_root.top_cgroup;  /*   * cgroupfs file entry, pointed to from leaf dentry->d_fsdata. @@ -186,18 +186,28 @@ struct cgroup_event {  /* The list of hierarchy roots */ -static LIST_HEAD(roots); -static int root_count; +static LIST_HEAD(cgroup_roots); +static int cgroup_root_count; -static DEFINE_IDA(hierarchy_ida); -static int next_hierarchy_id; -static DEFINE_SPINLOCK(hierarchy_id_lock); - -/* dummytop is a shorthand for the dummy hierarchy's top cgroup */ -#define dummytop (&rootnode.top_cgroup) +/* + * Hierarchy ID allocation and mapping.  It follows the same exclusion + * rules as other root ops - both cgroup_mutex and cgroup_root_mutex for + * writes, either for reads. + */ +static DEFINE_IDR(cgroup_hierarchy_idr);  static struct cgroup_name root_cgroup_name = { .name = "/" }; +/* + * Assign a monotonically increasing serial number to cgroups.  It + * guarantees cgroups with bigger numbers are newer than those with smaller + * numbers.  Also, as cgroups are always appended to the parent's + * ->children list, it guarantees that sibling cgroups are always sorted in + * the ascending serial number order on the list.  Protected by + * cgroup_mutex. + */ +static u64 cgroup_serial_nr_next = 1; +  /* This flag indicates whether tasks in the fork and exit paths should   * check for fork/exit handlers to call. 
This avoids us having to do   * extra work in the fork/exit path if none of the subsystems need to @@ -205,27 +215,15 @@ static struct cgroup_name root_cgroup_name = { .name = "/" };   */  static int need_forkexit_callback __read_mostly; +static void cgroup_offline_fn(struct work_struct *work);  static int cgroup_destroy_locked(struct cgroup *cgrp);  static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys,  			      struct cftype cfts[], bool is_add); -static int css_unbias_refcnt(int refcnt) -{ -	return refcnt >= 0 ? refcnt : refcnt - CSS_DEACT_BIAS; -} - -/* the current nr of refs, always >= 0 whether @css is deactivated or not */ -static int css_refcnt(struct cgroup_subsys_state *css) -{ -	int v = atomic_read(&css->refcnt); - -	return css_unbias_refcnt(v); -} -  /* convenient tests for these bits */ -inline int cgroup_is_removed(const struct cgroup *cgrp) +static inline bool cgroup_is_dead(const struct cgroup *cgrp)  { -	return test_bit(CGRP_REMOVED, &cgrp->flags); +	return test_bit(CGRP_DEAD, &cgrp->flags);  }  /** @@ -261,16 +259,38 @@ static int notify_on_release(const struct cgroup *cgrp)  	return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);  } -/* - * for_each_subsys() allows you to iterate on each subsystem attached to - * an active hierarchy +/** + * for_each_subsys - iterate all loaded cgroup subsystems + * @ss: the iteration cursor + * @i: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end + * + * Should be called under cgroup_mutex.   
*/ -#define for_each_subsys(_root, _ss) \ -list_for_each_entry(_ss, &_root->subsys_list, sibling) +#define for_each_subsys(ss, i)						\ +	for ((i) = 0; (i) < CGROUP_SUBSYS_COUNT; (i)++)			\ +		if (({ lockdep_assert_held(&cgroup_mutex);		\ +		       !((ss) = cgroup_subsys[i]); })) { }		\ +		else + +/** + * for_each_builtin_subsys - iterate all built-in cgroup subsystems + * @ss: the iteration cursor + * @i: the index of @ss, CGROUP_BUILTIN_SUBSYS_COUNT after reaching the end + * + * Bulit-in subsystems are always present and iteration itself doesn't + * require any synchronization. + */ +#define for_each_builtin_subsys(ss, i)					\ +	for ((i) = 0; (i) < CGROUP_BUILTIN_SUBSYS_COUNT &&		\ +	     (((ss) = cgroup_subsys[i]) || true); (i)++) + +/* iterate each subsystem attached to a hierarchy */ +#define for_each_root_subsys(root, ss)					\ +	list_for_each_entry((ss), &(root)->subsys_list, sibling) -/* for_each_active_root() allows you to iterate across the active hierarchies */ -#define for_each_active_root(_root) \ -list_for_each_entry(_root, &roots, root_list) +/* iterate across the active hierarchies */ +#define for_each_active_root(root)					\ +	list_for_each_entry((root), &cgroup_roots, root_list)  static inline struct cgroup *__d_cgrp(struct dentry *dentry)  { @@ -297,7 +317,7 @@ static inline struct cftype *__d_cft(struct dentry *dentry)  static bool cgroup_lock_live_group(struct cgroup *cgrp)  {  	mutex_lock(&cgroup_mutex); -	if (cgroup_is_removed(cgrp)) { +	if (cgroup_is_dead(cgrp)) {  		mutex_unlock(&cgroup_mutex);  		return false;  	} @@ -312,20 +332,24 @@ static void cgroup_release_agent(struct work_struct *work);  static DECLARE_WORK(release_agent_work, cgroup_release_agent);  static void check_for_release(struct cgroup *cgrp); -/* Link structure for associating css_set objects with cgroups */ -struct cg_cgroup_link { -	/* -	 * List running through cg_cgroup_links associated with a -	 * cgroup, anchored on cgroup->css_sets -	 */ -	struct list_head 
cgrp_link_list; -	struct cgroup *cgrp; -	/* -	 * List running through cg_cgroup_links pointing at a -	 * single css_set object, anchored on css_set->cg_links -	 */ -	struct list_head cg_link_list; -	struct css_set *cg; +/* + * A cgroup can be associated with multiple css_sets as different tasks may + * belong to different cgroups on different hierarchies.  In the other + * direction, a css_set is naturally associated with multiple cgroups. + * This M:N relationship is represented by the following link structure + * which exists for each association and allows traversing the associations + * from both sides. + */ +struct cgrp_cset_link { +	/* the cgroup and css_set this link associates */ +	struct cgroup		*cgrp; +	struct css_set		*cset; + +	/* list of cgrp_cset_links anchored at cgrp->cset_links */ +	struct list_head	cset_link; + +	/* list of cgrp_cset_links anchored at css_set->cgrp_links */ +	struct list_head	cgrp_link;  };  /* The default css_set - used by init and its children prior to any @@ -336,7 +360,7 @@ struct cg_cgroup_link {   */  static struct css_set init_css_set; -static struct cg_cgroup_link init_css_set_link; +static struct cgrp_cset_link init_cgrp_cset_link;  static int cgroup_init_idr(struct cgroup_subsys *ss,  			   struct cgroup_subsys_state *css); @@ -357,10 +381,11 @@ static DEFINE_HASHTABLE(css_set_table, CSS_SET_HASH_BITS);  static unsigned long css_set_hash(struct cgroup_subsys_state *css[])  { -	int i;  	unsigned long key = 0UL; +	struct cgroup_subsys *ss; +	int i; -	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) +	for_each_subsys(ss, i)  		key += (unsigned long)css[i];  	key = (key >> 16) ^ key; @@ -373,90 +398,83 @@ static unsigned long css_set_hash(struct cgroup_subsys_state *css[])   * compiled into their kernel but not actually in use */  static int use_task_css_set_links __read_mostly; -static void __put_css_set(struct css_set *cg, int taskexit) +static void __put_css_set(struct css_set *cset, int taskexit)  { -	struct cg_cgroup_link 
*link; -	struct cg_cgroup_link *saved_link; +	struct cgrp_cset_link *link, *tmp_link; +  	/*  	 * Ensure that the refcount doesn't hit zero while any readers  	 * can see it. Similar to atomic_dec_and_lock(), but for an  	 * rwlock  	 */ -	if (atomic_add_unless(&cg->refcount, -1, 1)) +	if (atomic_add_unless(&cset->refcount, -1, 1))  		return;  	write_lock(&css_set_lock); -	if (!atomic_dec_and_test(&cg->refcount)) { +	if (!atomic_dec_and_test(&cset->refcount)) {  		write_unlock(&css_set_lock);  		return;  	}  	/* This css_set is dead. unlink it and release cgroup refcounts */ -	hash_del(&cg->hlist); +	hash_del(&cset->hlist);  	css_set_count--; -	list_for_each_entry_safe(link, saved_link, &cg->cg_links, -				 cg_link_list) { +	list_for_each_entry_safe(link, tmp_link, &cset->cgrp_links, cgrp_link) {  		struct cgroup *cgrp = link->cgrp; -		list_del(&link->cg_link_list); -		list_del(&link->cgrp_link_list); -		/* -		 * We may not be holding cgroup_mutex, and if cgrp->count is -		 * dropped to 0 the cgroup can be destroyed at any time, hence -		 * rcu_read_lock is used to keep it alive. 
-		 */ -		rcu_read_lock(); -		if (atomic_dec_and_test(&cgrp->count) && -		    notify_on_release(cgrp)) { +		list_del(&link->cset_link); +		list_del(&link->cgrp_link); + +		/* @cgrp can't go away while we're holding css_set_lock */ +		if (list_empty(&cgrp->cset_links) && notify_on_release(cgrp)) {  			if (taskexit)  				set_bit(CGRP_RELEASABLE, &cgrp->flags);  			check_for_release(cgrp);  		} -		rcu_read_unlock();  		kfree(link);  	}  	write_unlock(&css_set_lock); -	kfree_rcu(cg, rcu_head); +	kfree_rcu(cset, rcu_head);  }  /*   * refcounted get/put for css_set objects   */ -static inline void get_css_set(struct css_set *cg) +static inline void get_css_set(struct css_set *cset)  { -	atomic_inc(&cg->refcount); +	atomic_inc(&cset->refcount);  } -static inline void put_css_set(struct css_set *cg) +static inline void put_css_set(struct css_set *cset)  { -	__put_css_set(cg, 0); +	__put_css_set(cset, 0);  } -static inline void put_css_set_taskexit(struct css_set *cg) +static inline void put_css_set_taskexit(struct css_set *cset)  { -	__put_css_set(cg, 1); +	__put_css_set(cset, 1);  } -/* +/**   * compare_css_sets - helper function for find_existing_css_set(). - * @cg: candidate css_set being tested - * @old_cg: existing css_set for a task + * @cset: candidate css_set being tested + * @old_cset: existing css_set for a task   * @new_cgrp: cgroup that's being entered by the task   * @template: desired set of css pointers in css_set (pre-calculated)   *   * Returns true if "cg" matches "old_cg" except for the hierarchy   * which "new_cgrp" belongs to, for which it should match "new_cgrp".   
*/ -static bool compare_css_sets(struct css_set *cg, -			     struct css_set *old_cg, +static bool compare_css_sets(struct css_set *cset, +			     struct css_set *old_cset,  			     struct cgroup *new_cgrp,  			     struct cgroup_subsys_state *template[])  {  	struct list_head *l1, *l2; -	if (memcmp(template, cg->subsys, sizeof(cg->subsys))) { +	if (memcmp(template, cset->subsys, sizeof(cset->subsys))) {  		/* Not all subsystems matched */  		return false;  	} @@ -470,28 +488,28 @@ static bool compare_css_sets(struct css_set *cg,  	 * candidates.  	 */ -	l1 = &cg->cg_links; -	l2 = &old_cg->cg_links; +	l1 = &cset->cgrp_links; +	l2 = &old_cset->cgrp_links;  	while (1) { -		struct cg_cgroup_link *cgl1, *cgl2; -		struct cgroup *cg1, *cg2; +		struct cgrp_cset_link *link1, *link2; +		struct cgroup *cgrp1, *cgrp2;  		l1 = l1->next;  		l2 = l2->next;  		/* See if we reached the end - both lists are equal length. */ -		if (l1 == &cg->cg_links) { -			BUG_ON(l2 != &old_cg->cg_links); +		if (l1 == &cset->cgrp_links) { +			BUG_ON(l2 != &old_cset->cgrp_links);  			break;  		} else { -			BUG_ON(l2 == &old_cg->cg_links); +			BUG_ON(l2 == &old_cset->cgrp_links);  		}  		/* Locate the cgroups associated with these links. */ -		cgl1 = list_entry(l1, struct cg_cgroup_link, cg_link_list); -		cgl2 = list_entry(l2, struct cg_cgroup_link, cg_link_list); -		cg1 = cgl1->cgrp; -		cg2 = cgl2->cgrp; +		link1 = list_entry(l1, struct cgrp_cset_link, cgrp_link); +		link2 = list_entry(l2, struct cgrp_cset_link, cgrp_link); +		cgrp1 = link1->cgrp; +		cgrp2 = link2->cgrp;  		/* Hierarchies should be linked in the same order. */ -		BUG_ON(cg1->root != cg2->root); +		BUG_ON(cgrp1->root != cgrp2->root);  		/*  		 * If this hierarchy is the hierarchy of the cgroup @@ -500,46 +518,39 @@ static bool compare_css_sets(struct css_set *cg,  		 * hierarchy, then this css_set should point to the  		 * same cgroup as the old css_set.  		 
*/ -		if (cg1->root == new_cgrp->root) { -			if (cg1 != new_cgrp) +		if (cgrp1->root == new_cgrp->root) { +			if (cgrp1 != new_cgrp)  				return false;  		} else { -			if (cg1 != cg2) +			if (cgrp1 != cgrp2)  				return false;  		}  	}  	return true;  } -/* - * find_existing_css_set() is a helper for - * find_css_set(), and checks to see whether an existing - * css_set is suitable. - * - * oldcg: the cgroup group that we're using before the cgroup - * transition - * - * cgrp: the cgroup that we're moving into - * - * template: location in which to build the desired set of subsystem - * state objects for the new cgroup group +/** + * find_existing_css_set - init css array and find the matching css_set + * @old_cset: the css_set that we're using before the cgroup transition + * @cgrp: the cgroup that we're moving into + * @template: out param for the new set of csses, should be clear on entry   */ -static struct css_set *find_existing_css_set( -	struct css_set *oldcg, -	struct cgroup *cgrp, -	struct cgroup_subsys_state *template[]) +static struct css_set *find_existing_css_set(struct css_set *old_cset, +					struct cgroup *cgrp, +					struct cgroup_subsys_state *template[])  { -	int i;  	struct cgroupfs_root *root = cgrp->root; -	struct css_set *cg; +	struct cgroup_subsys *ss; +	struct css_set *cset;  	unsigned long key; +	int i;  	/*  	 * Build the set of subsystem state objects that we want to see in the  	 * new css_set. while subsystems can change globally, the entries here  	 * won't change, so no need for locking.  	 */ -	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { +	for_each_subsys(ss, i) {  		if (root->subsys_mask & (1UL << i)) {  			/* Subsystem is in this hierarchy. 
So we want  			 * the subsystem state from the new @@ -548,148 +559,152 @@ static struct css_set *find_existing_css_set(  		} else {  			/* Subsystem is not in this hierarchy, so we  			 * don't want to change the subsystem state */ -			template[i] = oldcg->subsys[i]; +			template[i] = old_cset->subsys[i];  		}  	}  	key = css_set_hash(template); -	hash_for_each_possible(css_set_table, cg, hlist, key) { -		if (!compare_css_sets(cg, oldcg, cgrp, template)) +	hash_for_each_possible(css_set_table, cset, hlist, key) { +		if (!compare_css_sets(cset, old_cset, cgrp, template))  			continue;  		/* This css_set matches what we need */ -		return cg; +		return cset;  	}  	/* No existing cgroup group matched */  	return NULL;  } -static void free_cg_links(struct list_head *tmp) +static void free_cgrp_cset_links(struct list_head *links_to_free)  { -	struct cg_cgroup_link *link; -	struct cg_cgroup_link *saved_link; +	struct cgrp_cset_link *link, *tmp_link; -	list_for_each_entry_safe(link, saved_link, tmp, cgrp_link_list) { -		list_del(&link->cgrp_link_list); +	list_for_each_entry_safe(link, tmp_link, links_to_free, cset_link) { +		list_del(&link->cset_link);  		kfree(link);  	}  } -/* - * allocate_cg_links() allocates "count" cg_cgroup_link structures - * and chains them on tmp through their cgrp_link_list fields. Returns 0 on - * success or a negative error +/** + * allocate_cgrp_cset_links - allocate cgrp_cset_links + * @count: the number of links to allocate + * @tmp_links: list_head the allocated links are put on + * + * Allocate @count cgrp_cset_link structures and chain them on @tmp_links + * through ->cset_link.  Returns 0 on success or -errno.   
*/ -static int allocate_cg_links(int count, struct list_head *tmp) +static int allocate_cgrp_cset_links(int count, struct list_head *tmp_links)  { -	struct cg_cgroup_link *link; +	struct cgrp_cset_link *link;  	int i; -	INIT_LIST_HEAD(tmp); + +	INIT_LIST_HEAD(tmp_links); +  	for (i = 0; i < count; i++) { -		link = kmalloc(sizeof(*link), GFP_KERNEL); +		link = kzalloc(sizeof(*link), GFP_KERNEL);  		if (!link) { -			free_cg_links(tmp); +			free_cgrp_cset_links(tmp_links);  			return -ENOMEM;  		} -		list_add(&link->cgrp_link_list, tmp); +		list_add(&link->cset_link, tmp_links);  	}  	return 0;  }  /**   * link_css_set - a helper function to link a css_set to a cgroup - * @tmp_cg_links: cg_cgroup_link objects allocated by allocate_cg_links() - * @cg: the css_set to be linked + * @tmp_links: cgrp_cset_link objects allocated by allocate_cgrp_cset_links() + * @cset: the css_set to be linked   * @cgrp: the destination cgroup   */ -static void link_css_set(struct list_head *tmp_cg_links, -			 struct css_set *cg, struct cgroup *cgrp) +static void link_css_set(struct list_head *tmp_links, struct css_set *cset, +			 struct cgroup *cgrp)  { -	struct cg_cgroup_link *link; +	struct cgrp_cset_link *link; -	BUG_ON(list_empty(tmp_cg_links)); -	link = list_first_entry(tmp_cg_links, struct cg_cgroup_link, -				cgrp_link_list); -	link->cg = cg; +	BUG_ON(list_empty(tmp_links)); +	link = list_first_entry(tmp_links, struct cgrp_cset_link, cset_link); +	link->cset = cset;  	link->cgrp = cgrp; -	atomic_inc(&cgrp->count); -	list_move(&link->cgrp_link_list, &cgrp->css_sets); +	list_move(&link->cset_link, &cgrp->cset_links);  	/*  	 * Always add links to the tail of the list so that the list  	 * is sorted by order of hierarchy creation  	 */ -	list_add_tail(&link->cg_link_list, &cg->cg_links); +	list_add_tail(&link->cgrp_link, &cset->cgrp_links);  } -/* - * find_css_set() takes an existing cgroup group and a - * cgroup object, and returns a css_set object that's - * equivalent to the old 
group, but with the given cgroup - * substituted into the appropriate hierarchy. Must be called with - * cgroup_mutex held +/** + * find_css_set - return a new css_set with one cgroup updated + * @old_cset: the baseline css_set + * @cgrp: the cgroup to be updated + * + * Return a new css_set that's equivalent to @old_cset, but with @cgrp + * substituted into the appropriate hierarchy.   */ -static struct css_set *find_css_set( -	struct css_set *oldcg, struct cgroup *cgrp) +static struct css_set *find_css_set(struct css_set *old_cset, +				    struct cgroup *cgrp)  { -	struct css_set *res; -	struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT]; - -	struct list_head tmp_cg_links; - -	struct cg_cgroup_link *link; +	struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT] = { }; +	struct css_set *cset; +	struct list_head tmp_links; +	struct cgrp_cset_link *link;  	unsigned long key; +	lockdep_assert_held(&cgroup_mutex); +  	/* First see if we already have a cgroup group that matches  	 * the desired set */  	read_lock(&css_set_lock); -	res = find_existing_css_set(oldcg, cgrp, template); -	if (res) -		get_css_set(res); +	cset = find_existing_css_set(old_cset, cgrp, template); +	if (cset) +		get_css_set(cset);  	read_unlock(&css_set_lock); -	if (res) -		return res; +	if (cset) +		return cset; -	res = kmalloc(sizeof(*res), GFP_KERNEL); -	if (!res) +	cset = kzalloc(sizeof(*cset), GFP_KERNEL); +	if (!cset)  		return NULL; -	/* Allocate all the cg_cgroup_link objects that we'll need */ -	if (allocate_cg_links(root_count, &tmp_cg_links) < 0) { -		kfree(res); +	/* Allocate all the cgrp_cset_link objects that we'll need */ +	if (allocate_cgrp_cset_links(cgroup_root_count, &tmp_links) < 0) { +		kfree(cset);  		return NULL;  	} -	atomic_set(&res->refcount, 1); -	INIT_LIST_HEAD(&res->cg_links); -	INIT_LIST_HEAD(&res->tasks); -	INIT_HLIST_NODE(&res->hlist); +	atomic_set(&cset->refcount, 1); +	INIT_LIST_HEAD(&cset->cgrp_links); +	INIT_LIST_HEAD(&cset->tasks); +	
INIT_HLIST_NODE(&cset->hlist);  	/* Copy the set of subsystem state objects generated in  	 * find_existing_css_set() */ -	memcpy(res->subsys, template, sizeof(res->subsys)); +	memcpy(cset->subsys, template, sizeof(cset->subsys));  	write_lock(&css_set_lock);  	/* Add reference counts and links from the new css_set. */ -	list_for_each_entry(link, &oldcg->cg_links, cg_link_list) { +	list_for_each_entry(link, &old_cset->cgrp_links, cgrp_link) {  		struct cgroup *c = link->cgrp; +  		if (c->root == cgrp->root)  			c = cgrp; -		link_css_set(&tmp_cg_links, res, c); +		link_css_set(&tmp_links, cset, c);  	} -	BUG_ON(!list_empty(&tmp_cg_links)); +	BUG_ON(!list_empty(&tmp_links));  	css_set_count++;  	/* Add this cgroup group to the hash table */ -	key = css_set_hash(res->subsys); -	hash_add(css_set_table, &res->hlist, key); +	key = css_set_hash(cset->subsys); +	hash_add(css_set_table, &cset->hlist, key);  	write_unlock(&css_set_lock); -	return res; +	return cset;  }  /* @@ -699,7 +714,7 @@ static struct css_set *find_css_set(  static struct cgroup *task_cgroup_from_root(struct task_struct *task,  					    struct cgroupfs_root *root)  { -	struct css_set *css; +	struct css_set *cset;  	struct cgroup *res = NULL;  	BUG_ON(!mutex_is_locked(&cgroup_mutex)); @@ -709,13 +724,15 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task,  	 * task can't change groups, so the only thing that can happen  	 * is that it exits and its css is set back to init_css_set.  	 
*/ -	css = task->cgroups; -	if (css == &init_css_set) { +	cset = task_css_set(task); +	if (cset == &init_css_set) {  		res = &root->top_cgroup;  	} else { -		struct cg_cgroup_link *link; -		list_for_each_entry(link, &css->cg_links, cg_link_list) { +		struct cgrp_cset_link *link; + +		list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {  			struct cgroup *c = link->cgrp; +  			if (c->root == root) {  				res = c;  				break; @@ -828,14 +845,14 @@ static struct cgroup_name *cgroup_alloc_name(struct dentry *dentry)  static void cgroup_free_fn(struct work_struct *work)  { -	struct cgroup *cgrp = container_of(work, struct cgroup, free_work); +	struct cgroup *cgrp = container_of(work, struct cgroup, destroy_work);  	struct cgroup_subsys *ss;  	mutex_lock(&cgroup_mutex);  	/*  	 * Release the subsystem state objects.  	 */ -	for_each_subsys(cgrp->root, ss) +	for_each_root_subsys(cgrp->root, ss)  		ss->css_free(cgrp);  	cgrp->root->number_of_cgroups--; @@ -873,7 +890,8 @@ static void cgroup_free_rcu(struct rcu_head *head)  {  	struct cgroup *cgrp = container_of(head, struct cgroup, rcu_head); -	schedule_work(&cgrp->free_work); +	INIT_WORK(&cgrp->destroy_work, cgroup_free_fn); +	schedule_work(&cgrp->destroy_work);  }  static void cgroup_diput(struct dentry *dentry, struct inode *inode) @@ -882,7 +900,7 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)  	if (S_ISDIR(inode->i_mode)) {  		struct cgroup *cgrp = dentry->d_fsdata; -		BUG_ON(!(cgroup_is_removed(cgrp))); +		BUG_ON(!(cgroup_is_dead(cgrp)));  		call_rcu(&cgrp->rcu_head, cgroup_free_rcu);  	} else {  		struct cfent *cfe = __d_cfe(dentry); @@ -950,7 +968,7 @@ static void cgroup_clear_directory(struct dentry *dir, bool base_files,  	struct cgroup *cgrp = __d_cgrp(dir);  	struct cgroup_subsys *ss; -	for_each_subsys(cgrp->root, ss) { +	for_each_root_subsys(cgrp->root, ss) {  		struct cftype_set *set;  		if (!test_bit(ss->subsys_id, &subsys_mask))  			continue; @@ -988,30 +1006,23 @@ static void 
cgroup_d_remove_dir(struct dentry *dentry)   * returns an error, no reference counts are touched.   */  static int rebind_subsystems(struct cgroupfs_root *root, -			      unsigned long final_subsys_mask) +			     unsigned long added_mask, unsigned removed_mask)  { -	unsigned long added_mask, removed_mask;  	struct cgroup *cgrp = &root->top_cgroup; +	struct cgroup_subsys *ss;  	int i;  	BUG_ON(!mutex_is_locked(&cgroup_mutex));  	BUG_ON(!mutex_is_locked(&cgroup_root_mutex)); -	removed_mask = root->actual_subsys_mask & ~final_subsys_mask; -	added_mask = final_subsys_mask & ~root->actual_subsys_mask;  	/* Check that any added subsystems are currently free */ -	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { +	for_each_subsys(ss, i) {  		unsigned long bit = 1UL << i; -		struct cgroup_subsys *ss = subsys[i]; +  		if (!(bit & added_mask))  			continue; -		/* -		 * Nobody should tell us to do a subsys that doesn't exist: -		 * parse_cgroupfs_options should catch that case and refcounts -		 * ensure that subsystems won't disappear once selected. 
-		 */ -		BUG_ON(ss == NULL); -		if (ss->root != &rootnode) { + +		if (ss->root != &cgroup_dummy_root) {  			/* Subsystem isn't free */  			return -EBUSY;  		} @@ -1025,38 +1036,41 @@ static int rebind_subsystems(struct cgroupfs_root *root,  		return -EBUSY;  	/* Process each subsystem */ -	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { -		struct cgroup_subsys *ss = subsys[i]; +	for_each_subsys(ss, i) {  		unsigned long bit = 1UL << i; +  		if (bit & added_mask) {  			/* We're binding this subsystem to this hierarchy */ -			BUG_ON(ss == NULL);  			BUG_ON(cgrp->subsys[i]); -			BUG_ON(!dummytop->subsys[i]); -			BUG_ON(dummytop->subsys[i]->cgroup != dummytop); -			cgrp->subsys[i] = dummytop->subsys[i]; +			BUG_ON(!cgroup_dummy_top->subsys[i]); +			BUG_ON(cgroup_dummy_top->subsys[i]->cgroup != cgroup_dummy_top); + +			cgrp->subsys[i] = cgroup_dummy_top->subsys[i];  			cgrp->subsys[i]->cgroup = cgrp;  			list_move(&ss->sibling, &root->subsys_list);  			ss->root = root;  			if (ss->bind)  				ss->bind(cgrp); +  			/* refcount was already taken, and we're keeping it */ +			root->subsys_mask |= bit;  		} else if (bit & removed_mask) {  			/* We're removing this subsystem */ -			BUG_ON(ss == NULL); -			BUG_ON(cgrp->subsys[i] != dummytop->subsys[i]); +			BUG_ON(cgrp->subsys[i] != cgroup_dummy_top->subsys[i]);  			BUG_ON(cgrp->subsys[i]->cgroup != cgrp); +  			if (ss->bind) -				ss->bind(dummytop); -			dummytop->subsys[i]->cgroup = dummytop; +				ss->bind(cgroup_dummy_top); +			cgroup_dummy_top->subsys[i]->cgroup = cgroup_dummy_top;  			cgrp->subsys[i] = NULL; -			subsys[i]->root = &rootnode; -			list_move(&ss->sibling, &rootnode.subsys_list); +			cgroup_subsys[i]->root = &cgroup_dummy_root; +			list_move(&ss->sibling, &cgroup_dummy_root.subsys_list); +  			/* subsystem is now free - drop reference on module */  			module_put(ss->module); -		} else if (bit & final_subsys_mask) { +			root->subsys_mask &= ~bit; +		} else if (bit & root->subsys_mask) {  			/* Subsystem state should 
already exist */ -			BUG_ON(ss == NULL);  			BUG_ON(!cgrp->subsys[i]);  			/*  			 * a refcount was taken, but we already had one, so @@ -1071,7 +1085,12 @@ static int rebind_subsystems(struct cgroupfs_root *root,  			BUG_ON(cgrp->subsys[i]);  		}  	} -	root->subsys_mask = root->actual_subsys_mask = final_subsys_mask; + +	/* +	 * Mark @root has finished binding subsystems.  @root->subsys_mask +	 * now matches the bound subsystems. +	 */ +	root->flags |= CGRP_ROOT_SUBSYS_BOUND;  	return 0;  } @@ -1082,7 +1101,7 @@ static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry)  	struct cgroup_subsys *ss;  	mutex_lock(&cgroup_root_mutex); -	for_each_subsys(root, ss) +	for_each_root_subsys(root, ss)  		seq_printf(seq, ",%s", ss->name);  	if (root->flags & CGRP_ROOT_SANE_BEHAVIOR)  		seq_puts(seq, ",sane_behavior"); @@ -1114,18 +1133,19 @@ struct cgroup_sb_opts {  };  /* - * Convert a hierarchy specifier into a bitmask of subsystems and flags. Call - * with cgroup_mutex held to protect the subsys[] array. This function takes - * refcounts on subsystems to be used, unless it returns error, in which case - * no refcounts are taken. + * Convert a hierarchy specifier into a bitmask of subsystems and + * flags. Call with cgroup_mutex held to protect the cgroup_subsys[] + * array. This function takes refcounts on subsystems to be used, unless it + * returns error, in which case no refcounts are taken.   
*/  static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)  {  	char *token, *o = data;  	bool all_ss = false, one_ss = false;  	unsigned long mask = (unsigned long)-1; -	int i;  	bool module_pin_failed = false; +	struct cgroup_subsys *ss; +	int i;  	BUG_ON(!mutex_is_locked(&cgroup_mutex)); @@ -1202,10 +1222,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)  			continue;  		} -		for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { -			struct cgroup_subsys *ss = subsys[i]; -			if (ss == NULL) -				continue; +		for_each_subsys(ss, i) {  			if (strcmp(token, ss->name))  				continue;  			if (ss->disabled) @@ -1228,16 +1245,10 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)  	 * otherwise if 'none', 'name=' and a subsystem name options  	 * were not specified, let's default to 'all'  	 */ -	if (all_ss || (!one_ss && !opts->none && !opts->name)) { -		for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { -			struct cgroup_subsys *ss = subsys[i]; -			if (ss == NULL) -				continue; -			if (ss->disabled) -				continue; -			set_bit(i, &opts->subsys_mask); -		} -	} +	if (all_ss || (!one_ss && !opts->none && !opts->name)) +		for_each_subsys(ss, i) +			if (!ss->disabled) +				set_bit(i, &opts->subsys_mask);  	/* Consistency checks */ @@ -1281,12 +1292,10 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)  	 * take duplicate reference counts on a subsystem that's already used,  	 * but rebind_subsystems handles this case.  	 
*/ -	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { -		unsigned long bit = 1UL << i; - -		if (!(bit & opts->subsys_mask)) +	for_each_subsys(ss, i) { +		if (!(opts->subsys_mask & (1UL << i)))  			continue; -		if (!try_module_get(subsys[i]->module)) { +		if (!try_module_get(cgroup_subsys[i]->module)) {  			module_pin_failed = true;  			break;  		} @@ -1303,7 +1312,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)  			if (!(bit & opts->subsys_mask))  				continue; -			module_put(subsys[i]->module); +			module_put(cgroup_subsys[i]->module);  		}  		return -ENOENT;  	} @@ -1313,14 +1322,14 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)  static void drop_parsed_module_refcounts(unsigned long subsys_mask)  { +	struct cgroup_subsys *ss;  	int i; -	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { -		unsigned long bit = 1UL << i; -		if (!(bit & subsys_mask)) -			continue; -		module_put(subsys[i]->module); -	} +	mutex_lock(&cgroup_mutex); +	for_each_subsys(ss, i) +		if (subsys_mask & (1UL << i)) +			module_put(cgroup_subsys[i]->module); +	mutex_unlock(&cgroup_mutex);  }  static int cgroup_remount(struct super_block *sb, int *flags, char *data) @@ -1345,7 +1354,7 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)  	if (ret)  		goto out_unlock; -	if (opts.subsys_mask != root->actual_subsys_mask || opts.release_agent) +	if (opts.subsys_mask != root->subsys_mask || opts.release_agent)  		pr_warning("cgroup: option changes via remount are deprecated (pid=%d comm=%s)\n",  			   task_tgid_nr(current), current->comm); @@ -1353,10 +1362,12 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)  	removed_mask = root->subsys_mask & ~opts.subsys_mask;  	/* Don't allow flags or name to change at remount */ -	if (opts.flags != root->flags || +	if (((opts.flags ^ root->flags) & CGRP_ROOT_OPTION_MASK) ||  	    (opts.name && strcmp(opts.name, root->name))) { +		pr_err("cgroup: option or name 
mismatch, new: 0x%lx \"%s\", old: 0x%lx \"%s\"\n", +		       opts.flags & CGRP_ROOT_OPTION_MASK, opts.name ?: "", +		       root->flags & CGRP_ROOT_OPTION_MASK, root->name);  		ret = -EINVAL; -		drop_parsed_module_refcounts(opts.subsys_mask);  		goto out_unlock;  	} @@ -1367,11 +1378,10 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)  	 */  	cgroup_clear_directory(cgrp->dentry, false, removed_mask); -	ret = rebind_subsystems(root, opts.subsys_mask); +	ret = rebind_subsystems(root, added_mask, removed_mask);  	if (ret) {  		/* rebind_subsystems failed, re-populate the removed files */  		cgroup_populate_dir(cgrp, false, removed_mask); -		drop_parsed_module_refcounts(opts.subsys_mask);  		goto out_unlock;  	} @@ -1386,6 +1396,8 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)  	mutex_unlock(&cgroup_root_mutex);  	mutex_unlock(&cgroup_mutex);  	mutex_unlock(&cgrp->dentry->d_inode->i_mutex); +	if (ret) +		drop_parsed_module_refcounts(opts.subsys_mask);  	return ret;  } @@ -1401,11 +1413,9 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)  	INIT_LIST_HEAD(&cgrp->sibling);  	INIT_LIST_HEAD(&cgrp->children);  	INIT_LIST_HEAD(&cgrp->files); -	INIT_LIST_HEAD(&cgrp->css_sets); -	INIT_LIST_HEAD(&cgrp->allcg_node); +	INIT_LIST_HEAD(&cgrp->cset_links);  	INIT_LIST_HEAD(&cgrp->release_list);  	INIT_LIST_HEAD(&cgrp->pidlists); -	INIT_WORK(&cgrp->free_work, cgroup_free_fn);  	mutex_init(&cgrp->pidlist_mutex);  	INIT_LIST_HEAD(&cgrp->event_list);  	spin_lock_init(&cgrp->event_list_lock); @@ -1418,37 +1428,37 @@ static void init_cgroup_root(struct cgroupfs_root *root)  	INIT_LIST_HEAD(&root->subsys_list);  	INIT_LIST_HEAD(&root->root_list); -	INIT_LIST_HEAD(&root->allcg_list);  	root->number_of_cgroups = 1;  	cgrp->root = root; -	cgrp->name = &root_cgroup_name; +	RCU_INIT_POINTER(cgrp->name, &root_cgroup_name);  	init_cgroup_housekeeping(cgrp); -	list_add_tail(&cgrp->allcg_node, &root->allcg_list);  } -static bool 
init_root_id(struct cgroupfs_root *root) +static int cgroup_init_root_id(struct cgroupfs_root *root, int start, int end)  { -	int ret = 0; +	int id; -	do { -		if (!ida_pre_get(&hierarchy_ida, GFP_KERNEL)) -			return false; -		spin_lock(&hierarchy_id_lock); -		/* Try to allocate the next unused ID */ -		ret = ida_get_new_above(&hierarchy_ida, next_hierarchy_id, -					&root->hierarchy_id); -		if (ret == -ENOSPC) -			/* Try again starting from 0 */ -			ret = ida_get_new(&hierarchy_ida, &root->hierarchy_id); -		if (!ret) { -			next_hierarchy_id = root->hierarchy_id + 1; -		} else if (ret != -EAGAIN) { -			/* Can only get here if the 31-bit IDR is full ... */ -			BUG_ON(ret); -		} -		spin_unlock(&hierarchy_id_lock); -	} while (ret); -	return true; +	lockdep_assert_held(&cgroup_mutex); +	lockdep_assert_held(&cgroup_root_mutex); + +	id = idr_alloc_cyclic(&cgroup_hierarchy_idr, root, start, end, +			      GFP_KERNEL); +	if (id < 0) +		return id; + +	root->hierarchy_id = id; +	return 0; +} + +static void cgroup_exit_root_id(struct cgroupfs_root *root) +{ +	lockdep_assert_held(&cgroup_mutex); +	lockdep_assert_held(&cgroup_root_mutex); + +	if (root->hierarchy_id) { +		idr_remove(&cgroup_hierarchy_idr, root->hierarchy_id); +		root->hierarchy_id = 0; +	}  }  static int cgroup_test_super(struct super_block *sb, void *data) @@ -1482,12 +1492,16 @@ static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts)  	if (!root)  		return ERR_PTR(-ENOMEM); -	if (!init_root_id(root)) { -		kfree(root); -		return ERR_PTR(-ENOMEM); -	}  	init_cgroup_root(root); +	/* +	 * We need to set @root->subsys_mask now so that @root can be +	 * matched by cgroup_test_super() before it finishes +	 * initialization; otherwise, competing mounts with the same +	 * options may try to bind the same subsystems instead of waiting +	 * for the first one leading to unexpected mount errors. +	 * SUBSYS_BOUND will be set once actual binding is complete. 
+	 */  	root->subsys_mask = opts->subsys_mask;  	root->flags = opts->flags;  	ida_init(&root->cgroup_ida); @@ -1500,17 +1514,15 @@ static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts)  	return root;  } -static void cgroup_drop_root(struct cgroupfs_root *root) +static void cgroup_free_root(struct cgroupfs_root *root)  { -	if (!root) -		return; +	if (root) { +		/* hierarhcy ID shoulid already have been released */ +		WARN_ON_ONCE(root->hierarchy_id); -	BUG_ON(!root->hierarchy_id); -	spin_lock(&hierarchy_id_lock); -	ida_remove(&hierarchy_ida, root->hierarchy_id); -	spin_unlock(&hierarchy_id_lock); -	ida_destroy(&root->cgroup_ida); -	kfree(root); +		ida_destroy(&root->cgroup_ida); +		kfree(root); +	}  }  static int cgroup_set_super(struct super_block *sb, void *data) @@ -1597,7 +1609,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,  	sb = sget(fs_type, cgroup_test_super, cgroup_set_super, 0, &opts);  	if (IS_ERR(sb)) {  		ret = PTR_ERR(sb); -		cgroup_drop_root(opts.new_root); +		cgroup_free_root(opts.new_root);  		goto drop_modules;  	} @@ -1605,12 +1617,12 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,  	BUG_ON(!root);  	if (root == opts.new_root) {  		/* We used the new root structure, so this is a new hierarchy */ -		struct list_head tmp_cg_links; +		struct list_head tmp_links;  		struct cgroup *root_cgrp = &root->top_cgroup;  		struct cgroupfs_root *existing_root;  		const struct cred *cred;  		int i; -		struct css_set *cg; +		struct css_set *cset;  		BUG_ON(sb->s_root != NULL); @@ -1637,13 +1649,18 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,  		 * that's us. 
The worst that can happen is that we  		 * have some link structures left over  		 */ -		ret = allocate_cg_links(css_set_count, &tmp_cg_links); +		ret = allocate_cgrp_cset_links(css_set_count, &tmp_links);  		if (ret)  			goto unlock_drop; -		ret = rebind_subsystems(root, root->subsys_mask); +		/* ID 0 is reserved for dummy root, 1 for unified hierarchy */ +		ret = cgroup_init_root_id(root, 2, 0); +		if (ret) +			goto unlock_drop; + +		ret = rebind_subsystems(root, root->subsys_mask, 0);  		if (ret == -EBUSY) { -			free_cg_links(&tmp_cg_links); +			free_cgrp_cset_links(&tmp_links);  			goto unlock_drop;  		}  		/* @@ -1655,8 +1672,8 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,  		/* EBUSY should be the only error here */  		BUG_ON(ret); -		list_add(&root->root_list, &roots); -		root_count++; +		list_add(&root->root_list, &cgroup_roots); +		cgroup_root_count++;  		sb->s_root->d_fsdata = root_cgrp;  		root->top_cgroup.dentry = sb->s_root; @@ -1664,11 +1681,11 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,  		/* Link the top cgroup in this hierarchy into all  		 * the css_set objects */  		write_lock(&css_set_lock); -		hash_for_each(css_set_table, i, cg, hlist) -			link_css_set(&tmp_cg_links, cg, root_cgrp); +		hash_for_each(css_set_table, i, cset, hlist) +			link_css_set(&tmp_links, cset, root_cgrp);  		write_unlock(&css_set_lock); -		free_cg_links(&tmp_cg_links); +		free_cgrp_cset_links(&tmp_links);  		BUG_ON(!list_empty(&root_cgrp->children));  		BUG_ON(root->number_of_cgroups != 1); @@ -1684,9 +1701,9 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,  		 * We re-used an existing hierarchy - the new root (if  		 * any) is not needed  		 */ -		cgroup_drop_root(opts.new_root); +		cgroup_free_root(opts.new_root); -		if (root->flags != opts.flags) { +		if ((root->flags ^ opts.flags) & CGRP_ROOT_OPTION_MASK) {  			if ((root->flags | opts.flags) & CGRP_ROOT_SANE_BEHAVIOR) {  				pr_err("cgroup: 
sane_behavior: new mount options should match the existing superblock\n");  				ret = -EINVAL; @@ -1705,6 +1722,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,  	return dget(sb->s_root);   unlock_drop: +	cgroup_exit_root_id(root);  	mutex_unlock(&cgroup_root_mutex);  	mutex_unlock(&cgroup_mutex);  	mutex_unlock(&inode->i_mutex); @@ -1721,9 +1739,8 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,  static void cgroup_kill_sb(struct super_block *sb) {  	struct cgroupfs_root *root = sb->s_fs_info;  	struct cgroup *cgrp = &root->top_cgroup; +	struct cgrp_cset_link *link, *tmp_link;  	int ret; -	struct cg_cgroup_link *link; -	struct cg_cgroup_link *saved_link;  	BUG_ON(!root); @@ -1734,36 +1751,39 @@ static void cgroup_kill_sb(struct super_block *sb) {  	mutex_lock(&cgroup_root_mutex);  	/* Rebind all subsystems back to the default hierarchy */ -	ret = rebind_subsystems(root, 0); -	/* Shouldn't be able to fail ... */ -	BUG_ON(ret); +	if (root->flags & CGRP_ROOT_SUBSYS_BOUND) { +		ret = rebind_subsystems(root, 0, root->subsys_mask); +		/* Shouldn't be able to fail ... 
*/ +		BUG_ON(ret); +	}  	/* -	 * Release all the links from css_sets to this hierarchy's +	 * Release all the links from cset_links to this hierarchy's  	 * root cgroup  	 */  	write_lock(&css_set_lock); -	list_for_each_entry_safe(link, saved_link, &cgrp->css_sets, -				 cgrp_link_list) { -		list_del(&link->cg_link_list); -		list_del(&link->cgrp_link_list); +	list_for_each_entry_safe(link, tmp_link, &cgrp->cset_links, cset_link) { +		list_del(&link->cset_link); +		list_del(&link->cgrp_link);  		kfree(link);  	}  	write_unlock(&css_set_lock);  	if (!list_empty(&root->root_list)) {  		list_del(&root->root_list); -		root_count--; +		cgroup_root_count--;  	} +	cgroup_exit_root_id(root); +  	mutex_unlock(&cgroup_root_mutex);  	mutex_unlock(&cgroup_mutex);  	simple_xattrs_free(&cgrp->xattrs);  	kill_litter_super(sb); -	cgroup_drop_root(root); +	cgroup_free_root(root);  }  static struct file_system_type cgroup_fs_type = { @@ -1825,6 +1845,38 @@ out:  }  EXPORT_SYMBOL_GPL(cgroup_path); +/** + * task_cgroup_path_from_hierarchy - cgroup path of a task on a hierarchy + * @task: target task + * @hierarchy_id: the hierarchy to look up @task's cgroup from + * @buf: the buffer to write the path into + * @buflen: the length of the buffer + * + * Determine @task's cgroup on the hierarchy specified by @hierarchy_id and + * copy its path into @buf.  This function grabs cgroup_mutex and shouldn't + * be used inside locks used by cgroup controller callbacks. 
+ */ +int task_cgroup_path_from_hierarchy(struct task_struct *task, int hierarchy_id, +				    char *buf, size_t buflen) +{ +	struct cgroupfs_root *root; +	struct cgroup *cgrp = NULL; +	int ret = -ENOENT; + +	mutex_lock(&cgroup_mutex); + +	root = idr_find(&cgroup_hierarchy_idr, hierarchy_id); +	if (root) { +		cgrp = task_cgroup_from_root(task, root); +		ret = cgroup_path(cgrp, buf, buflen); +	} + +	mutex_unlock(&cgroup_mutex); + +	return ret; +} +EXPORT_SYMBOL_GPL(task_cgroup_path_from_hierarchy); +  /*   * Control Group taskset   */ @@ -1910,10 +1962,11 @@ EXPORT_SYMBOL_GPL(cgroup_taskset_size);   *   * Must be called with cgroup_mutex and threadgroup locked.   */ -static void cgroup_task_migrate(struct cgroup *oldcgrp, -				struct task_struct *tsk, struct css_set *newcg) +static void cgroup_task_migrate(struct cgroup *old_cgrp, +				struct task_struct *tsk, +				struct css_set *new_cset)  { -	struct css_set *oldcg; +	struct css_set *old_cset;  	/*  	 * We are synchronized through threadgroup_lock() against PF_EXITING @@ -1921,25 +1974,25 @@ static void cgroup_task_migrate(struct cgroup *oldcgrp,  	 * css_set to init_css_set and dropping the old one.  	 */  	WARN_ON_ONCE(tsk->flags & PF_EXITING); -	oldcg = tsk->cgroups; +	old_cset = task_css_set(tsk);  	task_lock(tsk); -	rcu_assign_pointer(tsk->cgroups, newcg); +	rcu_assign_pointer(tsk->cgroups, new_cset);  	task_unlock(tsk);  	/* Update the css_set linked lists if we're using them */  	write_lock(&css_set_lock);  	if (!list_empty(&tsk->cg_list)) -		list_move(&tsk->cg_list, &newcg->tasks); +		list_move(&tsk->cg_list, &new_cset->tasks);  	write_unlock(&css_set_lock);  	/* -	 * We just gained a reference on oldcg by taking it from the task. As -	 * trading it for newcg is protected by cgroup_mutex, we're safe to drop -	 * it here; it will be freed under RCU. +	 * We just gained a reference on old_cset by taking it from the +	 * task. 
As trading it for new_cset is protected by cgroup_mutex, +	 * we're safe to drop it here; it will be freed under RCU.  	 */ -	set_bit(CGRP_RELEASABLE, &oldcgrp->flags); -	put_css_set(oldcg); +	set_bit(CGRP_RELEASABLE, &old_cgrp->flags); +	put_css_set(old_cset);  }  /** @@ -2029,7 +2082,7 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk,  	/*  	 * step 1: check that we can legitimately attach to the cgroup.  	 */ -	for_each_subsys(root, ss) { +	for_each_root_subsys(root, ss) {  		if (ss->can_attach) {  			retval = ss->can_attach(cgrp, &tset);  			if (retval) { @@ -2044,8 +2097,11 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk,  	 * we use find_css_set, which allocates a new one if necessary.  	 */  	for (i = 0; i < group_size; i++) { +		struct css_set *old_cset; +  		tc = flex_array_get(group, i); -		tc->cg = find_css_set(tc->task->cgroups, cgrp); +		old_cset = task_css_set(tc->task); +		tc->cg = find_css_set(old_cset, cgrp);  		if (!tc->cg) {  			retval = -ENOMEM;  			goto out_put_css_set_refs; @@ -2066,7 +2122,7 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk,  	/*  	 * step 4: do subsystem attach callbacks.  	 
*/ -	for_each_subsys(root, ss) { +	for_each_root_subsys(root, ss) {  		if (ss->attach)  			ss->attach(cgrp, &tset);  	} @@ -2086,7 +2142,7 @@ out_put_css_set_refs:  	}  out_cancel_attach:  	if (retval) { -		for_each_subsys(root, ss) { +		for_each_root_subsys(root, ss) {  			if (ss == failed_ss)  				break;  			if (ss->cancel_attach) @@ -2323,7 +2379,7 @@ static ssize_t cgroup_file_write(struct file *file, const char __user *buf,  	struct cftype *cft = __d_cft(file->f_dentry);  	struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); -	if (cgroup_is_removed(cgrp)) +	if (cgroup_is_dead(cgrp))  		return -ENODEV;  	if (cft->write)  		return cft->write(cgrp, cft, file, buf, nbytes, ppos); @@ -2368,7 +2424,7 @@ static ssize_t cgroup_file_read(struct file *file, char __user *buf,  	struct cftype *cft = __d_cft(file->f_dentry);  	struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); -	if (cgroup_is_removed(cgrp)) +	if (cgroup_is_dead(cgrp))  		return -ENODEV;  	if (cft->read) @@ -2435,10 +2491,12 @@ static int cgroup_file_open(struct inode *inode, struct file *file)  	cft = __d_cft(file->f_dentry);  	if (cft->read_map || cft->read_seq_string) { -		struct cgroup_seqfile_state *state = -			kzalloc(sizeof(*state), GFP_USER); +		struct cgroup_seqfile_state *state; + +		state = kzalloc(sizeof(*state), GFP_USER);  		if (!state)  			return -ENOMEM; +  		state->cft = cft;  		state->cgroup = __d_cgrp(file->f_dentry->d_parent);  		file->f_op = &cgroup_seqfile_operations; @@ -2486,6 +2544,13 @@ static int cgroup_rename(struct inode *old_dir, struct dentry *old_dentry,  	cgrp = __d_cgrp(old_dentry); +	/* +	 * This isn't a proper migration and its usefulness is very +	 * limited.  Disallow if sane_behavior. 
+	 */ +	if (cgroup_sane_behavior(cgrp)) +		return -EPERM; +  	name = cgroup_alloc_name(new_dentry);  	if (!name)  		return -ENOMEM; @@ -2496,7 +2561,7 @@ static int cgroup_rename(struct inode *old_dir, struct dentry *old_dentry,  		return ret;  	} -	old_name = cgrp->name; +	old_name = rcu_dereference_protected(cgrp->name, true);  	rcu_assign_pointer(cgrp->name, name);  	kfree_rcu(old_name, rcu_head); @@ -2747,58 +2812,78 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys,  	return ret;  } -static DEFINE_MUTEX(cgroup_cft_mutex); -  static void cgroup_cfts_prepare(void) -	__acquires(&cgroup_cft_mutex) __acquires(&cgroup_mutex) +	__acquires(&cgroup_mutex)  {  	/*  	 * Thanks to the entanglement with vfs inode locking, we can't walk  	 * the existing cgroups under cgroup_mutex and create files. -	 * Instead, we increment reference on all cgroups and build list of -	 * them using @cgrp->cft_q_node.  Grab cgroup_cft_mutex to ensure -	 * exclusive access to the field. +	 * Instead, we use cgroup_for_each_descendant_pre() and drop RCU +	 * read lock before calling cgroup_addrm_files().  	 
*/ -	mutex_lock(&cgroup_cft_mutex);  	mutex_lock(&cgroup_mutex);  }  static void cgroup_cfts_commit(struct cgroup_subsys *ss,  			       struct cftype *cfts, bool is_add) -	__releases(&cgroup_mutex) __releases(&cgroup_cft_mutex) +	__releases(&cgroup_mutex)  {  	LIST_HEAD(pending); -	struct cgroup *cgrp, *n; +	struct cgroup *cgrp, *root = &ss->root->top_cgroup; +	struct super_block *sb = ss->root->sb; +	struct dentry *prev = NULL; +	struct inode *inode; +	u64 update_before;  	/* %NULL @cfts indicates abort and don't bother if @ss isn't attached */ -	if (cfts && ss->root != &rootnode) { -		list_for_each_entry(cgrp, &ss->root->allcg_list, allcg_node) { -			dget(cgrp->dentry); -			list_add_tail(&cgrp->cft_q_node, &pending); -		} +	if (!cfts || ss->root == &cgroup_dummy_root || +	    !atomic_inc_not_zero(&sb->s_active)) { +		mutex_unlock(&cgroup_mutex); +		return;  	} -	mutex_unlock(&cgroup_mutex); -  	/* -	 * All new cgroups will see @cfts update on @ss->cftsets.  Add/rm -	 * files for all cgroups which were created before. +	 * All cgroups which are created after we drop cgroup_mutex will +	 * have the updated set of files, so we only need to update the +	 * cgroups created before the current @cgroup_serial_nr_next.  	 
*/ -	list_for_each_entry_safe(cgrp, n, &pending, cft_q_node) { -		struct inode *inode = cgrp->dentry->d_inode; +	update_before = cgroup_serial_nr_next; + +	mutex_unlock(&cgroup_mutex); + +	/* @root always needs to be updated */ +	inode = root->dentry->d_inode; +	mutex_lock(&inode->i_mutex); +	mutex_lock(&cgroup_mutex); +	cgroup_addrm_files(root, ss, cfts, is_add); +	mutex_unlock(&cgroup_mutex); +	mutex_unlock(&inode->i_mutex); + +	/* add/rm files for all cgroups created before */ +	rcu_read_lock(); +	cgroup_for_each_descendant_pre(cgrp, root) { +		if (cgroup_is_dead(cgrp)) +			continue; + +		inode = cgrp->dentry->d_inode; +		dget(cgrp->dentry); +		rcu_read_unlock(); + +		dput(prev); +		prev = cgrp->dentry;  		mutex_lock(&inode->i_mutex);  		mutex_lock(&cgroup_mutex); -		if (!cgroup_is_removed(cgrp)) +		if (cgrp->serial_nr < update_before && !cgroup_is_dead(cgrp))  			cgroup_addrm_files(cgrp, ss, cfts, is_add);  		mutex_unlock(&cgroup_mutex);  		mutex_unlock(&inode->i_mutex); -		list_del_init(&cgrp->cft_q_node); -		dput(cgrp->dentry); +		rcu_read_lock();  	} - -	mutex_unlock(&cgroup_cft_mutex); +	rcu_read_unlock(); +	dput(prev); +	deactivate_super(sb);  }  /** @@ -2853,7 +2938,8 @@ int cgroup_rm_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)  	list_for_each_entry(set, &ss->cftsets, node) {  		if (set->cfts == cfts) { -			list_del_init(&set->node); +			list_del(&set->node); +			kfree(set);  			cgroup_cfts_commit(ss, cfts, false);  			return 0;  		} @@ -2872,12 +2958,11 @@ int cgroup_rm_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)  int cgroup_task_count(const struct cgroup *cgrp)  {  	int count = 0; -	struct cg_cgroup_link *link; +	struct cgrp_cset_link *link;  	read_lock(&css_set_lock); -	list_for_each_entry(link, &cgrp->css_sets, cgrp_link_list) { -		count += atomic_read(&link->cg->refcount); -	} +	list_for_each_entry(link, &cgrp->cset_links, cset_link) +		count += atomic_read(&link->cset->refcount);  	read_unlock(&css_set_lock);  	return count;  } 
@@ -2886,25 +2971,24 @@ int cgroup_task_count(const struct cgroup *cgrp)   * Advance a list_head iterator.  The iterator should be positioned at   * the start of a css_set   */ -static void cgroup_advance_iter(struct cgroup *cgrp, -				struct cgroup_iter *it) +static void cgroup_advance_iter(struct cgroup *cgrp, struct cgroup_iter *it)  { -	struct list_head *l = it->cg_link; -	struct cg_cgroup_link *link; -	struct css_set *cg; +	struct list_head *l = it->cset_link; +	struct cgrp_cset_link *link; +	struct css_set *cset;  	/* Advance to the next non-empty css_set */  	do {  		l = l->next; -		if (l == &cgrp->css_sets) { -			it->cg_link = NULL; +		if (l == &cgrp->cset_links) { +			it->cset_link = NULL;  			return;  		} -		link = list_entry(l, struct cg_cgroup_link, cgrp_link_list); -		cg = link->cg; -	} while (list_empty(&cg->tasks)); -	it->cg_link = l; -	it->task = cg->tasks.next; +		link = list_entry(l, struct cgrp_cset_link, cset_link); +		cset = link->cset; +	} while (list_empty(&cset->tasks)); +	it->cset_link = l; +	it->task = cset->tasks.next;  }  /* @@ -2934,7 +3018,7 @@ static void cgroup_enable_task_cg_lists(void)  		 * entry won't be deleted though the process has exited.  		 */  		if (!(p->flags & PF_EXITING) && list_empty(&p->cg_list)) -			list_add(&p->cg_list, &p->cgroups->tasks); +			list_add(&p->cg_list, &task_css_set(p)->tasks);  		task_unlock(p);  	} while_each_thread(g, p);  	read_unlock(&tasklist_lock); @@ -2942,12 +3026,67 @@ static void cgroup_enable_task_cg_lists(void)  }  /** + * cgroup_next_sibling - find the next sibling of a given cgroup + * @pos: the current cgroup + * + * This function returns the next sibling of @pos and should be called + * under RCU read lock.  The only requirement is that @pos is accessible. + * The next sibling is guaranteed to be returned regardless of @pos's + * state. 
+ */ +struct cgroup *cgroup_next_sibling(struct cgroup *pos) +{ +	struct cgroup *next; + +	WARN_ON_ONCE(!rcu_read_lock_held()); + +	/* +	 * @pos could already have been removed.  Once a cgroup is removed, +	 * its ->sibling.next is no longer updated when its next sibling +	 * changes.  As CGRP_DEAD assertion is serialized and happens +	 * before the cgroup is taken off the ->sibling list, if we see it +	 * unasserted, it's guaranteed that the next sibling hasn't +	 * finished its grace period even if it's already removed, and thus +	 * safe to dereference from this RCU critical section.  If +	 * ->sibling.next is inaccessible, cgroup_is_dead() is guaranteed +	 * to be visible as %true here. +	 */ +	if (likely(!cgroup_is_dead(pos))) { +		next = list_entry_rcu(pos->sibling.next, struct cgroup, sibling); +		if (&next->sibling != &pos->parent->children) +			return next; +		return NULL; +	} + +	/* +	 * Can't dereference the next pointer.  Each cgroup is given a +	 * monotonically increasing unique serial number and always +	 * appended to the sibling list, so the next one can be found by +	 * walking the parent's children until we see a cgroup with higher +	 * serial number than @pos's. +	 * +	 * While this path can be slow, it's taken only when either the +	 * current cgroup is removed or iteration and removal race. +	 */ +	list_for_each_entry_rcu(next, &pos->parent->children, sibling) +		if (next->serial_nr > pos->serial_nr) +			return next; +	return NULL; +} +EXPORT_SYMBOL_GPL(cgroup_next_sibling); + +/**   * cgroup_next_descendant_pre - find the next descendant for pre-order walk   * @pos: the current position (%NULL to initiate traversal)   * @cgroup: cgroup whose descendants to walk   *   * To be used by cgroup_for_each_descendant_pre().  Find the next   * descendant to visit for pre-order traversal of @cgroup's descendants. 
+ * + * While this function requires RCU read locking, it doesn't require the + * whole traversal to be contained in a single RCU critical section.  This + * function will return the correct next descendant as long as both @pos + * and @cgroup are accessible and @pos is a descendant of @cgroup.   */  struct cgroup *cgroup_next_descendant_pre(struct cgroup *pos,  					  struct cgroup *cgroup) @@ -2967,11 +3106,9 @@ struct cgroup *cgroup_next_descendant_pre(struct cgroup *pos,  	/* no child, visit my or the closest ancestor's next sibling */  	while (pos != cgroup) { -		next = list_entry_rcu(pos->sibling.next, struct cgroup, -				      sibling); -		if (&next->sibling != &pos->parent->children) +		next = cgroup_next_sibling(pos); +		if (next)  			return next; -  		pos = pos->parent;  	} @@ -2986,6 +3123,11 @@ EXPORT_SYMBOL_GPL(cgroup_next_descendant_pre);   * Return the rightmost descendant of @pos.  If there's no descendant,   * @pos is returned.  This can be used during pre-order traversal to skip   * subtree of @pos. + * + * While this function requires RCU read locking, it doesn't require the + * whole traversal to be contained in a single RCU critical section.  This + * function will return the correct rightmost descendant as long as @pos is + * accessible.   */  struct cgroup *cgroup_rightmost_descendant(struct cgroup *pos)  { @@ -3025,6 +3167,11 @@ static struct cgroup *cgroup_leftmost_descendant(struct cgroup *pos)   *   * To be used by cgroup_for_each_descendant_post().  Find the next   * descendant to visit for post-order traversal of @cgroup's descendants. + * + * While this function requires RCU read locking, it doesn't require the + * whole traversal to be contained in a single RCU critical section.  This + * function will return the correct next descendant as long as both @pos + * and @cgroup are accessible and @pos is a descendant of @cgroup.   
*/  struct cgroup *cgroup_next_descendant_post(struct cgroup *pos,  					   struct cgroup *cgroup) @@ -3040,8 +3187,8 @@ struct cgroup *cgroup_next_descendant_post(struct cgroup *pos,  	}  	/* if there's an unvisited sibling, visit its leftmost descendant */ -	next = list_entry_rcu(pos->sibling.next, struct cgroup, sibling); -	if (&next->sibling != &pos->parent->children) +	next = cgroup_next_sibling(pos); +	if (next)  		return cgroup_leftmost_descendant(next);  	/* no sibling left, visit parent */ @@ -3062,7 +3209,7 @@ void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it)  		cgroup_enable_task_cg_lists();  	read_lock(&css_set_lock); -	it->cg_link = &cgrp->css_sets; +	it->cset_link = &cgrp->cset_links;  	cgroup_advance_iter(cgrp, it);  } @@ -3071,16 +3218,16 @@ struct task_struct *cgroup_iter_next(struct cgroup *cgrp,  {  	struct task_struct *res;  	struct list_head *l = it->task; -	struct cg_cgroup_link *link; +	struct cgrp_cset_link *link;  	/* If the iterator cg is NULL, we have no tasks */ -	if (!it->cg_link) +	if (!it->cset_link)  		return NULL;  	res = list_entry(l, struct task_struct, cg_list);  	/* Advance iterator to find next entry */  	l = l->next; -	link = list_entry(it->cg_link, struct cg_cgroup_link, cgrp_link_list); -	if (l == &link->cg->tasks) { +	link = list_entry(it->cset_link, struct cgrp_cset_link, cset_link); +	if (l == &link->cset->tasks) {  		/* We reached the end of this task list - move on to  		 * the next cg_cgroup_link */  		cgroup_advance_iter(cgrp, it); @@ -3411,7 +3558,7 @@ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,  		}  	}  	/* entry not found; create a new one */ -	l = kmalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL); +	l = kzalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL);  	if (!l) {  		mutex_unlock(&cgrp->pidlist_mutex);  		return l; @@ -3420,8 +3567,6 @@ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,  	down_write(&l->mutex);  	l->key.type = type;  	l->key.ns = 
get_pid_ns(ns); -	l->use_count = 0; /* don't increment here */ -	l->list = NULL;  	l->owner = cgrp;  	list_add(&l->links, &cgrp->pidlists);  	mutex_unlock(&cgrp->pidlist_mutex); @@ -3727,6 +3872,23 @@ static int cgroup_write_notify_on_release(struct cgroup *cgrp,  }  /* + * When dput() is called asynchronously, if umount has been done and + * then deactivate_super() in cgroup_free_fn() kills the superblock, + * there's a small window that vfs will see the root dentry with non-zero + * refcnt and trigger BUG(). + * + * That's why we hold a reference before dput() and drop it right after. + */ +static void cgroup_dput(struct cgroup *cgrp) +{ +	struct super_block *sb = cgrp->root->sb; + +	atomic_inc(&sb->s_active); +	dput(cgrp->dentry); +	deactivate_super(sb); +} + +/*   * Unregister event and free resources.   *   * Gets called from workqueue. @@ -3746,7 +3908,7 @@ static void cgroup_event_remove(struct work_struct *work)  	eventfd_ctx_put(event->eventfd);  	kfree(event); -	dput(cgrp->dentry); +	cgroup_dput(cgrp);  }  /* @@ -3933,33 +4095,16 @@ static int cgroup_clone_children_write(struct cgroup *cgrp,  	return 0;  } -/* - * for the common functions, 'private' gives the type of file - */ -/* for hysterical raisins, we can't put this on the older files */ -#define CGROUP_FILE_GENERIC_PREFIX "cgroup." 
-static struct cftype files[] = { -	{ -		.name = "tasks", -		.open = cgroup_tasks_open, -		.write_u64 = cgroup_tasks_write, -		.release = cgroup_pidlist_release, -		.mode = S_IRUGO | S_IWUSR, -	}, +static struct cftype cgroup_base_files[] = {  	{ -		.name = CGROUP_FILE_GENERIC_PREFIX "procs", +		.name = "cgroup.procs",  		.open = cgroup_procs_open,  		.write_u64 = cgroup_procs_write,  		.release = cgroup_pidlist_release,  		.mode = S_IRUGO | S_IWUSR,  	},  	{ -		.name = "notify_on_release", -		.read_u64 = cgroup_read_notify_on_release, -		.write_u64 = cgroup_write_notify_on_release, -	}, -	{ -		.name = CGROUP_FILE_GENERIC_PREFIX "event_control", +		.name = "cgroup.event_control",  		.write_string = cgroup_write_event_control,  		.mode = S_IWUGO,  	}, @@ -3974,9 +4119,29 @@ static struct cftype files[] = {  		.flags = CFTYPE_ONLY_ON_ROOT,  		.read_seq_string = cgroup_sane_behavior_show,  	}, + +	/* +	 * Historical crazy stuff.  These don't have "cgroup."  prefix and +	 * don't exist if sane_behavior.  If you're depending on these, be +	 * prepared to be burned. 
+	 */ +	{ +		.name = "tasks", +		.flags = CFTYPE_INSANE,		/* use "procs" instead */ +		.open = cgroup_tasks_open, +		.write_u64 = cgroup_tasks_write, +		.release = cgroup_pidlist_release, +		.mode = S_IRUGO | S_IWUSR, +	}, +	{ +		.name = "notify_on_release", +		.flags = CFTYPE_INSANE, +		.read_u64 = cgroup_read_notify_on_release, +		.write_u64 = cgroup_write_notify_on_release, +	},  	{  		.name = "release_agent", -		.flags = CFTYPE_ONLY_ON_ROOT, +		.flags = CFTYPE_INSANE | CFTYPE_ONLY_ON_ROOT,  		.read_seq_string = cgroup_release_agent_show,  		.write_string = cgroup_release_agent_write,  		.max_write_len = PATH_MAX, @@ -3997,13 +4162,13 @@ static int cgroup_populate_dir(struct cgroup *cgrp, bool base_files,  	struct cgroup_subsys *ss;  	if (base_files) { -		err = cgroup_addrm_files(cgrp, NULL, files, true); +		err = cgroup_addrm_files(cgrp, NULL, cgroup_base_files, true);  		if (err < 0)  			return err;  	}  	/* process cftsets of each subsystem */ -	for_each_subsys(cgrp->root, ss) { +	for_each_root_subsys(cgrp->root, ss) {  		struct cftype_set *set;  		if (!test_bit(ss->subsys_id, &subsys_mask))  			continue; @@ -4013,15 +4178,17 @@ static int cgroup_populate_dir(struct cgroup *cgrp, bool base_files,  	}  	/* This cgroup is ready now */ -	for_each_subsys(cgrp->root, ss) { +	for_each_root_subsys(cgrp->root, ss) {  		struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; +		struct css_id *id = rcu_dereference_protected(css->id, true); +  		/*  		 * Update id->css pointer and make this css visible from  		 * CSS ID functions. This pointer will be dereferened  		 * from RCU-read-side without locks.  		 
*/ -		if (css->id) -			rcu_assign_pointer(css->id->css, css); +		if (id) +			rcu_assign_pointer(id->css, css);  	}  	return 0; @@ -4031,12 +4198,16 @@ static void css_dput_fn(struct work_struct *work)  {  	struct cgroup_subsys_state *css =  		container_of(work, struct cgroup_subsys_state, dput_work); -	struct dentry *dentry = css->cgroup->dentry; -	struct super_block *sb = dentry->d_sb; -	atomic_inc(&sb->s_active); -	dput(dentry); -	deactivate_super(sb); +	cgroup_dput(css->cgroup); +} + +static void css_release(struct percpu_ref *ref) +{ +	struct cgroup_subsys_state *css = +		container_of(ref, struct cgroup_subsys_state, refcnt); + +	schedule_work(&css->dput_work);  }  static void init_cgroup_css(struct cgroup_subsys_state *css, @@ -4044,10 +4215,9 @@ static void init_cgroup_css(struct cgroup_subsys_state *css,  			       struct cgroup *cgrp)  {  	css->cgroup = cgrp; -	atomic_set(&css->refcnt, 1);  	css->flags = 0;  	css->id = NULL; -	if (cgrp == dummytop) +	if (cgrp == cgroup_dummy_top)  		css->flags |= CSS_ROOT;  	BUG_ON(cgrp->subsys[ss->subsys_id]);  	cgrp->subsys[ss->subsys_id] = css; @@ -4157,7 +4327,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,  	if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &parent->flags))  		set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags); -	for_each_subsys(root, ss) { +	for_each_root_subsys(root, ss) {  		struct cgroup_subsys_state *css;  		css = ss->css_alloc(cgrp); @@ -4165,7 +4335,13 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,  			err = PTR_ERR(css);  			goto err_free_all;  		} + +		err = percpu_ref_init(&css->refcnt, css_release); +		if (err) +			goto err_free_all; +  		init_cgroup_css(css, ss, cgrp); +  		if (ss->use_id) {  			err = alloc_css_id(ss, parent, cgrp);  			if (err) @@ -4183,20 +4359,21 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,  		goto err_free_all;  	lockdep_assert_held(&dentry->d_inode->i_mutex); +	cgrp->serial_nr = 
cgroup_serial_nr_next++; +  	/* allocation complete, commit to creation */ -	list_add_tail(&cgrp->allcg_node, &root->allcg_list);  	list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children);  	root->number_of_cgroups++;  	/* each css holds a ref to the cgroup's dentry */ -	for_each_subsys(root, ss) +	for_each_root_subsys(root, ss)  		dget(dentry);  	/* hold a ref to the parent's dentry */  	dget(parent->dentry);  	/* creation succeeded, notify subsystems */ -	for_each_subsys(root, ss) { +	for_each_root_subsys(root, ss) {  		err = online_css(ss, cgrp);  		if (err)  			goto err_destroy; @@ -4221,9 +4398,13 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,  	return 0;  err_free_all: -	for_each_subsys(root, ss) { -		if (cgrp->subsys[ss->subsys_id]) +	for_each_root_subsys(root, ss) { +		struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; + +		if (css) { +			percpu_ref_cancel_init(&css->refcnt);  			ss->css_free(cgrp); +		}  	}  	mutex_unlock(&cgroup_mutex);  	/* Release the reference count that we took on the superblock */ @@ -4251,63 +4432,120 @@ static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)  	return cgroup_create(c_parent, dentry, mode | S_IFDIR);  } +static void cgroup_css_killed(struct cgroup *cgrp) +{ +	if (!atomic_dec_and_test(&cgrp->css_kill_cnt)) +		return; + +	/* percpu ref's of all css's are killed, kick off the next step */ +	INIT_WORK(&cgrp->destroy_work, cgroup_offline_fn); +	schedule_work(&cgrp->destroy_work); +} + +static void css_ref_killed_fn(struct percpu_ref *ref) +{ +	struct cgroup_subsys_state *css = +		container_of(ref, struct cgroup_subsys_state, refcnt); + +	cgroup_css_killed(css->cgroup); +} + +/** + * cgroup_destroy_locked - the first stage of cgroup destruction + * @cgrp: cgroup to be destroyed + * + * css's make use of percpu refcnts whose killing latency shouldn't be + * exposed to userland and are RCU protected.  
Also, cgroup core needs to + * guarantee that css_tryget() won't succeed by the time ->css_offline() is + * invoked.  To satisfy all the requirements, destruction is implemented in + * the following two steps. + * + * s1. Verify @cgrp can be destroyed and mark it dying.  Remove all + *     userland visible parts and start killing the percpu refcnts of + *     css's.  Set up so that the next stage will be kicked off once all + *     the percpu refcnts are confirmed to be killed. + * + * s2. Invoke ->css_offline(), mark the cgroup dead and proceed with the + *     rest of destruction.  Once all cgroup references are gone, the + *     cgroup is RCU-freed. + * + * This function implements s1.  After this step, @cgrp is gone as far as + * the userland is concerned and a new cgroup with the same name may be + * created.  As cgroup doesn't care about the names internally, this + * doesn't cause any problem. + */  static int cgroup_destroy_locked(struct cgroup *cgrp)  	__releases(&cgroup_mutex) __acquires(&cgroup_mutex)  {  	struct dentry *d = cgrp->dentry; -	struct cgroup *parent = cgrp->parent;  	struct cgroup_event *event, *tmp;  	struct cgroup_subsys *ss; +	bool empty;  	lockdep_assert_held(&d->d_inode->i_mutex);  	lockdep_assert_held(&cgroup_mutex); -	if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children)) +	/* +	 * css_set_lock synchronizes access to ->cset_links and prevents +	 * @cgrp from being removed while __put_css_set() is in progress. +	 */ +	read_lock(&css_set_lock); +	empty = list_empty(&cgrp->cset_links) && list_empty(&cgrp->children); +	read_unlock(&css_set_lock); +	if (!empty)  		return -EBUSY;  	/* -	 * Block new css_tryget() by deactivating refcnt and mark @cgrp -	 * removed.  This makes future css_tryget() and child creation -	 * attempts fail thus maintaining the removal conditions verified -	 * above. +	 * Block new css_tryget() by killing css refcnts.  
cgroup core +	 * guarantees that, by the time ->css_offline() is invoked, no new +	 * css reference will be given out via css_tryget().  We can't +	 * simply call percpu_ref_kill() and proceed to offlining css's +	 * because percpu_ref_kill() doesn't guarantee that the ref is seen +	 * as killed on all CPUs on return. +	 * +	 * Use percpu_ref_kill_and_confirm() to get notifications as each +	 * css is confirmed to be seen as killed on all CPUs.  The +	 * notification callback keeps track of the number of css's to be +	 * killed and schedules cgroup_offline_fn() to perform the rest of +	 * destruction once the percpu refs of all css's are confirmed to +	 * be killed.  	 */ -	for_each_subsys(cgrp->root, ss) { +	atomic_set(&cgrp->css_kill_cnt, 1); +	for_each_root_subsys(cgrp->root, ss) {  		struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; -		WARN_ON(atomic_read(&css->refcnt) < 0); -		atomic_add(CSS_DEACT_BIAS, &css->refcnt); -	} -	set_bit(CGRP_REMOVED, &cgrp->flags); +		/* +		 * Killing would put the base ref, but we need to keep it +		 * alive until after ->css_offline. +		 */ +		percpu_ref_get(&css->refcnt); -	/* tell subsystems to initate destruction */ -	for_each_subsys(cgrp->root, ss) -		offline_css(ss, cgrp); +		atomic_inc(&cgrp->css_kill_cnt); +		percpu_ref_kill_and_confirm(&css->refcnt, css_ref_killed_fn); +	} +	cgroup_css_killed(cgrp);  	/* -	 * Put all the base refs.  Each css holds an extra reference to the -	 * cgroup's dentry and cgroup removal proceeds regardless of css -	 * refs.  On the last put of each css, whenever that may be, the -	 * extra dentry ref is put so that dentry destruction happens only -	 * after all css's are released. +	 * Mark @cgrp dead.  This prevents further task migration and child +	 * creation by disabling cgroup_lock_live_group().  Note that +	 * CGRP_DEAD assertion is depended upon by cgroup_next_sibling() to +	 * resume iteration after dropping RCU read lock.  See +	 * cgroup_next_sibling() for details.  	 
*/ -	for_each_subsys(cgrp->root, ss) -		css_put(cgrp->subsys[ss->subsys_id]); +	set_bit(CGRP_DEAD, &cgrp->flags); +	/* CGRP_DEAD is set, remove from ->release_list for the last time */  	raw_spin_lock(&release_list_lock);  	if (!list_empty(&cgrp->release_list))  		list_del_init(&cgrp->release_list);  	raw_spin_unlock(&release_list_lock); -	/* delete this cgroup from parent->children */ -	list_del_rcu(&cgrp->sibling); -	list_del_init(&cgrp->allcg_node); - +	/* +	 * Remove @cgrp directory.  The removal puts the base ref but we +	 * aren't quite done with @cgrp yet, so hold onto it. +	 */  	dget(d);  	cgroup_d_remove_dir(d); -	dput(d); - -	set_bit(CGRP_RELEASABLE, &parent->flags); -	check_for_release(parent);  	/*  	 * Unregister events and notify userspace. @@ -4322,6 +4560,53 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)  	spin_unlock(&cgrp->event_list_lock);  	return 0; +}; + +/** + * cgroup_offline_fn - the second step of cgroup destruction + * @work: cgroup->destroy_free_work + * + * This function is invoked from a work item for a cgroup which is being + * destroyed after the percpu refcnts of all css's are guaranteed to be + * seen as killed on all CPUs, and performs the rest of destruction.  This + * is the second step of destruction described in the comment above + * cgroup_destroy_locked(). + */ +static void cgroup_offline_fn(struct work_struct *work) +{ +	struct cgroup *cgrp = container_of(work, struct cgroup, destroy_work); +	struct cgroup *parent = cgrp->parent; +	struct dentry *d = cgrp->dentry; +	struct cgroup_subsys *ss; + +	mutex_lock(&cgroup_mutex); + +	/* +	 * css_tryget() is guaranteed to fail now.  Tell subsystems to +	 * initate destruction. +	 */ +	for_each_root_subsys(cgrp->root, ss) +		offline_css(ss, cgrp); + +	/* +	 * Put the css refs from cgroup_destroy_locked().  Each css holds +	 * an extra reference to the cgroup's dentry and cgroup removal +	 * proceeds regardless of css refs.  
On the last put of each css, +	 * whenever that may be, the extra dentry ref is put so that dentry +	 * destruction happens only after all css's are released. +	 */ +	for_each_root_subsys(cgrp->root, ss) +		css_put(cgrp->subsys[ss->subsys_id]); + +	/* delete this cgroup from parent->children */ +	list_del_rcu(&cgrp->sibling); + +	dput(d); + +	set_bit(CGRP_RELEASABLE, &parent->flags); +	check_for_release(parent); + +	mutex_unlock(&cgroup_mutex);  }  static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) @@ -4361,12 +4646,12 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)  	cgroup_init_cftsets(ss);  	/* Create the top cgroup state for this subsystem */ -	list_add(&ss->sibling, &rootnode.subsys_list); -	ss->root = &rootnode; -	css = ss->css_alloc(dummytop); +	list_add(&ss->sibling, &cgroup_dummy_root.subsys_list); +	ss->root = &cgroup_dummy_root; +	css = ss->css_alloc(cgroup_dummy_top);  	/* We don't handle early failures gracefully */  	BUG_ON(IS_ERR(css)); -	init_cgroup_css(css, ss, dummytop); +	init_cgroup_css(css, ss, cgroup_dummy_top);  	/* Update the init_css_set to contain a subsys  	 * pointer to this state - since the subsystem is @@ -4381,7 +4666,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)  	 * need to invoke fork callbacks here. 
*/  	BUG_ON(!list_empty(&init_task.tasks)); -	BUG_ON(online_css(ss, dummytop)); +	BUG_ON(online_css(ss, cgroup_dummy_top));  	mutex_unlock(&cgroup_mutex); @@ -4404,7 +4689,7 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)  	struct cgroup_subsys_state *css;  	int i, ret;  	struct hlist_node *tmp; -	struct css_set *cg; +	struct css_set *cset;  	unsigned long key;  	/* check name and function validity */ @@ -4427,7 +4712,7 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)  	 */  	if (ss->module == NULL) {  		/* a sanity check */ -		BUG_ON(subsys[ss->subsys_id] != ss); +		BUG_ON(cgroup_subsys[ss->subsys_id] != ss);  		return 0;  	} @@ -4435,26 +4720,26 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)  	cgroup_init_cftsets(ss);  	mutex_lock(&cgroup_mutex); -	subsys[ss->subsys_id] = ss; +	cgroup_subsys[ss->subsys_id] = ss;  	/*  	 * no ss->css_alloc seems to need anything important in the ss -	 * struct, so this can happen first (i.e. before the rootnode +	 * struct, so this can happen first (i.e. before the dummy root  	 * attachment).  	 */ -	css = ss->css_alloc(dummytop); +	css = ss->css_alloc(cgroup_dummy_top);  	if (IS_ERR(css)) { -		/* failure case - need to deassign the subsys[] slot. */ -		subsys[ss->subsys_id] = NULL; +		/* failure case - need to deassign the cgroup_subsys[] slot. */ +		cgroup_subsys[ss->subsys_id] = NULL;  		mutex_unlock(&cgroup_mutex);  		return PTR_ERR(css);  	} -	list_add(&ss->sibling, &rootnode.subsys_list); -	ss->root = &rootnode; +	list_add(&ss->sibling, &cgroup_dummy_root.subsys_list); +	ss->root = &cgroup_dummy_root;  	/* our new subsystem will be attached to the dummy hierarchy. */ -	init_cgroup_css(css, ss, dummytop); +	init_cgroup_css(css, ss, cgroup_dummy_top);  	/* init_idr must be after init_cgroup_css because it sets css->id. 
*/  	if (ss->use_id) {  		ret = cgroup_init_idr(ss, css); @@ -4471,21 +4756,21 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)  	 * this is all done under the css_set_lock.  	 */  	write_lock(&css_set_lock); -	hash_for_each_safe(css_set_table, i, tmp, cg, hlist) { +	hash_for_each_safe(css_set_table, i, tmp, cset, hlist) {  		/* skip entries that we already rehashed */ -		if (cg->subsys[ss->subsys_id]) +		if (cset->subsys[ss->subsys_id])  			continue;  		/* remove existing entry */ -		hash_del(&cg->hlist); +		hash_del(&cset->hlist);  		/* set new value */ -		cg->subsys[ss->subsys_id] = css; +		cset->subsys[ss->subsys_id] = css;  		/* recompute hash and restore entry */ -		key = css_set_hash(cg->subsys); -		hash_add(css_set_table, &cg->hlist, key); +		key = css_set_hash(cset->subsys); +		hash_add(css_set_table, &cset->hlist, key);  	}  	write_unlock(&css_set_lock); -	ret = online_css(ss, dummytop); +	ret = online_css(ss, cgroup_dummy_top);  	if (ret)  		goto err_unload; @@ -4511,7 +4796,7 @@ EXPORT_SYMBOL_GPL(cgroup_load_subsys);   */  void cgroup_unload_subsys(struct cgroup_subsys *ss)  { -	struct cg_cgroup_link *link; +	struct cgrp_cset_link *link;  	BUG_ON(ss->module == NULL); @@ -4520,45 +4805,46 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss)  	 * try_module_get in parse_cgroupfs_options should ensure that it  	 * doesn't start being used while we're killing it off.  	 */ -	BUG_ON(ss->root != &rootnode); +	BUG_ON(ss->root != &cgroup_dummy_root);  	mutex_lock(&cgroup_mutex); -	offline_css(ss, dummytop); +	offline_css(ss, cgroup_dummy_top);  	if (ss->use_id)  		idr_destroy(&ss->idr);  	/* deassign the subsys_id */ -	subsys[ss->subsys_id] = NULL; +	cgroup_subsys[ss->subsys_id] = NULL; -	/* remove subsystem from rootnode's list of subsystems */ +	/* remove subsystem from the dummy root's list of subsystems */  	list_del_init(&ss->sibling);  	/* -	 * disentangle the css from all css_sets attached to the dummytop. 
as -	 * in loading, we need to pay our respects to the hashtable gods. +	 * disentangle the css from all css_sets attached to the dummy +	 * top. as in loading, we need to pay our respects to the hashtable +	 * gods.  	 */  	write_lock(&css_set_lock); -	list_for_each_entry(link, &dummytop->css_sets, cgrp_link_list) { -		struct css_set *cg = link->cg; +	list_for_each_entry(link, &cgroup_dummy_top->cset_links, cset_link) { +		struct css_set *cset = link->cset;  		unsigned long key; -		hash_del(&cg->hlist); -		cg->subsys[ss->subsys_id] = NULL; -		key = css_set_hash(cg->subsys); -		hash_add(css_set_table, &cg->hlist, key); +		hash_del(&cset->hlist); +		cset->subsys[ss->subsys_id] = NULL; +		key = css_set_hash(cset->subsys); +		hash_add(css_set_table, &cset->hlist, key);  	}  	write_unlock(&css_set_lock);  	/* -	 * remove subsystem's css from the dummytop and free it - need to -	 * free before marking as null because ss->css_free needs the -	 * cgrp->subsys pointer to find their state. note that this also -	 * takes care of freeing the css_id. +	 * remove subsystem's css from the cgroup_dummy_top and free it - +	 * need to free before marking as null because ss->css_free needs +	 * the cgrp->subsys pointer to find their state. note that this +	 * also takes care of freeing the css_id.  	 
*/ -	ss->css_free(dummytop); -	dummytop->subsys[ss->subsys_id] = NULL; +	ss->css_free(cgroup_dummy_top); +	cgroup_dummy_top->subsys[ss->subsys_id] = NULL;  	mutex_unlock(&cgroup_mutex);  } @@ -4572,30 +4858,25 @@ EXPORT_SYMBOL_GPL(cgroup_unload_subsys);   */  int __init cgroup_init_early(void)  { +	struct cgroup_subsys *ss;  	int i; +  	atomic_set(&init_css_set.refcount, 1); -	INIT_LIST_HEAD(&init_css_set.cg_links); +	INIT_LIST_HEAD(&init_css_set.cgrp_links);  	INIT_LIST_HEAD(&init_css_set.tasks);  	INIT_HLIST_NODE(&init_css_set.hlist);  	css_set_count = 1; -	init_cgroup_root(&rootnode); -	root_count = 1; -	init_task.cgroups = &init_css_set; - -	init_css_set_link.cg = &init_css_set; -	init_css_set_link.cgrp = dummytop; -	list_add(&init_css_set_link.cgrp_link_list, -		 &rootnode.top_cgroup.css_sets); -	list_add(&init_css_set_link.cg_link_list, -		 &init_css_set.cg_links); - -	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { -		struct cgroup_subsys *ss = subsys[i]; - -		/* at bootup time, we don't worry about modular subsystems */ -		if (!ss || ss->module) -			continue; +	init_cgroup_root(&cgroup_dummy_root); +	cgroup_root_count = 1; +	RCU_INIT_POINTER(init_task.cgroups, &init_css_set); + +	init_cgrp_cset_link.cset = &init_css_set; +	init_cgrp_cset_link.cgrp = cgroup_dummy_top; +	list_add(&init_cgrp_cset_link.cset_link, &cgroup_dummy_top->cset_links); +	list_add(&init_cgrp_cset_link.cgrp_link, &init_css_set.cgrp_links); +	/* at bootup time, we don't worry about modular subsystems */ +	for_each_builtin_subsys(ss, i) {  		BUG_ON(!ss->name);  		BUG_ON(strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN);  		BUG_ON(!ss->css_alloc); @@ -4620,30 +4901,33 @@ int __init cgroup_init_early(void)   */  int __init cgroup_init(void)  { -	int err; -	int i; +	struct cgroup_subsys *ss;  	unsigned long key; +	int i, err;  	err = bdi_init(&cgroup_backing_dev_info);  	if (err)  		return err; -	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { -		struct cgroup_subsys *ss = subsys[i]; - -		/* at bootup 
time, we don't worry about modular subsystems */ -		if (!ss || ss->module) -			continue; +	for_each_builtin_subsys(ss, i) {  		if (!ss->early_init)  			cgroup_init_subsys(ss);  		if (ss->use_id)  			cgroup_init_idr(ss, init_css_set.subsys[ss->subsys_id]);  	} +	/* allocate id for the dummy hierarchy */ +	mutex_lock(&cgroup_mutex); +	mutex_lock(&cgroup_root_mutex); +  	/* Add init_css_set to the hash table */  	key = css_set_hash(init_css_set.subsys);  	hash_add(css_set_table, &init_css_set.hlist, key); -	BUG_ON(!init_root_id(&rootnode)); + +	BUG_ON(cgroup_init_root_id(&cgroup_dummy_root, 0, 1)); + +	mutex_unlock(&cgroup_root_mutex); +	mutex_unlock(&cgroup_mutex);  	cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj);  	if (!cgroup_kobj) { @@ -4708,7 +4992,7 @@ int proc_cgroup_show(struct seq_file *m, void *v)  		int count = 0;  		seq_printf(m, "%d:", root->hierarchy_id); -		for_each_subsys(root, ss) +		for_each_root_subsys(root, ss)  			seq_printf(m, "%s%s", count++ ? "," : "", ss->name);  		if (strlen(root->name))  			seq_printf(m, "%sname=%s", count ? "," : "", @@ -4734,6 +5018,7 @@ out:  /* Display information about each subsystem and each hierarchy */  static int proc_cgroupstats_show(struct seq_file *m, void *v)  { +	struct cgroup_subsys *ss;  	int i;  	seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n"); @@ -4743,14 +5028,12 @@ static int proc_cgroupstats_show(struct seq_file *m, void *v)  	 * subsys/hierarchy state.  	 
*/  	mutex_lock(&cgroup_mutex); -	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { -		struct cgroup_subsys *ss = subsys[i]; -		if (ss == NULL) -			continue; + +	for_each_subsys(ss, i)  		seq_printf(m, "%s\t%d\t%d\t%d\n",  			   ss->name, ss->root->hierarchy_id,  			   ss->root->number_of_cgroups, !ss->disabled); -	} +  	mutex_unlock(&cgroup_mutex);  	return 0;  } @@ -4786,8 +5069,8 @@ static const struct file_operations proc_cgroupstats_operations = {  void cgroup_fork(struct task_struct *child)  {  	task_lock(current); +	get_css_set(task_css_set(current));  	child->cgroups = current->cgroups; -	get_css_set(child->cgroups);  	task_unlock(current);  	INIT_LIST_HEAD(&child->cg_list);  } @@ -4804,6 +5087,7 @@ void cgroup_fork(struct task_struct *child)   */  void cgroup_post_fork(struct task_struct *child)  { +	struct cgroup_subsys *ss;  	int i;  	/* @@ -4821,7 +5105,7 @@ void cgroup_post_fork(struct task_struct *child)  		write_lock(&css_set_lock);  		task_lock(child);  		if (list_empty(&child->cg_list)) -			list_add(&child->cg_list, &child->cgroups->tasks); +			list_add(&child->cg_list, &task_css_set(child)->tasks);  		task_unlock(child);  		write_unlock(&css_set_lock);  	} @@ -4840,12 +5124,9 @@ void cgroup_post_fork(struct task_struct *child)  		 * of the array can be freed at module unload, so we  		 * can't touch that.  		 */ -		for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) { -			struct cgroup_subsys *ss = subsys[i]; - +		for_each_builtin_subsys(ss, i)  			if (ss->fork)  				ss->fork(child); -		}  	}  } @@ -4886,7 +5167,8 @@ void cgroup_post_fork(struct task_struct *child)   */  void cgroup_exit(struct task_struct *tsk, int run_callbacks)  { -	struct css_set *cg; +	struct cgroup_subsys *ss; +	struct css_set *cset;  	int i;  	/* @@ -4903,36 +5185,32 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)  	/* Reassign the task to the init_css_set. 
*/  	task_lock(tsk); -	cg = tsk->cgroups; -	tsk->cgroups = &init_css_set; +	cset = task_css_set(tsk); +	RCU_INIT_POINTER(tsk->cgroups, &init_css_set);  	if (run_callbacks && need_forkexit_callback) {  		/*  		 * fork/exit callbacks are supported only for builtin  		 * subsystems, see cgroup_post_fork() for details.  		 */ -		for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) { -			struct cgroup_subsys *ss = subsys[i]; - +		for_each_builtin_subsys(ss, i) {  			if (ss->exit) { -				struct cgroup *old_cgrp = -					rcu_dereference_raw(cg->subsys[i])->cgroup; +				struct cgroup *old_cgrp = cset->subsys[i]->cgroup;  				struct cgroup *cgrp = task_cgroup(tsk, i); +  				ss->exit(cgrp, old_cgrp, tsk);  			}  		}  	}  	task_unlock(tsk); -	put_css_set_taskexit(cg); +	put_css_set_taskexit(cset);  }  static void check_for_release(struct cgroup *cgrp)  { -	/* All of these checks rely on RCU to keep the cgroup -	 * structure alive */  	if (cgroup_is_releasable(cgrp) && -	    !atomic_read(&cgrp->count) && list_empty(&cgrp->children)) { +	    list_empty(&cgrp->cset_links) && list_empty(&cgrp->children)) {  		/*  		 * Control Group is currently removeable. 
If it's not  		 * already queued for a userspace notification, queue @@ -4941,7 +5219,7 @@ static void check_for_release(struct cgroup *cgrp)  		int need_schedule_work = 0;  		raw_spin_lock(&release_list_lock); -		if (!cgroup_is_removed(cgrp) && +		if (!cgroup_is_dead(cgrp) &&  		    list_empty(&cgrp->release_list)) {  			list_add(&cgrp->release_list, &release_list);  			need_schedule_work = 1; @@ -4952,34 +5230,6 @@ static void check_for_release(struct cgroup *cgrp)  	}  } -/* Caller must verify that the css is not for root cgroup */ -bool __css_tryget(struct cgroup_subsys_state *css) -{ -	while (true) { -		int t, v; - -		v = css_refcnt(css); -		t = atomic_cmpxchg(&css->refcnt, v, v + 1); -		if (likely(t == v)) -			return true; -		else if (t < 0) -			return false; -		cpu_relax(); -	} -} -EXPORT_SYMBOL_GPL(__css_tryget); - -/* Caller must verify that the css is not for root cgroup */ -void __css_put(struct cgroup_subsys_state *css) -{ -	int v; - -	v = css_unbias_refcnt(atomic_dec_return(&css->refcnt)); -	if (v == 0) -		schedule_work(&css->dput_work); -} -EXPORT_SYMBOL_GPL(__css_put); -  /*   * Notify userspace when a cgroup is released, by running the   * configured release agent with the name of the cgroup (path @@ -5054,23 +5304,19 @@ static void cgroup_release_agent(struct work_struct *work)  static int __init cgroup_disable(char *str)  { -	int i; +	struct cgroup_subsys *ss;  	char *token; +	int i;  	while ((token = strsep(&str, ",")) != NULL) {  		if (!*token)  			continue; -		for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { -			struct cgroup_subsys *ss = subsys[i]; - -			/* -			 * cgroup_disable, being at boot time, can't -			 * know about module subsystems, so we don't -			 * worry about them. -			 */ -			if (!ss || ss->module) -				continue; +		/* +		 * cgroup_disable, being at boot time, can't know about +		 * module subsystems, so we don't worry about them. 
+		 */ +		for_each_builtin_subsys(ss, i) {  			if (!strcmp(token, ss->name)) {  				ss->disabled = 1;  				printk(KERN_INFO "Disabling %s control group" @@ -5087,9 +5333,7 @@ __setup("cgroup_disable=", cgroup_disable);   * Functons for CSS ID.   */ -/* - *To get ID other than 0, this should be called when !cgroup_is_removed(). - */ +/* to get ID other than 0, this should be called when !cgroup_is_dead() */  unsigned short css_id(struct cgroup_subsys_state *css)  {  	struct css_id *cssid; @@ -5099,7 +5343,7 @@ unsigned short css_id(struct cgroup_subsys_state *css)  	 * on this or this is under rcu_read_lock(). Once css->id is allocated,  	 * it's unchanged until freed.  	 */ -	cssid = rcu_dereference_check(css->id, css_refcnt(css)); +	cssid = rcu_dereference_raw(css->id);  	if (cssid)  		return cssid->id; @@ -5107,18 +5351,6 @@ unsigned short css_id(struct cgroup_subsys_state *css)  }  EXPORT_SYMBOL_GPL(css_id); -unsigned short css_depth(struct cgroup_subsys_state *css) -{ -	struct css_id *cssid; - -	cssid = rcu_dereference_check(css->id, css_refcnt(css)); - -	if (cssid) -		return cssid->depth; -	return 0; -} -EXPORT_SYMBOL_GPL(css_depth); -  /**   *  css_is_ancestor - test "root" css is an ancestor of "child"   * @child: the css to be tested. 
@@ -5153,7 +5385,8 @@ bool css_is_ancestor(struct cgroup_subsys_state *child,  void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css)  { -	struct css_id *id = css->id; +	struct css_id *id = rcu_dereference_protected(css->id, true); +  	/* When this is called before css_id initialization, id can be NULL */  	if (!id)  		return; @@ -5219,8 +5452,8 @@ static int __init_or_module cgroup_init_idr(struct cgroup_subsys *ss,  		return PTR_ERR(newid);  	newid->stack[0] = newid->id; -	newid->css = rootcss; -	rootcss->id = newid; +	RCU_INIT_POINTER(newid->css, rootcss); +	RCU_INIT_POINTER(rootcss->id, newid);  	return 0;  } @@ -5234,7 +5467,7 @@ static int alloc_css_id(struct cgroup_subsys *ss, struct cgroup *parent,  	subsys_id = ss->subsys_id;  	parent_css = parent->subsys[subsys_id];  	child_css = child->subsys[subsys_id]; -	parent_id = parent_css->id; +	parent_id = rcu_dereference_protected(parent_css->id, true);  	depth = parent_id->depth + 1;  	child_id = get_new_cssid(ss, depth); @@ -5299,7 +5532,7 @@ struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id)  }  #ifdef CONFIG_CGROUP_DEBUG -static struct cgroup_subsys_state *debug_css_alloc(struct cgroup *cont) +static struct cgroup_subsys_state *debug_css_alloc(struct cgroup *cgrp)  {  	struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL); @@ -5309,48 +5542,43 @@ static struct cgroup_subsys_state *debug_css_alloc(struct cgroup *cont)  	return css;  } -static void debug_css_free(struct cgroup *cont) -{ -	kfree(cont->subsys[debug_subsys_id]); -} - -static u64 cgroup_refcount_read(struct cgroup *cont, struct cftype *cft) +static void debug_css_free(struct cgroup *cgrp)  { -	return atomic_read(&cont->count); +	kfree(cgrp->subsys[debug_subsys_id]);  } -static u64 debug_taskcount_read(struct cgroup *cont, struct cftype *cft) +static u64 debug_taskcount_read(struct cgroup *cgrp, struct cftype *cft)  { -	return cgroup_task_count(cont); +	return cgroup_task_count(cgrp);  } 
-static u64 current_css_set_read(struct cgroup *cont, struct cftype *cft) +static u64 current_css_set_read(struct cgroup *cgrp, struct cftype *cft)  {  	return (u64)(unsigned long)current->cgroups;  } -static u64 current_css_set_refcount_read(struct cgroup *cont, -					   struct cftype *cft) +static u64 current_css_set_refcount_read(struct cgroup *cgrp, +					 struct cftype *cft)  {  	u64 count;  	rcu_read_lock(); -	count = atomic_read(&current->cgroups->refcount); +	count = atomic_read(&task_css_set(current)->refcount);  	rcu_read_unlock();  	return count;  } -static int current_css_set_cg_links_read(struct cgroup *cont, +static int current_css_set_cg_links_read(struct cgroup *cgrp,  					 struct cftype *cft,  					 struct seq_file *seq)  { -	struct cg_cgroup_link *link; -	struct css_set *cg; +	struct cgrp_cset_link *link; +	struct css_set *cset;  	read_lock(&css_set_lock);  	rcu_read_lock(); -	cg = rcu_dereference(current->cgroups); -	list_for_each_entry(link, &cg->cg_links, cg_link_list) { +	cset = rcu_dereference(current->cgroups); +	list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {  		struct cgroup *c = link->cgrp;  		const char *name; @@ -5367,19 +5595,19 @@ static int current_css_set_cg_links_read(struct cgroup *cont,  }  #define MAX_TASKS_SHOWN_PER_CSS 25 -static int cgroup_css_links_read(struct cgroup *cont, +static int cgroup_css_links_read(struct cgroup *cgrp,  				 struct cftype *cft,  				 struct seq_file *seq)  { -	struct cg_cgroup_link *link; +	struct cgrp_cset_link *link;  	read_lock(&css_set_lock); -	list_for_each_entry(link, &cont->css_sets, cgrp_link_list) { -		struct css_set *cg = link->cg; +	list_for_each_entry(link, &cgrp->cset_links, cset_link) { +		struct css_set *cset = link->cset;  		struct task_struct *task;  		int count = 0; -		seq_printf(seq, "css_set %p\n", cg); -		list_for_each_entry(task, &cg->tasks, cg_list) { +		seq_printf(seq, "css_set %p\n", cset); +		list_for_each_entry(task, &cset->tasks, cg_list) {  			if (count++ > 
MAX_TASKS_SHOWN_PER_CSS) {  				seq_puts(seq, "  ...\n");  				break; @@ -5400,10 +5628,6 @@ static u64 releasable_read(struct cgroup *cgrp, struct cftype *cft)  static struct cftype debug_files[] =  {  	{ -		.name = "cgroup_refcount", -		.read_u64 = cgroup_refcount_read, -	}, -	{  		.name = "taskcount",  		.read_u64 = debug_taskcount_read,  	}, diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c index 65349f07b878..383f8231e436 100644 --- a/kernel/context_tracking.c +++ b/kernel/context_tracking.c @@ -15,7 +15,6 @@   */  #include <linux/context_tracking.h> -#include <linux/kvm_host.h>  #include <linux/rcupdate.h>  #include <linux/sched.h>  #include <linux/hardirq.h> @@ -71,6 +70,46 @@ void user_enter(void)  	local_irq_restore(flags);  } +#ifdef CONFIG_PREEMPT +/** + * preempt_schedule_context - preempt_schedule called by tracing + * + * The tracing infrastructure uses preempt_enable_notrace to prevent + * recursion and tracing preempt enabling caused by the tracing + * infrastructure itself. But as tracing can happen in areas coming + * from userspace or just about to enter userspace, a preempt enable + * can occur before user_exit() is called. This will cause the scheduler + * to be called when the system is still in usermode. + * + * To prevent this, the preempt_enable_notrace will use this function + * instead of preempt_schedule() to exit user context if needed before + * calling the scheduler. + */ +void __sched notrace preempt_schedule_context(void) +{ +	struct thread_info *ti = current_thread_info(); +	enum ctx_state prev_ctx; + +	if (likely(ti->preempt_count || irqs_disabled())) +		return; + +	/* +	 * Need to disable preemption in case user_exit() is traced +	 * and the tracer calls preempt_enable_notrace() causing +	 * an infinite recursion. 
+	 */ +	preempt_disable_notrace(); +	prev_ctx = exception_enter(); +	preempt_enable_no_resched_notrace(); + +	preempt_schedule(); + +	preempt_disable_notrace(); +	exception_exit(prev_ctx); +	preempt_enable_notrace(); +} +EXPORT_SYMBOL_GPL(preempt_schedule_context); +#endif /* CONFIG_PREEMPT */  /**   * user_exit - Inform the context tracking that the CPU is diff --git a/kernel/cpu/idle.c b/kernel/cpu/idle.c index d5585f5e038e..e695c0a0bcb5 100644 --- a/kernel/cpu/idle.c +++ b/kernel/cpu/idle.c @@ -5,6 +5,7 @@  #include <linux/cpu.h>  #include <linux/tick.h>  #include <linux/mm.h> +#include <linux/stackprotector.h>  #include <asm/tlb.h> @@ -58,6 +59,7 @@ void __weak arch_cpu_idle_dead(void) { }  void __weak arch_cpu_idle(void)  {  	cpu_idle_force_poll = 1; +	local_irq_enable();  }  /* @@ -112,6 +114,21 @@ static void cpu_idle_loop(void)  void cpu_startup_entry(enum cpuhp_state state)  { +	/* +	 * This #ifdef needs to die, but it's too late in the cycle to +	 * make this generic (arm and sh have never invoked the canary +	 * init for the non boot cpus!). Will be fixed in 3.11 +	 */ +#ifdef CONFIG_X86 +	/* +	 * If we're the non-boot CPU, nothing set the stack canary up +	 * for us. The boot CPU already has it initialized but no harm +	 * in doing it again. This is a good place for updating it, as +	 * we wont ever return from this function (so the invalid +	 * canaries already on the stack wont ever trigger). +	 */ +	boot_init_stack_canary(); +#endif  	current_set_polling();  	arch_cpu_idle_prepare();  	cpu_idle_loop(); diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 64b3f791bbe5..e5657788fedd 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -59,6 +59,7 @@  #include <linux/mutex.h>  #include <linux/workqueue.h>  #include <linux/cgroup.h> +#include <linux/wait.h>  /*   * Tracks how many cpusets are currently defined in system. 
@@ -87,6 +88,18 @@ struct cpuset {  	cpumask_var_t cpus_allowed;	/* CPUs allowed to tasks in cpuset */  	nodemask_t mems_allowed;	/* Memory Nodes allowed to tasks */ +	/* +	 * This is old Memory Nodes tasks took on. +	 * +	 * - top_cpuset.old_mems_allowed is initialized to mems_allowed. +	 * - A new cpuset's old_mems_allowed is initialized when some +	 *   task is moved into it. +	 * - old_mems_allowed is used in cpuset_migrate_mm() when we change +	 *   cpuset.mems_allowed and have tasks' nodemask updated, and +	 *   then old_mems_allowed is updated to mems_allowed. +	 */ +	nodemask_t old_mems_allowed; +  	struct fmeter fmeter;		/* memory_pressure filter */  	/* @@ -100,14 +113,12 @@ struct cpuset {  	/* for custom sched domain */  	int relax_domain_level; - -	struct work_struct hotplug_work;  };  /* Retrieve the cpuset for a cgroup */ -static inline struct cpuset *cgroup_cs(struct cgroup *cont) +static inline struct cpuset *cgroup_cs(struct cgroup *cgrp)  { -	return container_of(cgroup_subsys_state(cont, cpuset_subsys_id), +	return container_of(cgroup_subsys_state(cgrp, cpuset_subsys_id),  			    struct cpuset, css);  } @@ -267,14 +278,11 @@ static DEFINE_MUTEX(callback_mutex);  /*   * CPU / memory hotplug is handled asynchronously.   */ -static struct workqueue_struct *cpuset_propagate_hotplug_wq; -  static void cpuset_hotplug_workfn(struct work_struct *work); -static void cpuset_propagate_hotplug_workfn(struct work_struct *work); -static void schedule_cpuset_propagate_hotplug(struct cpuset *cs); -  static DECLARE_WORK(cpuset_hotplug_work, cpuset_hotplug_workfn); +static DECLARE_WAIT_QUEUE_HEAD(cpuset_attach_wq); +  /*   * This is ugly, but preserves the userspace API for existing cpuset   * users. If someone tries to mount the "cpuset" filesystem, we @@ -304,53 +312,38 @@ static struct file_system_type cpuset_fs_type = {  /*   * Return in pmask the portion of a cpusets's cpus_allowed that   * are online.  
If none are online, walk up the cpuset hierarchy - * until we find one that does have some online cpus.  If we get - * all the way to the top and still haven't found any online cpus, - * return cpu_online_mask.  Or if passed a NULL cs from an exit'ing - * task, return cpu_online_mask. + * until we find one that does have some online cpus.  The top + * cpuset always has some cpus online.   *   * One way or another, we guarantee to return some non-empty subset   * of cpu_online_mask.   *   * Call with callback_mutex held.   */ -  static void guarantee_online_cpus(const struct cpuset *cs,  				  struct cpumask *pmask)  { -	while (cs && !cpumask_intersects(cs->cpus_allowed, cpu_online_mask)) +	while (!cpumask_intersects(cs->cpus_allowed, cpu_online_mask))  		cs = parent_cs(cs); -	if (cs) -		cpumask_and(pmask, cs->cpus_allowed, cpu_online_mask); -	else -		cpumask_copy(pmask, cpu_online_mask); -	BUG_ON(!cpumask_intersects(pmask, cpu_online_mask)); +	cpumask_and(pmask, cs->cpus_allowed, cpu_online_mask);  }  /*   * Return in *pmask the portion of a cpusets's mems_allowed that   * are online, with memory.  If none are online with memory, walk   * up the cpuset hierarchy until we find one that does have some - * online mems.  If we get all the way to the top and still haven't - * found any online mems, return node_states[N_MEMORY]. + * online mems.  The top cpuset always has some mems online.   *   * One way or another, we guarantee to return some non-empty subset   * of node_states[N_MEMORY].   *   * Call with callback_mutex held.   
*/ -  static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask)  { -	while (cs && !nodes_intersects(cs->mems_allowed, -					node_states[N_MEMORY])) +	while (!nodes_intersects(cs->mems_allowed, node_states[N_MEMORY]))  		cs = parent_cs(cs); -	if (cs) -		nodes_and(*pmask, cs->mems_allowed, -					node_states[N_MEMORY]); -	else -		*pmask = node_states[N_MEMORY]; -	BUG_ON(!nodes_intersects(*pmask, node_states[N_MEMORY])); +	nodes_and(*pmask, cs->mems_allowed, node_states[N_MEMORY]);  }  /* @@ -440,7 +433,7 @@ static void free_trial_cpuset(struct cpuset *trial)  static int validate_change(const struct cpuset *cur, const struct cpuset *trial)  { -	struct cgroup *cont; +	struct cgroup *cgrp;  	struct cpuset *c, *par;  	int ret; @@ -448,7 +441,7 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial)  	/* Each of our child cpusets must be a subset of us */  	ret = -EBUSY; -	cpuset_for_each_child(c, cont, cur) +	cpuset_for_each_child(c, cgrp, cur)  		if (!is_cpuset_subset(c, trial))  			goto out; @@ -469,7 +462,7 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial)  	 * overlap  	 */  	ret = -EINVAL; -	cpuset_for_each_child(c, cont, par) { +	cpuset_for_each_child(c, cgrp, par) {  		if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) &&  		    c != cur &&  		    cpumask_intersects(trial->cpus_allowed, c->cpus_allowed)) @@ -486,7 +479,7 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial)  	 */  	ret = -ENOSPC;  	if ((cgroup_task_count(cur->css.cgroup) || cur->attach_in_progress) && -	    (cpumask_empty(trial->cpus_allowed) || +	    (cpumask_empty(trial->cpus_allowed) &&  	     nodes_empty(trial->mems_allowed)))  		goto out; @@ -540,7 +533,7 @@ static void update_domain_attr_tree(struct sched_domain_attr *dattr,   * This function builds a partial partition of the systems CPUs   * A 'partial partition' is a set of non-overlapping subsets whose   * union is a subset of 
that set. - * The output of this function needs to be passed to kernel/sched.c + * The output of this function needs to be passed to kernel/sched/core.c   * partition_sched_domains() routine, which will rebuild the scheduler's   * load balancing domains (sched domains) as specified by that partial   * partition. @@ -569,7 +562,7 @@ static void update_domain_attr_tree(struct sched_domain_attr *dattr,   *	   is a subset of one of these domains, while there are as   *	   many such domains as possible, each as small as possible.   * doms  - Conversion of 'csa' to an array of cpumasks, for passing to - *	   the kernel/sched.c routine partition_sched_domains() in a + *	   the kernel/sched/core.c routine partition_sched_domains() in a   *	   convenient format, that can be easily compared to the prior   *	   value to determine what partition elements (sched domains)   *	   were changed (added or removed.) @@ -798,21 +791,43 @@ void rebuild_sched_domains(void)  	mutex_unlock(&cpuset_mutex);  } -/** - * cpuset_test_cpumask - test a task's cpus_allowed versus its cpuset's - * @tsk: task to test - * @scan: struct cgroup_scanner contained in its struct cpuset_hotplug_scanner +/* + * effective_cpumask_cpuset - return nearest ancestor with non-empty cpus + * @cs: the cpuset in interest   * - * Call with cpuset_mutex held.  May take callback_mutex during call. - * Called for each task in a cgroup by cgroup_scan_tasks(). - * Return nonzero if this tasks's cpus_allowed mask should be changed (in other - * words, if its mask is not equal to its cpuset's mask). + * A cpuset's effective cpumask is the cpumask of the nearest ancestor + * with non-empty cpus. We use effective cpumask whenever: + * - we update tasks' cpus_allowed. (they take on the ancestor's cpumask + *   if the cpuset they reside in has no cpus) + * - we want to retrieve task_cs(tsk)'s cpus_allowed. + * + * Called with cpuset_mutex held. cpuset_cpus_allowed_fallback() is an + * exception. See comments there.   
*/ -static int cpuset_test_cpumask(struct task_struct *tsk, -			       struct cgroup_scanner *scan) +static struct cpuset *effective_cpumask_cpuset(struct cpuset *cs)  { -	return !cpumask_equal(&tsk->cpus_allowed, -			(cgroup_cs(scan->cg))->cpus_allowed); +	while (cpumask_empty(cs->cpus_allowed)) +		cs = parent_cs(cs); +	return cs; +} + +/* + * effective_nodemask_cpuset - return nearest ancestor with non-empty mems + * @cs: the cpuset in interest + * + * A cpuset's effective nodemask is the nodemask of the nearest ancestor + * with non-empty mems. We use effective nodemask whenever: + * - we update tasks' mems_allowed. (they take on the ancestor's nodemask + *   if the cpuset they reside in has no mems) + * - we want to retrieve task_cs(tsk)'s mems_allowed. + * + * Called with cpuset_mutex held. + */ +static struct cpuset *effective_nodemask_cpuset(struct cpuset *cs) +{ +	while (nodes_empty(cs->mems_allowed)) +		cs = parent_cs(cs); +	return cs;  }  /** @@ -829,7 +844,10 @@ static int cpuset_test_cpumask(struct task_struct *tsk,  static void cpuset_change_cpumask(struct task_struct *tsk,  				  struct cgroup_scanner *scan)  { -	set_cpus_allowed_ptr(tsk, ((cgroup_cs(scan->cg))->cpus_allowed)); +	struct cpuset *cpus_cs; + +	cpus_cs = effective_cpumask_cpuset(cgroup_cs(scan->cg)); +	set_cpus_allowed_ptr(tsk, cpus_cs->cpus_allowed);  }  /** @@ -850,12 +868,51 @@ static void update_tasks_cpumask(struct cpuset *cs, struct ptr_heap *heap)  	struct cgroup_scanner scan;  	scan.cg = cs->css.cgroup; -	scan.test_task = cpuset_test_cpumask; +	scan.test_task = NULL;  	scan.process_task = cpuset_change_cpumask;  	scan.heap = heap;  	cgroup_scan_tasks(&scan);  } +/* + * update_tasks_cpumask_hier - Update the cpumasks of tasks in the hierarchy. + * @root_cs: the root cpuset of the hierarchy + * @update_root: update root cpuset or not? 
+ * @heap: the heap used by cgroup_scan_tasks() + * + * This will update cpumasks of tasks in @root_cs and all other empty cpusets + * which take on cpumask of @root_cs. + * + * Called with cpuset_mutex held + */ +static void update_tasks_cpumask_hier(struct cpuset *root_cs, +				      bool update_root, struct ptr_heap *heap) +{ +	struct cpuset *cp; +	struct cgroup *pos_cgrp; + +	if (update_root) +		update_tasks_cpumask(root_cs, heap); + +	rcu_read_lock(); +	cpuset_for_each_descendant_pre(cp, pos_cgrp, root_cs) { +		/* skip the whole subtree if @cp have some CPU */ +		if (!cpumask_empty(cp->cpus_allowed)) { +			pos_cgrp = cgroup_rightmost_descendant(pos_cgrp); +			continue; +		} +		if (!css_tryget(&cp->css)) +			continue; +		rcu_read_unlock(); + +		update_tasks_cpumask(cp, heap); + +		rcu_read_lock(); +		css_put(&cp->css); +	} +	rcu_read_unlock(); +} +  /**   * update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it   * @cs: the cpuset to consider @@ -888,14 +945,15 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,  		if (!cpumask_subset(trialcs->cpus_allowed, cpu_active_mask))  			return -EINVAL;  	} -	retval = validate_change(cs, trialcs); -	if (retval < 0) -		return retval;  	/* Nothing to do if the cpus didn't change */  	if (cpumask_equal(cs->cpus_allowed, trialcs->cpus_allowed))  		return 0; +	retval = validate_change(cs, trialcs); +	if (retval < 0) +		return retval; +  	retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL);  	if (retval)  		return retval; @@ -906,11 +964,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,  	cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed);  	mutex_unlock(&callback_mutex); -	/* -	 * Scan tasks in the cpuset, and update the cpumasks of any -	 * that need an update. 
-	 */ -	update_tasks_cpumask(cs, &heap); +	update_tasks_cpumask_hier(cs, true, &heap);  	heap_free(&heap); @@ -943,12 +997,14 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,  							const nodemask_t *to)  {  	struct task_struct *tsk = current; +	struct cpuset *mems_cs;  	tsk->mems_allowed = *to;  	do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL); -	guarantee_online_mems(task_cs(tsk),&tsk->mems_allowed); +	mems_cs = effective_nodemask_cpuset(task_cs(tsk)); +	guarantee_online_mems(mems_cs, &tsk->mems_allowed);  }  /* @@ -1007,16 +1063,12 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk,  static void cpuset_change_nodemask(struct task_struct *p,  				   struct cgroup_scanner *scan)  { +	struct cpuset *cs = cgroup_cs(scan->cg);  	struct mm_struct *mm; -	struct cpuset *cs;  	int migrate; -	const nodemask_t *oldmem = scan->data; -	static nodemask_t newmems;	/* protected by cpuset_mutex */ - -	cs = cgroup_cs(scan->cg); -	guarantee_online_mems(cs, &newmems); +	nodemask_t *newmems = scan->data; -	cpuset_change_task_nodemask(p, &newmems); +	cpuset_change_task_nodemask(p, newmems);  	mm = get_task_mm(p);  	if (!mm) @@ -1026,7 +1078,7 @@ static void cpuset_change_nodemask(struct task_struct *p,  	mpol_rebind_mm(mm, &cs->mems_allowed);  	if (migrate) -		cpuset_migrate_mm(mm, oldmem, &cs->mems_allowed); +		cpuset_migrate_mm(mm, &cs->old_mems_allowed, newmems);  	mmput(mm);  } @@ -1035,25 +1087,27 @@ static void *cpuset_being_rebound;  /**   * update_tasks_nodemask - Update the nodemasks of tasks in the cpuset.   * @cs: the cpuset in which each task's mems_allowed mask needs to be changed - * @oldmem: old mems_allowed of cpuset cs   * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks()   *   * Called with cpuset_mutex held   * No return value. It's guaranteed that cgroup_scan_tasks() always returns 0   * if @heap != NULL.   
*/ -static void update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem, -				 struct ptr_heap *heap) +static void update_tasks_nodemask(struct cpuset *cs, struct ptr_heap *heap)  { +	static nodemask_t newmems;	/* protected by cpuset_mutex */  	struct cgroup_scanner scan; +	struct cpuset *mems_cs = effective_nodemask_cpuset(cs);  	cpuset_being_rebound = cs;		/* causes mpol_dup() rebind */ +	guarantee_online_mems(mems_cs, &newmems); +  	scan.cg = cs->css.cgroup;  	scan.test_task = NULL;  	scan.process_task = cpuset_change_nodemask;  	scan.heap = heap; -	scan.data = (nodemask_t *)oldmem; +	scan.data = &newmems;  	/*  	 * The mpol_rebind_mm() call takes mmap_sem, which we couldn't @@ -1067,11 +1121,56 @@ static void update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem,  	 */  	cgroup_scan_tasks(&scan); +	/* +	 * All the tasks' nodemasks have been updated, update +	 * cs->old_mems_allowed. +	 */ +	cs->old_mems_allowed = newmems; +  	/* We're done rebinding vmas to this cpuset's new mems_allowed. */  	cpuset_being_rebound = NULL;  }  /* + * update_tasks_nodemask_hier - Update the nodemasks of tasks in the hierarchy. + * @cs: the root cpuset of the hierarchy + * @update_root: update the root cpuset or not? + * @heap: the heap used by cgroup_scan_tasks() + * + * This will update nodemasks of tasks in @root_cs and all other empty cpusets + * which take on nodemask of @root_cs. 
+ * + * Called with cpuset_mutex held + */ +static void update_tasks_nodemask_hier(struct cpuset *root_cs, +				       bool update_root, struct ptr_heap *heap) +{ +	struct cpuset *cp; +	struct cgroup *pos_cgrp; + +	if (update_root) +		update_tasks_nodemask(root_cs, heap); + +	rcu_read_lock(); +	cpuset_for_each_descendant_pre(cp, pos_cgrp, root_cs) { +		/* skip the whole subtree if @cp has some memory nodes */ +		if (!nodes_empty(cp->mems_allowed)) { +			pos_cgrp = cgroup_rightmost_descendant(pos_cgrp); +			continue; +		} +		if (!css_tryget(&cp->css)) +			continue; +		rcu_read_unlock(); + +		update_tasks_nodemask(cp, heap); + +		rcu_read_lock(); +		css_put(&cp->css); +	} +	rcu_read_unlock(); +} + +/*   * Handle user request to change the 'mems' memory placement   * of a cpuset.  Needs to validate the request, update the   * cpusets mems_allowed, and for each task in the cpuset, @@ -1087,13 +1186,9 @@ static void update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem,  static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,  			   const char *buf)  { -	NODEMASK_ALLOC(nodemask_t, oldmem, GFP_KERNEL);  	int retval;  	struct ptr_heap heap; -	if (!oldmem) -		return -ENOMEM; -  	/*  	 * top_cpuset.mems_allowed tracks node_stats[N_MEMORY];  	 * it's read-only @@ -1122,8 +1217,8 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,  			goto done;  		}  	} -	*oldmem = cs->mems_allowed; -	if (nodes_equal(*oldmem, trialcs->mems_allowed)) { + +	if (nodes_equal(cs->mems_allowed, trialcs->mems_allowed)) {  		retval = 0;		/* Too easy - nothing to do */  		goto done;  	} @@ -1139,11 +1234,10 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,  	cs->mems_allowed = trialcs->mems_allowed;  	mutex_unlock(&callback_mutex); -	update_tasks_nodemask(cs, oldmem, &heap); +	update_tasks_nodemask_hier(cs, true, &heap);  	heap_free(&heap);  done: -	NODEMASK_FREE(oldmem);  	return retval;  } @@ -1372,8 +1466,13 @@ static int 
cpuset_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)  	mutex_lock(&cpuset_mutex); +	/* +	 * We allow to move tasks into an empty cpuset if sane_behavior +	 * flag is set. +	 */  	ret = -ENOSPC; -	if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)) +	if (!cgroup_sane_behavior(cgrp) && +	    (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)))  		goto out_unlock;  	cgroup_taskset_for_each(task, cgrp, tset) { @@ -1422,8 +1521,7 @@ static cpumask_var_t cpus_attach;  static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)  { -	/* static bufs protected by cpuset_mutex */ -	static nodemask_t cpuset_attach_nodemask_from; +	/* static buf protected by cpuset_mutex */  	static nodemask_t cpuset_attach_nodemask_to;  	struct mm_struct *mm;  	struct task_struct *task; @@ -1431,6 +1529,8 @@ static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)  	struct cgroup *oldcgrp = cgroup_taskset_cur_cgroup(tset);  	struct cpuset *cs = cgroup_cs(cgrp);  	struct cpuset *oldcs = cgroup_cs(oldcgrp); +	struct cpuset *cpus_cs = effective_cpumask_cpuset(cs); +	struct cpuset *mems_cs = effective_nodemask_cpuset(cs);  	mutex_lock(&cpuset_mutex); @@ -1438,9 +1538,9 @@ static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)  	if (cs == &top_cpuset)  		cpumask_copy(cpus_attach, cpu_possible_mask);  	else -		guarantee_online_cpus(cs, cpus_attach); +		guarantee_online_cpus(cpus_cs, cpus_attach); -	guarantee_online_mems(cs, &cpuset_attach_nodemask_to); +	guarantee_online_mems(mems_cs, &cpuset_attach_nodemask_to);  	cgroup_taskset_for_each(task, cgrp, tset) {  		/* @@ -1457,26 +1557,32 @@ static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)  	 * Change mm, possibly for multiple threads in a threadgroup. This is  	 * expensive and may sleep.  	 
*/ -	cpuset_attach_nodemask_from = oldcs->mems_allowed;  	cpuset_attach_nodemask_to = cs->mems_allowed;  	mm = get_task_mm(leader);  	if (mm) { +		struct cpuset *mems_oldcs = effective_nodemask_cpuset(oldcs); +  		mpol_rebind_mm(mm, &cpuset_attach_nodemask_to); -		if (is_memory_migrate(cs)) -			cpuset_migrate_mm(mm, &cpuset_attach_nodemask_from, + +		/* +		 * old_mems_allowed is the same with mems_allowed here, except +		 * if this task is being moved automatically due to hotplug. +		 * In that case @mems_allowed has been updated and is empty, +		 * so @old_mems_allowed is the right nodesets that we migrate +		 * mm from. +		 */ +		if (is_memory_migrate(cs)) { +			cpuset_migrate_mm(mm, &mems_oldcs->old_mems_allowed,  					  &cpuset_attach_nodemask_to); +		}  		mmput(mm);  	} -	cs->attach_in_progress--; +	cs->old_mems_allowed = cpuset_attach_nodemask_to; -	/* -	 * We may have raced with CPU/memory hotunplug.  Trigger hotplug -	 * propagation if @cs doesn't have any CPU or memory.  It will move -	 * the newly added tasks to the nearest parent which can execute. -	 */ -	if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)) -		schedule_cpuset_propagate_hotplug(cs); +	cs->attach_in_progress--; +	if (!cs->attach_in_progress) +		wake_up(&cpuset_attach_wq);  	mutex_unlock(&cpuset_mutex);  } @@ -1588,13 +1694,8 @@ static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft,  	 * resources, wait for the previously scheduled operations before  	 * proceeding, so that we don't end up keep removing tasks added  	 * after execution capability is restored. -	 * -	 * Flushing cpuset_hotplug_work is enough to synchronize against -	 * hotplug hanlding; however, cpuset_attach() may schedule -	 * propagation work directly.  Flush the workqueue too.  	 
*/  	flush_work(&cpuset_hotplug_work); -	flush_workqueue(cpuset_propagate_hotplug_wq);  	mutex_lock(&cpuset_mutex);  	if (!is_cpuset_online(cs)) @@ -1658,13 +1759,13 @@ static size_t cpuset_sprintf_memlist(char *page, struct cpuset *cs)  	return count;  } -static ssize_t cpuset_common_file_read(struct cgroup *cont, +static ssize_t cpuset_common_file_read(struct cgroup *cgrp,  				       struct cftype *cft,  				       struct file *file,  				       char __user *buf,  				       size_t nbytes, loff_t *ppos)  { -	struct cpuset *cs = cgroup_cs(cont); +	struct cpuset *cs = cgroup_cs(cgrp);  	cpuset_filetype_t type = cft->private;  	char *page;  	ssize_t retval = 0; @@ -1694,9 +1795,9 @@ out:  	return retval;  } -static u64 cpuset_read_u64(struct cgroup *cont, struct cftype *cft) +static u64 cpuset_read_u64(struct cgroup *cgrp, struct cftype *cft)  { -	struct cpuset *cs = cgroup_cs(cont); +	struct cpuset *cs = cgroup_cs(cgrp);  	cpuset_filetype_t type = cft->private;  	switch (type) {  	case FILE_CPU_EXCLUSIVE: @@ -1725,9 +1826,9 @@ static u64 cpuset_read_u64(struct cgroup *cont, struct cftype *cft)  	return 0;  } -static s64 cpuset_read_s64(struct cgroup *cont, struct cftype *cft) +static s64 cpuset_read_s64(struct cgroup *cgrp, struct cftype *cft)  { -	struct cpuset *cs = cgroup_cs(cont); +	struct cpuset *cs = cgroup_cs(cgrp);  	cpuset_filetype_t type = cft->private;  	switch (type) {  	case FILE_SCHED_RELAX_DOMAIN_LEVEL: @@ -1839,14 +1940,14 @@ static struct cftype files[] = {  /*   *	cpuset_css_alloc - allocate a cpuset css - *	cont:	control group that the new cpuset will be part of + *	cgrp:	control group that the new cpuset will be part of   */ -static struct cgroup_subsys_state *cpuset_css_alloc(struct cgroup *cont) +static struct cgroup_subsys_state *cpuset_css_alloc(struct cgroup *cgrp)  {  	struct cpuset *cs; -	if (!cont->parent) +	if (!cgrp->parent)  		return &top_cpuset.css;  	cs = kzalloc(sizeof(*cs), GFP_KERNEL); @@ -1861,7 +1962,6 @@ static struct 
cgroup_subsys_state *cpuset_css_alloc(struct cgroup *cont)  	cpumask_clear(cs->cpus_allowed);  	nodes_clear(cs->mems_allowed);  	fmeter_init(&cs->fmeter); -	INIT_WORK(&cs->hotplug_work, cpuset_propagate_hotplug_workfn);  	cs->relax_domain_level = -1;  	return &cs->css; @@ -1942,9 +2042,9 @@ static void cpuset_css_offline(struct cgroup *cgrp)   * will call rebuild_sched_domains_locked().   */ -static void cpuset_css_free(struct cgroup *cont) +static void cpuset_css_free(struct cgroup *cgrp)  { -	struct cpuset *cs = cgroup_cs(cont); +	struct cpuset *cs = cgroup_cs(cgrp);  	free_cpumask_var(cs->cpus_allowed);  	kfree(cs); @@ -2024,41 +2124,64 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs)  }  /** - * cpuset_propagate_hotplug_workfn - propagate CPU/memory hotplug to a cpuset + * cpuset_hotplug_update_tasks - update tasks in a cpuset for hotunplug   * @cs: cpuset in interest   *   * Compare @cs's cpu and mem masks against top_cpuset and if some have gone   * offline, update @cs accordingly.  If @cs ends up with no CPU or memory,   * all its tasks are moved to the nearest ancestor with both resources.   */ -static void cpuset_propagate_hotplug_workfn(struct work_struct *work) +static void cpuset_hotplug_update_tasks(struct cpuset *cs)  {  	static cpumask_t off_cpus; -	static nodemask_t off_mems, tmp_mems; -	struct cpuset *cs = container_of(work, struct cpuset, hotplug_work); +	static nodemask_t off_mems;  	bool is_empty; +	bool sane = cgroup_sane_behavior(cs->css.cgroup); + +retry: +	wait_event(cpuset_attach_wq, cs->attach_in_progress == 0);  	mutex_lock(&cpuset_mutex); +	/* +	 * We have raced with task attaching. We wait until attaching +	 * is finished, so we won't attach a task to an empty cpuset. 
+	 */ +	if (cs->attach_in_progress) { +		mutex_unlock(&cpuset_mutex); +		goto retry; +	} +  	cpumask_andnot(&off_cpus, cs->cpus_allowed, top_cpuset.cpus_allowed);  	nodes_andnot(off_mems, cs->mems_allowed, top_cpuset.mems_allowed); -	/* remove offline cpus from @cs */ -	if (!cpumask_empty(&off_cpus)) { -		mutex_lock(&callback_mutex); -		cpumask_andnot(cs->cpus_allowed, cs->cpus_allowed, &off_cpus); -		mutex_unlock(&callback_mutex); +	mutex_lock(&callback_mutex); +	cpumask_andnot(cs->cpus_allowed, cs->cpus_allowed, &off_cpus); +	mutex_unlock(&callback_mutex); + +	/* +	 * If sane_behavior flag is set, we need to update tasks' cpumask +	 * for empty cpuset to take on ancestor's cpumask. Otherwise, don't +	 * call update_tasks_cpumask() if the cpuset becomes empty, as +	 * the tasks in it will be migrated to an ancestor. +	 */ +	if ((sane && cpumask_empty(cs->cpus_allowed)) || +	    (!cpumask_empty(&off_cpus) && !cpumask_empty(cs->cpus_allowed)))  		update_tasks_cpumask(cs, NULL); -	} -	/* remove offline mems from @cs */ -	if (!nodes_empty(off_mems)) { -		tmp_mems = cs->mems_allowed; -		mutex_lock(&callback_mutex); -		nodes_andnot(cs->mems_allowed, cs->mems_allowed, off_mems); -		mutex_unlock(&callback_mutex); -		update_tasks_nodemask(cs, &tmp_mems, NULL); -	} +	mutex_lock(&callback_mutex); +	nodes_andnot(cs->mems_allowed, cs->mems_allowed, off_mems); +	mutex_unlock(&callback_mutex); + +	/* +	 * If sane_behavior flag is set, we need to update tasks' nodemask +	 * for empty cpuset to take on ancestor's nodemask. Otherwise, don't +	 * call update_tasks_nodemask() if the cpuset becomes empty, as +	 * the tasks in it will be migrated to an ancestor. 
+	 */ +	if ((sane && nodes_empty(cs->mems_allowed)) || +	    (!nodes_empty(off_mems) && !nodes_empty(cs->mems_allowed))) +		update_tasks_nodemask(cs, NULL);  	is_empty = cpumask_empty(cs->cpus_allowed) ||  		nodes_empty(cs->mems_allowed); @@ -2066,40 +2189,14 @@ static void cpuset_propagate_hotplug_workfn(struct work_struct *work)  	mutex_unlock(&cpuset_mutex);  	/* -	 * If @cs became empty, move tasks to the nearest ancestor with -	 * execution resources.  This is full cgroup operation which will +	 * If sane_behavior flag is set, we'll keep tasks in empty cpusets. +	 * +	 * Otherwise move tasks to the nearest ancestor with execution +	 * resources.  This is full cgroup operation which will  	 * also call back into cpuset.  Should be done outside any lock.  	 */ -	if (is_empty) +	if (!sane && is_empty)  		remove_tasks_in_empty_cpuset(cs); - -	/* the following may free @cs, should be the last operation */ -	css_put(&cs->css); -} - -/** - * schedule_cpuset_propagate_hotplug - schedule hotplug propagation to a cpuset - * @cs: cpuset of interest - * - * Schedule cpuset_propagate_hotplug_workfn() which will update CPU and - * memory masks according to top_cpuset. - */ -static void schedule_cpuset_propagate_hotplug(struct cpuset *cs) -{ -	/* -	 * Pin @cs.  The refcnt will be released when the work item -	 * finishes executing. -	 */ -	if (!css_tryget(&cs->css)) -		return; - -	/* -	 * Queue @cs->hotplug_work.  If already pending, lose the css ref. -	 * cpuset_propagate_hotplug_wq is ordered and propagation will -	 * happen in the order this function is called. -	 */ -	if (!queue_work(cpuset_propagate_hotplug_wq, &cs->hotplug_work)) -		css_put(&cs->css);  }  /** @@ -2112,18 +2209,17 @@ static void schedule_cpuset_propagate_hotplug(struct cpuset *cs)   * actively using CPU hotplug but making no active use of cpusets.   *   * Non-root cpusets are only affected by offlining.  
If any CPUs or memory - * nodes have been taken down, cpuset_propagate_hotplug() is invoked on all - * descendants. + * nodes have been taken down, cpuset_hotplug_update_tasks() is invoked on + * all descendants.   *   * Note that CPU offlining during suspend is ignored.  We don't modify   * cpusets across suspend/resume cycles at all.   */  static void cpuset_hotplug_workfn(struct work_struct *work)  { -	static cpumask_t new_cpus, tmp_cpus; -	static nodemask_t new_mems, tmp_mems; +	static cpumask_t new_cpus; +	static nodemask_t new_mems;  	bool cpus_updated, mems_updated; -	bool cpus_offlined, mems_offlined;  	mutex_lock(&cpuset_mutex); @@ -2132,12 +2228,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work)  	new_mems = node_states[N_MEMORY];  	cpus_updated = !cpumask_equal(top_cpuset.cpus_allowed, &new_cpus); -	cpus_offlined = cpumask_andnot(&tmp_cpus, top_cpuset.cpus_allowed, -				       &new_cpus); -  	mems_updated = !nodes_equal(top_cpuset.mems_allowed, new_mems); -	nodes_andnot(tmp_mems, top_cpuset.mems_allowed, new_mems); -	mems_offlined = !nodes_empty(tmp_mems);  	/* synchronize cpus_allowed to cpu_active_mask */  	if (cpus_updated) { @@ -2149,28 +2240,32 @@ static void cpuset_hotplug_workfn(struct work_struct *work)  	/* synchronize mems_allowed to N_MEMORY */  	if (mems_updated) { -		tmp_mems = top_cpuset.mems_allowed;  		mutex_lock(&callback_mutex);  		top_cpuset.mems_allowed = new_mems;  		mutex_unlock(&callback_mutex); -		update_tasks_nodemask(&top_cpuset, &tmp_mems, NULL); +		update_tasks_nodemask(&top_cpuset, NULL);  	} -	/* if cpus or mems went down, we need to propagate to descendants */ -	if (cpus_offlined || mems_offlined) { +	mutex_unlock(&cpuset_mutex); + +	/* if cpus or mems changed, we need to propagate to descendants */ +	if (cpus_updated || mems_updated) {  		struct cpuset *cs;  		struct cgroup *pos_cgrp;  		rcu_read_lock(); -		cpuset_for_each_descendant_pre(cs, pos_cgrp, &top_cpuset) -			schedule_cpuset_propagate_hotplug(cs); -		
rcu_read_unlock(); -	} +		cpuset_for_each_descendant_pre(cs, pos_cgrp, &top_cpuset) { +			if (!css_tryget(&cs->css)) +				continue; +			rcu_read_unlock(); -	mutex_unlock(&cpuset_mutex); +			cpuset_hotplug_update_tasks(cs); -	/* wait for propagations to finish */ -	flush_workqueue(cpuset_propagate_hotplug_wq); +			rcu_read_lock(); +			css_put(&cs->css); +		} +		rcu_read_unlock(); +	}  	/* rebuild sched domains if cpus_allowed has changed */  	if (cpus_updated) @@ -2219,12 +2314,9 @@ void __init cpuset_init_smp(void)  {  	cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask);  	top_cpuset.mems_allowed = node_states[N_MEMORY]; +	top_cpuset.old_mems_allowed = top_cpuset.mems_allowed;  	register_hotmemory_notifier(&cpuset_track_online_nodes_nb); - -	cpuset_propagate_hotplug_wq = -		alloc_ordered_workqueue("cpuset_hotplug", 0); -	BUG_ON(!cpuset_propagate_hotplug_wq);  }  /** @@ -2240,21 +2332,23 @@ void __init cpuset_init_smp(void)  void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)  { +	struct cpuset *cpus_cs; +  	mutex_lock(&callback_mutex);  	task_lock(tsk); -	guarantee_online_cpus(task_cs(tsk), pmask); +	cpus_cs = effective_cpumask_cpuset(task_cs(tsk)); +	guarantee_online_cpus(cpus_cs, pmask);  	task_unlock(tsk);  	mutex_unlock(&callback_mutex);  }  void cpuset_cpus_allowed_fallback(struct task_struct *tsk)  { -	const struct cpuset *cs; +	const struct cpuset *cpus_cs;  	rcu_read_lock(); -	cs = task_cs(tsk); -	if (cs) -		do_set_cpus_allowed(tsk, cs->cpus_allowed); +	cpus_cs = effective_cpumask_cpuset(task_cs(tsk)); +	do_set_cpus_allowed(tsk, cpus_cs->cpus_allowed);  	rcu_read_unlock();  	/* @@ -2293,11 +2387,13 @@ void cpuset_init_current_mems_allowed(void)  nodemask_t cpuset_mems_allowed(struct task_struct *tsk)  { +	struct cpuset *mems_cs;  	nodemask_t mask;  	mutex_lock(&callback_mutex);  	task_lock(tsk); -	guarantee_online_mems(task_cs(tsk), &mask); +	mems_cs = effective_nodemask_cpuset(task_cs(tsk)); +	guarantee_online_mems(mems_cs, &mask); 
 	task_unlock(tsk);  	mutex_unlock(&callback_mutex); diff --git a/kernel/events/core.c b/kernel/events/core.c index 9dc297faf7c0..1db3af933704 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -165,10 +165,28 @@ int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024); /* 'free'  /*   * max perf event sample rate   */ -#define DEFAULT_MAX_SAMPLE_RATE 100000 -int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE; -static int max_samples_per_tick __read_mostly = -	DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ); +#define DEFAULT_MAX_SAMPLE_RATE		100000 +#define DEFAULT_SAMPLE_PERIOD_NS	(NSEC_PER_SEC / DEFAULT_MAX_SAMPLE_RATE) +#define DEFAULT_CPU_TIME_MAX_PERCENT	25 + +int sysctl_perf_event_sample_rate __read_mostly	= DEFAULT_MAX_SAMPLE_RATE; + +static int max_samples_per_tick __read_mostly	= DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ); +static int perf_sample_period_ns __read_mostly	= DEFAULT_SAMPLE_PERIOD_NS; + +static atomic_t perf_sample_allowed_ns __read_mostly = +	ATOMIC_INIT( DEFAULT_SAMPLE_PERIOD_NS * DEFAULT_CPU_TIME_MAX_PERCENT / 100); + +void update_perf_cpu_limits(void) +{ +	u64 tmp = perf_sample_period_ns; + +	tmp *= sysctl_perf_cpu_time_max_percent; +	tmp = do_div(tmp, 100); +	atomic_set(&perf_sample_allowed_ns, tmp); +} + +static int perf_rotate_context(struct perf_cpu_context *cpuctx);  int perf_proc_update_handler(struct ctl_table *table, int write,  		void __user *buffer, size_t *lenp, @@ -180,10 +198,78 @@ int perf_proc_update_handler(struct ctl_table *table, int write,  		return ret;  	max_samples_per_tick = DIV_ROUND_UP(sysctl_perf_event_sample_rate, HZ); +	perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate; +	update_perf_cpu_limits();  	return 0;  } +int sysctl_perf_cpu_time_max_percent __read_mostly = DEFAULT_CPU_TIME_MAX_PERCENT; + +int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write, +				void __user *buffer, size_t *lenp, +				loff_t *ppos) +{ +	int ret = 
proc_dointvec(table, write, buffer, lenp, ppos); + +	if (ret || !write) +		return ret; + +	update_perf_cpu_limits(); + +	return 0; +} + +/* + * perf samples are done in some very critical code paths (NMIs). + * If they take too much CPU time, the system can lock up and not + * get any real work done.  This will drop the sample rate when + * we detect that events are taking too long. + */ +#define NR_ACCUMULATED_SAMPLES 128 +DEFINE_PER_CPU(u64, running_sample_length); + +void perf_sample_event_took(u64 sample_len_ns) +{ +	u64 avg_local_sample_len; +	u64 local_samples_len = __get_cpu_var(running_sample_length); + +	if (atomic_read(&perf_sample_allowed_ns) == 0) +		return; + +	/* decay the counter by 1 average sample */ +	local_samples_len = __get_cpu_var(running_sample_length); +	local_samples_len -= local_samples_len/NR_ACCUMULATED_SAMPLES; +	local_samples_len += sample_len_ns; +	__get_cpu_var(running_sample_length) = local_samples_len; + +	/* +	 * note: this will be biased artifically low until we have +	 * seen NR_ACCUMULATED_SAMPLES.  Doing it this way keeps us +	 * from having to maintain a count. 
+	 */ +	avg_local_sample_len = local_samples_len/NR_ACCUMULATED_SAMPLES; + +	if (avg_local_sample_len <= atomic_read(&perf_sample_allowed_ns)) +		return; + +	if (max_samples_per_tick <= 1) +		return; + +	max_samples_per_tick = DIV_ROUND_UP(max_samples_per_tick, 2); +	sysctl_perf_event_sample_rate = max_samples_per_tick * HZ; +	perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate; + +	printk_ratelimited(KERN_WARNING +			"perf samples too long (%lld > %d), lowering " +			"kernel.perf_event_max_sample_rate to %d\n", +			avg_local_sample_len, +			atomic_read(&perf_sample_allowed_ns), +			sysctl_perf_event_sample_rate); + +	update_perf_cpu_limits(); +} +  static atomic64_t perf_event_id;  static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx, @@ -196,9 +282,6 @@ static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,  static void update_context_time(struct perf_event_context *ctx);  static u64 perf_event_time(struct perf_event *event); -static void ring_buffer_attach(struct perf_event *event, -			       struct ring_buffer *rb); -  void __weak perf_event_print_debug(void)	{ }  extern __weak const char *perf_pmu_name(void) @@ -658,6 +741,106 @@ perf_cgroup_mark_enabled(struct perf_event *event,  }  #endif +/* + * set default to be dependent on timer tick just + * like original code + */ +#define PERF_CPU_HRTIMER (1000 / HZ) +/* + * function must be called with interrupts disbled + */ +static enum hrtimer_restart perf_cpu_hrtimer_handler(struct hrtimer *hr) +{ +	struct perf_cpu_context *cpuctx; +	enum hrtimer_restart ret = HRTIMER_NORESTART; +	int rotations = 0; + +	WARN_ON(!irqs_disabled()); + +	cpuctx = container_of(hr, struct perf_cpu_context, hrtimer); + +	rotations = perf_rotate_context(cpuctx); + +	/* +	 * arm timer if needed +	 */ +	if (rotations) { +		hrtimer_forward_now(hr, cpuctx->hrtimer_interval); +		ret = HRTIMER_RESTART; +	} + +	return ret; +} + +/* CPU is going down */ +void perf_cpu_hrtimer_cancel(int cpu) +{ +	struct 
perf_cpu_context *cpuctx; +	struct pmu *pmu; +	unsigned long flags; + +	if (WARN_ON(cpu != smp_processor_id())) +		return; + +	local_irq_save(flags); + +	rcu_read_lock(); + +	list_for_each_entry_rcu(pmu, &pmus, entry) { +		cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); + +		if (pmu->task_ctx_nr == perf_sw_context) +			continue; + +		hrtimer_cancel(&cpuctx->hrtimer); +	} + +	rcu_read_unlock(); + +	local_irq_restore(flags); +} + +static void __perf_cpu_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu) +{ +	struct hrtimer *hr = &cpuctx->hrtimer; +	struct pmu *pmu = cpuctx->ctx.pmu; +	int timer; + +	/* no multiplexing needed for SW PMU */ +	if (pmu->task_ctx_nr == perf_sw_context) +		return; + +	/* +	 * check default is sane, if not set then force to +	 * default interval (1/tick) +	 */ +	timer = pmu->hrtimer_interval_ms; +	if (timer < 1) +		timer = pmu->hrtimer_interval_ms = PERF_CPU_HRTIMER; + +	cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer); + +	hrtimer_init(hr, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED); +	hr->function = perf_cpu_hrtimer_handler; +} + +static void perf_cpu_hrtimer_restart(struct perf_cpu_context *cpuctx) +{ +	struct hrtimer *hr = &cpuctx->hrtimer; +	struct pmu *pmu = cpuctx->ctx.pmu; + +	/* not for SW PMU */ +	if (pmu->task_ctx_nr == perf_sw_context) +		return; + +	if (hrtimer_active(hr)) +		return; + +	if (!hrtimer_callback_running(hr)) +		__hrtimer_start_range_ns(hr, cpuctx->hrtimer_interval, +					 0, HRTIMER_MODE_REL_PINNED, 0); +} +  void perf_pmu_disable(struct pmu *pmu)  {  	int *count = this_cpu_ptr(pmu->pmu_disable_count); @@ -1506,6 +1689,7 @@ group_sched_in(struct perf_event *group_event,  	if (event_sched_in(group_event, cpuctx, ctx)) {  		pmu->cancel_txn(pmu); +		perf_cpu_hrtimer_restart(cpuctx);  		return -EAGAIN;  	} @@ -1552,6 +1736,8 @@ group_error:  	pmu->cancel_txn(pmu); +	perf_cpu_hrtimer_restart(cpuctx); +  	return -EAGAIN;  } @@ -1807,8 +1993,10 @@ static int __perf_event_enable(void *info)  		 * If this event 
can't go on and it's part of a  		 * group, then the whole group has to come off.  		 */ -		if (leader != event) +		if (leader != event) {  			group_sched_out(leader, cpuctx, ctx); +			perf_cpu_hrtimer_restart(cpuctx); +		}  		if (leader->attr.pinned) {  			update_group_times(leader);  			leader->state = PERF_EVENT_STATE_ERROR; @@ -2555,7 +2743,7 @@ static void rotate_ctx(struct perf_event_context *ctx)   * because they're strictly cpu affine and rotate_start is called with IRQs   * disabled, while rotate_context is called from IRQ context.   */ -static void perf_rotate_context(struct perf_cpu_context *cpuctx) +static int perf_rotate_context(struct perf_cpu_context *cpuctx)  {  	struct perf_event_context *ctx = NULL;  	int rotate = 0, remove = 1; @@ -2594,6 +2782,8 @@ static void perf_rotate_context(struct perf_cpu_context *cpuctx)  done:  	if (remove)  		list_del_init(&cpuctx->rotation_list); + +	return rotate;  }  #ifdef CONFIG_NO_HZ_FULL @@ -2625,10 +2815,6 @@ void perf_event_task_tick(void)  		ctx = cpuctx->task_ctx;  		if (ctx)  			perf_adjust_freq_unthr_context(ctx, throttled); - -		if (cpuctx->jiffies_interval == 1 || -				!(jiffies % cpuctx->jiffies_interval)) -			perf_rotate_context(cpuctx);  	}  } @@ -2918,6 +3104,7 @@ static void free_event_rcu(struct rcu_head *head)  }  static void ring_buffer_put(struct ring_buffer *rb); +static void ring_buffer_detach(struct perf_event *event, struct ring_buffer *rb);  static void free_event(struct perf_event *event)  { @@ -2942,15 +3129,30 @@ static void free_event(struct perf_event *event)  		if (has_branch_stack(event)) {  			static_key_slow_dec_deferred(&perf_sched_events);  			/* is system-wide event */ -			if (!(event->attach_state & PERF_ATTACH_TASK)) +			if (!(event->attach_state & PERF_ATTACH_TASK)) {  				atomic_dec(&per_cpu(perf_branch_stack_events,  						    event->cpu)); +			}  		}  	}  	if (event->rb) { -		ring_buffer_put(event->rb); -		event->rb = NULL; +		struct ring_buffer *rb; + +		/* +		 * Can 
happen when we close an event with re-directed output. +		 * +		 * Since we have a 0 refcount, perf_mmap_close() will skip +		 * over us; possibly making our ring_buffer_put() the last. +		 */ +		mutex_lock(&event->mmap_mutex); +		rb = event->rb; +		if (rb) { +			rcu_assign_pointer(event->rb, NULL); +			ring_buffer_detach(event, rb); +			ring_buffer_put(rb); /* could be last */ +		} +		mutex_unlock(&event->mmap_mutex);  	}  	if (is_cgroup_event(event)) @@ -3188,30 +3390,13 @@ static unsigned int perf_poll(struct file *file, poll_table *wait)  	unsigned int events = POLL_HUP;  	/* -	 * Race between perf_event_set_output() and perf_poll(): perf_poll() -	 * grabs the rb reference but perf_event_set_output() overrides it. -	 * Here is the timeline for two threads T1, T2: -	 * t0: T1, rb = rcu_dereference(event->rb) -	 * t1: T2, old_rb = event->rb -	 * t2: T2, event->rb = new rb -	 * t3: T2, ring_buffer_detach(old_rb) -	 * t4: T1, ring_buffer_attach(rb1) -	 * t5: T1, poll_wait(event->waitq) -	 * -	 * To avoid this problem, we grab mmap_mutex in perf_poll() -	 * thereby ensuring that the assignment of the new ring buffer -	 * and the detachment of the old buffer appear atomic to perf_poll() +	 * Pin the event->rb by taking event->mmap_mutex; otherwise +	 * perf_event_set_output() can swizzle our rb and make us miss wakeups.  	 
*/  	mutex_lock(&event->mmap_mutex); - -	rcu_read_lock(); -	rb = rcu_dereference(event->rb); -	if (rb) { -		ring_buffer_attach(event, rb); +	rb = event->rb; +	if (rb)  		events = atomic_xchg(&rb->poll, 0); -	} -	rcu_read_unlock(); -  	mutex_unlock(&event->mmap_mutex);  	poll_wait(file, &event->waitq, wait); @@ -3521,16 +3706,12 @@ static void ring_buffer_attach(struct perf_event *event,  		return;  	spin_lock_irqsave(&rb->event_lock, flags); -	if (!list_empty(&event->rb_entry)) -		goto unlock; - -	list_add(&event->rb_entry, &rb->event_list); -unlock: +	if (list_empty(&event->rb_entry)) +		list_add(&event->rb_entry, &rb->event_list);  	spin_unlock_irqrestore(&rb->event_lock, flags);  } -static void ring_buffer_detach(struct perf_event *event, -			       struct ring_buffer *rb) +static void ring_buffer_detach(struct perf_event *event, struct ring_buffer *rb)  {  	unsigned long flags; @@ -3549,13 +3730,10 @@ static void ring_buffer_wakeup(struct perf_event *event)  	rcu_read_lock();  	rb = rcu_dereference(event->rb); -	if (!rb) -		goto unlock; - -	list_for_each_entry_rcu(event, &rb->event_list, rb_entry) -		wake_up_all(&event->waitq); - -unlock: +	if (rb) { +		list_for_each_entry_rcu(event, &rb->event_list, rb_entry) +			wake_up_all(&event->waitq); +	}  	rcu_read_unlock();  } @@ -3584,18 +3762,10 @@ static struct ring_buffer *ring_buffer_get(struct perf_event *event)  static void ring_buffer_put(struct ring_buffer *rb)  { -	struct perf_event *event, *n; -	unsigned long flags; -  	if (!atomic_dec_and_test(&rb->refcount))  		return; -	spin_lock_irqsave(&rb->event_lock, flags); -	list_for_each_entry_safe(event, n, &rb->event_list, rb_entry) { -		list_del_init(&event->rb_entry); -		wake_up_all(&event->waitq); -	} -	spin_unlock_irqrestore(&rb->event_lock, flags); +	WARN_ON_ONCE(!list_empty(&rb->event_list));  	call_rcu(&rb->rcu_head, rb_free_rcu);  } @@ -3605,26 +3775,100 @@ static void perf_mmap_open(struct vm_area_struct *vma)  	struct perf_event *event = 
vma->vm_file->private_data;  	atomic_inc(&event->mmap_count); +	atomic_inc(&event->rb->mmap_count);  } +/* + * A buffer can be mmap()ed multiple times; either directly through the same + * event, or through other events by use of perf_event_set_output(). + * + * In order to undo the VM accounting done by perf_mmap() we need to destroy + * the buffer here, where we still have a VM context. This means we need + * to detach all events redirecting to us. + */  static void perf_mmap_close(struct vm_area_struct *vma)  {  	struct perf_event *event = vma->vm_file->private_data; -	if (atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) { -		unsigned long size = perf_data_size(event->rb); -		struct user_struct *user = event->mmap_user; -		struct ring_buffer *rb = event->rb; +	struct ring_buffer *rb = event->rb; +	struct user_struct *mmap_user = rb->mmap_user; +	int mmap_locked = rb->mmap_locked; +	unsigned long size = perf_data_size(rb); + +	atomic_dec(&rb->mmap_count); + +	if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) +		return; -		atomic_long_sub((size >> PAGE_SHIFT) + 1, &user->locked_vm); -		vma->vm_mm->pinned_vm -= event->mmap_locked; -		rcu_assign_pointer(event->rb, NULL); -		ring_buffer_detach(event, rb); +	/* Detach current event from the buffer. */ +	rcu_assign_pointer(event->rb, NULL); +	ring_buffer_detach(event, rb); +	mutex_unlock(&event->mmap_mutex); + +	/* If there's still other mmap()s of this buffer, we're done. */ +	if (atomic_read(&rb->mmap_count)) { +		ring_buffer_put(rb); /* can't be last */ +		return; +	} + +	/* +	 * No other mmap()s, detach from all other events that might redirect +	 * into the now unreachable buffer. Somewhat complicated by the +	 * fact that rb::event_lock otherwise nests inside mmap_mutex. 
+	 */ +again: +	rcu_read_lock(); +	list_for_each_entry_rcu(event, &rb->event_list, rb_entry) { +		if (!atomic_long_inc_not_zero(&event->refcount)) { +			/* +			 * This event is en-route to free_event() which will +			 * detach it and remove it from the list. +			 */ +			continue; +		} +		rcu_read_unlock(); + +		mutex_lock(&event->mmap_mutex); +		/* +		 * Check we didn't race with perf_event_set_output() which can +		 * swizzle the rb from under us while we were waiting to +		 * acquire mmap_mutex. +		 * +		 * If we find a different rb; ignore this event, a next +		 * iteration will no longer find it on the list. We have to +		 * still restart the iteration to make sure we're not now +		 * iterating the wrong list. +		 */ +		if (event->rb == rb) { +			rcu_assign_pointer(event->rb, NULL); +			ring_buffer_detach(event, rb); +			ring_buffer_put(rb); /* can't be last, we still have one */ +		}  		mutex_unlock(&event->mmap_mutex); +		put_event(event); -		ring_buffer_put(rb); -		free_uid(user); +		/* +		 * Restart the iteration; either we're on the wrong list or +		 * destroyed its integrity by doing a deletion. +		 */ +		goto again;  	} +	rcu_read_unlock(); + +	/* +	 * It could be there's still a few 0-ref events on the list; they'll +	 * get cleaned up by free_event() -- they'll also still have their +	 * ref on the rb and will free it whenever they are done with it. +	 * +	 * Aside from that, this buffer is 'fully' detached and unmapped, +	 * undo the VM accounting. 
+	 */ + +	atomic_long_sub((size >> PAGE_SHIFT) + 1, &mmap_user->locked_vm); +	vma->vm_mm->pinned_vm -= mmap_locked; +	free_uid(mmap_user); + +	ring_buffer_put(rb); /* could be last */  }  static const struct vm_operations_struct perf_mmap_vmops = { @@ -3674,12 +3918,24 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)  		return -EINVAL;  	WARN_ON_ONCE(event->ctx->parent_ctx); +again:  	mutex_lock(&event->mmap_mutex);  	if (event->rb) { -		if (event->rb->nr_pages == nr_pages) -			atomic_inc(&event->rb->refcount); -		else +		if (event->rb->nr_pages != nr_pages) {  			ret = -EINVAL; +			goto unlock; +		} + +		if (!atomic_inc_not_zero(&event->rb->mmap_count)) { +			/* +			 * Raced against perf_mmap_close() through +			 * perf_event_set_output(). Try again, hope for better +			 * luck. +			 */ +			mutex_unlock(&event->mmap_mutex); +			goto again; +		} +  		goto unlock;  	} @@ -3720,12 +3976,16 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)  		ret = -ENOMEM;  		goto unlock;  	} -	rcu_assign_pointer(event->rb, rb); + +	atomic_set(&rb->mmap_count, 1); +	rb->mmap_locked = extra; +	rb->mmap_user = get_current_user();  	atomic_long_add(user_extra, &user->locked_vm); -	event->mmap_locked = extra; -	event->mmap_user = get_current_user(); -	vma->vm_mm->pinned_vm += event->mmap_locked; +	vma->vm_mm->pinned_vm += extra; + +	ring_buffer_attach(event, rb); +	rcu_assign_pointer(event->rb, rb);  	perf_event_update_userpage(event); @@ -3734,7 +3994,11 @@ unlock:  		atomic_inc(&event->mmap_count);  	mutex_unlock(&event->mmap_mutex); -	vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; +	/* +	 * Since pinned accounting is per vm we cannot allow fork() to copy our +	 * vma. +	 */ +	vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP;  	vma->vm_ops = &perf_mmap_vmops;  	return ret; @@ -4961,7 +5225,7 @@ static DEFINE_PER_CPU(struct swevent_htable, swevent_htable);   * sign as trigger.   
*/ -static u64 perf_swevent_set_period(struct perf_event *event) +u64 perf_swevent_set_period(struct perf_event *event)  {  	struct hw_perf_event *hwc = &event->hw;  	u64 period = hwc->last_period; @@ -5904,9 +6168,56 @@ type_show(struct device *dev, struct device_attribute *attr, char *page)  	return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->type);  } +static ssize_t +perf_event_mux_interval_ms_show(struct device *dev, +				struct device_attribute *attr, +				char *page) +{ +	struct pmu *pmu = dev_get_drvdata(dev); + +	return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->hrtimer_interval_ms); +} + +static ssize_t +perf_event_mux_interval_ms_store(struct device *dev, +				 struct device_attribute *attr, +				 const char *buf, size_t count) +{ +	struct pmu *pmu = dev_get_drvdata(dev); +	int timer, cpu, ret; + +	ret = kstrtoint(buf, 0, &timer); +	if (ret) +		return ret; + +	if (timer < 1) +		return -EINVAL; + +	/* same value, noting to do */ +	if (timer == pmu->hrtimer_interval_ms) +		return count; + +	pmu->hrtimer_interval_ms = timer; + +	/* update all cpuctx for this PMU */ +	for_each_possible_cpu(cpu) { +		struct perf_cpu_context *cpuctx; +		cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); +		cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer); + +		if (hrtimer_active(&cpuctx->hrtimer)) +			hrtimer_forward_now(&cpuctx->hrtimer, cpuctx->hrtimer_interval); +	} + +	return count; +} + +#define __ATTR_RW(attr) __ATTR(attr, 0644, attr##_show, attr##_store) +  static struct device_attribute pmu_dev_attrs[] = { -       __ATTR_RO(type), -       __ATTR_NULL, +	__ATTR_RO(type), +	__ATTR_RW(perf_event_mux_interval_ms), +	__ATTR_NULL,  };  static int pmu_bus_running; @@ -5952,7 +6263,7 @@ free_dev:  static struct lock_class_key cpuctx_mutex;  static struct lock_class_key cpuctx_lock; -int perf_pmu_register(struct pmu *pmu, char *name, int type) +int perf_pmu_register(struct pmu *pmu, const char *name, int type)  {  	int cpu, ret; @@ -6001,7 +6312,9 @@ skip_type:  		
lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock);  		cpuctx->ctx.type = cpu_context;  		cpuctx->ctx.pmu = pmu; -		cpuctx->jiffies_interval = 1; + +		__perf_cpu_hrtimer_init(cpuctx, cpu); +  		INIT_LIST_HEAD(&cpuctx->rotation_list);  		cpuctx->unique_pmu = pmu;  	} @@ -6327,11 +6640,6 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr,  		if (!(mask & ~PERF_SAMPLE_BRANCH_PLM_ALL))  			return -EINVAL; -		/* kernel level capture: check permissions */ -		if ((mask & PERF_SAMPLE_BRANCH_PERM_PLM) -		    && perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN)) -			return -EACCES; -  		/* propagate priv level, when not set for branch */  		if (!(mask & PERF_SAMPLE_BRANCH_PLM_ALL)) { @@ -6349,6 +6657,10 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr,  			 */  			attr->branch_sample_type = mask;  		} +		/* privileged levels capture (kernel, hv): check permissions */ +		if ((mask & PERF_SAMPLE_BRANCH_PERM_PLM) +		    && perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN)) +			return -EACCES;  	}  	if (attr->sample_type & PERF_SAMPLE_REGS_USER) { @@ -6412,6 +6724,8 @@ set:  	if (atomic_read(&event->mmap_count))  		goto unlock; +	old_rb = event->rb; +  	if (output_event) {  		/* get the rb we want to redirect to */  		rb = ring_buffer_get(output_event); @@ -6419,16 +6733,28 @@ set:  			goto unlock;  	} -	old_rb = event->rb; -	rcu_assign_pointer(event->rb, rb);  	if (old_rb)  		ring_buffer_detach(event, old_rb); + +	if (rb) +		ring_buffer_attach(event, rb); + +	rcu_assign_pointer(event->rb, rb); + +	if (old_rb) { +		ring_buffer_put(old_rb); +		/* +		 * Since we detached before setting the new rb, so that we +		 * could attach the new rb, we could have missed a wakeup. +		 * Provide it now. 
+		 */ +		wake_up_all(&event->waitq); +	} +  	ret = 0;  unlock:  	mutex_unlock(&event->mmap_mutex); -	if (old_rb) -		ring_buffer_put(old_rb);  out:  	return ret;  } @@ -7387,7 +7713,6 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)  	case CPU_DOWN_PREPARE:  		perf_event_exit_cpu(cpu);  		break; -  	default:  		break;  	} diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c index a64f8aeb5c1f..1559fb0b9296 100644 --- a/kernel/events/hw_breakpoint.c +++ b/kernel/events/hw_breakpoint.c @@ -46,23 +46,26 @@  #include <linux/smp.h>  #include <linux/hw_breakpoint.h> - -  /*   * Constraints data   */ +struct bp_cpuinfo { +	/* Number of pinned cpu breakpoints in a cpu */ +	unsigned int	cpu_pinned; +	/* tsk_pinned[n] is the number of tasks having n+1 breakpoints */ +	unsigned int	*tsk_pinned; +	/* Number of non-pinned cpu/task breakpoints in a cpu */ +	unsigned int	flexible; /* XXX: placeholder, see fetch_this_slot() */ +}; -/* Number of pinned cpu breakpoints in a cpu */ -static DEFINE_PER_CPU(unsigned int, nr_cpu_bp_pinned[TYPE_MAX]); - -/* Number of pinned task breakpoints in a cpu */ -static DEFINE_PER_CPU(unsigned int *, nr_task_bp_pinned[TYPE_MAX]); - -/* Number of non-pinned cpu/task breakpoints in a cpu */ -static DEFINE_PER_CPU(unsigned int, nr_bp_flexible[TYPE_MAX]); - +static DEFINE_PER_CPU(struct bp_cpuinfo, bp_cpuinfo[TYPE_MAX]);  static int nr_slots[TYPE_MAX]; +static struct bp_cpuinfo *get_bp_info(int cpu, enum bp_type_idx type) +{ +	return per_cpu_ptr(bp_cpuinfo + type, cpu); +} +  /* Keep track of the breakpoints attached to tasks */  static LIST_HEAD(bp_task_head); @@ -96,8 +99,8 @@ static inline enum bp_type_idx find_slot_idx(struct perf_event *bp)   */  static unsigned int max_task_bp_pinned(int cpu, enum bp_type_idx type)  { +	unsigned int *tsk_pinned = get_bp_info(cpu, type)->tsk_pinned;  	int i; -	unsigned int *tsk_pinned = per_cpu(nr_task_bp_pinned[type], cpu);  	for (i = nr_slots[type] - 1; i >= 
0; i--) {  		if (tsk_pinned[i] > 0) @@ -120,13 +123,20 @@ static int task_bp_pinned(int cpu, struct perf_event *bp, enum bp_type_idx type)  	list_for_each_entry(iter, &bp_task_head, hw.bp_list) {  		if (iter->hw.bp_target == tsk &&  		    find_slot_idx(iter) == type && -		    cpu == iter->cpu) +		    (iter->cpu < 0 || cpu == iter->cpu))  			count += hw_breakpoint_weight(iter);  	}  	return count;  } +static const struct cpumask *cpumask_of_bp(struct perf_event *bp) +{ +	if (bp->cpu >= 0) +		return cpumask_of(bp->cpu); +	return cpu_possible_mask; +} +  /*   * Report the number of pinned/un-pinned breakpoints we have in   * a given cpu (cpu > -1) or in all of them (cpu = -1). @@ -135,25 +145,15 @@ static void  fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp,  		    enum bp_type_idx type)  { -	int cpu = bp->cpu; -	struct task_struct *tsk = bp->hw.bp_target; - -	if (cpu >= 0) { -		slots->pinned = per_cpu(nr_cpu_bp_pinned[type], cpu); -		if (!tsk) -			slots->pinned += max_task_bp_pinned(cpu, type); -		else -			slots->pinned += task_bp_pinned(cpu, bp, type); -		slots->flexible = per_cpu(nr_bp_flexible[type], cpu); - -		return; -	} +	const struct cpumask *cpumask = cpumask_of_bp(bp); +	int cpu; -	for_each_online_cpu(cpu) { -		unsigned int nr; +	for_each_cpu(cpu, cpumask) { +		struct bp_cpuinfo *info = get_bp_info(cpu, type); +		int nr; -		nr = per_cpu(nr_cpu_bp_pinned[type], cpu); -		if (!tsk) +		nr = info->cpu_pinned; +		if (!bp->hw.bp_target)  			nr += max_task_bp_pinned(cpu, type);  		else  			nr += task_bp_pinned(cpu, bp, type); @@ -161,8 +161,7 @@ fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp,  		if (nr > slots->pinned)  			slots->pinned = nr; -		nr = per_cpu(nr_bp_flexible[type], cpu); - +		nr = info->flexible;  		if (nr > slots->flexible)  			slots->flexible = nr;  	} @@ -182,29 +181,19 @@ fetch_this_slot(struct bp_busy_slots *slots, int weight)  /*   * Add a pinned breakpoint for the given task in our constraint 
table   */ -static void toggle_bp_task_slot(struct perf_event *bp, int cpu, bool enable, +static void toggle_bp_task_slot(struct perf_event *bp, int cpu,  				enum bp_type_idx type, int weight)  { -	unsigned int *tsk_pinned; -	int old_count = 0; -	int old_idx = 0; -	int idx = 0; - -	old_count = task_bp_pinned(cpu, bp, type); -	old_idx = old_count - 1; -	idx = old_idx + weight; - -	/* tsk_pinned[n] is the number of tasks having n breakpoints */ -	tsk_pinned = per_cpu(nr_task_bp_pinned[type], cpu); -	if (enable) { -		tsk_pinned[idx]++; -		if (old_count > 0) -			tsk_pinned[old_idx]--; -	} else { -		tsk_pinned[idx]--; -		if (old_count > 0) -			tsk_pinned[old_idx]++; -	} +	unsigned int *tsk_pinned = get_bp_info(cpu, type)->tsk_pinned; +	int old_idx, new_idx; + +	old_idx = task_bp_pinned(cpu, bp, type) - 1; +	new_idx = old_idx + weight; + +	if (old_idx >= 0) +		tsk_pinned[old_idx]--; +	if (new_idx >= 0) +		tsk_pinned[new_idx]++;  }  /* @@ -214,33 +203,26 @@ static void  toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type,  	       int weight)  { -	int cpu = bp->cpu; -	struct task_struct *tsk = bp->hw.bp_target; +	const struct cpumask *cpumask = cpumask_of_bp(bp); +	int cpu; -	/* Pinned counter cpu profiling */ -	if (!tsk) { +	if (!enable) +		weight = -weight; -		if (enable) -			per_cpu(nr_cpu_bp_pinned[type], bp->cpu) += weight; -		else -			per_cpu(nr_cpu_bp_pinned[type], bp->cpu) -= weight; +	/* Pinned counter cpu profiling */ +	if (!bp->hw.bp_target) { +		get_bp_info(bp->cpu, type)->cpu_pinned += weight;  		return;  	}  	/* Pinned counter task profiling */ - -	if (!enable) -		list_del(&bp->hw.bp_list); - -	if (cpu >= 0) { -		toggle_bp_task_slot(bp, cpu, enable, type, weight); -	} else { -		for_each_online_cpu(cpu) -			toggle_bp_task_slot(bp, cpu, enable, type, weight); -	} +	for_each_cpu(cpu, cpumask) +		toggle_bp_task_slot(bp, cpu, type, weight);  	if (enable)  		list_add_tail(&bp->hw.bp_list, &bp_task_head); +	else +		list_del(&bp->hw.bp_list);  
}  /* @@ -261,8 +243,8 @@ __weak void arch_unregister_hw_breakpoint(struct perf_event *bp)   *   *   - If attached to a single cpu, check:   * - *       (per_cpu(nr_bp_flexible, cpu) || (per_cpu(nr_cpu_bp_pinned, cpu) - *           + max(per_cpu(nr_task_bp_pinned, cpu)))) < HBP_NUM + *       (per_cpu(info->flexible, cpu) || (per_cpu(info->cpu_pinned, cpu) + *           + max(per_cpu(info->tsk_pinned, cpu)))) < HBP_NUM   *   *       -> If there are already non-pinned counters in this cpu, it means   *          there is already a free slot for them. @@ -272,8 +254,8 @@ __weak void arch_unregister_hw_breakpoint(struct perf_event *bp)   *   *   - If attached to every cpus, check:   * - *       (per_cpu(nr_bp_flexible, *) || (max(per_cpu(nr_cpu_bp_pinned, *)) - *           + max(per_cpu(nr_task_bp_pinned, *)))) < HBP_NUM + *       (per_cpu(info->flexible, *) || (max(per_cpu(info->cpu_pinned, *)) + *           + max(per_cpu(info->tsk_pinned, *)))) < HBP_NUM   *   *       -> This is roughly the same, except we check the number of per cpu   *          bp for every cpu and we keep the max one. Same for the per tasks @@ -284,16 +266,16 @@ __weak void arch_unregister_hw_breakpoint(struct perf_event *bp)   *   *   - If attached to a single cpu, check:   * - *       ((per_cpu(nr_bp_flexible, cpu) > 1) + per_cpu(nr_cpu_bp_pinned, cpu) - *            + max(per_cpu(nr_task_bp_pinned, cpu))) < HBP_NUM + *       ((per_cpu(info->flexible, cpu) > 1) + per_cpu(info->cpu_pinned, cpu) + *            + max(per_cpu(info->tsk_pinned, cpu))) < HBP_NUM   * - *       -> Same checks as before. But now the nr_bp_flexible, if any, must keep + *       -> Same checks as before. But now the info->flexible, if any, must keep   *          one register at least (or they will never be fed).   
*   *   - If attached to every cpus, check:   * - *       ((per_cpu(nr_bp_flexible, *) > 1) + max(per_cpu(nr_cpu_bp_pinned, *)) - *            + max(per_cpu(nr_task_bp_pinned, *))) < HBP_NUM + *       ((per_cpu(info->flexible, *) > 1) + max(per_cpu(info->cpu_pinned, *)) + *            + max(per_cpu(info->tsk_pinned, *))) < HBP_NUM   */  static int __reserve_bp_slot(struct perf_event *bp)  { @@ -518,8 +500,8 @@ register_wide_hw_breakpoint(struct perf_event_attr *attr,  			    perf_overflow_handler_t triggered,  			    void *context)  { -	struct perf_event * __percpu *cpu_events, **pevent, *bp; -	long err; +	struct perf_event * __percpu *cpu_events, *bp; +	long err = 0;  	int cpu;  	cpu_events = alloc_percpu(typeof(*cpu_events)); @@ -528,31 +510,21 @@ register_wide_hw_breakpoint(struct perf_event_attr *attr,  	get_online_cpus();  	for_each_online_cpu(cpu) { -		pevent = per_cpu_ptr(cpu_events, cpu);  		bp = perf_event_create_kernel_counter(attr, cpu, NULL,  						      triggered, context); - -		*pevent = bp; -  		if (IS_ERR(bp)) {  			err = PTR_ERR(bp); -			goto fail; +			break;  		} -	} -	put_online_cpus(); -	return cpu_events; - -fail: -	for_each_online_cpu(cpu) { -		pevent = per_cpu_ptr(cpu_events, cpu); -		if (IS_ERR(*pevent)) -			break; -		unregister_hw_breakpoint(*pevent); +		per_cpu(*cpu_events, cpu) = bp;  	}  	put_online_cpus(); -	free_percpu(cpu_events); +	if (likely(!err)) +		return cpu_events; + +	unregister_wide_hw_breakpoint(cpu_events);  	return (void __percpu __force *)ERR_PTR(err);  }  EXPORT_SYMBOL_GPL(register_wide_hw_breakpoint); @@ -564,12 +536,10 @@ EXPORT_SYMBOL_GPL(register_wide_hw_breakpoint);  void unregister_wide_hw_breakpoint(struct perf_event * __percpu *cpu_events)  {  	int cpu; -	struct perf_event **pevent; -	for_each_possible_cpu(cpu) { -		pevent = per_cpu_ptr(cpu_events, cpu); -		unregister_hw_breakpoint(*pevent); -	} +	for_each_possible_cpu(cpu) +		unregister_hw_breakpoint(per_cpu(*cpu_events, cpu)); +  	free_percpu(cpu_events);  }  
EXPORT_SYMBOL_GPL(unregister_wide_hw_breakpoint); @@ -612,6 +582,11 @@ static int hw_breakpoint_add(struct perf_event *bp, int flags)  	if (!(flags & PERF_EF_START))  		bp->hw.state = PERF_HES_STOPPED; +	if (is_sampling_event(bp)) { +		bp->hw.last_period = bp->hw.sample_period; +		perf_swevent_set_period(bp); +	} +  	return arch_install_hw_breakpoint(bp);  } @@ -650,7 +625,6 @@ static struct pmu perf_breakpoint = {  int __init init_hw_breakpoint(void)  { -	unsigned int **task_bp_pinned;  	int cpu, err_cpu;  	int i; @@ -659,10 +633,11 @@ int __init init_hw_breakpoint(void)  	for_each_possible_cpu(cpu) {  		for (i = 0; i < TYPE_MAX; i++) { -			task_bp_pinned = &per_cpu(nr_task_bp_pinned[i], cpu); -			*task_bp_pinned = kzalloc(sizeof(int) * nr_slots[i], -						  GFP_KERNEL); -			if (!*task_bp_pinned) +			struct bp_cpuinfo *info = get_bp_info(cpu, i); + +			info->tsk_pinned = kcalloc(nr_slots[i], sizeof(int), +							GFP_KERNEL); +			if (!info->tsk_pinned)  				goto err_alloc;  		}  	} @@ -676,7 +651,7 @@ int __init init_hw_breakpoint(void)   err_alloc:  	for_each_possible_cpu(err_cpu) {  		for (i = 0; i < TYPE_MAX; i++) -			kfree(per_cpu(nr_task_bp_pinned[i], err_cpu)); +			kfree(get_bp_info(err_cpu, i)->tsk_pinned);  		if (err_cpu == cpu)  			break;  	} diff --git a/kernel/events/internal.h b/kernel/events/internal.h index eb675c4d59df..ca6599723be5 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h @@ -31,6 +31,10 @@ struct ring_buffer {  	spinlock_t			event_lock;  	struct list_head		event_list; +	atomic_t			mmap_count; +	unsigned long			mmap_locked; +	struct user_struct		*mmap_user; +  	struct perf_event_mmap_page	*user_page;  	void				*data_pages[0];  }; diff --git a/kernel/exit.c b/kernel/exit.c index af2eb3cbd499..fafe75d9e6f6 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -312,17 +312,6 @@ kill_orphaned_pgrp(struct task_struct *tsk, struct task_struct *parent)  	}  } -void __set_special_pids(struct pid *pid) -{ -	struct task_struct *curr = 
current->group_leader; - -	if (task_session(curr) != pid) -		change_pid(curr, PIDTYPE_SID, pid); - -	if (task_pgrp(curr) != pid) -		change_pid(curr, PIDTYPE_PGID, pid); -} -  /*   * Let kernel threads use this to say that they allow a certain signal.   * Must not be used if kthread was cloned with CLONE_SIGHAND. @@ -649,7 +638,6 @@ static void exit_notify(struct task_struct *tsk, int group_dead)  	 *	jobs, send them a SIGHUP and then a SIGCONT.  (POSIX 3.2.2.2)  	 */  	forget_original_parent(tsk); -	exit_task_namespaces(tsk);  	write_lock_irq(&tasklist_lock);  	if (group_dead) @@ -795,6 +783,7 @@ void do_exit(long code)  	exit_shm(tsk);  	exit_files(tsk);  	exit_fs(tsk); +	exit_task_namespaces(tsk);  	exit_task_work(tsk);  	check_stack_usage();  	exit_thread(); @@ -835,7 +824,7 @@ void do_exit(long code)  	/*  	 * Make sure we are holding no locks:  	 */ -	debug_check_no_locks_held(tsk); +	debug_check_no_locks_held();  	/*  	 * We can do this unlocked here. The futex code uses this flag  	 * just to verify whether the pi state cleanup has been done diff --git a/kernel/fork.c b/kernel/fork.c index 987b28a1f01b..6e6a1c11b3e5 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1121,6 +1121,12 @@ static void posix_cpu_timers_init(struct task_struct *tsk)  	INIT_LIST_HEAD(&tsk->cpu_timers[2]);  } +static inline void +init_task_pid(struct task_struct *task, enum pid_type type, struct pid *pid) +{ +	 task->pids[type].pid = pid; +} +  /*   * This creates a new process as a copy of the old one,   * but does not actually start it yet. 
@@ -1199,8 +1205,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,  	retval = -EAGAIN;  	if (atomic_read(&p->real_cred->user->processes) >=  			task_rlimit(p, RLIMIT_NPROC)) { -		if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_RESOURCE) && -		    p->real_cred->user != INIT_USER) +		if (p->real_cred->user != INIT_USER && +		    !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN))  			goto bad_fork_free;  	}  	current->flags &= ~PF_NPROC_EXCEEDED; @@ -1354,11 +1360,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,  			goto bad_fork_cleanup_io;  	} -	p->pid = pid_nr(pid); -	p->tgid = p->pid; -	if (clone_flags & CLONE_THREAD) -		p->tgid = current->tgid; -  	p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL;  	/*  	 * Clear TID on mm_release()? @@ -1394,12 +1395,19 @@ static struct task_struct *copy_process(unsigned long clone_flags,  	clear_all_latency_tracing(p);  	/* ok, now we should be set up.. */ -	if (clone_flags & CLONE_THREAD) +	p->pid = pid_nr(pid); +	if (clone_flags & CLONE_THREAD) {  		p->exit_signal = -1; -	else if (clone_flags & CLONE_PARENT) -		p->exit_signal = current->group_leader->exit_signal; -	else -		p->exit_signal = (clone_flags & CSIGNAL); +		p->group_leader = current->group_leader; +		p->tgid = current->tgid; +	} else { +		if (clone_flags & CLONE_PARENT) +			p->exit_signal = current->group_leader->exit_signal; +		else +			p->exit_signal = (clone_flags & CSIGNAL); +		p->group_leader = p; +		p->tgid = p->pid; +	}  	p->pdeath_signal = 0;  	p->exit_state = 0; @@ -1408,15 +1416,13 @@ static struct task_struct *copy_process(unsigned long clone_flags,  	p->nr_dirtied_pause = 128 >> (PAGE_SHIFT - 10);  	p->dirty_paused_when = 0; -	/* -	 * Ok, make it visible to the rest of the system. -	 * We dont wake it up yet. -	 */ -	p->group_leader = p;  	INIT_LIST_HEAD(&p->thread_group);  	p->task_works = NULL; -	/* Need tasklist lock for parent etc handling! 
*/ +	/* +	 * Make it visible to the rest of the system, but dont wake it up yet. +	 * Need tasklist lock for parent etc handling! +	 */  	write_lock_irq(&tasklist_lock);  	/* CLONE_PARENT re-uses the old parent */ @@ -1446,18 +1452,14 @@ static struct task_struct *copy_process(unsigned long clone_flags,  		goto bad_fork_free_pid;  	} -	if (clone_flags & CLONE_THREAD) { -		current->signal->nr_threads++; -		atomic_inc(&current->signal->live); -		atomic_inc(&current->signal->sigcnt); -		p->group_leader = current->group_leader; -		list_add_tail_rcu(&p->thread_group, &p->group_leader->thread_group); -	} -  	if (likely(p->pid)) {  		ptrace_init_task(p, (clone_flags & CLONE_PTRACE) || trace); +		init_task_pid(p, PIDTYPE_PID, pid);  		if (thread_group_leader(p)) { +			init_task_pid(p, PIDTYPE_PGID, task_pgrp(current)); +			init_task_pid(p, PIDTYPE_SID, task_session(current)); +  			if (is_child_reaper(pid)) {  				ns_of_pid(pid)->child_reaper = p;  				p->signal->flags |= SIGNAL_UNKILLABLE; @@ -1465,13 +1467,19 @@ static struct task_struct *copy_process(unsigned long clone_flags,  			p->signal->leader_pid = pid;  			p->signal->tty = tty_kref_get(current->signal->tty); -			attach_pid(p, PIDTYPE_PGID, task_pgrp(current)); -			attach_pid(p, PIDTYPE_SID, task_session(current));  			list_add_tail(&p->sibling, &p->real_parent->children);  			list_add_tail_rcu(&p->tasks, &init_task.tasks); +			attach_pid(p, PIDTYPE_PGID); +			attach_pid(p, PIDTYPE_SID);  			__this_cpu_inc(process_counts); +		} else { +			current->signal->nr_threads++; +			atomic_inc(&current->signal->live); +			atomic_inc(&current->signal->sigcnt); +			list_add_tail_rcu(&p->thread_group, +					  &p->group_leader->thread_group);  		} -		attach_pid(p, PIDTYPE_PID, pid); +		attach_pid(p, PIDTYPE_PID);  		nr_threads++;  	} diff --git a/kernel/freezer.c b/kernel/freezer.c index c38893b0efba..8b2afc1c9df0 100644 --- a/kernel/freezer.c +++ b/kernel/freezer.c @@ -110,6 +110,18 @@ bool freeze_task(struct task_struct *p)  {  	unsigned long flags; 
+	/* +	 * This check can race with freezer_do_not_count, but worst case that +	 * will result in an extra wakeup being sent to the task.  It does not +	 * race with freezer_count(), the barriers in freezer_count() and +	 * freezer_should_skip() ensure that either freezer_count() sees +	 * freezing == true in try_to_freeze() and freezes, or +	 * freezer_should_skip() sees !PF_FREEZE_SKIP and freezes the task +	 * normally. +	 */ +	if (freezer_should_skip(p)) +		return false; +  	spin_lock_irqsave(&freezer_lock, flags);  	if (!freezing(p) || frozen(p)) {  		spin_unlock_irqrestore(&freezer_lock, flags); diff --git a/kernel/futex.c b/kernel/futex.c index b26dcfc02c94..c3a1a55a5214 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -61,6 +61,8 @@  #include <linux/nsproxy.h>  #include <linux/ptrace.h>  #include <linux/sched/rt.h> +#include <linux/hugetlb.h> +#include <linux/freezer.h>  #include <asm/futex.h> @@ -365,7 +367,7 @@ again:  	} else {  		key->both.offset |= FUT_OFF_INODE; /* inode-based key */  		key->shared.inode = page_head->mapping->host; -		key->shared.pgoff = page_head->index; +		key->shared.pgoff = basepage_index(page);  	}  	get_futex_key_refs(key); @@ -1807,7 +1809,7 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q,  		 * is no timeout, or if it has yet to expire.  		 
*/  		if (!timeout || timeout->task) -			schedule(); +			freezable_schedule();  	}  	__set_current_state(TASK_RUNNING);  } diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index fd4b13b131f8..3ee4d06c6fc2 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -47,6 +47,7 @@  #include <linux/sched/sysctl.h>  #include <linux/sched/rt.h>  #include <linux/timer.h> +#include <linux/freezer.h>  #include <asm/uaccess.h> @@ -1545,7 +1546,7 @@ static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mod  			t->task = NULL;  		if (likely(t->task)) -			schedule(); +			freezable_schedule();  		hrtimer_cancel(&t->timer);  		mode = HRTIMER_MODE_ABS; diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index cbd97ce0b000..a3bb14fbe5c6 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -213,6 +213,19 @@ void irq_enable(struct irq_desc *desc)  	irq_state_clr_masked(desc);  } +/** + * irq_disable - Mark interupt disabled + * @desc:	irq descriptor which should be disabled + * + * If the chip does not implement the irq_disable callback, we + * use a lazy disable approach. That means we mark the interrupt + * disabled, but leave the hardware unmasked. That's an + * optimization because we avoid the hardware access for the + * common case where no interrupt happens after we marked it + * disabled. If an interrupt happens, then the interrupt flow + * handler masks the line at the hardware level and marks it + * pending. 
+ */  void irq_disable(struct irq_desc *desc)  {  	irq_state_set_disabled(desc); diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c index c89295a8f668..1c39eccc1eaf 100644 --- a/kernel/irq/generic-chip.c +++ b/kernel/irq/generic-chip.c @@ -7,6 +7,7 @@  #include <linux/irq.h>  #include <linux/slab.h>  #include <linux/export.h> +#include <linux/irqdomain.h>  #include <linux/interrupt.h>  #include <linux/kernel_stat.h>  #include <linux/syscore_ops.h> @@ -16,11 +17,6 @@  static LIST_HEAD(gc_list);  static DEFINE_RAW_SPINLOCK(gc_lock); -static inline struct irq_chip_regs *cur_regs(struct irq_data *d) -{ -	return &container_of(d->chip, struct irq_chip_type, chip)->regs; -} -  /**   * irq_gc_noop - NOOP function   * @d: irq_data @@ -39,16 +35,17 @@ void irq_gc_noop(struct irq_data *d)  void irq_gc_mask_disable_reg(struct irq_data *d)  {  	struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); -	u32 mask = 1 << (d->irq - gc->irq_base); +	struct irq_chip_type *ct = irq_data_get_chip_type(d); +	u32 mask = d->mask;  	irq_gc_lock(gc); -	irq_reg_writel(mask, gc->reg_base + cur_regs(d)->disable); -	gc->mask_cache &= ~mask; +	irq_reg_writel(mask, gc->reg_base + ct->regs.disable); +	*ct->mask_cache &= ~mask;  	irq_gc_unlock(gc);  }  /** - * irq_gc_mask_set_mask_bit - Mask chip via setting bit in mask register + * irq_gc_mask_set_bit - Mask chip via setting bit in mask register   * @d: irq_data   *   * Chip has a single mask register. 
Values of this register are cached @@ -57,16 +54,18 @@ void irq_gc_mask_disable_reg(struct irq_data *d)  void irq_gc_mask_set_bit(struct irq_data *d)  {  	struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); -	u32 mask = 1 << (d->irq - gc->irq_base); +	struct irq_chip_type *ct = irq_data_get_chip_type(d); +	u32 mask = d->mask;  	irq_gc_lock(gc); -	gc->mask_cache |= mask; -	irq_reg_writel(gc->mask_cache, gc->reg_base + cur_regs(d)->mask); +	*ct->mask_cache |= mask; +	irq_reg_writel(*ct->mask_cache, gc->reg_base + ct->regs.mask);  	irq_gc_unlock(gc);  } +EXPORT_SYMBOL_GPL(irq_gc_mask_set_bit);  /** - * irq_gc_mask_set_mask_bit - Mask chip via clearing bit in mask register + * irq_gc_mask_clr_bit - Mask chip via clearing bit in mask register   * @d: irq_data   *   * Chip has a single mask register. Values of this register are cached @@ -75,13 +74,15 @@ void irq_gc_mask_set_bit(struct irq_data *d)  void irq_gc_mask_clr_bit(struct irq_data *d)  {  	struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); -	u32 mask = 1 << (d->irq - gc->irq_base); +	struct irq_chip_type *ct = irq_data_get_chip_type(d); +	u32 mask = d->mask;  	irq_gc_lock(gc); -	gc->mask_cache &= ~mask; -	irq_reg_writel(gc->mask_cache, gc->reg_base + cur_regs(d)->mask); +	*ct->mask_cache &= ~mask; +	irq_reg_writel(*ct->mask_cache, gc->reg_base + ct->regs.mask);  	irq_gc_unlock(gc);  } +EXPORT_SYMBOL_GPL(irq_gc_mask_clr_bit);  /**   * irq_gc_unmask_enable_reg - Unmask chip via enable register @@ -93,11 +94,12 @@ void irq_gc_mask_clr_bit(struct irq_data *d)  void irq_gc_unmask_enable_reg(struct irq_data *d)  {  	struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); -	u32 mask = 1 << (d->irq - gc->irq_base); +	struct irq_chip_type *ct = irq_data_get_chip_type(d); +	u32 mask = d->mask;  	irq_gc_lock(gc); -	irq_reg_writel(mask, gc->reg_base + cur_regs(d)->enable); -	gc->mask_cache |= mask; +	irq_reg_writel(mask, gc->reg_base + ct->regs.enable); +	*ct->mask_cache |= mask;  	irq_gc_unlock(gc); 
 } @@ -108,12 +110,14 @@ void irq_gc_unmask_enable_reg(struct irq_data *d)  void irq_gc_ack_set_bit(struct irq_data *d)  {  	struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); -	u32 mask = 1 << (d->irq - gc->irq_base); +	struct irq_chip_type *ct = irq_data_get_chip_type(d); +	u32 mask = d->mask;  	irq_gc_lock(gc); -	irq_reg_writel(mask, gc->reg_base + cur_regs(d)->ack); +	irq_reg_writel(mask, gc->reg_base + ct->regs.ack);  	irq_gc_unlock(gc);  } +EXPORT_SYMBOL_GPL(irq_gc_ack_set_bit);  /**   * irq_gc_ack_clr_bit - Ack pending interrupt via clearing bit @@ -122,10 +126,11 @@ void irq_gc_ack_set_bit(struct irq_data *d)  void irq_gc_ack_clr_bit(struct irq_data *d)  {  	struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); -	u32 mask = ~(1 << (d->irq - gc->irq_base)); +	struct irq_chip_type *ct = irq_data_get_chip_type(d); +	u32 mask = ~d->mask;  	irq_gc_lock(gc); -	irq_reg_writel(mask, gc->reg_base + cur_regs(d)->ack); +	irq_reg_writel(mask, gc->reg_base + ct->regs.ack);  	irq_gc_unlock(gc);  } @@ -136,11 +141,12 @@ void irq_gc_ack_clr_bit(struct irq_data *d)  void irq_gc_mask_disable_reg_and_ack(struct irq_data *d)  {  	struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); -	u32 mask = 1 << (d->irq - gc->irq_base); +	struct irq_chip_type *ct = irq_data_get_chip_type(d); +	u32 mask = d->mask;  	irq_gc_lock(gc); -	irq_reg_writel(mask, gc->reg_base + cur_regs(d)->mask); -	irq_reg_writel(mask, gc->reg_base + cur_regs(d)->ack); +	irq_reg_writel(mask, gc->reg_base + ct->regs.mask); +	irq_reg_writel(mask, gc->reg_base + ct->regs.ack);  	irq_gc_unlock(gc);  } @@ -151,16 +157,18 @@ void irq_gc_mask_disable_reg_and_ack(struct irq_data *d)  void irq_gc_eoi(struct irq_data *d)  {  	struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); -	u32 mask = 1 << (d->irq - gc->irq_base); +	struct irq_chip_type *ct = irq_data_get_chip_type(d); +	u32 mask = d->mask;  	irq_gc_lock(gc); -	irq_reg_writel(mask, gc->reg_base + cur_regs(d)->eoi); +	
irq_reg_writel(mask, gc->reg_base + ct->regs.eoi);  	irq_gc_unlock(gc);  }  /**   * irq_gc_set_wake - Set/clr wake bit for an interrupt - * @d: irq_data + * @d:  irq_data + * @on: Indicates whether the wake bit should be set or cleared   *   * For chips where the wake from suspend functionality is not   * configured in a separate register and the wakeup active state is @@ -169,7 +177,7 @@ void irq_gc_eoi(struct irq_data *d)  int irq_gc_set_wake(struct irq_data *d, unsigned int on)  {  	struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); -	u32 mask = 1 << (d->irq - gc->irq_base); +	u32 mask = d->mask;  	if (!(mask & gc->wake_enabled))  		return -EINVAL; @@ -183,6 +191,19 @@ int irq_gc_set_wake(struct irq_data *d, unsigned int on)  	return 0;  } +static void +irq_init_generic_chip(struct irq_chip_generic *gc, const char *name, +		      int num_ct, unsigned int irq_base, +		      void __iomem *reg_base, irq_flow_handler_t handler) +{ +	raw_spin_lock_init(&gc->lock); +	gc->num_ct = num_ct; +	gc->irq_base = irq_base; +	gc->reg_base = reg_base; +	gc->chip_types->chip.name = name; +	gc->chip_types->handler = handler; +} +  /**   * irq_alloc_generic_chip - Allocate a generic chip and initialize it   * @name:	Name of the irq chip @@ -203,23 +224,185 @@ irq_alloc_generic_chip(const char *name, int num_ct, unsigned int irq_base,  	gc = kzalloc(sz, GFP_KERNEL);  	if (gc) { -		raw_spin_lock_init(&gc->lock); -		gc->num_ct = num_ct; -		gc->irq_base = irq_base; -		gc->reg_base = reg_base; -		gc->chip_types->chip.name = name; -		gc->chip_types->handler = handler; +		irq_init_generic_chip(gc, name, num_ct, irq_base, reg_base, +				      handler);  	}  	return gc;  }  EXPORT_SYMBOL_GPL(irq_alloc_generic_chip); +static void +irq_gc_init_mask_cache(struct irq_chip_generic *gc, enum irq_gc_flags flags) +{ +	struct irq_chip_type *ct = gc->chip_types; +	u32 *mskptr = &gc->mask_cache, mskreg = ct->regs.mask; +	int i; + +	for (i = 0; i < gc->num_ct; i++) { +		if (flags & 
IRQ_GC_MASK_CACHE_PER_TYPE) { +			mskptr = &ct[i].mask_cache_priv; +			mskreg = ct[i].regs.mask; +		} +		ct[i].mask_cache = mskptr; +		if (flags & IRQ_GC_INIT_MASK_CACHE) +			*mskptr = irq_reg_readl(gc->reg_base + mskreg); +	} +} + +/** + * irq_alloc_domain_generic_chip - Allocate generic chips for an irq domain + * @d:			irq domain for which to allocate chips + * @irqs_per_chip:	Number of interrupts each chip handles + * @num_ct:		Number of irq_chip_type instances associated with this + * @name:		Name of the irq chip + * @handler:		Default flow handler associated with these chips + * @clr:		IRQ_* bits to clear in the mapping function + * @set:		IRQ_* bits to set in the mapping function + * @gcflags:		Generic chip specific setup flags + */ +int irq_alloc_domain_generic_chips(struct irq_domain *d, int irqs_per_chip, +				   int num_ct, const char *name, +				   irq_flow_handler_t handler, +				   unsigned int clr, unsigned int set, +				   enum irq_gc_flags gcflags) +{ +	struct irq_domain_chip_generic *dgc; +	struct irq_chip_generic *gc; +	int numchips, sz, i; +	unsigned long flags; +	void *tmp; + +	if (d->gc) +		return -EBUSY; + +	if (d->revmap_type != IRQ_DOMAIN_MAP_LINEAR) +		return -EINVAL; + +	numchips = d->revmap_data.linear.size / irqs_per_chip; +	if (!numchips) +		return -EINVAL; + +	/* Allocate a pointer, generic chip and chiptypes for each chip */ +	sz = sizeof(*dgc) + numchips * sizeof(gc); +	sz += numchips * (sizeof(*gc) + num_ct * sizeof(struct irq_chip_type)); + +	tmp = dgc = kzalloc(sz, GFP_KERNEL); +	if (!dgc) +		return -ENOMEM; +	dgc->irqs_per_chip = irqs_per_chip; +	dgc->num_chips = numchips; +	dgc->irq_flags_to_set = set; +	dgc->irq_flags_to_clear = clr; +	dgc->gc_flags = gcflags; +	d->gc = dgc; + +	/* Calc pointer to the first generic chip */ +	tmp += sizeof(*dgc) + numchips * sizeof(gc); +	for (i = 0; i < numchips; i++) { +		/* Store the pointer to the generic chip */ +		dgc->gc[i] = gc = tmp; +		irq_init_generic_chip(gc, name, num_ct, i * 
irqs_per_chip, +				      NULL, handler); +		gc->domain = d; +		raw_spin_lock_irqsave(&gc_lock, flags); +		list_add_tail(&gc->list, &gc_list); +		raw_spin_unlock_irqrestore(&gc_lock, flags); +		/* Calc pointer to the next generic chip */ +		tmp += sizeof(*gc) + num_ct * sizeof(struct irq_chip_type); +	} +	return 0; +} +EXPORT_SYMBOL_GPL(irq_alloc_domain_generic_chips); + +/** + * irq_get_domain_generic_chip - Get a pointer to the generic chip of a hw_irq + * @d:			irq domain pointer + * @hw_irq:		Hardware interrupt number + */ +struct irq_chip_generic * +irq_get_domain_generic_chip(struct irq_domain *d, unsigned int hw_irq) +{ +	struct irq_domain_chip_generic *dgc = d->gc; +	int idx; + +	if (!dgc) +		return NULL; +	idx = hw_irq / dgc->irqs_per_chip; +	if (idx >= dgc->num_chips) +		return NULL; +	return dgc->gc[idx]; +} +EXPORT_SYMBOL_GPL(irq_get_domain_generic_chip); +  /*   * Separate lockdep class for interrupt chip which can nest irq_desc   * lock.   */  static struct lock_class_key irq_nested_lock_class; +/* + * irq_map_generic_chip - Map a generic chip for an irq domain + */ +static int irq_map_generic_chip(struct irq_domain *d, unsigned int virq, +				irq_hw_number_t hw_irq) +{ +	struct irq_data *data = irq_get_irq_data(virq); +	struct irq_domain_chip_generic *dgc = d->gc; +	struct irq_chip_generic *gc; +	struct irq_chip_type *ct; +	struct irq_chip *chip; +	unsigned long flags; +	int idx; + +	if (!d->gc) +		return -ENODEV; + +	idx = hw_irq / dgc->irqs_per_chip; +	if (idx >= dgc->num_chips) +		return -EINVAL; +	gc = dgc->gc[idx]; + +	idx = hw_irq % dgc->irqs_per_chip; + +	if (test_bit(idx, &gc->unused)) +		return -ENOTSUPP; + +	if (test_bit(idx, &gc->installed)) +		return -EBUSY; + +	ct = gc->chip_types; +	chip = &ct->chip; + +	/* We only init the cache for the first mapping of a generic chip */ +	if (!gc->installed) { +		raw_spin_lock_irqsave(&gc->lock, flags); +		irq_gc_init_mask_cache(gc, dgc->gc_flags); +		raw_spin_unlock_irqrestore(&gc->lock, flags); +	} 
+ +	/* Mark the interrupt as installed */ +	set_bit(idx, &gc->installed); + +	if (dgc->gc_flags & IRQ_GC_INIT_NESTED_LOCK) +		irq_set_lockdep_class(virq, &irq_nested_lock_class); + +	if (chip->irq_calc_mask) +		chip->irq_calc_mask(data); +	else +		data->mask = 1 << idx; + +	irq_set_chip_and_handler(virq, chip, ct->handler); +	irq_set_chip_data(virq, gc); +	irq_modify_status(virq, dgc->irq_flags_to_clear, dgc->irq_flags_to_set); +	return 0; +} + +struct irq_domain_ops irq_generic_chip_ops = { +	.map	= irq_map_generic_chip, +	.xlate	= irq_domain_xlate_onetwocell, +}; +EXPORT_SYMBOL_GPL(irq_generic_chip_ops); +  /**   * irq_setup_generic_chip - Setup a range of interrupts with a generic chip   * @gc:		Generic irq chip holding all data @@ -237,15 +420,14 @@ void irq_setup_generic_chip(struct irq_chip_generic *gc, u32 msk,  			    unsigned int set)  {  	struct irq_chip_type *ct = gc->chip_types; +	struct irq_chip *chip = &ct->chip;  	unsigned int i;  	raw_spin_lock(&gc_lock);  	list_add_tail(&gc->list, &gc_list);  	raw_spin_unlock(&gc_lock); -	/* Init mask cache ? 
*/ -	if (flags & IRQ_GC_INIT_MASK_CACHE) -		gc->mask_cache = irq_reg_readl(gc->reg_base + ct->regs.mask); +	irq_gc_init_mask_cache(gc, flags);  	for (i = gc->irq_base; msk; msk >>= 1, i++) {  		if (!(msk & 0x01)) @@ -254,7 +436,15 @@ void irq_setup_generic_chip(struct irq_chip_generic *gc, u32 msk,  		if (flags & IRQ_GC_INIT_NESTED_LOCK)  			irq_set_lockdep_class(i, &irq_nested_lock_class); -		irq_set_chip_and_handler(i, &ct->chip, ct->handler); +		if (!(flags & IRQ_GC_NO_MASK)) { +			struct irq_data *d = irq_get_irq_data(i); + +			if (chip->irq_calc_mask) +				chip->irq_calc_mask(d); +			else +				d->mask = 1 << (i - gc->irq_base); +		} +		irq_set_chip_and_handler(i, chip, ct->handler);  		irq_set_chip_data(i, gc);  		irq_modify_status(i, clr, set);  	} @@ -265,7 +455,7 @@ EXPORT_SYMBOL_GPL(irq_setup_generic_chip);  /**   * irq_setup_alt_chip - Switch to alternative chip   * @d:		irq_data for this interrupt - * @type	Flow type to be initialized + * @type:	Flow type to be initialized   *   * Only to be called from chip->irq_set_type() callbacks.   */ @@ -317,6 +507,24 @@ void irq_remove_generic_chip(struct irq_chip_generic *gc, u32 msk,  }  EXPORT_SYMBOL_GPL(irq_remove_generic_chip); +static struct irq_data *irq_gc_get_irq_data(struct irq_chip_generic *gc) +{ +	unsigned int virq; + +	if (!gc->domain) +		return irq_get_irq_data(gc->irq_base); + +	/* +	 * We don't know which of the irqs has been actually +	 * installed. Use the first one. +	 */ +	if (!gc->installed) +		return NULL; + +	virq = irq_find_mapping(gc->domain, gc->irq_base + __ffs(gc->installed)); +	return virq ? 
irq_get_irq_data(virq) : NULL; +} +  #ifdef CONFIG_PM  static int irq_gc_suspend(void)  { @@ -325,8 +533,12 @@ static int irq_gc_suspend(void)  	list_for_each_entry(gc, &gc_list, list) {  		struct irq_chip_type *ct = gc->chip_types; -		if (ct->chip.irq_suspend) -			ct->chip.irq_suspend(irq_get_irq_data(gc->irq_base)); +		if (ct->chip.irq_suspend) { +			struct irq_data *data = irq_gc_get_irq_data(gc); + +			if (data) +				ct->chip.irq_suspend(data); +		}  	}  	return 0;  } @@ -338,8 +550,12 @@ static void irq_gc_resume(void)  	list_for_each_entry(gc, &gc_list, list) {  		struct irq_chip_type *ct = gc->chip_types; -		if (ct->chip.irq_resume) -			ct->chip.irq_resume(irq_get_irq_data(gc->irq_base)); +		if (ct->chip.irq_resume) { +			struct irq_data *data = irq_gc_get_irq_data(gc); + +			if (data) +				ct->chip.irq_resume(data); +		}  	}  }  #else @@ -354,8 +570,12 @@ static void irq_gc_shutdown(void)  	list_for_each_entry(gc, &gc_list, list) {  		struct irq_chip_type *ct = gc->chip_types; -		if (ct->chip.irq_pm_shutdown) -			ct->chip.irq_pm_shutdown(irq_get_irq_data(gc->irq_base)); +		if (ct->chip.irq_pm_shutdown) { +			struct irq_data *data = irq_gc_get_irq_data(gc); + +			if (data) +				ct->chip.irq_pm_shutdown(data); +		}  	}  } diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 54a4d5223238..1ed8dff17eb9 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -16,12 +16,6 @@  #include <linux/smp.h>  #include <linux/fs.h> -#define IRQ_DOMAIN_MAP_LEGACY 0 /* driver allocated fixed range of irqs. -				 * ie. 
legacy 8259, gets irqs 1..15 */ -#define IRQ_DOMAIN_MAP_NOMAP 1 /* no fast reverse mapping */ -#define IRQ_DOMAIN_MAP_LINEAR 2 /* linear map of interrupts */ -#define IRQ_DOMAIN_MAP_TREE 3 /* radix tree */ -  static LIST_HEAD(irq_domain_list);  static DEFINE_MUTEX(irq_domain_mutex); @@ -698,7 +692,7 @@ unsigned int irq_create_of_mapping(struct device_node *controller,  	/* Set type if specified and different than the current one */  	if (type != IRQ_TYPE_NONE && -	    type != (irqd_get_trigger_type(irq_get_irq_data(virq)))) +	    type != irq_get_trigger_type(virq))  		irq_set_irq_type(virq, type);  	return virq;  } diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index fa17855ca65a..514bcfd855a8 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -555,9 +555,9 @@ int can_request_irq(unsigned int irq, unsigned long irqflags)  		return 0;  	if (irq_settings_can_request(desc)) { -		if (desc->action) -			if (irqflags & desc->action->flags & IRQF_SHARED) -				canrequest =1; +		if (!desc->action || +		    irqflags & desc->action->flags & IRQF_SHARED) +			canrequest = 1;  	}  	irq_put_desc_unlock(desc, flags);  	return canrequest; @@ -840,9 +840,6 @@ static void irq_thread_dtor(struct callback_head *unused)  static int irq_thread(void *data)  {  	struct callback_head on_exit_work; -	static const struct sched_param param = { -		.sched_priority = MAX_USER_RT_PRIO/2, -	};  	struct irqaction *action = data;  	struct irq_desc *desc = irq_to_desc(action->irq);  	irqreturn_t (*handler_fn)(struct irq_desc *desc, @@ -854,8 +851,6 @@ static int irq_thread(void *data)  	else  		handler_fn = irq_thread_fn; -	sched_setscheduler(current, SCHED_FIFO, &param); -  	init_task_work(&on_exit_work, irq_thread_dtor);  	task_work_add(current, &on_exit_work, false); @@ -950,6 +945,9 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)  	 */  	if (new->thread_fn && !nested) {  		struct task_struct *t; +		static const struct sched_param param = { +			
.sched_priority = MAX_USER_RT_PRIO/2, +		};  		t = kthread_create(irq_thread, new, "irq/%d-%s", irq,  				   new->name); @@ -957,6 +955,9 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)  			ret = PTR_ERR(t);  			goto out_mput;  		} + +		sched_setscheduler(t, SCHED_FIFO, &param); +  		/*  		 * We keep the reference to the task struct even if  		 * the thread dies to avoid that the interrupt code diff --git a/kernel/kmod.c b/kernel/kmod.c index 8241906c4b61..fb326365b694 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -147,6 +147,9 @@ int __request_module(bool wait, const char *fmt, ...)  	 */  	WARN_ON_ONCE(wait && current_is_async()); +	if (!modprobe_path[0]) +		return 0; +  	va_start(args, fmt);  	ret = vsnprintf(module_name, MODULE_NAME_LEN, fmt, args);  	va_end(args); @@ -569,14 +572,6 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait)  	int retval = 0;  	helper_lock(); -	if (!sub_info->path) { -		retval = -EINVAL; -		goto out; -	} - -	if (sub_info->path[0] == '\0') -		goto out; -  	if (!khelper_wq || usermodehelper_disabled) {  		retval = -EBUSY;  		goto out; diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 3fed7f0cbcdf..6e33498d665c 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -467,6 +467,7 @@ static struct kprobe *__kprobes get_optimized_kprobe(unsigned long addr)  /* Optimization staging list, protected by kprobe_mutex */  static LIST_HEAD(optimizing_list);  static LIST_HEAD(unoptimizing_list); +static LIST_HEAD(freeing_list);  static void kprobe_optimizer(struct work_struct *work);  static DECLARE_DELAYED_WORK(optimizing_work, kprobe_optimizer); @@ -504,7 +505,7 @@ static __kprobes void do_optimize_kprobes(void)   * Unoptimize (replace a jump with a breakpoint and remove the breakpoint   * if need) kprobes listed on unoptimizing_list.   
*/ -static __kprobes void do_unoptimize_kprobes(struct list_head *free_list) +static __kprobes void do_unoptimize_kprobes(void)  {  	struct optimized_kprobe *op, *tmp; @@ -515,9 +516,9 @@ static __kprobes void do_unoptimize_kprobes(struct list_head *free_list)  	/* Ditto to do_optimize_kprobes */  	get_online_cpus();  	mutex_lock(&text_mutex); -	arch_unoptimize_kprobes(&unoptimizing_list, free_list); +	arch_unoptimize_kprobes(&unoptimizing_list, &freeing_list);  	/* Loop free_list for disarming */ -	list_for_each_entry_safe(op, tmp, free_list, list) { +	list_for_each_entry_safe(op, tmp, &freeing_list, list) {  		/* Disarm probes if marked disabled */  		if (kprobe_disabled(&op->kp))  			arch_disarm_kprobe(&op->kp); @@ -536,11 +537,11 @@ static __kprobes void do_unoptimize_kprobes(struct list_head *free_list)  }  /* Reclaim all kprobes on the free_list */ -static __kprobes void do_free_cleaned_kprobes(struct list_head *free_list) +static __kprobes void do_free_cleaned_kprobes(void)  {  	struct optimized_kprobe *op, *tmp; -	list_for_each_entry_safe(op, tmp, free_list, list) { +	list_for_each_entry_safe(op, tmp, &freeing_list, list) {  		BUG_ON(!kprobe_unused(&op->kp));  		list_del_init(&op->list);  		free_aggr_kprobe(&op->kp); @@ -556,8 +557,6 @@ static __kprobes void kick_kprobe_optimizer(void)  /* Kprobe jump optimizer */  static __kprobes void kprobe_optimizer(struct work_struct *work)  { -	LIST_HEAD(free_list); -  	mutex_lock(&kprobe_mutex);  	/* Lock modules while optimizing kprobes */  	mutex_lock(&module_mutex); @@ -566,7 +565,7 @@ static __kprobes void kprobe_optimizer(struct work_struct *work)  	 * Step 1: Unoptimize kprobes and collect cleaned (unused and disarmed)  	 * kprobes before waiting for quiesence period.  	 
*/ -	do_unoptimize_kprobes(&free_list); +	do_unoptimize_kprobes();  	/*  	 * Step 2: Wait for quiesence period to ensure all running interrupts @@ -581,7 +580,7 @@ static __kprobes void kprobe_optimizer(struct work_struct *work)  	do_optimize_kprobes();  	/* Step 4: Free cleaned kprobes after quiesence period */ -	do_free_cleaned_kprobes(&free_list); +	do_free_cleaned_kprobes();  	mutex_unlock(&module_mutex);  	mutex_unlock(&kprobe_mutex); @@ -723,8 +722,19 @@ static void __kprobes kill_optimized_kprobe(struct kprobe *p)  	if (!list_empty(&op->list))  		/* Dequeue from the (un)optimization queue */  		list_del_init(&op->list); -  	op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED; + +	if (kprobe_unused(p)) { +		/* Enqueue if it is unused */ +		list_add(&op->list, &freeing_list); +		/* +		 * Remove unused probes from the hash list. After waiting +		 * for synchronization, this probe is reclaimed. +		 * (reclaiming is done by do_free_cleaned_kprobes().) +		 */ +		hlist_del_rcu(&op->kp.hlist); +	} +  	/* Don't touch the code, because it is already freed. 
*/  	arch_remove_optimized_kprobe(op);  } @@ -2322,6 +2332,7 @@ static ssize_t write_enabled_file_bool(struct file *file,  	if (copy_from_user(buf, user_buf, buf_size))  		return -EFAULT; +	buf[buf_size] = '\0';  	switch (buf[0]) {  	case 'y':  	case 'Y': @@ -2333,6 +2344,8 @@ static ssize_t write_enabled_file_bool(struct file *file,  	case '0':  		disarm_all_kprobes();  		break; +	default: +		return -EINVAL;  	}  	return count; diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 1f3186b37fd5..e16c45b9ee77 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -4090,7 +4090,7 @@ void debug_check_no_locks_freed(const void *mem_from, unsigned long mem_len)  }  EXPORT_SYMBOL_GPL(debug_check_no_locks_freed); -static void print_held_locks_bug(struct task_struct *curr) +static void print_held_locks_bug(void)  {  	if (!debug_locks_off())  		return; @@ -4099,22 +4099,21 @@ static void print_held_locks_bug(struct task_struct *curr)  	printk("\n");  	printk("=====================================\n"); -	printk("[ BUG: lock held at task exit time! ]\n"); +	printk("[ BUG: %s/%d still has locks held! 
]\n", +	       current->comm, task_pid_nr(current));  	print_kernel_ident();  	printk("-------------------------------------\n"); -	printk("%s/%d is exiting with locks still held!\n", -		curr->comm, task_pid_nr(curr)); -	lockdep_print_held_locks(curr); - +	lockdep_print_held_locks(current);  	printk("\nstack backtrace:\n");  	dump_stack();  } -void debug_check_no_locks_held(struct task_struct *task) +void debug_check_no_locks_held(void)  { -	if (unlikely(task->lockdep_depth > 0)) -		print_held_locks_bug(task); +	if (unlikely(current->lockdep_depth > 0)) +		print_held_locks_bug();  } +EXPORT_SYMBOL_GPL(debug_check_no_locks_held);  void debug_show_all_locks(void)  { diff --git a/kernel/mutex.c b/kernel/mutex.c index ad53a664f113..e581ada5faf4 100644 --- a/kernel/mutex.c +++ b/kernel/mutex.c @@ -254,16 +254,165 @@ void __sched mutex_unlock(struct mutex *lock)  EXPORT_SYMBOL(mutex_unlock); +/** + * ww_mutex_unlock - release the w/w mutex + * @lock: the mutex to be released + * + * Unlock a mutex that has been locked by this task previously with any of the + * ww_mutex_lock* functions (with or without an acquire context). It is + * forbidden to release the locks after releasing the acquire context. + * + * This function must not be used in interrupt context. Unlocking + * of a unlocked mutex is not allowed. + */ +void __sched ww_mutex_unlock(struct ww_mutex *lock) +{ +	/* +	 * The unlocking fastpath is the 0->1 transition from 'locked' +	 * into 'unlocked' state: +	 */ +	if (lock->ctx) { +#ifdef CONFIG_DEBUG_MUTEXES +		DEBUG_LOCKS_WARN_ON(!lock->ctx->acquired); +#endif +		if (lock->ctx->acquired > 0) +			lock->ctx->acquired--; +		lock->ctx = NULL; +	} + +#ifndef CONFIG_DEBUG_MUTEXES +	/* +	 * When debugging is enabled we must not clear the owner before time, +	 * the slow path will always be taken, and that clears the owner field +	 * after verifying that it was indeed current. 
+	 */ +	mutex_clear_owner(&lock->base); +#endif +	__mutex_fastpath_unlock(&lock->base.count, __mutex_unlock_slowpath); +} +EXPORT_SYMBOL(ww_mutex_unlock); + +static inline int __sched +__mutex_lock_check_stamp(struct mutex *lock, struct ww_acquire_ctx *ctx) +{ +	struct ww_mutex *ww = container_of(lock, struct ww_mutex, base); +	struct ww_acquire_ctx *hold_ctx = ACCESS_ONCE(ww->ctx); + +	if (!hold_ctx) +		return 0; + +	if (unlikely(ctx == hold_ctx)) +		return -EALREADY; + +	if (ctx->stamp - hold_ctx->stamp <= LONG_MAX && +	    (ctx->stamp != hold_ctx->stamp || ctx > hold_ctx)) { +#ifdef CONFIG_DEBUG_MUTEXES +		DEBUG_LOCKS_WARN_ON(ctx->contending_lock); +		ctx->contending_lock = ww; +#endif +		return -EDEADLK; +	} + +	return 0; +} + +static __always_inline void ww_mutex_lock_acquired(struct ww_mutex *ww, +						   struct ww_acquire_ctx *ww_ctx) +{ +#ifdef CONFIG_DEBUG_MUTEXES +	/* +	 * If this WARN_ON triggers, you used ww_mutex_lock to acquire, +	 * but released with a normal mutex_unlock in this call. +	 * +	 * This should never happen, always use ww_mutex_unlock. +	 */ +	DEBUG_LOCKS_WARN_ON(ww->ctx); + +	/* +	 * Not quite done after calling ww_acquire_done() ? +	 */ +	DEBUG_LOCKS_WARN_ON(ww_ctx->done_acquire); + +	if (ww_ctx->contending_lock) { +		/* +		 * After -EDEADLK you tried to +		 * acquire a different ww_mutex? Bad! +		 */ +		DEBUG_LOCKS_WARN_ON(ww_ctx->contending_lock != ww); + +		/* +		 * You called ww_mutex_lock after receiving -EDEADLK, +		 * but 'forgot' to unlock everything else first? +		 */ +		DEBUG_LOCKS_WARN_ON(ww_ctx->acquired > 0); +		ww_ctx->contending_lock = NULL; +	} + +	/* +	 * Naughty, using a different class will lead to undefined behavior! +	 */ +	DEBUG_LOCKS_WARN_ON(ww_ctx->ww_class != ww->ww_class); +#endif +	ww_ctx->acquired++; +} + +/* + * after acquiring lock with fastpath or when we lost out in contested + * slowpath, set ctx and wake up any waiters so they can recheck. 
+ * + * This function is never called when CONFIG_DEBUG_LOCK_ALLOC is set, + * as the fastpath and opportunistic spinning are disabled in that case. + */ +static __always_inline void +ww_mutex_set_context_fastpath(struct ww_mutex *lock, +			       struct ww_acquire_ctx *ctx) +{ +	unsigned long flags; +	struct mutex_waiter *cur; + +	ww_mutex_lock_acquired(lock, ctx); + +	lock->ctx = ctx; + +	/* +	 * The lock->ctx update should be visible on all cores before +	 * the atomic read is done, otherwise contended waiters might be +	 * missed. The contended waiters will either see ww_ctx == NULL +	 * and keep spinning, or it will acquire wait_lock, add itself +	 * to waiter list and sleep. +	 */ +	smp_mb(); /* ^^^ */ + +	/* +	 * Check if lock is contended, if not there is nobody to wake up +	 */ +	if (likely(atomic_read(&lock->base.count) == 0)) +		return; + +	/* +	 * Uh oh, we raced in fastpath, wake up everyone in this case, +	 * so they can see the new lock->ctx. +	 */ +	spin_lock_mutex(&lock->base.wait_lock, flags); +	list_for_each_entry(cur, &lock->base.wait_list, list) { +		debug_mutex_wake_waiter(&lock->base, cur); +		wake_up_process(cur->task); +	} +	spin_unlock_mutex(&lock->base.wait_lock, flags); +} +  /*   * Lock a mutex (possibly interruptible), slowpath:   */ -static inline int __sched +static __always_inline int __sched  __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, -		    struct lockdep_map *nest_lock, unsigned long ip) +		    struct lockdep_map *nest_lock, unsigned long ip, +		    struct ww_acquire_ctx *ww_ctx)  {  	struct task_struct *task = current;  	struct mutex_waiter waiter;  	unsigned long flags; +	int ret;  	preempt_disable();  	mutex_acquire_nest(&lock->dep_map, subclass, 0, nest_lock, ip); @@ -298,6 +447,22 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,  		struct task_struct *owner;  		struct mspin_node  node; +		if (!__builtin_constant_p(ww_ctx == NULL) && ww_ctx->acquired > 0) { +			
struct ww_mutex *ww; + +			ww = container_of(lock, struct ww_mutex, base); +			/* +			 * If ww->ctx is set the contents are undefined, only +			 * by acquiring wait_lock there is a guarantee that +			 * they are not invalid when reading. +			 * +			 * As such, when deadlock detection needs to be +			 * performed the optimistic spinning cannot be done. +			 */ +			if (ACCESS_ONCE(ww->ctx)) +				break; +		} +  		/*  		 * If there's an owner, wait for it to either  		 * release the lock or go to sleep. @@ -312,6 +477,13 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,  		if ((atomic_read(&lock->count) == 1) &&  		    (atomic_cmpxchg(&lock->count, 1, 0) == 1)) {  			lock_acquired(&lock->dep_map, ip); +			if (!__builtin_constant_p(ww_ctx == NULL)) { +				struct ww_mutex *ww; +				ww = container_of(lock, struct ww_mutex, base); + +				ww_mutex_set_context_fastpath(ww, ww_ctx); +			} +  			mutex_set_owner(lock);  			mspin_unlock(MLOCK(lock), &node);  			preempt_enable(); @@ -371,15 +543,16 @@ slowpath:  		 * TASK_UNINTERRUPTIBLE case.)  		 
*/  		if (unlikely(signal_pending_state(state, task))) { -			mutex_remove_waiter(lock, &waiter, -					    task_thread_info(task)); -			mutex_release(&lock->dep_map, 1, ip); -			spin_unlock_mutex(&lock->wait_lock, flags); +			ret = -EINTR; +			goto err; +		} -			debug_mutex_free_waiter(&waiter); -			preempt_enable(); -			return -EINTR; +		if (!__builtin_constant_p(ww_ctx == NULL) && ww_ctx->acquired > 0) { +			ret = __mutex_lock_check_stamp(lock, ww_ctx); +			if (ret) +				goto err;  		} +  		__set_task_state(task, state);  		/* didn't get the lock, go to sleep: */ @@ -394,6 +567,30 @@ done:  	mutex_remove_waiter(lock, &waiter, current_thread_info());  	mutex_set_owner(lock); +	if (!__builtin_constant_p(ww_ctx == NULL)) { +		struct ww_mutex *ww = container_of(lock, +						      struct ww_mutex, +						      base); +		struct mutex_waiter *cur; + +		/* +		 * This branch gets optimized out for the common case, +		 * and is only important for ww_mutex_lock. +		 */ + +		ww_mutex_lock_acquired(ww, ww_ctx); +		ww->ctx = ww_ctx; + +		/* +		 * Give any possible sleeping processes the chance to wake up, +		 * so they can recheck if they have to back off. 
+		 */ +		list_for_each_entry(cur, &lock->wait_list, list) { +			debug_mutex_wake_waiter(lock, cur); +			wake_up_process(cur->task); +		} +	} +  	/* set it to 0 if there are no waiters left: */  	if (likely(list_empty(&lock->wait_list)))  		atomic_set(&lock->count, 0); @@ -404,6 +601,14 @@ done:  	preempt_enable();  	return 0; + +err: +	mutex_remove_waiter(lock, &waiter, task_thread_info(task)); +	spin_unlock_mutex(&lock->wait_lock, flags); +	debug_mutex_free_waiter(&waiter); +	mutex_release(&lock->dep_map, 1, ip); +	preempt_enable(); +	return ret;  }  #ifdef CONFIG_DEBUG_LOCK_ALLOC @@ -411,7 +616,8 @@ void __sched  mutex_lock_nested(struct mutex *lock, unsigned int subclass)  {  	might_sleep(); -	__mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, subclass, NULL, _RET_IP_); +	__mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, +			    subclass, NULL, _RET_IP_, NULL);  }  EXPORT_SYMBOL_GPL(mutex_lock_nested); @@ -420,7 +626,8 @@ void __sched  _mutex_lock_nest_lock(struct mutex *lock, struct lockdep_map *nest)  {  	might_sleep(); -	__mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0, nest, _RET_IP_); +	__mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, +			    0, nest, _RET_IP_, NULL);  }  EXPORT_SYMBOL_GPL(_mutex_lock_nest_lock); @@ -429,7 +636,8 @@ int __sched  mutex_lock_killable_nested(struct mutex *lock, unsigned int subclass)  {  	might_sleep(); -	return __mutex_lock_common(lock, TASK_KILLABLE, subclass, NULL, _RET_IP_); +	return __mutex_lock_common(lock, TASK_KILLABLE, +				   subclass, NULL, _RET_IP_, NULL);  }  EXPORT_SYMBOL_GPL(mutex_lock_killable_nested); @@ -438,10 +646,68 @@ mutex_lock_interruptible_nested(struct mutex *lock, unsigned int subclass)  {  	might_sleep();  	return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, -				   subclass, NULL, _RET_IP_); +				   subclass, NULL, _RET_IP_, NULL);  }  EXPORT_SYMBOL_GPL(mutex_lock_interruptible_nested); + +static inline int +ww_mutex_deadlock_injection(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) +{ +#ifdef 
CONFIG_DEBUG_WW_MUTEX_SLOWPATH +	unsigned tmp; + +	if (ctx->deadlock_inject_countdown-- == 0) { +		tmp = ctx->deadlock_inject_interval; +		if (tmp > UINT_MAX/4) +			tmp = UINT_MAX; +		else +			tmp = tmp*2 + tmp + tmp/2; + +		ctx->deadlock_inject_interval = tmp; +		ctx->deadlock_inject_countdown = tmp; +		ctx->contending_lock = lock; + +		ww_mutex_unlock(lock); + +		return -EDEADLK; +	} +#endif + +	return 0; +} + +int __sched +__ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) +{ +	int ret; + +	might_sleep(); +	ret =  __mutex_lock_common(&lock->base, TASK_UNINTERRUPTIBLE, +				   0, &ctx->dep_map, _RET_IP_, ctx); +	if (!ret && ctx->acquired > 0) +		return ww_mutex_deadlock_injection(lock, ctx); + +	return ret; +} +EXPORT_SYMBOL_GPL(__ww_mutex_lock); + +int __sched +__ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) +{ +	int ret; + +	might_sleep(); +	ret = __mutex_lock_common(&lock->base, TASK_INTERRUPTIBLE, +				  0, &ctx->dep_map, _RET_IP_, ctx); + +	if (!ret && ctx->acquired > 0) +		return ww_mutex_deadlock_injection(lock, ctx); + +	return ret; +} +EXPORT_SYMBOL_GPL(__ww_mutex_lock_interruptible); +  #endif  /* @@ -494,10 +760,10 @@ __mutex_unlock_slowpath(atomic_t *lock_count)   * mutex_lock_interruptible() and mutex_trylock().   
*/  static noinline int __sched -__mutex_lock_killable_slowpath(atomic_t *lock_count); +__mutex_lock_killable_slowpath(struct mutex *lock);  static noinline int __sched -__mutex_lock_interruptible_slowpath(atomic_t *lock_count); +__mutex_lock_interruptible_slowpath(struct mutex *lock);  /**   * mutex_lock_interruptible - acquire the mutex, interruptible @@ -515,12 +781,12 @@ int __sched mutex_lock_interruptible(struct mutex *lock)  	int ret;  	might_sleep(); -	ret =  __mutex_fastpath_lock_retval -			(&lock->count, __mutex_lock_interruptible_slowpath); -	if (!ret) +	ret =  __mutex_fastpath_lock_retval(&lock->count); +	if (likely(!ret)) {  		mutex_set_owner(lock); - -	return ret; +		return 0; +	} else +		return __mutex_lock_interruptible_slowpath(lock);  }  EXPORT_SYMBOL(mutex_lock_interruptible); @@ -530,12 +796,12 @@ int __sched mutex_lock_killable(struct mutex *lock)  	int ret;  	might_sleep(); -	ret = __mutex_fastpath_lock_retval -			(&lock->count, __mutex_lock_killable_slowpath); -	if (!ret) +	ret = __mutex_fastpath_lock_retval(&lock->count); +	if (likely(!ret)) {  		mutex_set_owner(lock); - -	return ret; +		return 0; +	} else +		return __mutex_lock_killable_slowpath(lock);  }  EXPORT_SYMBOL(mutex_lock_killable); @@ -544,24 +810,39 @@ __mutex_lock_slowpath(atomic_t *lock_count)  {  	struct mutex *lock = container_of(lock_count, struct mutex, count); -	__mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0, NULL, _RET_IP_); +	__mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0, +			    NULL, _RET_IP_, NULL);  }  static noinline int __sched -__mutex_lock_killable_slowpath(atomic_t *lock_count) +__mutex_lock_killable_slowpath(struct mutex *lock)  { -	struct mutex *lock = container_of(lock_count, struct mutex, count); +	return __mutex_lock_common(lock, TASK_KILLABLE, 0, +				   NULL, _RET_IP_, NULL); +} -	return __mutex_lock_common(lock, TASK_KILLABLE, 0, NULL, _RET_IP_); +static noinline int __sched +__mutex_lock_interruptible_slowpath(struct mutex *lock) +{ +	return 
__mutex_lock_common(lock, TASK_INTERRUPTIBLE, 0, +				   NULL, _RET_IP_, NULL);  }  static noinline int __sched -__mutex_lock_interruptible_slowpath(atomic_t *lock_count) +__ww_mutex_lock_slowpath(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)  { -	struct mutex *lock = container_of(lock_count, struct mutex, count); +	return __mutex_lock_common(&lock->base, TASK_UNINTERRUPTIBLE, 0, +				   NULL, _RET_IP_, ctx); +} -	return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, 0, NULL, _RET_IP_); +static noinline int __sched +__ww_mutex_lock_interruptible_slowpath(struct ww_mutex *lock, +					    struct ww_acquire_ctx *ctx) +{ +	return __mutex_lock_common(&lock->base, TASK_INTERRUPTIBLE, 0, +				   NULL, _RET_IP_, ctx);  } +  #endif  /* @@ -617,6 +898,45 @@ int __sched mutex_trylock(struct mutex *lock)  }  EXPORT_SYMBOL(mutex_trylock); +#ifndef CONFIG_DEBUG_LOCK_ALLOC +int __sched +__ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) +{ +	int ret; + +	might_sleep(); + +	ret = __mutex_fastpath_lock_retval(&lock->base.count); + +	if (likely(!ret)) { +		ww_mutex_set_context_fastpath(lock, ctx); +		mutex_set_owner(&lock->base); +	} else +		ret = __ww_mutex_lock_slowpath(lock, ctx); +	return ret; +} +EXPORT_SYMBOL(__ww_mutex_lock); + +int __sched +__ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) +{ +	int ret; + +	might_sleep(); + +	ret = __mutex_fastpath_lock_retval(&lock->base.count); + +	if (likely(!ret)) { +		ww_mutex_set_context_fastpath(lock, ctx); +		mutex_set_owner(&lock->base); +	} else +		ret = __ww_mutex_lock_interruptible_slowpath(lock, ctx); +	return ret; +} +EXPORT_SYMBOL(__ww_mutex_lock_interruptible); + +#endif +  /**   * atomic_dec_and_mutex_lock - return holding mutex if we dec to 0   * @cnt: the atomic which we are to dec diff --git a/kernel/pid.c b/kernel/pid.c index 0db3e791a06d..66505c1dfc51 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -75,6 +75,7 @@ struct pid_namespace init_pid_ns = {  		[ 0 ... 
PIDMAP_ENTRIES-1] = { ATOMIC_INIT(BITS_PER_PAGE), NULL }  	},  	.last_pid = 0, +	.nr_hashed = PIDNS_HASH_ADDING,  	.level = 0,  	.child_reaper = &init_task,  	.user_ns = &init_user_ns, @@ -373,14 +374,10 @@ EXPORT_SYMBOL_GPL(find_vpid);  /*   * attach_pid() must be called with the tasklist_lock write-held.   */ -void attach_pid(struct task_struct *task, enum pid_type type, -		struct pid *pid) +void attach_pid(struct task_struct *task, enum pid_type type)  { -	struct pid_link *link; - -	link = &task->pids[type]; -	link->pid = pid; -	hlist_add_head_rcu(&link->node, &pid->tasks[type]); +	struct pid_link *link = &task->pids[type]; +	hlist_add_head_rcu(&link->node, &link->pid->tasks[type]);  }  static void __change_pid(struct task_struct *task, enum pid_type type, @@ -412,7 +409,7 @@ void change_pid(struct task_struct *task, enum pid_type type,  		struct pid *pid)  {  	__change_pid(task, type, pid); -	attach_pid(task, type, pid); +	attach_pid(task, type);  }  /* transfer_pid is an optimization of attach_pid(new), detach_pid(old) */ @@ -594,7 +591,6 @@ void __init pidmap_init(void)  	/* Reserve PID 0. 
We never call free_pidmap(0) */  	set_bit(0, init_pid_ns.pidmap[0].page);  	atomic_dec(&init_pid_ns.pidmap[0].nr_free); -	init_pid_ns.nr_hashed = PIDNS_HASH_ADDING;  	init_pid_ns.pid_cachep = KMEM_CACHE(pid,  			SLAB_HWCACHE_ALIGN | SLAB_PANIC); diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 5dfdc9ea180b..d444c4e834f4 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -100,7 +100,6 @@ config PM_SLEEP_SMP  	depends on SMP  	depends on ARCH_SUSPEND_POSSIBLE || ARCH_HIBERNATION_POSSIBLE  	depends on PM_SLEEP -	select HOTPLUG  	select HOTPLUG_CPU  config PM_AUTOSLEEP @@ -263,6 +262,26 @@ config PM_GENERIC_DOMAINS  	bool  	depends on PM +config WQ_POWER_EFFICIENT_DEFAULT +	bool "Enable workqueue power-efficient mode by default" +	depends on PM +	default n +	help +	  Per-cpu workqueues are generally preferred because they show +	  better performance thanks to cache locality; unfortunately, +	  per-cpu workqueues tend to be more power hungry than unbound +	  workqueues. + +	  Enabling workqueue.power_efficient kernel parameter makes the +	  per-cpu workqueues which were observed to contribute +	  significantly to power consumption unbound, leading to measurably +	  lower power usage at the cost of small performance overhead. + +	  This config option determines whether workqueue.power_efficient +	  is enabled by default. + +	  If in doubt, say N. 
+  config PM_GENERIC_DOMAINS_SLEEP  	def_bool y  	depends on PM_SLEEP && PM_GENERIC_DOMAINS diff --git a/kernel/power/main.c b/kernel/power/main.c index d77663bfedeb..1d1bf630e6e9 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -424,6 +424,8 @@ static ssize_t wakeup_count_store(struct kobject *kobj,  	if (sscanf(buf, "%u", &val) == 1) {  		if (pm_save_wakeup_count(val))  			error = n; +		else +			pm_print_active_wakeup_sources();  	}   out: @@ -528,6 +530,10 @@ pm_trace_store(struct kobject *kobj, struct kobj_attribute *attr,  	if (sscanf(buf, "%d", &val) == 1) {  		pm_trace_enabled = !!val; +		if (pm_trace_enabled) { +			pr_warn("PM: Enabling pm_trace changes system date and time during resume.\n" +				"PM: Correct system time has to be restored manually after resume.\n"); +		}  		return n;  	}  	return -EINVAL; diff --git a/kernel/power/process.c b/kernel/power/process.c index 98088e0e71e8..fc0df8486449 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -30,9 +30,10 @@ static int try_to_freeze_tasks(bool user_only)  	unsigned int todo;  	bool wq_busy = false;  	struct timeval start, end; -	u64 elapsed_csecs64; -	unsigned int elapsed_csecs; +	u64 elapsed_msecs64; +	unsigned int elapsed_msecs;  	bool wakeup = false; +	int sleep_usecs = USEC_PER_MSEC;  	do_gettimeofday(&start); @@ -68,22 +69,25 @@ static int try_to_freeze_tasks(bool user_only)  		/*  		 * We need to retry, but first give the freezing tasks some -		 * time to enter the refrigerator. +		 * time to enter the refrigerator.  Start with an initial +		 * 1 ms sleep followed by exponential backoff until 8 ms.  		 
*/ -		msleep(10); +		usleep_range(sleep_usecs / 2, sleep_usecs); +		if (sleep_usecs < 8 * USEC_PER_MSEC) +			sleep_usecs *= 2;  	}  	do_gettimeofday(&end); -	elapsed_csecs64 = timeval_to_ns(&end) - timeval_to_ns(&start); -	do_div(elapsed_csecs64, NSEC_PER_SEC / 100); -	elapsed_csecs = elapsed_csecs64; +	elapsed_msecs64 = timeval_to_ns(&end) - timeval_to_ns(&start); +	do_div(elapsed_msecs64, NSEC_PER_MSEC); +	elapsed_msecs = elapsed_msecs64;  	if (todo) {  		printk("\n"); -		printk(KERN_ERR "Freezing of tasks %s after %d.%02d seconds " +		printk(KERN_ERR "Freezing of tasks %s after %d.%03d seconds "  		       "(%d tasks refusing to freeze, wq_busy=%d):\n",  		       wakeup ? "aborted" : "failed", -		       elapsed_csecs / 100, elapsed_csecs % 100, +		       elapsed_msecs / 1000, elapsed_msecs % 1000,  		       todo - wq_busy, wq_busy);  		if (!wakeup) { @@ -96,8 +100,8 @@ static int try_to_freeze_tasks(bool user_only)  			read_unlock(&tasklist_lock);  		}  	} else { -		printk("(elapsed %d.%02d seconds) ", elapsed_csecs / 100, -			elapsed_csecs % 100); +		printk("(elapsed %d.%03d seconds) ", elapsed_msecs / 1000, +			elapsed_msecs % 1000);  	}  	return todo ? 
-EBUSY : 0; diff --git a/kernel/power/qos.c b/kernel/power/qos.c index 587dddeebf15..06fe28589e9c 100644 --- a/kernel/power/qos.c +++ b/kernel/power/qos.c @@ -44,6 +44,7 @@  #include <linux/uaccess.h>  #include <linux/export.h> +#include <trace/events/power.h>  /*   * locking rule: all changes to constraints or notifiers lists @@ -202,6 +203,7 @@ int pm_qos_update_target(struct pm_qos_constraints *c, struct plist_node *node,  	spin_unlock_irqrestore(&pm_qos_lock, flags); +	trace_pm_qos_update_target(action, prev_value, curr_value);  	if (prev_value != curr_value) {  		blocking_notifier_call_chain(c->notifiers,  					     (unsigned long)curr_value, @@ -272,6 +274,7 @@ bool pm_qos_update_flags(struct pm_qos_flags *pqf,  	spin_unlock_irqrestore(&pm_qos_lock, irqflags); +	trace_pm_qos_update_flags(action, prev_value, curr_value);  	return prev_value != curr_value;  } @@ -333,6 +336,7 @@ void pm_qos_add_request(struct pm_qos_request *req,  	}  	req->pm_qos_class = pm_qos_class;  	INIT_DELAYED_WORK(&req->work, pm_qos_work_fn); +	trace_pm_qos_add_request(pm_qos_class, value);  	pm_qos_update_target(pm_qos_array[pm_qos_class]->constraints,  			     &req->node, PM_QOS_ADD_REQ, value);  } @@ -361,6 +365,7 @@ void pm_qos_update_request(struct pm_qos_request *req,  	cancel_delayed_work_sync(&req->work); +	trace_pm_qos_update_request(req->pm_qos_class, new_value);  	if (new_value != req->node.prio)  		pm_qos_update_target(  			pm_qos_array[req->pm_qos_class]->constraints, @@ -387,6 +392,8 @@ void pm_qos_update_request_timeout(struct pm_qos_request *req, s32 new_value,  	cancel_delayed_work_sync(&req->work); +	trace_pm_qos_update_request_timeout(req->pm_qos_class, +					    new_value, timeout_us);  	if (new_value != req->node.prio)  		pm_qos_update_target(  			pm_qos_array[req->pm_qos_class]->constraints, @@ -416,6 +423,7 @@ void pm_qos_remove_request(struct pm_qos_request *req)  	cancel_delayed_work_sync(&req->work); +	trace_pm_qos_remove_request(req->pm_qos_class, 
PM_QOS_DEFAULT_VALUE);  	pm_qos_update_target(pm_qos_array[req->pm_qos_class]->constraints,  			     &req->node, PM_QOS_REMOVE_REQ,  			     PM_QOS_DEFAULT_VALUE); @@ -477,7 +485,7 @@ static int find_pm_qos_object_by_minor(int minor)  {  	int pm_qos_class; -	for (pm_qos_class = 0; +	for (pm_qos_class = PM_QOS_CPU_DMA_LATENCY;  		pm_qos_class < PM_QOS_NUM_CLASSES; pm_qos_class++) {  		if (minor ==  			pm_qos_array[pm_qos_class]->pm_qos_power_miscdev.minor) @@ -491,7 +499,7 @@ static int pm_qos_power_open(struct inode *inode, struct file *filp)  	long pm_qos_class;  	pm_qos_class = find_pm_qos_object_by_minor(iminor(inode)); -	if (pm_qos_class >= 0) { +	if (pm_qos_class >= PM_QOS_CPU_DMA_LATENCY) {  		struct pm_qos_request *req = kzalloc(sizeof(*req), GFP_KERNEL);  		if (!req)  			return -ENOMEM; @@ -584,7 +592,7 @@ static int __init pm_qos_power_init(void)  	BUILD_BUG_ON(ARRAY_SIZE(pm_qos_array) != PM_QOS_NUM_CLASSES); -	for (i = 1; i < PM_QOS_NUM_CLASSES; i++) { +	for (i = PM_QOS_CPU_DMA_LATENCY; i < PM_QOS_NUM_CLASSES; i++) {  		ret = register_pm_qos_misc(pm_qos_array[i]);  		if (ret < 0) {  			printk(KERN_ERR "pm_qos_param: %s setup failed\n", diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 0de28576807d..349587bb03e1 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -642,8 +642,9 @@ __register_nosave_region(unsigned long start_pfn, unsigned long end_pfn,  	region->end_pfn = end_pfn;  	list_add_tail(&region->list, &nosave_regions);   Report: -	printk(KERN_INFO "PM: Registered nosave memory: %016lx - %016lx\n", -		start_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT); +	printk(KERN_INFO "PM: Registered nosave memory: [mem %#010llx-%#010llx]\n", +		(unsigned long long) start_pfn << PAGE_SHIFT, +		((unsigned long long) end_pfn << PAGE_SHIFT) - 1);  }  /* @@ -1651,7 +1652,7 @@ unsigned long snapshot_get_image_size(void)  static int init_header(struct swsusp_info *info)  {  	memset(info, 0, sizeof(struct swsusp_info)); -	
info->num_physpages = num_physpages; +	info->num_physpages = get_num_physpages();  	info->image_pages = nr_copy_pages;  	info->pages = snapshot_get_image_size();  	info->size = info->pages; @@ -1795,7 +1796,7 @@ static int check_header(struct swsusp_info *info)  	char *reason;  	reason = check_image_kernel(info); -	if (!reason && info->num_physpages != num_physpages) +	if (!reason && info->num_physpages != get_num_physpages())  		reason = "memory size";  	if (reason) {  		printk(KERN_ERR "PM: Image mismatch: %s\n", reason); diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index bef86d121eb2..ece04223bb1e 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -269,7 +269,7 @@ int suspend_devices_and_enter(suspend_state_t state)  	suspend_test_start();  	error = dpm_suspend_start(PMSG_SUSPEND);  	if (error) { -		printk(KERN_ERR "PM: Some devices failed to suspend\n"); +		pr_err("PM: Some devices failed to suspend, or early wake event detected\n");  		goto Recover_platform;  	}  	suspend_test_finish("suspend devices"); diff --git a/kernel/ptrace.c b/kernel/ptrace.c index aed981a3f69c..ba5e6cea181a 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -665,20 +665,22 @@ static int ptrace_peek_siginfo(struct task_struct *child,  		if (unlikely(is_compat_task())) {  			compat_siginfo_t __user *uinfo = compat_ptr(data); -			ret = copy_siginfo_to_user32(uinfo, &info); -			ret |= __put_user(info.si_code, &uinfo->si_code); +			if (copy_siginfo_to_user32(uinfo, &info) || +			    __put_user(info.si_code, &uinfo->si_code)) { +				ret = -EFAULT; +				break; +			} +  		} else  #endif  		{  			siginfo_t __user *uinfo = (siginfo_t __user *) data; -			ret = copy_siginfo_to_user(uinfo, &info); -			ret |= __put_user(info.si_code, &uinfo->si_code); -		} - -		if (ret) { -			ret = -EFAULT; -			break; +			if (copy_siginfo_to_user(uinfo, &info) || +			    __put_user(info.si_code, &uinfo->si_code)) { +				ret = -EFAULT; +				break; +			}  		}  		data += 
sizeof(siginfo_t); @@ -842,6 +844,47 @@ int ptrace_request(struct task_struct *child, long request,  			ret = ptrace_setsiginfo(child, &siginfo);  		break; +	case PTRACE_GETSIGMASK: +		if (addr != sizeof(sigset_t)) { +			ret = -EINVAL; +			break; +		} + +		if (copy_to_user(datavp, &child->blocked, sizeof(sigset_t))) +			ret = -EFAULT; +		else +			ret = 0; + +		break; + +	case PTRACE_SETSIGMASK: { +		sigset_t new_set; + +		if (addr != sizeof(sigset_t)) { +			ret = -EINVAL; +			break; +		} + +		if (copy_from_user(&new_set, datavp, sizeof(sigset_t))) { +			ret = -EFAULT; +			break; +		} + +		sigdelsetmask(&new_set, sigmask(SIGKILL)|sigmask(SIGSTOP)); + +		/* +		 * Every thread does recalc_sigpending() after resume, so +		 * retarget_shared_pending() and recalc_sigpending() are not +		 * called here. +		 */ +		spin_lock_irq(&child->sighand->siglock); +		child->blocked = new_set; +		spin_unlock_irq(&child->sighand->siglock); + +		ret = 0; +		break; +	} +  	case PTRACE_INTERRUPT:  		/*  		 * Stop tracee without any side-effect on signal or job @@ -946,8 +989,7 @@ int ptrace_request(struct task_struct *child, long request,  #ifdef CONFIG_HAVE_ARCH_TRACEHOOK  	case PTRACE_GETREGSET: -	case PTRACE_SETREGSET: -	{ +	case PTRACE_SETREGSET: {  		struct iovec kiov;  		struct iovec __user *uiov = datavp; diff --git a/kernel/range.c b/kernel/range.c index eb911dbce267..322ea8e93e4b 100644 --- a/kernel/range.c +++ b/kernel/range.c @@ -4,7 +4,7 @@  #include <linux/kernel.h>  #include <linux/init.h>  #include <linux/sort.h> - +#include <linux/string.h>  #include <linux/range.h>  int add_range(struct range *range, int az, int nr_range, u64 start, u64 end) @@ -32,9 +32,8 @@ int add_range_with_merge(struct range *range, int az, int nr_range,  	if (start >= end)  		return nr_range; -	/* Try to merge it with old one: */ +	/* get new start/end: */  	for (i = 0; i < nr_range; i++) { -		u64 final_start, final_end;  		u64 common_start, common_end;  		if (!range[i].end) @@ -45,14 +44,16 @@ int 
add_range_with_merge(struct range *range, int az, int nr_range,  		if (common_start > common_end)  			continue; -		final_start = min(range[i].start, start); -		final_end = max(range[i].end, end); +		/* new start/end, will add it back at last */ +		start = min(range[i].start, start); +		end = max(range[i].end, end); -		/* clear it and add it back for further merge */ -		range[i].start = 0; -		range[i].end =  0; -		return add_range_with_merge(range, az, nr_range, -			final_start, final_end); +		memmove(&range[i], &range[i + 1], +			(nr_range - (i + 1)) * sizeof(range[i])); +		range[nr_range - 1].start = 0; +		range[nr_range - 1].end   = 0; +		nr_range--; +		i--;  	}  	/* Need to add it: */ diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index 48ab70384a4c..cce6ba8bbace 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -104,31 +104,7 @@ void __rcu_read_unlock(void)  }  EXPORT_SYMBOL_GPL(__rcu_read_unlock); -/* - * Check for a task exiting while in a preemptible-RCU read-side - * critical section, clean up if so.  No need to issue warnings, - * as debug_check_no_locks_held() already does this if lockdep - * is enabled. 
- */ -void exit_rcu(void) -{ -	struct task_struct *t = current; - -	if (likely(list_empty(&current->rcu_node_entry))) -		return; -	t->rcu_read_lock_nesting = 1; -	barrier(); -	t->rcu_read_unlock_special = RCU_READ_UNLOCK_BLOCKED; -	__rcu_read_unlock(); -} - -#else /* #ifdef CONFIG_PREEMPT_RCU */ - -void exit_rcu(void) -{ -} - -#endif /* #else #ifdef CONFIG_PREEMPT_RCU */ +#endif /* #ifdef CONFIG_PREEMPT_RCU */  #ifdef CONFIG_DEBUG_LOCK_ALLOC  static struct lock_class_key rcu_lock_key; @@ -145,9 +121,6 @@ static struct lock_class_key rcu_sched_lock_key;  struct lockdep_map rcu_sched_lock_map =  	STATIC_LOCKDEP_MAP_INIT("rcu_read_lock_sched", &rcu_sched_lock_key);  EXPORT_SYMBOL_GPL(rcu_sched_lock_map); -#endif - -#ifdef CONFIG_DEBUG_LOCK_ALLOC  int debug_lockdep_rcu_enabled(void)  { diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c index a0714a51b6d7..aa344111de3e 100644 --- a/kernel/rcutiny.c +++ b/kernel/rcutiny.c @@ -44,7 +44,6 @@  /* Forward declarations for rcutiny_plugin.h. */  struct rcu_ctrlblk; -static void invoke_rcu_callbacks(void);  static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp);  static void rcu_process_callbacks(struct softirq_action *unused);  static void __call_rcu(struct rcu_head *head, @@ -205,7 +204,7 @@ static int rcu_is_cpu_rrupt_from_idle(void)   */  static int rcu_qsctr_help(struct rcu_ctrlblk *rcp)  { -	reset_cpu_stall_ticks(rcp); +	RCU_TRACE(reset_cpu_stall_ticks(rcp));  	if (rcp->rcucblist != NULL &&  	    rcp->donetail != rcp->curtail) {  		rcp->donetail = rcp->curtail; @@ -227,7 +226,7 @@ void rcu_sched_qs(int cpu)  	local_irq_save(flags);  	if (rcu_qsctr_help(&rcu_sched_ctrlblk) +  	    rcu_qsctr_help(&rcu_bh_ctrlblk)) -		invoke_rcu_callbacks(); +		raise_softirq(RCU_SOFTIRQ);  	local_irq_restore(flags);  } @@ -240,7 +239,7 @@ void rcu_bh_qs(int cpu)  	local_irq_save(flags);  	if (rcu_qsctr_help(&rcu_bh_ctrlblk)) -		invoke_rcu_callbacks(); +		raise_softirq(RCU_SOFTIRQ);  	local_irq_restore(flags);  } @@ -252,12 +251,11 @@ void 
rcu_bh_qs(int cpu)   */  void rcu_check_callbacks(int cpu, int user)  { -	check_cpu_stalls(); +	RCU_TRACE(check_cpu_stalls());  	if (user || rcu_is_cpu_rrupt_from_idle())  		rcu_sched_qs(cpu);  	else if (!in_softirq())  		rcu_bh_qs(cpu); -	rcu_preempt_check_callbacks();  }  /* @@ -278,7 +276,7 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)  					      ACCESS_ONCE(rcp->rcucblist),  					      need_resched(),  					      is_idle_task(current), -					      rcu_is_callbacks_kthread())); +					      false));  		return;  	} @@ -290,7 +288,6 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)  	*rcp->donetail = NULL;  	if (rcp->curtail == rcp->donetail)  		rcp->curtail = &rcp->rcucblist; -	rcu_preempt_remove_callbacks(rcp);  	rcp->donetail = &rcp->rcucblist;  	local_irq_restore(flags); @@ -309,14 +306,13 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)  	RCU_TRACE(rcu_trace_sub_qlen(rcp, cb_count));  	RCU_TRACE(trace_rcu_batch_end(rcp->name, cb_count, 0, need_resched(),  				      is_idle_task(current), -				      rcu_is_callbacks_kthread())); +				      false));  }  static void rcu_process_callbacks(struct softirq_action *unused)  {  	__rcu_process_callbacks(&rcu_sched_ctrlblk);  	__rcu_process_callbacks(&rcu_bh_ctrlblk); -	rcu_preempt_process_callbacks();  }  /* @@ -382,3 +378,8 @@ void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu))  	__call_rcu(head, func, &rcu_bh_ctrlblk);  }  EXPORT_SYMBOL_GPL(call_rcu_bh); + +void rcu_init(void) +{ +	open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); +} diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h index 8a233002faeb..0cd385acccfa 100644 --- a/kernel/rcutiny_plugin.h +++ b/kernel/rcutiny_plugin.h @@ -53,958 +53,10 @@ static struct rcu_ctrlblk rcu_bh_ctrlblk = {  };  #ifdef CONFIG_DEBUG_LOCK_ALLOC +#include <linux/kernel_stat.h> +  int rcu_scheduler_active __read_mostly;  EXPORT_SYMBOL_GPL(rcu_scheduler_active); -#endif /* #ifdef 
CONFIG_DEBUG_LOCK_ALLOC */ - -#ifdef CONFIG_RCU_TRACE - -static void check_cpu_stall(struct rcu_ctrlblk *rcp) -{ -	unsigned long j; -	unsigned long js; - -	if (rcu_cpu_stall_suppress) -		return; -	rcp->ticks_this_gp++; -	j = jiffies; -	js = rcp->jiffies_stall; -	if (*rcp->curtail && ULONG_CMP_GE(j, js)) { -		pr_err("INFO: %s stall on CPU (%lu ticks this GP) idle=%llx (t=%lu jiffies q=%ld)\n", -		       rcp->name, rcp->ticks_this_gp, rcu_dynticks_nesting, -		       jiffies - rcp->gp_start, rcp->qlen); -		dump_stack(); -	} -	if (*rcp->curtail && ULONG_CMP_GE(j, js)) -		rcp->jiffies_stall = jiffies + -			3 * rcu_jiffies_till_stall_check() + 3; -	else if (ULONG_CMP_GE(j, js)) -		rcp->jiffies_stall = jiffies + rcu_jiffies_till_stall_check(); -} - -static void check_cpu_stall_preempt(void); - -#endif /* #ifdef CONFIG_RCU_TRACE */ - -static void reset_cpu_stall_ticks(struct rcu_ctrlblk *rcp) -{ -#ifdef CONFIG_RCU_TRACE -	rcp->ticks_this_gp = 0; -	rcp->gp_start = jiffies; -	rcp->jiffies_stall = jiffies + rcu_jiffies_till_stall_check(); -#endif /* #ifdef CONFIG_RCU_TRACE */ -} - -static void check_cpu_stalls(void) -{ -	RCU_TRACE(check_cpu_stall(&rcu_bh_ctrlblk)); -	RCU_TRACE(check_cpu_stall(&rcu_sched_ctrlblk)); -	RCU_TRACE(check_cpu_stall_preempt()); -} - -#ifdef CONFIG_TINY_PREEMPT_RCU - -#include <linux/delay.h> - -/* Global control variables for preemptible RCU. */ -struct rcu_preempt_ctrlblk { -	struct rcu_ctrlblk rcb;	/* curtail: ->next ptr of last CB for GP. */ -	struct rcu_head **nexttail; -				/* Tasks blocked in a preemptible RCU */ -				/*  read-side critical section while an */ -				/*  preemptible-RCU grace period is in */ -				/*  progress must wait for a later grace */ -				/*  period.  This pointer points to the */ -				/*  ->next pointer of the last task that */ -				/*  must wait for a later grace period, or */ -				/*  to &->rcb.rcucblist if there is no */ -				/*  such task. 
*/ -	struct list_head blkd_tasks; -				/* Tasks blocked in RCU read-side critical */ -				/*  section.  Tasks are placed at the head */ -				/*  of this list and age towards the tail. */ -	struct list_head *gp_tasks; -				/* Pointer to the first task blocking the */ -				/*  current grace period, or NULL if there */ -				/*  is no such task. */ -	struct list_head *exp_tasks; -				/* Pointer to first task blocking the */ -				/*  current expedited grace period, or NULL */ -				/*  if there is no such task.  If there */ -				/*  is no current expedited grace period, */ -				/*  then there cannot be any such task. */ -#ifdef CONFIG_RCU_BOOST -	struct list_head *boost_tasks; -				/* Pointer to first task that needs to be */ -				/*  priority-boosted, or NULL if no priority */ -				/*  boosting is needed.  If there is no */ -				/*  current or expedited grace period, there */ -				/*  can be no such task. */ -#endif /* #ifdef CONFIG_RCU_BOOST */ -	u8 gpnum;		/* Current grace period. */ -	u8 gpcpu;		/* Last grace period blocked by the CPU. */ -	u8 completed;		/* Last grace period completed. */ -				/*  If all three are equal, RCU is idle. */ -#ifdef CONFIG_RCU_BOOST -	unsigned long boost_time; /* When to start boosting (jiffies) */ -#endif /* #ifdef CONFIG_RCU_BOOST */ -#ifdef CONFIG_RCU_TRACE -	unsigned long n_grace_periods; -#ifdef CONFIG_RCU_BOOST -	unsigned long n_tasks_boosted; -				/* Total number of tasks boosted. */ -	unsigned long n_exp_boosts; -				/* Number of tasks boosted for expedited GP. */ -	unsigned long n_normal_boosts; -				/* Number of tasks boosted for normal GP. */ -	unsigned long n_balk_blkd_tasks; -				/* Refused to boost: no blocked tasks. */ -	unsigned long n_balk_exp_gp_tasks; -				/* Refused to boost: nothing blocking GP. */ -	unsigned long n_balk_boost_tasks; -				/* Refused to boost: already boosting. */ -	unsigned long n_balk_notyet; -				/* Refused to boost: not yet time. 
*/ -	unsigned long n_balk_nos; -				/* Refused to boost: not sure why, though. */ -				/*  This can happen due to race conditions. */ -#endif /* #ifdef CONFIG_RCU_BOOST */ -#endif /* #ifdef CONFIG_RCU_TRACE */ -}; - -static struct rcu_preempt_ctrlblk rcu_preempt_ctrlblk = { -	.rcb.donetail = &rcu_preempt_ctrlblk.rcb.rcucblist, -	.rcb.curtail = &rcu_preempt_ctrlblk.rcb.rcucblist, -	.nexttail = &rcu_preempt_ctrlblk.rcb.rcucblist, -	.blkd_tasks = LIST_HEAD_INIT(rcu_preempt_ctrlblk.blkd_tasks), -	RCU_TRACE(.rcb.name = "rcu_preempt") -}; - -static int rcu_preempted_readers_exp(void); -static void rcu_report_exp_done(void); - -/* - * Return true if the CPU has not yet responded to the current grace period. - */ -static int rcu_cpu_blocking_cur_gp(void) -{ -	return rcu_preempt_ctrlblk.gpcpu != rcu_preempt_ctrlblk.gpnum; -} - -/* - * Check for a running RCU reader.  Because there is only one CPU, - * there can be but one running RCU reader at a time.  ;-) - * - * Returns zero if there are no running readers.  Returns a positive - * number if there is at least one reader within its RCU read-side - * critical section.  Returns a negative number if an outermost reader - * is in the midst of exiting from its RCU read-side critical section - * - * Returns zero if there are no running readers.  Returns a positive - * number if there is at least one reader within its RCU read-side - * critical section.  Returns a negative number if an outermost reader - * is in the midst of exiting from its RCU read-side critical section. - */ -static int rcu_preempt_running_reader(void) -{ -	return current->rcu_read_lock_nesting; -} - -/* - * Check for preempted RCU readers blocking any grace period. - * If the caller needs a reliable answer, it must disable hard irqs. - */ -static int rcu_preempt_blocked_readers_any(void) -{ -	return !list_empty(&rcu_preempt_ctrlblk.blkd_tasks); -} - -/* - * Check for preempted RCU readers blocking the current grace period. 
- * If the caller needs a reliable answer, it must disable hard irqs. - */ -static int rcu_preempt_blocked_readers_cgp(void) -{ -	return rcu_preempt_ctrlblk.gp_tasks != NULL; -} - -/* - * Return true if another preemptible-RCU grace period is needed. - */ -static int rcu_preempt_needs_another_gp(void) -{ -	return *rcu_preempt_ctrlblk.rcb.curtail != NULL; -} - -/* - * Return true if a preemptible-RCU grace period is in progress. - * The caller must disable hardirqs. - */ -static int rcu_preempt_gp_in_progress(void) -{ -	return rcu_preempt_ctrlblk.completed != rcu_preempt_ctrlblk.gpnum; -} - -/* - * Advance a ->blkd_tasks-list pointer to the next entry, instead - * returning NULL if at the end of the list. - */ -static struct list_head *rcu_next_node_entry(struct task_struct *t) -{ -	struct list_head *np; - -	np = t->rcu_node_entry.next; -	if (np == &rcu_preempt_ctrlblk.blkd_tasks) -		np = NULL; -	return np; -} - -#ifdef CONFIG_RCU_TRACE - -#ifdef CONFIG_RCU_BOOST -static void rcu_initiate_boost_trace(void); -#endif /* #ifdef CONFIG_RCU_BOOST */ - -/* - * Dump additional statistice for TINY_PREEMPT_RCU. 
- */ -static void show_tiny_preempt_stats(struct seq_file *m) -{ -	seq_printf(m, "rcu_preempt: qlen=%ld gp=%lu g%u/p%u/c%u tasks=%c%c%c\n", -		   rcu_preempt_ctrlblk.rcb.qlen, -		   rcu_preempt_ctrlblk.n_grace_periods, -		   rcu_preempt_ctrlblk.gpnum, -		   rcu_preempt_ctrlblk.gpcpu, -		   rcu_preempt_ctrlblk.completed, -		   "T."[list_empty(&rcu_preempt_ctrlblk.blkd_tasks)], -		   "N."[!rcu_preempt_ctrlblk.gp_tasks], -		   "E."[!rcu_preempt_ctrlblk.exp_tasks]); -#ifdef CONFIG_RCU_BOOST -	seq_printf(m, "%sttb=%c ntb=%lu neb=%lu nnb=%lu j=%04x bt=%04x\n", -		   "             ", -		   "B."[!rcu_preempt_ctrlblk.boost_tasks], -		   rcu_preempt_ctrlblk.n_tasks_boosted, -		   rcu_preempt_ctrlblk.n_exp_boosts, -		   rcu_preempt_ctrlblk.n_normal_boosts, -		   (int)(jiffies & 0xffff), -		   (int)(rcu_preempt_ctrlblk.boost_time & 0xffff)); -	seq_printf(m, "%s: nt=%lu egt=%lu bt=%lu ny=%lu nos=%lu\n", -		   "             balk", -		   rcu_preempt_ctrlblk.n_balk_blkd_tasks, -		   rcu_preempt_ctrlblk.n_balk_exp_gp_tasks, -		   rcu_preempt_ctrlblk.n_balk_boost_tasks, -		   rcu_preempt_ctrlblk.n_balk_notyet, -		   rcu_preempt_ctrlblk.n_balk_nos); -#endif /* #ifdef CONFIG_RCU_BOOST */ -} - -#endif /* #ifdef CONFIG_RCU_TRACE */ - -#ifdef CONFIG_RCU_BOOST - -#include "rtmutex_common.h" - -#define RCU_BOOST_PRIO CONFIG_RCU_BOOST_PRIO - -/* Controls for rcu_kthread() kthread. */ -static struct task_struct *rcu_kthread_task; -static DECLARE_WAIT_QUEUE_HEAD(rcu_kthread_wq); -static unsigned long have_rcu_kthread_work; - -/* - * Carry out RCU priority boosting on the task indicated by ->boost_tasks, - * and advance ->boost_tasks to the next task in the ->blkd_tasks list. - */ -static int rcu_boost(void) -{ -	unsigned long flags; -	struct rt_mutex mtx; -	struct task_struct *t; -	struct list_head *tb; - -	if (rcu_preempt_ctrlblk.boost_tasks == NULL && -	    rcu_preempt_ctrlblk.exp_tasks == NULL) -		return 0;  /* Nothing to boost. 
*/ - -	local_irq_save(flags); - -	/* -	 * Recheck with irqs disabled: all tasks in need of boosting -	 * might exit their RCU read-side critical sections on their own -	 * if we are preempted just before disabling irqs. -	 */ -	if (rcu_preempt_ctrlblk.boost_tasks == NULL && -	    rcu_preempt_ctrlblk.exp_tasks == NULL) { -		local_irq_restore(flags); -		return 0; -	} - -	/* -	 * Preferentially boost tasks blocking expedited grace periods. -	 * This cannot starve the normal grace periods because a second -	 * expedited grace period must boost all blocked tasks, including -	 * those blocking the pre-existing normal grace period. -	 */ -	if (rcu_preempt_ctrlblk.exp_tasks != NULL) { -		tb = rcu_preempt_ctrlblk.exp_tasks; -		RCU_TRACE(rcu_preempt_ctrlblk.n_exp_boosts++); -	} else { -		tb = rcu_preempt_ctrlblk.boost_tasks; -		RCU_TRACE(rcu_preempt_ctrlblk.n_normal_boosts++); -	} -	RCU_TRACE(rcu_preempt_ctrlblk.n_tasks_boosted++); - -	/* -	 * We boost task t by manufacturing an rt_mutex that appears to -	 * be held by task t.  We leave a pointer to that rt_mutex where -	 * task t can find it, and task t will release the mutex when it -	 * exits its outermost RCU read-side critical section.  Then -	 * simply acquiring this artificial rt_mutex will boost task -	 * t's priority.  (Thanks to tglx for suggesting this approach!) -	 */ -	t = container_of(tb, struct task_struct, rcu_node_entry); -	rt_mutex_init_proxy_locked(&mtx, t); -	t->rcu_boost_mutex = &mtx; -	local_irq_restore(flags); -	rt_mutex_lock(&mtx); -	rt_mutex_unlock(&mtx);  /* Keep lockdep happy. */ - -	return ACCESS_ONCE(rcu_preempt_ctrlblk.boost_tasks) != NULL || -	       ACCESS_ONCE(rcu_preempt_ctrlblk.exp_tasks) != NULL; -} - -/* - * Check to see if it is now time to start boosting RCU readers blocking - * the current grace period, and, if so, tell the rcu_kthread_task to - * start boosting them.  If there is an expedited boost in progress, - * we wait for it to complete. 
- * - * If there are no blocked readers blocking the current grace period, - * return 0 to let the caller know, otherwise return 1.  Note that this - * return value is independent of whether or not boosting was done. - */ -static int rcu_initiate_boost(void) -{ -	if (!rcu_preempt_blocked_readers_cgp() && -	    rcu_preempt_ctrlblk.exp_tasks == NULL) { -		RCU_TRACE(rcu_preempt_ctrlblk.n_balk_exp_gp_tasks++); -		return 0; -	} -	if (rcu_preempt_ctrlblk.exp_tasks != NULL || -	    (rcu_preempt_ctrlblk.gp_tasks != NULL && -	     rcu_preempt_ctrlblk.boost_tasks == NULL && -	     ULONG_CMP_GE(jiffies, rcu_preempt_ctrlblk.boost_time))) { -		if (rcu_preempt_ctrlblk.exp_tasks == NULL) -			rcu_preempt_ctrlblk.boost_tasks = -				rcu_preempt_ctrlblk.gp_tasks; -		invoke_rcu_callbacks(); -	} else { -		RCU_TRACE(rcu_initiate_boost_trace()); -	} -	return 1; -} - -#define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000) - -/* - * Do priority-boost accounting for the start of a new grace period. - */ -static void rcu_preempt_boost_start_gp(void) -{ -	rcu_preempt_ctrlblk.boost_time = jiffies + RCU_BOOST_DELAY_JIFFIES; -} - -#else /* #ifdef CONFIG_RCU_BOOST */ - -/* - * If there is no RCU priority boosting, we don't initiate boosting, - * but we do indicate whether there are blocked readers blocking the - * current grace period. - */ -static int rcu_initiate_boost(void) -{ -	return rcu_preempt_blocked_readers_cgp(); -} - -/* - * If there is no RCU priority boosting, nothing to do at grace-period start. - */ -static void rcu_preempt_boost_start_gp(void) -{ -} - -#endif /* else #ifdef CONFIG_RCU_BOOST */ - -/* - * Record a preemptible-RCU quiescent state for the specified CPU.  Note - * that this just means that the task currently running on the CPU is - * in a quiescent state.  There might be any number of tasks blocked - * while in an RCU read-side critical section. 
- * - * Unlike the other rcu_*_qs() functions, callers to this function - * must disable irqs in order to protect the assignment to - * ->rcu_read_unlock_special. - * - * Because this is a single-CPU implementation, the only way a grace - * period can end is if the CPU is in a quiescent state.  The reason is - * that a blocked preemptible-RCU reader can exit its critical section - * only if the CPU is running it at the time.  Therefore, when the - * last task blocking the current grace period exits its RCU read-side - * critical section, neither the CPU nor blocked tasks will be stopping - * the current grace period.  (In contrast, SMP implementations - * might have CPUs running in RCU read-side critical sections that - * block later grace periods -- but this is not possible given only - * one CPU.) - */ -static void rcu_preempt_cpu_qs(void) -{ -	/* Record both CPU and task as having responded to current GP. */ -	rcu_preempt_ctrlblk.gpcpu = rcu_preempt_ctrlblk.gpnum; -	current->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS; - -	/* If there is no GP then there is nothing more to do.  */ -	if (!rcu_preempt_gp_in_progress()) -		return; -	/* -	 * Check up on boosting.  If there are readers blocking the -	 * current grace period, leave. -	 */ -	if (rcu_initiate_boost()) -		return; - -	/* Advance callbacks. */ -	rcu_preempt_ctrlblk.completed = rcu_preempt_ctrlblk.gpnum; -	rcu_preempt_ctrlblk.rcb.donetail = rcu_preempt_ctrlblk.rcb.curtail; -	rcu_preempt_ctrlblk.rcb.curtail = rcu_preempt_ctrlblk.nexttail; - -	/* If there are no blocked readers, next GP is done instantly. */ -	if (!rcu_preempt_blocked_readers_any()) -		rcu_preempt_ctrlblk.rcb.donetail = rcu_preempt_ctrlblk.nexttail; - -	/* If there are done callbacks, cause them to be invoked. */ -	if (*rcu_preempt_ctrlblk.rcb.donetail != NULL) -		invoke_rcu_callbacks(); -} - -/* - * Start a new RCU grace period if warranted.  Hard irqs must be disabled. 
- */ -static void rcu_preempt_start_gp(void) -{ -	if (!rcu_preempt_gp_in_progress() && rcu_preempt_needs_another_gp()) { - -		/* Official start of GP. */ -		rcu_preempt_ctrlblk.gpnum++; -		RCU_TRACE(rcu_preempt_ctrlblk.n_grace_periods++); -		reset_cpu_stall_ticks(&rcu_preempt_ctrlblk.rcb); - -		/* Any blocked RCU readers block new GP. */ -		if (rcu_preempt_blocked_readers_any()) -			rcu_preempt_ctrlblk.gp_tasks = -				rcu_preempt_ctrlblk.blkd_tasks.next; - -		/* Set up for RCU priority boosting. */ -		rcu_preempt_boost_start_gp(); - -		/* If there is no running reader, CPU is done with GP. */ -		if (!rcu_preempt_running_reader()) -			rcu_preempt_cpu_qs(); -	} -} - -/* - * We have entered the scheduler, and the current task might soon be - * context-switched away from.  If this task is in an RCU read-side - * critical section, we will no longer be able to rely on the CPU to - * record that fact, so we enqueue the task on the blkd_tasks list. - * If the task started after the current grace period began, as recorded - * by ->gpcpu, we enqueue at the beginning of the list.  Otherwise - * before the element referenced by ->gp_tasks (or at the tail if - * ->gp_tasks is NULL) and point ->gp_tasks at the newly added element. - * The task will dequeue itself when it exits the outermost enclosing - * RCU read-side critical section.  Therefore, the current grace period - * cannot be permitted to complete until the ->gp_tasks pointer becomes - * NULL. - * - * Caller must disable preemption. - */ -void rcu_preempt_note_context_switch(void) -{ -	struct task_struct *t = current; -	unsigned long flags; - -	local_irq_save(flags); /* must exclude scheduler_tick(). */ -	if (rcu_preempt_running_reader() > 0 && -	    (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) { - -		/* Possibly blocking in an RCU read-side critical section. 
*/ -		t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED; - -		/* -		 * If this CPU has already checked in, then this task -		 * will hold up the next grace period rather than the -		 * current grace period.  Queue the task accordingly. -		 * If the task is queued for the current grace period -		 * (i.e., this CPU has not yet passed through a quiescent -		 * state for the current grace period), then as long -		 * as that task remains queued, the current grace period -		 * cannot end. -		 */ -		list_add(&t->rcu_node_entry, &rcu_preempt_ctrlblk.blkd_tasks); -		if (rcu_cpu_blocking_cur_gp()) -			rcu_preempt_ctrlblk.gp_tasks = &t->rcu_node_entry; -	} else if (rcu_preempt_running_reader() < 0 && -		   t->rcu_read_unlock_special) { -		/* -		 * Complete exit from RCU read-side critical section on -		 * behalf of preempted instance of __rcu_read_unlock(). -		 */ -		rcu_read_unlock_special(t); -	} - -	/* -	 * Either we were not in an RCU read-side critical section to -	 * begin with, or we have now recorded that critical section -	 * globally.  Either way, we can now note a quiescent state -	 * for this CPU.  Again, if we were in an RCU read-side critical -	 * section, and if that critical section was blocking the current -	 * grace period, then the fact that the task has been enqueued -	 * means that current grace period continues to be blocked. -	 */ -	rcu_preempt_cpu_qs(); -	local_irq_restore(flags); -} - -/* - * Handle special cases during rcu_read_unlock(), such as needing to - * notify RCU core processing or task having blocked during the RCU - * read-side critical section. - */ -void rcu_read_unlock_special(struct task_struct *t) -{ -	int empty; -	int empty_exp; -	unsigned long flags; -	struct list_head *np; -#ifdef CONFIG_RCU_BOOST -	struct rt_mutex *rbmp = NULL; -#endif /* #ifdef CONFIG_RCU_BOOST */ -	int special; - -	/* -	 * NMI handlers cannot block and cannot safely manipulate state. -	 * They therefore cannot possibly be special, so just leave. 
-	 */ -	if (in_nmi()) -		return; - -	local_irq_save(flags); - -	/* -	 * If RCU core is waiting for this CPU to exit critical section, -	 * let it know that we have done so. -	 */ -	special = t->rcu_read_unlock_special; -	if (special & RCU_READ_UNLOCK_NEED_QS) -		rcu_preempt_cpu_qs(); - -	/* Hardware IRQ handlers cannot block. */ -	if (in_irq() || in_serving_softirq()) { -		local_irq_restore(flags); -		return; -	} - -	/* Clean up if blocked during RCU read-side critical section. */ -	if (special & RCU_READ_UNLOCK_BLOCKED) { -		t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_BLOCKED; - -		/* -		 * Remove this task from the ->blkd_tasks list and adjust -		 * any pointers that might have been referencing it. -		 */ -		empty = !rcu_preempt_blocked_readers_cgp(); -		empty_exp = rcu_preempt_ctrlblk.exp_tasks == NULL; -		np = rcu_next_node_entry(t); -		list_del_init(&t->rcu_node_entry); -		if (&t->rcu_node_entry == rcu_preempt_ctrlblk.gp_tasks) -			rcu_preempt_ctrlblk.gp_tasks = np; -		if (&t->rcu_node_entry == rcu_preempt_ctrlblk.exp_tasks) -			rcu_preempt_ctrlblk.exp_tasks = np; -#ifdef CONFIG_RCU_BOOST -		if (&t->rcu_node_entry == rcu_preempt_ctrlblk.boost_tasks) -			rcu_preempt_ctrlblk.boost_tasks = np; -#endif /* #ifdef CONFIG_RCU_BOOST */ - -		/* -		 * If this was the last task on the current list, and if -		 * we aren't waiting on the CPU, report the quiescent state -		 * and start a new grace period if needed. -		 */ -		if (!empty && !rcu_preempt_blocked_readers_cgp()) { -			rcu_preempt_cpu_qs(); -			rcu_preempt_start_gp(); -		} - -		/* -		 * If this was the last task on the expedited lists, -		 * then we need wake up the waiting task. -		 */ -		if (!empty_exp && rcu_preempt_ctrlblk.exp_tasks == NULL) -			rcu_report_exp_done(); -	} -#ifdef CONFIG_RCU_BOOST -	/* Unboost self if was boosted. 
*/ -	if (t->rcu_boost_mutex != NULL) { -		rbmp = t->rcu_boost_mutex; -		t->rcu_boost_mutex = NULL; -		rt_mutex_unlock(rbmp); -	} -#endif /* #ifdef CONFIG_RCU_BOOST */ -	local_irq_restore(flags); -} - -/* - * Check for a quiescent state from the current CPU.  When a task blocks, - * the task is recorded in the rcu_preempt_ctrlblk structure, which is - * checked elsewhere.  This is called from the scheduling-clock interrupt. - * - * Caller must disable hard irqs. - */ -static void rcu_preempt_check_callbacks(void) -{ -	struct task_struct *t = current; - -	if (rcu_preempt_gp_in_progress() && -	    (!rcu_preempt_running_reader() || -	     !rcu_cpu_blocking_cur_gp())) -		rcu_preempt_cpu_qs(); -	if (&rcu_preempt_ctrlblk.rcb.rcucblist != -	    rcu_preempt_ctrlblk.rcb.donetail) -		invoke_rcu_callbacks(); -	if (rcu_preempt_gp_in_progress() && -	    rcu_cpu_blocking_cur_gp() && -	    rcu_preempt_running_reader() > 0) -		t->rcu_read_unlock_special |= RCU_READ_UNLOCK_NEED_QS; -} - -/* - * TINY_PREEMPT_RCU has an extra callback-list tail pointer to - * update, so this is invoked from rcu_process_callbacks() to - * handle that case.  Of course, it is invoked for all flavors of - * RCU, but RCU callbacks can appear only on one of the lists, and - * neither ->nexttail nor ->donetail can possibly be NULL, so there - * is no need for an explicit check. - */ -static void rcu_preempt_remove_callbacks(struct rcu_ctrlblk *rcp) -{ -	if (rcu_preempt_ctrlblk.nexttail == rcp->donetail) -		rcu_preempt_ctrlblk.nexttail = &rcp->rcucblist; -} - -/* - * Process callbacks for preemptible RCU. - */ -static void rcu_preempt_process_callbacks(void) -{ -	__rcu_process_callbacks(&rcu_preempt_ctrlblk.rcb); -} - -/* - * Queue a preemptible -RCU callback for invocation after a grace period. 
- */ -void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) -{ -	unsigned long flags; - -	debug_rcu_head_queue(head); -	head->func = func; -	head->next = NULL; - -	local_irq_save(flags); -	*rcu_preempt_ctrlblk.nexttail = head; -	rcu_preempt_ctrlblk.nexttail = &head->next; -	RCU_TRACE(rcu_preempt_ctrlblk.rcb.qlen++); -	rcu_preempt_start_gp();  /* checks to see if GP needed. */ -	local_irq_restore(flags); -} -EXPORT_SYMBOL_GPL(call_rcu); - -/* - * synchronize_rcu - wait until a grace period has elapsed. - * - * Control will return to the caller some time after a full grace - * period has elapsed, in other words after all currently executing RCU - * read-side critical sections have completed.  RCU read-side critical - * sections are delimited by rcu_read_lock() and rcu_read_unlock(), - * and may be nested. - */ -void synchronize_rcu(void) -{ -	rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map) && -			   !lock_is_held(&rcu_lock_map) && -			   !lock_is_held(&rcu_sched_lock_map), -			   "Illegal synchronize_rcu() in RCU read-side critical section"); - -#ifdef CONFIG_DEBUG_LOCK_ALLOC -	if (!rcu_scheduler_active) -		return; -#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ - -	WARN_ON_ONCE(rcu_preempt_running_reader()); -	if (!rcu_preempt_blocked_readers_any()) -		return; - -	/* Once we get past the fastpath checks, same code as rcu_barrier(). */ -	if (rcu_expedited) -		synchronize_rcu_expedited(); -	else -		rcu_barrier(); -} -EXPORT_SYMBOL_GPL(synchronize_rcu); - -static DECLARE_WAIT_QUEUE_HEAD(sync_rcu_preempt_exp_wq); -static unsigned long sync_rcu_preempt_exp_count; -static DEFINE_MUTEX(sync_rcu_preempt_exp_mutex); - -/* - * Return non-zero if there are any tasks in RCU read-side critical - * sections blocking the current preemptible-RCU expedited grace period. - * If there is no preemptible-RCU expedited grace period currently in - * progress, returns zero unconditionally. 
- */ -static int rcu_preempted_readers_exp(void) -{ -	return rcu_preempt_ctrlblk.exp_tasks != NULL; -} - -/* - * Report the exit from RCU read-side critical section for the last task - * that queued itself during or before the current expedited preemptible-RCU - * grace period. - */ -static void rcu_report_exp_done(void) -{ -	wake_up(&sync_rcu_preempt_exp_wq); -} - -/* - * Wait for an rcu-preempt grace period, but expedite it.  The basic idea - * is to rely in the fact that there is but one CPU, and that it is - * illegal for a task to invoke synchronize_rcu_expedited() while in a - * preemptible-RCU read-side critical section.  Therefore, any such - * critical sections must correspond to blocked tasks, which must therefore - * be on the ->blkd_tasks list.  So just record the current head of the - * list in the ->exp_tasks pointer, and wait for all tasks including and - * after the task pointed to by ->exp_tasks to drain. - */ -void synchronize_rcu_expedited(void) -{ -	unsigned long flags; -	struct rcu_preempt_ctrlblk *rpcp = &rcu_preempt_ctrlblk; -	unsigned long snap; - -	barrier(); /* ensure prior action seen before grace period. */ - -	WARN_ON_ONCE(rcu_preempt_running_reader()); - -	/* -	 * Acquire lock so that there is only one preemptible RCU grace -	 * period in flight.  Of course, if someone does the expedited -	 * grace period for us while we are acquiring the lock, just leave. -	 */ -	snap = sync_rcu_preempt_exp_count + 1; -	mutex_lock(&sync_rcu_preempt_exp_mutex); -	if (ULONG_CMP_LT(snap, sync_rcu_preempt_exp_count)) -		goto unlock_mb_ret; /* Others did our work for us. */ - -	local_irq_save(flags); - -	/* -	 * All RCU readers have to already be on blkd_tasks because -	 * we cannot legally be executing in an RCU read-side critical -	 * section. -	 */ - -	/* Snapshot current head of ->blkd_tasks list. 
*/ -	rpcp->exp_tasks = rpcp->blkd_tasks.next; -	if (rpcp->exp_tasks == &rpcp->blkd_tasks) -		rpcp->exp_tasks = NULL; - -	/* Wait for tail of ->blkd_tasks list to drain. */ -	if (!rcu_preempted_readers_exp()) { -		local_irq_restore(flags); -	} else { -		rcu_initiate_boost(); -		local_irq_restore(flags); -		wait_event(sync_rcu_preempt_exp_wq, -			   !rcu_preempted_readers_exp()); -	} - -	/* Clean up and exit. */ -	barrier(); /* ensure expedited GP seen before counter increment. */ -	sync_rcu_preempt_exp_count++; -unlock_mb_ret: -	mutex_unlock(&sync_rcu_preempt_exp_mutex); -	barrier(); /* ensure subsequent action seen after grace period. */ -} -EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); - -/* - * Does preemptible RCU need the CPU to stay out of dynticks mode? - */ -int rcu_preempt_needs_cpu(void) -{ -	return rcu_preempt_ctrlblk.rcb.rcucblist != NULL; -} - -#else /* #ifdef CONFIG_TINY_PREEMPT_RCU */ - -#ifdef CONFIG_RCU_TRACE - -/* - * Because preemptible RCU does not exist, it is not necessary to - * dump out its statistics. - */ -static void show_tiny_preempt_stats(struct seq_file *m) -{ -} - -#endif /* #ifdef CONFIG_RCU_TRACE */ - -/* - * Because preemptible RCU does not exist, it never has any callbacks - * to check. - */ -static void rcu_preempt_check_callbacks(void) -{ -} - -/* - * Because preemptible RCU does not exist, it never has any callbacks - * to remove. - */ -static void rcu_preempt_remove_callbacks(struct rcu_ctrlblk *rcp) -{ -} - -/* - * Because preemptible RCU does not exist, it never has any callbacks - * to process. - */ -static void rcu_preempt_process_callbacks(void) -{ -} - -#endif /* #else #ifdef CONFIG_TINY_PREEMPT_RCU */ - -#ifdef CONFIG_RCU_BOOST - -/* - * Wake up rcu_kthread() to process callbacks now eligible for invocation - * or to boost readers. 
- */ -static void invoke_rcu_callbacks(void) -{ -	have_rcu_kthread_work = 1; -	if (rcu_kthread_task != NULL) -		wake_up(&rcu_kthread_wq); -} - -#ifdef CONFIG_RCU_TRACE - -/* - * Is the current CPU running the RCU-callbacks kthread? - * Caller must have preemption disabled. - */ -static bool rcu_is_callbacks_kthread(void) -{ -	return rcu_kthread_task == current; -} - -#endif /* #ifdef CONFIG_RCU_TRACE */ - -/* - * This kthread invokes RCU callbacks whose grace periods have - * elapsed.  It is awakened as needed, and takes the place of the - * RCU_SOFTIRQ that is used for this purpose when boosting is disabled. - * This is a kthread, but it is never stopped, at least not until - * the system goes down. - */ -static int rcu_kthread(void *arg) -{ -	unsigned long work; -	unsigned long morework; -	unsigned long flags; - -	for (;;) { -		wait_event_interruptible(rcu_kthread_wq, -					 have_rcu_kthread_work != 0); -		morework = rcu_boost(); -		local_irq_save(flags); -		work = have_rcu_kthread_work; -		have_rcu_kthread_work = morework; -		local_irq_restore(flags); -		if (work) -			rcu_process_callbacks(NULL); -		schedule_timeout_interruptible(1); /* Leave CPU for others. */ -	} - -	return 0;  /* Not reached, but needed to shut gcc up. */ -} - -/* - * Spawn the kthread that invokes RCU callbacks. - */ -static int __init rcu_spawn_kthreads(void) -{ -	struct sched_param sp; - -	rcu_kthread_task = kthread_run(rcu_kthread, NULL, "rcu_kthread"); -	sp.sched_priority = RCU_BOOST_PRIO; -	sched_setscheduler_nocheck(rcu_kthread_task, SCHED_FIFO, &sp); -	return 0; -} -early_initcall(rcu_spawn_kthreads); - -#else /* #ifdef CONFIG_RCU_BOOST */ - -/* Hold off callback invocation until early_initcall() time. */ -static int rcu_scheduler_fully_active __read_mostly; - -/* - * Start up softirq processing of callbacks. 
- */ -void invoke_rcu_callbacks(void) -{ -	if (rcu_scheduler_fully_active) -		raise_softirq(RCU_SOFTIRQ); -} - -#ifdef CONFIG_RCU_TRACE - -/* - * There is no callback kthread, so this thread is never it. - */ -static bool rcu_is_callbacks_kthread(void) -{ -	return false; -} - -#endif /* #ifdef CONFIG_RCU_TRACE */ - -static int __init rcu_scheduler_really_started(void) -{ -	rcu_scheduler_fully_active = 1; -	open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); -	raise_softirq(RCU_SOFTIRQ);  /* Invoke any callbacks from early boot. */ -	return 0; -} -early_initcall(rcu_scheduler_really_started); - -#endif /* #else #ifdef CONFIG_RCU_BOOST */ - -#ifdef CONFIG_DEBUG_LOCK_ALLOC -#include <linux/kernel_stat.h>  /*   * During boot, we forgive RCU lockdep issues.  After this function is @@ -1020,25 +72,6 @@ void __init rcu_scheduler_starting(void)  #ifdef CONFIG_RCU_TRACE -#ifdef CONFIG_RCU_BOOST - -static void rcu_initiate_boost_trace(void) -{ -	if (list_empty(&rcu_preempt_ctrlblk.blkd_tasks)) -		rcu_preempt_ctrlblk.n_balk_blkd_tasks++; -	else if (rcu_preempt_ctrlblk.gp_tasks == NULL && -		 rcu_preempt_ctrlblk.exp_tasks == NULL) -		rcu_preempt_ctrlblk.n_balk_exp_gp_tasks++; -	else if (rcu_preempt_ctrlblk.boost_tasks != NULL) -		rcu_preempt_ctrlblk.n_balk_boost_tasks++; -	else if (!ULONG_CMP_GE(jiffies, rcu_preempt_ctrlblk.boost_time)) -		rcu_preempt_ctrlblk.n_balk_notyet++; -	else -		rcu_preempt_ctrlblk.n_balk_nos++; -} - -#endif /* #ifdef CONFIG_RCU_BOOST */ -  static void rcu_trace_sub_qlen(struct rcu_ctrlblk *rcp, int n)  {  	unsigned long flags; @@ -1053,7 +86,6 @@ static void rcu_trace_sub_qlen(struct rcu_ctrlblk *rcp, int n)   */  static int show_tiny_stats(struct seq_file *m, void *unused)  { -	show_tiny_preempt_stats(m);  	seq_printf(m, "rcu_sched: qlen: %ld\n", rcu_sched_ctrlblk.qlen);  	seq_printf(m, "rcu_bh: qlen: %ld\n", rcu_bh_ctrlblk.qlen);  	return 0; @@ -1103,11 +135,40 @@ MODULE_AUTHOR("Paul E. 
McKenney");  MODULE_DESCRIPTION("Read-Copy Update tracing for tiny implementation");  MODULE_LICENSE("GPL"); -static void check_cpu_stall_preempt(void) +static void check_cpu_stall(struct rcu_ctrlblk *rcp)  { -#ifdef CONFIG_TINY_PREEMPT_RCU -	check_cpu_stall(&rcu_preempt_ctrlblk.rcb); -#endif /* #ifdef CONFIG_TINY_PREEMPT_RCU */ +	unsigned long j; +	unsigned long js; + +	if (rcu_cpu_stall_suppress) +		return; +	rcp->ticks_this_gp++; +	j = jiffies; +	js = rcp->jiffies_stall; +	if (*rcp->curtail && ULONG_CMP_GE(j, js)) { +		pr_err("INFO: %s stall on CPU (%lu ticks this GP) idle=%llx (t=%lu jiffies q=%ld)\n", +		       rcp->name, rcp->ticks_this_gp, rcu_dynticks_nesting, +		       jiffies - rcp->gp_start, rcp->qlen); +		dump_stack(); +	} +	if (*rcp->curtail && ULONG_CMP_GE(j, js)) +		rcp->jiffies_stall = jiffies + +			3 * rcu_jiffies_till_stall_check() + 3; +	else if (ULONG_CMP_GE(j, js)) +		rcp->jiffies_stall = jiffies + rcu_jiffies_till_stall_check(); +} + +static void reset_cpu_stall_ticks(struct rcu_ctrlblk *rcp) +{ +	rcp->ticks_this_gp = 0; +	rcp->gp_start = jiffies; +	rcp->jiffies_stall = jiffies + rcu_jiffies_till_stall_check(); +} + +static void check_cpu_stalls(void) +{ +	RCU_TRACE(check_cpu_stall(&rcu_bh_ctrlblk)); +	RCU_TRACE(check_cpu_stall(&rcu_sched_ctrlblk));  }  #endif /* #ifdef CONFIG_RCU_TRACE */ diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index e1f3a8c96724..b1fa5510388d 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -695,44 +695,6 @@ static struct rcu_torture_ops srcu_sync_ops = {  	.name		= "srcu_sync"  }; -static int srcu_torture_read_lock_raw(void) __acquires(&srcu_ctl) -{ -	return srcu_read_lock_raw(&srcu_ctl); -} - -static void srcu_torture_read_unlock_raw(int idx) __releases(&srcu_ctl) -{ -	srcu_read_unlock_raw(&srcu_ctl, idx); -} - -static struct rcu_torture_ops srcu_raw_ops = { -	.init		= rcu_sync_torture_init, -	.readlock	= srcu_torture_read_lock_raw, -	.read_delay	= srcu_read_delay, -	.readunlock	= 
srcu_torture_read_unlock_raw, -	.completed	= srcu_torture_completed, -	.deferred_free	= srcu_torture_deferred_free, -	.sync		= srcu_torture_synchronize, -	.call		= NULL, -	.cb_barrier	= NULL, -	.stats		= srcu_torture_stats, -	.name		= "srcu_raw" -}; - -static struct rcu_torture_ops srcu_raw_sync_ops = { -	.init		= rcu_sync_torture_init, -	.readlock	= srcu_torture_read_lock_raw, -	.read_delay	= srcu_read_delay, -	.readunlock	= srcu_torture_read_unlock_raw, -	.completed	= srcu_torture_completed, -	.deferred_free	= rcu_sync_torture_deferred_free, -	.sync		= srcu_torture_synchronize, -	.call		= NULL, -	.cb_barrier	= NULL, -	.stats		= srcu_torture_stats, -	.name		= "srcu_raw_sync" -}; -  static void srcu_torture_synchronize_expedited(void)  {  	synchronize_srcu_expedited(&srcu_ctl); @@ -1983,7 +1945,6 @@ rcu_torture_init(void)  		{ &rcu_ops, &rcu_sync_ops, &rcu_expedited_ops,  		  &rcu_bh_ops, &rcu_bh_sync_ops, &rcu_bh_expedited_ops,  		  &srcu_ops, &srcu_sync_ops, &srcu_expedited_ops, -		  &srcu_raw_ops, &srcu_raw_sync_ops,  		  &sched_ops, &sched_sync_ops, &sched_expedited_ops, };  	mutex_lock(&fullstop_mutex); diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 35380019f0fc..e08abb9461ac 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -218,8 +218,8 @@ module_param(blimit, long, 0444);  module_param(qhimark, long, 0444);  module_param(qlowmark, long, 0444); -static ulong jiffies_till_first_fqs = RCU_JIFFIES_TILL_FORCE_QS; -static ulong jiffies_till_next_fqs = RCU_JIFFIES_TILL_FORCE_QS; +static ulong jiffies_till_first_fqs = ULONG_MAX; +static ulong jiffies_till_next_fqs = ULONG_MAX;  module_param(jiffies_till_first_fqs, ulong, 0644);  module_param(jiffies_till_next_fqs, ulong, 0644); @@ -866,7 +866,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp)  	 * See Documentation/RCU/stallwarn.txt for info on how to debug  	 * RCU CPU stall warnings.  	 
*/ -	printk(KERN_ERR "INFO: %s detected stalls on CPUs/tasks:", +	pr_err("INFO: %s detected stalls on CPUs/tasks:",  	       rsp->name);  	print_cpu_stall_info_begin();  	rcu_for_each_leaf_node(rsp, rnp) { @@ -899,7 +899,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp)  	       smp_processor_id(), (long)(jiffies - rsp->gp_start),  	       rsp->gpnum, rsp->completed, totqlen);  	if (ndetected == 0) -		printk(KERN_ERR "INFO: Stall ended before state dump start\n"); +		pr_err("INFO: Stall ended before state dump start\n");  	else if (!trigger_all_cpu_backtrace())  		rcu_dump_cpu_stacks(rsp); @@ -922,7 +922,7 @@ static void print_cpu_stall(struct rcu_state *rsp)  	 * See Documentation/RCU/stallwarn.txt for info on how to debug  	 * RCU CPU stall warnings.  	 */ -	printk(KERN_ERR "INFO: %s self-detected stall on CPU", rsp->name); +	pr_err("INFO: %s self-detected stall on CPU", rsp->name);  	print_cpu_stall_info_begin();  	print_cpu_stall_info(rsp, smp_processor_id());  	print_cpu_stall_info_end(); @@ -985,65 +985,6 @@ void rcu_cpu_stall_reset(void)  }  /* - * Update CPU-local rcu_data state to record the newly noticed grace period. - * This is used both when we started the grace period and when we notice - * that someone else started the grace period.  The caller must hold the - * ->lock of the leaf rcu_node structure corresponding to the current CPU, - *  and must have irqs disabled. - */ -static void __note_new_gpnum(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp) -{ -	if (rdp->gpnum != rnp->gpnum) { -		/* -		 * If the current grace period is waiting for this CPU, -		 * set up to detect a quiescent state, otherwise don't -		 * go looking for one. 
-		 */ -		rdp->gpnum = rnp->gpnum; -		trace_rcu_grace_period(rsp->name, rdp->gpnum, "cpustart"); -		rdp->passed_quiesce = 0; -		rdp->qs_pending = !!(rnp->qsmask & rdp->grpmask); -		zero_cpu_stall_ticks(rdp); -	} -} - -static void note_new_gpnum(struct rcu_state *rsp, struct rcu_data *rdp) -{ -	unsigned long flags; -	struct rcu_node *rnp; - -	local_irq_save(flags); -	rnp = rdp->mynode; -	if (rdp->gpnum == ACCESS_ONCE(rnp->gpnum) || /* outside lock. */ -	    !raw_spin_trylock(&rnp->lock)) { /* irqs already off, so later. */ -		local_irq_restore(flags); -		return; -	} -	__note_new_gpnum(rsp, rnp, rdp); -	raw_spin_unlock_irqrestore(&rnp->lock, flags); -} - -/* - * Did someone else start a new RCU grace period start since we last - * checked?  Update local state appropriately if so.  Must be called - * on the CPU corresponding to rdp. - */ -static int -check_for_new_grace_period(struct rcu_state *rsp, struct rcu_data *rdp) -{ -	unsigned long flags; -	int ret = 0; - -	local_irq_save(flags); -	if (rdp->gpnum != rsp->gpnum) { -		note_new_gpnum(rsp, rdp); -		ret = 1; -	} -	local_irq_restore(flags); -	return ret; -} - -/*   * Initialize the specified rcu_data structure's callback list to empty.   */  static void init_callback_list(struct rcu_data *rdp) @@ -1313,18 +1254,16 @@ static void rcu_advance_cbs(struct rcu_state *rsp, struct rcu_node *rnp,  }  /* - * Advance this CPU's callbacks, but only if the current grace period - * has ended.  This may be called only from the CPU to whom the rdp - * belongs.  In addition, the corresponding leaf rcu_node structure's - * ->lock must be held by the caller, with irqs disabled. + * Update CPU-local rcu_data state to record the beginnings and ends of + * grace periods.  The caller must hold the ->lock of the leaf rcu_node + * structure corresponding to the current CPU, and must have irqs disabled.   
*/ -static void -__rcu_process_gp_end(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp) +static void __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp)  { -	/* Did another grace period end? */ +	/* Handle the ends of any preceding grace periods first. */  	if (rdp->completed == rnp->completed) { -		/* No, so just accelerate recent callbacks. */ +		/* No grace period end, so just accelerate recent callbacks. */  		rcu_accelerate_cbs(rsp, rnp, rdp);  	} else { @@ -1335,68 +1274,40 @@ __rcu_process_gp_end(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_dat  		/* Remember that we saw this grace-period completion. */  		rdp->completed = rnp->completed;  		trace_rcu_grace_period(rsp->name, rdp->gpnum, "cpuend"); +	} +	if (rdp->gpnum != rnp->gpnum) {  		/* -		 * If we were in an extended quiescent state, we may have -		 * missed some grace periods that others CPUs handled on -		 * our behalf. Catch up with this state to avoid noting -		 * spurious new grace periods.  If another grace period -		 * has started, then rnp->gpnum will have advanced, so -		 * we will detect this later on.  Of course, any quiescent -		 * states we found for the old GP are now invalid. -		 */ -		if (ULONG_CMP_LT(rdp->gpnum, rdp->completed)) { -			rdp->gpnum = rdp->completed; -			rdp->passed_quiesce = 0; -		} - -		/* -		 * If RCU does not need a quiescent state from this CPU, -		 * then make sure that this CPU doesn't go looking for one. +		 * If the current grace period is waiting for this CPU, +		 * set up to detect a quiescent state, otherwise don't +		 * go looking for one.  		 
*/ -		if ((rnp->qsmask & rdp->grpmask) == 0) -			rdp->qs_pending = 0; +		rdp->gpnum = rnp->gpnum; +		trace_rcu_grace_period(rsp->name, rdp->gpnum, "cpustart"); +		rdp->passed_quiesce = 0; +		rdp->qs_pending = !!(rnp->qsmask & rdp->grpmask); +		zero_cpu_stall_ticks(rdp);  	}  } -/* - * Advance this CPU's callbacks, but only if the current grace period - * has ended.  This may be called only from the CPU to whom the rdp - * belongs. - */ -static void -rcu_process_gp_end(struct rcu_state *rsp, struct rcu_data *rdp) +static void note_gp_changes(struct rcu_state *rsp, struct rcu_data *rdp)  {  	unsigned long flags;  	struct rcu_node *rnp;  	local_irq_save(flags);  	rnp = rdp->mynode; -	if (rdp->completed == ACCESS_ONCE(rnp->completed) || /* outside lock. */ +	if ((rdp->gpnum == ACCESS_ONCE(rnp->gpnum) && +	     rdp->completed == ACCESS_ONCE(rnp->completed)) || /* w/out lock. */  	    !raw_spin_trylock(&rnp->lock)) { /* irqs already off, so later. */  		local_irq_restore(flags);  		return;  	} -	__rcu_process_gp_end(rsp, rnp, rdp); +	__note_gp_changes(rsp, rnp, rdp);  	raw_spin_unlock_irqrestore(&rnp->lock, flags);  }  /* - * Do per-CPU grace-period initialization for running CPU.  The caller - * must hold the lock of the leaf rcu_node structure corresponding to - * this CPU. - */ -static void -rcu_start_gp_per_cpu(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp) -{ -	/* Prior grace period ended, so advance callbacks for current CPU. */ -	__rcu_process_gp_end(rsp, rnp, rdp); - -	/* Set state so that this CPU will detect the next quiescent state. */ -	__note_new_gpnum(rsp, rnp, rdp); -} - -/*   * Initialize a new grace period.   
*/  static int rcu_gp_init(struct rcu_state *rsp) @@ -1444,7 +1355,7 @@ static int rcu_gp_init(struct rcu_state *rsp)  		WARN_ON_ONCE(rnp->completed != rsp->completed);  		ACCESS_ONCE(rnp->completed) = rsp->completed;  		if (rnp == rdp->mynode) -			rcu_start_gp_per_cpu(rsp, rnp, rdp); +			__note_gp_changes(rsp, rnp, rdp);  		rcu_preempt_boost_start_gp(rnp);  		trace_rcu_grace_period_init(rsp->name, rnp->gpnum,  					    rnp->level, rnp->grplo, @@ -1527,7 +1438,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)  		ACCESS_ONCE(rnp->completed) = rsp->gpnum;  		rdp = this_cpu_ptr(rsp->rda);  		if (rnp == rdp->mynode) -			__rcu_process_gp_end(rsp, rnp, rdp); +			__note_gp_changes(rsp, rnp, rdp);  		nocb += rcu_future_gp_cleanup(rsp, rnp);  		raw_spin_unlock_irq(&rnp->lock);  		cond_resched(); @@ -1805,9 +1716,8 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp)  static void  rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp)  { -	/* If there is now a new grace period, record and return. */ -	if (check_for_new_grace_period(rsp, rdp)) -		return; +	/* Check for grace-period ends and beginnings. */ +	note_gp_changes(rsp, rdp);  	/*  	 * Does this CPU still need to do its part for current grace period? @@ -2271,9 +2181,6 @@ __rcu_process_callbacks(struct rcu_state *rsp)  	WARN_ON_ONCE(rdp->beenonline == 0); -	/* Handle the end of a grace period that some other CPU ended.  */ -	rcu_process_gp_end(rsp, rdp); -  	/* Update RCU state based on any recent quiescent states. */  	rcu_check_quiescent_state(rsp, rdp); @@ -2358,8 +2265,7 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp,  	if (unlikely(rdp->qlen > rdp->qlen_last_fqs_check + qhimark)) {  		/* Are we ignoring a completed grace period? */ -		rcu_process_gp_end(rsp, rdp); -		check_for_new_grace_period(rsp, rdp); +		note_gp_changes(rsp, rdp);  		/* Start a new grace period if one not already started. 
*/  		if (!rcu_gp_in_progress(rsp)) { @@ -3120,7 +3026,7 @@ static int __init rcu_spawn_gp_kthread(void)  	struct task_struct *t;  	for_each_rcu_flavor(rsp) { -		t = kthread_run(rcu_gp_kthread, rsp, rsp->name); +		t = kthread_run(rcu_gp_kthread, rsp, "%s", rsp->name);  		BUG_ON(IS_ERR(t));  		rnp = rcu_get_root(rsp);  		raw_spin_lock_irqsave(&rnp->lock, flags); @@ -3265,11 +3171,25 @@ static void __init rcu_init_one(struct rcu_state *rsp,   */  static void __init rcu_init_geometry(void)  { +	ulong d;  	int i;  	int j;  	int n = nr_cpu_ids;  	int rcu_capacity[MAX_RCU_LVLS + 1]; +	/* +	 * Initialize any unspecified boot parameters. +	 * The default values of jiffies_till_first_fqs and +	 * jiffies_till_next_fqs are set to the RCU_JIFFIES_TILL_FORCE_QS +	 * value, which is a function of HZ, then adding one for each +	 * RCU_JIFFIES_FQS_DIV CPUs that might be on the system. +	 */ +	d = RCU_JIFFIES_TILL_FORCE_QS + nr_cpu_ids / RCU_JIFFIES_FQS_DIV; +	if (jiffies_till_first_fqs == ULONG_MAX) +		jiffies_till_first_fqs = d; +	if (jiffies_till_next_fqs == ULONG_MAX) +		jiffies_till_next_fqs = d; +  	/* If the compile-time values are accurate, just leave. */  	if (rcu_fanout_leaf == CONFIG_RCU_FANOUT_LEAF &&  	    nr_cpu_ids == NR_CPUS) diff --git a/kernel/rcutree.h b/kernel/rcutree.h index 4df503470e42..4a39d364493c 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -343,12 +343,17 @@ struct rcu_data {  #define RCU_FORCE_QS		3	/* Need to force quiescent state. */  #define RCU_SIGNAL_INIT		RCU_SAVE_DYNTICK -#define RCU_JIFFIES_TILL_FORCE_QS	 3	/* for rsp->jiffies_force_qs */ +#define RCU_JIFFIES_TILL_FORCE_QS (1 + (HZ > 250) + (HZ > 500)) +					/* For jiffies_till_first_fqs and */ +					/*  and jiffies_till_next_fqs. */ -#define RCU_STALL_RAT_DELAY		2	/* Allow other CPUs time */ -						/*  to take at least one */ -						/*  scheduling clock irq */ -						/*  before ratting on them. 
*/ +#define RCU_JIFFIES_FQS_DIV	256	/* Very large systems need more */ +					/*  delay between bouts of */ +					/*  quiescent-state forcing. */ + +#define RCU_STALL_RAT_DELAY	2	/* Allow other CPUs time to take */ +					/*  at least one scheduling clock */ +					/*  irq before ratting on them. */  #define rcu_wait(cond)							\  do {									\ diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 3db5a375d8dd..63098a59216e 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -53,38 +53,37 @@ static char __initdata nocb_buf[NR_CPUS * 5];  static void __init rcu_bootup_announce_oddness(void)  {  #ifdef CONFIG_RCU_TRACE -	printk(KERN_INFO "\tRCU debugfs-based tracing is enabled.\n"); +	pr_info("\tRCU debugfs-based tracing is enabled.\n");  #endif  #if (defined(CONFIG_64BIT) && CONFIG_RCU_FANOUT != 64) || (!defined(CONFIG_64BIT) && CONFIG_RCU_FANOUT != 32) -	printk(KERN_INFO "\tCONFIG_RCU_FANOUT set to non-default value of %d\n", +	pr_info("\tCONFIG_RCU_FANOUT set to non-default value of %d\n",  	       CONFIG_RCU_FANOUT);  #endif  #ifdef CONFIG_RCU_FANOUT_EXACT -	printk(KERN_INFO "\tHierarchical RCU autobalancing is disabled.\n"); +	pr_info("\tHierarchical RCU autobalancing is disabled.\n");  #endif  #ifdef CONFIG_RCU_FAST_NO_HZ -	printk(KERN_INFO -	       "\tRCU dyntick-idle grace-period acceleration is enabled.\n"); +	pr_info("\tRCU dyntick-idle grace-period acceleration is enabled.\n");  #endif  #ifdef CONFIG_PROVE_RCU -	printk(KERN_INFO "\tRCU lockdep checking is enabled.\n"); +	pr_info("\tRCU lockdep checking is enabled.\n");  #endif  #ifdef CONFIG_RCU_TORTURE_TEST_RUNNABLE -	printk(KERN_INFO "\tRCU torture testing starts during boot.\n"); +	pr_info("\tRCU torture testing starts during boot.\n");  #endif  #if defined(CONFIG_TREE_PREEMPT_RCU) && !defined(CONFIG_RCU_CPU_STALL_VERBOSE) -	printk(KERN_INFO "\tDump stacks of tasks blocking RCU-preempt GP.\n"); +	pr_info("\tDump stacks of tasks blocking RCU-preempt GP.\n");  #endif  #if 
defined(CONFIG_RCU_CPU_STALL_INFO) -	printk(KERN_INFO "\tAdditional per-CPU info printed with stalls.\n"); +	pr_info("\tAdditional per-CPU info printed with stalls.\n");  #endif  #if NUM_RCU_LVL_4 != 0 -	printk(KERN_INFO "\tFour-level hierarchy is enabled.\n"); +	pr_info("\tFour-level hierarchy is enabled.\n");  #endif  	if (rcu_fanout_leaf != CONFIG_RCU_FANOUT_LEAF) -		printk(KERN_INFO "\tExperimental boot-time adjustment of leaf fanout to %d.\n", rcu_fanout_leaf); +		pr_info("\tBoot-time adjustment of leaf fanout to %d.\n", rcu_fanout_leaf);  	if (nr_cpu_ids != NR_CPUS) -		printk(KERN_INFO "\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%d.\n", NR_CPUS, nr_cpu_ids); +		pr_info("\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%d.\n", NR_CPUS, nr_cpu_ids);  #ifdef CONFIG_RCU_NOCB_CPU  #ifndef CONFIG_RCU_NOCB_CPU_NONE  	if (!have_rcu_nocb_mask) { @@ -92,19 +91,19 @@ static void __init rcu_bootup_announce_oddness(void)  		have_rcu_nocb_mask = true;  	}  #ifdef CONFIG_RCU_NOCB_CPU_ZERO -	pr_info("\tExperimental no-CBs CPU 0\n"); +	pr_info("\tOffload RCU callbacks from CPU 0\n");  	cpumask_set_cpu(0, rcu_nocb_mask);  #endif /* #ifdef CONFIG_RCU_NOCB_CPU_ZERO */  #ifdef CONFIG_RCU_NOCB_CPU_ALL -	pr_info("\tExperimental no-CBs for all CPUs\n"); +	pr_info("\tOffload RCU callbacks from all CPUs\n");  	cpumask_setall(rcu_nocb_mask);  #endif /* #ifdef CONFIG_RCU_NOCB_CPU_ALL */  #endif /* #ifndef CONFIG_RCU_NOCB_CPU_NONE */  	if (have_rcu_nocb_mask) {  		cpulist_scnprintf(nocb_buf, sizeof(nocb_buf), rcu_nocb_mask); -		pr_info("\tExperimental no-CBs CPUs: %s.\n", nocb_buf); +		pr_info("\tOffload RCU callbacks from CPUs: %s.\n", nocb_buf);  		if (rcu_nocb_poll) -			pr_info("\tExperimental polled no-CBs CPUs.\n"); +			pr_info("\tPoll for callbacks from no-CBs CPUs.\n");  	}  #endif /* #ifdef CONFIG_RCU_NOCB_CPU */  } @@ -123,7 +122,7 @@ static int rcu_preempted_readers_exp(struct rcu_node *rnp);   */  static void __init rcu_bootup_announce(void)  { -	printk(KERN_INFO 
"Preemptible hierarchical RCU implementation.\n"); +	pr_info("Preemptible hierarchical RCU implementation.\n");  	rcu_bootup_announce_oddness();  } @@ -490,13 +489,13 @@ static void rcu_print_detail_task_stall(struct rcu_state *rsp)  static void rcu_print_task_stall_begin(struct rcu_node *rnp)  { -	printk(KERN_ERR "\tTasks blocked on level-%d rcu_node (CPUs %d-%d):", +	pr_err("\tTasks blocked on level-%d rcu_node (CPUs %d-%d):",  	       rnp->level, rnp->grplo, rnp->grphi);  }  static void rcu_print_task_stall_end(void)  { -	printk(KERN_CONT "\n"); +	pr_cont("\n");  }  #else /* #ifdef CONFIG_RCU_CPU_STALL_INFO */ @@ -526,7 +525,7 @@ static int rcu_print_task_stall(struct rcu_node *rnp)  	t = list_entry(rnp->gp_tasks,  		       struct task_struct, rcu_node_entry);  	list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) { -		printk(KERN_CONT " P%d", t->pid); +		pr_cont(" P%d", t->pid);  		ndetected++;  	}  	rcu_print_task_stall_end(); @@ -933,6 +932,24 @@ static void __init __rcu_init_preempt(void)  	rcu_init_one(&rcu_preempt_state, &rcu_preempt_data);  } +/* + * Check for a task exiting while in a preemptible-RCU read-side + * critical section, clean up if so.  No need to issue warnings, + * as debug_check_no_locks_held() already does this if lockdep + * is enabled. 
+ */ +void exit_rcu(void) +{ +	struct task_struct *t = current; + +	if (likely(list_empty(&current->rcu_node_entry))) +		return; +	t->rcu_read_lock_nesting = 1; +	barrier(); +	t->rcu_read_unlock_special = RCU_READ_UNLOCK_BLOCKED; +	__rcu_read_unlock(); +} +  #else /* #ifdef CONFIG_TREE_PREEMPT_RCU */  static struct rcu_state *rcu_state = &rcu_sched_state; @@ -942,7 +959,7 @@ static struct rcu_state *rcu_state = &rcu_sched_state;   */  static void __init rcu_bootup_announce(void)  { -	printk(KERN_INFO "Hierarchical RCU implementation.\n"); +	pr_info("Hierarchical RCU implementation.\n");  	rcu_bootup_announce_oddness();  } @@ -1101,6 +1118,14 @@ static void __init __rcu_init_preempt(void)  {  } +/* + * Because preemptible RCU does not exist, tasks cannot possibly exit + * while in preemptible RCU read-side critical sections. + */ +void exit_rcu(void) +{ +} +  #endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */  #ifdef CONFIG_RCU_BOOST @@ -1629,7 +1654,7 @@ static bool rcu_try_advance_all_cbs(void)  		 */  		if (rdp->completed != rnp->completed &&  		    rdp->nxttail[RCU_DONE_TAIL] != rdp->nxttail[RCU_NEXT_TAIL]) -			rcu_process_gp_end(rsp, rdp); +			note_gp_changes(rsp, rdp);  		if (cpu_has_callbacks_ready_to_invoke(rdp))  			cbs_ready = true; @@ -1883,7 +1908,7 @@ static void print_cpu_stall_fast_no_hz(char *cp, int cpu)  /* Initiate the stall-info list. 
*/  static void print_cpu_stall_info_begin(void)  { -	printk(KERN_CONT "\n"); +	pr_cont("\n");  }  /* @@ -1914,7 +1939,7 @@ static void print_cpu_stall_info(struct rcu_state *rsp, int cpu)  		ticks_value = rsp->gpnum - rdp->gpnum;  	}  	print_cpu_stall_fast_no_hz(fast_no_hz, cpu); -	printk(KERN_ERR "\t%d: (%lu %s) idle=%03x/%llx/%d softirq=%u/%u %s\n", +	pr_err("\t%d: (%lu %s) idle=%03x/%llx/%d softirq=%u/%u %s\n",  	       cpu, ticks_value, ticks_title,  	       atomic_read(&rdtp->dynticks) & 0xfff,  	       rdtp->dynticks_nesting, rdtp->dynticks_nmi_nesting, @@ -1925,7 +1950,7 @@ static void print_cpu_stall_info(struct rcu_state *rsp, int cpu)  /* Terminate the stall-info list. */  static void print_cpu_stall_info_end(void)  { -	printk(KERN_ERR "\t"); +	pr_err("\t");  }  /* Zero ->ticks_this_gp for all flavors of RCU. */ @@ -1948,17 +1973,17 @@ static void increment_cpu_stall_ticks(void)  static void print_cpu_stall_info_begin(void)  { -	printk(KERN_CONT " {"); +	pr_cont(" {");  }  static void print_cpu_stall_info(struct rcu_state *rsp, int cpu)  { -	printk(KERN_CONT " %d", cpu); +	pr_cont(" %d", cpu);  }  static void print_cpu_stall_info_end(void)  { -	printk(KERN_CONT "} "); +	pr_cont("} ");  }  static void zero_cpu_stall_ticks(struct rcu_data *rdp) diff --git a/kernel/resource.c b/kernel/resource.c index d7386986e10e..3f285dce9347 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -409,6 +409,7 @@ int __weak page_is_ram(unsigned long pfn)  {  	return walk_system_ram_range(pfn, 1, NULL, __is_ram) == 1;  } +EXPORT_SYMBOL_GPL(page_is_ram);  void __weak arch_remove_reservations(struct resource *avail)  { @@ -448,7 +449,6 @@ static int __find_resource(struct resource *root, struct resource *old,  	struct resource *this = root->child;  	struct resource tmp = *new, avail, alloc; -	tmp.flags = new->flags;  	tmp.start = root->start;  	/*  	 * Skip past an allocated resource that starts at 0, since the assignment diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c 
index 1e09308bf2a1..0dd6aec1cb6a 100644 --- a/kernel/rtmutex.c +++ b/kernel/rtmutex.c @@ -145,6 +145,19 @@ int max_lock_depth = 1024;  /*   * Adjust the priority chain. Also used for deadlock detection.   * Decreases task's usage by one - may thus free the task. + * + * @task: the task owning the mutex (owner) for which a chain walk is probably + *	  needed + * @deadlock_detect: do we have to carry out deadlock detection? + * @orig_lock: the mutex (can be NULL if we are walking the chain to recheck + * 	       things for a task that has just got its priority adjusted, and + *	       is waiting on a mutex) + * @orig_waiter: rt_mutex_waiter struct for the task that has just donated + *		 its priority to the mutex owner (can be NULL in the case + *		 depicted above or if the top waiter is gone away and we are + *		 actually deboosting the owner) + * @top_task: the current top waiter + *   * Returns 0 or -EDEADLK.   */  static int rt_mutex_adjust_prio_chain(struct task_struct *task, diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index deaf90e4a1de..54adcf35f495 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -11,7 +11,7 @@ ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)  CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer  endif -obj-y += core.o clock.o cputime.o idle_task.o fair.o rt.o stop_task.o +obj-y += core.o proc.o clock.o cputime.o idle_task.o fair.o rt.o stop_task.o  obj-$(CONFIG_SMP) += cpupri.o  obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o  obj-$(CONFIG_SCHEDSTATS) += stats.o diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c index 64de5f8b0c9e..4a073539c58e 100644 --- a/kernel/sched/auto_group.c +++ b/kernel/sched/auto_group.c @@ -77,8 +77,6 @@ static inline struct autogroup *autogroup_create(void)  	if (IS_ERR(tg))  		goto out_free; -	sched_online_group(tg, &root_task_group); -  	kref_init(&ag->kref);  	init_rwsem(&ag->lock);  	ag->id = atomic_inc_return(&autogroup_seq_nr); @@ -98,6 +96,7 @@ static inline 
struct autogroup *autogroup_create(void)  #endif  	tg->autogroup = ag; +	sched_online_group(tg, &root_task_group);  	return ag;  out_free: diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 58453b8272fd..9b1f2e533b95 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -633,7 +633,19 @@ void wake_up_nohz_cpu(int cpu)  static inline bool got_nohz_idle_kick(void)  {  	int cpu = smp_processor_id(); -	return idle_cpu(cpu) && test_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu)); + +	if (!test_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu))) +		return false; + +	if (idle_cpu(cpu) && !need_resched()) +		return true; + +	/* +	 * We can't run Idle Load Balance on this CPU for this time so we +	 * cancel it and clear NOHZ_BALANCE_KICK +	 */ +	clear_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu)); +	return false;  }  #else /* CONFIG_NO_HZ_COMMON */ @@ -667,7 +679,7 @@ void sched_avg_update(struct rq *rq)  {  	s64 period = sched_avg_period(); -	while ((s64)(rq->clock - rq->age_stamp) > period) { +	while ((s64)(rq_clock(rq) - rq->age_stamp) > period) {  		/*  		 * Inline assembly required to prevent the compiler  		 * optimising this loop into a divmod call. 
@@ -1328,7 +1340,7 @@ ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)  		p->sched_class->task_woken(rq, p);  	if (rq->idle_stamp) { -		u64 delta = rq->clock - rq->idle_stamp; +		u64 delta = rq_clock(rq) - rq->idle_stamp;  		u64 max = 2*sysctl_sched_migration_cost;  		if (delta > max) @@ -1365,6 +1377,8 @@ static int ttwu_remote(struct task_struct *p, int wake_flags)  	rq = __task_rq_lock(p);  	if (p->on_rq) { +		/* check_preempt_curr() may use rq clock */ +		update_rq_clock(rq);  		ttwu_do_wakeup(rq, p, wake_flags);  		ret = 1;  	} @@ -1393,8 +1407,9 @@ static void sched_ttwu_pending(void)  void scheduler_ipi(void)  { -	if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick() -	    && !tick_nohz_full_cpu(smp_processor_id())) +	if (llist_empty(&this_rq()->wake_list) +			&& !tick_nohz_full_cpu(smp_processor_id()) +			&& !got_nohz_idle_kick())  		return;  	/* @@ -1417,7 +1432,7 @@ void scheduler_ipi(void)  	/*  	 * Check if someone kicked us for doing the nohz idle load balance.  	 */ -	if (unlikely(got_nohz_idle_kick() && !need_resched())) { +	if (unlikely(got_nohz_idle_kick())) {  		this_rq()->idle_balance = 1;  		raise_softirq_irqoff(SCHED_SOFTIRQ);  	} @@ -1596,15 +1611,6 @@ static void __sched_fork(struct task_struct *p)  	p->se.vruntime			= 0;  	INIT_LIST_HEAD(&p->se.group_node); -/* - * Load-tracking only depends on SMP, FAIR_GROUP_SCHED dependency below may be - * removed when useful for applications beyond shares distribution (e.g. - * load-balance). 
- */ -#if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED) -	p->se.avg.runnable_avg_period = 0; -	p->se.avg.runnable_avg_sum = 0; -#endif  #ifdef CONFIG_SCHEDSTATS  	memset(&p->se.statistics, 0, sizeof(p->se.statistics));  #endif @@ -1748,6 +1754,8 @@ void wake_up_new_task(struct task_struct *p)  	set_task_cpu(p, select_task_rq(p, SD_BALANCE_FORK, 0));  #endif +	/* Initialize new task's runnable average */ +	init_task_runnable_average(p);  	rq = __task_rq_lock(p);  	activate_task(rq, p, 0);  	p->on_rq = 1; @@ -2056,575 +2064,6 @@ unsigned long nr_iowait_cpu(int cpu)  	return atomic_read(&this->nr_iowait);  } -unsigned long this_cpu_load(void) -{ -	struct rq *this = this_rq(); -	return this->cpu_load[0]; -} - - -/* - * Global load-average calculations - * - * We take a distributed and async approach to calculating the global load-avg - * in order to minimize overhead. - * - * The global load average is an exponentially decaying average of nr_running + - * nr_uninterruptible. - * - * Once every LOAD_FREQ: - * - *   nr_active = 0; - *   for_each_possible_cpu(cpu) - *   	nr_active += cpu_of(cpu)->nr_running + cpu_of(cpu)->nr_uninterruptible; - * - *   avenrun[n] = avenrun[0] * exp_n + nr_active * (1 - exp_n) - * - * Due to a number of reasons the above turns in the mess below: - * - *  - for_each_possible_cpu() is prohibitively expensive on machines with - *    serious number of cpus, therefore we need to take a distributed approach - *    to calculating nr_active. - * - *        \Sum_i x_i(t) = \Sum_i x_i(t) - x_i(t_0) | x_i(t_0) := 0 - *                      = \Sum_i { \Sum_j=1 x_i(t_j) - x_i(t_j-1) } - * - *    So assuming nr_active := 0 when we start out -- true per definition, we - *    can simply take per-cpu deltas and fold those into a global accumulate - *    to obtain the same result. See calc_load_fold_active(). 
- * - *    Furthermore, in order to avoid synchronizing all per-cpu delta folding - *    across the machine, we assume 10 ticks is sufficient time for every - *    cpu to have completed this task. - * - *    This places an upper-bound on the IRQ-off latency of the machine. Then - *    again, being late doesn't loose the delta, just wrecks the sample. - * - *  - cpu_rq()->nr_uninterruptible isn't accurately tracked per-cpu because - *    this would add another cross-cpu cacheline miss and atomic operation - *    to the wakeup path. Instead we increment on whatever cpu the task ran - *    when it went into uninterruptible state and decrement on whatever cpu - *    did the wakeup. This means that only the sum of nr_uninterruptible over - *    all cpus yields the correct result. - * - *  This covers the NO_HZ=n code, for extra head-aches, see the comment below. - */ - -/* Variables and functions for calc_load */ -static atomic_long_t calc_load_tasks; -static unsigned long calc_load_update; -unsigned long avenrun[3]; -EXPORT_SYMBOL(avenrun); /* should be removed */ - -/** - * get_avenrun - get the load average array - * @loads:	pointer to dest load array - * @offset:	offset to add - * @shift:	shift count to shift the result left - * - * These values are estimates at best, so no need for locking. 
- */ -void get_avenrun(unsigned long *loads, unsigned long offset, int shift) -{ -	loads[0] = (avenrun[0] + offset) << shift; -	loads[1] = (avenrun[1] + offset) << shift; -	loads[2] = (avenrun[2] + offset) << shift; -} - -static long calc_load_fold_active(struct rq *this_rq) -{ -	long nr_active, delta = 0; - -	nr_active = this_rq->nr_running; -	nr_active += (long) this_rq->nr_uninterruptible; - -	if (nr_active != this_rq->calc_load_active) { -		delta = nr_active - this_rq->calc_load_active; -		this_rq->calc_load_active = nr_active; -	} - -	return delta; -} - -/* - * a1 = a0 * e + a * (1 - e) - */ -static unsigned long -calc_load(unsigned long load, unsigned long exp, unsigned long active) -{ -	load *= exp; -	load += active * (FIXED_1 - exp); -	load += 1UL << (FSHIFT - 1); -	return load >> FSHIFT; -} - -#ifdef CONFIG_NO_HZ_COMMON -/* - * Handle NO_HZ for the global load-average. - * - * Since the above described distributed algorithm to compute the global - * load-average relies on per-cpu sampling from the tick, it is affected by - * NO_HZ. - * - * The basic idea is to fold the nr_active delta into a global idle-delta upon - * entering NO_HZ state such that we can include this as an 'extra' cpu delta - * when we read the global state. - * - * Obviously reality has to ruin such a delightfully simple scheme: - * - *  - When we go NO_HZ idle during the window, we can negate our sample - *    contribution, causing under-accounting. - * - *    We avoid this by keeping two idle-delta counters and flipping them - *    when the window starts, thus separating old and new NO_HZ load. - * - *    The only trick is the slight shift in index flip for read vs write. 
- * - *        0s            5s            10s           15s - *          +10           +10           +10           +10 - *        |-|-----------|-|-----------|-|-----------|-| - *    r:0 0 1           1 0           0 1           1 0 - *    w:0 1 1           0 0           1 1           0 0 - * - *    This ensures we'll fold the old idle contribution in this window while - *    accumlating the new one. - * - *  - When we wake up from NO_HZ idle during the window, we push up our - *    contribution, since we effectively move our sample point to a known - *    busy state. - * - *    This is solved by pushing the window forward, and thus skipping the - *    sample, for this cpu (effectively using the idle-delta for this cpu which - *    was in effect at the time the window opened). This also solves the issue - *    of having to deal with a cpu having been in NOHZ idle for multiple - *    LOAD_FREQ intervals. - * - * When making the ILB scale, we should try to pull this in as well. - */ -static atomic_long_t calc_load_idle[2]; -static int calc_load_idx; - -static inline int calc_load_write_idx(void) -{ -	int idx = calc_load_idx; - -	/* -	 * See calc_global_nohz(), if we observe the new index, we also -	 * need to observe the new update time. -	 */ -	smp_rmb(); - -	/* -	 * If the folding window started, make sure we start writing in the -	 * next idle-delta. -	 */ -	if (!time_before(jiffies, calc_load_update)) -		idx++; - -	return idx & 1; -} - -static inline int calc_load_read_idx(void) -{ -	return calc_load_idx & 1; -} - -void calc_load_enter_idle(void) -{ -	struct rq *this_rq = this_rq(); -	long delta; - -	/* -	 * We're going into NOHZ mode, if there's any pending delta, fold it -	 * into the pending idle delta. 
-	 */ -	delta = calc_load_fold_active(this_rq); -	if (delta) { -		int idx = calc_load_write_idx(); -		atomic_long_add(delta, &calc_load_idle[idx]); -	} -} - -void calc_load_exit_idle(void) -{ -	struct rq *this_rq = this_rq(); - -	/* -	 * If we're still before the sample window, we're done. -	 */ -	if (time_before(jiffies, this_rq->calc_load_update)) -		return; - -	/* -	 * We woke inside or after the sample window, this means we're already -	 * accounted through the nohz accounting, so skip the entire deal and -	 * sync up for the next window. -	 */ -	this_rq->calc_load_update = calc_load_update; -	if (time_before(jiffies, this_rq->calc_load_update + 10)) -		this_rq->calc_load_update += LOAD_FREQ; -} - -static long calc_load_fold_idle(void) -{ -	int idx = calc_load_read_idx(); -	long delta = 0; - -	if (atomic_long_read(&calc_load_idle[idx])) -		delta = atomic_long_xchg(&calc_load_idle[idx], 0); - -	return delta; -} - -/** - * fixed_power_int - compute: x^n, in O(log n) time - * - * @x:         base of the power - * @frac_bits: fractional bits of @x - * @n:         power to raise @x to. - * - * By exploiting the relation between the definition of the natural power - * function: x^n := x*x*...*x (x multiplied by itself for n times), and - * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i, - * (where: n_i \elem {0, 1}, the binary vector representing n), - * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is - * of course trivially computable in O(log_2 n), the length of our binary - * vector. 
- */ -static unsigned long -fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n) -{ -	unsigned long result = 1UL << frac_bits; - -	if (n) for (;;) { -		if (n & 1) { -			result *= x; -			result += 1UL << (frac_bits - 1); -			result >>= frac_bits; -		} -		n >>= 1; -		if (!n) -			break; -		x *= x; -		x += 1UL << (frac_bits - 1); -		x >>= frac_bits; -	} - -	return result; -} - -/* - * a1 = a0 * e + a * (1 - e) - * - * a2 = a1 * e + a * (1 - e) - *    = (a0 * e + a * (1 - e)) * e + a * (1 - e) - *    = a0 * e^2 + a * (1 - e) * (1 + e) - * - * a3 = a2 * e + a * (1 - e) - *    = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e) - *    = a0 * e^3 + a * (1 - e) * (1 + e + e^2) - * - *  ... - * - * an = a0 * e^n + a * (1 - e) * (1 + e + ... + e^n-1) [1] - *    = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e) - *    = a0 * e^n + a * (1 - e^n) - * - * [1] application of the geometric series: - * - *              n         1 - x^(n+1) - *     S_n := \Sum x^i = ------------- - *             i=0          1 - x - */ -static unsigned long -calc_load_n(unsigned long load, unsigned long exp, -	    unsigned long active, unsigned int n) -{ - -	return calc_load(load, fixed_power_int(exp, FSHIFT, n), active); -} - -/* - * NO_HZ can leave us missing all per-cpu ticks calling - * calc_load_account_active(), but since an idle CPU folds its delta into - * calc_load_tasks_idle per calc_load_account_idle(), all we need to do is fold - * in the pending idle delta if our idle period crossed a load cycle boundary. - * - * Once we've updated the global active value, we need to apply the exponential - * weights adjusted to the number of cycles missed. 
- */ -static void calc_global_nohz(void) -{ -	long delta, active, n; - -	if (!time_before(jiffies, calc_load_update + 10)) { -		/* -		 * Catch-up, fold however many we are behind still -		 */ -		delta = jiffies - calc_load_update - 10; -		n = 1 + (delta / LOAD_FREQ); - -		active = atomic_long_read(&calc_load_tasks); -		active = active > 0 ? active * FIXED_1 : 0; - -		avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n); -		avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n); -		avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n); - -		calc_load_update += n * LOAD_FREQ; -	} - -	/* -	 * Flip the idle index... -	 * -	 * Make sure we first write the new time then flip the index, so that -	 * calc_load_write_idx() will see the new time when it reads the new -	 * index, this avoids a double flip messing things up. -	 */ -	smp_wmb(); -	calc_load_idx++; -} -#else /* !CONFIG_NO_HZ_COMMON */ - -static inline long calc_load_fold_idle(void) { return 0; } -static inline void calc_global_nohz(void) { } - -#endif /* CONFIG_NO_HZ_COMMON */ - -/* - * calc_load - update the avenrun load estimates 10 ticks after the - * CPUs have updated calc_load_tasks. - */ -void calc_global_load(unsigned long ticks) -{ -	long active, delta; - -	if (time_before(jiffies, calc_load_update + 10)) -		return; - -	/* -	 * Fold the 'old' idle-delta to include all NO_HZ cpus. -	 */ -	delta = calc_load_fold_idle(); -	if (delta) -		atomic_long_add(delta, &calc_load_tasks); - -	active = atomic_long_read(&calc_load_tasks); -	active = active > 0 ? active * FIXED_1 : 0; - -	avenrun[0] = calc_load(avenrun[0], EXP_1, active); -	avenrun[1] = calc_load(avenrun[1], EXP_5, active); -	avenrun[2] = calc_load(avenrun[2], EXP_15, active); - -	calc_load_update += LOAD_FREQ; - -	/* -	 * In case we idled for multiple LOAD_FREQ intervals, catch up in bulk. -	 */ -	calc_global_nohz(); -} - -/* - * Called from update_cpu_load() to periodically update this CPU's - * active count. 
- */ -static void calc_load_account_active(struct rq *this_rq) -{ -	long delta; - -	if (time_before(jiffies, this_rq->calc_load_update)) -		return; - -	delta  = calc_load_fold_active(this_rq); -	if (delta) -		atomic_long_add(delta, &calc_load_tasks); - -	this_rq->calc_load_update += LOAD_FREQ; -} - -/* - * End of global load-average stuff - */ - -/* - * The exact cpuload at various idx values, calculated at every tick would be - * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load - * - * If a cpu misses updates for n-1 ticks (as it was idle) and update gets called - * on nth tick when cpu may be busy, then we have: - * load = ((2^idx - 1) / 2^idx)^(n-1) * load - * load = (2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load - * - * decay_load_missed() below does efficient calculation of - * load = ((2^idx - 1) / 2^idx)^(n-1) * load - * avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load - * - * The calculation is approximated on a 128 point scale. - * degrade_zero_ticks is the number of ticks after which load at any - * particular idx is approximated to be zero. - * degrade_factor is a precomputed table, a row for each load idx. - * Each column corresponds to degradation factor for a power of two ticks, - * based on 128 point scale. - * Example: - * row 2, col 3 (=12) says that the degradation at load idx 2 after - * 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8). - * - * With this power of 2 load factors, we can degrade the load n times - * by looking at 1 bits in n and doing as many mult/shift instead of - * n mult/shifts needed by the exact degradation. 
- */ -#define DEGRADE_SHIFT		7 -static const unsigned char -		degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128}; -static const unsigned char -		degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = { -					{0, 0, 0, 0, 0, 0, 0, 0}, -					{64, 32, 8, 0, 0, 0, 0, 0}, -					{96, 72, 40, 12, 1, 0, 0}, -					{112, 98, 75, 43, 15, 1, 0}, -					{120, 112, 98, 76, 45, 16, 2} }; - -/* - * Update cpu_load for any missed ticks, due to tickless idle. The backlog - * would be when CPU is idle and so we just decay the old load without - * adding any new load. - */ -static unsigned long -decay_load_missed(unsigned long load, unsigned long missed_updates, int idx) -{ -	int j = 0; - -	if (!missed_updates) -		return load; - -	if (missed_updates >= degrade_zero_ticks[idx]) -		return 0; - -	if (idx == 1) -		return load >> missed_updates; - -	while (missed_updates) { -		if (missed_updates % 2) -			load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT; - -		missed_updates >>= 1; -		j++; -	} -	return load; -} - -/* - * Update rq->cpu_load[] statistics. This function is usually called every - * scheduler tick (TICK_NSEC). With tickless idle this will not be called - * every tick. We fix it up based on jiffies. - */ -static void __update_cpu_load(struct rq *this_rq, unsigned long this_load, -			      unsigned long pending_updates) -{ -	int i, scale; - -	this_rq->nr_load_updates++; - -	/* Update our load: */ -	this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */ -	for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) { -		unsigned long old_load, new_load; - -		/* scale is effectively 1 << i now, and >> i divides by scale */ - -		old_load = this_rq->cpu_load[i]; -		old_load = decay_load_missed(old_load, pending_updates - 1, i); -		new_load = this_load; -		/* -		 * Round up the averaging division if load is increasing. This -		 * prevents us from getting stuck on 9 if the load is 10, for -		 * example. 
-		 */ -		if (new_load > old_load) -			new_load += scale - 1; - -		this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i; -	} - -	sched_avg_update(this_rq); -} - -#ifdef CONFIG_NO_HZ_COMMON -/* - * There is no sane way to deal with nohz on smp when using jiffies because the - * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading - * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}. - * - * Therefore we cannot use the delta approach from the regular tick since that - * would seriously skew the load calculation. However we'll make do for those - * updates happening while idle (nohz_idle_balance) or coming out of idle - * (tick_nohz_idle_exit). - * - * This means we might still be one tick off for nohz periods. - */ - -/* - * Called from nohz_idle_balance() to update the load ratings before doing the - * idle balance. - */ -void update_idle_cpu_load(struct rq *this_rq) -{ -	unsigned long curr_jiffies = ACCESS_ONCE(jiffies); -	unsigned long load = this_rq->load.weight; -	unsigned long pending_updates; - -	/* -	 * bail if there's load or we're actually up-to-date. -	 */ -	if (load || curr_jiffies == this_rq->last_load_update_tick) -		return; - -	pending_updates = curr_jiffies - this_rq->last_load_update_tick; -	this_rq->last_load_update_tick = curr_jiffies; - -	__update_cpu_load(this_rq, load, pending_updates); -} - -/* - * Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed. - */ -void update_cpu_load_nohz(void) -{ -	struct rq *this_rq = this_rq(); -	unsigned long curr_jiffies = ACCESS_ONCE(jiffies); -	unsigned long pending_updates; - -	if (curr_jiffies == this_rq->last_load_update_tick) -		return; - -	raw_spin_lock(&this_rq->lock); -	pending_updates = curr_jiffies - this_rq->last_load_update_tick; -	if (pending_updates) { -		this_rq->last_load_update_tick = curr_jiffies; -		/* -		 * We were idle, this means load 0, the current load might be -		 * !0 due to remote wakeups and the sort. 
-		 */ -		__update_cpu_load(this_rq, 0, pending_updates); -	} -	raw_spin_unlock(&this_rq->lock); -} -#endif /* CONFIG_NO_HZ_COMMON */ - -/* - * Called from scheduler_tick() - */ -static void update_cpu_load_active(struct rq *this_rq) -{ -	/* -	 * See the mess around update_idle_cpu_load() / update_cpu_load_nohz(). -	 */ -	this_rq->last_load_update_tick = jiffies; -	__update_cpu_load(this_rq, this_rq->load.weight, 1); - -	calc_load_account_active(this_rq); -} -  #ifdef CONFIG_SMP  /* @@ -2673,7 +2112,7 @@ static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq)  	if (task_current(rq, p)) {  		update_rq_clock(rq); -		ns = rq->clock_task - p->se.exec_start; +		ns = rq_clock_task(rq) - p->se.exec_start;  		if ((s64)ns < 0)  			ns = 0;  	} @@ -2726,8 +2165,8 @@ void scheduler_tick(void)  	raw_spin_lock(&rq->lock);  	update_rq_clock(rq); -	update_cpu_load_active(rq);  	curr->sched_class->task_tick(rq, curr, 0); +	update_cpu_load_active(rq);  	raw_spin_unlock(&rq->lock);  	perf_event_task_tick(); @@ -4745,7 +4184,7 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)  	 */  	idle->sched_class = &idle_sched_class;  	ftrace_graph_init_idle_task(idle, cpu); -	vtime_init_idle(idle); +	vtime_init_idle(idle, cpu);  #if defined(CONFIG_SMP)  	sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu);  #endif @@ -4947,6 +4386,13 @@ static void migrate_tasks(unsigned int dead_cpu)  	 */  	rq->stop = NULL; +	/* +	 * put_prev_task() and pick_next_task() sched +	 * class method both need to have an up-to-date +	 * value of rq->clock[_task] +	 */ +	update_rq_clock(rq); +  	for ( ; ; ) {  		/*  		 * There's this thread running, bail when that's the only @@ -5080,7 +4526,7 @@ sd_alloc_ctl_domain_table(struct sched_domain *sd)  	return table;  } -static ctl_table *sd_alloc_ctl_cpu_table(int cpu) +static struct ctl_table *sd_alloc_ctl_cpu_table(int cpu)  {  	struct ctl_table *entry, *table;  	struct sched_domain *sd; @@ -5894,7 +5340,7 @@ build_sched_groups(struct sched_domain 
*sd, int cpu)  	get_group(cpu, sdd, &sd->groups);  	atomic_inc(&sd->groups->ref); -	if (cpu != cpumask_first(sched_domain_span(sd))) +	if (cpu != cpumask_first(span))  		return 0;  	lockdep_assert_held(&sched_domains_mutex); @@ -5904,12 +5350,12 @@ build_sched_groups(struct sched_domain *sd, int cpu)  	for_each_cpu(i, span) {  		struct sched_group *sg; -		int group = get_group(i, sdd, &sg); -		int j; +		int group, j;  		if (cpumask_test_cpu(i, covered))  			continue; +		group = get_group(i, sdd, &sg);  		cpumask_clear(sched_group_cpus(sg));  		sg->sgp->power = 0;  		cpumask_setall(sched_group_mask(sg)); @@ -5947,7 +5393,7 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)  {  	struct sched_group *sg = sd->groups; -	WARN_ON(!sd || !sg); +	WARN_ON(!sg);  	do {  		sg->group_weight = cpumask_weight(sched_group_cpus(sg)); @@ -6112,6 +5558,9 @@ static struct sched_domain_topology_level default_topology[] = {  static struct sched_domain_topology_level *sched_domain_topology = default_topology; +#define for_each_sd_topology(tl)			\ +	for (tl = sched_domain_topology; tl->init; tl++) +  #ifdef CONFIG_NUMA  static int sched_domains_numa_levels; @@ -6409,7 +5858,7 @@ static int __sdt_alloc(const struct cpumask *cpu_map)  	struct sched_domain_topology_level *tl;  	int j; -	for (tl = sched_domain_topology; tl->init; tl++) { +	for_each_sd_topology(tl) {  		struct sd_data *sdd = &tl->data;  		sdd->sd = alloc_percpu(struct sched_domain *); @@ -6462,7 +5911,7 @@ static void __sdt_free(const struct cpumask *cpu_map)  	struct sched_domain_topology_level *tl;  	int j; -	for (tl = sched_domain_topology; tl->init; tl++) { +	for_each_sd_topology(tl) {  		struct sd_data *sdd = &tl->data;  		for_each_cpu(j, cpu_map) { @@ -6490,9 +5939,8 @@ static void __sdt_free(const struct cpumask *cpu_map)  }  struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, -		struct s_data *d, const struct cpumask *cpu_map, -		struct sched_domain_attr *attr, struct 
sched_domain *child, -		int cpu) +		const struct cpumask *cpu_map, struct sched_domain_attr *attr, +		struct sched_domain *child, int cpu)  {  	struct sched_domain *sd = tl->init(tl, cpu);  	if (!sd) @@ -6503,8 +5951,8 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,  		sd->level = child->level + 1;  		sched_domain_level_max = max(sched_domain_level_max, sd->level);  		child->parent = sd; +		sd->child = child;  	} -	sd->child = child;  	set_domain_attribute(sd, attr);  	return sd; @@ -6517,7 +5965,7 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,  static int build_sched_domains(const struct cpumask *cpu_map,  			       struct sched_domain_attr *attr)  { -	enum s_alloc alloc_state = sa_none; +	enum s_alloc alloc_state;  	struct sched_domain *sd;  	struct s_data d;  	int i, ret = -ENOMEM; @@ -6531,18 +5979,15 @@ static int build_sched_domains(const struct cpumask *cpu_map,  		struct sched_domain_topology_level *tl;  		sd = NULL; -		for (tl = sched_domain_topology; tl->init; tl++) { -			sd = build_sched_domain(tl, &d, cpu_map, attr, sd, i); +		for_each_sd_topology(tl) { +			sd = build_sched_domain(tl, cpu_map, attr, sd, i); +			if (tl == sched_domain_topology) +				*per_cpu_ptr(d.sd, i) = sd;  			if (tl->flags & SDTL_OVERLAP || sched_feat(FORCE_SD_OVERLAP))  				sd->flags |= SD_OVERLAP;  			if (cpumask_equal(cpu_map, sched_domain_span(sd)))  				break;  		} - -		while (sd->child) -			sd = sd->child; - -		*per_cpu_ptr(d.sd, i) = sd;  	}  	/* Build the groups for the domains */ @@ -6854,9 +6299,6 @@ void __init sched_init_smp(void)  	hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE);  	hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE); -	/* RT runtime code needs to handle some hotplug events */ -	hotcpu_notifier(update_runtime, 0); -  	init_hrtick();  	/* Move init over to a non-isolated CPU */ diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 
cc2dc3eea8a3..a7959e05a9d5 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -515,9 +515,8 @@ static cputime_t scale_stime(u64 stime, u64 rtime, u64 total)  	for (;;) {  		/* Make sure "rtime" is the bigger of stime/rtime */ -		if (stime > rtime) { -			u64 tmp = rtime; rtime = stime; stime = tmp; -		} +		if (stime > rtime) +			swap(rtime, stime);  		/* Make sure 'total' fits in 32 bits */  		if (total >> 32) @@ -747,17 +746,17 @@ void arch_vtime_task_switch(struct task_struct *prev)  	write_seqlock(¤t->vtime_seqlock);  	current->vtime_snap_whence = VTIME_SYS; -	current->vtime_snap = sched_clock(); +	current->vtime_snap = sched_clock_cpu(smp_processor_id());  	write_sequnlock(¤t->vtime_seqlock);  } -void vtime_init_idle(struct task_struct *t) +void vtime_init_idle(struct task_struct *t, int cpu)  {  	unsigned long flags;  	write_seqlock_irqsave(&t->vtime_seqlock, flags);  	t->vtime_snap_whence = VTIME_SYS; -	t->vtime_snap = sched_clock(); +	t->vtime_snap = sched_clock_cpu(cpu);  	write_sequnlock_irqrestore(&t->vtime_seqlock, flags);  } diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 75024a673520..e076bddd4c66 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -209,22 +209,24 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)  			cfs_rq->nr_spread_over);  	SEQ_printf(m, "  .%-30s: %d\n", "nr_running", cfs_rq->nr_running);  	SEQ_printf(m, "  .%-30s: %ld\n", "load", cfs_rq->load.weight); -#ifdef CONFIG_FAIR_GROUP_SCHED  #ifdef CONFIG_SMP -	SEQ_printf(m, "  .%-30s: %lld\n", "runnable_load_avg", +	SEQ_printf(m, "  .%-30s: %ld\n", "runnable_load_avg",  			cfs_rq->runnable_load_avg); -	SEQ_printf(m, "  .%-30s: %lld\n", "blocked_load_avg", +	SEQ_printf(m, "  .%-30s: %ld\n", "blocked_load_avg",  			cfs_rq->blocked_load_avg); -	SEQ_printf(m, "  .%-30s: %lld\n", "tg_load_avg", -			(unsigned long long)atomic64_read(&cfs_rq->tg->load_avg)); -	SEQ_printf(m, "  .%-30s: %lld\n", "tg_load_contrib", +#ifdef 
CONFIG_FAIR_GROUP_SCHED +	SEQ_printf(m, "  .%-30s: %ld\n", "tg_load_contrib",  			cfs_rq->tg_load_contrib);  	SEQ_printf(m, "  .%-30s: %d\n", "tg_runnable_contrib",  			cfs_rq->tg_runnable_contrib); +	SEQ_printf(m, "  .%-30s: %ld\n", "tg_load_avg", +			atomic_long_read(&cfs_rq->tg->load_avg));  	SEQ_printf(m, "  .%-30s: %d\n", "tg->runnable_avg",  			atomic_read(&cfs_rq->tg->runnable_avg));  #endif +#endif +#ifdef CONFIG_FAIR_GROUP_SCHED  	print_cfs_group_stats(m, cpu, cfs_rq->tg);  #endif  } @@ -493,15 +495,16 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)  	SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, p->pid,  						get_nr_threads(p));  	SEQ_printf(m, -		"---------------------------------------------------------\n"); +		"---------------------------------------------------------" +		"----------\n");  #define __P(F) \ -	SEQ_printf(m, "%-35s:%21Ld\n", #F, (long long)F) +	SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)F)  #define P(F) \ -	SEQ_printf(m, "%-35s:%21Ld\n", #F, (long long)p->F) +	SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)p->F)  #define __PN(F) \ -	SEQ_printf(m, "%-35s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)F)) +	SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)F))  #define PN(F) \ -	SEQ_printf(m, "%-35s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F)) +	SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F))  	PN(se.exec_start);  	PN(se.vruntime); @@ -560,12 +563,18 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)  	}  #endif  	__P(nr_switches); -	SEQ_printf(m, "%-35s:%21Ld\n", +	SEQ_printf(m, "%-45s:%21Ld\n",  		   "nr_voluntary_switches", (long long)p->nvcsw); -	SEQ_printf(m, "%-35s:%21Ld\n", +	SEQ_printf(m, "%-45s:%21Ld\n",  		   "nr_involuntary_switches", (long long)p->nivcsw);  	P(se.load.weight); +#ifdef CONFIG_SMP +	P(se.avg.runnable_avg_sum); +	P(se.avg.runnable_avg_period); +	P(se.avg.load_avg_contrib); +	P(se.avg.decay_count); +#endif  	P(policy);  	P(prio);  #undef 
PN @@ -579,7 +588,7 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)  		t0 = cpu_clock(this_cpu);  		t1 = cpu_clock(this_cpu); -		SEQ_printf(m, "%-35s:%21Ld\n", +		SEQ_printf(m, "%-45s:%21Ld\n",  			   "clock-delta", (long long)(t1-t0));  	}  } diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index c61a614465c8..f77f9c527449 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -113,6 +113,24 @@ unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;  unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;  #endif +static inline void update_load_add(struct load_weight *lw, unsigned long inc) +{ +	lw->weight += inc; +	lw->inv_weight = 0; +} + +static inline void update_load_sub(struct load_weight *lw, unsigned long dec) +{ +	lw->weight -= dec; +	lw->inv_weight = 0; +} + +static inline void update_load_set(struct load_weight *lw, unsigned long w) +{ +	lw->weight = w; +	lw->inv_weight = 0; +} +  /*   * Increase the granularity value when there are more CPUs,   * because with more CPUs the 'effective latency' as visible @@ -662,6 +680,26 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)  	return calc_delta_fair(sched_slice(cfs_rq, se), se);  } +#ifdef CONFIG_SMP +static inline void __update_task_entity_contrib(struct sched_entity *se); + +/* Give new task start runnable values to heavy its load in infant time */ +void init_task_runnable_average(struct task_struct *p) +{ +	u32 slice; + +	p->se.avg.decay_count = 0; +	slice = sched_slice(task_cfs_rq(p), &p->se) >> 10; +	p->se.avg.runnable_avg_sum = slice; +	p->se.avg.runnable_avg_period = slice; +	__update_task_entity_contrib(&p->se); +} +#else +void init_task_runnable_average(struct task_struct *p) +{ +} +#endif +  /*   * Update the current task's runtime statistics. Skip current tasks that   * are not in our scheduling class. 
@@ -686,7 +724,7 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,  static void update_curr(struct cfs_rq *cfs_rq)  {  	struct sched_entity *curr = cfs_rq->curr; -	u64 now = rq_of(cfs_rq)->clock_task; +	u64 now = rq_clock_task(rq_of(cfs_rq));  	unsigned long delta_exec;  	if (unlikely(!curr)) @@ -718,7 +756,7 @@ static void update_curr(struct cfs_rq *cfs_rq)  static inline void  update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)  { -	schedstat_set(se->statistics.wait_start, rq_of(cfs_rq)->clock); +	schedstat_set(se->statistics.wait_start, rq_clock(rq_of(cfs_rq)));  }  /* @@ -738,14 +776,14 @@ static void  update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)  {  	schedstat_set(se->statistics.wait_max, max(se->statistics.wait_max, -			rq_of(cfs_rq)->clock - se->statistics.wait_start)); +			rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start));  	schedstat_set(se->statistics.wait_count, se->statistics.wait_count + 1);  	schedstat_set(se->statistics.wait_sum, se->statistics.wait_sum + -			rq_of(cfs_rq)->clock - se->statistics.wait_start); +			rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start);  #ifdef CONFIG_SCHEDSTATS  	if (entity_is_task(se)) {  		trace_sched_stat_wait(task_of(se), -			rq_of(cfs_rq)->clock - se->statistics.wait_start); +			rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start);  	}  #endif  	schedstat_set(se->statistics.wait_start, 0); @@ -771,7 +809,7 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)  	/*  	 * We are starting a new run period:  	 */ -	se->exec_start = rq_of(cfs_rq)->clock_task; +	se->exec_start = rq_clock_task(rq_of(cfs_rq));  }  /************************************************** @@ -1037,7 +1075,7 @@ static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq)  	 * to gain a more accurate current total weight. See  	 * update_cfs_rq_load_contribution().  	 
*/ -	tg_weight = atomic64_read(&tg->load_avg); +	tg_weight = atomic_long_read(&tg->load_avg);  	tg_weight -= cfs_rq->tg_load_contrib;  	tg_weight += cfs_rq->load.weight; @@ -1110,8 +1148,7 @@ static inline void update_cfs_shares(struct cfs_rq *cfs_rq)  }  #endif /* CONFIG_FAIR_GROUP_SCHED */ -/* Only depends on SMP, FAIR_GROUP_SCHED may be removed when useful in lb */ -#if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED) +#ifdef CONFIG_SMP  /*   * We choose a half-life close to 1 scheduling period.   * Note: The tables below are dependent on this value. @@ -1319,13 +1356,13 @@ static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,  						 int force_update)  {  	struct task_group *tg = cfs_rq->tg; -	s64 tg_contrib; +	long tg_contrib;  	tg_contrib = cfs_rq->runnable_load_avg + cfs_rq->blocked_load_avg;  	tg_contrib -= cfs_rq->tg_load_contrib; -	if (force_update || abs64(tg_contrib) > cfs_rq->tg_load_contrib / 8) { -		atomic64_add(tg_contrib, &tg->load_avg); +	if (force_update || abs(tg_contrib) > cfs_rq->tg_load_contrib / 8) { +		atomic_long_add(tg_contrib, &tg->load_avg);  		cfs_rq->tg_load_contrib += tg_contrib;  	}  } @@ -1360,8 +1397,8 @@ static inline void __update_group_entity_contrib(struct sched_entity *se)  	u64 contrib;  	contrib = cfs_rq->tg_load_contrib * tg->shares; -	se->avg.load_avg_contrib = div64_u64(contrib, -					     atomic64_read(&tg->load_avg) + 1); +	se->avg.load_avg_contrib = div_u64(contrib, +				     atomic_long_read(&tg->load_avg) + 1);  	/*  	 * For group entities we need to compute a correction term in the case @@ -1480,8 +1517,9 @@ static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update)  	if (!decays && !force_update)  		return; -	if (atomic64_read(&cfs_rq->removed_load)) { -		u64 removed_load = atomic64_xchg(&cfs_rq->removed_load, 0); +	if (atomic_long_read(&cfs_rq->removed_load)) { +		unsigned long removed_load; +		removed_load = atomic_long_xchg(&cfs_rq->removed_load, 0);  		
subtract_blocked_load_contrib(cfs_rq, removed_load);  	} @@ -1497,7 +1535,7 @@ static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update)  static inline void update_rq_runnable_avg(struct rq *rq, int runnable)  { -	__update_entity_runnable_avg(rq->clock_task, &rq->avg, runnable); +	__update_entity_runnable_avg(rq_clock_task(rq), &rq->avg, runnable);  	__update_tg_runnable_avg(&rq->avg, &rq->cfs);  } @@ -1510,9 +1548,13 @@ static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,  	 * We track migrations using entity decay_count <= 0, on a wake-up  	 * migration we use a negative decay count to track the remote decays  	 * accumulated while sleeping. +	 * +	 * Newly forked tasks are enqueued with se->avg.decay_count == 0, they +	 * are seen by enqueue_entity_load_avg() as a migration with an already +	 * constructed load_avg_contrib.  	 */  	if (unlikely(se->avg.decay_count <= 0)) { -		se->avg.last_runnable_update = rq_of(cfs_rq)->clock_task; +		se->avg.last_runnable_update = rq_clock_task(rq_of(cfs_rq));  		if (se->avg.decay_count) {  			/*  			 * In a wake-up migration we have to approximate the @@ -1530,7 +1572,13 @@ static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,  		}  		wakeup = 0;  	} else { -		__synchronize_entity_decay(se); +		/* +		 * Task re-woke on same cpu (or else migrate_task_rq_fair() +		 * would have made count negative); we must be careful to avoid +		 * double-accounting blocked time after synchronizing decays. 
+		 */ +		se->avg.last_runnable_update += __synchronize_entity_decay(se) +							<< 20;  	}  	/* migrated tasks did not contribute to our blocked load */ @@ -1607,7 +1655,7 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)  		tsk = task_of(se);  	if (se->statistics.sleep_start) { -		u64 delta = rq_of(cfs_rq)->clock - se->statistics.sleep_start; +		u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.sleep_start;  		if ((s64)delta < 0)  			delta = 0; @@ -1624,7 +1672,7 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)  		}  	}  	if (se->statistics.block_start) { -		u64 delta = rq_of(cfs_rq)->clock - se->statistics.block_start; +		u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.block_start;  		if ((s64)delta < 0)  			delta = 0; @@ -1712,7 +1760,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)  {  	/*  	 * Update the normalized vruntime before updating min_vruntime -	 * through callig update_curr(). +	 * through calling update_curr().  	 
*/  	if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING))  		se->vruntime += cfs_rq->min_vruntime; @@ -1805,9 +1853,9 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)  			struct task_struct *tsk = task_of(se);  			if (tsk->state & TASK_INTERRUPTIBLE) -				se->statistics.sleep_start = rq_of(cfs_rq)->clock; +				se->statistics.sleep_start = rq_clock(rq_of(cfs_rq));  			if (tsk->state & TASK_UNINTERRUPTIBLE) -				se->statistics.block_start = rq_of(cfs_rq)->clock; +				se->statistics.block_start = rq_clock(rq_of(cfs_rq));  		}  #endif  	} @@ -2082,7 +2130,7 @@ static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)  	if (unlikely(cfs_rq->throttle_count))  		return cfs_rq->throttled_clock_task; -	return rq_of(cfs_rq)->clock_task - cfs_rq->throttled_clock_task_time; +	return rq_clock_task(rq_of(cfs_rq)) - cfs_rq->throttled_clock_task_time;  }  /* returns 0 on failure to allocate runtime */ @@ -2138,10 +2186,9 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)  static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq)  {  	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); -	struct rq *rq = rq_of(cfs_rq);  	/* if the deadline is ahead of our clock, nothing to do */ -	if (likely((s64)(rq->clock - cfs_rq->runtime_expires) < 0)) +	if (likely((s64)(rq_clock(rq_of(cfs_rq)) - cfs_rq->runtime_expires) < 0))  		return;  	if (cfs_rq->runtime_remaining < 0) @@ -2230,7 +2277,7 @@ static int tg_unthrottle_up(struct task_group *tg, void *data)  #ifdef CONFIG_SMP  	if (!cfs_rq->throttle_count) {  		/* adjust cfs_rq_clock_task() */ -		cfs_rq->throttled_clock_task_time += rq->clock_task - +		cfs_rq->throttled_clock_task_time += rq_clock_task(rq) -  					     cfs_rq->throttled_clock_task;  	}  #endif @@ -2245,7 +2292,7 @@ static int tg_throttle_down(struct task_group *tg, void *data)  	/* group is entering throttled state, stop time */  	if (!cfs_rq->throttle_count) -		cfs_rq->throttled_clock_task = rq->clock_task; +		
cfs_rq->throttled_clock_task = rq_clock_task(rq);  	cfs_rq->throttle_count++;  	return 0; @@ -2284,7 +2331,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)  		rq->nr_running -= task_delta;  	cfs_rq->throttled = 1; -	cfs_rq->throttled_clock = rq->clock; +	cfs_rq->throttled_clock = rq_clock(rq);  	raw_spin_lock(&cfs_b->lock);  	list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);  	raw_spin_unlock(&cfs_b->lock); @@ -2298,15 +2345,17 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)  	int enqueue = 1;  	long task_delta; -	se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))]; +	se = cfs_rq->tg->se[cpu_of(rq)];  	cfs_rq->throttled = 0; + +	update_rq_clock(rq); +  	raw_spin_lock(&cfs_b->lock); -	cfs_b->throttled_time += rq->clock - cfs_rq->throttled_clock; +	cfs_b->throttled_time += rq_clock(rq) - cfs_rq->throttled_clock;  	list_del_rcu(&cfs_rq->throttled_list);  	raw_spin_unlock(&cfs_b->lock); -	update_rq_clock(rq);  	/* update hierarchical throttle state */  	walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq); @@ -2599,10 +2648,6 @@ static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq)  	throttle_cfs_rq(cfs_rq);  } -static inline u64 default_cfs_period(void); -static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun); -static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b); -  static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)  {  	struct cfs_bandwidth *cfs_b = @@ -2706,7 +2751,7 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)  #else /* CONFIG_CFS_BANDWIDTH */  static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)  { -	return rq_of(cfs_rq)->clock_task; +	return rq_clock_task(rq_of(cfs_rq));  }  static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, @@ -2919,7 +2964,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)  /* Used instead of source_load when we know the type == 0 */  static unsigned long weighted_cpuload(const 
int cpu)  { -	return cpu_rq(cpu)->load.weight; +	return cpu_rq(cpu)->cfs.runnable_load_avg;  }  /* @@ -2964,9 +3009,10 @@ static unsigned long cpu_avg_load_per_task(int cpu)  {  	struct rq *rq = cpu_rq(cpu);  	unsigned long nr_running = ACCESS_ONCE(rq->nr_running); +	unsigned long load_avg = rq->cfs.runnable_load_avg;  	if (nr_running) -		return rq->load.weight / nr_running; +		return load_avg / nr_running;  	return 0;  } @@ -3416,12 +3462,6 @@ unlock:  }  /* - * Load-tracking only depends on SMP, FAIR_GROUP_SCHED dependency below may be - * removed when useful for applications beyond shares distribution (e.g. - * load-balance). - */ -#ifdef CONFIG_FAIR_GROUP_SCHED -/*   * Called immediately before a task is migrated to a new cpu; task_cpu(p) and   * cfs_rq_of(p) references at time of call are still valid and identify the   * previous cpu.  However, the caller only guarantees p->pi_lock is held; no @@ -3441,10 +3481,10 @@ migrate_task_rq_fair(struct task_struct *p, int next_cpu)  	 */  	if (se->avg.decay_count) {  		se->avg.decay_count = -__synchronize_entity_decay(se); -		atomic64_add(se->avg.load_avg_contrib, &cfs_rq->removed_load); +		atomic_long_add(se->avg.load_avg_contrib, +						&cfs_rq->removed_load);  	}  } -#endif  #endif /* CONFIG_SMP */  static unsigned long @@ -3946,7 +3986,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)  	 * 2) too many balance attempts have failed.  	 
*/ -	tsk_cache_hot = task_hot(p, env->src_rq->clock_task, env->sd); +	tsk_cache_hot = task_hot(p, rq_clock_task(env->src_rq), env->sd);  	if (!tsk_cache_hot ||  		env->sd->nr_balance_failed > env->sd->cache_nice_tries) { @@ -4141,11 +4181,11 @@ static int tg_load_down(struct task_group *tg, void *data)  	long cpu = (long)data;  	if (!tg->parent) { -		load = cpu_rq(cpu)->load.weight; +		load = cpu_rq(cpu)->avg.load_avg_contrib;  	} else {  		load = tg->parent->cfs_rq[cpu]->h_load; -		load *= tg->se[cpu]->load.weight; -		load /= tg->parent->cfs_rq[cpu]->load.weight + 1; +		load = div64_ul(load * tg->se[cpu]->avg.load_avg_contrib, +				tg->parent->cfs_rq[cpu]->runnable_load_avg + 1);  	}  	tg->cfs_rq[cpu]->h_load = load; @@ -4171,12 +4211,9 @@ static void update_h_load(long cpu)  static unsigned long task_h_load(struct task_struct *p)  {  	struct cfs_rq *cfs_rq = task_cfs_rq(p); -	unsigned long load; - -	load = p->se.load.weight; -	load = div_u64(load * cfs_rq->h_load, cfs_rq->load.weight + 1); -	return load; +	return div64_ul(p->se.avg.load_avg_contrib * cfs_rq->h_load, +			cfs_rq->runnable_load_avg + 1);  }  #else  static inline void update_blocked_averages(int cpu) @@ -4189,7 +4226,7 @@ static inline void update_h_load(long cpu)  static unsigned long task_h_load(struct task_struct *p)  { -	return p->se.load.weight; +	return p->se.avg.load_avg_contrib;  }  #endif @@ -4302,7 +4339,7 @@ static unsigned long scale_rt_power(int cpu)  	age_stamp = ACCESS_ONCE(rq->age_stamp);  	avg = ACCESS_ONCE(rq->rt_avg); -	total = sched_avg_period() + (rq->clock - age_stamp); +	total = sched_avg_period() + (rq_clock(rq) - age_stamp);  	if (unlikely(total < avg)) {  		/* Ensures that power won't end up being negative */ @@ -5241,7 +5278,7 @@ void idle_balance(int this_cpu, struct rq *this_rq)  	int pulled_task = 0;  	unsigned long next_balance = jiffies + HZ; -	this_rq->idle_stamp = this_rq->clock; +	this_rq->idle_stamp = rq_clock(this_rq);  	if (this_rq->avg_idle < 
sysctl_sched_migration_cost)  		return; @@ -5418,10 +5455,9 @@ static inline void nohz_balance_exit_idle(int cpu)  static inline void set_cpu_sd_state_busy(void)  {  	struct sched_domain *sd; -	int cpu = smp_processor_id();  	rcu_read_lock(); -	sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); +	sd = rcu_dereference_check_sched_domain(this_rq()->sd);  	if (!sd || !sd->nohz_idle)  		goto unlock; @@ -5436,10 +5472,9 @@ unlock:  void set_cpu_sd_state_idle(void)  {  	struct sched_domain *sd; -	int cpu = smp_processor_id();  	rcu_read_lock(); -	sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); +	sd = rcu_dereference_check_sched_domain(this_rq()->sd);  	if (!sd || sd->nohz_idle)  		goto unlock; @@ -5848,7 +5883,7 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p)  		se->vruntime -= cfs_rq->min_vruntime;  	} -#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP) +#ifdef CONFIG_SMP  	/*  	* Remove our load from contribution when we leave sched_fair  	* and ensure we don't carry in an old decay_count if we @@ -5907,9 +5942,9 @@ void init_cfs_rq(struct cfs_rq *cfs_rq)  #ifndef CONFIG_64BIT  	cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;  #endif -#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP) +#ifdef CONFIG_SMP  	atomic64_set(&cfs_rq->decay_counter, 1); -	atomic64_set(&cfs_rq->removed_load, 0); +	atomic_long_set(&cfs_rq->removed_load, 0);  #endif  } @@ -6091,6 +6126,9 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)  		se = tg->se[i];  		/* Propagate contribution to hierarchy */  		raw_spin_lock_irqsave(&rq->lock, flags); + +		/* Possible calls to update_curr() need rq clock */ +		update_rq_clock(rq);  		for_each_sched_entity(se)  			update_cfs_shares(group_cfs_rq(se));  		raw_spin_unlock_irqrestore(&rq->lock, flags); @@ -6146,9 +6184,8 @@ const struct sched_class fair_sched_class = {  #ifdef CONFIG_SMP  	.select_task_rq		= select_task_rq_fair, -#ifdef CONFIG_FAIR_GROUP_SCHED  	
.migrate_task_rq	= migrate_task_rq_fair, -#endif +  	.rq_online		= rq_online_fair,  	.rq_offline		= rq_offline_fair, diff --git a/kernel/sched/proc.c b/kernel/sched/proc.c new file mode 100644 index 000000000000..16f5a30f9c88 --- /dev/null +++ b/kernel/sched/proc.c @@ -0,0 +1,591 @@ +/* + *  kernel/sched/proc.c + * + *  Kernel load calculations, forked from sched/core.c + */ + +#include <linux/export.h> + +#include "sched.h" + +unsigned long this_cpu_load(void) +{ +	struct rq *this = this_rq(); +	return this->cpu_load[0]; +} + + +/* + * Global load-average calculations + * + * We take a distributed and async approach to calculating the global load-avg + * in order to minimize overhead. + * + * The global load average is an exponentially decaying average of nr_running + + * nr_uninterruptible. + * + * Once every LOAD_FREQ: + * + *   nr_active = 0; + *   for_each_possible_cpu(cpu) + *	nr_active += cpu_of(cpu)->nr_running + cpu_of(cpu)->nr_uninterruptible; + * + *   avenrun[n] = avenrun[0] * exp_n + nr_active * (1 - exp_n) + * + * Due to a number of reasons the above turns in the mess below: + * + *  - for_each_possible_cpu() is prohibitively expensive on machines with + *    serious number of cpus, therefore we need to take a distributed approach + *    to calculating nr_active. + * + *        \Sum_i x_i(t) = \Sum_i x_i(t) - x_i(t_0) | x_i(t_0) := 0 + *                      = \Sum_i { \Sum_j=1 x_i(t_j) - x_i(t_j-1) } + * + *    So assuming nr_active := 0 when we start out -- true per definition, we + *    can simply take per-cpu deltas and fold those into a global accumulate + *    to obtain the same result. See calc_load_fold_active(). + * + *    Furthermore, in order to avoid synchronizing all per-cpu delta folding + *    across the machine, we assume 10 ticks is sufficient time for every + *    cpu to have completed this task. + * + *    This places an upper-bound on the IRQ-off latency of the machine. 
Then + *    again, being late doesn't lose the delta, just wrecks the sample. + * + *  - cpu_rq()->nr_uninterruptible isn't accurately tracked per-cpu because + *    this would add another cross-cpu cacheline miss and atomic operation + *    to the wakeup path. Instead we increment on whatever cpu the task ran + *    when it went into uninterruptible state and decrement on whatever cpu + *    did the wakeup. This means that only the sum of nr_uninterruptible over + *    all cpus yields the correct result. + * + *  This covers the NO_HZ=n code, for extra head-aches, see the comment below. + */ + +/* Variables and functions for calc_load */ +atomic_long_t calc_load_tasks; +unsigned long calc_load_update; +unsigned long avenrun[3]; +EXPORT_SYMBOL(avenrun); /* should be removed */ + +/** + * get_avenrun - get the load average array + * @loads:	pointer to dest load array + * @offset:	offset to add + * @shift:	shift count to shift the result left + * + * These values are estimates at best, so no need for locking. + */ +void get_avenrun(unsigned long *loads, unsigned long offset, int shift) +{ +	loads[0] = (avenrun[0] + offset) << shift; +	loads[1] = (avenrun[1] + offset) << shift; +	loads[2] = (avenrun[2] + offset) << shift; +} + +long calc_load_fold_active(struct rq *this_rq) +{ +	long nr_active, delta = 0; + +	nr_active = this_rq->nr_running; +	nr_active += (long) this_rq->nr_uninterruptible; + +	if (nr_active != this_rq->calc_load_active) { +		delta = nr_active - this_rq->calc_load_active; +		this_rq->calc_load_active = nr_active; +	} + +	return delta; +} + +/* + * a1 = a0 * e + a * (1 - e) + */ +static unsigned long +calc_load(unsigned long load, unsigned long exp, unsigned long active) +{ +	load *= exp; +	load += active * (FIXED_1 - exp); +	load += 1UL << (FSHIFT - 1); +	return load >> FSHIFT; +} + +#ifdef CONFIG_NO_HZ_COMMON +/* + * Handle NO_HZ for the global load-average.
+ * + * Since the above described distributed algorithm to compute the global + * load-average relies on per-cpu sampling from the tick, it is affected by + * NO_HZ. + * + * The basic idea is to fold the nr_active delta into a global idle-delta upon + * entering NO_HZ state such that we can include this as an 'extra' cpu delta + * when we read the global state. + * + * Obviously reality has to ruin such a delightfully simple scheme: + * + *  - When we go NO_HZ idle during the window, we can negate our sample + *    contribution, causing under-accounting. + * + *    We avoid this by keeping two idle-delta counters and flipping them + *    when the window starts, thus separating old and new NO_HZ load. + * + *    The only trick is the slight shift in index flip for read vs write. + * + *        0s            5s            10s           15s + *          +10           +10           +10           +10 + *        |-|-----------|-|-----------|-|-----------|-| + *    r:0 0 1           1 0           0 1           1 0 + *    w:0 1 1           0 0           1 1           0 0 + * + *    This ensures we'll fold the old idle contribution in this window while + *    accumulating the new one. + * + *  - When we wake up from NO_HZ idle during the window, we push up our + *    contribution, since we effectively move our sample point to a known + *    busy state. + * + *    This is solved by pushing the window forward, and thus skipping the + *    sample, for this cpu (effectively using the idle-delta for this cpu which + *    was in effect at the time the window opened). This also solves the issue + *    of having to deal with a cpu having been in NOHZ idle for multiple + *    LOAD_FREQ intervals. + * + * When making the ILB scale, we should try to pull this in as well.
+ */ +static atomic_long_t calc_load_idle[2]; +static int calc_load_idx; + +static inline int calc_load_write_idx(void) +{ +	int idx = calc_load_idx; + +	/* +	 * See calc_global_nohz(), if we observe the new index, we also +	 * need to observe the new update time. +	 */ +	smp_rmb(); + +	/* +	 * If the folding window started, make sure we start writing in the +	 * next idle-delta. +	 */ +	if (!time_before(jiffies, calc_load_update)) +		idx++; + +	return idx & 1; +} + +static inline int calc_load_read_idx(void) +{ +	return calc_load_idx & 1; +} + +void calc_load_enter_idle(void) +{ +	struct rq *this_rq = this_rq(); +	long delta; + +	/* +	 * We're going into NOHZ mode, if there's any pending delta, fold it +	 * into the pending idle delta. +	 */ +	delta = calc_load_fold_active(this_rq); +	if (delta) { +		int idx = calc_load_write_idx(); +		atomic_long_add(delta, &calc_load_idle[idx]); +	} +} + +void calc_load_exit_idle(void) +{ +	struct rq *this_rq = this_rq(); + +	/* +	 * If we're still before the sample window, we're done. +	 */ +	if (time_before(jiffies, this_rq->calc_load_update)) +		return; + +	/* +	 * We woke inside or after the sample window, this means we're already +	 * accounted through the nohz accounting, so skip the entire deal and +	 * sync up for the next window. +	 */ +	this_rq->calc_load_update = calc_load_update; +	if (time_before(jiffies, this_rq->calc_load_update + 10)) +		this_rq->calc_load_update += LOAD_FREQ; +} + +static long calc_load_fold_idle(void) +{ +	int idx = calc_load_read_idx(); +	long delta = 0; + +	if (atomic_long_read(&calc_load_idle[idx])) +		delta = atomic_long_xchg(&calc_load_idle[idx], 0); + +	return delta; +} + +/** + * fixed_power_int - compute: x^n, in O(log n) time + * + * @x:         base of the power + * @frac_bits: fractional bits of @x + * @n:         power to raise @x to. 
+ * + * By exploiting the relation between the definition of the natural power + * function: x^n := x*x*...*x (x multiplied by itself for n times), and + * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i, + * (where: n_i \elem {0, 1}, the binary vector representing n), + * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is + * of course trivially computable in O(log_2 n), the length of our binary + * vector. + */ +static unsigned long +fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n) +{ +	unsigned long result = 1UL << frac_bits; + +	if (n) for (;;) { +		if (n & 1) { +			result *= x; +			result += 1UL << (frac_bits - 1); +			result >>= frac_bits; +		} +		n >>= 1; +		if (!n) +			break; +		x *= x; +		x += 1UL << (frac_bits - 1); +		x >>= frac_bits; +	} + +	return result; +} + +/* + * a1 = a0 * e + a * (1 - e) + * + * a2 = a1 * e + a * (1 - e) + *    = (a0 * e + a * (1 - e)) * e + a * (1 - e) + *    = a0 * e^2 + a * (1 - e) * (1 + e) + * + * a3 = a2 * e + a * (1 - e) + *    = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e) + *    = a0 * e^3 + a * (1 - e) * (1 + e + e^2) + * + *  ... + * + * an = a0 * e^n + a * (1 - e) * (1 + e + ... + e^n-1) [1] + *    = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e) + *    = a0 * e^n + a * (1 - e^n) + * + * [1] application of the geometric series: + * + *              n         1 - x^(n+1) + *     S_n := \Sum x^i = ------------- + *             i=0          1 - x + */ +static unsigned long +calc_load_n(unsigned long load, unsigned long exp, +	    unsigned long active, unsigned int n) +{ + +	return calc_load(load, fixed_power_int(exp, FSHIFT, n), active); +} + +/* + * NO_HZ can leave us missing all per-cpu ticks calling + * calc_load_account_active(), but since an idle CPU folds its delta into + * calc_load_idle[] per calc_load_enter_idle(), all we need to do is fold + * in the pending idle delta if our idle period crossed a load cycle boundary.
+ * + * Once we've updated the global active value, we need to apply the exponential + * weights adjusted to the number of cycles missed. + */ +static void calc_global_nohz(void) +{ +	long delta, active, n; + +	if (!time_before(jiffies, calc_load_update + 10)) { +		/* +		 * Catch-up, fold however many we are behind still +		 */ +		delta = jiffies - calc_load_update - 10; +		n = 1 + (delta / LOAD_FREQ); + +		active = atomic_long_read(&calc_load_tasks); +		active = active > 0 ? active * FIXED_1 : 0; + +		avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n); +		avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n); +		avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n); + +		calc_load_update += n * LOAD_FREQ; +	} + +	/* +	 * Flip the idle index... +	 * +	 * Make sure we first write the new time then flip the index, so that +	 * calc_load_write_idx() will see the new time when it reads the new +	 * index, this avoids a double flip messing things up. +	 */ +	smp_wmb(); +	calc_load_idx++; +} +#else /* !CONFIG_NO_HZ_COMMON */ + +static inline long calc_load_fold_idle(void) { return 0; } +static inline void calc_global_nohz(void) { } + +#endif /* CONFIG_NO_HZ_COMMON */ + +/* + * calc_load - update the avenrun load estimates 10 ticks after the + * CPUs have updated calc_load_tasks. + */ +void calc_global_load(unsigned long ticks) +{ +	long active, delta; + +	if (time_before(jiffies, calc_load_update + 10)) +		return; + +	/* +	 * Fold the 'old' idle-delta to include all NO_HZ cpus. +	 */ +	delta = calc_load_fold_idle(); +	if (delta) +		atomic_long_add(delta, &calc_load_tasks); + +	active = atomic_long_read(&calc_load_tasks); +	active = active > 0 ? active * FIXED_1 : 0; + +	avenrun[0] = calc_load(avenrun[0], EXP_1, active); +	avenrun[1] = calc_load(avenrun[1], EXP_5, active); +	avenrun[2] = calc_load(avenrun[2], EXP_15, active); + +	calc_load_update += LOAD_FREQ; + +	/* +	 * In case we idled for multiple LOAD_FREQ intervals, catch up in bulk. 
+	 */ +	calc_global_nohz(); +} + +/* + * Called from update_cpu_load() to periodically update this CPU's + * active count. + */ +static void calc_load_account_active(struct rq *this_rq) +{ +	long delta; + +	if (time_before(jiffies, this_rq->calc_load_update)) +		return; + +	delta  = calc_load_fold_active(this_rq); +	if (delta) +		atomic_long_add(delta, &calc_load_tasks); + +	this_rq->calc_load_update += LOAD_FREQ; +} + +/* + * End of global load-average stuff + */ + +/* + * The exact cpuload at various idx values, calculated at every tick would be + * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load + * + * If a cpu misses updates for n-1 ticks (as it was idle) and update gets called + * on nth tick when cpu may be busy, then we have: + * load = ((2^idx - 1) / 2^idx)^(n-1) * load + * load = (2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load + * + * decay_load_missed() below does efficient calculation of + * load = ((2^idx - 1) / 2^idx)^(n-1) * load + * avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load + * + * The calculation is approximated on a 128 point scale. + * degrade_zero_ticks is the number of ticks after which load at any + * particular idx is approximated to be zero. + * degrade_factor is a precomputed table, a row for each load idx. + * Each column corresponds to degradation factor for a power of two ticks, + * based on 128 point scale. + * Example: + * row 2, col 3 (=12) says that the degradation at load idx 2 after + * 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8). + * + * With this power of 2 load factors, we can degrade the load n times + * by looking at 1 bits in n and doing as many mult/shift instead of + * n mult/shifts needed by the exact degradation. 
+ */ +#define DEGRADE_SHIFT		7 +static const unsigned char +		degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128}; +static const unsigned char +		degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = { +					{0, 0, 0, 0, 0, 0, 0, 0}, +					{64, 32, 8, 0, 0, 0, 0, 0}, +					{96, 72, 40, 12, 1, 0, 0}, +					{112, 98, 75, 43, 15, 1, 0}, +					{120, 112, 98, 76, 45, 16, 2} }; + +/* + * Update cpu_load for any missed ticks, due to tickless idle. The backlog + * would be when CPU is idle and so we just decay the old load without + * adding any new load. + */ +static unsigned long +decay_load_missed(unsigned long load, unsigned long missed_updates, int idx) +{ +	int j = 0; + +	if (!missed_updates) +		return load; + +	if (missed_updates >= degrade_zero_ticks[idx]) +		return 0; + +	if (idx == 1) +		return load >> missed_updates; + +	while (missed_updates) { +		if (missed_updates % 2) +			load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT; + +		missed_updates >>= 1; +		j++; +	} +	return load; +} + +/* + * Update rq->cpu_load[] statistics. This function is usually called every + * scheduler tick (TICK_NSEC). With tickless idle this will not be called + * every tick. We fix it up based on jiffies. + */ +static void __update_cpu_load(struct rq *this_rq, unsigned long this_load, +			      unsigned long pending_updates) +{ +	int i, scale; + +	this_rq->nr_load_updates++; + +	/* Update our load: */ +	this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */ +	for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) { +		unsigned long old_load, new_load; + +		/* scale is effectively 1 << i now, and >> i divides by scale */ + +		old_load = this_rq->cpu_load[i]; +		old_load = decay_load_missed(old_load, pending_updates - 1, i); +		new_load = this_load; +		/* +		 * Round up the averaging division if load is increasing. This +		 * prevents us from getting stuck on 9 if the load is 10, for +		 * example. 
+		 */ +		if (new_load > old_load) +			new_load += scale - 1; + +		this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i; +	} + +	sched_avg_update(this_rq); +} + +#ifdef CONFIG_SMP +static inline unsigned long get_rq_runnable_load(struct rq *rq) +{ +	return rq->cfs.runnable_load_avg; +} +#else +static inline unsigned long get_rq_runnable_load(struct rq *rq) +{ +	return rq->load.weight; +} +#endif + +#ifdef CONFIG_NO_HZ_COMMON +/* + * There is no sane way to deal with nohz on smp when using jiffies because the + * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading + * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}. + * + * Therefore we cannot use the delta approach from the regular tick since that + * would seriously skew the load calculation. However we'll make do for those + * updates happening while idle (nohz_idle_balance) or coming out of idle + * (tick_nohz_idle_exit). + * + * This means we might still be one tick off for nohz periods. + */ + +/* + * Called from nohz_idle_balance() to update the load ratings before doing the + * idle balance. + */ +void update_idle_cpu_load(struct rq *this_rq) +{ +	unsigned long curr_jiffies = ACCESS_ONCE(jiffies); +	unsigned long load = get_rq_runnable_load(this_rq); +	unsigned long pending_updates; + +	/* +	 * bail if there's load or we're actually up-to-date. +	 */ +	if (load || curr_jiffies == this_rq->last_load_update_tick) +		return; + +	pending_updates = curr_jiffies - this_rq->last_load_update_tick; +	this_rq->last_load_update_tick = curr_jiffies; + +	__update_cpu_load(this_rq, load, pending_updates); +} + +/* + * Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed. 
+ */ +void update_cpu_load_nohz(void) +{ +	struct rq *this_rq = this_rq(); +	unsigned long curr_jiffies = ACCESS_ONCE(jiffies); +	unsigned long pending_updates; + +	if (curr_jiffies == this_rq->last_load_update_tick) +		return; + +	raw_spin_lock(&this_rq->lock); +	pending_updates = curr_jiffies - this_rq->last_load_update_tick; +	if (pending_updates) { +		this_rq->last_load_update_tick = curr_jiffies; +		/* +		 * We were idle, this means load 0, the current load might be +		 * !0 due to remote wakeups and the sort. +		 */ +		__update_cpu_load(this_rq, 0, pending_updates); +	} +	raw_spin_unlock(&this_rq->lock); +} +#endif /* CONFIG_NO_HZ */ + +/* + * Called from scheduler_tick() + */ +void update_cpu_load_active(struct rq *this_rq) +{ +	unsigned long load = get_rq_runnable_load(this_rq); +	/* +	 * See the mess around update_idle_cpu_load() / update_cpu_load_nohz(). +	 */ +	this_rq->last_load_update_tick = jiffies; +	__update_cpu_load(this_rq, load, 1); + +	calc_load_account_active(this_rq); +} diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 127a2c4cf4ab..01970c8e64df 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -399,20 +399,6 @@ static inline struct task_group *next_task_group(struct task_group *tg)  		(iter = next_task_group(iter)) &&			\  		(rt_rq = iter->rt_rq[cpu_of(rq)]);) -static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq) -{ -	list_add_rcu(&rt_rq->leaf_rt_rq_list, -			&rq_of_rt_rq(rt_rq)->leaf_rt_rq_list); -} - -static inline void list_del_leaf_rt_rq(struct rt_rq *rt_rq) -{ -	list_del_rcu(&rt_rq->leaf_rt_rq_list); -} - -#define for_each_leaf_rt_rq(rt_rq, rq) \ -	list_for_each_entry_rcu(rt_rq, &rq->leaf_rt_rq_list, leaf_rt_rq_list) -  #define for_each_sched_rt_entity(rt_se) \  	for (; rt_se; rt_se = rt_se->parent) @@ -472,7 +458,7 @@ static int rt_se_boosted(struct sched_rt_entity *rt_se)  #ifdef CONFIG_SMP  static inline const struct cpumask *sched_rt_period_mask(void)  { -	return cpu_rq(smp_processor_id())->rd->span; +	return 
this_rq()->rd->span;  }  #else  static inline const struct cpumask *sched_rt_period_mask(void) @@ -509,17 +495,6 @@ typedef struct rt_rq *rt_rq_iter_t;  #define for_each_rt_rq(rt_rq, iter, rq) \  	for ((void) iter, rt_rq = &rq->rt; rt_rq; rt_rq = NULL) -static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq) -{ -} - -static inline void list_del_leaf_rt_rq(struct rt_rq *rt_rq) -{ -} - -#define for_each_leaf_rt_rq(rt_rq, rq) \ -	for (rt_rq = &rq->rt; rt_rq; rt_rq = NULL) -  #define for_each_sched_rt_entity(rt_se) \  	for (; rt_se; rt_se = NULL) @@ -699,15 +674,6 @@ balanced:  	}  } -static void disable_runtime(struct rq *rq) -{ -	unsigned long flags; - -	raw_spin_lock_irqsave(&rq->lock, flags); -	__disable_runtime(rq); -	raw_spin_unlock_irqrestore(&rq->lock, flags); -} -  static void __enable_runtime(struct rq *rq)  {  	rt_rq_iter_t iter; @@ -732,37 +698,6 @@ static void __enable_runtime(struct rq *rq)  	}  } -static void enable_runtime(struct rq *rq) -{ -	unsigned long flags; - -	raw_spin_lock_irqsave(&rq->lock, flags); -	__enable_runtime(rq); -	raw_spin_unlock_irqrestore(&rq->lock, flags); -} - -int update_runtime(struct notifier_block *nfb, unsigned long action, void *hcpu) -{ -	int cpu = (int)(long)hcpu; - -	switch (action) { -	case CPU_DOWN_PREPARE: -	case CPU_DOWN_PREPARE_FROZEN: -		disable_runtime(cpu_rq(cpu)); -		return NOTIFY_OK; - -	case CPU_DOWN_FAILED: -	case CPU_DOWN_FAILED_FROZEN: -	case CPU_ONLINE: -	case CPU_ONLINE_FROZEN: -		enable_runtime(cpu_rq(cpu)); -		return NOTIFY_OK; - -	default: -		return NOTIFY_DONE; -	} -} -  static int balance_runtime(struct rt_rq *rt_rq)  {  	int more = 0; @@ -926,7 +861,7 @@ static void update_curr_rt(struct rq *rq)  	if (curr->sched_class != &rt_sched_class)  		return; -	delta_exec = rq->clock_task - curr->se.exec_start; +	delta_exec = rq_clock_task(rq) - curr->se.exec_start;  	if (unlikely((s64)delta_exec <= 0))  		return; @@ -936,7 +871,7 @@ static void update_curr_rt(struct rq *rq)  	curr->se.sum_exec_runtime += 
delta_exec;  	account_group_exec_runtime(curr, delta_exec); -	curr->se.exec_start = rq->clock_task; +	curr->se.exec_start = rq_clock_task(rq);  	cpuacct_charge(curr, delta_exec);  	sched_rt_avg_update(rq, delta_exec); @@ -1106,9 +1041,6 @@ static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head)  	if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running))  		return; -	if (!rt_rq->rt_nr_running) -		list_add_leaf_rt_rq(rt_rq); -  	if (head)  		list_add(&rt_se->run_list, queue);  	else @@ -1128,8 +1060,6 @@ static void __dequeue_rt_entity(struct sched_rt_entity *rt_se)  		__clear_bit(rt_se_prio(rt_se), array->bitmap);  	dec_rt_tasks(rt_se, rt_rq); -	if (!rt_rq->rt_nr_running) -		list_del_leaf_rt_rq(rt_rq);  }  /* @@ -1385,7 +1315,7 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq)  	} while (rt_rq);  	p = rt_task_of(rt_se); -	p->se.exec_start = rq->clock_task; +	p->se.exec_start = rq_clock_task(rq);  	return p;  } @@ -1434,42 +1364,24 @@ static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)  	return 0;  } -/* Return the second highest RT task, NULL otherwise */ -static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu) +/* + * Return the highest pushable rq's task, which is suitable to be executed + * on the cpu, NULL otherwise + */ +static struct task_struct *pick_highest_pushable_task(struct rq *rq, int cpu)  { -	struct task_struct *next = NULL; -	struct sched_rt_entity *rt_se; -	struct rt_prio_array *array; -	struct rt_rq *rt_rq; -	int idx; - -	for_each_leaf_rt_rq(rt_rq, rq) { -		array = &rt_rq->active; -		idx = sched_find_first_bit(array->bitmap); -next_idx: -		if (idx >= MAX_RT_PRIO) -			continue; -		if (next && next->prio <= idx) -			continue; -		list_for_each_entry(rt_se, array->queue + idx, run_list) { -			struct task_struct *p; +	struct plist_head *head = &rq->rt.pushable_tasks; +	struct task_struct *p; -			if (!rt_entity_is_task(rt_se)) -				continue; +	if 
(!has_pushable_tasks(rq)) +		return NULL; -			p = rt_task_of(rt_se); -			if (pick_rt_task(rq, p, cpu)) { -				next = p; -				break; -			} -		} -		if (!next) { -			idx = find_next_bit(array->bitmap, MAX_RT_PRIO, idx+1); -			goto next_idx; -		} +	plist_for_each_entry(p, head, pushable_tasks) { +		if (pick_rt_task(rq, p, cpu)) +			return p;  	} -	return next; +	return NULL;  }  static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask); @@ -1743,12 +1655,10 @@ static int pull_rt_task(struct rq *this_rq)  		double_lock_balance(this_rq, src_rq);  		/* -		 * Are there still pullable RT tasks? +		 * We can pull only a task, which is pushable +		 * on its rq, and no others.  		 */ -		if (src_rq->rt.rt_nr_running <= 1) -			goto skip; - -		p = pick_next_highest_task_rt(src_rq, this_cpu); +		p = pick_highest_pushable_task(src_rq, this_cpu);  		/*  		 * Do we have an RT task that preempts @@ -2037,7 +1947,7 @@ static void set_curr_task_rt(struct rq *rq)  {  	struct task_struct *p = rq->curr; -	p->se.exec_start = rq->clock_task; +	p->se.exec_start = rq_clock_task(rq);  	/* The running task is never eligible for pushing */  	dequeue_pushable_task(rq, p); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index ce39224d6155..ef0a7b2439dd 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -10,8 +10,16 @@  #include "cpupri.h"  #include "cpuacct.h" +struct rq; +  extern __read_mostly int scheduler_running; +extern unsigned long calc_load_update; +extern atomic_long_t calc_load_tasks; + +extern long calc_load_fold_active(struct rq *this_rq); +extern void update_cpu_load_active(struct rq *this_rq); +  /*   * Convert user-nice values [ -20 ... 0 ... 
19 ]   * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], @@ -140,10 +148,11 @@ struct task_group {  	struct cfs_rq **cfs_rq;  	unsigned long shares; -	atomic_t load_weight; -	atomic64_t load_avg; +#ifdef	CONFIG_SMP +	atomic_long_t load_avg;  	atomic_t runnable_avg;  #endif +#endif  #ifdef CONFIG_RT_GROUP_SCHED  	struct sched_rt_entity **rt_se; @@ -261,26 +270,21 @@ struct cfs_rq {  #endif  #ifdef CONFIG_SMP -/* - * Load-tracking only depends on SMP, FAIR_GROUP_SCHED dependency below may be - * removed when useful for applications beyond shares distribution (e.g. - * load-balance). - */ -#ifdef CONFIG_FAIR_GROUP_SCHED  	/*  	 * CFS Load tracking  	 * Under CFS, load is tracked on a per-entity basis and aggregated up.  	 * This allows for the description of both thread and group usage (in  	 * the FAIR_GROUP_SCHED case).  	 */ -	u64 runnable_load_avg, blocked_load_avg; -	atomic64_t decay_counter, removed_load; +	unsigned long runnable_load_avg, blocked_load_avg; +	atomic64_t decay_counter;  	u64 last_decay; -#endif /* CONFIG_FAIR_GROUP_SCHED */ -/* These always depend on CONFIG_FAIR_GROUP_SCHED */ +	atomic_long_t removed_load; +  #ifdef CONFIG_FAIR_GROUP_SCHED +	/* Required to track per-cpu representation of a task_group */  	u32 tg_runnable_contrib; -	u64 tg_load_contrib; +	unsigned long tg_load_contrib;  #endif /* CONFIG_FAIR_GROUP_SCHED */  	/* @@ -353,7 +357,6 @@ struct rt_rq {  	unsigned long rt_nr_boosted;  	struct rq *rq; -	struct list_head leaf_rt_rq_list;  	struct task_group *tg;  #endif  }; @@ -540,6 +543,16 @@ DECLARE_PER_CPU(struct rq, runqueues);  #define cpu_curr(cpu)		(cpu_rq(cpu)->curr)  #define raw_rq()		(&__raw_get_cpu_var(runqueues)) +static inline u64 rq_clock(struct rq *rq) +{ +	return rq->clock; +} + +static inline u64 rq_clock_task(struct rq *rq) +{ +	return rq->clock_task; +} +  #ifdef CONFIG_SMP  #define rcu_dereference_check_sched_domain(p) \ @@ -884,24 +897,6 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)  
#define WF_FORK		0x02		/* child wakeup after fork */  #define WF_MIGRATED	0x4		/* internal use, task got migrated */ -static inline void update_load_add(struct load_weight *lw, unsigned long inc) -{ -	lw->weight += inc; -	lw->inv_weight = 0; -} - -static inline void update_load_sub(struct load_weight *lw, unsigned long dec) -{ -	lw->weight -= dec; -	lw->inv_weight = 0; -} - -static inline void update_load_set(struct load_weight *lw, unsigned long w) -{ -	lw->weight = w; -	lw->inv_weight = 0; -} -  /*   * To aid in avoiding the subversion of "niceness" due to uneven distribution   * of tasks with abnormal "nice" values across CPUs the contribution that @@ -1028,17 +1023,8 @@ extern void update_group_power(struct sched_domain *sd, int cpu);  extern void trigger_load_balance(struct rq *rq, int cpu);  extern void idle_balance(int this_cpu, struct rq *this_rq); -/* - * Only depends on SMP, FAIR_GROUP_SCHED may be removed when runnable_avg - * becomes useful in lb - */ -#if defined(CONFIG_FAIR_GROUP_SCHED)  extern void idle_enter_fair(struct rq *this_rq);  extern void idle_exit_fair(struct rq *this_rq); -#else -static inline void idle_enter_fair(struct rq *this_rq) {} -static inline void idle_exit_fair(struct rq *this_rq) {} -#endif  #else	/* CONFIG_SMP */ @@ -1051,7 +1037,6 @@ static inline void idle_balance(int cpu, struct rq *rq)  extern void sysrq_sched_debug_show(void);  extern void sched_init_granularity(void);  extern void update_max_interval(void); -extern int update_runtime(struct notifier_block *nfb, unsigned long action, void *hcpu);  extern void init_sched_rt_class(void);  extern void init_sched_fair_class(void); @@ -1063,6 +1048,8 @@ extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime  extern void update_idle_cpu_load(struct rq *this_rq); +extern void init_task_runnable_average(struct task_struct *p); +  #ifdef CONFIG_PARAVIRT  static inline u64 steal_ticks(u64 steal)  { diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h 
index 2ef90a51ec5e..17d7065c3872 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h @@ -61,7 +61,7 @@ static inline void sched_info_reset_dequeued(struct task_struct *t)   */  static inline void sched_info_dequeued(struct task_struct *t)  { -	unsigned long long now = task_rq(t)->clock, delta = 0; +	unsigned long long now = rq_clock(task_rq(t)), delta = 0;  	if (unlikely(sched_info_on()))  		if (t->sched_info.last_queued) @@ -79,7 +79,7 @@ static inline void sched_info_dequeued(struct task_struct *t)   */  static void sched_info_arrive(struct task_struct *t)  { -	unsigned long long now = task_rq(t)->clock, delta = 0; +	unsigned long long now = rq_clock(task_rq(t)), delta = 0;  	if (t->sched_info.last_queued)  		delta = now - t->sched_info.last_queued; @@ -100,7 +100,7 @@ static inline void sched_info_queued(struct task_struct *t)  {  	if (unlikely(sched_info_on()))  		if (!t->sched_info.last_queued) -			t->sched_info.last_queued = task_rq(t)->clock; +			t->sched_info.last_queued = rq_clock(task_rq(t));  }  /* @@ -112,7 +112,7 @@ static inline void sched_info_queued(struct task_struct *t)   */  static inline void sched_info_depart(struct task_struct *t)  { -	unsigned long long delta = task_rq(t)->clock - +	unsigned long long delta = rq_clock(task_rq(t)) -  					t->sched_info.last_arrival;  	rq_sched_info_depart(task_rq(t), delta); diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index da5eb5bed84a..e08fbeeb54b9 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c @@ -28,7 +28,7 @@ static struct task_struct *pick_next_task_stop(struct rq *rq)  	struct task_struct *stop = rq->stop;  	if (stop && stop->on_rq) { -		stop->se.exec_start = rq->clock_task; +		stop->se.exec_start = rq_clock_task(rq);  		return stop;  	} @@ -57,7 +57,7 @@ static void put_prev_task_stop(struct rq *rq, struct task_struct *prev)  	struct task_struct *curr = rq->curr;  	u64 delta_exec; -	delta_exec = rq->clock_task - curr->se.exec_start; +	delta_exec = 
rq_clock_task(rq) - curr->se.exec_start;  	if (unlikely((s64)delta_exec < 0))  		delta_exec = 0; @@ -67,7 +67,7 @@ static void put_prev_task_stop(struct rq *rq, struct task_struct *prev)  	curr->se.sum_exec_runtime += delta_exec;  	account_group_exec_runtime(curr, delta_exec); -	curr->se.exec_start = rq->clock_task; +	curr->se.exec_start = rq_clock_task(rq);  	cpuacct_charge(curr, delta_exec);  } @@ -79,7 +79,7 @@ static void set_curr_task_stop(struct rq *rq)  {  	struct task_struct *stop = rq->stop; -	stop->se.exec_start = rq->clock_task; +	stop->se.exec_start = rq_clock_task(rq);  }  static void switched_to_stop(struct rq *rq, struct task_struct *p) diff --git a/kernel/signal.c b/kernel/signal.c index 113411bfe8b1..50e41075ac77 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2848,7 +2848,7 @@ int do_sigtimedwait(const sigset_t *which, siginfo_t *info,  		recalc_sigpending();  		spin_unlock_irq(&tsk->sighand->siglock); -		timeout = schedule_timeout_interruptible(timeout); +		timeout = freezable_schedule_timeout_interruptible(timeout);  		spin_lock_irq(&tsk->sighand->siglock);  		__set_task_blocked(tsk, &tsk->real_blocked); diff --git a/kernel/softirq.c b/kernel/softirq.c index 3d6833f125d3..ca25e6e704a2 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -127,8 +127,7 @@ static inline void __local_bh_disable(unsigned long ip, unsigned int cnt)  void local_bh_disable(void)  { -	__local_bh_disable((unsigned long)__builtin_return_address(0), -				SOFTIRQ_DISABLE_OFFSET); +	__local_bh_disable(_RET_IP_, SOFTIRQ_DISABLE_OFFSET);  }  EXPORT_SYMBOL(local_bh_disable); @@ -139,7 +138,7 @@ static void __local_bh_enable(unsigned int cnt)  	WARN_ON_ONCE(!irqs_disabled());  	if (softirq_count() == cnt) -		trace_softirqs_on((unsigned long)__builtin_return_address(0)); +		trace_softirqs_on(_RET_IP_);  	sub_preempt_count(cnt);  } @@ -184,7 +183,7 @@ static inline void _local_bh_enable_ip(unsigned long ip)  void local_bh_enable(void)  { -	_local_bh_enable_ip((unsigned 
long)__builtin_return_address(0)); +	_local_bh_enable_ip(_RET_IP_);  }  EXPORT_SYMBOL(local_bh_enable); @@ -229,8 +228,7 @@ asmlinkage void __do_softirq(void)  	pending = local_softirq_pending();  	account_irq_enter_time(current); -	__local_bh_disable((unsigned long)__builtin_return_address(0), -				SOFTIRQ_OFFSET); +	__local_bh_disable(_RET_IP_, SOFTIRQ_OFFSET);  	lockdep_softirq_enter();  	cpu = smp_processor_id(); diff --git a/kernel/sys.c b/kernel/sys.c index 2bbd9a73b54c..071de900c824 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -511,7 +511,7 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd,  	case LINUX_REBOOT_CMD_HALT:  		kernel_halt();  		do_exit(0); -		panic("cannot halt"); +		panic("cannot halt.\n");  	case LINUX_REBOOT_CMD_POWER_OFF:  		kernel_power_off(); @@ -1309,6 +1309,17 @@ out:  	return retval;  } +static void set_special_pids(struct pid *pid) +{ +	struct task_struct *curr = current->group_leader; + +	if (task_session(curr) != pid) +		change_pid(curr, PIDTYPE_SID, pid); + +	if (task_pgrp(curr) != pid) +		change_pid(curr, PIDTYPE_PGID, pid); +} +  SYSCALL_DEFINE0(setsid)  {  	struct task_struct *group_leader = current->group_leader; @@ -1328,7 +1339,7 @@ SYSCALL_DEFINE0(setsid)  		goto out;  	group_leader->signal->leader = 1; -	__set_special_pids(sid); +	set_special_pids(sid);  	proc_clear_tty(group_leader); @@ -2355,8 +2366,7 @@ static int do_sysinfo(struct sysinfo *info)  	memset(info, 0, sizeof(struct sysinfo)); -	ktime_get_ts(&tp); -	monotonic_to_bootbased(&tp); +	get_monotonic_boottime(&tp);  	info->uptime = tp.tv_sec + (tp.tv_nsec ? 
1 : 0);  	get_avenrun(info->loads, 0, SI_LOAD_SHIFT - FSHIFT); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 9edcf456e0fc..4ce13c3cedb9 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -120,7 +120,6 @@ extern int blk_iopoll_enabled;  /* Constants used for minimum and  maximum */  #ifdef CONFIG_LOCKUP_DETECTOR  static int sixty = 60; -static int neg_one = -1;  #endif  static int zero; @@ -814,7 +813,7 @@ static struct ctl_table kern_table[] = {  		.maxlen		= sizeof(int),  		.mode		= 0644,  		.proc_handler	= proc_dowatchdog, -		.extra1		= &neg_one, +		.extra1		= &zero,  		.extra2		= &sixty,  	},  	{ @@ -1044,6 +1043,15 @@ static struct ctl_table kern_table[] = {  		.mode		= 0644,  		.proc_handler	= perf_proc_update_handler,  	}, +	{ +		.procname	= "perf_cpu_time_max_percent", +		.data		= &sysctl_perf_cpu_time_max_percent, +		.maxlen		= sizeof(sysctl_perf_cpu_time_max_percent), +		.mode		= 0644, +		.proc_handler	= perf_cpu_time_max_percent_handler, +		.extra1		= &zero, +		.extra2		= &one_hundred, +	},  #endif  #ifdef CONFIG_KMEMCHECK  	{ diff --git a/kernel/time.c b/kernel/time.c index d3617dbd3dca..7c7964c33ae7 100644 --- a/kernel/time.c +++ b/kernel/time.c @@ -11,7 +11,7 @@   * Modification history kernel/time.c   *   * 1993-09-02    Philip Gladstone - *      Created file with time related functions from sched.c and adjtimex() + *      Created file with time related functions from sched/core.c and adjtimex()   * 1993-10-08    Torsten Duwe   *      adjtime interface update and CMOS clock write code   * 1995-08-13    Torsten Duwe diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 0c739423b0f9..20d6fba70652 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -599,8 +599,6 @@ void tick_broadcast_oneshot_control(unsigned long reason)  	} else {  		if (cpumask_test_and_clear_cpu(cpu, tick_broadcast_oneshot_mask)) {  			clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT); -			if (dev->next_event.tv64 == 
KTIME_MAX) -				goto out;  			/*  			 * The cpu which was handling the broadcast  			 * timer marked this cpu in the broadcast @@ -615,6 +613,11 @@ void tick_broadcast_oneshot_control(unsigned long reason)  				goto out;  			/* +			 * Bail out if there is no next event. +			 */ +			if (dev->next_event.tv64 == KTIME_MAX) +				goto out; +			/*  			 * If the pending bit is not set, then we are  			 * either the CPU handling the broadcast  			 * interrupt or we got woken by something else. @@ -698,10 +701,6 @@ void tick_broadcast_setup_oneshot(struct clock_event_device *bc)  		bc->event_handler = tick_handle_oneshot_broadcast; -		/* Take the do_timer update */ -		if (!tick_nohz_full_cpu(cpu)) -			tick_do_timer_cpu = cpu; -  		/*  		 * We must be careful here. There might be other CPUs  		 * waiting for periodic broadcast. We need to set the diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index f4208138fbf4..0cf1c1453181 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -306,7 +306,7 @@ static int __cpuinit tick_nohz_cpu_down_callback(struct notifier_block *nfb,  		 * we can't safely shutdown that CPU.  		 */  		if (have_nohz_full_mask && tick_do_timer_cpu == cpu) -			return -EINVAL; +			return NOTIFY_BAD;  		break;  	}  	return NOTIFY_OK; diff --git a/kernel/wait.c b/kernel/wait.c index 6698e0c04ead..ce0daa320a26 100644 --- a/kernel/wait.c +++ b/kernel/wait.c @@ -287,3 +287,91 @@ wait_queue_head_t *bit_waitqueue(void *word, int bit)  	return &zone->wait_table[hash_long(val, zone->wait_table_bits)];  }  EXPORT_SYMBOL(bit_waitqueue); + +/* + * Manipulate the atomic_t address to produce a better bit waitqueue table hash + * index (we're keying off bit -1, but that would produce a horrible hash + * value). 
+ */ +static inline wait_queue_head_t *atomic_t_waitqueue(atomic_t *p) +{ +	if (BITS_PER_LONG == 64) { +		unsigned long q = (unsigned long)p; +		return bit_waitqueue((void *)(q & ~1), q & 1); +	} +	return bit_waitqueue(p, 0); +} + +static int wake_atomic_t_function(wait_queue_t *wait, unsigned mode, int sync, +				  void *arg) +{ +	struct wait_bit_key *key = arg; +	struct wait_bit_queue *wait_bit +		= container_of(wait, struct wait_bit_queue, wait); +	atomic_t *val = key->flags; + +	if (wait_bit->key.flags != key->flags || +	    wait_bit->key.bit_nr != key->bit_nr || +	    atomic_read(val) != 0) +		return 0; +	return autoremove_wake_function(wait, mode, sync, key); +} + +/* + * To allow interruptible waiting and asynchronous (i.e. nonblocking) waiting, + * the actions of __wait_on_atomic_t() are permitted return codes.  Nonzero + * return codes halt waiting and return. + */ +static __sched +int __wait_on_atomic_t(wait_queue_head_t *wq, struct wait_bit_queue *q, +		       int (*action)(atomic_t *), unsigned mode) +{ +	atomic_t *val; +	int ret = 0; + +	do { +		prepare_to_wait(wq, &q->wait, mode); +		val = q->key.flags; +		if (atomic_read(val) == 0) +			ret = (*action)(val); +	} while (!ret && atomic_read(val) != 0); +	finish_wait(wq, &q->wait); +	return ret; +} + +#define DEFINE_WAIT_ATOMIC_T(name, p)					\ +	struct wait_bit_queue name = {					\ +		.key = __WAIT_ATOMIC_T_KEY_INITIALIZER(p),		\ +		.wait	= {						\ +			.private	= current,			\ +			.func		= wake_atomic_t_function,	\ +			.task_list	=				\ +				LIST_HEAD_INIT((name).wait.task_list),	\ +		},							\ +	} + +__sched int out_of_line_wait_on_atomic_t(atomic_t *p, int (*action)(atomic_t *), +					 unsigned mode) +{ +	wait_queue_head_t *wq = atomic_t_waitqueue(p); +	DEFINE_WAIT_ATOMIC_T(wait, p); + +	return __wait_on_atomic_t(wq, &wait, action, mode); +} +EXPORT_SYMBOL(out_of_line_wait_on_atomic_t); + +/** + * wake_up_atomic_t - Wake up a waiter on a atomic_t + * @word: The word being waited on, a kernel virtual 
address + * @bit: The bit of the word being waited on + * + * Wake up anyone waiting for the atomic_t to go to zero. + * + * Abuse the bit-waker function and its waitqueue hash table set (the atomic_t + * check is done by the waiter's wake function, not the by the waker itself). + */ +void wake_up_atomic_t(atomic_t *p) +{ +	__wake_up_bit(atomic_t_waitqueue(p), p, WAIT_ATOMIC_T_BIT_NR); +} +EXPORT_SYMBOL(wake_up_atomic_t); diff --git a/kernel/workqueue.c b/kernel/workqueue.c index ee8e29a2320c..f02c4a4a0c3c 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -272,6 +272,15 @@ static cpumask_var_t *wq_numa_possible_cpumask;  static bool wq_disable_numa;  module_param_named(disable_numa, wq_disable_numa, bool, 0444); +/* see the comment above the definition of WQ_POWER_EFFICIENT */ +#ifdef CONFIG_WQ_POWER_EFFICIENT_DEFAULT +static bool wq_power_efficient = true; +#else +static bool wq_power_efficient; +#endif + +module_param_named(power_efficient, wq_power_efficient, bool, 0444); +  static bool wq_numa_enabled;		/* unbound NUMA affinity enabled */  /* buf for wq_update_unbound_numa_attrs(), protected by CPU hotplug exclusion */ @@ -305,6 +314,10 @@ struct workqueue_struct *system_unbound_wq __read_mostly;  EXPORT_SYMBOL_GPL(system_unbound_wq);  struct workqueue_struct *system_freezable_wq __read_mostly;  EXPORT_SYMBOL_GPL(system_freezable_wq); +struct workqueue_struct *system_power_efficient_wq __read_mostly; +EXPORT_SYMBOL_GPL(system_power_efficient_wq); +struct workqueue_struct *system_freezable_power_efficient_wq __read_mostly; +EXPORT_SYMBOL_GPL(system_freezable_power_efficient_wq);  static int worker_thread(void *__worker);  static void copy_workqueue_attrs(struct workqueue_attrs *to, @@ -4086,6 +4099,10 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt,  	struct workqueue_struct *wq;  	struct pool_workqueue *pwq; +	/* see the comment above the definition of WQ_POWER_EFFICIENT */ +	if ((flags & WQ_POWER_EFFICIENT) && wq_power_efficient) +		
flags |= WQ_UNBOUND; +  	/* allocate wq and format name */  	if (flags & WQ_UNBOUND)  		tbl_size = wq_numa_tbl_len * sizeof(wq->numa_pwq_tbl[0]); @@ -4985,8 +5002,15 @@ static int __init init_workqueues(void)  					    WQ_UNBOUND_MAX_ACTIVE);  	system_freezable_wq = alloc_workqueue("events_freezable",  					      WQ_FREEZABLE, 0); +	system_power_efficient_wq = alloc_workqueue("events_power_efficient", +					      WQ_POWER_EFFICIENT, 0); +	system_freezable_power_efficient_wq = alloc_workqueue("events_freezable_power_efficient", +					      WQ_FREEZABLE | WQ_POWER_EFFICIENT, +					      0);  	BUG_ON(!system_wq || !system_highpri_wq || !system_long_wq || -	       !system_unbound_wq || !system_freezable_wq); +	       !system_unbound_wq || !system_freezable_wq || +	       !system_power_efficient_wq || +	       !system_freezable_power_efficient_wq);  	return 0;  }  early_initcall(init_workqueues); diff --git a/kernel/workqueue_internal.h b/kernel/workqueue_internal.h index ad83c96b2ece..7e2204db0b1a 100644 --- a/kernel/workqueue_internal.h +++ b/kernel/workqueue_internal.h @@ -64,7 +64,7 @@ static inline struct worker *current_wq_worker(void)  /*   * Scheduler hooks for concurrency managed workqueue.  Only to be used from - * sched.c and workqueue.c. + * sched/core.c and workqueue.c.   */  void wq_worker_waking_up(struct task_struct *task, int cpu);  struct task_struct *wq_worker_sleeping(struct task_struct *task, int cpu); | 

