Diffstat (limited to 'kernel'): 116 files changed, 5238 insertions, 3160 deletions
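Among the audit changes in the diff below, kauditd_send_multicast_skb() and the new audit_bind() callback expose audit records on a read-only netlink multicast group (AUDIT_NLGRP_READLOG), gated by CAP_AUDIT_READ and enabled for non-root receivers via NL_CFG_F_NONROOT_RECV. The following is a minimal userspace sketch, not part of the patch, of how a listener might subscribe to that group; it assumes only the standard netlink socket API plus the AUDIT_NLGRP_READLOG constant introduced by this series, and the buffer size and error handling are illustrative.

```c
/*
 * Hypothetical minimal listener for the kernel audit multicast group
 * (AUDIT_NLGRP_READLOG).  Joining the group requires CAP_AUDIT_READ,
 * which the new audit_bind() callback in the diff below enforces.
 */
#include <stdio.h>
#include <unistd.h>
#include <sys/socket.h>
#include <linux/netlink.h>
#include <linux/audit.h>

int main(void)
{
	struct sockaddr_nl addr = { .nl_family = AF_NETLINK };
	unsigned int grp = AUDIT_NLGRP_READLOG;
	char buf[8192];
	int fd;

	fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_AUDIT);
	if (fd < 0 || bind(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0)
		return 1;

	/* join the read-log multicast group; the kernel checks CAP_AUDIT_READ */
	if (setsockopt(fd, SOL_NETLINK, NETLINK_ADD_MEMBERSHIP,
		       &grp, sizeof(grp)) < 0)
		return 1;

	for (;;) {
		ssize_t len = recv(fd, buf, sizeof(buf), 0);
		struct nlmsghdr *nlh = (struct nlmsghdr *)buf;

		if (len <= 0)
			break;
		/*
		 * Unlike the legacy unicast path, the multicast copies keep
		 * standard netlink framing, so normal NLMSG_* walking works.
		 */
		for (; NLMSG_OK(nlh, (unsigned int)len);
		     nlh = NLMSG_NEXT(nlh, len))
			printf("type=%u len=%u\n",
			       nlh->nlmsg_type, nlh->nlmsg_len);
	}
	close(fd);
	return 0;
}
```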
diff --git a/kernel/acct.c b/kernel/acct.c index 8d6e145138bb..808a86ff229d 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -55,7 +55,7 @@  #include <linux/times.h>  #include <linux/syscalls.h>  #include <linux/mount.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h>  #include <asm/div64.h>  #include <linux/blkdev.h> /* sector_div */  #include <linux/pid_namespace.h> @@ -134,7 +134,7 @@ static int check_free_space(struct bsd_acct_struct *acct, struct file *file)  	spin_lock(&acct_lock);  	if (file != acct->file) {  		if (act) -			res = act>0; +			res = act > 0;  		goto out;  	} @@ -262,7 +262,7 @@ SYSCALL_DEFINE1(acct, const char __user *, name)  	if (name) {  		struct filename *tmp = getname(name);  		if (IS_ERR(tmp)) -			return (PTR_ERR(tmp)); +			return PTR_ERR(tmp);  		error = acct_on(tmp);  		putname(tmp);  	} else { diff --git a/kernel/audit.c b/kernel/audit.c index 7c2893602d06..3ef2e0e797e8 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -44,7 +44,7 @@  #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt  #include <linux/init.h> -#include <asm/types.h> +#include <linux/types.h>  #include <linux/atomic.h>  #include <linux/mm.h>  #include <linux/export.h> @@ -424,6 +424,38 @@ static void kauditd_send_skb(struct sk_buff *skb)  }  /* + * kauditd_send_multicast_skb - send the skb to multicast userspace listeners + * + * This function doesn't consume an skb as might be expected since it has to + * copy it anyways. + */ +static void kauditd_send_multicast_skb(struct sk_buff *skb) +{ +	struct sk_buff		*copy; +	struct audit_net	*aunet = net_generic(&init_net, audit_net_id); +	struct sock		*sock = aunet->nlsk; + +	if (!netlink_has_listeners(sock, AUDIT_NLGRP_READLOG)) +		return; + +	/* +	 * The seemingly wasteful skb_copy() rather than bumping the refcount +	 * using skb_get() is necessary because non-standard mods are made to +	 * the skb by the original kaudit unicast socket send routine.  The +	 * existing auditd daemon assumes this breakage.  Fixing this would +	 * require co-ordinating a change in the established protocol between +	 * the kaudit kernel subsystem and the auditd userspace code.  There is +	 * no reason for new multicast clients to continue with this +	 * non-compliance. +	 */ +	copy = skb_copy(skb, GFP_KERNEL); +	if (!copy) +		return; + +	nlmsg_multicast(sock, copy, 0, AUDIT_NLGRP_READLOG, GFP_KERNEL); +} + +/*   * flush_hold_queue - empty the hold queue if auditd appears   *   * If auditd just started, drain the queue of messages already @@ -643,13 +675,13 @@ static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type)  		if ((task_active_pid_ns(current) != &init_pid_ns))  			return -EPERM; -		if (!capable(CAP_AUDIT_CONTROL)) +		if (!netlink_capable(skb, CAP_AUDIT_CONTROL))  			err = -EPERM;  		break;  	case AUDIT_USER:  	case AUDIT_FIRST_USER_MSG ... AUDIT_LAST_USER_MSG:  	case AUDIT_FIRST_USER_MSG2 ... AUDIT_LAST_USER_MSG2: -		if (!capable(CAP_AUDIT_WRITE)) +		if (!netlink_capable(skb, CAP_AUDIT_WRITE))  			err = -EPERM;  		break;  	default:  /* bad msg */ @@ -1076,10 +1108,22 @@ static void audit_receive(struct sk_buff  *skb)  	mutex_unlock(&audit_cmd_mutex);  } +/* Run custom bind function on netlink socket group connect or bind requests. 
*/ +static int audit_bind(int group) +{ +	if (!capable(CAP_AUDIT_READ)) +		return -EPERM; + +	return 0; +} +  static int __net_init audit_net_init(struct net *net)  {  	struct netlink_kernel_cfg cfg = {  		.input	= audit_receive, +		.bind	= audit_bind, +		.flags	= NL_CFG_F_NONROOT_RECV, +		.groups	= AUDIT_NLGRP_MAX,  	};  	struct audit_net *aunet = net_generic(net, audit_net_id); @@ -1901,10 +1945,10 @@ out:   * audit_log_end - end one audit record   * @ab: the audit_buffer   * - * The netlink_* functions cannot be called inside an irq context, so - * the audit buffer is placed on a queue and a tasklet is scheduled to - * remove them from the queue outside the irq context.  May be called in - * any context. + * netlink_unicast() cannot be called inside an irq context because it blocks + * (last arg, flags, is not set to MSG_DONTWAIT), so the audit buffer is placed + * on a queue and a tasklet is scheduled to remove them from the queue outside + * the irq context.  May be called in any context.   */  void audit_log_end(struct audit_buffer *ab)  { @@ -1914,6 +1958,18 @@ void audit_log_end(struct audit_buffer *ab)  		audit_log_lost("rate limit exceeded");  	} else {  		struct nlmsghdr *nlh = nlmsg_hdr(ab->skb); + +		kauditd_send_multicast_skb(ab->skb); + +		/* +		 * The original kaudit unicast socket sends up messages with +		 * nlmsg_len set to the payload length rather than the entire +		 * message length.  This breaks the standard set by netlink. +		 * The existing auditd daemon assumes this breakage.  Fixing +		 * this would require co-ordinating a change in the established +		 * protocol between the kaudit kernel subsystem and the auditd +		 * userspace code. +		 */  		nlh->nlmsg_len = ab->skb->len - NLMSG_HDRLEN;  		if (audit_pid) { diff --git a/kernel/auditsc.c b/kernel/auditsc.c index f251a5e8d17a..21eae3c05ec0 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -728,6 +728,22 @@ static enum audit_state audit_filter_task(struct task_struct *tsk, char **key)  	return AUDIT_BUILD_CONTEXT;  } +static int audit_in_mask(const struct audit_krule *rule, unsigned long val) +{ +	int word, bit; + +	if (val > 0xffffffff) +		return false; + +	word = AUDIT_WORD(val); +	if (word >= AUDIT_BITMASK_SIZE) +		return false; + +	bit = AUDIT_BIT(val); + +	return rule->mask[word] & bit; +} +  /* At syscall entry and exit time, this filter is called if the   * audit_state is not low enough that auditing cannot take place, but is   * also not high enough that we already know we have to write an audit @@ -745,11 +761,8 @@ static enum audit_state audit_filter_syscall(struct task_struct *tsk,  	rcu_read_lock();  	if (!list_empty(list)) { -		int word = AUDIT_WORD(ctx->major); -		int bit  = AUDIT_BIT(ctx->major); -  		list_for_each_entry_rcu(e, list, list) { -			if ((e->rule.mask[word] & bit) == bit && +			if (audit_in_mask(&e->rule, ctx->major) &&  			    audit_filter_rules(tsk, &e->rule, ctx, NULL,  					       &state, false)) {  				rcu_read_unlock(); @@ -769,20 +782,16 @@ static enum audit_state audit_filter_syscall(struct task_struct *tsk,  static int audit_filter_inode_name(struct task_struct *tsk,  				   struct audit_names *n,  				   struct audit_context *ctx) { -	int word, bit;  	int h = audit_hash_ino((u32)n->ino);  	struct list_head *list = &audit_inode_hash[h];  	struct audit_entry *e;  	enum audit_state state; -	word = AUDIT_WORD(ctx->major); -	bit  = AUDIT_BIT(ctx->major); -  	if (list_empty(list))  		return 0;  	list_for_each_entry_rcu(e, list, list) { -		if ((e->rule.mask[word] & bit) == bit 
&& +		if (audit_in_mask(&e->rule, ctx->major) &&  		    audit_filter_rules(tsk, &e->rule, ctx, n, &state, false)) {  			ctx->current_state = state;  			return 1; diff --git a/kernel/backtracetest.c b/kernel/backtracetest.c index a5e026bc45c4..1323360d90e3 100644 --- a/kernel/backtracetest.c +++ b/kernel/backtracetest.c @@ -19,8 +19,8 @@  static void backtrace_test_normal(void)  { -	printk("Testing a backtrace from process context.\n"); -	printk("The following trace is a kernel self test and not a bug!\n"); +	pr_info("Testing a backtrace from process context.\n"); +	pr_info("The following trace is a kernel self test and not a bug!\n");  	dump_stack();  } @@ -37,8 +37,8 @@ static DECLARE_TASKLET(backtrace_tasklet, &backtrace_test_irq_callback, 0);  static void backtrace_test_irq(void)  { -	printk("Testing a backtrace from irq context.\n"); -	printk("The following trace is a kernel self test and not a bug!\n"); +	pr_info("Testing a backtrace from irq context.\n"); +	pr_info("The following trace is a kernel self test and not a bug!\n");  	init_completion(&backtrace_work);  	tasklet_schedule(&backtrace_tasklet); @@ -51,8 +51,8 @@ static void backtrace_test_saved(void)  	struct stack_trace trace;  	unsigned long entries[8]; -	printk("Testing a saved backtrace.\n"); -	printk("The following trace is a kernel self test and not a bug!\n"); +	pr_info("Testing a saved backtrace.\n"); +	pr_info("The following trace is a kernel self test and not a bug!\n");  	trace.nr_entries = 0;  	trace.max_entries = ARRAY_SIZE(entries); @@ -65,19 +65,19 @@ static void backtrace_test_saved(void)  #else  static void backtrace_test_saved(void)  { -	printk("Saved backtrace test skipped.\n"); +	pr_info("Saved backtrace test skipped.\n");  }  #endif  static int backtrace_regression_test(void)  { -	printk("====[ backtrace testing ]===========\n"); +	pr_info("====[ backtrace testing ]===========\n");  	backtrace_test_normal();  	backtrace_test_irq();  	backtrace_test_saved(); -	printk("====[ end of backtrace testing ]====\n"); +	pr_info("====[ end of backtrace testing ]====\n");  	return 0;  } diff --git a/kernel/capability.c b/kernel/capability.c index a8d63df0c322..a5cf13c018ce 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -24,7 +24,6 @@   */  const kernel_cap_t __cap_empty_set = CAP_EMPTY_SET; -  EXPORT_SYMBOL(__cap_empty_set);  int file_caps_enabled = 1; @@ -189,7 +188,7 @@ SYSCALL_DEFINE2(capget, cap_user_header_t, header, cap_user_data_t, dataptr)  		 *  		 * An alternative would be to return an error here  		 * (-ERANGE), but that causes legacy applications to -		 * unexpectidly fail; the capget/modify/capset aborts +		 * unexpectedly fail; the capget/modify/capset aborts  		 * before modification is attempted and the application  		 * fails.  		 */ @@ -395,7 +394,8 @@ EXPORT_SYMBOL(ns_capable);   * This does not set PF_SUPERPRIV because the caller may not   * actually be privileged.   
*/ -bool file_ns_capable(const struct file *file, struct user_namespace *ns, int cap) +bool file_ns_capable(const struct file *file, struct user_namespace *ns, +		     int cap)  {  	if (WARN_ON_ONCE(!cap_valid(cap)))  		return false; @@ -424,23 +424,19 @@ bool capable(int cap)  EXPORT_SYMBOL(capable);  /** - * inode_capable - Check superior capability over inode + * capable_wrt_inode_uidgid - Check nsown_capable and uid and gid mapped   * @inode: The inode in question   * @cap: The capability in question   * - * Return true if the current task has the given superior capability - * targeted at it's own user namespace and that the given inode is owned - * by the current user namespace or a child namespace. - * - * Currently we check to see if an inode is owned by the current - * user namespace by seeing if the inode's owner maps into the - * current user namespace. - * + * Return true if the current task has the given capability targeted at + * its own user namespace and that the given inode's uid and gid are + * mapped into the current user namespace.   */ -bool inode_capable(const struct inode *inode, int cap) +bool capable_wrt_inode_uidgid(const struct inode *inode, int cap)  {  	struct user_namespace *ns = current_user_ns(); -	return ns_capable(ns, cap) && kuid_has_mapping(ns, inode->i_uid); +	return ns_capable(ns, cap) && kuid_has_mapping(ns, inode->i_uid) && +		kgid_has_mapping(ns, inode->i_gid);  } -EXPORT_SYMBOL(inode_capable); +EXPORT_SYMBOL(capable_wrt_inode_uidgid); diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 9fcdaa705b6c..7868fc3c0bc5 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -26,6 +26,8 @@   *  distribution for more details.   */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +  #include <linux/cgroup.h>  #include <linux/cred.h>  #include <linux/ctype.h> @@ -33,6 +35,7 @@  #include <linux/init_task.h>  #include <linux/kernel.h>  #include <linux/list.h> +#include <linux/magic.h>  #include <linux/mm.h>  #include <linux/mutex.h>  #include <linux/mount.h> @@ -69,15 +72,6 @@  					 MAX_CFTYPE_NAME + 2)  /* - * cgroup_tree_mutex nests above cgroup_mutex and protects cftypes, file - * creation/removal and hierarchy changing operations including cgroup - * creation, removal, css association and controller rebinding.  This outer - * lock is needed mainly to resolve the circular dependency between kernfs - * active ref and cgroup_mutex.  cgroup_tree_mutex nests above both. - */ -static DEFINE_MUTEX(cgroup_tree_mutex); - -/*   * cgroup_mutex is the master lock.  Any modification to cgroup or its   * hierarchy must be performed while holding it.   * @@ -98,16 +92,21 @@ static DECLARE_RWSEM(css_set_rwsem);  #endif  /* + * Protects cgroup_idr and css_idr so that IDs can be released without + * grabbing cgroup_mutex. + */ +static DEFINE_SPINLOCK(cgroup_idr_lock); + +/*   * Protects cgroup_subsys->release_agent_path.  Modifying it also requires   * cgroup_mutex.  Reading requires either cgroup_mutex or this spinlock.   
*/  static DEFINE_SPINLOCK(release_agent_path_lock); -#define cgroup_assert_mutexes_or_rcu_locked()				\ +#define cgroup_assert_mutex_or_rcu_locked()				\  	rcu_lockdep_assert(rcu_read_lock_held() ||			\ -			   lockdep_is_held(&cgroup_tree_mutex) ||	\  			   lockdep_is_held(&cgroup_mutex),		\ -			   "cgroup_[tree_]mutex or RCU read lock required"); +			   "cgroup_mutex or RCU read lock required");  /*   * cgroup destruction makes heavy use of work items and there can be a lot @@ -150,6 +149,13 @@ struct cgroup_root cgrp_dfl_root;   */  static bool cgrp_dfl_root_visible; +/* some controllers are not supported in the default hierarchy */ +static const unsigned int cgrp_dfl_root_inhibit_ss_mask = 0 +#ifdef CONFIG_CGROUP_DEBUG +	| (1 << debug_cgrp_id) +#endif +	; +  /* The list of hierarchy roots */  static LIST_HEAD(cgroup_roots); @@ -159,14 +165,13 @@ static int cgroup_root_count;  static DEFINE_IDR(cgroup_hierarchy_idr);  /* - * Assign a monotonically increasing serial number to cgroups.  It - * guarantees cgroups with bigger numbers are newer than those with smaller - * numbers.  Also, as cgroups are always appended to the parent's - * ->children list, it guarantees that sibling cgroups are always sorted in - * the ascending serial number order on the list.  Protected by - * cgroup_mutex. + * Assign a monotonically increasing serial number to csses.  It guarantees + * cgroups with bigger numbers are newer than those with smaller numbers. + * Also, as csses are always appended to the parent's ->children list, it + * guarantees that sibling csses are always sorted in the ascending serial + * number order on the list.  Protected by cgroup_mutex.   */ -static u64 cgroup_serial_nr_next = 1; +static u64 css_serial_nr_next = 1;  /* This flag indicates whether tasks in the fork and exit paths should   * check for fork/exit handlers to call. 
This avoids us having to do @@ -179,17 +184,59 @@ static struct cftype cgroup_base_files[];  static void cgroup_put(struct cgroup *cgrp);  static int rebind_subsystems(struct cgroup_root *dst_root, -			     unsigned long ss_mask); -static void cgroup_destroy_css_killed(struct cgroup *cgrp); +			     unsigned int ss_mask);  static int cgroup_destroy_locked(struct cgroup *cgrp); +static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss); +static void css_release(struct percpu_ref *ref); +static void kill_css(struct cgroup_subsys_state *css);  static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],  			      bool is_add);  static void cgroup_pidlist_destroy_all(struct cgroup *cgrp); +/* IDR wrappers which synchronize using cgroup_idr_lock */ +static int cgroup_idr_alloc(struct idr *idr, void *ptr, int start, int end, +			    gfp_t gfp_mask) +{ +	int ret; + +	idr_preload(gfp_mask); +	spin_lock_bh(&cgroup_idr_lock); +	ret = idr_alloc(idr, ptr, start, end, gfp_mask); +	spin_unlock_bh(&cgroup_idr_lock); +	idr_preload_end(); +	return ret; +} + +static void *cgroup_idr_replace(struct idr *idr, void *ptr, int id) +{ +	void *ret; + +	spin_lock_bh(&cgroup_idr_lock); +	ret = idr_replace(idr, ptr, id); +	spin_unlock_bh(&cgroup_idr_lock); +	return ret; +} + +static void cgroup_idr_remove(struct idr *idr, int id) +{ +	spin_lock_bh(&cgroup_idr_lock); +	idr_remove(idr, id); +	spin_unlock_bh(&cgroup_idr_lock); +} + +static struct cgroup *cgroup_parent(struct cgroup *cgrp) +{ +	struct cgroup_subsys_state *parent_css = cgrp->self.parent; + +	if (parent_css) +		return container_of(parent_css, struct cgroup, self); +	return NULL; +} +  /**   * cgroup_css - obtain a cgroup's css for the specified subsystem   * @cgrp: the cgroup of interest - * @ss: the subsystem of interest (%NULL returns the dummy_css) + * @ss: the subsystem of interest (%NULL returns @cgrp->self)   *   * Return @cgrp's css (cgroup_subsys_state) associated with @ss.  This   * function must be called either under cgroup_mutex or rcu_read_lock() and @@ -202,23 +249,49 @@ static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp,  {  	if (ss)  		return rcu_dereference_check(cgrp->subsys[ss->id], -					lockdep_is_held(&cgroup_tree_mutex) ||  					lockdep_is_held(&cgroup_mutex));  	else -		return &cgrp->dummy_css; +		return &cgrp->self; +} + +/** + * cgroup_e_css - obtain a cgroup's effective css for the specified subsystem + * @cgrp: the cgroup of interest + * @ss: the subsystem of interest (%NULL returns @cgrp->self) + * + * Similar to cgroup_css() but returns the effctive css, which is defined + * as the matching css of the nearest ancestor including self which has @ss + * enabled.  If @ss is associated with the hierarchy @cgrp is on, this + * function is guaranteed to return non-NULL css. 
+ */ +static struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp, +						struct cgroup_subsys *ss) +{ +	lockdep_assert_held(&cgroup_mutex); + +	if (!ss) +		return &cgrp->self; + +	if (!(cgrp->root->subsys_mask & (1 << ss->id))) +		return NULL; + +	while (cgroup_parent(cgrp) && +	       !(cgroup_parent(cgrp)->child_subsys_mask & (1 << ss->id))) +		cgrp = cgroup_parent(cgrp); + +	return cgroup_css(cgrp, ss);  }  /* convenient tests for these bits */  static inline bool cgroup_is_dead(const struct cgroup *cgrp)  { -	return test_bit(CGRP_DEAD, &cgrp->flags); +	return !(cgrp->self.flags & CSS_ONLINE);  } -struct cgroup_subsys_state *seq_css(struct seq_file *seq) +struct cgroup_subsys_state *of_css(struct kernfs_open_file *of)  { -	struct kernfs_open_file *of = seq->private;  	struct cgroup *cgrp = of->kn->parent->priv; -	struct cftype *cft = seq_cft(seq); +	struct cftype *cft = of_cft(of);  	/*  	 * This is open and unprotected implementation of cgroup_css(). @@ -231,9 +304,9 @@ struct cgroup_subsys_state *seq_css(struct seq_file *seq)  	if (cft->ss)  		return rcu_dereference_raw(cgrp->subsys[cft->ss->id]);  	else -		return &cgrp->dummy_css; +		return &cgrp->self;  } -EXPORT_SYMBOL_GPL(seq_css); +EXPORT_SYMBOL_GPL(of_css);  /**   * cgroup_is_descendant - test ancestry @@ -249,7 +322,7 @@ bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor)  	while (cgrp) {  		if (cgrp == ancestor)  			return true; -		cgrp = cgrp->parent; +		cgrp = cgroup_parent(cgrp);  	}  	return false;  } @@ -273,17 +346,30 @@ static int notify_on_release(const struct cgroup *cgrp)   * @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end   * @cgrp: the target cgroup to iterate css's of   * - * Should be called under cgroup_mutex. + * Should be called under cgroup_[tree_]mutex.   */  #define for_each_css(css, ssid, cgrp)					\  	for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++)	\  		if (!((css) = rcu_dereference_check(			\  				(cgrp)->subsys[(ssid)],			\ -				lockdep_is_held(&cgroup_tree_mutex) ||	\  				lockdep_is_held(&cgroup_mutex)))) { }	\  		else  /** + * for_each_e_css - iterate all effective css's of a cgroup + * @css: the iteration cursor + * @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end + * @cgrp: the target cgroup to iterate css's of + * + * Should be called under cgroup_[tree_]mutex. + */ +#define for_each_e_css(css, ssid, cgrp)					\ +	for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++)	\ +		if (!((css) = cgroup_e_css(cgrp, cgroup_subsys[(ssid)]))) \ +			;						\ +		else + +/**   * for_each_subsys - iterate all enabled cgroup subsystems   * @ss: the iteration cursor   * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end @@ -296,22 +382,13 @@ static int notify_on_release(const struct cgroup *cgrp)  #define for_each_root(root)						\  	list_for_each_entry((root), &cgroup_roots, root_list) -/** - * cgroup_lock_live_group - take cgroup_mutex and check that cgrp is alive. - * @cgrp: the cgroup to be checked for liveness - * - * On success, returns true; the mutex should be later unlocked.  On - * failure returns false with no lock held. 
- */ -static bool cgroup_lock_live_group(struct cgroup *cgrp) -{ -	mutex_lock(&cgroup_mutex); -	if (cgroup_is_dead(cgrp)) { -		mutex_unlock(&cgroup_mutex); -		return false; -	} -	return true; -} +/* iterate over child cgrps, lock should be held throughout iteration */ +#define cgroup_for_each_live_child(child, cgrp)				\ +	list_for_each_entry((child), &(cgrp)->self.children, self.sibling) \ +		if (({ lockdep_assert_held(&cgroup_mutex);		\ +		       cgroup_is_dead(child); }))			\ +			;						\ +		else  /* the list of cgroups eligible for automatic release. Protected by   * release_list_lock */ @@ -348,7 +425,7 @@ struct cgrp_cset_link {   * reference-counted, to improve performance when child cgroups   * haven't been created.   */ -static struct css_set init_css_set = { +struct css_set init_css_set = {  	.refcount		= ATOMIC_INIT(1),  	.cgrp_links		= LIST_HEAD_INIT(init_css_set.cgrp_links),  	.tasks			= LIST_HEAD_INIT(init_css_set.tasks), @@ -359,6 +436,43 @@ static struct css_set init_css_set = {  static int css_set_count	= 1;	/* 1 for init_css_set */ +/** + * cgroup_update_populated - updated populated count of a cgroup + * @cgrp: the target cgroup + * @populated: inc or dec populated count + * + * @cgrp is either getting the first task (css_set) or losing the last. + * Update @cgrp->populated_cnt accordingly.  The count is propagated + * towards root so that a given cgroup's populated_cnt is zero iff the + * cgroup and all its descendants are empty. + * + * @cgrp's interface file "cgroup.populated" is zero if + * @cgrp->populated_cnt is zero and 1 otherwise.  When @cgrp->populated_cnt + * changes from or to zero, userland is notified that the content of the + * interface file has changed.  This can be used to detect when @cgrp and + * its descendants become populated or empty. + */ +static void cgroup_update_populated(struct cgroup *cgrp, bool populated) +{ +	lockdep_assert_held(&css_set_rwsem); + +	do { +		bool trigger; + +		if (populated) +			trigger = !cgrp->populated_cnt++; +		else +			trigger = !--cgrp->populated_cnt; + +		if (!trigger) +			break; + +		if (cgrp->populated_kn) +			kernfs_notify(cgrp->populated_kn); +		cgrp = cgroup_parent(cgrp); +	} while (cgrp); +} +  /*   * hash table for cgroup groups. This improves the performance to find   * an existing css_set. This hash doesn't (currently) take into @@ -383,6 +497,8 @@ static unsigned long css_set_hash(struct cgroup_subsys_state *css[])  static void put_css_set_locked(struct css_set *cset, bool taskexit)  {  	struct cgrp_cset_link *link, *tmp_link; +	struct cgroup_subsys *ss; +	int ssid;  	lockdep_assert_held(&css_set_rwsem); @@ -390,6 +506,8 @@ static void put_css_set_locked(struct css_set *cset, bool taskexit)  		return;  	/* This css_set is dead. 
unlink it and release cgroup refcounts */ +	for_each_subsys(ss, ssid) +		list_del(&cset->e_cset_node[ssid]);  	hash_del(&cset->hlist);  	css_set_count--; @@ -400,10 +518,13 @@ static void put_css_set_locked(struct css_set *cset, bool taskexit)  		list_del(&link->cgrp_link);  		/* @cgrp can't go away while we're holding css_set_rwsem */ -		if (list_empty(&cgrp->cset_links) && notify_on_release(cgrp)) { -			if (taskexit) -				set_bit(CGRP_RELEASABLE, &cgrp->flags); -			check_for_release(cgrp); +		if (list_empty(&cgrp->cset_links)) { +			cgroup_update_populated(cgrp, false); +			if (notify_on_release(cgrp)) { +				if (taskexit) +					set_bit(CGRP_RELEASABLE, &cgrp->flags); +				check_for_release(cgrp); +			}  		}  		kfree(link); @@ -452,20 +573,20 @@ static bool compare_css_sets(struct css_set *cset,  {  	struct list_head *l1, *l2; -	if (memcmp(template, cset->subsys, sizeof(cset->subsys))) { -		/* Not all subsystems matched */ +	/* +	 * On the default hierarchy, there can be csets which are +	 * associated with the same set of cgroups but different csses. +	 * Let's first ensure that csses match. +	 */ +	if (memcmp(template, cset->subsys, sizeof(cset->subsys)))  		return false; -	}  	/*  	 * Compare cgroup pointers in order to distinguish between -	 * different cgroups in heirarchies with no subsystems. We -	 * could get by with just this check alone (and skip the -	 * memcmp above) but on most setups the memcmp check will -	 * avoid the need for this more expensive check on almost all -	 * candidates. +	 * different cgroups in hierarchies.  As different cgroups may +	 * share the same effective css, this comparison is always +	 * necessary.  	 */ -  	l1 = &cset->cgrp_links;  	l2 = &old_cset->cgrp_links;  	while (1) { @@ -529,14 +650,17 @@ static struct css_set *find_existing_css_set(struct css_set *old_cset,  	 * won't change, so no need for locking.  	 */  	for_each_subsys(ss, i) { -		if (root->cgrp.subsys_mask & (1UL << i)) { -			/* Subsystem is in this hierarchy. So we want -			 * the subsystem state from the new -			 * cgroup */ -			template[i] = cgroup_css(cgrp, ss); +		if (root->subsys_mask & (1UL << i)) { +			/* +			 * @ss is in this hierarchy, so we want the +			 * effective css from @cgrp. +			 */ +			template[i] = cgroup_e_css(cgrp, ss);  		} else { -			/* Subsystem is not in this hierarchy, so we -			 * don't want to change the subsystem state */ +			/* +			 * @ss is not in this hierarchy, so we don't want +			 * to change the css. 
+			 */  			template[i] = old_cset->subsys[i];  		}  	} @@ -602,10 +726,18 @@ static void link_css_set(struct list_head *tmp_links, struct css_set *cset,  	struct cgrp_cset_link *link;  	BUG_ON(list_empty(tmp_links)); + +	if (cgroup_on_dfl(cgrp)) +		cset->dfl_cgrp = cgrp; +  	link = list_first_entry(tmp_links, struct cgrp_cset_link, cset_link);  	link->cset = cset;  	link->cgrp = cgrp; + +	if (list_empty(&cgrp->cset_links)) +		cgroup_update_populated(cgrp, true);  	list_move(&link->cset_link, &cgrp->cset_links); +  	/*  	 * Always add links to the tail of the list so that the list  	 * is sorted by order of hierarchy creation @@ -628,7 +760,9 @@ static struct css_set *find_css_set(struct css_set *old_cset,  	struct css_set *cset;  	struct list_head tmp_links;  	struct cgrp_cset_link *link; +	struct cgroup_subsys *ss;  	unsigned long key; +	int ssid;  	lockdep_assert_held(&cgroup_mutex); @@ -679,10 +813,14 @@ static struct css_set *find_css_set(struct css_set *old_cset,  	css_set_count++; -	/* Add this cgroup group to the hash table */ +	/* Add @cset to the hash table */  	key = css_set_hash(cset->subsys);  	hash_add(css_set_table, &cset->hlist, key); +	for_each_subsys(ss, ssid) +		list_add_tail(&cset->e_cset_node[ssid], +			      &cset->subsys[ssid]->cgroup->e_csets[ssid]); +  	up_write(&css_set_rwsem);  	return cset; @@ -735,14 +873,13 @@ static void cgroup_destroy_root(struct cgroup_root *root)  	struct cgroup *cgrp = &root->cgrp;  	struct cgrp_cset_link *link, *tmp_link; -	mutex_lock(&cgroup_tree_mutex);  	mutex_lock(&cgroup_mutex);  	BUG_ON(atomic_read(&root->nr_cgrps)); -	BUG_ON(!list_empty(&cgrp->children)); +	BUG_ON(!list_empty(&cgrp->self.children));  	/* Rebind all subsystems back to the default hierarchy */ -	rebind_subsystems(&cgrp_dfl_root, cgrp->subsys_mask); +	rebind_subsystems(&cgrp_dfl_root, root->subsys_mask);  	/*  	 * Release all the links from cset_links to this hierarchy's @@ -765,7 +902,6 @@ static void cgroup_destroy_root(struct cgroup_root *root)  	cgroup_exit_root_id(root);  	mutex_unlock(&cgroup_mutex); -	mutex_unlock(&cgroup_tree_mutex);  	kernfs_destroy_root(root->kf_root);  	cgroup_free_root(root); @@ -848,7 +984,7 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task,   * update of a tasks cgroup pointer by cgroup_attach_task()   */ -static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask); +static int cgroup_populate_dir(struct cgroup *cgrp, unsigned int subsys_mask);  static struct kernfs_syscall_ops cgroup_kf_syscall_ops;  static const struct file_operations proc_cgroupstats_operations; @@ -883,79 +1019,95 @@ static umode_t cgroup_file_mode(const struct cftype *cft)  	if (cft->read_u64 || cft->read_s64 || cft->seq_show)  		mode |= S_IRUGO; -	if (cft->write_u64 || cft->write_s64 || cft->write_string || -	    cft->trigger) +	if (cft->write_u64 || cft->write_s64 || cft->write)  		mode |= S_IWUSR;  	return mode;  } -static void cgroup_free_fn(struct work_struct *work) +static void cgroup_get(struct cgroup *cgrp)  { -	struct cgroup *cgrp = container_of(work, struct cgroup, destroy_work); - -	atomic_dec(&cgrp->root->nr_cgrps); -	cgroup_pidlist_destroy_all(cgrp); - -	if (cgrp->parent) { -		/* -		 * We get a ref to the parent, and put the ref when this -		 * cgroup is being freed, so it's guaranteed that the -		 * parent won't be destroyed before its children. 
-		 */ -		cgroup_put(cgrp->parent); -		kernfs_put(cgrp->kn); -		kfree(cgrp); -	} else { -		/* -		 * This is root cgroup's refcnt reaching zero, which -		 * indicates that the root should be released. -		 */ -		cgroup_destroy_root(cgrp->root); -	} +	WARN_ON_ONCE(cgroup_is_dead(cgrp)); +	css_get(&cgrp->self);  } -static void cgroup_free_rcu(struct rcu_head *head) +static void cgroup_put(struct cgroup *cgrp)  { -	struct cgroup *cgrp = container_of(head, struct cgroup, rcu_head); - -	INIT_WORK(&cgrp->destroy_work, cgroup_free_fn); -	queue_work(cgroup_destroy_wq, &cgrp->destroy_work); +	css_put(&cgrp->self);  } -static void cgroup_get(struct cgroup *cgrp) +/** + * cgroup_kn_unlock - unlocking helper for cgroup kernfs methods + * @kn: the kernfs_node being serviced + * + * This helper undoes cgroup_kn_lock_live() and should be invoked before + * the method finishes if locking succeeded.  Note that once this function + * returns the cgroup returned by cgroup_kn_lock_live() may become + * inaccessible any time.  If the caller intends to continue to access the + * cgroup, it should pin it before invoking this function. + */ +static void cgroup_kn_unlock(struct kernfs_node *kn)  { -	WARN_ON_ONCE(cgroup_is_dead(cgrp)); -	WARN_ON_ONCE(atomic_read(&cgrp->refcnt) <= 0); -	atomic_inc(&cgrp->refcnt); +	struct cgroup *cgrp; + +	if (kernfs_type(kn) == KERNFS_DIR) +		cgrp = kn->priv; +	else +		cgrp = kn->parent->priv; + +	mutex_unlock(&cgroup_mutex); + +	kernfs_unbreak_active_protection(kn); +	cgroup_put(cgrp);  } -static void cgroup_put(struct cgroup *cgrp) +/** + * cgroup_kn_lock_live - locking helper for cgroup kernfs methods + * @kn: the kernfs_node being serviced + * + * This helper is to be used by a cgroup kernfs method currently servicing + * @kn.  It breaks the active protection, performs cgroup locking and + * verifies that the associated cgroup is alive.  Returns the cgroup if + * alive; otherwise, %NULL.  A successful return should be undone by a + * matching cgroup_kn_unlock() invocation. + * + * Any cgroup kernfs method implementation which requires locking the + * associated cgroup should use this helper.  It avoids nesting cgroup + * locking under kernfs active protection and allows all kernfs operations + * including self-removal. + */ +static struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn)  { -	if (!atomic_dec_and_test(&cgrp->refcnt)) -		return; -	if (WARN_ON_ONCE(cgrp->parent && !cgroup_is_dead(cgrp))) -		return; +	struct cgroup *cgrp; + +	if (kernfs_type(kn) == KERNFS_DIR) +		cgrp = kn->priv; +	else +		cgrp = kn->parent->priv;  	/* -	 * XXX: cgrp->id is only used to look up css's.  As cgroup and -	 * css's lifetimes will be decoupled, it should be made -	 * per-subsystem and moved to css->id so that lookups are -	 * successful until the target css is released. +	 * We're gonna grab cgroup_mutex which nests outside kernfs +	 * active_ref.  cgroup liveliness check alone provides enough +	 * protection against removal.  Ensure @cgrp stays accessible and +	 * break the active_ref protection.  	 
*/ +	cgroup_get(cgrp); +	kernfs_break_active_protection(kn); +  	mutex_lock(&cgroup_mutex); -	idr_remove(&cgrp->root->cgroup_idr, cgrp->id); -	mutex_unlock(&cgroup_mutex); -	cgrp->id = -1; -	call_rcu(&cgrp->rcu_head, cgroup_free_rcu); +	if (!cgroup_is_dead(cgrp)) +		return cgrp; + +	cgroup_kn_unlock(kn); +	return NULL;  }  static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)  {  	char name[CGROUP_FILE_NAME_MAX]; -	lockdep_assert_held(&cgroup_tree_mutex); +	lockdep_assert_held(&cgroup_mutex);  	kernfs_remove_by_name(cgrp->kn, cgroup_file_name(cgrp, cft, name));  } @@ -964,7 +1116,7 @@ static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)   * @cgrp: target cgroup   * @subsys_mask: mask of the subsystem ids whose files should be removed   */ -static void cgroup_clear_dir(struct cgroup *cgrp, unsigned long subsys_mask) +static void cgroup_clear_dir(struct cgroup *cgrp, unsigned int subsys_mask)  {  	struct cgroup_subsys *ss;  	int i; @@ -972,40 +1124,40 @@ static void cgroup_clear_dir(struct cgroup *cgrp, unsigned long subsys_mask)  	for_each_subsys(ss, i) {  		struct cftype *cfts; -		if (!test_bit(i, &subsys_mask)) +		if (!(subsys_mask & (1 << i)))  			continue;  		list_for_each_entry(cfts, &ss->cfts, node)  			cgroup_addrm_files(cgrp, cfts, false);  	}  } -static int rebind_subsystems(struct cgroup_root *dst_root, -			     unsigned long ss_mask) +static int rebind_subsystems(struct cgroup_root *dst_root, unsigned int ss_mask)  {  	struct cgroup_subsys *ss; -	int ssid, ret; +	unsigned int tmp_ss_mask; +	int ssid, i, ret; -	lockdep_assert_held(&cgroup_tree_mutex);  	lockdep_assert_held(&cgroup_mutex);  	for_each_subsys(ss, ssid) {  		if (!(ss_mask & (1 << ssid)))  			continue; -		/* if @ss is on the dummy_root, we can always move it */ -		if (ss->root == &cgrp_dfl_root) -			continue; - -		/* if @ss has non-root cgroups attached to it, can't move */ -		if (!list_empty(&ss->root->cgrp.children)) +		/* if @ss has non-root csses attached to it, can't move */ +		if (css_next_child(NULL, cgroup_css(&ss->root->cgrp, ss)))  			return -EBUSY;  		/* can't move between two non-dummy roots either */ -		if (dst_root != &cgrp_dfl_root) +		if (ss->root != &cgrp_dfl_root && dst_root != &cgrp_dfl_root)  			return -EBUSY;  	} -	ret = cgroup_populate_dir(&dst_root->cgrp, ss_mask); +	/* skip creating root files on dfl_root for inhibited subsystems */ +	tmp_ss_mask = ss_mask; +	if (dst_root == &cgrp_dfl_root) +		tmp_ss_mask &= ~cgrp_dfl_root_inhibit_ss_mask; + +	ret = cgroup_populate_dir(&dst_root->cgrp, tmp_ss_mask);  	if (ret) {  		if (dst_root != &cgrp_dfl_root)  			return ret; @@ -1017,9 +1169,9 @@ static int rebind_subsystems(struct cgroup_root *dst_root,  		 * Just warn about it and continue.  		 */  		if (cgrp_dfl_root_visible) { -			pr_warning("cgroup: failed to create files (%d) while rebinding 0x%lx to default root\n", -				   ret, ss_mask); -			pr_warning("cgroup: you may retry by moving them to a different hierarchy and unbinding\n"); +			pr_warn("failed to create files (%d) while rebinding 0x%x to default root\n", +				ret, ss_mask); +			pr_warn("you may retry by moving them to a different hierarchy and unbinding\n");  		}  	} @@ -1027,15 +1179,14 @@ static int rebind_subsystems(struct cgroup_root *dst_root,  	 * Nothing can fail from this point on.  Remove files for the  	 * removed subsystems and rebind each subsystem.  	 
*/ -	mutex_unlock(&cgroup_mutex);  	for_each_subsys(ss, ssid)  		if (ss_mask & (1 << ssid))  			cgroup_clear_dir(&ss->root->cgrp, 1 << ssid); -	mutex_lock(&cgroup_mutex);  	for_each_subsys(ss, ssid) {  		struct cgroup_root *src_root;  		struct cgroup_subsys_state *css; +		struct css_set *cset;  		if (!(ss_mask & (1 << ssid)))  			continue; @@ -1050,8 +1201,19 @@ static int rebind_subsystems(struct cgroup_root *dst_root,  		ss->root = dst_root;  		css->cgroup = &dst_root->cgrp; -		src_root->cgrp.subsys_mask &= ~(1 << ssid); -		dst_root->cgrp.subsys_mask |= 1 << ssid; +		down_write(&css_set_rwsem); +		hash_for_each(css_set_table, i, cset, hlist) +			list_move_tail(&cset->e_cset_node[ss->id], +				       &dst_root->cgrp.e_csets[ss->id]); +		up_write(&css_set_rwsem); + +		src_root->subsys_mask &= ~(1 << ssid); +		src_root->cgrp.child_subsys_mask &= ~(1 << ssid); + +		/* default hierarchy doesn't enable controllers by default */ +		dst_root->subsys_mask |= 1 << ssid; +		if (dst_root != &cgrp_dfl_root) +			dst_root->cgrp.child_subsys_mask |= 1 << ssid;  		if (ss->bind)  			ss->bind(css); @@ -1069,7 +1231,7 @@ static int cgroup_show_options(struct seq_file *seq,  	int ssid;  	for_each_subsys(ss, ssid) -		if (root->cgrp.subsys_mask & (1 << ssid)) +		if (root->subsys_mask & (1 << ssid))  			seq_printf(seq, ",%s", ss->name);  	if (root->flags & CGRP_ROOT_SANE_BEHAVIOR)  		seq_puts(seq, ",sane_behavior"); @@ -1091,8 +1253,8 @@ static int cgroup_show_options(struct seq_file *seq,  }  struct cgroup_sb_opts { -	unsigned long subsys_mask; -	unsigned long flags; +	unsigned int subsys_mask; +	unsigned int flags;  	char *release_agent;  	bool cpuset_clone_children;  	char *name; @@ -1100,24 +1262,16 @@ struct cgroup_sb_opts {  	bool none;  }; -/* - * Convert a hierarchy specifier into a bitmask of subsystems and - * flags. Call with cgroup_mutex held to protect the cgroup_subsys[] - * array. This function takes refcounts on subsystems to be used, unless it - * returns error, in which case no refcounts are taken. 
- */  static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)  {  	char *token, *o = data;  	bool all_ss = false, one_ss = false; -	unsigned long mask = (unsigned long)-1; +	unsigned int mask = -1U;  	struct cgroup_subsys *ss;  	int i; -	BUG_ON(!mutex_is_locked(&cgroup_mutex)); -  #ifdef CONFIG_CPUSETS -	mask = ~(1UL << cpuset_cgrp_id); +	mask = ~(1U << cpuset_cgrp_id);  #endif  	memset(opts, 0, sizeof(*opts)); @@ -1198,7 +1352,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)  			/* Mutually exclusive option 'all' + subsystem name */  			if (all_ss)  				return -EINVAL; -			set_bit(i, &opts->subsys_mask); +			opts->subsys_mask |= (1 << i);  			one_ss = true;  			break; @@ -1210,12 +1364,12 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)  	/* Consistency checks */  	if (opts->flags & CGRP_ROOT_SANE_BEHAVIOR) { -		pr_warning("cgroup: sane_behavior: this is still under development and its behaviors will change, proceed at your own risk\n"); +		pr_warn("sane_behavior: this is still under development and its behaviors will change, proceed at your own risk\n");  		if ((opts->flags & (CGRP_ROOT_NOPREFIX | CGRP_ROOT_XATTR)) ||  		    opts->cpuset_clone_children || opts->release_agent ||  		    opts->name) { -			pr_err("cgroup: sane_behavior: noprefix, xattr, clone_children, release_agent and name are not allowed\n"); +			pr_err("sane_behavior: noprefix, xattr, clone_children, release_agent and name are not allowed\n");  			return -EINVAL;  		}  	} else { @@ -1227,7 +1381,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)  		if (all_ss || (!one_ss && !opts->none && !opts->name))  			for_each_subsys(ss, i)  				if (!ss->disabled) -					set_bit(i, &opts->subsys_mask); +					opts->subsys_mask |= (1 << i);  		/*  		 * We either have to specify by name or by subsystems. 
(So @@ -1258,14 +1412,13 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data)  	int ret = 0;  	struct cgroup_root *root = cgroup_root_from_kf(kf_root);  	struct cgroup_sb_opts opts; -	unsigned long added_mask, removed_mask; +	unsigned int added_mask, removed_mask;  	if (root->flags & CGRP_ROOT_SANE_BEHAVIOR) { -		pr_err("cgroup: sane_behavior: remount is not allowed\n"); +		pr_err("sane_behavior: remount is not allowed\n");  		return -EINVAL;  	} -	mutex_lock(&cgroup_tree_mutex);  	mutex_lock(&cgroup_mutex);  	/* See what subsystems are wanted */ @@ -1273,17 +1426,17 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data)  	if (ret)  		goto out_unlock; -	if (opts.subsys_mask != root->cgrp.subsys_mask || opts.release_agent) -		pr_warning("cgroup: option changes via remount are deprecated (pid=%d comm=%s)\n", -			   task_tgid_nr(current), current->comm); +	if (opts.subsys_mask != root->subsys_mask || opts.release_agent) +		pr_warn("option changes via remount are deprecated (pid=%d comm=%s)\n", +			task_tgid_nr(current), current->comm); -	added_mask = opts.subsys_mask & ~root->cgrp.subsys_mask; -	removed_mask = root->cgrp.subsys_mask & ~opts.subsys_mask; +	added_mask = opts.subsys_mask & ~root->subsys_mask; +	removed_mask = root->subsys_mask & ~opts.subsys_mask;  	/* Don't allow flags or name to change at remount */  	if (((opts.flags ^ root->flags) & CGRP_ROOT_OPTION_MASK) ||  	    (opts.name && strcmp(opts.name, root->name))) { -		pr_err("cgroup: option or name mismatch, new: 0x%lx \"%s\", old: 0x%lx \"%s\"\n", +		pr_err("option or name mismatch, new: 0x%x \"%s\", old: 0x%x \"%s\"\n",  		       opts.flags & CGRP_ROOT_OPTION_MASK, opts.name ?: "",  		       root->flags & CGRP_ROOT_OPTION_MASK, root->name);  		ret = -EINVAL; @@ -1291,7 +1444,7 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data)  	}  	/* remounting is not allowed for populated hierarchies */ -	if (!list_empty(&root->cgrp.children)) { +	if (!list_empty(&root->cgrp.self.children)) {  		ret = -EBUSY;  		goto out_unlock;  	} @@ -1311,7 +1464,6 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data)  	kfree(opts.release_agent);  	kfree(opts.name);  	mutex_unlock(&cgroup_mutex); -	mutex_unlock(&cgroup_tree_mutex);  	return ret;  } @@ -1369,14 +1521,22 @@ out_unlock:  static void init_cgroup_housekeeping(struct cgroup *cgrp)  { -	atomic_set(&cgrp->refcnt, 1); -	INIT_LIST_HEAD(&cgrp->sibling); -	INIT_LIST_HEAD(&cgrp->children); +	struct cgroup_subsys *ss; +	int ssid; + +	INIT_LIST_HEAD(&cgrp->self.sibling); +	INIT_LIST_HEAD(&cgrp->self.children);  	INIT_LIST_HEAD(&cgrp->cset_links);  	INIT_LIST_HEAD(&cgrp->release_list);  	INIT_LIST_HEAD(&cgrp->pidlists);  	mutex_init(&cgrp->pidlist_mutex); -	cgrp->dummy_css.cgroup = cgrp; +	cgrp->self.cgroup = cgrp; +	cgrp->self.flags |= CSS_ONLINE; + +	for_each_subsys(ss, ssid) +		INIT_LIST_HEAD(&cgrp->e_csets[ssid]); + +	init_waitqueue_head(&cgrp->offline_waitq);  }  static void init_cgroup_root(struct cgroup_root *root, @@ -1399,21 +1559,24 @@ static void init_cgroup_root(struct cgroup_root *root,  		set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags);  } -static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask) +static int cgroup_setup_root(struct cgroup_root *root, unsigned int ss_mask)  {  	LIST_HEAD(tmp_links);  	struct cgroup *root_cgrp = &root->cgrp;  	struct css_set *cset;  	int i, ret; -	lockdep_assert_held(&cgroup_tree_mutex);  	
lockdep_assert_held(&cgroup_mutex); -	ret = idr_alloc(&root->cgroup_idr, root_cgrp, 0, 1, GFP_KERNEL); +	ret = cgroup_idr_alloc(&root->cgroup_idr, root_cgrp, 1, 2, GFP_NOWAIT);  	if (ret < 0)  		goto out;  	root_cgrp->id = ret; +	ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release); +	if (ret) +		goto out; +  	/*  	 * We're accessing css_set_count without locking css_set_rwsem here,  	 * but that's OK - it can only be increased by someone holding @@ -1422,11 +1585,11 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask)  	 */  	ret = allocate_cgrp_cset_links(css_set_count, &tmp_links);  	if (ret) -		goto out; +		goto cancel_ref;  	ret = cgroup_init_root_id(root);  	if (ret) -		goto out; +		goto cancel_ref;  	root->kf_root = kernfs_create_root(&cgroup_kf_syscall_ops,  					   KERNFS_ROOT_CREATE_DEACTIVATED, @@ -1462,7 +1625,7 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask)  		link_css_set(&tmp_links, cset, root_cgrp);  	up_write(&css_set_rwsem); -	BUG_ON(!list_empty(&root_cgrp->children)); +	BUG_ON(!list_empty(&root_cgrp->self.children));  	BUG_ON(atomic_read(&root->nr_cgrps) != 1);  	kernfs_activate(root_cgrp->kn); @@ -1474,6 +1637,8 @@ destroy_root:  	root->kf_root = NULL;  exit_root_id:  	cgroup_exit_root_id(root); +cancel_ref: +	percpu_ref_cancel_init(&root_cgrp->self.refcnt);  out:  	free_cgrp_cset_links(&tmp_links);  	return ret; @@ -1495,8 +1660,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,  	 */  	if (!use_task_css_set_links)  		cgroup_enable_task_cg_lists(); -retry: -	mutex_lock(&cgroup_tree_mutex); +  	mutex_lock(&cgroup_mutex);  	/* First find the desired set of subsystems */ @@ -1535,7 +1699,7 @@ retry:  		 * subsystems) then they must match.  		 */  		if ((opts.subsys_mask || opts.none) && -		    (opts.subsys_mask != root->cgrp.subsys_mask)) { +		    (opts.subsys_mask != root->subsys_mask)) {  			if (!name_match)  				continue;  			ret = -EBUSY; @@ -1544,28 +1708,27 @@ retry:  		if ((root->flags ^ opts.flags) & CGRP_ROOT_OPTION_MASK) {  			if ((root->flags | opts.flags) & CGRP_ROOT_SANE_BEHAVIOR) { -				pr_err("cgroup: sane_behavior: new mount options should match the existing superblock\n"); +				pr_err("sane_behavior: new mount options should match the existing superblock\n");  				ret = -EINVAL;  				goto out_unlock;  			} else { -				pr_warning("cgroup: new mount options do not match the existing superblock, will be ignored\n"); +				pr_warn("new mount options do not match the existing superblock, will be ignored\n");  			}  		}  		/* -		 * A root's lifetime is governed by its root cgroup.  Zero -		 * ref indicate that the root is being destroyed.  Wait for -		 * destruction to complete so that the subsystems are free. -		 * We can use wait_queue for the wait but this path is -		 * super cold.  Let's just sleep for a bit and retry. +		 * A root's lifetime is governed by its root cgroup. +		 * tryget_live failure indicate that the root is being +		 * destroyed.  Wait for destruction to complete so that the +		 * subsystems are free.  We can use wait_queue for the wait +		 * but this path is super cold.  Let's just sleep for a bit +		 * and retry.  		 
*/ -		if (!atomic_inc_not_zero(&root->cgrp.refcnt)) { +		if (!percpu_ref_tryget_live(&root->cgrp.self.refcnt)) {  			mutex_unlock(&cgroup_mutex); -			mutex_unlock(&cgroup_tree_mutex); -			kfree(opts.release_agent); -			kfree(opts.name);  			msleep(10); -			goto retry; +			ret = restart_syscall(); +			goto out_free;  		}  		ret = 0; @@ -1596,15 +1759,15 @@ retry:  out_unlock:  	mutex_unlock(&cgroup_mutex); -	mutex_unlock(&cgroup_tree_mutex); - +out_free:  	kfree(opts.release_agent);  	kfree(opts.name);  	if (ret)  		return ERR_PTR(ret); -	dentry = kernfs_mount(fs_type, flags, root->kf_root, &new_sb); +	dentry = kernfs_mount(fs_type, flags, root->kf_root, +				CGROUP_SUPER_MAGIC, &new_sb);  	if (IS_ERR(dentry) || !new_sb)  		cgroup_put(&root->cgrp);  	return dentry; @@ -1615,7 +1778,19 @@ static void cgroup_kill_sb(struct super_block *sb)  	struct kernfs_root *kf_root = kernfs_root_from_sb(sb);  	struct cgroup_root *root = cgroup_root_from_kf(kf_root); -	cgroup_put(&root->cgrp); +	/* +	 * If @root doesn't have any mounts or children, start killing it. +	 * This prevents new mounts by disabling percpu_ref_tryget_live(). +	 * cgroup_mount() may wait for @root's release. +	 * +	 * And don't kill the default root. +	 */ +	if (css_has_online_children(&root->cgrp.self) || +	    root == &cgrp_dfl_root) +		cgroup_put(&root->cgrp); +	else +		percpu_ref_kill(&root->cgrp.self.refcnt); +  	kernfs_kill_sb(sb);  } @@ -1737,7 +1912,7 @@ struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset)  /**   * cgroup_task_migrate - move a task from one cgroup to another. - * @old_cgrp; the cgroup @tsk is being migrated from + * @old_cgrp: the cgroup @tsk is being migrated from   * @tsk: the task being migrated   * @new_cset: the new css_set @tsk is being attached to   * @@ -1829,10 +2004,6 @@ static void cgroup_migrate_add_src(struct css_set *src_cset,  	src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root); -	/* nothing to do if this cset already belongs to the cgroup */ -	if (src_cgrp == dst_cgrp) -		return; -  	if (!list_empty(&src_cset->mg_preload_node))  		return; @@ -1847,13 +2018,14 @@ static void cgroup_migrate_add_src(struct css_set *src_cset,  /**   * cgroup_migrate_prepare_dst - prepare destination css_sets for migration - * @dst_cgrp: the destination cgroup + * @dst_cgrp: the destination cgroup (may be %NULL)   * @preloaded_csets: list of preloaded source css_sets   *   * Tasks are about to be moved to @dst_cgrp and all the source css_sets   * have been preloaded to @preloaded_csets.  This function looks up and - * pins all destination css_sets, links each to its source, and put them on - * @preloaded_csets. + * pins all destination css_sets, links each to its source, and append them + * to @preloaded_csets.  If @dst_cgrp is %NULL, the destination of each + * source css_set is assumed to be its cgroup on the default hierarchy.   *   * This function must be called after cgroup_migrate_add_src() has been   * called on each migration source css_set.  After migration is performed @@ -1864,19 +2036,42 @@ static int cgroup_migrate_prepare_dst(struct cgroup *dst_cgrp,  				      struct list_head *preloaded_csets)  {  	LIST_HEAD(csets); -	struct css_set *src_cset; +	struct css_set *src_cset, *tmp_cset;  	lockdep_assert_held(&cgroup_mutex); +	/* +	 * Except for the root, child_subsys_mask must be zero for a cgroup +	 * with tasks so that child cgroups don't compete against tasks. 
+	 */ +	if (dst_cgrp && cgroup_on_dfl(dst_cgrp) && cgroup_parent(dst_cgrp) && +	    dst_cgrp->child_subsys_mask) +		return -EBUSY; +  	/* look up the dst cset for each src cset and link it to src */ -	list_for_each_entry(src_cset, preloaded_csets, mg_preload_node) { +	list_for_each_entry_safe(src_cset, tmp_cset, preloaded_csets, mg_preload_node) {  		struct css_set *dst_cset; -		dst_cset = find_css_set(src_cset, dst_cgrp); +		dst_cset = find_css_set(src_cset, +					dst_cgrp ?: src_cset->dfl_cgrp);  		if (!dst_cset)  			goto err;  		WARN_ON_ONCE(src_cset->mg_dst_cset || dst_cset->mg_dst_cset); + +		/* +		 * If src cset equals dst, it's noop.  Drop the src. +		 * cgroup_migrate() will skip the cset too.  Note that we +		 * can't handle src == dst as some nodes are used by both. +		 */ +		if (src_cset == dst_cset) { +			src_cset->mg_src_cgrp = NULL; +			list_del_init(&src_cset->mg_preload_node); +			put_css_set(src_cset, false); +			put_css_set(dst_cset, false); +			continue; +		} +  		src_cset->mg_dst_cset = dst_cset;  		if (list_empty(&dst_cset->mg_preload_node)) @@ -1885,7 +2080,7 @@ static int cgroup_migrate_prepare_dst(struct cgroup *dst_cgrp,  			put_css_set(dst_cset, false);  	} -	list_splice(&csets, preloaded_csets); +	list_splice_tail(&csets, preloaded_csets);  	return 0;  err:  	cgroup_migrate_finish(&csets); @@ -1966,7 +2161,7 @@ static int cgroup_migrate(struct cgroup *cgrp, struct task_struct *leader,  		return 0;  	/* check that we can legitimately attach to the cgroup */ -	for_each_css(css, i, cgrp) { +	for_each_e_css(css, i, cgrp) {  		if (css->ss->can_attach) {  			ret = css->ss->can_attach(css, &tset);  			if (ret) { @@ -1996,7 +2191,7 @@ static int cgroup_migrate(struct cgroup *cgrp, struct task_struct *leader,  	 */  	tset.csets = &tset.dst_csets; -	for_each_css(css, i, cgrp) +	for_each_e_css(css, i, cgrp)  		if (css->ss->attach)  			css->ss->attach(css, &tset); @@ -2004,7 +2199,7 @@ static int cgroup_migrate(struct cgroup *cgrp, struct task_struct *leader,  	goto out_release_tset;  out_cancel_attach: -	for_each_css(css, i, cgrp) { +	for_each_e_css(css, i, cgrp) {  		if (css == failed_css)  			break;  		if (css->ss->cancel_attach) @@ -2063,13 +2258,20 @@ static int cgroup_attach_task(struct cgroup *dst_cgrp,   * function to attach either it or all tasks in its threadgroup. Will lock   * cgroup_mutex and threadgroup.   
*/ -static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup) +static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf, +				    size_t nbytes, loff_t off, bool threadgroup)  {  	struct task_struct *tsk;  	const struct cred *cred = current_cred(), *tcred; +	struct cgroup *cgrp; +	pid_t pid;  	int ret; -	if (!cgroup_lock_live_group(cgrp)) +	if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0) +		return -EINVAL; + +	cgrp = cgroup_kn_lock_live(of->kn); +	if (!cgrp)  		return -ENODEV;  retry_find_task: @@ -2135,8 +2337,8 @@ retry_find_task:  	put_task_struct(tsk);  out_unlock_cgroup: -	mutex_unlock(&cgroup_mutex); -	return ret; +	cgroup_kn_unlock(of->kn); +	return ret ?: nbytes;  }  /** @@ -2170,43 +2372,44 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)  }  EXPORT_SYMBOL_GPL(cgroup_attach_task_all); -static int cgroup_tasks_write(struct cgroup_subsys_state *css, -			      struct cftype *cft, u64 pid) +static ssize_t cgroup_tasks_write(struct kernfs_open_file *of, +				  char *buf, size_t nbytes, loff_t off)  { -	return attach_task_by_pid(css->cgroup, pid, false); +	return __cgroup_procs_write(of, buf, nbytes, off, false);  } -static int cgroup_procs_write(struct cgroup_subsys_state *css, -			      struct cftype *cft, u64 tgid) +static ssize_t cgroup_procs_write(struct kernfs_open_file *of, +				  char *buf, size_t nbytes, loff_t off)  { -	return attach_task_by_pid(css->cgroup, tgid, true); +	return __cgroup_procs_write(of, buf, nbytes, off, true);  } -static int cgroup_release_agent_write(struct cgroup_subsys_state *css, -				      struct cftype *cft, char *buffer) +static ssize_t cgroup_release_agent_write(struct kernfs_open_file *of, +					  char *buf, size_t nbytes, loff_t off)  { -	struct cgroup_root *root = css->cgroup->root; +	struct cgroup *cgrp; -	BUILD_BUG_ON(sizeof(root->release_agent_path) < PATH_MAX); -	if (!cgroup_lock_live_group(css->cgroup)) +	BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX); + +	cgrp = cgroup_kn_lock_live(of->kn); +	if (!cgrp)  		return -ENODEV;  	spin_lock(&release_agent_path_lock); -	strlcpy(root->release_agent_path, buffer, -		sizeof(root->release_agent_path)); +	strlcpy(cgrp->root->release_agent_path, strstrip(buf), +		sizeof(cgrp->root->release_agent_path));  	spin_unlock(&release_agent_path_lock); -	mutex_unlock(&cgroup_mutex); -	return 0; +	cgroup_kn_unlock(of->kn); +	return nbytes;  }  static int cgroup_release_agent_show(struct seq_file *seq, void *v)  {  	struct cgroup *cgrp = seq_css(seq)->cgroup; -	if (!cgroup_lock_live_group(cgrp)) -		return -ENODEV; +	spin_lock(&release_agent_path_lock);  	seq_puts(seq, cgrp->root->release_agent_path); +	spin_unlock(&release_agent_path_lock);  	seq_putc(seq, '\n'); -	mutex_unlock(&cgroup_mutex);  	return 0;  } @@ -2218,6 +2421,320 @@ static int cgroup_sane_behavior_show(struct seq_file *seq, void *v)  	return 0;  } +static void cgroup_print_ss_mask(struct seq_file *seq, unsigned int ss_mask) +{ +	struct cgroup_subsys *ss; +	bool printed = false; +	int ssid; + +	for_each_subsys(ss, ssid) { +		if (ss_mask & (1 << ssid)) { +			if (printed) +				seq_putc(seq, ' '); +			seq_printf(seq, "%s", ss->name); +			printed = true; +		} +	} +	if (printed) +		seq_putc(seq, '\n'); +} + +/* show controllers which are currently attached to the default hierarchy */ +static int cgroup_root_controllers_show(struct seq_file *seq, void *v) +{ +	struct cgroup *cgrp = seq_css(seq)->cgroup; + +	cgroup_print_ss_mask(seq, cgrp->root->subsys_mask & +			     
~cgrp_dfl_root_inhibit_ss_mask); +	return 0; +} + +/* show controllers which are enabled from the parent */ +static int cgroup_controllers_show(struct seq_file *seq, void *v) +{ +	struct cgroup *cgrp = seq_css(seq)->cgroup; + +	cgroup_print_ss_mask(seq, cgroup_parent(cgrp)->child_subsys_mask); +	return 0; +} + +/* show controllers which are enabled for a given cgroup's children */ +static int cgroup_subtree_control_show(struct seq_file *seq, void *v) +{ +	struct cgroup *cgrp = seq_css(seq)->cgroup; + +	cgroup_print_ss_mask(seq, cgrp->child_subsys_mask); +	return 0; +} + +/** + * cgroup_update_dfl_csses - update css assoc of a subtree in default hierarchy + * @cgrp: root of the subtree to update csses for + * + * @cgrp's child_subsys_mask has changed and its subtree's (self excluded) + * css associations need to be updated accordingly.  This function looks up + * all css_sets which are attached to the subtree, creates the matching + * updated css_sets and migrates the tasks to the new ones. + */ +static int cgroup_update_dfl_csses(struct cgroup *cgrp) +{ +	LIST_HEAD(preloaded_csets); +	struct cgroup_subsys_state *css; +	struct css_set *src_cset; +	int ret; + +	lockdep_assert_held(&cgroup_mutex); + +	/* look up all csses currently attached to @cgrp's subtree */ +	down_read(&css_set_rwsem); +	css_for_each_descendant_pre(css, cgroup_css(cgrp, NULL)) { +		struct cgrp_cset_link *link; + +		/* self is not affected by child_subsys_mask change */ +		if (css->cgroup == cgrp) +			continue; + +		list_for_each_entry(link, &css->cgroup->cset_links, cset_link) +			cgroup_migrate_add_src(link->cset, cgrp, +					       &preloaded_csets); +	} +	up_read(&css_set_rwsem); + +	/* NULL dst indicates self on default hierarchy */ +	ret = cgroup_migrate_prepare_dst(NULL, &preloaded_csets); +	if (ret) +		goto out_finish; + +	list_for_each_entry(src_cset, &preloaded_csets, mg_preload_node) { +		struct task_struct *last_task = NULL, *task; + +		/* src_csets precede dst_csets, break on the first dst_cset */ +		if (!src_cset->mg_src_cgrp) +			break; + +		/* +		 * All tasks in src_cset need to be migrated to the +		 * matching dst_cset.  Empty it process by process.  We +		 * walk tasks but migrate processes.  The leader might even +		 * belong to a different cset but such src_cset would also +		 * be among the target src_csets because the default +		 * hierarchy enforces per-process membership. +		 */ +		while (true) { +			down_read(&css_set_rwsem); +			task = list_first_entry_or_null(&src_cset->tasks, +						struct task_struct, cg_list); +			if (task) { +				task = task->group_leader; +				WARN_ON_ONCE(!task_css_set(task)->mg_src_cgrp); +				get_task_struct(task); +			} +			up_read(&css_set_rwsem); + +			if (!task) +				break; + +			/* guard against possible infinite loop */ +			if (WARN(last_task == task, +				 "cgroup: update_dfl_csses failed to make progress, aborting in inconsistent state\n")) +				goto out_finish; +			last_task = task; + +			threadgroup_lock(task); +			/* raced against de_thread() from another thread? 
*/ +			if (!thread_group_leader(task)) { +				threadgroup_unlock(task); +				put_task_struct(task); +				continue; +			} + +			ret = cgroup_migrate(src_cset->dfl_cgrp, task, true); + +			threadgroup_unlock(task); +			put_task_struct(task); + +			if (WARN(ret, "cgroup: failed to update controllers for the default hierarchy (%d), further operations may crash or hang\n", ret)) +				goto out_finish; +		} +	} + +out_finish: +	cgroup_migrate_finish(&preloaded_csets); +	return ret; +} + +/* change the enabled child controllers for a cgroup in the default hierarchy */ +static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, +					    char *buf, size_t nbytes, +					    loff_t off) +{ +	unsigned int enable = 0, disable = 0; +	struct cgroup *cgrp, *child; +	struct cgroup_subsys *ss; +	char *tok; +	int ssid, ret; + +	/* +	 * Parse input - space separated list of subsystem names prefixed +	 * with either + or -. +	 */ +	buf = strstrip(buf); +	while ((tok = strsep(&buf, " "))) { +		if (tok[0] == '\0') +			continue; +		for_each_subsys(ss, ssid) { +			if (ss->disabled || strcmp(tok + 1, ss->name) || +			    ((1 << ss->id) & cgrp_dfl_root_inhibit_ss_mask)) +				continue; + +			if (*tok == '+') { +				enable |= 1 << ssid; +				disable &= ~(1 << ssid); +			} else if (*tok == '-') { +				disable |= 1 << ssid; +				enable &= ~(1 << ssid); +			} else { +				return -EINVAL; +			} +			break; +		} +		if (ssid == CGROUP_SUBSYS_COUNT) +			return -EINVAL; +	} + +	cgrp = cgroup_kn_lock_live(of->kn); +	if (!cgrp) +		return -ENODEV; + +	for_each_subsys(ss, ssid) { +		if (enable & (1 << ssid)) { +			if (cgrp->child_subsys_mask & (1 << ssid)) { +				enable &= ~(1 << ssid); +				continue; +			} + +			/* +			 * Because css offlining is asynchronous, userland +			 * might try to re-enable the same controller while +			 * the previous instance is still around.  In such +			 * cases, wait till it's gone using offline_waitq. +			 */ +			cgroup_for_each_live_child(child, cgrp) { +				DEFINE_WAIT(wait); + +				if (!cgroup_css(child, ss)) +					continue; + +				cgroup_get(child); +				prepare_to_wait(&child->offline_waitq, &wait, +						TASK_UNINTERRUPTIBLE); +				cgroup_kn_unlock(of->kn); +				schedule(); +				finish_wait(&child->offline_waitq, &wait); +				cgroup_put(child); + +				return restart_syscall(); +			} + +			/* unavailable or not enabled on the parent? */ +			if (!(cgrp_dfl_root.subsys_mask & (1 << ssid)) || +			    (cgroup_parent(cgrp) && +			     !(cgroup_parent(cgrp)->child_subsys_mask & (1 << ssid)))) { +				ret = -ENOENT; +				goto out_unlock; +			} +		} else if (disable & (1 << ssid)) { +			if (!(cgrp->child_subsys_mask & (1 << ssid))) { +				disable &= ~(1 << ssid); +				continue; +			} + +			/* a child has it enabled? */ +			cgroup_for_each_live_child(child, cgrp) { +				if (child->child_subsys_mask & (1 << ssid)) { +					ret = -EBUSY; +					goto out_unlock; +				} +			} +		} +	} + +	if (!enable && !disable) { +		ret = 0; +		goto out_unlock; +	} + +	/* +	 * Except for the root, child_subsys_mask must be zero for a cgroup +	 * with tasks so that child cgroups don't compete against tasks. +	 */ +	if (enable && cgroup_parent(cgrp) && !list_empty(&cgrp->cset_links)) { +		ret = -EBUSY; +		goto out_unlock; +	} + +	/* +	 * Create csses for enables and update child_subsys_mask.  This +	 * changes cgroup_e_css() results which in turn makes the +	 * subsequent cgroup_update_dfl_csses() associate all tasks in the +	 * subtree to the updated csses. 
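
For context on what this handler parses: the file accepts a space-separated list of controller names, each prefixed with '+' (enable for children) or '-' (disable). A small userspace sketch of driving it follows; the mount point is purely an assumption for illustration, only the file name and the +/- syntax come from cgroup_subtree_control_write() itself:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	/* hypothetical mount point of the default hierarchy */
	const char *path = "/sys/fs/cgroup/unified/parent/cgroup.subtree_control";
	const char *req = "+memory -blkio";
	int fd = open(path, O_WRONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (write(fd, req, strlen(req)) != (ssize_t)strlen(req))
		perror("write");	/* EINVAL, ENOENT, EBUSY surface here */
	close(fd);
	return 0;
}

On success the handler returns nbytes; enabling fails with -ENOENT when the controller isn't available on the parent and with -EBUSY when the cgroup still has tasks attached.
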
+	 */ +	for_each_subsys(ss, ssid) { +		if (!(enable & (1 << ssid))) +			continue; + +		cgroup_for_each_live_child(child, cgrp) { +			ret = create_css(child, ss); +			if (ret) +				goto err_undo_css; +		} +	} + +	cgrp->child_subsys_mask |= enable; +	cgrp->child_subsys_mask &= ~disable; + +	ret = cgroup_update_dfl_csses(cgrp); +	if (ret) +		goto err_undo_css; + +	/* all tasks are now migrated away from the old csses, kill them */ +	for_each_subsys(ss, ssid) { +		if (!(disable & (1 << ssid))) +			continue; + +		cgroup_for_each_live_child(child, cgrp) +			kill_css(cgroup_css(child, ss)); +	} + +	kernfs_activate(cgrp->kn); +	ret = 0; +out_unlock: +	cgroup_kn_unlock(of->kn); +	return ret ?: nbytes; + +err_undo_css: +	cgrp->child_subsys_mask &= ~enable; +	cgrp->child_subsys_mask |= disable; + +	for_each_subsys(ss, ssid) { +		if (!(enable & (1 << ssid))) +			continue; + +		cgroup_for_each_live_child(child, cgrp) { +			struct cgroup_subsys_state *css = cgroup_css(child, ss); +			if (css) +				kill_css(css); +		} +	} +	goto out_unlock; +} + +static int cgroup_populated_show(struct seq_file *seq, void *v) +{ +	seq_printf(seq, "%d\n", (bool)seq_css(seq)->cgroup->populated_cnt); +	return 0; +} +  static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf,  				 size_t nbytes, loff_t off)  { @@ -2226,6 +2743,9 @@ static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf,  	struct cgroup_subsys_state *css;  	int ret; +	if (cft->write) +		return cft->write(of, buf, nbytes, off); +  	/*  	 * kernfs guarantees that a file isn't deleted with operations in  	 * flight, which means that the matching css is and stays alive and @@ -2236,9 +2756,7 @@ static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf,  	css = cgroup_css(cgrp, cft->ss);  	rcu_read_unlock(); -	if (cft->write_string) { -		ret = cft->write_string(css, cft, strstrip(buf)); -	} else if (cft->write_u64) { +	if (cft->write_u64) {  		unsigned long long v;  		ret = kstrtoull(buf, 0, &v);  		if (!ret) @@ -2248,8 +2766,6 @@ static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf,  		ret = kstrtoll(buf, 0, &v);  		if (!ret)  			ret = cft->write_s64(css, cft, v); -	} else if (cft->trigger) { -		ret = cft->trigger(css, (unsigned int)cft->private);  	} else {  		ret = -EINVAL;  	} @@ -2326,20 +2842,18 @@ static int cgroup_rename(struct kernfs_node *kn, struct kernfs_node *new_parent,  		return -EPERM;  	/* -	 * We're gonna grab cgroup_tree_mutex which nests outside kernfs +	 * We're gonna grab cgroup_mutex which nests outside kernfs  	 * active_ref.  kernfs_rename() doesn't require active_ref -	 * protection.  Break them before grabbing cgroup_tree_mutex. +	 * protection.  Break them before grabbing cgroup_mutex.  	 
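
The dispatch above is why several handlers in this patch move from write_string()/write_u64()/trigger() to a raw ->write() method. A minimal sketch of a handler under the new signature, assuming hypothetical example_write()/example_files[] names:

static ssize_t example_write(struct kernfs_open_file *of,
			     char *buf, size_t nbytes, loff_t off)
{
	struct cgroup_subsys_state *css = of_css(of);
	long val;
	int ret;

	/* ->write gets the raw buffer: strip and parse it yourself */
	ret = kstrtol(strstrip(buf), 0, &val);
	if (ret)
		return ret;

	/* ... apply @val to @css ... */

	return nbytes;	/* success returns the byte count, not 0 */
}

static struct cftype example_files[] = {
	{
		.name = "example.value",
		.write = example_write,
	},
	{ }	/* terminate */
};
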
*/  	kernfs_break_active_protection(new_parent);  	kernfs_break_active_protection(kn); -	mutex_lock(&cgroup_tree_mutex);  	mutex_lock(&cgroup_mutex);  	ret = kernfs_rename(kn, new_parent, new_name_str);  	mutex_unlock(&cgroup_mutex); -	mutex_unlock(&cgroup_tree_mutex);  	kernfs_unbreak_active_protection(kn);  	kernfs_unbreak_active_protection(new_parent); @@ -2377,9 +2891,14 @@ static int cgroup_add_file(struct cgroup *cgrp, struct cftype *cft)  		return PTR_ERR(kn);  	ret = cgroup_kn_set_ugid(kn); -	if (ret) +	if (ret) {  		kernfs_remove(kn); -	return ret; +		return ret; +	} + +	if (cft->seq_show == cgroup_populated_show) +		cgrp->populated_kn = kn; +	return 0;  }  /** @@ -2399,7 +2918,7 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],  	struct cftype *cft;  	int ret; -	lockdep_assert_held(&cgroup_tree_mutex); +	lockdep_assert_held(&cgroup_mutex);  	for (cft = cfts; cft->name[0] != '\0'; cft++) {  		/* does cft->flags tell us to skip this file on @cgrp? */ @@ -2407,16 +2926,16 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],  			continue;  		if ((cft->flags & CFTYPE_INSANE) && cgroup_sane_behavior(cgrp))  			continue; -		if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgrp->parent) +		if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgroup_parent(cgrp))  			continue; -		if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgrp->parent) +		if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgroup_parent(cgrp))  			continue;  		if (is_add) {  			ret = cgroup_add_file(cgrp, cft);  			if (ret) { -				pr_warn("cgroup_addrm_files: failed to add %s, err=%d\n", -					cft->name, ret); +				pr_warn("%s: failed to add %s, err=%d\n", +					__func__, cft->name, ret);  				return ret;  			}  		} else { @@ -2434,11 +2953,7 @@ static int cgroup_apply_cftypes(struct cftype *cfts, bool is_add)  	struct cgroup_subsys_state *css;  	int ret = 0; -	lockdep_assert_held(&cgroup_tree_mutex); - -	/* don't bother if @ss isn't attached */ -	if (ss->root == &cgrp_dfl_root) -		return 0; +	lockdep_assert_held(&cgroup_mutex);  	/* add/rm files for all cgroups created before */  	css_for_each_descendant_pre(css, cgroup_css(root, ss)) { @@ -2506,7 +3021,7 @@ static int cgroup_init_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)  static int cgroup_rm_cftypes_locked(struct cftype *cfts)  { -	lockdep_assert_held(&cgroup_tree_mutex); +	lockdep_assert_held(&cgroup_mutex);  	if (!cfts || !cfts[0].ss)  		return -ENOENT; @@ -2532,9 +3047,9 @@ int cgroup_rm_cftypes(struct cftype *cfts)  {  	int ret; -	mutex_lock(&cgroup_tree_mutex); +	mutex_lock(&cgroup_mutex);  	ret = cgroup_rm_cftypes_locked(cfts); -	mutex_unlock(&cgroup_tree_mutex); +	mutex_unlock(&cgroup_mutex);  	return ret;  } @@ -2556,6 +3071,9 @@ int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)  {  	int ret; +	if (ss->disabled) +		return 0; +  	if (!cfts || cfts[0].name[0] == '\0')  		return 0; @@ -2563,14 +3081,14 @@ int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)  	if (ret)  		return ret; -	mutex_lock(&cgroup_tree_mutex); +	mutex_lock(&cgroup_mutex);  	list_add_tail(&cfts->node, &ss->cfts);  	ret = cgroup_apply_cftypes(cfts, true);  	if (ret)  		cgroup_rm_cftypes_locked(cfts); -	mutex_unlock(&cgroup_tree_mutex); +	mutex_unlock(&cgroup_mutex);  	return ret;  } @@ -2594,57 +3112,65 @@ static int cgroup_task_count(const struct cgroup *cgrp)  /**   * css_next_child - find the next child of a given css - * @pos_css: the current position (%NULL to initiate traversal) - * @parent_css: css whose children to 
walk + * @pos: the current position (%NULL to initiate traversal) + * @parent: css whose children to walk   * - * This function returns the next child of @parent_css and should be called + * This function returns the next child of @parent and should be called   * under either cgroup_mutex or RCU read lock.  The only requirement is - * that @parent_css and @pos_css are accessible.  The next sibling is - * guaranteed to be returned regardless of their states. + * that @parent and @pos are accessible.  The next sibling is guaranteed to + * be returned regardless of their states. + * + * If a subsystem synchronizes ->css_online() and the start of iteration, a + * css which finished ->css_online() is guaranteed to be visible in the + * future iterations and will stay visible until the last reference is put. + * A css which hasn't finished ->css_online() or already finished + * ->css_offline() may show up during traversal.  It's each subsystem's + * responsibility to synchronize against on/offlining.   */ -struct cgroup_subsys_state * -css_next_child(struct cgroup_subsys_state *pos_css, -	       struct cgroup_subsys_state *parent_css) +struct cgroup_subsys_state *css_next_child(struct cgroup_subsys_state *pos, +					   struct cgroup_subsys_state *parent)  { -	struct cgroup *pos = pos_css ? pos_css->cgroup : NULL; -	struct cgroup *cgrp = parent_css->cgroup; -	struct cgroup *next; +	struct cgroup_subsys_state *next; -	cgroup_assert_mutexes_or_rcu_locked(); +	cgroup_assert_mutex_or_rcu_locked();  	/* -	 * @pos could already have been removed.  Once a cgroup is removed, -	 * its ->sibling.next is no longer updated when its next sibling -	 * changes.  As CGRP_DEAD assertion is serialized and happens -	 * before the cgroup is taken off the ->sibling list, if we see it -	 * unasserted, it's guaranteed that the next sibling hasn't -	 * finished its grace period even if it's already removed, and thus -	 * safe to dereference from this RCU critical section.  If -	 * ->sibling.next is inaccessible, cgroup_is_dead() is guaranteed -	 * to be visible as %true here. +	 * @pos could already have been unlinked from the sibling list. +	 * Once a cgroup is removed, its ->sibling.next is no longer +	 * updated when its next sibling changes.  CSS_RELEASED is set when +	 * @pos is taken off list, at which time its next pointer is valid, +	 * and, as releases are serialized, the one pointed to by the next +	 * pointer is guaranteed to not have started release yet.  This +	 * implies that if we observe !CSS_RELEASED on @pos in this RCU +	 * critical section, the one pointed to by its next pointer is +	 * guaranteed to not have finished its RCU grace period even if we +	 * have dropped rcu_read_lock() inbetween iterations.  	 * -	 * If @pos is dead, its next pointer can't be dereferenced; -	 * however, as each cgroup is given a monotonically increasing -	 * unique serial number and always appended to the sibling list, -	 * the next one can be found by walking the parent's children until -	 * we see a cgroup with higher serial number than @pos's.  While -	 * this path can be slower, it's taken only when either the current -	 * cgroup is removed or iteration and removal race. +	 * If @pos has CSS_RELEASED set, its next pointer can't be +	 * dereferenced; however, as each css is given a monotonically +	 * increasing unique serial number and always appended to the +	 * sibling list, the next one can be found by walking the parent's +	 * children until the first css with higher serial number than +	 * @pos's.  
While this path can be slower, it happens iff iteration +	 * races against release and the race window is very small.  	 */  	if (!pos) { -		next = list_entry_rcu(cgrp->children.next, struct cgroup, sibling); -	} else if (likely(!cgroup_is_dead(pos))) { -		next = list_entry_rcu(pos->sibling.next, struct cgroup, sibling); +		next = list_entry_rcu(parent->children.next, struct cgroup_subsys_state, sibling); +	} else if (likely(!(pos->flags & CSS_RELEASED))) { +		next = list_entry_rcu(pos->sibling.next, struct cgroup_subsys_state, sibling);  	} else { -		list_for_each_entry_rcu(next, &cgrp->children, sibling) +		list_for_each_entry_rcu(next, &parent->children, sibling)  			if (next->serial_nr > pos->serial_nr)  				break;  	} -	if (&next->sibling == &cgrp->children) -		return NULL; - -	return cgroup_css(next, parent_css->ss); +	/* +	 * @next, if not pointing to the head, can be dereferenced and is +	 * the next sibling. +	 */ +	if (&next->sibling != &parent->children) +		return next; +	return NULL;  }  /** @@ -2660,6 +3186,13 @@ css_next_child(struct cgroup_subsys_state *pos_css,   * doesn't require the whole traversal to be contained in a single critical   * section.  This function will return the correct next descendant as long   * as both @pos and @root are accessible and @pos is a descendant of @root. + * + * If a subsystem synchronizes ->css_online() and the start of iteration, a + * css which finished ->css_online() is guaranteed to be visible in the + * future iterations and will stay visible until the last reference is put. + * A css which hasn't finished ->css_online() or already finished + * ->css_offline() may show up during traversal.  It's each subsystem's + * responsibility to synchronize against on/offlining.   */  struct cgroup_subsys_state *  css_next_descendant_pre(struct cgroup_subsys_state *pos, @@ -2667,7 +3200,7 @@ css_next_descendant_pre(struct cgroup_subsys_state *pos,  {  	struct cgroup_subsys_state *next; -	cgroup_assert_mutexes_or_rcu_locked(); +	cgroup_assert_mutex_or_rcu_locked();  	/* if first iteration, visit @root */  	if (!pos) @@ -2680,10 +3213,10 @@ css_next_descendant_pre(struct cgroup_subsys_state *pos,  	/* no child, visit my or the closest ancestor's next sibling */  	while (pos != root) { -		next = css_next_child(pos, css_parent(pos)); +		next = css_next_child(pos, pos->parent);  		if (next)  			return next; -		pos = css_parent(pos); +		pos = pos->parent;  	}  	return NULL; @@ -2707,7 +3240,7 @@ css_rightmost_descendant(struct cgroup_subsys_state *pos)  {  	struct cgroup_subsys_state *last, *tmp; -	cgroup_assert_mutexes_or_rcu_locked(); +	cgroup_assert_mutex_or_rcu_locked();  	do {  		last = pos; @@ -2747,6 +3280,13 @@ css_leftmost_descendant(struct cgroup_subsys_state *pos)   * section.  This function will return the correct next descendant as long   * as both @pos and @cgroup are accessible and @pos is a descendant of   * @cgroup. + * + * If a subsystem synchronizes ->css_online() and the start of iteration, a + * css which finished ->css_online() is guaranteed to be visible in the + * future iterations and will stay visible until the last reference is put. + * A css which hasn't finished ->css_online() or already finished + * ->css_offline() may show up during traversal.  It's each subsystem's + * responsibility to synchronize against on/offlining.   
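
The serial-number fallback relies on only two invariants: siblings are append-only and every new css gets a strictly larger serial_nr, so a walk can resume past a released position by skipping to the first survivor younger than it. The same idiom works for any RCU-managed, append-only list; a generic sketch with a hypothetical struct node:

#include <linux/rculist.h>
#include <linux/types.h>

struct node {
	struct list_head	sibling;	/* append-only, RCU-managed */
	u64			serial_nr;	/* strictly increasing */
	bool			released;	/* set when unlinked */
};

/* caller holds rcu_read_lock(); @pos == NULL starts the walk */
static struct node *next_node(struct node *pos, struct list_head *head)
{
	struct node *n;

	if (!pos) {
		n = list_entry_rcu(head->next, struct node, sibling);
	} else if (likely(!pos->released)) {
		n = list_entry_rcu(pos->sibling.next, struct node, sibling);
	} else {
		/* @pos was unlinked; resume at its first younger sibling */
		list_for_each_entry_rcu(n, head, sibling)
			if (n->serial_nr > pos->serial_nr)
				break;
	}

	return &n->sibling != head ? n : NULL;
}
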
*/  struct cgroup_subsys_state *  css_next_descendant_post(struct cgroup_subsys_state *pos, @@ -2754,7 +3294,7 @@ css_next_descendant_post(struct cgroup_subsys_state *pos,  {  	struct cgroup_subsys_state *next; -	cgroup_assert_mutexes_or_rcu_locked(); +	cgroup_assert_mutex_or_rcu_locked();  	/* if first iteration, visit leftmost descendant which may be @root */  	if (!pos) @@ -2765,12 +3305,36 @@ css_next_descendant_post(struct cgroup_subsys_state *pos,  		return NULL;  	/* if there's an unvisited sibling, visit its leftmost descendant */ -	next = css_next_child(pos, css_parent(pos)); +	next = css_next_child(pos, pos->parent);  	if (next)  		return css_leftmost_descendant(next);  	/* no sibling left, visit parent */ -	return css_parent(pos); +	return pos->parent; +} + +/** + * css_has_online_children - does a css have online children + * @css: the target css + * + * Returns %true if @css has any online children; otherwise, %false.  This + * function can be called from any context but the caller is responsible + * for synchronizing against on/offlining as necessary. + */ +bool css_has_online_children(struct cgroup_subsys_state *css) +{ +	struct cgroup_subsys_state *child; +	bool ret = false; + +	rcu_read_lock(); +	css_for_each_child(child, css) { +		if (css->flags & CSS_ONLINE) { +			ret = true; +			break; +		} +	} +	rcu_read_unlock(); +	return ret;  }  /** @@ -2781,27 +3345,36 @@ css_next_descendant_post(struct cgroup_subsys_state *pos,   */  static void css_advance_task_iter(struct css_task_iter *it)  { -	struct list_head *l = it->cset_link; +	struct list_head *l = it->cset_pos;  	struct cgrp_cset_link *link;  	struct css_set *cset;  	/* Advance to the next non-empty css_set */  	do {  		l = l->next; -		if (l == &it->origin_css->cgroup->cset_links) { -			it->cset_link = NULL; +		if (l == it->cset_head) { +			it->cset_pos = NULL;  			return;  		} -		link = list_entry(l, struct cgrp_cset_link, cset_link); -		cset = link->cset; + +		if (it->ss) { +			cset = container_of(l, struct css_set, +					    e_cset_node[it->ss->id]); +		} else { +			link = list_entry(l, struct cgrp_cset_link, cset_link); +			cset = link->cset; +		}  	} while (list_empty(&cset->tasks) && list_empty(&cset->mg_tasks)); -	it->cset_link = l; +	it->cset_pos = l;  	if (!list_empty(&cset->tasks)) -		it->task = cset->tasks.next; +		it->task_pos = cset->tasks.next;  	else -		it->task = cset->mg_tasks.next; +		it->task_pos = cset->mg_tasks.next; + +	it->tasks_head = &cset->tasks; +	it->mg_tasks_head = &cset->mg_tasks;  }  /** @@ -2827,8 +3400,14 @@ void css_task_iter_start(struct cgroup_subsys_state *css,  	down_read(&css_set_rwsem); -	it->origin_css = css; -	it->cset_link = &css->cgroup->cset_links; +	it->ss = css->ss; + +	if (it->ss) +		it->cset_pos = &css->cgroup->e_csets[css->ss->id]; +	else +		it->cset_pos = &css->cgroup->cset_links; + +	it->cset_head = it->cset_pos;  	css_advance_task_iter(it);  } @@ -2844,12 +3423,10 @@ void css_task_iter_start(struct cgroup_subsys_state *css,  struct task_struct *css_task_iter_next(struct css_task_iter *it)  {  	struct task_struct *res; -	struct list_head *l = it->task; -	struct cgrp_cset_link *link = list_entry(it->cset_link, -					struct cgrp_cset_link, cset_link); +	struct list_head *l = it->task_pos;  	/* If the iterator cg is NULL, we have no tasks */ -	if (!it->cset_link) +	if (!it->cset_pos)  		return NULL;  	res = list_entry(l, struct task_struct, cg_list); @@ -2860,13 +3437,13 @@ struct task_struct *css_task_iter_next(struct css_task_iter *it)  	 */  	l = l->next; -	if (l == 
&link->cset->tasks) -		l = link->cset->mg_tasks.next; +	if (l == it->tasks_head) +		l = it->mg_tasks_head->next; -	if (l == &link->cset->mg_tasks) +	if (l == it->mg_tasks_head)  		css_advance_task_iter(it);  	else -		it->task = l; +		it->task_pos = l;  	return res;  } @@ -2919,7 +3496,7 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)  	 * ->can_attach() fails.  	 */  	do { -		css_task_iter_start(&from->dummy_css, &it); +		css_task_iter_start(&from->self, &it);  		task = css_task_iter_next(&it);  		if (task)  			get_task_struct(task); @@ -3184,7 +3761,7 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,  	if (!array)  		return -ENOMEM;  	/* now, populate the array */ -	css_task_iter_start(&cgrp->dummy_css, &it); +	css_task_iter_start(&cgrp->self, &it);  	while ((tsk = css_task_iter_next(&it))) {  		if (unlikely(n == length))  			break; @@ -3246,7 +3823,7 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)  	/*  	 * We aren't being called from kernfs and there's no guarantee on -	 * @kn->priv's validity.  For this and css_tryget_from_dir(), +	 * @kn->priv's validity.  For this and css_tryget_online_from_dir(),  	 * @kn->priv is RCU safe.  Let's do the RCU dancing.  	 */  	rcu_read_lock(); @@ -3258,7 +3835,7 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)  	}  	rcu_read_unlock(); -	css_task_iter_start(&cgrp->dummy_css, &it); +	css_task_iter_start(&cgrp->self, &it);  	while ((tsk = css_task_iter_next(&it))) {  		switch (tsk->state) {  		case TASK_RUNNING: @@ -3388,17 +3965,6 @@ static int cgroup_pidlist_show(struct seq_file *s, void *v)  	return seq_printf(s, "%d\n", *(int *)v);  } -/* - * seq_operations functions for iterating on pidlists through seq_file - - * independent of whether it's tasks or procs - */ -static const struct seq_operations cgroup_pidlist_seq_operations = { -	.start = cgroup_pidlist_start, -	.stop = cgroup_pidlist_stop, -	.next = cgroup_pidlist_next, -	.show = cgroup_pidlist_show, -}; -  static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css,  					 struct cftype *cft)  { @@ -3440,7 +4006,7 @@ static struct cftype cgroup_base_files[] = {  		.seq_stop = cgroup_pidlist_stop,  		.seq_show = cgroup_pidlist_show,  		.private = CGROUP_FILE_PROCS, -		.write_u64 = cgroup_procs_write, +		.write = cgroup_procs_write,  		.mode = S_IRUGO | S_IWUSR,  	},  	{ @@ -3454,6 +4020,27 @@ static struct cftype cgroup_base_files[] = {  		.flags = CFTYPE_ONLY_ON_ROOT,  		.seq_show = cgroup_sane_behavior_show,  	}, +	{ +		.name = "cgroup.controllers", +		.flags = CFTYPE_ONLY_ON_DFL | CFTYPE_ONLY_ON_ROOT, +		.seq_show = cgroup_root_controllers_show, +	}, +	{ +		.name = "cgroup.controllers", +		.flags = CFTYPE_ONLY_ON_DFL | CFTYPE_NOT_ON_ROOT, +		.seq_show = cgroup_controllers_show, +	}, +	{ +		.name = "cgroup.subtree_control", +		.flags = CFTYPE_ONLY_ON_DFL, +		.seq_show = cgroup_subtree_control_show, +		.write = cgroup_subtree_control_write, +	}, +	{ +		.name = "cgroup.populated", +		.flags = CFTYPE_ONLY_ON_DFL | CFTYPE_NOT_ON_ROOT, +		.seq_show = cgroup_populated_show, +	},  	/*  	 * Historical crazy stuff.  These don't have "cgroup."  
prefix and @@ -3468,7 +4055,7 @@ static struct cftype cgroup_base_files[] = {  		.seq_stop = cgroup_pidlist_stop,  		.seq_show = cgroup_pidlist_show,  		.private = CGROUP_FILE_TASKS, -		.write_u64 = cgroup_tasks_write, +		.write = cgroup_tasks_write,  		.mode = S_IRUGO | S_IWUSR,  	},  	{ @@ -3481,7 +4068,7 @@ static struct cftype cgroup_base_files[] = {  		.name = "release_agent",  		.flags = CFTYPE_INSANE | CFTYPE_ONLY_ON_ROOT,  		.seq_show = cgroup_release_agent_show, -		.write_string = cgroup_release_agent_write, +		.write = cgroup_release_agent_write,  		.max_write_len = PATH_MAX - 1,  	},  	{ }	/* terminate */ @@ -3494,7 +4081,7 @@ static struct cftype cgroup_base_files[] = {   *   * On failure, no file is added.   */ -static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask) +static int cgroup_populate_dir(struct cgroup *cgrp, unsigned int subsys_mask)  {  	struct cgroup_subsys *ss;  	int i, ret = 0; @@ -3503,7 +4090,7 @@ static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask)  	for_each_subsys(ss, i) {  		struct cftype *cfts; -		if (!test_bit(i, &subsys_mask)) +		if (!(subsys_mask & (1 << i)))  			continue;  		list_for_each_entry(cfts, &ss->cfts, node) { @@ -3525,9 +4112,9 @@ err:   *    Implemented in kill_css().   *   * 2. When the percpu_ref is confirmed to be visible as killed on all CPUs - *    and thus css_tryget() is guaranteed to fail, the css can be offlined - *    by invoking offline_css().  After offlining, the base ref is put. - *    Implemented in css_killed_work_fn(). + *    and thus css_tryget_online() is guaranteed to fail, the css can be + *    offlined by invoking offline_css().  After offlining, the base ref is + *    put.  Implemented in css_killed_work_fn().   *   * 3. When the percpu_ref reaches zero, the only possible remaining   *    accessors are inside RCU read sections.  css_release() schedules the @@ -3546,11 +4133,37 @@ static void css_free_work_fn(struct work_struct *work)  		container_of(work, struct cgroup_subsys_state, destroy_work);  	struct cgroup *cgrp = css->cgroup; -	if (css->parent) -		css_put(css->parent); +	if (css->ss) { +		/* css free path */ +		if (css->parent) +			css_put(css->parent); -	css->ss->css_free(css); -	cgroup_put(cgrp); +		css->ss->css_free(css); +		cgroup_put(cgrp); +	} else { +		/* cgroup free path */ +		atomic_dec(&cgrp->root->nr_cgrps); +		cgroup_pidlist_destroy_all(cgrp); + +		if (cgroup_parent(cgrp)) { +			/* +			 * We get a ref to the parent, and put the ref when +			 * this cgroup is being freed, so it's guaranteed +			 * that the parent won't be destroyed before its +			 * children. +			 */ +			cgroup_put(cgroup_parent(cgrp)); +			kernfs_put(cgrp->kn); +			kfree(cgrp); +		} else { +			/* +			 * This is root cgroup's refcnt reaching zero, +			 * which indicates that the root should be +			 * released. 
+			 */ +			cgroup_destroy_root(cgrp->root); +		} +	}  }  static void css_free_rcu_fn(struct rcu_head *rcu_head) @@ -3562,26 +4175,59 @@ static void css_free_rcu_fn(struct rcu_head *rcu_head)  	queue_work(cgroup_destroy_wq, &css->destroy_work);  } +static void css_release_work_fn(struct work_struct *work) +{ +	struct cgroup_subsys_state *css = +		container_of(work, struct cgroup_subsys_state, destroy_work); +	struct cgroup_subsys *ss = css->ss; +	struct cgroup *cgrp = css->cgroup; + +	mutex_lock(&cgroup_mutex); + +	css->flags |= CSS_RELEASED; +	list_del_rcu(&css->sibling); + +	if (ss) { +		/* css release path */ +		cgroup_idr_remove(&ss->css_idr, css->id); +	} else { +		/* cgroup release path */ +		cgroup_idr_remove(&cgrp->root->cgroup_idr, cgrp->id); +		cgrp->id = -1; +	} + +	mutex_unlock(&cgroup_mutex); + +	call_rcu(&css->rcu_head, css_free_rcu_fn); +} +  static void css_release(struct percpu_ref *ref)  {  	struct cgroup_subsys_state *css =  		container_of(ref, struct cgroup_subsys_state, refcnt); -	RCU_INIT_POINTER(css->cgroup->subsys[css->ss->id], NULL); -	call_rcu(&css->rcu_head, css_free_rcu_fn); +	INIT_WORK(&css->destroy_work, css_release_work_fn); +	queue_work(cgroup_destroy_wq, &css->destroy_work);  } -static void init_css(struct cgroup_subsys_state *css, struct cgroup_subsys *ss, -		     struct cgroup *cgrp) +static void init_and_link_css(struct cgroup_subsys_state *css, +			      struct cgroup_subsys *ss, struct cgroup *cgrp)  { +	lockdep_assert_held(&cgroup_mutex); + +	cgroup_get(cgrp); + +	memset(css, 0, sizeof(*css));  	css->cgroup = cgrp;  	css->ss = ss; -	css->flags = 0; +	INIT_LIST_HEAD(&css->sibling); +	INIT_LIST_HEAD(&css->children); +	css->serial_nr = css_serial_nr_next++; -	if (cgrp->parent) -		css->parent = cgroup_css(cgrp->parent, ss); -	else -		css->flags |= CSS_ROOT; +	if (cgroup_parent(cgrp)) { +		css->parent = cgroup_css(cgroup_parent(cgrp), ss); +		css_get(css->parent); +	}  	BUG_ON(cgroup_css(cgrp, ss));  } @@ -3592,14 +4238,12 @@ static int online_css(struct cgroup_subsys_state *css)  	struct cgroup_subsys *ss = css->ss;  	int ret = 0; -	lockdep_assert_held(&cgroup_tree_mutex);  	lockdep_assert_held(&cgroup_mutex);  	if (ss->css_online)  		ret = ss->css_online(css);  	if (!ret) {  		css->flags |= CSS_ONLINE; -		css->cgroup->nr_css++;  		rcu_assign_pointer(css->cgroup->subsys[ss->id], css);  	}  	return ret; @@ -3610,7 +4254,6 @@ static void offline_css(struct cgroup_subsys_state *css)  {  	struct cgroup_subsys *ss = css->ss; -	lockdep_assert_held(&cgroup_tree_mutex);  	lockdep_assert_held(&cgroup_mutex);  	if (!(css->flags & CSS_ONLINE)) @@ -3620,8 +4263,9 @@ static void offline_css(struct cgroup_subsys_state *css)  		ss->css_offline(css);  	css->flags &= ~CSS_ONLINE; -	css->cgroup->nr_css--; -	RCU_INIT_POINTER(css->cgroup->subsys[ss->id], css); +	RCU_INIT_POINTER(css->cgroup->subsys[ss->id], NULL); + +	wake_up_all(&css->cgroup->offline_waitq);  }  /** @@ -3635,111 +4279,102 @@ static void offline_css(struct cgroup_subsys_state *css)   */  static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss)  { -	struct cgroup *parent = cgrp->parent; +	struct cgroup *parent = cgroup_parent(cgrp); +	struct cgroup_subsys_state *parent_css = cgroup_css(parent, ss);  	struct cgroup_subsys_state *css;  	int err;  	lockdep_assert_held(&cgroup_mutex); -	css = ss->css_alloc(cgroup_css(parent, ss)); +	css = ss->css_alloc(parent_css);  	if (IS_ERR(css))  		return PTR_ERR(css); +	init_and_link_css(css, ss, cgrp); +  	err = percpu_ref_init(&css->refcnt, css_release);  	if 
(err)  		goto err_free_css; -	init_css(css, ss, cgrp); +	err = cgroup_idr_alloc(&ss->css_idr, NULL, 2, 0, GFP_NOWAIT); +	if (err < 0) +		goto err_free_percpu_ref; +	css->id = err;  	err = cgroup_populate_dir(cgrp, 1 << ss->id);  	if (err) -		goto err_free_percpu_ref; +		goto err_free_id; + +	/* @css is ready to be brought online now, make it visible */ +	list_add_tail_rcu(&css->sibling, &parent_css->children); +	cgroup_idr_replace(&ss->css_idr, css, css->id);  	err = online_css(css);  	if (err) -		goto err_clear_dir; - -	cgroup_get(cgrp); -	css_get(css->parent); - -	cgrp->subsys_mask |= 1 << ss->id; +		goto err_list_del;  	if (ss->broken_hierarchy && !ss->warned_broken_hierarchy && -	    parent->parent) { -		pr_warning("cgroup: %s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n", -			   current->comm, current->pid, ss->name); +	    cgroup_parent(parent)) { +		pr_warn("%s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n", +			current->comm, current->pid, ss->name);  		if (!strcmp(ss->name, "memory")) -			pr_warning("cgroup: \"memory\" requires setting use_hierarchy to 1 on the root.\n"); +			pr_warn("\"memory\" requires setting use_hierarchy to 1 on the root\n");  		ss->warned_broken_hierarchy = true;  	}  	return 0; -err_clear_dir: +err_list_del: +	list_del_rcu(&css->sibling);  	cgroup_clear_dir(css->cgroup, 1 << css->ss->id); +err_free_id: +	cgroup_idr_remove(&ss->css_idr, css->id);  err_free_percpu_ref:  	percpu_ref_cancel_init(&css->refcnt);  err_free_css: -	ss->css_free(css); +	call_rcu(&css->rcu_head, css_free_rcu_fn);  	return err;  } -/** - * cgroup_create - create a cgroup - * @parent: cgroup that will be parent of the new cgroup - * @name: name of the new cgroup - * @mode: mode to set on new cgroup - */ -static long cgroup_create(struct cgroup *parent, const char *name, -			  umode_t mode) +static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, +			umode_t mode)  { -	struct cgroup *cgrp; -	struct cgroup_root *root = parent->root; -	int ssid, err; +	struct cgroup *parent, *cgrp; +	struct cgroup_root *root;  	struct cgroup_subsys *ss;  	struct kernfs_node *kn; +	int ssid, ret; -	/* -	 * XXX: The default hierarchy isn't fully implemented yet.  Block -	 * !root cgroup creation on it for now. -	 */ -	if (root == &cgrp_dfl_root) -		return -EINVAL; +	parent = cgroup_kn_lock_live(parent_kn); +	if (!parent) +		return -ENODEV; +	root = parent->root;  	/* allocate the cgroup and its ID, 0 is reserved for the root */  	cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL); -	if (!cgrp) -		return -ENOMEM; - -	mutex_lock(&cgroup_tree_mutex); - -	/* -	 * Only live parents can have children.  Note that the liveliness -	 * check isn't strictly necessary because cgroup_mkdir() and -	 * cgroup_rmdir() are fully synchronized by i_mutex; however, do it -	 * anyway so that locking is contained inside cgroup proper and we -	 * don't get nasty surprises if we ever grow another caller. -	 */ -	if (!cgroup_lock_live_group(parent)) { -		err = -ENODEV; -		goto err_unlock_tree; +	if (!cgrp) { +		ret = -ENOMEM; +		goto out_unlock;  	} +	ret = percpu_ref_init(&cgrp->self.refcnt, css_release); +	if (ret) +		goto out_free_cgrp; +  	/*  	 * Temporarily set the pointer to NULL, so idr_find() won't return  	 * a half-baked cgroup.  	 
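
The reserve-then-publish idiom used here, and for css->id in create_css() above, keeps lookups from ever seeing a partially initialised object: the ID is allocated against NULL and only pointed at the object once it is ready. A generic sketch of the pattern, with my_obj/my_idr as placeholder names:

#include <linux/idr.h>
#include <linux/slab.h>

struct my_obj {
	int id;
	/* ... */
};

static DEFINE_IDR(my_idr);		/* lookups use idr_find(&my_idr, id) */

static struct my_obj *my_obj_create(void)
{
	struct my_obj *obj = kzalloc(sizeof(*obj), GFP_KERNEL);
	int id;

	if (!obj)
		return NULL;

	/* reserve an ID but leave it pointing at NULL for now */
	id = idr_alloc(&my_idr, NULL, 1, 0, GFP_KERNEL);
	if (id < 0) {
		kfree(obj);
		return NULL;
	}
	obj->id = id;

	/* ... finish initialising obj; concurrent idr_find() sees NULL ... */

	/* publish: from here on lookups return a fully constructed object */
	idr_replace(&my_idr, obj, id);
	return obj;
}
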
*/ -	cgrp->id = idr_alloc(&root->cgroup_idr, NULL, 1, 0, GFP_KERNEL); +	cgrp->id = cgroup_idr_alloc(&root->cgroup_idr, NULL, 2, 0, GFP_NOWAIT);  	if (cgrp->id < 0) { -		err = -ENOMEM; -		goto err_unlock; +		ret = -ENOMEM; +		goto out_cancel_ref;  	}  	init_cgroup_housekeeping(cgrp); -	cgrp->parent = parent; -	cgrp->dummy_css.parent = &parent->dummy_css; -	cgrp->root = parent->root; +	cgrp->self.parent = &parent->self; +	cgrp->root = root;  	if (notify_on_release(parent))  		set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); @@ -3750,8 +4385,8 @@ static long cgroup_create(struct cgroup *parent, const char *name,  	/* create the directory */  	kn = kernfs_create_dir(parent->kn, name, mode, cgrp);  	if (IS_ERR(kn)) { -		err = PTR_ERR(kn); -		goto err_free_id; +		ret = PTR_ERR(kn); +		goto out_free_id;  	}  	cgrp->kn = kn; @@ -3761,10 +4396,10 @@ static long cgroup_create(struct cgroup *parent, const char *name,  	 */  	kernfs_get(kn); -	cgrp->serial_nr = cgroup_serial_nr_next++; +	cgrp->self.serial_nr = css_serial_nr_next++;  	/* allocation complete, commit to creation */ -	list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children); +	list_add_tail_rcu(&cgrp->self.sibling, &cgroup_parent(cgrp)->self.children);  	atomic_inc(&root->nr_cgrps);  	cgroup_get(parent); @@ -3772,107 +4407,66 @@ static long cgroup_create(struct cgroup *parent, const char *name,  	 * @cgrp is now fully operational.  If something fails after this  	 * point, it'll be released via the normal destruction path.  	 */ -	idr_replace(&root->cgroup_idr, cgrp, cgrp->id); +	cgroup_idr_replace(&root->cgroup_idr, cgrp, cgrp->id); -	err = cgroup_kn_set_ugid(kn); -	if (err) -		goto err_destroy; +	ret = cgroup_kn_set_ugid(kn); +	if (ret) +		goto out_destroy; -	err = cgroup_addrm_files(cgrp, cgroup_base_files, true); -	if (err) -		goto err_destroy; +	ret = cgroup_addrm_files(cgrp, cgroup_base_files, true); +	if (ret) +		goto out_destroy;  	/* let's create and online css's */  	for_each_subsys(ss, ssid) { -		if (root->cgrp.subsys_mask & (1 << ssid)) { -			err = create_css(cgrp, ss); -			if (err) -				goto err_destroy; +		if (parent->child_subsys_mask & (1 << ssid)) { +			ret = create_css(cgrp, ss); +			if (ret) +				goto out_destroy;  		}  	} -	kernfs_activate(kn); +	/* +	 * On the default hierarchy, a child doesn't automatically inherit +	 * child_subsys_mask from the parent.  Each is configured manually. +	 */ +	if (!cgroup_on_dfl(cgrp)) +		cgrp->child_subsys_mask = parent->child_subsys_mask; -	mutex_unlock(&cgroup_mutex); -	mutex_unlock(&cgroup_tree_mutex); +	kernfs_activate(kn); -	return 0; +	ret = 0; +	goto out_unlock; -err_free_id: -	idr_remove(&root->cgroup_idr, cgrp->id); -err_unlock: -	mutex_unlock(&cgroup_mutex); -err_unlock_tree: -	mutex_unlock(&cgroup_tree_mutex); +out_free_id: +	cgroup_idr_remove(&root->cgroup_idr, cgrp->id); +out_cancel_ref: +	percpu_ref_cancel_init(&cgrp->self.refcnt); +out_free_cgrp:  	kfree(cgrp); -	return err; +out_unlock: +	cgroup_kn_unlock(parent_kn); +	return ret; -err_destroy: +out_destroy:  	cgroup_destroy_locked(cgrp); -	mutex_unlock(&cgroup_mutex); -	mutex_unlock(&cgroup_tree_mutex); -	return err; -} - -static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, -			umode_t mode) -{ -	struct cgroup *parent = parent_kn->priv; -	int ret; - -	/* -	 * cgroup_create() grabs cgroup_tree_mutex which nests outside -	 * kernfs active_ref and cgroup_create() already synchronizes -	 * properly against removal through cgroup_lock_live_group(). -	 * Break it before calling cgroup_create(). 
-	 */ -	cgroup_get(parent); -	kernfs_break_active_protection(parent_kn); - -	ret = cgroup_create(parent, name, mode); - -	kernfs_unbreak_active_protection(parent_kn); -	cgroup_put(parent); -	return ret; +	goto out_unlock;  }  /*   * This is called when the refcnt of a css is confirmed to be killed. - * css_tryget() is now guaranteed to fail. + * css_tryget_online() is now guaranteed to fail.  Tell the subsystem to + * initate destruction and put the css ref from kill_css().   */  static void css_killed_work_fn(struct work_struct *work)  {  	struct cgroup_subsys_state *css =  		container_of(work, struct cgroup_subsys_state, destroy_work); -	struct cgroup *cgrp = css->cgroup; -	mutex_lock(&cgroup_tree_mutex);  	mutex_lock(&cgroup_mutex); - -	/* -	 * css_tryget() is guaranteed to fail now.  Tell subsystems to -	 * initate destruction. -	 */  	offline_css(css); - -	/* -	 * If @cgrp is marked dead, it's waiting for refs of all css's to -	 * be disabled before proceeding to the second phase of cgroup -	 * destruction.  If we are the last one, kick it off. -	 */ -	if (!cgrp->nr_css && cgroup_is_dead(cgrp)) -		cgroup_destroy_css_killed(cgrp); -  	mutex_unlock(&cgroup_mutex); -	mutex_unlock(&cgroup_tree_mutex); -	/* -	 * Put the css refs from kill_css().  Each css holds an extra -	 * reference to the cgroup's dentry and cgroup removal proceeds -	 * regardless of css refs.  On the last put of each css, whenever -	 * that may be, the extra dentry ref is put so that dentry -	 * destruction happens only after all css's are released. -	 */  	css_put(css);  } @@ -3886,9 +4480,18 @@ static void css_killed_ref_fn(struct percpu_ref *ref)  	queue_work(cgroup_destroy_wq, &css->destroy_work);  } -static void __kill_css(struct cgroup_subsys_state *css) +/** + * kill_css - destroy a css + * @css: css to destroy + * + * This function initiates destruction of @css by removing cgroup interface + * files and putting its base reference.  ->css_offline() will be invoked + * asynchronously once css_tryget_online() is guaranteed to fail and when + * the reference count reaches zero, @css will be released. + */ +static void kill_css(struct cgroup_subsys_state *css)  { -	lockdep_assert_held(&cgroup_tree_mutex); +	lockdep_assert_held(&cgroup_mutex);  	/*  	 * This must happen before css is disassociated with its cgroup. @@ -3905,7 +4508,7 @@ static void __kill_css(struct cgroup_subsys_state *css)  	/*  	 * cgroup core guarantees that, by the time ->css_offline() is  	 * invoked, no new css reference will be given out via -	 * css_tryget().  We can't simply call percpu_ref_kill() and +	 * css_tryget_online().  We can't simply call percpu_ref_kill() and  	 * proceed to offlining css's because percpu_ref_kill() doesn't  	 * guarantee that the ref is seen as killed on all CPUs on return.  	 * @@ -3916,36 +4519,14 @@ static void __kill_css(struct cgroup_subsys_state *css)  }  /** - * kill_css - destroy a css - * @css: css to destroy - * - * This function initiates destruction of @css by removing cgroup interface - * files and putting its base reference.  ->css_offline() will be invoked - * asynchronously once css_tryget() is guaranteed to fail and when the - * reference count reaches zero, @css will be released. 
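
The confirm-kill step that kill_css() relies on is a general percpu_ref pattern: percpu_ref_kill_and_confirm() switches the ref out of percpu mode and invokes a callback only once the killed state is visible on all CPUs, which is the point where offlining may safely begin. A rough sketch of that shape; my_obj, my_obj_confirm_kill() and the plain schedule_work() are placeholders, cgroup uses css_killed_ref_fn() and its own cgroup_destroy_wq:

#include <linux/percpu-refcount.h>
#include <linux/workqueue.h>

struct my_obj {
	struct percpu_ref	ref;
	struct work_struct	offline_work;
};

static void my_obj_offline_fn(struct work_struct *work)
{
	/* new tryget attempts can no longer succeed; tear down in
	 * process context, then drop the base reference */
}

/* runs once the killed state is visible on all CPUs */
static void my_obj_confirm_kill(struct percpu_ref *ref)
{
	struct my_obj *obj = container_of(ref, struct my_obj, ref);

	INIT_WORK(&obj->offline_work, my_obj_offline_fn);
	schedule_work(&obj->offline_work);
}

static void my_obj_kill(struct my_obj *obj)
{
	/* existing references stay valid until the last put */
	percpu_ref_kill_and_confirm(&obj->ref, my_obj_confirm_kill);
}
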
- */ -static void kill_css(struct cgroup_subsys_state *css) -{ -	struct cgroup *cgrp = css->cgroup; - -	lockdep_assert_held(&cgroup_tree_mutex); - -	/* if already killed, noop */ -	if (cgrp->subsys_mask & (1 << css->ss->id)) { -		cgrp->subsys_mask &= ~(1 << css->ss->id); -		__kill_css(css); -	} -} - -/**   * cgroup_destroy_locked - the first stage of cgroup destruction   * @cgrp: cgroup to be destroyed   *   * css's make use of percpu refcnts whose killing latency shouldn't be   * exposed to userland and are RCU protected.  Also, cgroup core needs to - * guarantee that css_tryget() won't succeed by the time ->css_offline() is - * invoked.  To satisfy all the requirements, destruction is implemented in - * the following two steps. + * guarantee that css_tryget_online() won't succeed by the time + * ->css_offline() is invoked.  To satisfy all the requirements, + * destruction is implemented in the following two steps.   *   * s1. Verify @cgrp can be destroyed and mark it dying.  Remove all   *     userland visible parts and start killing the percpu refcnts of @@ -3964,12 +4545,10 @@ static void kill_css(struct cgroup_subsys_state *css)  static int cgroup_destroy_locked(struct cgroup *cgrp)  	__releases(&cgroup_mutex) __acquires(&cgroup_mutex)  { -	struct cgroup *child;  	struct cgroup_subsys_state *css;  	bool empty;  	int ssid; -	lockdep_assert_held(&cgroup_tree_mutex);  	lockdep_assert_held(&cgroup_mutex);  	/* @@ -3983,127 +4562,68 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)  		return -EBUSY;  	/* -	 * Make sure there's no live children.  We can't test ->children -	 * emptiness as dead children linger on it while being destroyed; -	 * otherwise, "rmdir parent/child parent" may fail with -EBUSY. +	 * Make sure there's no live children.  We can't test emptiness of +	 * ->self.children as dead children linger on it while being +	 * drained; otherwise, "rmdir parent/child parent" may fail.  	 */ -	empty = true; -	rcu_read_lock(); -	list_for_each_entry_rcu(child, &cgrp->children, sibling) { -		empty = cgroup_is_dead(child); -		if (!empty) -			break; -	} -	rcu_read_unlock(); -	if (!empty) +	if (css_has_online_children(&cgrp->self))  		return -EBUSY;  	/*  	 * Mark @cgrp dead.  This prevents further task migration and child -	 * creation by disabling cgroup_lock_live_group().  Note that -	 * CGRP_DEAD assertion is depended upon by css_next_child() to -	 * resume iteration after dropping RCU read lock.  See -	 * css_next_child() for details. +	 * creation by disabling cgroup_lock_live_group().  	 */ -	set_bit(CGRP_DEAD, &cgrp->flags); +	cgrp->self.flags &= ~CSS_ONLINE; -	/* -	 * Initiate massacre of all css's.  cgroup_destroy_css_killed() -	 * will be invoked to perform the rest of destruction once the -	 * percpu refs of all css's are confirmed to be killed.  This -	 * involves removing the subsystem's files, drop cgroup_mutex. -	 */ -	mutex_unlock(&cgroup_mutex); +	/* initiate massacre of all css's */  	for_each_css(css, ssid, cgrp)  		kill_css(css); -	mutex_lock(&cgroup_mutex); -	/* CGRP_DEAD is set, remove from ->release_list for the last time */ +	/* CSS_ONLINE is clear, remove from ->release_list for the last time */  	raw_spin_lock(&release_list_lock);  	if (!list_empty(&cgrp->release_list))  		list_del_init(&cgrp->release_list);  	raw_spin_unlock(&release_list_lock);  	/* -	 * If @cgrp has css's attached, the second stage of cgroup -	 * destruction is kicked off from css_killed_work_fn() after the -	 * refs of all attached css's are killed.  
If @cgrp doesn't have -	 * any css, we kick it off here. +	 * Remove @cgrp directory along with the base files.  @cgrp has an +	 * extra ref on its kn.  	 */ -	if (!cgrp->nr_css) -		cgroup_destroy_css_killed(cgrp); - -	/* remove @cgrp directory along with the base files */ -	mutex_unlock(&cgroup_mutex); +	kernfs_remove(cgrp->kn); -	/* -	 * There are two control paths which try to determine cgroup from -	 * dentry without going through kernfs - cgroupstats_build() and -	 * css_tryget_from_dir().  Those are supported by RCU protecting -	 * clearing of cgrp->kn->priv backpointer, which should happen -	 * after all files under it have been removed. -	 */ -	kernfs_remove(cgrp->kn);	/* @cgrp has an extra ref on its kn */ -	RCU_INIT_POINTER(*(void __rcu __force **)&cgrp->kn->priv, NULL); +	set_bit(CGRP_RELEASABLE, &cgroup_parent(cgrp)->flags); +	check_for_release(cgroup_parent(cgrp)); -	mutex_lock(&cgroup_mutex); +	/* put the base reference */ +	percpu_ref_kill(&cgrp->self.refcnt);  	return 0;  }; -/** - * cgroup_destroy_css_killed - the second step of cgroup destruction - * @work: cgroup->destroy_free_work - * - * This function is invoked from a work item for a cgroup which is being - * destroyed after all css's are offlined and performs the rest of - * destruction.  This is the second step of destruction described in the - * comment above cgroup_destroy_locked(). - */ -static void cgroup_destroy_css_killed(struct cgroup *cgrp) -{ -	struct cgroup *parent = cgrp->parent; - -	lockdep_assert_held(&cgroup_tree_mutex); -	lockdep_assert_held(&cgroup_mutex); - -	/* delete this cgroup from parent->children */ -	list_del_rcu(&cgrp->sibling); - -	cgroup_put(cgrp); - -	set_bit(CGRP_RELEASABLE, &parent->flags); -	check_for_release(parent); -} -  static int cgroup_rmdir(struct kernfs_node *kn)  { -	struct cgroup *cgrp = kn->priv; +	struct cgroup *cgrp;  	int ret = 0; -	/* -	 * This is self-destruction but @kn can't be removed while this -	 * callback is in progress.  Let's break active protection.  Once -	 * the protection is broken, @cgrp can be destroyed at any point. -	 * Pin it so that it stays accessible. -	 */ -	cgroup_get(cgrp); -	kernfs_break_active_protection(kn); +	cgrp = cgroup_kn_lock_live(kn); +	if (!cgrp) +		return 0; +	cgroup_get(cgrp);	/* for @kn->priv clearing */ -	mutex_lock(&cgroup_tree_mutex); -	mutex_lock(&cgroup_mutex); +	ret = cgroup_destroy_locked(cgrp); + +	cgroup_kn_unlock(kn);  	/* -	 * @cgrp might already have been destroyed while we're trying to -	 * grab the mutexes. +	 * There are two control paths which try to determine cgroup from +	 * dentry without going through kernfs - cgroupstats_build() and +	 * css_tryget_online_from_dir().  Those are supported by RCU +	 * protecting clearing of cgrp->kn->priv backpointer, which should +	 * happen after all files under it have been removed.  	 
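
Both halves of that backpointer protocol appear in this patch: the writer clears kn->priv under RCU after kernfs_remove(), and readers such as css_tryget_online_from_dir() dereference it inside rcu_read_lock() and then try to pin the object. Reduced to a skeleton with placeholder my_obj names, the pairing looks roughly like this:

#include <linux/kernfs.h>
#include <linux/rcupdate.h>
#include <linux/types.h>

struct my_obj;					/* placeholder object type */
bool my_obj_tryget(struct my_obj *obj);		/* placeholder ref grab */

/* writer side: runs after the interface files are removed */
static void my_obj_unpublish(struct kernfs_node *kn)
{
	kernfs_remove(kn);			/* no further opens via kernfs */
	RCU_INIT_POINTER(*(void __rcu __force **)&kn->priv, NULL);
}

/* reader side: resolve the backpointer and pin the object, or fail */
static struct my_obj *my_obj_from_kn(struct kernfs_node *kn)
{
	struct my_obj *obj;

	rcu_read_lock();
	obj = rcu_dereference(kn->priv);
	if (obj && !my_obj_tryget(obj))
		obj = NULL;
	rcu_read_unlock();

	return obj;				/* NULL if already torn down */
}
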
*/ -	if (!cgroup_is_dead(cgrp)) -		ret = cgroup_destroy_locked(cgrp); - -	mutex_unlock(&cgroup_mutex); -	mutex_unlock(&cgroup_tree_mutex); +	if (!ret) +		RCU_INIT_POINTER(*(void __rcu __force **)&kn->priv, NULL); -	kernfs_unbreak_active_protection(kn);  	cgroup_put(cgrp);  	return ret;  } @@ -4116,15 +4636,15 @@ static struct kernfs_syscall_ops cgroup_kf_syscall_ops = {  	.rename			= cgroup_rename,  }; -static void __init cgroup_init_subsys(struct cgroup_subsys *ss) +static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early)  {  	struct cgroup_subsys_state *css;  	printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name); -	mutex_lock(&cgroup_tree_mutex);  	mutex_lock(&cgroup_mutex); +	idr_init(&ss->css_idr);  	INIT_LIST_HEAD(&ss->cfts);  	/* Create the root cgroup state for this subsystem */ @@ -4132,7 +4652,21 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)  	css = ss->css_alloc(cgroup_css(&cgrp_dfl_root.cgrp, ss));  	/* We don't handle early failures gracefully */  	BUG_ON(IS_ERR(css)); -	init_css(css, ss, &cgrp_dfl_root.cgrp); +	init_and_link_css(css, ss, &cgrp_dfl_root.cgrp); + +	/* +	 * Root csses are never destroyed and we can't initialize +	 * percpu_ref during early init.  Disable refcnting. +	 */ +	css->flags |= CSS_NO_REF; + +	if (early) { +		/* allocation can't be done safely during early init */ +		css->id = 1; +	} else { +		css->id = cgroup_idr_alloc(&ss->css_idr, css, 1, 2, GFP_KERNEL); +		BUG_ON(css->id < 0); +	}  	/* Update the init_css_set to contain a subsys  	 * pointer to this state - since the subsystem is @@ -4149,10 +4683,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)  	BUG_ON(online_css(css)); -	cgrp_dfl_root.cgrp.subsys_mask |= 1 << ss->id; -  	mutex_unlock(&cgroup_mutex); -	mutex_unlock(&cgroup_tree_mutex);  }  /** @@ -4169,6 +4700,8 @@ int __init cgroup_init_early(void)  	int i;  	init_cgroup_root(&cgrp_dfl_root, &opts); +	cgrp_dfl_root.cgrp.self.flags |= CSS_NO_REF; +  	RCU_INIT_POINTER(init_task.cgroups, &init_css_set);  	for_each_subsys(ss, i) { @@ -4183,7 +4716,7 @@ int __init cgroup_init_early(void)  		ss->name = cgroup_subsys_name[i];  		if (ss->early_init) -			cgroup_init_subsys(ss); +			cgroup_init_subsys(ss, true);  	}  	return 0;  } @@ -4202,7 +4735,6 @@ int __init cgroup_init(void)  	BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files)); -	mutex_lock(&cgroup_tree_mutex);  	mutex_lock(&cgroup_mutex);  	/* Add init_css_set to the hash table */ @@ -4212,18 +4744,31 @@ int __init cgroup_init(void)  	BUG_ON(cgroup_setup_root(&cgrp_dfl_root, 0));  	mutex_unlock(&cgroup_mutex); -	mutex_unlock(&cgroup_tree_mutex);  	for_each_subsys(ss, ssid) { -		if (!ss->early_init) -			cgroup_init_subsys(ss); +		if (ss->early_init) { +			struct cgroup_subsys_state *css = +				init_css_set.subsys[ss->id]; + +			css->id = cgroup_idr_alloc(&ss->css_idr, css, 1, 2, +						   GFP_KERNEL); +			BUG_ON(css->id < 0); +		} else { +			cgroup_init_subsys(ss, false); +		} + +		list_add_tail(&init_css_set.e_cset_node[ssid], +			      &cgrp_dfl_root.cgrp.e_csets[ssid]);  		/* -		 * cftype registration needs kmalloc and can't be done -		 * during early_init.  Register base cftypes separately. +		 * Setting dfl_root subsys_mask needs to consider the +		 * disabled flag and cftype registration needs kmalloc, +		 * both of which aren't available during early_init.  		 
*/ -		if (ss->base_cftypes) +		if (!ss->disabled) { +			cgrp_dfl_root.subsys_mask |= 1 << ss->id;  			WARN_ON(cgroup_add_cftypes(ss, ss->base_cftypes)); +		}  	}  	cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj); @@ -4306,7 +4851,7 @@ int proc_cgroup_show(struct seq_file *m, void *v)  		seq_printf(m, "%d:", root->hierarchy_id);  		for_each_subsys(ss, ssid) -			if (root->cgrp.subsys_mask & (1 << ssid)) +			if (root->subsys_mask & (1 << ssid))  				seq_printf(m, "%s%s", count++ ? "," : "", ss->name);  		if (strlen(root->name))  			seq_printf(m, "%sname=%s", count ? "," : "", @@ -4501,8 +5046,8 @@ void cgroup_exit(struct task_struct *tsk)  static void check_for_release(struct cgroup *cgrp)  { -	if (cgroup_is_releasable(cgrp) && -	    list_empty(&cgrp->cset_links) && list_empty(&cgrp->children)) { +	if (cgroup_is_releasable(cgrp) && list_empty(&cgrp->cset_links) && +	    !css_has_online_children(&cgrp->self)) {  		/*  		 * Control Group is currently removeable. If it's not  		 * already queued for a userspace notification, queue @@ -4619,7 +5164,7 @@ static int __init cgroup_disable(char *str)  __setup("cgroup_disable=", cgroup_disable);  /** - * css_tryget_from_dir - get corresponding css from the dentry of a cgroup dir + * css_tryget_online_from_dir - get corresponding css from a cgroup dentry   * @dentry: directory dentry of interest   * @ss: subsystem of interest   * @@ -4627,8 +5172,8 @@ __setup("cgroup_disable=", cgroup_disable);   * to get the corresponding css and return it.  If such css doesn't exist   * or can't be pinned, an ERR_PTR value is returned.   */ -struct cgroup_subsys_state *css_tryget_from_dir(struct dentry *dentry, -						struct cgroup_subsys *ss) +struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry, +						       struct cgroup_subsys *ss)  {  	struct kernfs_node *kn = kernfs_node_from_dentry(dentry);  	struct cgroup_subsys_state *css = NULL; @@ -4644,13 +5189,13 @@ struct cgroup_subsys_state *css_tryget_from_dir(struct dentry *dentry,  	/*  	 * This path doesn't originate from kernfs and @kn could already  	 * have been or be removed at any point.  @kn->priv is RCU -	 * protected for this access.  See destroy_locked() for details. +	 * protected for this access.  See cgroup_rmdir() for details.  	 */  	cgrp = rcu_dereference(kn->priv);  	if (cgrp)  		css = cgroup_css(cgrp, ss); -	if (!css || !css_tryget(css)) +	if (!css || !css_tryget_online(css))  		css = ERR_PTR(-ENOENT);  	rcu_read_unlock(); @@ -4667,14 +5212,8 @@ struct cgroup_subsys_state *css_tryget_from_dir(struct dentry *dentry,   */  struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss)  { -	struct cgroup *cgrp; - -	cgroup_assert_mutexes_or_rcu_locked(); - -	cgrp = idr_find(&ss->root->cgroup_idr, id); -	if (cgrp) -		return cgroup_css(cgrp, ss); -	return NULL; +	WARN_ON_ONCE(!rcu_read_lock_held()); +	return idr_find(&ss->css_idr, id);  }  #ifdef CONFIG_CGROUP_DEBUG diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index 2bc4a2256444..a79e40f9d700 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -21,6 +21,7 @@  #include <linux/uaccess.h>  #include <linux/freezer.h>  #include <linux/seq_file.h> +#include <linux/mutex.h>  /*   * A cgroup is freezing if any FREEZING flags are set.  
FREEZING_SELF is @@ -42,9 +43,10 @@ enum freezer_state_flags {  struct freezer {  	struct cgroup_subsys_state	css;  	unsigned int			state; -	spinlock_t			lock;  }; +static DEFINE_MUTEX(freezer_mutex); +  static inline struct freezer *css_freezer(struct cgroup_subsys_state *css)  {  	return css ? container_of(css, struct freezer, css) : NULL; @@ -57,7 +59,7 @@ static inline struct freezer *task_freezer(struct task_struct *task)  static struct freezer *parent_freezer(struct freezer *freezer)  { -	return css_freezer(css_parent(&freezer->css)); +	return css_freezer(freezer->css.parent);  }  bool cgroup_freezing(struct task_struct *task) @@ -71,10 +73,6 @@ bool cgroup_freezing(struct task_struct *task)  	return ret;  } -/* - * cgroups_write_string() limits the size of freezer state strings to - * CGROUP_LOCAL_BUFFER_SIZE - */  static const char *freezer_state_strs(unsigned int state)  {  	if (state & CGROUP_FROZEN) @@ -93,7 +91,6 @@ freezer_css_alloc(struct cgroup_subsys_state *parent_css)  	if (!freezer)  		return ERR_PTR(-ENOMEM); -	spin_lock_init(&freezer->lock);  	return &freezer->css;  } @@ -110,14 +107,7 @@ static int freezer_css_online(struct cgroup_subsys_state *css)  	struct freezer *freezer = css_freezer(css);  	struct freezer *parent = parent_freezer(freezer); -	/* -	 * The following double locking and freezing state inheritance -	 * guarantee that @cgroup can never escape ancestors' freezing -	 * states.  See css_for_each_descendant_pre() for details. -	 */ -	if (parent) -		spin_lock_irq(&parent->lock); -	spin_lock_nested(&freezer->lock, SINGLE_DEPTH_NESTING); +	mutex_lock(&freezer_mutex);  	freezer->state |= CGROUP_FREEZER_ONLINE; @@ -126,10 +116,7 @@ static int freezer_css_online(struct cgroup_subsys_state *css)  		atomic_inc(&system_freezing_cnt);  	} -	spin_unlock(&freezer->lock); -	if (parent) -		spin_unlock_irq(&parent->lock); - +	mutex_unlock(&freezer_mutex);  	return 0;  } @@ -144,14 +131,14 @@ static void freezer_css_offline(struct cgroup_subsys_state *css)  {  	struct freezer *freezer = css_freezer(css); -	spin_lock_irq(&freezer->lock); +	mutex_lock(&freezer_mutex);  	if (freezer->state & CGROUP_FREEZING)  		atomic_dec(&system_freezing_cnt);  	freezer->state = 0; -	spin_unlock_irq(&freezer->lock); +	mutex_unlock(&freezer_mutex);  }  static void freezer_css_free(struct cgroup_subsys_state *css) @@ -175,7 +162,7 @@ static void freezer_attach(struct cgroup_subsys_state *new_css,  	struct task_struct *task;  	bool clear_frozen = false; -	spin_lock_irq(&freezer->lock); +	mutex_lock(&freezer_mutex);  	/*  	 * Make the new tasks conform to the current state of @new_css. @@ -197,21 +184,13 @@ static void freezer_attach(struct cgroup_subsys_state *new_css,  		}  	} -	spin_unlock_irq(&freezer->lock); - -	/* -	 * Propagate FROZEN clearing upwards.  We may race with -	 * update_if_frozen(), but as long as both work bottom-up, either -	 * update_if_frozen() sees child's FROZEN cleared or we clear the -	 * parent's FROZEN later.  No parent w/ !FROZEN children can be -	 * left FROZEN. 
-	 */ +	/* propagate FROZEN clearing upwards */  	while (clear_frozen && (freezer = parent_freezer(freezer))) { -		spin_lock_irq(&freezer->lock);  		freezer->state &= ~CGROUP_FROZEN;  		clear_frozen = freezer->state & CGROUP_FREEZING; -		spin_unlock_irq(&freezer->lock);  	} + +	mutex_unlock(&freezer_mutex);  }  /** @@ -228,9 +207,6 @@ static void freezer_fork(struct task_struct *task)  {  	struct freezer *freezer; -	rcu_read_lock(); -	freezer = task_freezer(task); -  	/*  	 * The root cgroup is non-freezable, so we can skip locking the  	 * freezer.  This is safe regardless of race with task migration. @@ -238,24 +214,18 @@ static void freezer_fork(struct task_struct *task)  	 * to do.  If we lost and root is the new cgroup, noop is still the  	 * right thing to do.  	 */ -	if (!parent_freezer(freezer)) -		goto out; +	if (task_css_is_root(task, freezer_cgrp_id)) +		return; -	/* -	 * Grab @freezer->lock and freeze @task after verifying @task still -	 * belongs to @freezer and it's freezing.  The former is for the -	 * case where we have raced against task migration and lost and -	 * @task is already in a different cgroup which may not be frozen. -	 * This isn't strictly necessary as freeze_task() is allowed to be -	 * called spuriously but let's do it anyway for, if nothing else, -	 * documentation. -	 */ -	spin_lock_irq(&freezer->lock); -	if (freezer == task_freezer(task) && (freezer->state & CGROUP_FREEZING)) +	mutex_lock(&freezer_mutex); +	rcu_read_lock(); + +	freezer = task_freezer(task); +	if (freezer->state & CGROUP_FREEZING)  		freeze_task(task); -	spin_unlock_irq(&freezer->lock); -out: +  	rcu_read_unlock(); +	mutex_unlock(&freezer_mutex);  }  /** @@ -281,22 +251,24 @@ static void update_if_frozen(struct cgroup_subsys_state *css)  	struct css_task_iter it;  	struct task_struct *task; -	WARN_ON_ONCE(!rcu_read_lock_held()); - -	spin_lock_irq(&freezer->lock); +	lockdep_assert_held(&freezer_mutex);  	if (!(freezer->state & CGROUP_FREEZING) ||  	    (freezer->state & CGROUP_FROZEN)) -		goto out_unlock; +		return;  	/* are all (live) children frozen? */ +	rcu_read_lock();  	css_for_each_child(pos, css) {  		struct freezer *child = css_freezer(pos);  		if ((child->state & CGROUP_FREEZER_ONLINE) && -		    !(child->state & CGROUP_FROZEN)) -			goto out_unlock; +		    !(child->state & CGROUP_FROZEN)) { +			rcu_read_unlock(); +			return; +		}  	} +	rcu_read_unlock();  	/* are all tasks frozen? 
*/  	css_task_iter_start(css, &it); @@ -317,21 +289,29 @@ static void update_if_frozen(struct cgroup_subsys_state *css)  	freezer->state |= CGROUP_FROZEN;  out_iter_end:  	css_task_iter_end(&it); -out_unlock: -	spin_unlock_irq(&freezer->lock);  }  static int freezer_read(struct seq_file *m, void *v)  {  	struct cgroup_subsys_state *css = seq_css(m), *pos; +	mutex_lock(&freezer_mutex);  	rcu_read_lock();  	/* update states bottom-up */ -	css_for_each_descendant_post(pos, css) +	css_for_each_descendant_post(pos, css) { +		if (!css_tryget_online(pos)) +			continue; +		rcu_read_unlock(); +  		update_if_frozen(pos); +		rcu_read_lock(); +		css_put(pos); +	} +  	rcu_read_unlock(); +	mutex_unlock(&freezer_mutex);  	seq_puts(m, freezer_state_strs(css_freezer(css)->state));  	seq_putc(m, '\n'); @@ -373,7 +353,7 @@ static void freezer_apply_state(struct freezer *freezer, bool freeze,  				unsigned int state)  {  	/* also synchronizes against task migration, see freezer_attach() */ -	lockdep_assert_held(&freezer->lock); +	lockdep_assert_held(&freezer_mutex);  	if (!(freezer->state & CGROUP_FREEZER_ONLINE))  		return; @@ -414,47 +394,47 @@ static void freezer_change_state(struct freezer *freezer, bool freeze)  	 * descendant will try to inherit its parent's FREEZING state as  	 * CGROUP_FREEZING_PARENT.  	 */ +	mutex_lock(&freezer_mutex);  	rcu_read_lock();  	css_for_each_descendant_pre(pos, &freezer->css) {  		struct freezer *pos_f = css_freezer(pos);  		struct freezer *parent = parent_freezer(pos_f); -		spin_lock_irq(&pos_f->lock); +		if (!css_tryget_online(pos)) +			continue; +		rcu_read_unlock(); -		if (pos_f == freezer) { +		if (pos_f == freezer)  			freezer_apply_state(pos_f, freeze,  					    CGROUP_FREEZING_SELF); -		} else { -			/* -			 * Our update to @parent->state is already visible -			 * which is all we need.  No need to lock @parent. -			 * For more info on synchronization, see -			 * freezer_post_create(). 
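The freezer conversion above drops the per-cgroup spinlock in favour of a single freezer_mutex and preserves two invariants: FREEZING is inherited top-down when a state change is applied, and FROZEN is cleared bottom-up when a cgroup gains a task that is not frozen. A minimal userspace model of those two walks (plain parent/child pointers standing in for the real css tree, all names invented):

#include <stdbool.h>
#include <stdio.h>

#define FREEZING 0x1
#define FROZEN   0x2

struct node {
	struct node *parent;
	struct node *child;	/* first child */
	struct node *sibling;	/* next sibling */
	unsigned int state;
};

/* Pre-order walk: every descendant inherits FREEZING from its parent. */
static void apply_freezing(struct node *n, bool freeze)
{
	if (freeze)
		n->state |= FREEZING;
	else
		n->state &= ~(FREEZING | FROZEN);

	for (struct node *c = n->child; c; c = c->sibling)
		apply_freezing(c, n->state & FREEZING);
}

/* Bottom-up walk: an unfrozen task means no ancestor may stay FROZEN. */
static void clear_frozen_upwards(struct node *n)
{
	for (; n; n = n->parent)
		n->state &= ~FROZEN;
}

int main(void)
{
	struct node root = { .state = 0 };
	struct node child = { .parent = &root };

	root.child = &child;
	apply_freezing(&root, true);
	printf("child freezing: %d\n", !!(child.state & FREEZING));	/* 1 */

	root.state |= FROZEN;
	child.state |= FROZEN;
	clear_frozen_upwards(&child);
	printf("root frozen: %d\n", !!(root.state & FROZEN));		/* 0 */
	return 0;
}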
-			 */ +		else  			freezer_apply_state(pos_f,  					    parent->state & CGROUP_FREEZING,  					    CGROUP_FREEZING_PARENT); -		} -		spin_unlock_irq(&pos_f->lock); +		rcu_read_lock(); +		css_put(pos);  	}  	rcu_read_unlock(); +	mutex_unlock(&freezer_mutex);  } -static int freezer_write(struct cgroup_subsys_state *css, struct cftype *cft, -			 char *buffer) +static ssize_t freezer_write(struct kernfs_open_file *of, +			     char *buf, size_t nbytes, loff_t off)  {  	bool freeze; -	if (strcmp(buffer, freezer_state_strs(0)) == 0) +	buf = strstrip(buf); + +	if (strcmp(buf, freezer_state_strs(0)) == 0)  		freeze = false; -	else if (strcmp(buffer, freezer_state_strs(CGROUP_FROZEN)) == 0) +	else if (strcmp(buf, freezer_state_strs(CGROUP_FROZEN)) == 0)  		freeze = true;  	else  		return -EINVAL; -	freezer_change_state(css_freezer(css), freeze); -	return 0; +	freezer_change_state(css_freezer(of_css(of)), freeze); +	return nbytes;  }  static u64 freezer_self_freezing_read(struct cgroup_subsys_state *css, @@ -478,7 +458,7 @@ static struct cftype files[] = {  		.name = "state",  		.flags = CFTYPE_NOT_ON_ROOT,  		.seq_show = freezer_read, -		.write_string = freezer_write, +		.write = freezer_write,  	},  	{  		.name = "self_freezing", diff --git a/kernel/compat.c b/kernel/compat.c index e40b0430b562..633394f442f8 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -157,7 +157,7 @@ static int __compat_put_timespec(const struct timespec *ts, struct compat_timesp  int compat_get_timeval(struct timeval *tv, const void __user *utv)  {  	if (COMPAT_USE_64BIT_TIME) -		return copy_from_user(tv, utv, sizeof *tv) ? -EFAULT : 0; +		return copy_from_user(tv, utv, sizeof(*tv)) ? -EFAULT : 0;  	else  		return __compat_get_timeval(tv, utv);  } @@ -166,7 +166,7 @@ EXPORT_SYMBOL_GPL(compat_get_timeval);  int compat_put_timeval(const struct timeval *tv, void __user *utv)  {  	if (COMPAT_USE_64BIT_TIME) -		return copy_to_user(utv, tv, sizeof *tv) ? -EFAULT : 0; +		return copy_to_user(utv, tv, sizeof(*tv)) ? -EFAULT : 0;  	else  		return __compat_put_timeval(tv, utv);  } @@ -175,7 +175,7 @@ EXPORT_SYMBOL_GPL(compat_put_timeval);  int compat_get_timespec(struct timespec *ts, const void __user *uts)  {  	if (COMPAT_USE_64BIT_TIME) -		return copy_from_user(ts, uts, sizeof *ts) ? -EFAULT : 0; +		return copy_from_user(ts, uts, sizeof(*ts)) ? -EFAULT : 0;  	else  		return __compat_get_timespec(ts, uts);  } @@ -184,7 +184,7 @@ EXPORT_SYMBOL_GPL(compat_get_timespec);  int compat_put_timespec(const struct timespec *ts, void __user *uts)  {  	if (COMPAT_USE_64BIT_TIME) -		return copy_to_user(uts, ts, sizeof *ts) ? -EFAULT : 0; +		return copy_to_user(uts, ts, sizeof(*ts)) ? -EFAULT : 0;  	else  		return __compat_put_timespec(ts, uts);  } diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c index 6cb20d2e7ee0..019d45008448 100644 --- a/kernel/context_tracking.c +++ b/kernel/context_tracking.c @@ -120,7 +120,7 @@ void context_tracking_user_enter(void)   * instead of preempt_schedule() to exit user context if needed before   * calling the scheduler.   
*/ -asmlinkage void __sched notrace preempt_schedule_context(void) +asmlinkage __visible void __sched notrace preempt_schedule_context(void)  {  	enum ctx_state prev_ctx; diff --git a/kernel/cpu.c b/kernel/cpu.c index a9e710eef0e2..a343bde710b1 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -20,6 +20,7 @@  #include <linux/gfp.h>  #include <linux/suspend.h>  #include <linux/lockdep.h> +#include <trace/events/power.h>  #include "smpboot.h" @@ -283,8 +284,7 @@ static inline void check_for_tasks(int cpu)  		task_cputime(p, &utime, &stime);  		if (task_cpu(p) == cpu && p->state == TASK_RUNNING &&  		    (utime || stime)) -			printk(KERN_WARNING "Task %s (pid = %d) is on cpu %d " -				"(state = %ld, flags = %x)\n", +			pr_warn("Task %s (pid = %d) is on cpu %d (state = %ld, flags = %x)\n",  				p->comm, task_pid_nr(p), cpu,  				p->state, p->flags);  	} @@ -336,8 +336,8 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)  	if (err) {  		nr_calls--;  		__cpu_notify(CPU_DOWN_FAILED | mod, hcpu, nr_calls, NULL); -		printk("%s: attempt to take down CPU %u failed\n", -				__func__, cpu); +		pr_warn("%s: attempt to take down CPU %u failed\n", +			__func__, cpu);  		goto out_release;  	} @@ -444,8 +444,8 @@ static int _cpu_up(unsigned int cpu, int tasks_frozen)  	ret = __cpu_notify(CPU_UP_PREPARE | mod, hcpu, -1, &nr_calls);  	if (ret) {  		nr_calls--; -		printk(KERN_WARNING "%s: attempt to bring up CPU %u failed\n", -				__func__, cpu); +		pr_warn("%s: attempt to bring up CPU %u failed\n", +			__func__, cpu);  		goto out_notify;  	} @@ -475,11 +475,10 @@ int cpu_up(unsigned int cpu)  	int err = 0;  	if (!cpu_possible(cpu)) { -		printk(KERN_ERR "can't online cpu %d because it is not " -			"configured as may-hotadd at boot time\n", cpu); +		pr_err("can't online cpu %d because it is not configured as may-hotadd at boot time\n", +		       cpu);  #if defined(CONFIG_IA64) -		printk(KERN_ERR "please check additional_cpus= boot " -				"parameter\n"); +		pr_err("please check additional_cpus= boot parameter\n");  #endif  		return -EINVAL;  	} @@ -518,16 +517,17 @@ int disable_nonboot_cpus(void)  	 */  	cpumask_clear(frozen_cpus); -	printk("Disabling non-boot CPUs ...\n"); +	pr_info("Disabling non-boot CPUs ...\n");  	for_each_online_cpu(cpu) {  		if (cpu == first_cpu)  			continue; +		trace_suspend_resume(TPS("CPU_OFF"), cpu, true);  		error = _cpu_down(cpu, 1); +		trace_suspend_resume(TPS("CPU_OFF"), cpu, false);  		if (!error)  			cpumask_set_cpu(cpu, frozen_cpus);  		else { -			printk(KERN_ERR "Error taking CPU%d down: %d\n", -				cpu, error); +			pr_err("Error taking CPU%d down: %d\n", cpu, error);  			break;  		}  	} @@ -537,7 +537,7 @@ int disable_nonboot_cpus(void)  		/* Make sure the CPUs won't be enabled by someone else */  		cpu_hotplug_disabled = 1;  	} else { -		printk(KERN_ERR "Non-boot CPUs are not disabled\n"); +		pr_err("Non-boot CPUs are not disabled\n");  	}  	cpu_maps_update_done();  	return error; @@ -561,17 +561,19 @@ void __ref enable_nonboot_cpus(void)  	if (cpumask_empty(frozen_cpus))  		goto out; -	printk(KERN_INFO "Enabling non-boot CPUs ...\n"); +	pr_info("Enabling non-boot CPUs ...\n");  	arch_enable_nonboot_cpus_begin();  	for_each_cpu(cpu, frozen_cpus) { +		trace_suspend_resume(TPS("CPU_ON"), cpu, true);  		error = _cpu_up(cpu, 1); +		trace_suspend_resume(TPS("CPU_ON"), cpu, false);  		if (!error) { -			printk(KERN_INFO "CPU%d is up\n", cpu); +			pr_info("CPU%d is up\n", cpu);  			continue;  		} -		printk(KERN_WARNING "Error taking CPU%d up: %d\n", cpu, error); +		
pr_warn("Error taking CPU%d up: %d\n", cpu, error);  	}  	arch_enable_nonboot_cpus_end(); @@ -726,10 +728,12 @@ void set_cpu_present(unsigned int cpu, bool present)  void set_cpu_online(unsigned int cpu, bool online)  { -	if (online) +	if (online) {  		cpumask_set_cpu(cpu, to_cpumask(cpu_online_bits)); -	else +		cpumask_set_cpu(cpu, to_cpumask(cpu_active_bits)); +	} else {  		cpumask_clear_cpu(cpu, to_cpumask(cpu_online_bits)); +	}  }  void set_cpu_active(unsigned int cpu, bool active) diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 3d54c418bd06..f6b33c696224 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -61,12 +61,7 @@  #include <linux/cgroup.h>  #include <linux/wait.h> -/* - * Tracks how many cpusets are currently defined in system. - * When there is only one cpuset (the root cpuset) we can - * short circuit some hooks. - */ -int number_of_cpusets __read_mostly; +struct static_key cpusets_enabled_key __read_mostly = STATIC_KEY_INIT_FALSE;  /* See "Frequency meter" comments, below. */ @@ -124,7 +119,7 @@ static inline struct cpuset *task_cs(struct task_struct *task)  static inline struct cpuset *parent_cs(struct cpuset *cs)  { -	return css_cs(css_parent(&cs->css)); +	return css_cs(cs->css.parent);  }  #ifdef CONFIG_NUMA @@ -611,7 +606,7 @@ static int generate_sched_domains(cpumask_var_t **domains,  		goto done;  	} -	csa = kmalloc(number_of_cpusets * sizeof(cp), GFP_KERNEL); +	csa = kmalloc(nr_cpusets() * sizeof(cp), GFP_KERNEL);  	if (!csa)  		goto done;  	csn = 0; @@ -696,11 +691,8 @@ restart:  		if (nslot == ndoms) {  			static int warnings = 10;  			if (warnings) { -				printk(KERN_WARNING -				 "rebuild_sched_domains confused:" -				  " nslot %d, ndoms %d, csn %d, i %d," -				  " apn %d\n", -				  nslot, ndoms, csn, i, apn); +				pr_warn("rebuild_sched_domains confused: nslot %d, ndoms %d, csn %d, i %d, apn %d\n", +					nslot, ndoms, csn, i, apn);  				warnings--;  			}  			continue; @@ -875,7 +867,7 @@ static void update_tasks_cpumask_hier(struct cpuset *root_cs, bool update_root)  				continue;  			}  		} -		if (!css_tryget(&cp->css)) +		if (!css_tryget_online(&cp->css))  			continue;  		rcu_read_unlock(); @@ -890,6 +882,7 @@ static void update_tasks_cpumask_hier(struct cpuset *root_cs, bool update_root)  /**   * update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it   * @cs: the cpuset to consider + * @trialcs: trial cpuset   * @buf: buffer of cpu numbers written to this cpuset   */  static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, @@ -1110,7 +1103,7 @@ static void update_tasks_nodemask_hier(struct cpuset *root_cs, bool update_root)  				continue;  			}  		} -		if (!css_tryget(&cp->css)) +		if (!css_tryget_online(&cp->css))  			continue;  		rcu_read_unlock(); @@ -1605,13 +1598,15 @@ out_unlock:  /*   * Common handling for a write to a "cpus" or "mems" file.   
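The number_of_cpusets counter is replaced above by the static key cpusets_enabled_key; cpuset_inc()/cpuset_dec()/nr_cpusets() are presumably thin header wrappers around the key, defined outside this hunk (an assumption here). The point is a near-free "are any non-root cpusets defined?" test on hot paths; a userspace stand-in modelling the same idea with an atomic counter instead of runtime branch patching:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/* Stand-in for the static key: the kernel patches a branch in the
 * instruction stream, here it is just an atomic counter. */
static atomic_int nontrivial_cpusets;

static bool cpusets_enabled_model(void)
{
	return atomic_load(&nontrivial_cpusets) > 0;
}

static void cpuset_inc_model(void) { atomic_fetch_add(&nontrivial_cpusets, 1); }
static void cpuset_dec_model(void) { atomic_fetch_sub(&nontrivial_cpusets, 1); }

int main(void)
{
	printf("%d\n", cpusets_enabled_model());	/* 0: fast paths skip cpuset checks */
	cpuset_inc_model();				/* a non-root cpuset came online */
	printf("%d\n", cpusets_enabled_model());	/* 1 */
	cpuset_dec_model();
	return 0;
}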
*/ -static int cpuset_write_resmask(struct cgroup_subsys_state *css, -				struct cftype *cft, char *buf) +static ssize_t cpuset_write_resmask(struct kernfs_open_file *of, +				    char *buf, size_t nbytes, loff_t off)  { -	struct cpuset *cs = css_cs(css); +	struct cpuset *cs = css_cs(of_css(of));  	struct cpuset *trialcs;  	int retval = -ENODEV; +	buf = strstrip(buf); +  	/*  	 * CPU or memory hotunplug may leave @cs w/o any execution  	 * resources, in which case the hotplug code asynchronously updates @@ -1635,7 +1630,7 @@ static int cpuset_write_resmask(struct cgroup_subsys_state *css,  		goto out_unlock;  	} -	switch (cft->private) { +	switch (of_cft(of)->private) {  	case FILE_CPULIST:  		retval = update_cpumask(cs, trialcs, buf);  		break; @@ -1650,7 +1645,7 @@ static int cpuset_write_resmask(struct cgroup_subsys_state *css,  	free_trial_cpuset(trialcs);  out_unlock:  	mutex_unlock(&cpuset_mutex); -	return retval; +	return retval ?: nbytes;  }  /* @@ -1752,7 +1747,7 @@ static struct cftype files[] = {  	{  		.name = "cpus",  		.seq_show = cpuset_common_seq_show, -		.write_string = cpuset_write_resmask, +		.write = cpuset_write_resmask,  		.max_write_len = (100U + 6 * NR_CPUS),  		.private = FILE_CPULIST,  	}, @@ -1760,7 +1755,7 @@ static struct cftype files[] = {  	{  		.name = "mems",  		.seq_show = cpuset_common_seq_show, -		.write_string = cpuset_write_resmask, +		.write = cpuset_write_resmask,  		.max_write_len = (100U + 6 * MAX_NUMNODES),  		.private = FILE_MEMLIST,  	}, @@ -1888,7 +1883,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)  	if (is_spread_slab(parent))  		set_bit(CS_SPREAD_SLAB, &cs->flags); -	number_of_cpusets++; +	cpuset_inc();  	if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags))  		goto out_unlock; @@ -1939,7 +1934,7 @@ static void cpuset_css_offline(struct cgroup_subsys_state *css)  	if (is_sched_load_balance(cs))  		update_flag(CS_SCHED_LOAD_BALANCE, cs, 0); -	number_of_cpusets--; +	cpuset_dec();  	clear_bit(CS_ONLINE, &cs->flags);  	mutex_unlock(&cpuset_mutex); @@ -1992,7 +1987,6 @@ int __init cpuset_init(void)  	if (!alloc_cpumask_var(&cpus_attach, GFP_KERNEL))  		BUG(); -	number_of_cpusets = 1;  	return 0;  } @@ -2017,7 +2011,7 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs)  		parent = parent_cs(parent);  	if (cgroup_transfer_tasks(parent->css.cgroup, cs->css.cgroup)) { -		printk(KERN_ERR "cpuset: failed to transfer tasks out of empty cpuset "); +		pr_err("cpuset: failed to transfer tasks out of empty cpuset ");  		pr_cont_cgroup_name(cs->css.cgroup);  		pr_cont("\n");  	} @@ -2155,7 +2149,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work)  		rcu_read_lock();  		cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) { -			if (cs == &top_cpuset || !css_tryget(&cs->css)) +			if (cs == &top_cpuset || !css_tryget_online(&cs->css))  				continue;  			rcu_read_unlock(); @@ -2536,7 +2530,7 @@ int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,  /**   * cpuset_print_task_mems_allowed - prints task's cpuset and mems_allowed - * @task: pointer to task_struct of some task. + * @tsk: pointer to task_struct of some task.   *   * Description: Prints @task's name, cpuset name, and cached copy of its   * mems_allowed to the kernel log. 
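cpuset_write_resmask() (like freezer_write() earlier in this series) now follows the kernfs .write convention: strip the buffer, apply the update, and return either a negative errno or the number of bytes consumed, hence the retval ?: nbytes idiom. A small userspace sketch of that calling convention; parse_list() is a made-up stand-in for the real update helpers:

#include <ctype.h>
#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <sys/types.h>

/* Made-up stand-in for update_cpumask()/update_nodemask(). */
static int parse_list(const char *buf)
{
	return *buf ? 0 : -EINVAL;
}

/* Trim leading/trailing whitespace in place, like the kernel's strstrip(). */
static char *strip(char *s)
{
	size_t len = strlen(s);

	while (len && isspace((unsigned char)s[len - 1]))
		s[--len] = '\0';
	while (isspace((unsigned char)*s))
		s++;
	return s;
}

/* Handler shape: 0 from the worker means "all nbytes consumed". */
static ssize_t write_resmask_model(char *buf, size_t nbytes)
{
	int retval = parse_list(strip(buf));

	return retval ?: nbytes;	/* GNU ?: extension, as used in the kernel */
}

int main(void)
{
	char buf[] = " 0-3\n";

	printf("%zd\n", write_resmask_model(buf, sizeof(buf) - 1));	/* 5 */
	return 0;
}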
@@ -2554,7 +2548,7 @@ void cpuset_print_task_mems_allowed(struct task_struct *tsk)  	cgrp = task_cs(tsk)->css.cgroup;  	nodelist_scnprintf(cpuset_nodelist, CPUSET_NODELIST_LEN,  			   tsk->mems_allowed); -	printk(KERN_INFO "%s cpuset=", tsk->comm); +	pr_info("%s cpuset=", tsk->comm);  	pr_cont_cgroup_name(cgrp);  	pr_cont(" mems_allowed=%s\n", cpuset_nodelist); @@ -2646,10 +2640,10 @@ out:  /* Display task mems_allowed in /proc/<pid>/status file. */  void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task)  { -	seq_printf(m, "Mems_allowed:\t"); +	seq_puts(m, "Mems_allowed:\t");  	seq_nodemask(m, &task->mems_allowed); -	seq_printf(m, "\n"); -	seq_printf(m, "Mems_allowed_list:\t"); +	seq_puts(m, "\n"); +	seq_puts(m, "Mems_allowed_list:\t");  	seq_nodemask_list(m, &task->mems_allowed); -	seq_printf(m, "\n"); +	seq_puts(m, "\n");  } diff --git a/kernel/debug/kdb/kdb_bt.c b/kernel/debug/kdb/kdb_bt.c index b03e0e814e43..fe15fff5df53 100644 --- a/kernel/debug/kdb/kdb_bt.c +++ b/kernel/debug/kdb/kdb_bt.c @@ -21,7 +21,7 @@  static void kdb_show_stack(struct task_struct *p, void *addr)  {  	int old_lvl = console_loglevel; -	console_loglevel = 15; +	console_loglevel = CONSOLE_LOGLEVEL_MOTORMOUTH;  	kdb_trap_printk++;  	kdb_set_current_task(p);  	if (addr) { diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c index 14ff4849262c..7c70812caea5 100644 --- a/kernel/debug/kdb/kdb_io.c +++ b/kernel/debug/kdb/kdb_io.c @@ -710,7 +710,7 @@ kdb_printit:  	}  	if (logging) {  		saved_loglevel = console_loglevel; -		console_loglevel = 0; +		console_loglevel = CONSOLE_LOGLEVEL_SILENT;  		printk(KERN_INFO "%s", kdb_buffer);  	} diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index 0b097c8a1e50..2f7c760305ca 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -1091,7 +1091,7 @@ static int kdb_reboot(int argc, const char **argv)  static void kdb_dumpregs(struct pt_regs *regs)  {  	int old_lvl = console_loglevel; -	console_loglevel = 15; +	console_loglevel = CONSOLE_LOGLEVEL_MOTORMOUTH;  	kdb_trap_printk++;  	show_regs(regs);  	kdb_trap_printk--; diff --git a/kernel/events/core.c b/kernel/events/core.c index f83a71a3e46d..24d35cc38e42 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -39,6 +39,7 @@  #include <linux/hw_breakpoint.h>  #include <linux/mm_types.h>  #include <linux/cgroup.h> +#include <linux/module.h>  #include "internal.h" @@ -607,7 +608,8 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event,  	if (!f.file)  		return -EBADF; -	css = css_tryget_from_dir(f.file->f_dentry, &perf_event_cgrp_subsys); +	css = css_tryget_online_from_dir(f.file->f_dentry, +					 &perf_event_cgrp_subsys);  	if (IS_ERR(css)) {  		ret = PTR_ERR(css);  		goto out; @@ -1443,6 +1445,11 @@ group_sched_out(struct perf_event *group_event,  		cpuctx->exclusive = 0;  } +struct remove_event { +	struct perf_event *event; +	bool detach_group; +}; +  /*   * Cross CPU call to remove a performance event   * @@ -1451,12 +1458,15 @@ group_sched_out(struct perf_event *group_event,   */  static int __perf_remove_from_context(void *info)  { -	struct perf_event *event = info; +	struct remove_event *re = info; +	struct perf_event *event = re->event;  	struct perf_event_context *ctx = event->ctx;  	struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);  	raw_spin_lock(&ctx->lock);  	event_sched_out(event, cpuctx, ctx); +	if (re->detach_group) +		perf_group_detach(event);  	list_del_event(event, ctx);  	if (!ctx->nr_events && 
cpuctx->task_ctx == ctx) {  		ctx->is_active = 0; @@ -1481,10 +1491,14 @@ static int __perf_remove_from_context(void *info)   * When called from perf_event_exit_task, it's OK because the   * context has been detached from its task.   */ -static void perf_remove_from_context(struct perf_event *event) +static void perf_remove_from_context(struct perf_event *event, bool detach_group)  {  	struct perf_event_context *ctx = event->ctx;  	struct task_struct *task = ctx->task; +	struct remove_event re = { +		.event = event, +		.detach_group = detach_group, +	};  	lockdep_assert_held(&ctx->mutex); @@ -1493,12 +1507,12 @@ static void perf_remove_from_context(struct perf_event *event)  		 * Per cpu events are removed via an smp call and  		 * the removal is always successful.  		 */ -		cpu_function_call(event->cpu, __perf_remove_from_context, event); +		cpu_function_call(event->cpu, __perf_remove_from_context, &re);  		return;  	}  retry: -	if (!task_function_call(task, __perf_remove_from_context, event)) +	if (!task_function_call(task, __perf_remove_from_context, &re))  		return;  	raw_spin_lock_irq(&ctx->lock); @@ -1515,6 +1529,8 @@ retry:  	 * Since the task isn't running, its safe to remove the event, us  	 * holding the ctx->lock ensures the task won't get scheduled in.  	 */ +	if (detach_group) +		perf_group_detach(event);  	list_del_event(event, ctx);  	raw_spin_unlock_irq(&ctx->lock);  } @@ -1663,6 +1679,8 @@ event_sched_in(struct perf_event *event,  	u64 tstamp = perf_event_time(event);  	int ret = 0; +	lockdep_assert_held(&ctx->lock); +  	if (event->state <= PERF_EVENT_STATE_OFF)  		return 0; @@ -3178,7 +3196,8 @@ static void free_event_rcu(struct rcu_head *head)  }  static void ring_buffer_put(struct ring_buffer *rb); -static void ring_buffer_detach(struct perf_event *event, struct ring_buffer *rb); +static void ring_buffer_attach(struct perf_event *event, +			       struct ring_buffer *rb);  static void unaccount_event_cpu(struct perf_event *event, int cpu)  { @@ -3229,17 +3248,19 @@ static void __free_event(struct perf_event *event)  	if (event->ctx)  		put_ctx(event->ctx); +	if (event->pmu) +		module_put(event->pmu->module); +  	call_rcu(&event->rcu_head, free_event_rcu);  } -static void free_event(struct perf_event *event) + +static void _free_event(struct perf_event *event)  {  	irq_work_sync(&event->pending);  	unaccount_event(event);  	if (event->rb) { -		struct ring_buffer *rb; -  		/*  		 * Can happen when we close an event with re-directed output.  		 * @@ -3247,57 +3268,38 @@ static void free_event(struct perf_event *event)  		 * over us; possibly making our ring_buffer_put() the last.  		 */  		mutex_lock(&event->mmap_mutex); -		rb = event->rb; -		if (rb) { -			rcu_assign_pointer(event->rb, NULL); -			ring_buffer_detach(event, rb); -			ring_buffer_put(rb); /* could be last */ -		} +		ring_buffer_attach(event, NULL);  		mutex_unlock(&event->mmap_mutex);  	}  	if (is_cgroup_event(event))  		perf_detach_cgroup(event); -  	__free_event(event);  } -int perf_event_release_kernel(struct perf_event *event) +/* + * Used to free events which have a known refcount of 1, such as in error paths + * where the event isn't exposed yet and inherited events. + */ +static void free_event(struct perf_event *event)  { -	struct perf_event_context *ctx = event->ctx; - -	WARN_ON_ONCE(ctx->parent_ctx); -	/* -	 * There are two ways this annotation is useful: -	 * -	 *  1) there is a lock recursion from perf_event_exit_task -	 *     see the comment there. 
-	 * -	 *  2) there is a lock-inversion with mmap_sem through -	 *     perf_event_read_group(), which takes faults while -	 *     holding ctx->mutex, however this is called after -	 *     the last filedesc died, so there is no possibility -	 *     to trigger the AB-BA case. -	 */ -	mutex_lock_nested(&ctx->mutex, SINGLE_DEPTH_NESTING); -	raw_spin_lock_irq(&ctx->lock); -	perf_group_detach(event); -	raw_spin_unlock_irq(&ctx->lock); -	perf_remove_from_context(event); -	mutex_unlock(&ctx->mutex); - -	free_event(event); +	if (WARN(atomic_long_cmpxchg(&event->refcount, 1, 0) != 1, +				"unexpected event refcount: %ld; ptr=%p\n", +				atomic_long_read(&event->refcount), event)) { +		/* leak to avoid use-after-free */ +		return; +	} -	return 0; +	_free_event(event);  } -EXPORT_SYMBOL_GPL(perf_event_release_kernel);  /*   * Called when the last reference to the file is gone.   */  static void put_event(struct perf_event *event)  { +	struct perf_event_context *ctx = event->ctx;  	struct task_struct *owner;  	if (!atomic_long_dec_and_test(&event->refcount)) @@ -3336,9 +3338,33 @@ static void put_event(struct perf_event *event)  		put_task_struct(owner);  	} -	perf_event_release_kernel(event); +	WARN_ON_ONCE(ctx->parent_ctx); +	/* +	 * There are two ways this annotation is useful: +	 * +	 *  1) there is a lock recursion from perf_event_exit_task +	 *     see the comment there. +	 * +	 *  2) there is a lock-inversion with mmap_sem through +	 *     perf_event_read_group(), which takes faults while +	 *     holding ctx->mutex, however this is called after +	 *     the last filedesc died, so there is no possibility +	 *     to trigger the AB-BA case. +	 */ +	mutex_lock_nested(&ctx->mutex, SINGLE_DEPTH_NESTING); +	perf_remove_from_context(event, true); +	mutex_unlock(&ctx->mutex); + +	_free_event(event);  } +int perf_event_release_kernel(struct perf_event *event) +{ +	put_event(event); +	return 0; +} +EXPORT_SYMBOL_GPL(perf_event_release_kernel); +  static int perf_release(struct inode *inode, struct file *file)  {  	put_event(file->private_data); @@ -3839,28 +3865,47 @@ unlock:  static void ring_buffer_attach(struct perf_event *event,  			       struct ring_buffer *rb)  { +	struct ring_buffer *old_rb = NULL;  	unsigned long flags; -	if (!list_empty(&event->rb_entry)) -		return; +	if (event->rb) { +		/* +		 * Should be impossible, we set this when removing +		 * event->rb_entry and wait/clear when adding event->rb_entry. 
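free_event() above is narrowed to objects whose refcount is known to be exactly 1 (error paths, inherited events); the atomic cmpxchg turns a miscounted free into a warning plus a deliberate leak instead of a use-after-free. The same guard in plain C11, assuming an ordinary heap object:

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct obj {
	atomic_long refcount;
};

/* Only valid when the caller holds the single remaining reference.
 * On a mismatch we warn and leak on purpose rather than free an
 * object somebody else may still be using. */
static void free_obj(struct obj *o)
{
	long expected = 1;

	if (!atomic_compare_exchange_strong(&o->refcount, &expected, 0)) {
		fprintf(stderr, "unexpected refcount %ld for %p, leaking\n",
			expected, (void *)o);
		return;
	}
	free(o);
}

int main(void)
{
	struct obj *o = calloc(1, sizeof(*o));

	if (!o)
		return 1;
	atomic_init(&o->refcount, 1);
	free_obj(o);
	return 0;
}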
+		 */ +		WARN_ON_ONCE(event->rcu_pending); -	spin_lock_irqsave(&rb->event_lock, flags); -	if (list_empty(&event->rb_entry)) -		list_add(&event->rb_entry, &rb->event_list); -	spin_unlock_irqrestore(&rb->event_lock, flags); -} +		old_rb = event->rb; +		event->rcu_batches = get_state_synchronize_rcu(); +		event->rcu_pending = 1; -static void ring_buffer_detach(struct perf_event *event, struct ring_buffer *rb) -{ -	unsigned long flags; +		spin_lock_irqsave(&old_rb->event_lock, flags); +		list_del_rcu(&event->rb_entry); +		spin_unlock_irqrestore(&old_rb->event_lock, flags); +	} -	if (list_empty(&event->rb_entry)) -		return; +	if (event->rcu_pending && rb) { +		cond_synchronize_rcu(event->rcu_batches); +		event->rcu_pending = 0; +	} + +	if (rb) { +		spin_lock_irqsave(&rb->event_lock, flags); +		list_add_rcu(&event->rb_entry, &rb->event_list); +		spin_unlock_irqrestore(&rb->event_lock, flags); +	} + +	rcu_assign_pointer(event->rb, rb); -	spin_lock_irqsave(&rb->event_lock, flags); -	list_del_init(&event->rb_entry); -	wake_up_all(&event->waitq); -	spin_unlock_irqrestore(&rb->event_lock, flags); +	if (old_rb) { +		ring_buffer_put(old_rb); +		/* +		 * Since we detached before setting the new rb, so that we +		 * could attach the new rb, we could have missed a wakeup. +		 * Provide it now. +		 */ +		wake_up_all(&event->waitq); +	}  }  static void ring_buffer_wakeup(struct perf_event *event) @@ -3929,7 +3974,7 @@ static void perf_mmap_close(struct vm_area_struct *vma)  {  	struct perf_event *event = vma->vm_file->private_data; -	struct ring_buffer *rb = event->rb; +	struct ring_buffer *rb = ring_buffer_get(event);  	struct user_struct *mmap_user = rb->mmap_user;  	int mmap_locked = rb->mmap_locked;  	unsigned long size = perf_data_size(rb); @@ -3937,18 +3982,14 @@ static void perf_mmap_close(struct vm_area_struct *vma)  	atomic_dec(&rb->mmap_count);  	if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) -		return; +		goto out_put; -	/* Detach current event from the buffer. */ -	rcu_assign_pointer(event->rb, NULL); -	ring_buffer_detach(event, rb); +	ring_buffer_attach(event, NULL);  	mutex_unlock(&event->mmap_mutex);  	/* If there's still other mmap()s of this buffer, we're done. */ -	if (atomic_read(&rb->mmap_count)) { -		ring_buffer_put(rb); /* can't be last */ -		return; -	} +	if (atomic_read(&rb->mmap_count)) +		goto out_put;  	/*  	 * No other mmap()s, detach from all other events that might redirect @@ -3978,11 +4019,9 @@ again:  		 * still restart the iteration to make sure we're not now  		 * iterating the wrong list.  		 
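ring_buffer_attach() is now the single path for both attaching and detaching (rb == NULL means detach): unlink from the old buffer, record an RCU grace-period cookie, wait for it only if a new buffer is actually being attached, publish the new pointer, and finally issue the wakeup that may have been missed while detached. A compile-only sketch of that ordering; the locking and RCU calls are reduced to stub functions since there is no userspace equivalent here:

struct rb;			/* opaque ring buffer */

struct ev {
	struct rb *rb;
	int rcu_pending;
	unsigned long rcu_batches;
};

/* Stubs standing in for get_state_synchronize_rcu(), cond_synchronize_rcu(),
 * the rb->event_lock list manipulation, ring_buffer_put() and wake_up_all(). */
static unsigned long grace_cookie(void) { return 0; }
static void wait_for_cookie(unsigned long c) { (void)c; }
static void unlink_from(struct rb *r, struct ev *e) { (void)r; (void)e; }
static void link_to(struct rb *r, struct ev *e) { (void)r; (void)e; }
static void rb_put(struct rb *r) { (void)r; }
static void wake_waiters(struct ev *e) { (void)e; }

static void attach_model(struct ev *event, struct rb *rb)
{
	struct rb *old = NULL;

	if (event->rb) {			/* detach from the old buffer */
		old = event->rb;
		event->rcu_batches = grace_cookie();
		event->rcu_pending = 1;
		unlink_from(old, event);
	}

	if (event->rcu_pending && rb) {		/* pay for the grace period only on re-attach */
		wait_for_cookie(event->rcu_batches);
		event->rcu_pending = 0;
	}

	if (rb)
		link_to(rb, event);

	event->rb = rb;				/* rcu_assign_pointer() in the real code */

	if (old) {
		rb_put(old);
		wake_waiters(event);		/* wakeup possibly missed while detached */
	}
}

int main(void)
{
	struct ev e = { 0 };

	attach_model(&e, 0);			/* detach with nothing attached: no-op */
	return 0;
}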
*/ -		if (event->rb == rb) { -			rcu_assign_pointer(event->rb, NULL); -			ring_buffer_detach(event, rb); -			ring_buffer_put(rb); /* can't be last, we still have one */ -		} +		if (event->rb == rb) +			ring_buffer_attach(event, NULL); +  		mutex_unlock(&event->mmap_mutex);  		put_event(event); @@ -4007,6 +4046,7 @@ again:  	vma->vm_mm->pinned_vm -= mmap_locked;  	free_uid(mmap_user); +out_put:  	ring_buffer_put(rb); /* could be last */  } @@ -4124,7 +4164,6 @@ again:  	vma->vm_mm->pinned_vm += extra;  	ring_buffer_attach(event, rb); -	rcu_assign_pointer(event->rb, rb);  	perf_event_init_userpage(event);  	perf_event_update_userpage(event); @@ -5408,6 +5447,9 @@ struct swevent_htable {  	/* Recursion avoidance in each contexts */  	int				recursion[PERF_NR_CONTEXTS]; + +	/* Keeps track of cpu being initialized/exited */ +	bool				online;  };  static DEFINE_PER_CPU(struct swevent_htable, swevent_htable); @@ -5654,8 +5696,14 @@ static int perf_swevent_add(struct perf_event *event, int flags)  	hwc->state = !(flags & PERF_EF_START);  	head = find_swevent_head(swhash, event); -	if (WARN_ON_ONCE(!head)) +	if (!head) { +		/* +		 * We can race with cpu hotplug code. Do not +		 * WARN if the cpu just got unplugged. +		 */ +		WARN_ON_ONCE(swhash->online);  		return -EINVAL; +	}  	hlist_add_head_rcu(&event->hlist_entry, head); @@ -6551,6 +6599,7 @@ free_pdc:  	free_percpu(pmu->pmu_disable_count);  	goto unlock;  } +EXPORT_SYMBOL_GPL(perf_pmu_register);  void perf_pmu_unregister(struct pmu *pmu)  { @@ -6572,6 +6621,7 @@ void perf_pmu_unregister(struct pmu *pmu)  	put_device(pmu->dev);  	free_pmu_context(pmu);  } +EXPORT_SYMBOL_GPL(perf_pmu_unregister);  struct pmu *perf_init_event(struct perf_event *event)  { @@ -6585,6 +6635,10 @@ struct pmu *perf_init_event(struct perf_event *event)  	pmu = idr_find(&pmu_idr, event->attr.type);  	rcu_read_unlock();  	if (pmu) { +		if (!try_module_get(pmu->module)) { +			pmu = ERR_PTR(-ENODEV); +			goto unlock; +		}  		event->pmu = pmu;  		ret = pmu->event_init(event);  		if (ret) @@ -6593,6 +6647,10 @@ struct pmu *perf_init_event(struct perf_event *event)  	}  	list_for_each_entry_rcu(pmu, &pmus, entry) { +		if (!try_module_get(pmu->module)) { +			pmu = ERR_PTR(-ENODEV); +			goto unlock; +		}  		event->pmu = pmu;  		ret = pmu->event_init(event);  		if (!ret) @@ -6771,6 +6829,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,  err_pmu:  	if (event->destroy)  		event->destroy(event); +	module_put(pmu->module);  err_ns:  	if (event->ns)  		put_pid_ns(event->ns); @@ -6914,7 +6973,7 @@ err_size:  static int  perf_event_set_output(struct perf_event *event, struct perf_event *output_event)  { -	struct ring_buffer *rb = NULL, *old_rb = NULL; +	struct ring_buffer *rb = NULL;  	int ret = -EINVAL;  	if (!output_event) @@ -6942,8 +7001,6 @@ set:  	if (atomic_read(&event->mmap_count))  		goto unlock; -	old_rb = event->rb; -  	if (output_event) {  		/* get the rb we want to redirect to */  		rb = ring_buffer_get(output_event); @@ -6951,23 +7008,7 @@ set:  			goto unlock;  	} -	if (old_rb) -		ring_buffer_detach(event, old_rb); - -	if (rb) -		ring_buffer_attach(event, rb); - -	rcu_assign_pointer(event->rb, rb); - -	if (old_rb) { -		ring_buffer_put(old_rb); -		/* -		 * Since we detached before setting the new rb, so that we -		 * could attach the new rb, we could have missed a wakeup. -		 * Provide it now. 
-		 */ -		wake_up_all(&event->waitq); -	} +	ring_buffer_attach(event, rb);  	ret = 0;  unlock: @@ -7018,6 +7059,9 @@ SYSCALL_DEFINE5(perf_event_open,  	if (attr.freq) {  		if (attr.sample_freq > sysctl_perf_event_sample_rate)  			return -EINVAL; +	} else { +		if (attr.sample_period & (1ULL << 63)) +			return -EINVAL;  	}  	/* @@ -7055,20 +7099,26 @@ SYSCALL_DEFINE5(perf_event_open,  		}  	} +	if (task && group_leader && +	    group_leader->attr.inherit != attr.inherit) { +		err = -EINVAL; +		goto err_task; +	} +  	get_online_cpus();  	event = perf_event_alloc(&attr, cpu, task, group_leader, NULL,  				 NULL, NULL);  	if (IS_ERR(event)) {  		err = PTR_ERR(event); -		goto err_task; +		goto err_cpus;  	}  	if (flags & PERF_FLAG_PID_CGROUP) {  		err = perf_cgroup_connect(pid, event, &attr, group_leader);  		if (err) {  			__free_event(event); -			goto err_task; +			goto err_cpus;  		}  	} @@ -7165,7 +7215,7 @@ SYSCALL_DEFINE5(perf_event_open,  		struct perf_event_context *gctx = group_leader->ctx;  		mutex_lock(&gctx->mutex); -		perf_remove_from_context(group_leader); +		perf_remove_from_context(group_leader, false);  		/*  		 * Removing from the context ends up with disabled @@ -7175,7 +7225,7 @@ SYSCALL_DEFINE5(perf_event_open,  		perf_event__state_init(group_leader);  		list_for_each_entry(sibling, &group_leader->sibling_list,  				    group_entry) { -			perf_remove_from_context(sibling); +			perf_remove_from_context(sibling, false);  			perf_event__state_init(sibling);  			put_ctx(gctx);  		} @@ -7230,8 +7280,9 @@ err_context:  	put_ctx(ctx);  err_alloc:  	free_event(event); -err_task: +err_cpus:  	put_online_cpus(); +err_task:  	if (task)  		put_task_struct(task);  err_group_fd: @@ -7305,7 +7356,7 @@ void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu)  	mutex_lock(&src_ctx->mutex);  	list_for_each_entry_safe(event, tmp, &src_ctx->event_list,  				 event_entry) { -		perf_remove_from_context(event); +		perf_remove_from_context(event, false);  		unaccount_event_cpu(event, src_cpu);  		put_ctx(src_ctx);  		list_add(&event->migrate_entry, &events); @@ -7367,13 +7418,7 @@ __perf_event_exit_task(struct perf_event *child_event,  			 struct perf_event_context *child_ctx,  			 struct task_struct *child)  { -	if (child_event->parent) { -		raw_spin_lock_irq(&child_ctx->lock); -		perf_group_detach(child_event); -		raw_spin_unlock_irq(&child_ctx->lock); -	} - -	perf_remove_from_context(child_event); +	perf_remove_from_context(child_event, true);  	/*  	 * It can happen that the parent exits first, and has events @@ -7388,7 +7433,7 @@ __perf_event_exit_task(struct perf_event *child_event,  static void perf_event_exit_task_context(struct task_struct *child, int ctxn)  { -	struct perf_event *child_event, *tmp; +	struct perf_event *child_event;  	struct perf_event_context *child_ctx;  	unsigned long flags; @@ -7442,24 +7487,9 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn)  	 */  	mutex_lock(&child_ctx->mutex); -again: -	list_for_each_entry_safe(child_event, tmp, &child_ctx->pinned_groups, -				 group_entry) +	list_for_each_entry_rcu(child_event, &child_ctx->event_list, event_entry)  		__perf_event_exit_task(child_event, child_ctx, child); -	list_for_each_entry_safe(child_event, tmp, &child_ctx->flexible_groups, -				 group_entry) -		__perf_event_exit_task(child_event, child_ctx, child); - -	/* -	 * If the last event was a group event, it will have appended all -	 * its siblings to the list, but we obtained 'tmp' before that which -	 * will still point 
to the list head terminating the iteration. -	 */ -	if (!list_empty(&child_ctx->pinned_groups) || -	    !list_empty(&child_ctx->flexible_groups)) -		goto again; -  	mutex_unlock(&child_ctx->mutex);  	put_ctx(child_ctx); @@ -7724,6 +7754,8 @@ int perf_event_init_context(struct task_struct *child, int ctxn)  	 * swapped under us.  	 */  	parent_ctx = perf_pin_task_context(parent, ctxn); +	if (!parent_ctx) +		return 0;  	/*  	 * No need to check if parent_ctx != NULL here; since we saw @@ -7835,6 +7867,7 @@ static void perf_event_init_cpu(int cpu)  	struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);  	mutex_lock(&swhash->hlist_mutex); +	swhash->online = true;  	if (swhash->hlist_refcount > 0) {  		struct swevent_hlist *hlist; @@ -7857,14 +7890,14 @@ static void perf_pmu_rotate_stop(struct pmu *pmu)  static void __perf_event_exit_context(void *__info)  { +	struct remove_event re = { .detach_group = false };  	struct perf_event_context *ctx = __info; -	struct perf_event *event;  	perf_pmu_rotate_stop(ctx->pmu);  	rcu_read_lock(); -	list_for_each_entry_rcu(event, &ctx->event_list, event_entry) -		__perf_remove_from_context(event); +	list_for_each_entry_rcu(re.event, &ctx->event_list, event_entry) +		__perf_remove_from_context(&re);  	rcu_read_unlock();  } @@ -7892,6 +7925,7 @@ static void perf_event_exit_cpu(int cpu)  	perf_event_exit_cpu_context(cpu);  	mutex_lock(&swhash->hlist_mutex); +	swhash->online = false;  	swevent_hlist_release(swhash);  	mutex_unlock(&swhash->hlist_mutex);  } diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 04709b66369d..adcd76a96839 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -60,8 +60,6 @@ static struct percpu_rw_semaphore dup_mmap_sem;  /* Have a copy of original instruction */  #define UPROBE_COPY_INSN	0 -/* Can skip singlestep */ -#define UPROBE_SKIP_SSTEP	1  struct uprobe {  	struct rb_node		rb_node;	/* node in the rb tree */ @@ -491,12 +489,9 @@ static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset)  	uprobe->offset = offset;  	init_rwsem(&uprobe->register_rwsem);  	init_rwsem(&uprobe->consumer_rwsem); -	/* For now assume that the instruction need not be single-stepped */ -	__set_bit(UPROBE_SKIP_SSTEP, &uprobe->flags);  	/* add to uprobes_tree, sorted on inode:offset */  	cur_uprobe = insert_uprobe(uprobe); -  	/* a uprobe exists for this inode:offset combination */  	if (cur_uprobe) {  		kfree(uprobe); @@ -1296,14 +1291,8 @@ static unsigned long xol_get_insn_slot(struct uprobe *uprobe)  	if (unlikely(!xol_vaddr))  		return 0; -	/* Initialize the slot */ -	copy_to_page(area->page, xol_vaddr, -			&uprobe->arch.ixol, sizeof(uprobe->arch.ixol)); -	/* -	 * We probably need flush_icache_user_range() but it needs vma. -	 * This should work on supported architectures too. -	 */ -	flush_dcache_page(area->page); +	arch_uprobe_copy_ixol(area->page, xol_vaddr, +			      &uprobe->arch.ixol, sizeof(uprobe->arch.ixol));  	return xol_vaddr;  } @@ -1346,6 +1335,21 @@ static void xol_free_insn_slot(struct task_struct *tsk)  	}  } +void __weak arch_uprobe_copy_ixol(struct page *page, unsigned long vaddr, +				  void *src, unsigned long len) +{ +	/* Initialize the slot */ +	copy_to_page(page, vaddr, src, len); + +	/* +	 * We probably need flush_icache_user_range() but it needs vma. +	 * This should work on most of architectures by default. If +	 * architecture needs to do something different it can define +	 * its own version of the function. 
+	 */ +	flush_dcache_page(page); +} +  /**   * uprobe_get_swbp_addr - compute address of swbp given post-swbp regs   * @regs: Reflects the saved state of the task after it has hit a breakpoint @@ -1628,20 +1632,6 @@ bool uprobe_deny_signal(void)  	return true;  } -/* - * Avoid singlestepping the original instruction if the original instruction - * is a NOP or can be emulated. - */ -static bool can_skip_sstep(struct uprobe *uprobe, struct pt_regs *regs) -{ -	if (test_bit(UPROBE_SKIP_SSTEP, &uprobe->flags)) { -		if (arch_uprobe_skip_sstep(&uprobe->arch, regs)) -			return true; -		clear_bit(UPROBE_SKIP_SSTEP, &uprobe->flags); -	} -	return false; -} -  static void mmf_recalc_uprobes(struct mm_struct *mm)  {  	struct vm_area_struct *vma; @@ -1868,13 +1858,13 @@ static void handle_swbp(struct pt_regs *regs)  	handler_chain(uprobe, regs); -	if (can_skip_sstep(uprobe, regs)) +	if (arch_uprobe_skip_sstep(&uprobe->arch, regs))  		goto out;  	if (!pre_ssout(uprobe, regs, bp_vaddr))  		return; -	/* can_skip_sstep() succeeded, or restart if can't singlestep */ +	/* arch_uprobe_skip_sstep() succeeded, or restart if can't singlestep */  out:  	put_uprobe(uprobe);  } @@ -1886,10 +1876,11 @@ out:  static void handle_singlestep(struct uprobe_task *utask, struct pt_regs *regs)  {  	struct uprobe *uprobe; +	int err = 0;  	uprobe = utask->active_uprobe;  	if (utask->state == UTASK_SSTEP_ACK) -		arch_uprobe_post_xol(&uprobe->arch, regs); +		err = arch_uprobe_post_xol(&uprobe->arch, regs);  	else if (utask->state == UTASK_SSTEP_TRAPPED)  		arch_uprobe_abort_xol(&uprobe->arch, regs);  	else @@ -1903,6 +1894,11 @@ static void handle_singlestep(struct uprobe_task *utask, struct pt_regs *regs)  	spin_lock_irq(¤t->sighand->siglock);  	recalc_sigpending(); /* see uprobe_deny_signal() */  	spin_unlock_irq(¤t->sighand->siglock); + +	if (unlikely(err)) { +		uprobe_warn(current, "execute the probed insn, sending SIGILL."); +		force_sig_info(SIGILL, SEND_SIG_FORCED, current); +	}  }  /* diff --git a/kernel/exec_domain.c b/kernel/exec_domain.c index 0dbeae374225..83d4382f5699 100644 --- a/kernel/exec_domain.c +++ b/kernel/exec_domain.c @@ -37,7 +37,7 @@ static unsigned long ident_map[32] = {  struct exec_domain default_exec_domain = {  	.name		= "Linux",		/* name */  	.handler	= default_handler,	/* lcall7 causes a seg fault. */ -	.pers_low	= 0, 			/* PER_LINUX personality. */ +	.pers_low	= 0,			/* PER_LINUX personality. */  	.pers_high	= 0,			/* PER_LINUX personality. */  	.signal_map	= ident_map,		/* Identity map signals. */  	.signal_invmap	= ident_map,		/*  - both ways. 
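arch_uprobe_copy_ixol() above is a __weak default that an architecture can replace by providing a strong symbol with the same signature. The weak-symbol mechanism itself, in standalone form with invented names:

#include <stdio.h>
#include <string.h>

/* Generic default; any other object file providing a non-weak
 * definition of copy_ixol_slot() wins at link time. */
__attribute__((weak))
void copy_ixol_slot(void *dst, const void *src, unsigned long len)
{
	memcpy(dst, src, len);
	/* an architecture override would also flush caches here */
}

int main(void)
{
	unsigned char slot[4] = { 0 };
	const unsigned char insn[4] = { 0x90, 0x90, 0x90, 0x90 };

	copy_ixol_slot(slot, insn, sizeof(insn));
	printf("%d\n", slot[0] == 0x90);	/* 1 */
	return 0;
}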
*/ @@ -83,7 +83,7 @@ lookup_exec_domain(unsigned int personality)  	ep = &default_exec_domain;  out:  	read_unlock(&exec_domains_lock); -	return (ep); +	return ep;  }  int @@ -110,8 +110,9 @@ register_exec_domain(struct exec_domain *ep)  out:  	write_unlock(&exec_domains_lock); -	return (err); +	return err;  } +EXPORT_SYMBOL(register_exec_domain);  int  unregister_exec_domain(struct exec_domain *ep) @@ -133,6 +134,7 @@ unregister:  	write_unlock(&exec_domains_lock);  	return 0;  } +EXPORT_SYMBOL(unregister_exec_domain);  int __set_personality(unsigned int personality)  { @@ -144,6 +146,7 @@ int __set_personality(unsigned int personality)  	return 0;  } +EXPORT_SYMBOL(__set_personality);  #ifdef CONFIG_PROC_FS  static int execdomains_proc_show(struct seq_file *m, void *v) @@ -188,8 +191,3 @@ SYSCALL_DEFINE1(personality, unsigned int, personality)  	return old;  } - - -EXPORT_SYMBOL(register_exec_domain); -EXPORT_SYMBOL(unregister_exec_domain); -EXPORT_SYMBOL(__set_personality); diff --git a/kernel/exit.c b/kernel/exit.c index 6ed6a1d552b5..e5c4668f1799 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -313,46 +313,7 @@ kill_orphaned_pgrp(struct task_struct *tsk, struct task_struct *parent)  	}  } -/* - * Let kernel threads use this to say that they allow a certain signal. - * Must not be used if kthread was cloned with CLONE_SIGHAND. - */ -int allow_signal(int sig) -{ -	if (!valid_signal(sig) || sig < 1) -		return -EINVAL; - -	spin_lock_irq(¤t->sighand->siglock); -	/* This is only needed for daemonize()'ed kthreads */ -	sigdelset(¤t->blocked, sig); -	/* -	 * Kernel threads handle their own signals. Let the signal code -	 * know it'll be handled, so that they don't get converted to -	 * SIGKILL or just silently dropped. -	 */ -	current->sighand->action[(sig)-1].sa.sa_handler = (void __user *)2; -	recalc_sigpending(); -	spin_unlock_irq(¤t->sighand->siglock); -	return 0; -} - -EXPORT_SYMBOL(allow_signal); - -int disallow_signal(int sig) -{ -	if (!valid_signal(sig) || sig < 1) -		return -EINVAL; - -	spin_lock_irq(¤t->sighand->siglock); -	current->sighand->action[(sig)-1].sa.sa_handler = SIG_IGN; -	recalc_sigpending(); -	spin_unlock_irq(¤t->sighand->siglock); -	return 0; -} - -EXPORT_SYMBOL(disallow_signal); - -#ifdef CONFIG_MM_OWNER +#ifdef CONFIG_MEMCG  /*   * A task is exiting.   If it owned this mm, find a new owner for the mm.   */ @@ -395,14 +356,18 @@ retry:  	}  	/* -	 * Search through everything else. We should not get -	 * here often +	 * Search through everything else, we should not get here often.  	 
*/ -	do_each_thread(g, c) { -		if (c->mm == mm) -			goto assign_new_owner; -	} while_each_thread(g, c); - +	for_each_process(g) { +		if (g->flags & PF_KTHREAD) +			continue; +		for_each_thread(g, c) { +			if (c->mm == mm) +				goto assign_new_owner; +			if (c->mm) +				break; +		} +	}  	read_unlock(&tasklist_lock);  	/*  	 * We found no owner yet mm_users > 1: this implies that we are @@ -434,7 +399,7 @@ assign_new_owner:  	task_unlock(c);  	put_task_struct(c);  } -#endif /* CONFIG_MM_OWNER */ +#endif /* CONFIG_MEMCG */  /*   * Turn us into a lazy TLB process if we diff --git a/kernel/fork.c b/kernel/fork.c index 54a8d26f612f..d2799d1fc952 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -150,15 +150,15 @@ void __weak arch_release_thread_info(struct thread_info *ti)  static struct thread_info *alloc_thread_info_node(struct task_struct *tsk,  						  int node)  { -	struct page *page = alloc_pages_node(node, THREADINFO_GFP_ACCOUNTED, -					     THREAD_SIZE_ORDER); +	struct page *page = alloc_kmem_pages_node(node, THREADINFO_GFP, +						  THREAD_SIZE_ORDER);  	return page ? page_address(page) : NULL;  }  static inline void free_thread_info(struct thread_info *ti)  { -	free_memcg_kmem_pages((unsigned long)ti, THREAD_SIZE_ORDER); +	free_kmem_pages((unsigned long)ti, THREAD_SIZE_ORDER);  }  # else  static struct kmem_cache *thread_info_cache; @@ -1099,12 +1099,12 @@ static void rt_mutex_init_task(struct task_struct *p)  #endif  } -#ifdef CONFIG_MM_OWNER +#ifdef CONFIG_MEMCG  void mm_init_owner(struct mm_struct *mm, struct task_struct *p)  {  	mm->owner = p;  } -#endif /* CONFIG_MM_OWNER */ +#endif /* CONFIG_MEMCG */  /*   * Initialize POSIX timer handling for a single task. @@ -1606,10 +1606,12 @@ long do_fork(unsigned long clone_flags,  	 */  	if (!IS_ERR(p)) {  		struct completion vfork; +		struct pid *pid;  		trace_sched_process_fork(current, p); -		nr = task_pid_vnr(p); +		pid = get_task_pid(p, PIDTYPE_PID); +		nr = pid_vnr(pid);  		if (clone_flags & CLONE_PARENT_SETTID)  			put_user(nr, parent_tidptr); @@ -1624,12 +1626,14 @@ long do_fork(unsigned long clone_flags,  		/* forking complete and child started to run, tell ptracer */  		if (unlikely(trace)) -			ptrace_event(trace, nr); +			ptrace_event_pid(trace, pid);  		if (clone_flags & CLONE_VFORK) {  			if (!wait_for_vfork_done(p, &vfork)) -				ptrace_event(PTRACE_EVENT_VFORK_DONE, nr); +				ptrace_event_pid(PTRACE_EVENT_VFORK_DONE, pid);  		} + +		put_pid(pid);  	} else {  		nr = PTR_ERR(p);  	} diff --git a/kernel/futex.c b/kernel/futex.c index b991ec05b8f9..b632b5f3f094 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -743,6 +743,55 @@ void exit_pi_state_list(struct task_struct *curr)  	raw_spin_unlock_irq(&curr->pi_lock);  } +/* + * We need to check the following states: + * + *      Waiter | pi_state | pi->owner | uTID      | uODIED | ? 
+ * + * [1]  NULL   | ---      | ---       | 0         | 0/1    | Valid + * [2]  NULL   | ---      | ---       | >0        | 0/1    | Valid + * + * [3]  Found  | NULL     | --        | Any       | 0/1    | Invalid + * + * [4]  Found  | Found    | NULL      | 0         | 1      | Valid + * [5]  Found  | Found    | NULL      | >0        | 1      | Invalid + * + * [6]  Found  | Found    | task      | 0         | 1      | Valid + * + * [7]  Found  | Found    | NULL      | Any       | 0      | Invalid + * + * [8]  Found  | Found    | task      | ==taskTID | 0/1    | Valid + * [9]  Found  | Found    | task      | 0         | 0      | Invalid + * [10] Found  | Found    | task      | !=taskTID | 0/1    | Invalid + * + * [1]	Indicates that the kernel can acquire the futex atomically. We + *	came came here due to a stale FUTEX_WAITERS/FUTEX_OWNER_DIED bit. + * + * [2]	Valid, if TID does not belong to a kernel thread. If no matching + *      thread is found then it indicates that the owner TID has died. + * + * [3]	Invalid. The waiter is queued on a non PI futex + * + * [4]	Valid state after exit_robust_list(), which sets the user space + *	value to FUTEX_WAITERS | FUTEX_OWNER_DIED. + * + * [5]	The user space value got manipulated between exit_robust_list() + *	and exit_pi_state_list() + * + * [6]	Valid state after exit_pi_state_list() which sets the new owner in + *	the pi_state but cannot access the user space value. + * + * [7]	pi_state->owner can only be NULL when the OWNER_DIED bit is set. + * + * [8]	Owner and user space value match + * + * [9]	There is no transient state which sets the user space TID to 0 + *	except exit_robust_list(), but this is indicated by the + *	FUTEX_OWNER_DIED bit. See [4] + * + * [10] There is no transient state which leaves owner and user space + *	TID out of sync. + */  static int  lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,  		union futex_key *key, struct futex_pi_state **ps) @@ -755,12 +804,13 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,  	plist_for_each_entry_safe(this, next, &hb->chain, list) {  		if (match_futex(&this->key, key)) {  			/* -			 * Another waiter already exists - bump up -			 * the refcount and return its pi_state: +			 * Sanity check the waiter before increasing +			 * the refcount and attaching to it.  			 */  			pi_state = this->pi_state;  			/* -			 * Userspace might have messed up non-PI and PI futexes +			 * Userspace might have messed up non-PI and +			 * PI futexes [3]  			 */  			if (unlikely(!pi_state))  				return -EINVAL; @@ -768,34 +818,70 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,  			WARN_ON(!atomic_read(&pi_state->refcount));  			/* -			 * When pi_state->owner is NULL then the owner died -			 * and another waiter is on the fly. pi_state->owner -			 * is fixed up by the task which acquires -			 * pi_state->rt_mutex. -			 * -			 * We do not check for pid == 0 which can happen when -			 * the owner died and robust_list_exit() cleared the -			 * TID. +			 * Handle the owner died case:  			 */ -			if (pid && pi_state->owner) { +			if (uval & FUTEX_OWNER_DIED) { +				/* +				 * exit_pi_state_list sets owner to NULL and +				 * wakes the topmost waiter. The task which +				 * acquires the pi_state->rt_mutex will fixup +				 * owner. +				 */ +				if (!pi_state->owner) { +					/* +					 * No pi state owner, but the user +					 * space TID is not 0. Inconsistent +					 * state. [5] +					 */ +					if (pid) +						return -EINVAL; +					/* +					 * Take a ref on the state and +					 * return. 
[4] +					 */ +					goto out_state; +				} +  				/* -				 * Bail out if user space manipulated the -				 * futex value. +				 * If TID is 0, then either the dying owner +				 * has not yet executed exit_pi_state_list() +				 * or some waiter acquired the rtmutex in the +				 * pi state, but did not yet fixup the TID in +				 * user space. +				 * +				 * Take a ref on the state and return. [6]  				 */ -				if (pid != task_pid_vnr(pi_state->owner)) +				if (!pid) +					goto out_state; +			} else { +				/* +				 * If the owner died bit is not set, +				 * then the pi_state must have an +				 * owner. [7] +				 */ +				if (!pi_state->owner)  					return -EINVAL;  			} +			/* +			 * Bail out if user space manipulated the +			 * futex value. If pi state exists then the +			 * owner TID must be the same as the user +			 * space TID. [9/10] +			 */ +			if (pid != task_pid_vnr(pi_state->owner)) +				return -EINVAL; + +		out_state:  			atomic_inc(&pi_state->refcount);  			*ps = pi_state; -  			return 0;  		}  	}  	/*  	 * We are the first waiter - try to look up the real owner and attach -	 * the new pi_state to it, but bail out when TID = 0 +	 * the new pi_state to it, but bail out when TID = 0 [1]  	 */  	if (!pid)  		return -ESRCH; @@ -803,6 +889,11 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,  	if (!p)  		return -ESRCH; +	if (!p->mm) { +		put_task_struct(p); +		return -EPERM; +	} +  	/*  	 * We need to look at the task state flags to figure out,  	 * whether the task is exiting. To protect against the do_exit @@ -823,6 +914,9 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,  		return ret;  	} +	/* +	 * No existing pi state. First waiter. [2] +	 */  	pi_state = alloc_pi_state();  	/* @@ -894,10 +988,18 @@ retry:  		return -EDEADLK;  	/* -	 * Surprise - we got the lock. Just return to userspace: +	 * Surprise - we got the lock, but we do not trust user space at all.  	 */ -	if (unlikely(!curval)) -		return 1; +	if (unlikely(!curval)) { +		/* +		 * We verify whether there is kernel state for this +		 * futex. If not, we can safely assume, that the 0 -> +		 * TID transition is correct. If state exists, we do +		 * not bother to fixup the user space state as it was +		 * corrupted already. +		 */ +		return futex_top_waiter(hb, key) ? -EINVAL : 1; +	}  	uval = curval; @@ -1028,6 +1130,7 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)  	struct task_struct *new_owner;  	struct futex_pi_state *pi_state = this->pi_state;  	u32 uninitialized_var(curval), newval; +	int ret = 0;  	if (!pi_state)  		return -EINVAL; @@ -1051,23 +1154,19 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)  		new_owner = this->task;  	/* -	 * We pass it to the next owner. (The WAITERS bit is always -	 * kept enabled while there is PI state around. We must also -	 * preserve the owner died bit.) +	 * We pass it to the next owner. The WAITERS bit is always +	 * kept enabled while there is PI state around. We cleanup the +	 * owner died bit, because we are the owner.  	 
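The attach rules [4]..[10] from the state table above reduce to a small decision over the owner-died bit, the TID part of the user space value, and the pi_state owner. A pure-logic rendering of that table (owner_tid is meaningful only when has_owner is true):

#include <stdbool.h>
#include <stdio.h>

/* 1 = valid to attach to the existing pi_state, 0 = corrupted state. */
static int pi_state_attach_ok(bool owner_died, unsigned int uval_tid,
			      bool has_owner, unsigned int owner_tid)
{
	if (owner_died) {
		if (!has_owner)
			return uval_tid == 0;	/* [4] valid, [5] inconsistent */
		if (uval_tid == 0)
			return 1;		/* [6] fixup not done yet */
	} else {
		if (!has_owner)
			return 0;		/* [7] owner must exist */
	}
	return uval_tid == owner_tid;		/* [8] valid, [9]/[10] corrupted */
}

int main(void)
{
	printf("[4]  %d\n", pi_state_attach_ok(true, 0, false, 0));	/* 1 */
	printf("[5]  %d\n", pi_state_attach_ok(true, 42, false, 0));	/* 0 */
	printf("[8]  %d\n", pi_state_attach_ok(false, 7, true, 7));	/* 1 */
	printf("[10] %d\n", pi_state_attach_ok(false, 7, true, 9));	/* 0 */
	return 0;
}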
*/ -	if (!(uval & FUTEX_OWNER_DIED)) { -		int ret = 0; +	newval = FUTEX_WAITERS | task_pid_vnr(new_owner); -		newval = FUTEX_WAITERS | task_pid_vnr(new_owner); - -		if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)) -			ret = -EFAULT; -		else if (curval != uval) -			ret = -EINVAL; -		if (ret) { -			raw_spin_unlock(&pi_state->pi_mutex.wait_lock); -			return ret; -		} +	if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)) +		ret = -EFAULT; +	else if (curval != uval) +		ret = -EINVAL; +	if (ret) { +		raw_spin_unlock(&pi_state->pi_mutex.wait_lock); +		return ret;  	}  	raw_spin_lock_irq(&pi_state->owner->pi_lock); @@ -1347,7 +1446,7 @@ void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key,   *   * Return:   *  0 - failed to acquire the lock atomically; - *  1 - acquired the lock; + * >0 - acquired the lock, return value is vpid of the top_waiter   * <0 - error   */  static int futex_proxy_trylock_atomic(u32 __user *pifutex, @@ -1358,7 +1457,7 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex,  {  	struct futex_q *top_waiter = NULL;  	u32 curval; -	int ret; +	int ret, vpid;  	if (get_futex_value_locked(&curval, pifutex))  		return -EFAULT; @@ -1386,11 +1485,13 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex,  	 * the contended case or if set_waiters is 1.  The pi_state is returned  	 * in ps in contended cases.  	 */ +	vpid = task_pid_vnr(top_waiter->task);  	ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task,  				   set_waiters); -	if (ret == 1) +	if (ret == 1) {  		requeue_pi_wake_futex(top_waiter, key2, hb2); - +		return vpid; +	}  	return ret;  } @@ -1421,10 +1522,16 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags,  	struct futex_pi_state *pi_state = NULL;  	struct futex_hash_bucket *hb1, *hb2;  	struct futex_q *this, *next; -	u32 curval2;  	if (requeue_pi) {  		/* +		 * Requeue PI only works on two distinct uaddrs. This +		 * check is only valid for private futexes. See below. +		 */ +		if (uaddr1 == uaddr2) +			return -EINVAL; + +		/*  		 * requeue_pi requires a pi_state, try to allocate it now  		 * without any locks in case it fails.  		 */ @@ -1462,6 +1569,15 @@ retry:  	if (unlikely(ret != 0))  		goto out_put_key1; +	/* +	 * The check above which compares uaddrs is not sufficient for +	 * shared futexes. We need to compare the keys: +	 */ +	if (requeue_pi && match_futex(&key1, &key2)) { +		ret = -EINVAL; +		goto out_put_keys; +	} +  	hb1 = hash_futex(&key1);  	hb2 = hash_futex(&key2); @@ -1509,16 +1625,25 @@ retry_private:  		 * At this point the top_waiter has either taken uaddr2 or is  		 * waiting on it.  If the former, then the pi_state will not  		 * exist yet, look it up one more time to ensure we have a -		 * reference to it. +		 * reference to it. If the lock was taken, ret contains the +		 * vpid of the top waiter task.  		 */ -		if (ret == 1) { +		if (ret > 0) {  			WARN_ON(pi_state);  			drop_count++;  			task_count++; -			ret = get_futex_value_locked(&curval2, uaddr2); -			if (!ret) -				ret = lookup_pi_state(curval2, hb2, &key2, -						      &pi_state); +			/* +			 * If we acquired the lock, then the user +			 * space value of uaddr2 should be vpid. It +			 * cannot be changed by the top waiter as it +			 * is blocked on hb2 lock if it tries to do +			 * so. If something fiddled with it behind our +			 * back the pi state lookup might unearth +			 * it. So we rather use the known value than +			 * rereading and handing potential crap to +			 * lookup_pi_state. 
+			 */ +			ret = lookup_pi_state(ret, hb2, &key2, &pi_state);  		}  		switch (ret) { @@ -2301,9 +2426,10 @@ retry:  	/*  	 * To avoid races, try to do the TID -> 0 atomic transition  	 * again. If it succeeds then we can return without waking -	 * anyone else up: +	 * anyone else up. We only try this if neither the waiters nor +	 * the owner died bit are set.  	 */ -	if (!(uval & FUTEX_OWNER_DIED) && +	if (!(uval & ~FUTEX_TID_MASK) &&  	    cmpxchg_futex_value_locked(&uval, uaddr, vpid, 0))  		goto pi_faulted;  	/* @@ -2333,11 +2459,9 @@ retry:  	/*  	 * No waiters - kernel unlocks the futex:  	 */ -	if (!(uval & FUTEX_OWNER_DIED)) { -		ret = unlock_futex_pi(uaddr, uval); -		if (ret == -EFAULT) -			goto pi_faulted; -	} +	ret = unlock_futex_pi(uaddr, uval); +	if (ret == -EFAULT) +		goto pi_faulted;  out_unlock:  	spin_unlock(&hb->lock); @@ -2499,6 +2623,15 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,  	if (ret)  		goto out_key2; +	/* +	 * The check above which compares uaddrs is not sufficient for +	 * shared futexes. We need to compare the keys: +	 */ +	if (match_futex(&q.key, &key2)) { +		ret = -EINVAL; +		goto out_put_keys; +	} +  	/* Queue the futex_q, drop the hb lock, wait for wakeup. */  	futex_wait_queue_me(hb, &q, to); diff --git a/kernel/gcov/base.c b/kernel/gcov/base.c index f45b75b713c0..b358a802fd18 100644 --- a/kernel/gcov/base.c +++ b/kernel/gcov/base.c @@ -85,6 +85,12 @@ void __gcov_merge_ior(gcov_type *counters, unsigned int n_counters)  }  EXPORT_SYMBOL(__gcov_merge_ior); +void __gcov_merge_time_profile(gcov_type *counters, unsigned int n_counters) +{ +	/* Unused. */ +} +EXPORT_SYMBOL(__gcov_merge_time_profile); +  /**   * gcov_enable_events - enable event reporting through gcov_event()   * diff --git a/kernel/gcov/gcc_4_7.c b/kernel/gcov/gcc_4_7.c index 2c6e4631c814..826ba9fb5e32 100644 --- a/kernel/gcov/gcc_4_7.c +++ b/kernel/gcov/gcc_4_7.c @@ -18,7 +18,12 @@  #include <linux/vmalloc.h>  #include "gcov.h" +#if __GNUC__ == 4 && __GNUC_MINOR__ >= 9 +#define GCOV_COUNTERS			9 +#else  #define GCOV_COUNTERS			8 +#endif +  #define GCOV_TAG_FUNCTION_LENGTH	3  static struct gcov_info *gcov_info_head; diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index d55092ceee29..3ab28993f6e0 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -234,6 +234,11 @@ again:  			goto again;  		}  		timer->base = new_base; +	} else { +		if (cpu != this_cpu && hrtimer_check_target(timer, new_base)) { +			cpu = this_cpu; +			goto again; +		}  	}  	return new_base;  } @@ -569,6 +574,23 @@ hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal)  	cpu_base->expires_next.tv64 = expires_next.tv64; +	/* +	 * If a hang was detected in the last timer interrupt then we +	 * leave the hang delay active in the hardware. We want the +	 * system to make progress. That also prevents the following +	 * scenario: +	 * T1 expires 50ms from now +	 * T2 expires 5s from now +	 * +	 * T1 is removed, so this code is called and would reprogram +	 * the hardware to 5s from now. Any hrtimer_start after that +	 * will not reprogram the hardware due to hang_detected being +	 * set. So we'd effectivly block all timers until the T2 event +	 * fires. 
+	 */ +	if (cpu_base->hang_detected) +		return; +  	if (cpu_base->expires_next.tv64 != KTIME_MAX)  		tick_program_event(cpu_base->expires_next, 1);  } @@ -968,11 +990,8 @@ int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,  	/* Remove an active timer from the queue: */  	ret = remove_hrtimer(timer, base); -	/* Switch the timer base, if necessary: */ -	new_base = switch_hrtimer_base(timer, base, mode & HRTIMER_MODE_PINNED); -  	if (mode & HRTIMER_MODE_REL) { -		tim = ktime_add_safe(tim, new_base->get_time()); +		tim = ktime_add_safe(tim, base->get_time());  		/*  		 * CONFIG_TIME_LOW_RES is a temporary way for architectures  		 * to signal that they simply return xtime in @@ -987,6 +1006,9 @@ int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,  	hrtimer_set_expires_range_ns(timer, tim, delta_ns); +	/* Switch the timer base, if necessary: */ +	new_base = switch_hrtimer_base(timer, base, mode & HRTIMER_MODE_PINNED); +  	timer_stats_hrtimer_set_start_info(timer);  	leftmost = enqueue_hrtimer(timer, new_base); @@ -1017,6 +1039,7 @@ int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,  	return ret;  } +EXPORT_SYMBOL_GPL(__hrtimer_start_range_ns);  /**   * hrtimer_start_range_ns - (re)start an hrtimer on the current CPU diff --git a/kernel/hung_task.c b/kernel/hung_task.c index 06bb1417b063..06db12434d72 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -52,8 +52,10 @@ unsigned int __read_mostly sysctl_hung_task_panic =  static int __init hung_task_panic_setup(char *str)  { -	sysctl_hung_task_panic = simple_strtoul(str, NULL, 0); +	int rc = kstrtouint(str, 0, &sysctl_hung_task_panic); +	if (rc) +		return rc;  	return 1;  }  __setup("hung_task_panic=", hung_task_panic_setup); diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index 07cbdfea9ae2..d269cecdfbf0 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -5,6 +5,10 @@ menu "IRQ subsystem"  config MAY_HAVE_SPARSE_IRQ         bool +# Legacy support, required for itanic +config GENERIC_IRQ_LEGACY +       bool +  # Enable the generic irq autoprobe mechanism  config GENERIC_IRQ_PROBE  	bool @@ -17,6 +21,11 @@ config GENERIC_IRQ_SHOW  config GENERIC_IRQ_SHOW_LEVEL         bool +# Facility to allocate a hardware interrupt. This is legacy support +# and should not be used in new code. Use irq domains instead. +config GENERIC_IRQ_LEGACY_ALLOC_HWIRQ +       bool +  # Support for delayed migration from interrupt context  config GENERIC_PENDING_IRQ  	bool diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 6397df2d6945..a2b28a2fd7b1 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -40,10 +40,9 @@ int irq_set_chip(unsigned int irq, struct irq_chip *chip)  	irq_put_desc_unlock(desc, flags);  	/*  	 * For !CONFIG_SPARSE_IRQ make the irq show up in -	 * allocated_irqs. For the CONFIG_SPARSE_IRQ case, it is -	 * already marked, and this call is harmless. +	 * allocated_irqs.  	 
*/ -	irq_reserve_irq(irq); +	irq_mark_irq(irq);  	return 0;  }  EXPORT_SYMBOL(irq_set_chip); diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index ddf1ffeb79f1..099ea2e0eb88 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -33,7 +33,7 @@ enum {  };  /* - * Bit masks for desc->state + * Bit masks for desc->core_internal_state__do_not_mess_with_it   *   * IRQS_AUTODETECT		- autodetection in progress   * IRQS_SPURIOUS_DISABLED	- was disabled due to spurious interrupt @@ -76,6 +76,12 @@ extern void mask_irq(struct irq_desc *desc);  extern void unmask_irq(struct irq_desc *desc);  extern void unmask_threaded_irq(struct irq_desc *desc); +#ifdef CONFIG_SPARSE_IRQ +static inline void irq_mark_irq(unsigned int irq) { } +#else +extern void irq_mark_irq(unsigned int irq); +#endif +  extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr);  irqreturn_t handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action); diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index a7174617616b..7339e42a85ab 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -278,7 +278,12 @@ EXPORT_SYMBOL(irq_to_desc);  static void free_desc(unsigned int irq)  { -	dynamic_irq_cleanup(irq); +	struct irq_desc *desc = irq_to_desc(irq); +	unsigned long flags; + +	raw_spin_lock_irqsave(&desc->lock, flags); +	desc_set_defaults(irq, desc, desc_node(desc), NULL); +	raw_spin_unlock_irqrestore(&desc->lock, flags);  }  static inline int alloc_descs(unsigned int start, unsigned int cnt, int node, @@ -299,6 +304,20 @@ static int irq_expand_nr_irqs(unsigned int nr)  	return -ENOMEM;  } +void irq_mark_irq(unsigned int irq) +{ +	mutex_lock(&sparse_irq_lock); +	bitmap_set(allocated_irqs, irq, 1); +	mutex_unlock(&sparse_irq_lock); +} + +#ifdef CONFIG_GENERIC_IRQ_LEGACY +void irq_init_desc(unsigned int irq) +{ +	free_desc(irq); +} +#endif +  #endif /* !CONFIG_SPARSE_IRQ */  /** @@ -363,6 +382,13 @@ __irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node,  		if (from > irq)  			return -EINVAL;  		from = irq; +	} else { +		/* +		 * For interrupts which are freely allocated the +		 * architecture can force a lower bound to the @from +		 * argument. x86 uses this to exclude the GSI space. +		 */ +		from = arch_dynirq_lower_bound(from);  	}  	mutex_lock(&sparse_irq_lock); @@ -389,30 +415,56 @@ err:  }  EXPORT_SYMBOL_GPL(__irq_alloc_descs); +#ifdef CONFIG_GENERIC_IRQ_LEGACY_ALLOC_HWIRQ  /** - * irq_reserve_irqs - mark irqs allocated - * @from:	mark from irq number - * @cnt:	number of irqs to mark + * irq_alloc_hwirqs - Allocate an irq descriptor and initialize the hardware + * @cnt:	number of interrupts to allocate + * @node:	node on which to allocate   * - * Returns 0 on success or an appropriate error code + * Returns an interrupt number > 0 or 0, if the allocation fails.   
*/ -int irq_reserve_irqs(unsigned int from, unsigned int cnt) +unsigned int irq_alloc_hwirqs(int cnt, int node)  { -	unsigned int start; -	int ret = 0; +	int i, irq = __irq_alloc_descs(-1, 0, cnt, node, NULL); -	if (!cnt || (from + cnt) > nr_irqs) -		return -EINVAL; +	if (irq < 0) +		return 0; -	mutex_lock(&sparse_irq_lock); -	start = bitmap_find_next_zero_area(allocated_irqs, nr_irqs, from, cnt, 0); -	if (start == from) -		bitmap_set(allocated_irqs, start, cnt); -	else -		ret = -EEXIST; -	mutex_unlock(&sparse_irq_lock); -	return ret; +	for (i = irq; cnt > 0; i++, cnt--) { +		if (arch_setup_hwirq(i, node)) +			goto err; +		irq_clear_status_flags(i, _IRQ_NOREQUEST); +	} +	return irq; + +err: +	for (i--; i >= irq; i--) { +		irq_set_status_flags(i, _IRQ_NOREQUEST | _IRQ_NOPROBE); +		arch_teardown_hwirq(i); +	} +	irq_free_descs(irq, cnt); +	return 0; +} +EXPORT_SYMBOL_GPL(irq_alloc_hwirqs); + +/** + * irq_free_hwirqs - Free irq descriptor and cleanup the hardware + * @from:	Free from irq number + * @cnt:	number of interrupts to free + * + */ +void irq_free_hwirqs(unsigned int from, int cnt) +{ +	int i; + +	for (i = from; cnt > 0; i++, cnt--) { +		irq_set_status_flags(i, _IRQ_NOREQUEST | _IRQ_NOPROBE); +		arch_teardown_hwirq(i); +	} +	irq_free_descs(from, cnt);  } +EXPORT_SYMBOL_GPL(irq_free_hwirqs); +#endif  /**   * irq_get_next_irq - get next allocated irq number @@ -475,20 +527,6 @@ int irq_set_percpu_devid(unsigned int irq)  	return 0;  } -/** - * dynamic_irq_cleanup - cleanup a dynamically allocated irq - * @irq:	irq number to initialize - */ -void dynamic_irq_cleanup(unsigned int irq) -{ -	struct irq_desc *desc = irq_to_desc(irq); -	unsigned long flags; - -	raw_spin_lock_irqsave(&desc->lock, flags); -	desc_set_defaults(irq, desc, desc_node(desc), NULL); -	raw_spin_unlock_irqrestore(&desc->lock, flags); -} -  void kstat_incr_irq_this_cpu(unsigned int irq)  {  	kstat_incr_irqs_this_cpu(irq, irq_to_desc(irq)); diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index f14033700c25..eb5e10e32e05 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -27,14 +27,14 @@ static struct irq_domain *irq_default_domain;   * __irq_domain_add() - Allocate a new irq_domain data structure   * @of_node: optional device-tree node of the interrupt controller   * @size: Size of linear map; 0 for radix mapping only + * @hwirq_max: Maximum number of interrupts supported by controller   * @direct_max: Maximum value of direct maps; Use ~0 for no limit; 0 for no   *              direct mapping   * @ops: map/unmap domain callbacks   * @host_data: Controller private data pointer   * - * Allocates and initialize and irq_domain structure.  Caller is expected to - * register allocated irq_domain with irq_domain_register().  Returns pointer - * to IRQ domain, or NULL on failure. + * Allocates and initialize and irq_domain structure. + * Returns pointer to IRQ domain, or NULL on failure.   
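The irq_alloc_hwirqs() hunk above follows a set-up-then-roll-back pattern: descriptors are allocated first, each interrupt is handed to arch_setup_hwirq() in turn, and a failure part way through tears down only what already succeeded before returning 0. A userspace sketch of that pattern, with setup_one()/teardown_one() as invented stand-ins for the arch hooks:

#include <stdio.h>

static int setup_one(int i)
{
        return (i == 13) ? -1 : 0;      /* pretend the fourth item fails */
}

static void teardown_one(int i)
{
        printf("tearing down %d\n", i);
}

/* returns the base number on success, 0 on failure, like the kernel helper */
static int setup_range(int base, int cnt)
{
        int i;

        for (i = base; cnt > 0; i++, cnt--) {
                if (setup_one(i))
                        goto err;
        }
        return base;

err:
        for (i--; i >= base; i--)       /* undo only what already succeeded */
                teardown_one(i);
        return 0;
}

int main(void)
{
        printf("result: %d\n", setup_range(10, 5));
        return 0;
}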
*/  struct irq_domain *__irq_domain_add(struct device_node *of_node, int size,  				    irq_hw_number_t hwirq_max, int direct_max, diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 2486a4c1a710..3dc6a61bf06a 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -180,7 +180,7 @@ int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask,  	struct irq_chip *chip = irq_data_get_irq_chip(data);  	int ret; -	ret = chip->irq_set_affinity(data, mask, false); +	ret = chip->irq_set_affinity(data, mask, force);  	switch (ret) {  	case IRQ_SET_MASK_OK:  		cpumask_copy(data->affinity, mask); @@ -192,7 +192,8 @@ int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask,  	return ret;  } -int __irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask) +int irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask, +			    bool force)  {  	struct irq_chip *chip = irq_data_get_irq_chip(data);  	struct irq_desc *desc = irq_data_to_desc(data); @@ -202,7 +203,7 @@ int __irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask)  		return -EINVAL;  	if (irq_can_move_pcntxt(data)) { -		ret = irq_do_set_affinity(data, mask, false); +		ret = irq_do_set_affinity(data, mask, force);  	} else {  		irqd_set_move_pending(data);  		irq_copy_pending(desc, mask); @@ -217,13 +218,7 @@ int __irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask)  	return ret;  } -/** - *	irq_set_affinity - Set the irq affinity of a given irq - *	@irq:		Interrupt to set affinity - *	@mask:		cpumask - * - */ -int irq_set_affinity(unsigned int irq, const struct cpumask *mask) +int __irq_set_affinity(unsigned int irq, const struct cpumask *mask, bool force)  {  	struct irq_desc *desc = irq_to_desc(irq);  	unsigned long flags; @@ -233,7 +228,7 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *mask)  		return -EINVAL;  	raw_spin_lock_irqsave(&desc->lock, flags); -	ret =  __irq_set_affinity_locked(irq_desc_get_irq_data(desc), mask); +	ret = irq_set_affinity_locked(irq_desc_get_irq_data(desc), mask, force);  	raw_spin_unlock_irqrestore(&desc->lock, flags);  	return ret;  } @@ -891,8 +886,8 @@ static int irq_thread(void *data)  		irq_thread_check_affinity(desc, action);  		action_ret = handler_fn(desc, action); -		if (!noirqdebug) -			note_interrupt(action->irq, desc, action_ret); +		if (action_ret == IRQ_HANDLED) +			atomic_inc(&desc->threads_handled);  		wake_threads_waitq(desc);  	} diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index a1d8cc63b56e..e2514b0e439e 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -270,6 +270,8 @@ try_misrouted_irq(unsigned int irq, struct irq_desc *desc,  	return action && (action->flags & IRQF_IRQPOLL);  } +#define SPURIOUS_DEFERRED	0x80000000 +  void note_interrupt(unsigned int irq, struct irq_desc *desc,  		    irqreturn_t action_ret)  { @@ -277,15 +279,111 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc,  	    irq_settings_is_polled(desc))  		return; -	/* we get here again via the threaded handler */ -	if (action_ret == IRQ_WAKE_THREAD) -		return; -  	if (bad_action_ret(action_ret)) {  		report_bad_irq(irq, desc, action_ret);  		return;  	} +	/* +	 * We cannot call note_interrupt from the threaded handler +	 * because we need to look at the compound of all handlers +	 * (primary and threaded). 
Aside of that in the threaded +	 * shared case we have no serialization against an incoming +	 * hardware interrupt while we are dealing with a threaded +	 * result. +	 * +	 * So in case a thread is woken, we just note the fact and +	 * defer the analysis to the next hardware interrupt. +	 * +	 * The threaded handlers store whether they sucessfully +	 * handled an interrupt and we check whether that number +	 * changed versus the last invocation. +	 * +	 * We could handle all interrupts with the delayed by one +	 * mechanism, but for the non forced threaded case we'd just +	 * add pointless overhead to the straight hardirq interrupts +	 * for the sake of a few lines less code. +	 */ +	if (action_ret & IRQ_WAKE_THREAD) { +		/* +		 * There is a thread woken. Check whether one of the +		 * shared primary handlers returned IRQ_HANDLED. If +		 * not we defer the spurious detection to the next +		 * interrupt. +		 */ +		if (action_ret == IRQ_WAKE_THREAD) { +			int handled; +			/* +			 * We use bit 31 of thread_handled_last to +			 * denote the deferred spurious detection +			 * active. No locking necessary as +			 * thread_handled_last is only accessed here +			 * and we have the guarantee that hard +			 * interrupts are not reentrant. +			 */ +			if (!(desc->threads_handled_last & SPURIOUS_DEFERRED)) { +				desc->threads_handled_last |= SPURIOUS_DEFERRED; +				return; +			} +			/* +			 * Check whether one of the threaded handlers +			 * returned IRQ_HANDLED since the last +			 * interrupt happened. +			 * +			 * For simplicity we just set bit 31, as it is +			 * set in threads_handled_last as well. So we +			 * avoid extra masking. And we really do not +			 * care about the high bits of the handled +			 * count. We just care about the count being +			 * different than the one we saw before. +			 */ +			handled = atomic_read(&desc->threads_handled); +			handled |= SPURIOUS_DEFERRED; +			if (handled != desc->threads_handled_last) { +				action_ret = IRQ_HANDLED; +				/* +				 * Note: We keep the SPURIOUS_DEFERRED +				 * bit set. We are handling the +				 * previous invocation right now. +				 * Keep it for the current one, so the +				 * next hardware interrupt will +				 * account for it. +				 */ +				desc->threads_handled_last = handled; +			} else { +				/* +				 * None of the threaded handlers felt +				 * responsible for the last interrupt +				 * +				 * We keep the SPURIOUS_DEFERRED bit +				 * set in threads_handled_last as we +				 * need to account for the current +				 * interrupt as well. +				 */ +				action_ret = IRQ_NONE; +			} +		} else { +			/* +			 * One of the primary handlers returned +			 * IRQ_HANDLED. So we don't care about the +			 * threaded handlers on the same line. Clear +			 * the deferred detection bit. +			 * +			 * In theory we could/should check whether the +			 * deferred bit is set and take the result of +			 * the previous run into account here as +			 * well. But it's really not worth the +			 * trouble. If every other interrupt is +			 * handled we never trigger the spurious +			 * detector. And if this is just the one out +			 * of 100k unhandled ones which is handled +			 * then we merily delay the spurious detection +			 * by one hard interrupt. Not a real problem. 
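The note_interrupt() rework above defers the spurious check for purely threaded interrupts: the first wake-up only arms a marker bit, and each later hard interrupt compares a completion counter maintained by the threaded handlers against the last value it saw. The standalone model below mirrors that bookkeeping with plain integers; classify() returns 1/0/-1 for handled, unhandled and decision deferred, and none of the names are the kernel's.

#include <stdio.h>

#define SPURIOUS_DEFERRED 0x80000000u

static unsigned int threads_handled;    /* bumped by "threaded handlers" */
static unsigned int last_seen;          /* bit 31 marks "deferral active" */

static int classify(int primary_handled, int thread_woken)
{
        unsigned int handled;

        if (!thread_woken)
                return primary_handled;         /* plain hardirq case */

        if (primary_handled) {
                /* a primary handler took it; stop deferring */
                last_seen &= ~SPURIOUS_DEFERRED;
                return 1;
        }

        if (!(last_seen & SPURIOUS_DEFERRED)) {
                last_seen |= SPURIOUS_DEFERRED; /* arm the deferral */
                return -1;                      /* decide on the next interrupt */
        }

        handled = threads_handled | SPURIOUS_DEFERRED;
        if (handled != last_seen) {
                last_seen = handled;            /* some thread handled something */
                return 1;
        }
        return 0;                               /* nobody felt responsible */
}

int main(void)
{
        printf("%d\n", classify(0, 1));         /* -1: first wake, deferred */
        threads_handled++;                      /* a thread reports success */
        printf("%d\n", classify(0, 1));         /*  1: counter moved */
        printf("%d\n", classify(0, 1));         /*  0: counter unchanged */
        return 0;
}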
+			 */ +			desc->threads_handled_last &= ~SPURIOUS_DEFERRED; +		} +	} +  	if (unlikely(action_ret == IRQ_NONE)) {  		/*  		 * If we are seeing only the odd spurious IRQ caused by diff --git a/kernel/kexec.c b/kernel/kexec.c index c8380ad203bc..6748688813d0 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -125,8 +125,8 @@ static struct page *kimage_alloc_page(struct kimage *image,  				       unsigned long dest);  static int do_kimage_alloc(struct kimage **rimage, unsigned long entry, -	                    unsigned long nr_segments, -                            struct kexec_segment __user *segments) +			   unsigned long nr_segments, +			   struct kexec_segment __user *segments)  {  	size_t segment_bytes;  	struct kimage *image; @@ -257,13 +257,13 @@ static int kimage_normal_alloc(struct kimage **rimage, unsigned long entry,  	image->control_code_page = kimage_alloc_control_pages(image,  					   get_order(KEXEC_CONTROL_PAGE_SIZE));  	if (!image->control_code_page) { -		printk(KERN_ERR "Could not allocate control_code_buffer\n"); +		pr_err("Could not allocate control_code_buffer\n");  		goto out_free;  	}  	image->swap_page = kimage_alloc_control_pages(image, 0);  	if (!image->swap_page) { -		printk(KERN_ERR "Could not allocate swap buffer\n"); +		pr_err("Could not allocate swap buffer\n");  		goto out_free;  	} @@ -332,7 +332,7 @@ static int kimage_crash_alloc(struct kimage **rimage, unsigned long entry,  	image->control_code_page = kimage_alloc_control_pages(image,  					   get_order(KEXEC_CONTROL_PAGE_SIZE));  	if (!image->control_code_page) { -		printk(KERN_ERR "Could not allocate control_code_buffer\n"); +		pr_err("Could not allocate control_code_buffer\n");  		goto out_free;  	} @@ -621,8 +621,8 @@ static void kimage_terminate(struct kimage *image)  #define for_each_kimage_entry(image, ptr, entry) \  	for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE); \ -		ptr = (entry & IND_INDIRECTION)? \ -			phys_to_virt((entry & PAGE_MASK)): ptr +1) +		ptr = (entry & IND_INDIRECTION) ? \ +			phys_to_virt((entry & PAGE_MASK)) : ptr + 1)  static void kimage_free_entry(kimage_entry_t entry)  { @@ -650,8 +650,7 @@ static void kimage_free(struct kimage *image)  			 * done with it.  			 */  			ind = entry; -		} -		else if (entry & IND_SOURCE) +		} else if (entry & IND_SOURCE)  			kimage_free_entry(entry);  	}  	/* Free the final indirection page */ @@ -774,8 +773,7 @@ static struct page *kimage_alloc_page(struct kimage *image,  			addr = old_addr;  			page = old_page;  			break; -		} -		else { +		} else {  			/* Place the page on the destination list I  			 * will use it later.  			 */ @@ -1059,7 +1057,7 @@ COMPAT_SYSCALL_DEFINE4(kexec_load, compat_ulong_t, entry,  		return -EINVAL;  	ksegments = compat_alloc_user_space(nr_segments * sizeof(out)); -	for (i=0; i < nr_segments; i++) { +	for (i = 0; i < nr_segments; i++) {  		result = copy_from_user(&in, &segments[i], sizeof(in));  		if (result)  			return -EFAULT; @@ -1214,14 +1212,14 @@ void crash_save_cpu(struct pt_regs *regs, int cpu)  	 * squirrelled away.  ELF notes happen to provide  	 * all of that, so there is no need to invent something new.  	 
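crash_save_cpu() above leans on the standard ELF note format: a three-word header (name size, descriptor size, type), the NUL-terminated name padded to four bytes, then the descriptor padded to four bytes, with an all-zero header terminating the list. The sketch below lays out one NT_PRSTATUS-style note in a userspace buffer; the payload and sizes are made up, and it only illustrates the layout, not the kernel's append_elf_note().

#include <stdint.h>
#include <stdio.h>
#include <string.h>

static size_t append_note(uint8_t *buf, const char *name, uint32_t type,
                          const void *desc, uint32_t desc_len)
{
        uint32_t hdr[3] = { (uint32_t)strlen(name) + 1, desc_len, type };
        size_t off = 0;

        memcpy(buf + off, hdr, sizeof(hdr));
        off += sizeof(hdr);
        memcpy(buf + off, name, hdr[0]);
        off += (hdr[0] + 3) & ~3u;              /* name padded to 4 bytes */
        memcpy(buf + off, desc, desc_len);
        off += (desc_len + 3) & ~3u;            /* descriptor padded to 4 bytes */
        return off;
}

int main(void)
{
        uint8_t buf[256] = { 0 };               /* trailing zeros double as the final note */
        uint32_t fake_regs[4] = { 1, 2, 3, 4 }; /* stand-in for the prstatus payload */
        size_t used;

        used = append_note(buf, "CORE", 1 /* NT_PRSTATUS */, fake_regs,
                           sizeof(fake_regs));
        printf("note occupies %zu bytes\n", used);
        return 0;
}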
*/ -	buf = (u32*)per_cpu_ptr(crash_notes, cpu); +	buf = (u32 *)per_cpu_ptr(crash_notes, cpu);  	if (!buf)  		return;  	memset(&prstatus, 0, sizeof(prstatus));  	prstatus.pr_pid = current->pid;  	elf_core_copy_kernel_regs(&prstatus.pr_reg, regs);  	buf = append_elf_note(buf, KEXEC_CORE_NOTE_NAME, NT_PRSTATUS, -		      	      &prstatus, sizeof(prstatus)); +			      &prstatus, sizeof(prstatus));  	final_note(buf);  } @@ -1230,8 +1228,7 @@ static int __init crash_notes_memory_init(void)  	/* Allocate memory for saving cpu registers. */  	crash_notes = alloc_percpu(note_buf_t);  	if (!crash_notes) { -		printk("Kexec: Memory allocation for saving cpu register" -		" states failed\n"); +		pr_warn("Kexec: Memory allocation for saving cpu register states failed\n");  		return -ENOMEM;  	}  	return 0; @@ -1253,10 +1250,10 @@ subsys_initcall(crash_notes_memory_init);   *   * The function returns 0 on success and -EINVAL on failure.   */ -static int __init parse_crashkernel_mem(char 			*cmdline, -					unsigned long long	system_ram, -					unsigned long long	*crash_size, -					unsigned long long	*crash_base) +static int __init parse_crashkernel_mem(char *cmdline, +					unsigned long long system_ram, +					unsigned long long *crash_size, +					unsigned long long *crash_base)  {  	char *cur = cmdline, *tmp; @@ -1267,12 +1264,12 @@ static int __init parse_crashkernel_mem(char 			*cmdline,  		/* get the start of the range */  		start = memparse(cur, &tmp);  		if (cur == tmp) { -			pr_warning("crashkernel: Memory value expected\n"); +			pr_warn("crashkernel: Memory value expected\n");  			return -EINVAL;  		}  		cur = tmp;  		if (*cur != '-') { -			pr_warning("crashkernel: '-' expected\n"); +			pr_warn("crashkernel: '-' expected\n");  			return -EINVAL;  		}  		cur++; @@ -1281,31 +1278,30 @@ static int __init parse_crashkernel_mem(char 			*cmdline,  		if (*cur != ':') {  			end = memparse(cur, &tmp);  			if (cur == tmp) { -				pr_warning("crashkernel: Memory " -						"value expected\n"); +				pr_warn("crashkernel: Memory value expected\n");  				return -EINVAL;  			}  			cur = tmp;  			if (end <= start) { -				pr_warning("crashkernel: end <= start\n"); +				pr_warn("crashkernel: end <= start\n");  				return -EINVAL;  			}  		}  		if (*cur != ':') { -			pr_warning("crashkernel: ':' expected\n"); +			pr_warn("crashkernel: ':' expected\n");  			return -EINVAL;  		}  		cur++;  		size = memparse(cur, &tmp);  		if (cur == tmp) { -			pr_warning("Memory value expected\n"); +			pr_warn("Memory value expected\n");  			return -EINVAL;  		}  		cur = tmp;  		if (size >= system_ram) { -			pr_warning("crashkernel: invalid size\n"); +			pr_warn("crashkernel: invalid size\n");  			return -EINVAL;  		} @@ -1323,8 +1319,7 @@ static int __init parse_crashkernel_mem(char 			*cmdline,  			cur++;  			*crash_base = memparse(cur, &tmp);  			if (cur == tmp) { -				pr_warning("Memory value expected " -						"after '@'\n"); +				pr_warn("Memory value expected after '@'\n");  				return -EINVAL;  			}  		} @@ -1336,26 +1331,26 @@ static int __init parse_crashkernel_mem(char 			*cmdline,  /*   * That function parses "simple" (old) crashkernel command lines like   * - * 	crashkernel=size[@offset] + *	crashkernel=size[@offset]   *   * It returns 0 on success and -EINVAL on failure.   
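The "crashkernel=size[@offset]" form documented above (and handled by parse_crashkernel_simple() in the hunk that follows) is easy to model outside the kernel. In this sketch parse_size() is a strtoull-based stand-in for the kernel's memparse(), only the K/M/G suffixes are honoured, and plain -1 stands in for -EINVAL.

#include <stdio.h>
#include <stdlib.h>

static unsigned long long parse_size(const char *s, char **end)
{
        unsigned long long v = strtoull(s, end, 0);

        switch (**end) {
        case 'G': case 'g': v <<= 10;   /* fall through */
        case 'M': case 'm': v <<= 10;   /* fall through */
        case 'K': case 'k': v <<= 10; (*end)++; break;
        }
        return v;
}

static int parse_crashkernel_simple_sketch(const char *cmdline,
                                           unsigned long long *size,
                                           unsigned long long *base)
{
        char *cur;

        *size = parse_size(cmdline, &cur);
        if (cur == cmdline)
                return -1;              /* no number at all */
        if (*cur == '@')
                *base = parse_size(cur + 1, &cur);
        else if (*cur != ' ' && *cur != '\0')
                return -1;              /* trailing junk */
        return 0;
}

int main(void)
{
        unsigned long long size = 0, base = 0;

        if (!parse_crashkernel_simple_sketch("256M@64M", &size, &base))
                printf("size=%llu base=%llu\n", size, base);
        return 0;
}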
*/ -static int __init parse_crashkernel_simple(char 		*cmdline, -					   unsigned long long 	*crash_size, -					   unsigned long long 	*crash_base) +static int __init parse_crashkernel_simple(char *cmdline, +					   unsigned long long *crash_size, +					   unsigned long long *crash_base)  {  	char *cur = cmdline;  	*crash_size = memparse(cmdline, &cur);  	if (cmdline == cur) { -		pr_warning("crashkernel: memory value expected\n"); +		pr_warn("crashkernel: memory value expected\n");  		return -EINVAL;  	}  	if (*cur == '@')  		*crash_base = memparse(cur+1, &cur);  	else if (*cur != ' ' && *cur != '\0') { -		pr_warning("crashkernel: unrecognized char\n"); +		pr_warn("crashkernel: unrecognized char\n");  		return -EINVAL;  	} @@ -1683,7 +1678,15 @@ int kernel_kexec(void)  		kexec_in_progress = true;  		kernel_restart_prepare(NULL);  		migrate_to_reboot_cpu(); -		printk(KERN_EMERG "Starting new kernel\n"); + +		/* +		 * migrate_to_reboot_cpu() disables CPU hotplug assuming that +		 * no further code needs to use CPU hotplug (which is true in +		 * the reboot case). However, the kexec path depends on using +		 * CPU hotplug again; so re-enable it here. +		 */ +		cpu_hotplug_enable(); +		pr_emerg("Starting new kernel\n");  		machine_shutdown();  	} diff --git a/kernel/kmod.c b/kernel/kmod.c index 0ac67a5861c5..8637e041a247 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -285,10 +285,7 @@ static int wait_for_helper(void *data)  	pid_t pid;  	/* If SIGCLD is ignored sys_wait4 won't populate the status. */ -	spin_lock_irq(¤t->sighand->siglock); -	current->sighand->action[SIGCHLD-1].sa.sa_handler = SIG_DFL; -	spin_unlock_irq(¤t->sighand->siglock); - +	kernel_sigaction(SIGCHLD, SIG_DFL);  	pid = kernel_thread(____call_usermodehelper, sub_info, SIGCHLD);  	if (pid < 0) {  		sub_info->retval = pid; diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index 2495a9b14ac8..6683ccef9fff 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c @@ -37,6 +37,7 @@ static ssize_t uevent_seqnum_show(struct kobject *kobj,  }  KERNEL_ATTR_RO(uevent_seqnum); +#ifdef CONFIG_UEVENT_HELPER  /* uevent helper program, used during early boot */  static ssize_t uevent_helper_show(struct kobject *kobj,  				  struct kobj_attribute *attr, char *buf) @@ -56,7 +57,7 @@ static ssize_t uevent_helper_store(struct kobject *kobj,  	return count;  }  KERNEL_ATTR_RW(uevent_helper); - +#endif  #ifdef CONFIG_PROFILING  static ssize_t profiling_show(struct kobject *kobj, @@ -189,7 +190,9 @@ EXPORT_SYMBOL_GPL(kernel_kobj);  static struct attribute * kernel_attrs[] = {  	&fscaps_attr.attr,  	&uevent_seqnum_attr.attr, +#ifdef CONFIG_UEVENT_HELPER  	&uevent_helper_attr.attr, +#endif  #ifdef CONFIG_PROFILING  	&profiling_attr.attr,  #endif diff --git a/kernel/kthread.c b/kernel/kthread.c index 9a130ec06f7a..c2390f41307b 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -262,7 +262,7 @@ static void create_kthread(struct kthread_create_info *create)   * kthread_stop() has been called).  The return value should be zero   * or a negative error number; it will be passed to kthread_stop().   * - * Returns a task_struct or ERR_PTR(-ENOMEM). + * Returns a task_struct or ERR_PTR(-ENOMEM) or ERR_PTR(-EINTR).   */  struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),  					   void *data, int node, @@ -298,7 +298,7 @@ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),  		 * that thread.  		 
*/  		if (xchg(&create->done, NULL)) -			return ERR_PTR(-ENOMEM); +			return ERR_PTR(-EINTR);  		/*  		 * kthreadd (or new kernel thread) will call complete()  		 * shortly. diff --git a/kernel/latencytop.c b/kernel/latencytop.c index a462b317f9a0..a02812743a7e 100644 --- a/kernel/latencytop.c +++ b/kernel/latencytop.c @@ -88,7 +88,8 @@ static void clear_global_latency_tracing(void)  }  static void __sched -account_global_scheduler_latency(struct task_struct *tsk, struct latency_record *lat) +account_global_scheduler_latency(struct task_struct *tsk, +				 struct latency_record *lat)  {  	int firstnonnull = MAXLR + 1;  	int i; @@ -255,7 +256,7 @@ static int lstats_show(struct seq_file *m, void *v)  					break;  				seq_printf(m, " %ps", (void *)bt);  			} -			seq_printf(m, "\n"); +			seq_puts(m, "\n");  		}  	}  	return 0; diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index b0e9467922e1..d24e4339b46d 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -4188,7 +4188,7 @@ void debug_show_held_locks(struct task_struct *task)  }  EXPORT_SYMBOL_GPL(debug_show_held_locks); -asmlinkage void lockdep_sys_exit(void) +asmlinkage __visible void lockdep_sys_exit(void)  {  	struct task_struct *curr = current; diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index f26b1a18e34e..0955b885d0dc 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ -82,14 +82,14 @@ struct lock_writer_stress_stats {  };  static struct lock_writer_stress_stats *lwsa; -#if defined(MODULE) || defined(CONFIG_LOCK_TORTURE_TEST_RUNNABLE) +#if defined(MODULE)  #define LOCKTORTURE_RUNNABLE_INIT 1  #else  #define LOCKTORTURE_RUNNABLE_INIT 0  #endif  int locktorture_runnable = LOCKTORTURE_RUNNABLE_INIT;  module_param(locktorture_runnable, int, 0444); -MODULE_PARM_DESC(locktorture_runnable, "Start locktorture at boot"); +MODULE_PARM_DESC(locktorture_runnable, "Start locktorture at module init");  /* Forward reference. */  static void lock_torture_cleanup(void); @@ -216,10 +216,11 @@ static int lock_torture_writer(void *arg)  	static DEFINE_TORTURE_RANDOM(rand);  	VERBOSE_TOROUT_STRING("lock_torture_writer task started"); -	set_user_nice(current, 19); +	set_user_nice(current, MAX_NICE);  	do { -		schedule_timeout_uninterruptible(1); +		if ((torture_random(&rand) & 0xfffff) == 0) +			schedule_timeout_uninterruptible(1);  		cur_ops->writelock();  		if (WARN_ON_ONCE(lock_is_write_held))  			lwsp->n_write_lock_fail++; @@ -354,7 +355,8 @@ static int __init lock_torture_init(void)  		&lock_busted_ops, &spin_lock_ops, &spin_lock_irq_ops,  	}; -	torture_init_begin(torture_type, verbose, &locktorture_runnable); +	if (!torture_init_begin(torture_type, verbose, &locktorture_runnable)) +		return -EBUSY;  	/* Process args and tell the world that the torturer is on the job. */  	for (i = 0; i < ARRAY_SIZE(torture_ops); i++) { diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index aa4dff04b594..a620d4d08ca6 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -343,9 +343,16 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,  	 * top_waiter can be NULL, when we are in the deboosting  	 * mode!  	 */ -	if (top_waiter && (!task_has_pi_waiters(task) || -			   top_waiter != task_top_pi_waiter(task))) -		goto out_unlock_pi; +	if (top_waiter) { +		if (!task_has_pi_waiters(task)) +			goto out_unlock_pi; +		/* +		 * If deadlock detection is off, we stop here if we +		 * are not the top pi waiter of the task. 
+		 */ +		if (!detect_deadlock && top_waiter != task_top_pi_waiter(task)) +			goto out_unlock_pi; +	}  	/*  	 * When deadlock detection is off then we check, if further @@ -361,7 +368,12 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,  		goto retry;  	} -	/* Deadlock detection */ +	/* +	 * Deadlock detection. If the lock is the same as the original +	 * lock which caused us to walk the lock chain or if the +	 * current lock is owned by the task which initiated the chain +	 * walk, we detected a deadlock. +	 */  	if (lock == orig_lock || rt_mutex_owner(lock) == top_task) {  		debug_rt_mutex_deadlock(deadlock_detect, orig_waiter, lock);  		raw_spin_unlock(&lock->wait_lock); @@ -527,6 +539,18 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,  	unsigned long flags;  	int chain_walk = 0, res; +	/* +	 * Early deadlock detection. We really don't want the task to +	 * enqueue on itself just to untangle the mess later. It's not +	 * only an optimization. We drop the locks, so another waiter +	 * can come in before the chain walk detects the deadlock. So +	 * the other will detect the deadlock and return -EDEADLOCK, +	 * which is wrong, as the other waiter is not in a deadlock +	 * situation. +	 */ +	if (detect_deadlock && owner == task) +		return -EDEADLK; +  	raw_spin_lock_irqsave(&task->pi_lock, flags);  	__rt_mutex_adjust_prio(task);  	waiter->task = task; diff --git a/kernel/module.c b/kernel/module.c index 11869408f79b..81e727cf6df9 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -815,9 +815,6 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user,  		return -EFAULT;  	name[MODULE_NAME_LEN-1] = '\0'; -	if (!(flags & O_NONBLOCK)) -		pr_warn("waiting module removal not supported: please upgrade\n"); -  	if (mutex_lock_interruptible(&module_mutex) != 0)  		return -EINTR; @@ -3023,21 +3020,6 @@ static int do_init_module(struct module *mod)  	 */  	current->flags &= ~PF_USED_ASYNC; -	blocking_notifier_call_chain(&module_notify_list, -			MODULE_STATE_COMING, mod); - -	/* Set RO and NX regions for core */ -	set_section_ro_nx(mod->module_core, -				mod->core_text_size, -				mod->core_ro_size, -				mod->core_size); - -	/* Set RO and NX regions for init */ -	set_section_ro_nx(mod->module_init, -				mod->init_text_size, -				mod->init_ro_size, -				mod->init_size); -  	do_mod_ctors(mod);  	/* Start the module */  	if (mod->init != NULL) @@ -3168,9 +3150,26 @@ static int complete_formation(struct module *mod, struct load_info *info)  	/* This relies on module_mutex for list integrity. */  	module_bug_finalize(info->hdr, info->sechdrs, mod); +	/* Set RO and NX regions for core */ +	set_section_ro_nx(mod->module_core, +				mod->core_text_size, +				mod->core_ro_size, +				mod->core_size); + +	/* Set RO and NX regions for init */ +	set_section_ro_nx(mod->module_init, +				mod->init_text_size, +				mod->init_ro_size, +				mod->init_size); +  	/* Mark state as coming so strong_try_module_get() ignores us,  	 * but kallsyms etc. can see us. 
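The early check added to task_blocks_on_rt_mutex() above refuses to let a task queue behind a lock it already owns, returning -EDEADLK before any waiter is enqueued. A toy model of that test, with invented task and lock structures:

#include <errno.h>
#include <stdio.h>

struct task { const char *name; };
struct lock { struct task *owner; };

static int block_on(struct lock *l, struct task *t, int detect_deadlock)
{
        if (detect_deadlock && l->owner == t)
                return -EDEADLK;        /* the task would wait for itself */
        /* ... otherwise enqueue the waiter and walk the lock chain ... */
        return 0;
}

int main(void)
{
        struct task a = { "A" };
        struct lock l = { &a };

        printf("%d\n", block_on(&l, &a, 1));    /* a negative EDEADLK value */
        return 0;
}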
*/  	mod->state = MODULE_STATE_COMING; +	mutex_unlock(&module_mutex); + +	blocking_notifier_call_chain(&module_notify_list, +				     MODULE_STATE_COMING, mod); +	return 0;  out:  	mutex_unlock(&module_mutex); @@ -3193,6 +3192,7 @@ static int load_module(struct load_info *info, const char __user *uargs,  {  	struct module *mod;  	long err; +	char *after_dashes;  	err = module_sig_check(info);  	if (err) @@ -3271,16 +3271,24 @@ static int load_module(struct load_info *info, const char __user *uargs,  	dynamic_debug_setup(info->debug, info->num_debug); +	/* Ftrace init must be called in the MODULE_STATE_UNFORMED state */ +	ftrace_module_init(mod); +  	/* Finally it's fully formed, ready to start executing. */  	err = complete_formation(mod, info);  	if (err)  		goto ddebug_cleanup;  	/* Module is ready to execute: parsing args may do that. */ -	err = parse_args(mod->name, mod->args, mod->kp, mod->num_kp, -			 -32768, 32767, unknown_module_param_cb); -	if (err < 0) +	after_dashes = parse_args(mod->name, mod->args, mod->kp, mod->num_kp, +				  -32768, 32767, unknown_module_param_cb); +	if (IS_ERR(after_dashes)) { +		err = PTR_ERR(after_dashes);  		goto bug_cleanup; +	} else if (after_dashes) { +		pr_warn("%s: parameters '%s' after `--' ignored\n", +		       mod->name, after_dashes); +	}  	/* Link in to syfs. */  	err = mod_sysfs_setup(mod, info, mod->kp, mod->num_kp); diff --git a/kernel/panic.c b/kernel/panic.c index d02fa9fef46a..62e16cef9cc2 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -32,6 +32,7 @@ static unsigned long tainted_mask;  static int pause_on_oops;  static int pause_on_oops_flag;  static DEFINE_SPINLOCK(pause_on_oops_lock); +static bool crash_kexec_post_notifiers;  int panic_timeout = CONFIG_PANIC_TIMEOUT;  EXPORT_SYMBOL_GPL(panic_timeout); @@ -112,9 +113,11 @@ void panic(const char *fmt, ...)  	/*  	 * If we have crashed and we have a crash kernel loaded let it handle  	 * everything else. -	 * Do we want to call this before we try to display a message? +	 * If we want to run this after calling panic_notifiers, pass +	 * the "crash_kexec_post_notifiers" option to the kernel.  	 */ -	crash_kexec(NULL); +	if (!crash_kexec_post_notifiers) +		crash_kexec(NULL);  	/*  	 * Note smp_send_stop is the usual smp shutdown function, which @@ -131,6 +134,15 @@ void panic(const char *fmt, ...)  	kmsg_dump(KMSG_DUMP_PANIC); +	/* +	 * If you doubt kdump always works fine in any situation, +	 * "crash_kexec_post_notifiers" offers you a chance to run +	 * panic_notifiers and dumping kmsg before kdump. +	 * Note: since some panic_notifiers can make crashed kernel +	 * more unstable, it can increase risks of the kdump failure too. +	 */ +	crash_kexec(NULL); +  	bust_spinlocks(0);  	if (!panic_blink) @@ -472,6 +484,13 @@ EXPORT_SYMBOL(__stack_chk_fail);  core_param(panic, panic_timeout, int, 0644);  core_param(pause_on_oops, pause_on_oops, int, 0644); +static int __init setup_crash_kexec_post_notifiers(char *s) +{ +	crash_kexec_post_notifiers = true; +	return 0; +} +early_param("crash_kexec_post_notifiers", setup_crash_kexec_post_notifiers); +  static int __init oops_setup(char *s)  {  	if (!s) diff --git a/kernel/params.c b/kernel/params.c index b00142e7f3ba..1e52ca233fd9 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -177,13 +177,13 @@ static char *next_arg(char *args, char **param, char **val)  }  /* Args looks like "foo=bar,bar2 baz=fuz wiz". 
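load_module() above now has to handle three possible results from parse_args(): NULL when everything parsed, an ERR_PTR-encoded errno on failure, and otherwise a pointer to the text after "--" that is merely reported and ignored. The sketch below re-creates trimmed-down ERR_PTR()/IS_ERR()/PTR_ERR() helpers (modelled on the kernel's err.h) and a fake parser so the caller-side pattern can be compiled standalone; the parsing itself is invented.

#include <errno.h>
#include <stdio.h>
#include <string.h>

#define MAX_ERRNO       4095
#define ERR_PTR(err)    ((void *)(long)(err))
#define PTR_ERR(ptr)    ((long)(ptr))
#define IS_ERR(ptr)     ((unsigned long)(ptr) >= (unsigned long)-MAX_ERRNO)

/* NULL: all parsed; ERR_PTR(-E...): failure; else: text following "--" */
static char *parse_args_sketch(char *args)
{
        char *dashes = strstr(args, "--");

        if (!dashes)
                return NULL;
        if (dashes[2] != ' ' && dashes[2] != '\0')
                return ERR_PTR(-EINVAL);        /* malformed, for illustration */
        return dashes + 2;
}

int main(void)
{
        char cmdline[] = "foo=1 bar=2 -- extra stuff";
        char *after = parse_args_sketch(cmdline);

        if (IS_ERR(after))
                printf("error %ld\n", PTR_ERR(after));
        else if (after)
                printf("parameters '%s' after '--' ignored\n", after);
        else
                printf("all parsed\n");
        return 0;
}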
*/ -int parse_args(const char *doing, -	       char *args, -	       const struct kernel_param *params, -	       unsigned num, -	       s16 min_level, -	       s16 max_level, -	       int (*unknown)(char *param, char *val, const char *doing)) +char *parse_args(const char *doing, +		 char *args, +		 const struct kernel_param *params, +		 unsigned num, +		 s16 min_level, +		 s16 max_level, +		 int (*unknown)(char *param, char *val, const char *doing))  {  	char *param, *val; @@ -198,6 +198,9 @@ int parse_args(const char *doing,  		int irq_was_disabled;  		args = next_arg(args, ¶m, &val); +		/* Stop at -- */ +		if (!val && strcmp(param, "--") == 0) +			return args;  		irq_was_disabled = irqs_disabled();  		ret = parse_one(param, val, doing, params, num,  				min_level, max_level, unknown); @@ -208,22 +211,22 @@ int parse_args(const char *doing,  		switch (ret) {  		case -ENOENT:  			pr_err("%s: Unknown parameter `%s'\n", doing, param); -			return ret; +			return ERR_PTR(ret);  		case -ENOSPC:  			pr_err("%s: `%s' too large for parameter `%s'\n",  			       doing, val ?: "", param); -			return ret; +			return ERR_PTR(ret);  		case 0:  			break;  		default:  			pr_err("%s: `%s' invalid for parameter `%s'\n",  			       doing, val ?: "", param); -			return ret; +			return ERR_PTR(ret);  		}  	}  	/* All parsed OK. */ -	return 0; +	return NULL;  }  /* Lazy bastard, eh? */ diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 2fac9cc79b3d..9a83d780facd 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -257,8 +257,7 @@ config ARCH_HAS_OPP  	bool  config PM_OPP -	bool "Operating Performance Point (OPP) Layer library" -	depends on ARCH_HAS_OPP +	bool  	---help---  	  SOCs have a standard set of tuples consisting of frequency and  	  voltage pairs that the device will support per voltage domain. This diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index f4f2073711d3..49e0a20fd010 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -28,6 +28,7 @@  #include <linux/syscore_ops.h>  #include <linux/ctype.h>  #include <linux/genhd.h> +#include <trace/events/power.h>  #include "power.h" @@ -35,7 +36,7 @@  static int nocompress;  static int noresume;  static int resume_wait; -static int resume_delay; +static unsigned int resume_delay;  static char resume_file[256] = CONFIG_PM_STD_PARTITION;  dev_t swsusp_resume_device;  sector_t swsusp_resume_block; @@ -228,19 +229,23 @@ static void platform_recover(int platform_mode)  void swsusp_show_speed(struct timeval *start, struct timeval *stop,  			unsigned nr_pages, char *msg)  { -	s64 elapsed_centisecs64; -	int centisecs; -	int k; -	int kps; +	u64 elapsed_centisecs64; +	unsigned int centisecs; +	unsigned int k; +	unsigned int kps;  	elapsed_centisecs64 = timeval_to_ns(stop) - timeval_to_ns(start); +	/* +	 * If "(s64)elapsed_centisecs64 < 0", it will print long elapsed time, +	 * it is obvious enough for what went wrong. 
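The swsusp_show_speed() changes above keep the arithmetic the same while moving to unsigned types: the elapsed time is reduced to centiseconds, pages become kilobytes, and the rate is printed as whole units plus two digits. A standalone rerun of that arithmetic with made-up numbers, using plain 64-bit division in place of do_div():

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        uint64_t elapsed_ns = 2500000000ull;    /* 2.5 s */
        unsigned int nr_pages = 131072;         /* 512 MB of 4 KiB pages */
        unsigned int page_kb = 4;               /* PAGE_SIZE / 1024 */

        unsigned int centisecs = elapsed_ns / (1000000000u / 100);
        unsigned int k = nr_pages * page_kb;
        unsigned int kps = (k * 100) / (centisecs ? centisecs : 1);

        printf("%u kbytes in %u.%02u seconds (%u.%02u MB/s)\n",
               k, centisecs / 100, centisecs % 100,
               kps / 1000, (kps % 1000) / 10);
        return 0;
}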
+	 */  	do_div(elapsed_centisecs64, NSEC_PER_SEC / 100);  	centisecs = elapsed_centisecs64;  	if (centisecs == 0)  		centisecs = 1;	/* avoid div-by-zero */  	k = nr_pages * (PAGE_SIZE / 1024);  	kps = (k * 100) / centisecs; -	printk(KERN_INFO "PM: %s %d kbytes in %d.%02d seconds (%d.%02d MB/s)\n", +	printk(KERN_INFO "PM: %s %u kbytes in %u.%02u seconds (%u.%02u MB/s)\n",  			msg, k,  			centisecs / 100, centisecs % 100,  			kps / 1000, (kps % 1000) / 10); @@ -288,7 +293,9 @@ static int create_image(int platform_mode)  	in_suspend = 1;  	save_processor_state(); +	trace_suspend_resume(TPS("machine_suspend"), PM_EVENT_HIBERNATE, true);  	error = swsusp_arch_suspend(); +	trace_suspend_resume(TPS("machine_suspend"), PM_EVENT_HIBERNATE, false);  	if (error)  		printk(KERN_ERR "PM: Error %d creating hibernation image\n",  			error); @@ -595,7 +602,8 @@ static void power_down(void)  	case HIBERNATION_PLATFORM:  		hibernation_platform_enter();  	case HIBERNATION_SHUTDOWN: -		kernel_power_off(); +		if (pm_power_off) +			kernel_power_off();  		break;  #ifdef CONFIG_SUSPEND  	case HIBERNATION_SUSPEND: @@ -623,7 +631,8 @@ static void power_down(void)  	 * corruption after resume.  	 */  	printk(KERN_CRIT "PM: Please power down manually\n"); -	while(1); +	while (1) +		cpu_relax();  }  /** @@ -1109,7 +1118,10 @@ static int __init resumewait_setup(char *str)  static int __init resumedelay_setup(char *str)  { -	resume_delay = simple_strtoul(str, NULL, 0); +	int rc = kstrtouint(str, 0, &resume_delay); + +	if (rc) +		return rc;  	return 1;  } diff --git a/kernel/power/main.c b/kernel/power/main.c index 6271bc4073ef..573410d6647e 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -279,26 +279,26 @@ static inline void pm_print_times_init(void) {}  struct kobject *power_kobj;  /** - *	state - control system power state. + * state - control system sleep states.   * - *	show() returns what states are supported, which is hard-coded to - *	'freeze' (Low-Power Idle), 'standby' (Power-On Suspend), - *	'mem' (Suspend-to-RAM), and 'disk' (Suspend-to-Disk). + * show() returns available sleep state labels, which may be "mem", "standby", + * "freeze" and "disk" (hibernation).  See Documentation/power/states.txt for a + * description of what they mean.   * - *	store() accepts one of those strings, translates it into the - *	proper enumerated value, and initiates a suspend transition. + * store() accepts one of those strings, translates it into the proper + * enumerated value, and initiates a suspend transition.   
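The /sys/power/state documentation above boils down to a label table: show() prints the labels whose slots carry a usable state, and store() translates a written label back into that state. A userspace sketch of the same lookup; the enum values and table contents are invented, while the real kernel keys off pm_states[] as filled in by suspend_set_ops().

#include <stdio.h>
#include <string.h>

enum sleep_state { STATE_NONE, STATE_FREEZE, STATE_STANDBY, STATE_MEM };

struct sleep_slot {
        const char *label;
        enum sleep_state state;
};

static const struct sleep_slot slots[] = {
        { "freeze",  STATE_FREEZE },
        { "standby", STATE_NONE },      /* platform does not support it */
        { "mem",     STATE_MEM },
};

static enum sleep_state decode(const char *buf)
{
        size_t i, len = strcspn(buf, "\n ");    /* ignore trailing newline */

        for (i = 0; i < sizeof(slots) / sizeof(slots[0]); i++)
                if (slots[i].state && len == strlen(slots[i].label) &&
                    !strncmp(buf, slots[i].label, len))
                        return slots[i].state;
        return STATE_NONE;
}

int main(void)
{
        printf("%d %d\n", decode("mem\n"), decode("standby\n"));        /* 3 0 */
        return 0;
}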
*/  static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr,  			  char *buf)  {  	char *s = buf;  #ifdef CONFIG_SUSPEND -	int i; +	suspend_state_t i; + +	for (i = PM_SUSPEND_MIN; i < PM_SUSPEND_MAX; i++) +		if (pm_states[i].state) +			s += sprintf(s,"%s ", pm_states[i].label); -	for (i = 0; i < PM_SUSPEND_MAX; i++) { -		if (pm_states[i] && valid_state(i)) -			s += sprintf(s,"%s ", pm_states[i]); -	}  #endif  #ifdef CONFIG_HIBERNATION  	s += sprintf(s, "%s\n", "disk"); @@ -314,7 +314,7 @@ static suspend_state_t decode_state(const char *buf, size_t n)  {  #ifdef CONFIG_SUSPEND  	suspend_state_t state = PM_SUSPEND_MIN; -	const char * const *s; +	struct pm_sleep_state *s;  #endif  	char *p;  	int len; @@ -328,8 +328,9 @@ static suspend_state_t decode_state(const char *buf, size_t n)  #ifdef CONFIG_SUSPEND  	for (s = &pm_states[state]; state < PM_SUSPEND_MAX; s++, state++) -		if (*s && len == strlen(*s) && !strncmp(buf, *s, len)) -			return state; +		if (s->state && len == strlen(s->label) +		    && !strncmp(buf, s->label, len)) +			return s->state;  #endif  	return PM_SUSPEND_ON; @@ -447,8 +448,8 @@ static ssize_t autosleep_show(struct kobject *kobj,  #ifdef CONFIG_SUSPEND  	if (state < PM_SUSPEND_MAX) -		return sprintf(buf, "%s\n", valid_state(state) ? -						pm_states[state] : "error"); +		return sprintf(buf, "%s\n", pm_states[state].state ? +					pm_states[state].label : "error");  #endif  #ifdef CONFIG_HIBERNATION  	return sprintf(buf, "disk\n"); diff --git a/kernel/power/power.h b/kernel/power/power.h index 15f37ea08719..c60f13b5270a 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -178,17 +178,20 @@ extern void swsusp_show_speed(struct timeval *, struct timeval *,  				unsigned int, char *);  #ifdef CONFIG_SUSPEND +struct pm_sleep_state { +	const char *label; +	suspend_state_t state; +}; +  /* kernel/power/suspend.c */ -extern const char *const pm_states[]; +extern struct pm_sleep_state pm_states[]; -extern bool valid_state(suspend_state_t state);  extern int suspend_devices_and_enter(suspend_state_t state);  #else /* !CONFIG_SUSPEND */  static inline int suspend_devices_and_enter(suspend_state_t state)  {  	return -ENOSYS;  } -static inline bool valid_state(suspend_state_t state) { return false; }  #endif /* !CONFIG_SUSPEND */  #ifdef CONFIG_PM_TEST_SUSPEND diff --git a/kernel/power/process.c b/kernel/power/process.c index 06ec8869dbf1..0ca8d83e2369 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -17,6 +17,7 @@  #include <linux/delay.h>  #include <linux/workqueue.h>  #include <linux/kmod.h> +#include <trace/events/power.h>  /*    * Timeout for stopping processes @@ -175,6 +176,7 @@ void thaw_processes(void)  	struct task_struct *g, *p;  	struct task_struct *curr = current; +	trace_suspend_resume(TPS("thaw_processes"), 0, true);  	if (pm_freezing)  		atomic_dec(&system_freezing_cnt);  	pm_freezing = false; @@ -201,6 +203,7 @@ void thaw_processes(void)  	schedule();  	printk("done.\n"); +	trace_suspend_resume(TPS("thaw_processes"), 0, false);  }  void thaw_kernel_threads(void) diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 18fb7a2fb14b..1ea328aafdc9 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -1586,7 +1586,7 @@ swsusp_alloc(struct memory_bitmap *orig_bm, struct memory_bitmap *copy_bm,  	return -ENOMEM;  } -asmlinkage int swsusp_save(void) +asmlinkage __visible int swsusp_save(void)  {  	unsigned int nr_pages, nr_highmem; diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 
c3ad9cafe930..4dd8822f732a 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -14,6 +14,7 @@  #include <linux/init.h>  #include <linux/console.h>  #include <linux/cpu.h> +#include <linux/cpuidle.h>  #include <linux/syscalls.h>  #include <linux/gfp.h>  #include <linux/io.h> @@ -30,13 +31,14 @@  #include "power.h" -const char *const pm_states[PM_SUSPEND_MAX] = { -	[PM_SUSPEND_FREEZE]	= "freeze", -	[PM_SUSPEND_STANDBY]	= "standby", -	[PM_SUSPEND_MEM]	= "mem", +struct pm_sleep_state pm_states[PM_SUSPEND_MAX] = { +	[PM_SUSPEND_FREEZE] = { .label = "freeze", .state = PM_SUSPEND_FREEZE }, +	[PM_SUSPEND_STANDBY] = { .label = "standby", }, +	[PM_SUSPEND_MEM] = { .label = "mem", },  };  static const struct platform_suspend_ops *suspend_ops; +static const struct platform_freeze_ops *freeze_ops;  static bool need_suspend_ops(suspend_state_t state)  { @@ -46,6 +48,13 @@ static bool need_suspend_ops(suspend_state_t state)  static DECLARE_WAIT_QUEUE_HEAD(suspend_freeze_wait_head);  static bool suspend_freeze_wake; +void freeze_set_ops(const struct platform_freeze_ops *ops) +{ +	lock_system_sleep(); +	freeze_ops = ops; +	unlock_system_sleep(); +} +  static void freeze_begin(void)  {  	suspend_freeze_wake = false; @@ -53,7 +62,11 @@ static void freeze_begin(void)  static void freeze_enter(void)  { +	cpuidle_use_deepest_state(true); +	cpuidle_resume();  	wait_event(suspend_freeze_wait_head, suspend_freeze_wake); +	cpuidle_pause(); +	cpuidle_use_deepest_state(false);  }  void freeze_wake(void) @@ -63,42 +76,62 @@ void freeze_wake(void)  }  EXPORT_SYMBOL_GPL(freeze_wake); +static bool valid_state(suspend_state_t state) +{ +	/* +	 * PM_SUSPEND_STANDBY and PM_SUSPEND_MEM states need low level +	 * support and need to be valid to the low level +	 * implementation, no valid callback implies that none are valid. +	 */ +	return suspend_ops && suspend_ops->valid && suspend_ops->valid(state); +} + +/* + * If this is set, the "mem" label always corresponds to the deepest sleep state + * available, the "standby" label corresponds to the second deepest sleep state + * available (if any), and the "freeze" label corresponds to the remaining + * available sleep state (if there is one). + */ +static bool relative_states; + +static int __init sleep_states_setup(char *str) +{ +	relative_states = !strncmp(str, "1", 1); +	if (relative_states) { +		pm_states[PM_SUSPEND_MEM].state = PM_SUSPEND_FREEZE; +		pm_states[PM_SUSPEND_FREEZE].state = 0; +	} +	return 1; +} + +__setup("relative_sleep_states=", sleep_states_setup); +  /**   * suspend_set_ops - Set the global suspend method table.   * @ops: Suspend operations to use.   
*/  void suspend_set_ops(const struct platform_suspend_ops *ops)  { +	suspend_state_t i; +	int j = PM_SUSPEND_MAX - 1; +  	lock_system_sleep(); +  	suspend_ops = ops; +	for (i = PM_SUSPEND_MEM; i >= PM_SUSPEND_STANDBY; i--) +		if (valid_state(i)) +			pm_states[j--].state = i; +		else if (!relative_states) +			pm_states[j--].state = 0; + +	pm_states[j--].state = PM_SUSPEND_FREEZE; +	while (j >= PM_SUSPEND_MIN) +		pm_states[j--].state = 0; +  	unlock_system_sleep();  }  EXPORT_SYMBOL_GPL(suspend_set_ops); -bool valid_state(suspend_state_t state) -{ -	if (state == PM_SUSPEND_FREEZE) { -#ifdef CONFIG_PM_DEBUG -		if (pm_test_level != TEST_NONE && -		    pm_test_level != TEST_FREEZER && -		    pm_test_level != TEST_DEVICES && -		    pm_test_level != TEST_PLATFORM) { -			printk(KERN_WARNING "Unsupported pm_test mode for " -					"freeze state, please choose " -					"none/freezer/devices/platform.\n"); -			return false; -		} -#endif -			return true; -	} -	/* -	 * PM_SUSPEND_STANDBY and PM_SUSPEND_MEMORY states need lowlevel -	 * support and need to be valid to the lowlevel -	 * implementation, no valid callback implies that none are valid. -	 */ -	return suspend_ops && suspend_ops->valid && suspend_ops->valid(state); -} -  /**   * suspend_valid_only_mem - Generic memory-only valid callback.   * @@ -144,7 +177,9 @@ static int suspend_prepare(suspend_state_t state)  	if (error)  		goto Finish; +	trace_suspend_resume(TPS("freeze_processes"), 0, true);  	error = suspend_freeze_processes(); +	trace_suspend_resume(TPS("freeze_processes"), 0, false);  	if (!error)  		return 0; @@ -207,7 +242,9 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)  	 * all the devices are suspended.  	 */  	if (state == PM_SUSPEND_FREEZE) { +		trace_suspend_resume(TPS("machine_suspend"), state, true);  		freeze_enter(); +		trace_suspend_resume(TPS("machine_suspend"), state, false);  		goto Platform_wake;  	} @@ -223,7 +260,11 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)  	if (!error) {  		*wakeup = pm_wakeup_pending();  		if (!(suspend_test(TEST_CORE) || *wakeup)) { +			trace_suspend_resume(TPS("machine_suspend"), +				state, true);  			error = suspend_ops->enter(state); +			trace_suspend_resume(TPS("machine_suspend"), +				state, false);  			events_check_enabled = false;  		}  		syscore_resume(); @@ -261,11 +302,14 @@ int suspend_devices_and_enter(suspend_state_t state)  	if (need_suspend_ops(state) && !suspend_ops)  		return -ENOSYS; -	trace_machine_suspend(state);  	if (need_suspend_ops(state) && suspend_ops->begin) {  		error = suspend_ops->begin(state);  		if (error)  			goto Close; +	} else if (state == PM_SUSPEND_FREEZE && freeze_ops->begin) { +		error = freeze_ops->begin(); +		if (error) +			goto Close;  	}  	suspend_console();  	suspend_test_start(); @@ -291,7 +335,9 @@ int suspend_devices_and_enter(suspend_state_t state)   Close:  	if (need_suspend_ops(state) && suspend_ops->end)  		suspend_ops->end(); -	trace_machine_suspend(PWR_EVENT_EXIT); +	else if (state == PM_SUSPEND_FREEZE && freeze_ops->end) +		freeze_ops->end(); +  	return error;   Recover_platform: @@ -325,20 +371,31 @@ static int enter_state(suspend_state_t state)  {  	int error; -	if (!valid_state(state)) -		return -ENODEV; - +	trace_suspend_resume(TPS("suspend_enter"), state, true); +	if (state == PM_SUSPEND_FREEZE) { +#ifdef CONFIG_PM_DEBUG +		if (pm_test_level != TEST_NONE && pm_test_level <= TEST_CPUS) { +			pr_warning("PM: Unsupported test mode for freeze state," +				   "please choose 
none/freezer/devices/platform.\n"); +			return -EAGAIN; +		} +#endif +	} else if (!valid_state(state)) { +		return -EINVAL; +	}  	if (!mutex_trylock(&pm_mutex))  		return -EBUSY;  	if (state == PM_SUSPEND_FREEZE)  		freeze_begin(); +	trace_suspend_resume(TPS("sync_filesystems"), 0, true);  	printk(KERN_INFO "PM: Syncing filesystems ... ");  	sys_sync();  	printk("done.\n"); +	trace_suspend_resume(TPS("sync_filesystems"), 0, false); -	pr_debug("PM: Preparing system for %s sleep\n", pm_states[state]); +	pr_debug("PM: Preparing system for %s sleep\n", pm_states[state].label);  	error = suspend_prepare(state);  	if (error)  		goto Unlock; @@ -346,7 +403,8 @@ static int enter_state(suspend_state_t state)  	if (suspend_test(TEST_FREEZER))  		goto Finish; -	pr_debug("PM: Entering %s sleep\n", pm_states[state]); +	trace_suspend_resume(TPS("suspend_enter"), state, false); +	pr_debug("PM: Entering %s sleep\n", pm_states[state].label);  	pm_restrict_gfp_mask();  	error = suspend_devices_and_enter(state);  	pm_restore_gfp_mask(); diff --git a/kernel/power/suspend_test.c b/kernel/power/suspend_test.c index 9b2a1d58558d..269b097e78ea 100644 --- a/kernel/power/suspend_test.c +++ b/kernel/power/suspend_test.c @@ -92,13 +92,13 @@ static void __init test_wakealarm(struct rtc_device *rtc, suspend_state_t state)  	}  	if (state == PM_SUSPEND_MEM) { -		printk(info_test, pm_states[state]); +		printk(info_test, pm_states[state].label);  		status = pm_suspend(state);  		if (status == -ENODEV)  			state = PM_SUSPEND_STANDBY;  	}  	if (state == PM_SUSPEND_STANDBY) { -		printk(info_test, pm_states[state]); +		printk(info_test, pm_states[state].label);  		status = pm_suspend(state);  	}  	if (status < 0) @@ -136,18 +136,16 @@ static char warn_bad_state[] __initdata =  static int __init setup_test_suspend(char *value)  { -	unsigned i; +	suspend_state_t i;  	/* "=mem" ==> "mem" */  	value++; -	for (i = 0; i < PM_SUSPEND_MAX; i++) { -		if (!pm_states[i]) -			continue; -		if (strcmp(pm_states[i], value) != 0) -			continue; -		test_state = (__force suspend_state_t) i; -		return 0; -	} +	for (i = PM_SUSPEND_MIN; i < PM_SUSPEND_MAX; i++) +		if (!strcmp(pm_states[i].label, value)) { +			test_state = pm_states[i].state; +			return 0; +		} +  	printk(warn_bad_state, value);  	return 0;  } @@ -164,8 +162,8 @@ static int __init test_suspend(void)  	/* PM is initialized by now; is that state testable? */  	if (test_state == PM_SUSPEND_ON)  		goto done; -	if (!valid_state(test_state)) { -		printk(warn_bad_state, pm_states[test_state]); +	if (!pm_states[test_state].state) { +		printk(warn_bad_state, pm_states[test_state].label);  		goto done;  	} diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 8c9a4819f798..aaa3261dea5d 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -567,7 +567,7 @@ static int lzo_compress_threadfn(void *data)  /**   * save_image_lzo - Save the suspend image data compressed with LZO. - * @handle: Swap mam handle to use for saving the image. + * @handle: Swap map handle to use for saving the image.   * @snapshot: Image to read data from.   * @nr_to_write: Number of pages to save.   */ diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index a45b50962295..ea2d5f6962ed 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -54,20 +54,16 @@  #include "console_cmdline.h"  #include "braille.h" -/* printk's without a loglevel use this.. */ -#define DEFAULT_MESSAGE_LOGLEVEL CONFIG_DEFAULT_MESSAGE_LOGLEVEL - -/* We show everything that is MORE important than this.. 
*/ -#define MINIMUM_CONSOLE_LOGLEVEL 1 /* Minimum loglevel we let people use */ -#define DEFAULT_CONSOLE_LOGLEVEL 7 /* anything MORE serious than KERN_DEBUG */ -  int console_printk[4] = { -	DEFAULT_CONSOLE_LOGLEVEL,	/* console_loglevel */ +	CONSOLE_LOGLEVEL_DEFAULT,	/* console_loglevel */  	DEFAULT_MESSAGE_LOGLEVEL,	/* default_message_loglevel */ -	MINIMUM_CONSOLE_LOGLEVEL,	/* minimum_console_loglevel */ -	DEFAULT_CONSOLE_LOGLEVEL,	/* default_console_loglevel */ +	CONSOLE_LOGLEVEL_MIN,		/* minimum_console_loglevel */ +	CONSOLE_LOGLEVEL_DEFAULT,	/* default_console_loglevel */  }; +/* Deferred messaged from sched code are marked by this special level */ +#define SCHED_MESSAGE_LOGLEVEL -2 +  /*   * Low level drivers may need that to know if they can schedule in   * their unblank() callback or not. So let's export it. @@ -91,6 +87,29 @@ static struct lockdep_map console_lock_dep_map = {  #endif  /* + * Helper macros to handle lockdep when locking/unlocking console_sem. We use + * macros instead of functions so that _RET_IP_ contains useful information. + */ +#define down_console_sem() do { \ +	down(&console_sem);\ +	mutex_acquire(&console_lock_dep_map, 0, 0, _RET_IP_);\ +} while (0) + +static int __down_trylock_console_sem(unsigned long ip) +{ +	if (down_trylock(&console_sem)) +		return 1; +	mutex_acquire(&console_lock_dep_map, 0, 1, ip); +	return 0; +} +#define down_trylock_console_sem() __down_trylock_console_sem(_RET_IP_) + +#define up_console_sem() do { \ +	mutex_release(&console_lock_dep_map, 1, _RET_IP_);\ +	up(&console_sem);\ +} while (0) + +/*   * This is used for debugging the mess that is the VT code by   * keeping track if we have the console semaphore held. It's   * definitely not the perfect debug tool (we don't know if _WE_ @@ -206,8 +225,9 @@ struct printk_log {  };  /* - * The logbuf_lock protects kmsg buffer, indices, counters. It is also - * used in interesting ways to provide interlocking in console_unlock(); + * The logbuf_lock protects kmsg buffer, indices, counters.  This can be taken + * within the scheduler's rq lock. It must be released before calling + * console_unlock() or anything else that might wake up a process.   */  static DEFINE_RAW_SPINLOCK(logbuf_lock); @@ -250,9 +270,6 @@ static char __log_buf[__LOG_BUF_LEN] __aligned(LOG_ALIGN);  static char *log_buf = __log_buf;  static u32 log_buf_len = __LOG_BUF_LEN; -/* cpu currently holding logbuf_lock */ -static volatile unsigned int logbuf_cpu = UINT_MAX; -  /* human readable text of the record */  static char *log_text(const struct printk_log *msg)  { @@ -297,34 +314,106 @@ static u32 log_next(u32 idx)  	return idx + msg->len;  } -/* insert record into the buffer, discard old ones, update heads */ -static void log_store(int facility, int level, -		      enum log_flags flags, u64 ts_nsec, -		      const char *dict, u16 dict_len, -		      const char *text, u16 text_len) +/* + * Check whether there is enough free space for the given message. + * + * The same values of first_idx and next_idx mean that the buffer + * is either empty or full. + * + * If the buffer is empty, we must respect the position of the indexes. + * They cannot be reset to the beginning of the buffer. 
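logbuf_has_space(), introduced in the hunk that follows, applies the rule spelled out above: records must stay contiguous, so when the write index is ahead of the read index, or the buffer is empty, only the larger of the tail gap and the head room counts, and when the writer has wrapped only the gap between the two indexes does. A standalone model with an invented buffer size:

#include <stdio.h>

#define BUF_LEN 64u

static unsigned int free_space(unsigned int first_idx, unsigned int next_idx,
                               int empty)
{
        if (next_idx > first_idx || empty)
                return (BUF_LEN - next_idx) > first_idx ?
                       (BUF_LEN - next_idx) : first_idx;
        return first_idx - next_idx;
}

int main(void)
{
        /* empty buffer parked in the middle: 32 bytes usable, not 64 */
        printf("%u\n", free_space(32, 32, 1));
        /* writer ahead of reader: tail gap (24) beats head room (16) */
        printf("%u\n", free_space(16, 40, 0));
        /* writer wrapped around behind the reader */
        printf("%u\n", free_space(40, 16, 0));
        return 0;
}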
+ */ +static int logbuf_has_space(u32 msg_size, bool empty)  { -	struct printk_log *msg; -	u32 size, pad_len; +	u32 free; -	/* number of '\0' padding bytes to next message */ -	size = sizeof(struct printk_log) + text_len + dict_len; -	pad_len = (-size) & (LOG_ALIGN - 1); -	size += pad_len; +	if (log_next_idx > log_first_idx || empty) +		free = max(log_buf_len - log_next_idx, log_first_idx); +	else +		free = log_first_idx - log_next_idx; + +	/* +	 * We need space also for an empty header that signalizes wrapping +	 * of the buffer. +	 */ +	return free >= msg_size + sizeof(struct printk_log); +} +static int log_make_free_space(u32 msg_size) +{  	while (log_first_seq < log_next_seq) { -		u32 free; +		if (logbuf_has_space(msg_size, false)) +			return 0; +		/* drop old messages until we have enough continuous space */ +		log_first_idx = log_next(log_first_idx); +		log_first_seq++; +	} -		if (log_next_idx > log_first_idx) -			free = max(log_buf_len - log_next_idx, log_first_idx); -		else -			free = log_first_idx - log_next_idx; +	/* sequence numbers are equal, so the log buffer is empty */ +	if (logbuf_has_space(msg_size, true)) +		return 0; -		if (free >= size + sizeof(struct printk_log)) -			break; +	return -ENOMEM; +} -		/* drop old messages until we have enough contiuous space */ -		log_first_idx = log_next(log_first_idx); -		log_first_seq++; +/* compute the message size including the padding bytes */ +static u32 msg_used_size(u16 text_len, u16 dict_len, u32 *pad_len) +{ +	u32 size; + +	size = sizeof(struct printk_log) + text_len + dict_len; +	*pad_len = (-size) & (LOG_ALIGN - 1); +	size += *pad_len; + +	return size; +} + +/* + * Define how much of the log buffer we could take at maximum. The value + * must be greater than two. Note that only half of the buffer is available + * when the index points to the middle. + */ +#define MAX_LOG_TAKE_PART 4 +static const char trunc_msg[] = "<truncated>"; + +static u32 truncate_msg(u16 *text_len, u16 *trunc_msg_len, +			u16 *dict_len, u32 *pad_len) +{ +	/* +	 * The message should not take the whole buffer. Otherwise, it might +	 * get removed too soon. 
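[Editor's note] msg_used_size() above centralizes the record-size computation, including the padding that keeps records aligned to LOG_ALIGN. Here is a tiny standalone version of that arithmetic; the LOG_ALIGN value and the 16-byte header are assumed for the example.

#include <stdio.h>

#define LOG_ALIGN 8u	/* illustrative record alignment */

/*
 * Round a record up to the next LOG_ALIGN boundary, as msg_used_size()
 * does: (-size) & (LOG_ALIGN - 1) is the number of padding bytes needed.
 */
static unsigned int msg_used_size(unsigned int hdr, unsigned int text,
				  unsigned int dict, unsigned int *pad)
{
	unsigned int size = hdr + text + dict;

	*pad = (0u - size) & (LOG_ALIGN - 1);
	return size + *pad;
}

int main(void)
{
	unsigned int pad;
	unsigned int total = msg_used_size(16, 13, 0, &pad);

	/* 16 + 13 = 29, padded by 3 to reach 32 */
	printf("total=%u pad=%u\n", total, pad);
	return 0;
}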
+	 */ +	u32 max_text_len = log_buf_len / MAX_LOG_TAKE_PART; +	if (*text_len > max_text_len) +		*text_len = max_text_len; +	/* enable the warning message */ +	*trunc_msg_len = strlen(trunc_msg); +	/* disable the "dict" completely */ +	*dict_len = 0; +	/* compute the size again, count also the warning message */ +	return msg_used_size(*text_len + *trunc_msg_len, 0, pad_len); +} + +/* insert record into the buffer, discard old ones, update heads */ +static int log_store(int facility, int level, +		     enum log_flags flags, u64 ts_nsec, +		     const char *dict, u16 dict_len, +		     const char *text, u16 text_len) +{ +	struct printk_log *msg; +	u32 size, pad_len; +	u16 trunc_msg_len = 0; + +	/* number of '\0' padding bytes to next message */ +	size = msg_used_size(text_len, dict_len, &pad_len); + +	if (log_make_free_space(size)) { +		/* truncate the message if it is too long for empty buffer */ +		size = truncate_msg(&text_len, &trunc_msg_len, +				    &dict_len, &pad_len); +		/* survive when the log buffer is too small for trunc_msg */ +		if (log_make_free_space(size)) +			return 0;  	}  	if (log_next_idx + size + sizeof(struct printk_log) > log_buf_len) { @@ -341,6 +430,10 @@ static void log_store(int facility, int level,  	msg = (struct printk_log *)(log_buf + log_next_idx);  	memcpy(log_text(msg), text, text_len);  	msg->text_len = text_len; +	if (trunc_msg_len) { +		memcpy(log_text(msg) + text_len, trunc_msg, trunc_msg_len); +		msg->text_len += trunc_msg_len; +	}  	memcpy(log_dict(msg), dict, dict_len);  	msg->dict_len = dict_len;  	msg->facility = facility; @@ -356,6 +449,8 @@ static void log_store(int facility, int level,  	/* insert message */  	log_next_idx += msg->len;  	log_next_seq++; + +	return msg->text_len;  }  #ifdef CONFIG_SECURITY_DMESG_RESTRICT @@ -1303,7 +1398,10 @@ static void zap_locks(void)  	sema_init(&console_sem, 1);  } -/* Check if we have any console registered that can be called early in boot. */ +/* + * Check if we have any console that is capable of printing while cpu is + * booting or shutting down. Requires console_sem. + */  static int have_callable_console(void)  {  	struct console *con; @@ -1318,10 +1416,9 @@ static int have_callable_console(void)  /*   * Can we actually use the console at this time on this cpu?   * - * Console drivers may assume that per-cpu resources have - * been allocated. So unless they're explicitly marked as - * being able to cope (CON_ANYTIME) don't call them until - * this CPU is officially up. + * Console drivers may assume that per-cpu resources have been allocated. So + * unless they're explicitly marked as being able to cope (CON_ANYTIME) don't + * call them until this CPU is officially up.   */  static inline int can_use_console(unsigned int cpu)  { @@ -1333,36 +1430,24 @@ static inline int can_use_console(unsigned int cpu)   * messages from a 'printk'. Return true (and with the   * console_lock held, and 'console_locked' set) if it   * is successful, false otherwise. - * - * This gets called with the 'logbuf_lock' spinlock held and - * interrupts disabled. It should return with 'lockbuf_lock' - * released but interrupts still disabled.   */ -static int console_trylock_for_printk(unsigned int cpu) -	__releases(&logbuf_lock) +static int console_trylock_for_printk(void)  { -	int retval = 0, wake = 0; +	unsigned int cpu = smp_processor_id(); -	if (console_trylock()) { -		retval = 1; - -		/* -		 * If we can't use the console, we need to release -		 * the console semaphore by hand to avoid flushing -		 * the buffer. 
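[Editor's note] truncate_msg() above is the new fallback when even an emptied buffer cannot hold a message: the text is capped at log_buf_len / MAX_LOG_TAKE_PART, the dictionary is dropped, and a "<truncated>" marker is accounted for. A userspace rendering of that policy, with an invented 256-byte buffer, looks like this:

#include <stdio.h>
#include <string.h>

#define LOG_BUF_LEN		256u	/* invented buffer size */
#define MAX_LOG_TAKE_PART	4u	/* one record may use at most 1/4 of the buffer */

static const char trunc_msg[] = "<truncated>";

/*
 * Mirror of truncate_msg(): cap the text so a single record cannot occupy
 * most of the buffer (and then be dropped again almost immediately), drop
 * the dictionary, and account for the "<truncated>" marker.
 */
static unsigned int truncate_len(unsigned int *text_len, unsigned int *dict_len)
{
	unsigned int max_text_len = LOG_BUF_LEN / MAX_LOG_TAKE_PART;

	if (*text_len > max_text_len)
		*text_len = max_text_len;
	*dict_len = 0;
	return *text_len + (unsigned int)strlen(trunc_msg);
}

int main(void)
{
	unsigned int text = 1000, dict = 50;
	unsigned int stored = truncate_len(&text, &dict);

	printf("text=%u dict=%u stored=%u\n", text, dict, stored);
	return 0;
}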
We need to hold the console semaphore -		 * in order to do this test safely. -		 */ -		if (!can_use_console(cpu)) { -			console_locked = 0; -			wake = 1; -			retval = 0; -		} +	if (!console_trylock()) +		return 0; +	/* +	 * If we can't use the console, we need to release the console +	 * semaphore by hand to avoid flushing the buffer. We need to hold the +	 * console semaphore in order to do this test safely. +	 */ +	if (!can_use_console(cpu)) { +		console_locked = 0; +		up_console_sem(); +		return 0;  	} -	logbuf_cpu = UINT_MAX; -	raw_spin_unlock(&logbuf_lock); -	if (wake) -		up(&console_sem); -	return retval; +	return 1;  }  int printk_delay_msec __read_mostly; @@ -1490,11 +1575,19 @@ asmlinkage int vprintk_emit(int facility, int level,  	static int recursion_bug;  	static char textbuf[LOG_LINE_MAX];  	char *text = textbuf; -	size_t text_len; +	size_t text_len = 0;  	enum log_flags lflags = 0;  	unsigned long flags;  	int this_cpu;  	int printed_len = 0; +	bool in_sched = false; +	/* cpu currently holding logbuf_lock in this function */ +	static volatile unsigned int logbuf_cpu = UINT_MAX; + +	if (level == SCHED_MESSAGE_LOGLEVEL) { +		level = -1; +		in_sched = true; +	}  	boot_delay_msec(level);  	printk_delay(); @@ -1516,7 +1609,8 @@ asmlinkage int vprintk_emit(int facility, int level,  		 */  		if (!oops_in_progress && !lockdep_recursing(current)) {  			recursion_bug = 1; -			goto out_restore_irqs; +			local_irq_restore(flags); +			return 0;  		}  		zap_locks();  	} @@ -1530,17 +1624,22 @@ asmlinkage int vprintk_emit(int facility, int level,  			"BUG: recent printk recursion!";  		recursion_bug = 0; -		printed_len += strlen(recursion_msg); +		text_len = strlen(recursion_msg);  		/* emit KERN_CRIT message */ -		log_store(0, 2, LOG_PREFIX|LOG_NEWLINE, 0, -			  NULL, 0, recursion_msg, printed_len); +		printed_len += log_store(0, 2, LOG_PREFIX|LOG_NEWLINE, 0, +					 NULL, 0, recursion_msg, text_len);  	}  	/*  	 * The printf needs to come first; we need the syslog  	 * prefix which might be passed-in as a parameter.  	 */ -	text_len = vscnprintf(text, sizeof(textbuf), fmt, args); +	if (in_sched) +		text_len = scnprintf(text, sizeof(textbuf), +				     KERN_WARNING "[sched_delayed] "); + +	text_len += vscnprintf(text + text_len, +			       sizeof(textbuf) - text_len, fmt, args);  	/* mark and strip a trailing newline */  	if (text_len && text[text_len-1] == '\n') { @@ -1586,9 +1685,12 @@ asmlinkage int vprintk_emit(int facility, int level,  			cont_flush(LOG_NEWLINE);  		/* buffer line if possible, otherwise store it right away */ -		if (!cont_add(facility, level, text, text_len)) -			log_store(facility, level, lflags | LOG_CONT, 0, -				  dict, dictlen, text, text_len); +		if (cont_add(facility, level, text, text_len)) +			printed_len += text_len; +		else +			printed_len += log_store(facility, level, +						 lflags | LOG_CONT, 0, +						 dict, dictlen, text, text_len);  	} else {  		bool stored = false; @@ -1607,26 +1709,35 @@ asmlinkage int vprintk_emit(int facility, int level,  			cont_flush(LOG_NEWLINE);  		} -		if (!stored) -			log_store(facility, level, lflags, 0, -				  dict, dictlen, text, text_len); +		if (stored) +			printed_len += text_len; +		else +			printed_len += log_store(facility, level, lflags, 0, +						 dict, dictlen, text, text_len);  	} -	printed_len += text_len; + +	logbuf_cpu = UINT_MAX; +	raw_spin_unlock(&logbuf_lock); +	lockdep_on(); +	local_irq_restore(flags); + +	/* If called from the scheduler, we can not call up(). 
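[Editor's note] vprintk_emit() above now recognizes SCHED_MESSAGE_LOGLEVEL, prefixes such messages with "[sched_delayed] " while formatting, and returns before touching console_sem so the scheduler path never blocks on the console. The sketch below only models the two-step formatting into one buffer; userspace snprintf/vsnprintf stand in for the kernel's scnprintf/vscnprintf.

#include <stdarg.h>
#include <stdbool.h>
#include <stdio.h>

/*
 * Write the "[sched_delayed] " prefix first, then append the caller's
 * format into the remaining space, so the whole line becomes one record.
 */
static int format_deferred(char *buf, size_t len, bool in_sched,
			   const char *fmt, ...)
{
	va_list args;
	int text_len = 0;

	if (in_sched)
		text_len = snprintf(buf, len, "[sched_delayed] ");

	va_start(args, fmt);
	text_len += vsnprintf(buf + text_len, len - text_len, fmt, args);
	va_end(args);

	return text_len;
}

int main(void)
{
	char buf[128];

	format_deferred(buf, sizeof(buf), true, "CPU%d stalled for %d ms\n", 3, 20);
	fputs(buf, stdout);
	return 0;
}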
*/ +	if (in_sched) +		return printed_len;  	/* +	 * Disable preemption to avoid being preempted while holding +	 * console_sem which would prevent anyone from printing to console +	 */ +	preempt_disable(); +	/*  	 * Try to acquire and then immediately release the console semaphore.  	 * The release will print out buffers and wake up /dev/kmsg and syslog()  	 * users. -	 * -	 * The console_trylock_for_printk() function will release 'logbuf_lock' -	 * regardless of whether it actually gets the console semaphore or not.  	 */ -	if (console_trylock_for_printk(this_cpu)) +	if (console_trylock_for_printk())  		console_unlock(); - -	lockdep_on(); -out_restore_irqs: -	local_irq_restore(flags); +	preempt_enable();  	return printed_len;  } @@ -1674,7 +1785,7 @@ EXPORT_SYMBOL(printk_emit);   *   * See the vsnprintf() documentation for format string extensions over C99.   */ -asmlinkage int printk(const char *fmt, ...) +asmlinkage __visible int printk(const char *fmt, ...)  {  	va_list args;  	int r; @@ -1737,7 +1848,7 @@ void early_vprintk(const char *fmt, va_list ap)  	}  } -asmlinkage void early_printk(const char *fmt, ...) +asmlinkage __visible void early_printk(const char *fmt, ...)  {  	va_list ap; @@ -1882,16 +1993,14 @@ void suspend_console(void)  	printk("Suspending console(s) (use no_console_suspend to debug)\n");  	console_lock();  	console_suspended = 1; -	up(&console_sem); -	mutex_release(&console_lock_dep_map, 1, _RET_IP_); +	up_console_sem();  }  void resume_console(void)  {  	if (!console_suspend_enabled)  		return; -	down(&console_sem); -	mutex_acquire(&console_lock_dep_map, 0, 0, _RET_IP_); +	down_console_sem();  	console_suspended = 0;  	console_unlock();  } @@ -1933,12 +2042,11 @@ void console_lock(void)  {  	might_sleep(); -	down(&console_sem); +	down_console_sem();  	if (console_suspended)  		return;  	console_locked = 1;  	console_may_schedule = 1; -	mutex_acquire(&console_lock_dep_map, 0, 0, _RET_IP_);  }  EXPORT_SYMBOL(console_lock); @@ -1952,15 +2060,14 @@ EXPORT_SYMBOL(console_lock);   */  int console_trylock(void)  { -	if (down_trylock(&console_sem)) +	if (down_trylock_console_sem())  		return 0;  	if (console_suspended) { -		up(&console_sem); +		up_console_sem();  		return 0;  	}  	console_locked = 1;  	console_may_schedule = 0; -	mutex_acquire(&console_lock_dep_map, 0, 1, _RET_IP_);  	return 1;  }  EXPORT_SYMBOL(console_trylock); @@ -2022,7 +2129,7 @@ void console_unlock(void)  	bool retry;  	if (console_suspended) { -		up(&console_sem); +		up_console_sem();  		return;  	} @@ -2043,10 +2150,15 @@ again:  		}  		if (console_seq < log_first_seq) { +			len = sprintf(text, "** %u printk messages dropped ** ", +				      (unsigned)(log_first_seq - console_seq)); +  			/* messages are gone, move to first one */  			console_seq = log_first_seq;  			console_idx = log_first_idx;  			console_prev = 0; +		} else { +			len = 0;  		}  skip:  		if (console_seq == log_next_seq) @@ -2071,8 +2183,8 @@ skip:  		}  		level = msg->level; -		len = msg_print_text(msg, console_prev, false, -				     text, sizeof(text)); +		len += msg_print_text(msg, console_prev, false, +				      text + len, sizeof(text) - len);  		console_idx = log_next(console_idx);  		console_seq++;  		console_prev = msg->flags; @@ -2084,7 +2196,6 @@ skip:  		local_irq_restore(flags);  	}  	console_locked = 0; -	mutex_release(&console_lock_dep_map, 1, _RET_IP_);  	/* Release the exclusive_console once it is used */  	if (unlikely(exclusive_console)) @@ -2092,7 +2203,7 @@ skip:  	raw_spin_unlock(&logbuf_lock); -	
up(&console_sem); +	up_console_sem();  	/*  	 * Someone could have filled up the buffer again, so re-check if there's @@ -2137,7 +2248,7 @@ void console_unblank(void)  	 * oops_in_progress is set to 1..  	 */  	if (oops_in_progress) { -		if (down_trylock(&console_sem) != 0) +		if (down_trylock_console_sem() != 0)  			return;  	} else  		console_lock(); @@ -2413,6 +2524,7 @@ int unregister_console(struct console *console)  	if (console_drivers != NULL && console->flags & CON_CONSDEV)  		console_drivers->flags |= CON_CONSDEV; +	console->flags &= ~CON_ENABLED;  	console_unlock();  	console_sysfs_notify();  	return res; @@ -2437,21 +2549,19 @@ late_initcall(printk_late_init);  /*   * Delayed printk version, for scheduler-internal messages:   */ -#define PRINTK_BUF_SIZE		512 -  #define PRINTK_PENDING_WAKEUP	0x01 -#define PRINTK_PENDING_SCHED	0x02 +#define PRINTK_PENDING_OUTPUT	0x02  static DEFINE_PER_CPU(int, printk_pending); -static DEFINE_PER_CPU(char [PRINTK_BUF_SIZE], printk_sched_buf);  static void wake_up_klogd_work_func(struct irq_work *irq_work)  {  	int pending = __this_cpu_xchg(printk_pending, 0); -	if (pending & PRINTK_PENDING_SCHED) { -		char *buf = __get_cpu_var(printk_sched_buf); -		pr_warn("[sched_delayed] %s", buf); +	if (pending & PRINTK_PENDING_OUTPUT) { +		/* If trylock fails, someone else is doing the printing */ +		if (console_trylock()) +			console_unlock();  	}  	if (pending & PRINTK_PENDING_WAKEUP) @@ -2473,23 +2583,19 @@ void wake_up_klogd(void)  	preempt_enable();  } -int printk_sched(const char *fmt, ...) +int printk_deferred(const char *fmt, ...)  { -	unsigned long flags;  	va_list args; -	char *buf;  	int r; -	local_irq_save(flags); -	buf = __get_cpu_var(printk_sched_buf); - +	preempt_disable();  	va_start(args, fmt); -	r = vsnprintf(buf, PRINTK_BUF_SIZE, fmt, args); +	r = vprintk_emit(0, SCHED_MESSAGE_LOGLEVEL, NULL, 0, fmt, args);  	va_end(args); -	__this_cpu_or(printk_pending, PRINTK_PENDING_SCHED); +	__this_cpu_or(printk_pending, PRINTK_PENDING_OUTPUT);  	irq_work_queue(&__get_cpu_var(wake_up_klogd_work)); -	local_irq_restore(flags); +	preempt_enable();  	return r;  } diff --git a/kernel/profile.c b/kernel/profile.c index cb980f0c731b..54bf5ba26420 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -52,9 +52,9 @@ static DEFINE_MUTEX(profile_flip_mutex);  int profile_setup(char *str)  { -	static char schedstr[] = "schedule"; -	static char sleepstr[] = "sleep"; -	static char kvmstr[] = "kvm"; +	static const char schedstr[] = "schedule"; +	static const char sleepstr[] = "sleep"; +	static const char kvmstr[] = "kvm";  	int par;  	if (!strncmp(str, sleepstr, strlen(sleepstr))) { @@ -64,12 +64,10 @@ int profile_setup(char *str)  			str += strlen(sleepstr) + 1;  		if (get_option(&str, &par))  			prof_shift = par; -		printk(KERN_INFO -			"kernel sleep profiling enabled (shift: %ld)\n", +		pr_info("kernel sleep profiling enabled (shift: %ld)\n",  			prof_shift);  #else -		printk(KERN_WARNING -			"kernel sleep profiling requires CONFIG_SCHEDSTATS\n"); +		pr_warn("kernel sleep profiling requires CONFIG_SCHEDSTATS\n");  #endif /* CONFIG_SCHEDSTATS */  	} else if (!strncmp(str, schedstr, strlen(schedstr))) {  		prof_on = SCHED_PROFILING; @@ -77,8 +75,7 @@ int profile_setup(char *str)  			str += strlen(schedstr) + 1;  		if (get_option(&str, &par))  			prof_shift = par; -		printk(KERN_INFO -			"kernel schedule profiling enabled (shift: %ld)\n", +		pr_info("kernel schedule profiling enabled (shift: %ld)\n",  			prof_shift);  	} else if (!strncmp(str, kvmstr, 
strlen(kvmstr))) {  		prof_on = KVM_PROFILING; @@ -86,13 +83,12 @@ int profile_setup(char *str)  			str += strlen(kvmstr) + 1;  		if (get_option(&str, &par))  			prof_shift = par; -		printk(KERN_INFO -			"kernel KVM profiling enabled (shift: %ld)\n", +		pr_info("kernel KVM profiling enabled (shift: %ld)\n",  			prof_shift);  	} else if (get_option(&str, &par)) {  		prof_shift = par;  		prof_on = CPU_PROFILING; -		printk(KERN_INFO "kernel profiling enabled (shift: %ld)\n", +		pr_info("kernel profiling enabled (shift: %ld)\n",  			prof_shift);  	}  	return 1; diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index bd30bc61bc05..7fa34f86e5ba 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -58,9 +58,11 @@ torture_param(int, fqs_duration, 0,  	      "Duration of fqs bursts (us), 0 to disable");  torture_param(int, fqs_holdoff, 0, "Holdoff time within fqs bursts (us)");  torture_param(int, fqs_stutter, 3, "Wait time between fqs bursts (s)"); +torture_param(bool, gp_cond, false, "Use conditional/async GP wait primitives");  torture_param(bool, gp_exp, false, "Use expedited GP wait primitives");  torture_param(bool, gp_normal, false,  	     "Use normal (non-expedited) GP wait primitives"); +torture_param(bool, gp_sync, false, "Use synchronous GP wait primitives");  torture_param(int, irqreader, 1, "Allow RCU readers from irq handlers");  torture_param(int, n_barrier_cbs, 0,  	     "# of callbacks/kthreads for barrier testing"); @@ -138,6 +140,18 @@ static long n_barrier_attempts;  static long n_barrier_successes;  static struct list_head rcu_torture_removed; +static int rcu_torture_writer_state; +#define RTWS_FIXED_DELAY	0 +#define RTWS_DELAY		1 +#define RTWS_REPLACE		2 +#define RTWS_DEF_FREE		3 +#define RTWS_EXP_SYNC		4 +#define RTWS_COND_GET		5 +#define RTWS_COND_SYNC		6 +#define RTWS_SYNC		7 +#define RTWS_STUTTER		8 +#define RTWS_STOPPING		9 +  #if defined(MODULE) || defined(CONFIG_RCU_TORTURE_TEST_RUNNABLE)  #define RCUTORTURE_RUNNABLE_INIT 1  #else @@ -214,6 +228,7 @@ rcu_torture_free(struct rcu_torture *p)   */  struct rcu_torture_ops { +	int ttype;  	void (*init)(void);  	int (*readlock)(void);  	void (*read_delay)(struct torture_random_state *rrsp); @@ -222,6 +237,8 @@ struct rcu_torture_ops {  	void (*deferred_free)(struct rcu_torture *p);  	void (*sync)(void);  	void (*exp_sync)(void); +	unsigned long (*get_state)(void); +	void (*cond_sync)(unsigned long oldstate);  	void (*call)(struct rcu_head *head, void (*func)(struct rcu_head *rcu));  	void (*cb_barrier)(void);  	void (*fqs)(void); @@ -273,10 +290,48 @@ static int rcu_torture_completed(void)  	return rcu_batches_completed();  } +/* + * Update callback in the pipe.  This should be invoked after a grace period. + */ +static bool +rcu_torture_pipe_update_one(struct rcu_torture *rp) +{ +	int i; + +	i = rp->rtort_pipe_count; +	if (i > RCU_TORTURE_PIPE_LEN) +		i = RCU_TORTURE_PIPE_LEN; +	atomic_inc(&rcu_torture_wcount[i]); +	if (++rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) { +		rp->rtort_mbtest = 0; +		return true; +	} +	return false; +} + +/* + * Update all callbacks in the pipe.  Suitable for synchronous grace-period + * primitives. 
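[Editor's note] rcu_torture_pipe_update_one() above factors the per-element bookkeeping out of rcu_torture_cb() so the new synchronous grace-period paths can reuse it. The standalone model below shows what it tracks: the pipeline stage an element is in, a histogram bucket per stage, and a "free it now" verdict once the element has survived RCU_TORTURE_PIPE_LEN grace periods. The structures are reduced to the two fields the logic needs.

#include <stdbool.h>
#include <stdio.h>

#define RCU_TORTURE_PIPE_LEN 10

static long rcu_torture_wcount[RCU_TORTURE_PIPE_LEN + 1];

struct rcu_torture {
	int rtort_pipe_count;
	int rtort_mbtest;
};

/* Advance one element through the pipeline; return true when it may be freed. */
static bool pipe_update_one(struct rcu_torture *rp)
{
	int i = rp->rtort_pipe_count;

	if (i > RCU_TORTURE_PIPE_LEN)
		i = RCU_TORTURE_PIPE_LEN;
	rcu_torture_wcount[i]++;
	if (++rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) {
		rp->rtort_mbtest = 0;
		return true;
	}
	return false;
}

int main(void)
{
	struct rcu_torture rt = { 0, 1 };
	int updates = 1;

	while (!pipe_update_one(&rt))
		updates++;
	printf("element recycled after %d pipeline updates\n", updates);
	return 0;
}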
+ */ +static void +rcu_torture_pipe_update(struct rcu_torture *old_rp) +{ +	struct rcu_torture *rp; +	struct rcu_torture *rp1; + +	if (old_rp) +		list_add(&old_rp->rtort_free, &rcu_torture_removed); +	list_for_each_entry_safe(rp, rp1, &rcu_torture_removed, rtort_free) { +		if (rcu_torture_pipe_update_one(rp)) { +			list_del(&rp->rtort_free); +			rcu_torture_free(rp); +		} +	} +} +  static void  rcu_torture_cb(struct rcu_head *p)  { -	int i;  	struct rcu_torture *rp = container_of(p, struct rcu_torture, rtort_rcu);  	if (torture_must_stop_irq()) { @@ -284,16 +339,10 @@ rcu_torture_cb(struct rcu_head *p)  		/* The next initialization will pick up the pieces. */  		return;  	} -	i = rp->rtort_pipe_count; -	if (i > RCU_TORTURE_PIPE_LEN) -		i = RCU_TORTURE_PIPE_LEN; -	atomic_inc(&rcu_torture_wcount[i]); -	if (++rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) { -		rp->rtort_mbtest = 0; +	if (rcu_torture_pipe_update_one(rp))  		rcu_torture_free(rp); -	} else { +	else  		cur_ops->deferred_free(rp); -	}  }  static int rcu_no_completed(void) @@ -312,6 +361,7 @@ static void rcu_sync_torture_init(void)  }  static struct rcu_torture_ops rcu_ops = { +	.ttype		= RCU_FLAVOR,  	.init		= rcu_sync_torture_init,  	.readlock	= rcu_torture_read_lock,  	.read_delay	= rcu_read_delay, @@ -320,6 +370,8 @@ static struct rcu_torture_ops rcu_ops = {  	.deferred_free	= rcu_torture_deferred_free,  	.sync		= synchronize_rcu,  	.exp_sync	= synchronize_rcu_expedited, +	.get_state	= get_state_synchronize_rcu, +	.cond_sync	= cond_synchronize_rcu,  	.call		= call_rcu,  	.cb_barrier	= rcu_barrier,  	.fqs		= rcu_force_quiescent_state, @@ -355,6 +407,7 @@ static void rcu_bh_torture_deferred_free(struct rcu_torture *p)  }  static struct rcu_torture_ops rcu_bh_ops = { +	.ttype		= RCU_BH_FLAVOR,  	.init		= rcu_sync_torture_init,  	.readlock	= rcu_bh_torture_read_lock,  	.read_delay	= rcu_read_delay,  /* just reuse rcu's version. */ @@ -397,6 +450,7 @@ call_rcu_busted(struct rcu_head *head, void (*func)(struct rcu_head *rcu))  }  static struct rcu_torture_ops rcu_busted_ops = { +	.ttype		= INVALID_RCU_FLAVOR,  	.init		= rcu_sync_torture_init,  	.readlock	= rcu_torture_read_lock,  	.read_delay	= rcu_read_delay,  /* just reuse rcu's version. */ @@ -479,9 +533,11 @@ static void srcu_torture_stats(char *page)  	page += sprintf(page, "%s%s per-CPU(idx=%d):",  		       torture_type, TORTURE_FLAG, idx);  	for_each_possible_cpu(cpu) { -		page += sprintf(page, " %d(%lu,%lu)", cpu, -			       per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[!idx], -			       per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[idx]); +		long c0, c1; + +		c0 = (long)per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[!idx]; +		c1 = (long)per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[idx]; +		page += sprintf(page, " %d(%ld,%ld)", cpu, c0, c1);  	}  	sprintf(page, "\n");  } @@ -492,6 +548,7 @@ static void srcu_torture_synchronize_expedited(void)  }  static struct rcu_torture_ops srcu_ops = { +	.ttype		= SRCU_FLAVOR,  	.init		= rcu_sync_torture_init,  	.readlock	= srcu_torture_read_lock,  	.read_delay	= srcu_read_delay, @@ -527,6 +584,7 @@ static void rcu_sched_torture_deferred_free(struct rcu_torture *p)  }  static struct rcu_torture_ops sched_ops = { +	.ttype		= RCU_SCHED_FLAVOR,  	.init		= rcu_sync_torture_init,  	.readlock	= sched_torture_read_lock,  	.read_delay	= rcu_read_delay,  /* just reuse rcu's version. 
*/ @@ -688,23 +746,59 @@ rcu_torture_fqs(void *arg)  static int  rcu_torture_writer(void *arg)  { -	bool exp; +	unsigned long gp_snap; +	bool gp_cond1 = gp_cond, gp_exp1 = gp_exp, gp_normal1 = gp_normal; +	bool gp_sync1 = gp_sync;  	int i;  	struct rcu_torture *rp; -	struct rcu_torture *rp1;  	struct rcu_torture *old_rp;  	static DEFINE_TORTURE_RANDOM(rand); +	int synctype[] = { RTWS_DEF_FREE, RTWS_EXP_SYNC, +			   RTWS_COND_GET, RTWS_SYNC }; +	int nsynctypes = 0;  	VERBOSE_TOROUT_STRING("rcu_torture_writer task started"); -	set_user_nice(current, MAX_NICE); + +	/* Initialize synctype[] array.  If none set, take default. */ +	if (!gp_cond1 && !gp_exp1 && !gp_normal1 && !gp_sync) +		gp_cond1 = gp_exp1 = gp_normal1 = gp_sync1 = true; +	if (gp_cond1 && cur_ops->get_state && cur_ops->cond_sync) +		synctype[nsynctypes++] = RTWS_COND_GET; +	else if (gp_cond && (!cur_ops->get_state || !cur_ops->cond_sync)) +		pr_alert("rcu_torture_writer: gp_cond without primitives.\n"); +	if (gp_exp1 && cur_ops->exp_sync) +		synctype[nsynctypes++] = RTWS_EXP_SYNC; +	else if (gp_exp && !cur_ops->exp_sync) +		pr_alert("rcu_torture_writer: gp_exp without primitives.\n"); +	if (gp_normal1 && cur_ops->deferred_free) +		synctype[nsynctypes++] = RTWS_DEF_FREE; +	else if (gp_normal && !cur_ops->deferred_free) +		pr_alert("rcu_torture_writer: gp_normal without primitives.\n"); +	if (gp_sync1 && cur_ops->sync) +		synctype[nsynctypes++] = RTWS_SYNC; +	else if (gp_sync && !cur_ops->sync) +		pr_alert("rcu_torture_writer: gp_sync without primitives.\n"); +	if (WARN_ONCE(nsynctypes == 0, +		      "rcu_torture_writer: No update-side primitives.\n")) { +		/* +		 * No updates primitives, so don't try updating. +		 * The resulting test won't be testing much, hence the +		 * above WARN_ONCE(). 
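[Editor's note] The writer set-up above builds synctype[] from whichever update-side primitives the selected flavor provides and warns when none are available. Here is that selection logic reduced to a runnable sketch; the have_* flags are invented stand-ins for the cur_ops->get_state/cond_sync, ->exp_sync, ->deferred_free and ->sync callbacks.

#include <stdio.h>
#include <stdlib.h>

enum { RTWS_DEF_FREE, RTWS_EXP_SYNC, RTWS_COND_GET, RTWS_SYNC };

int main(void)
{
	int have_cond = 1, have_exp = 1, have_def_free = 0, have_sync = 1;
	int synctype[4], nsynctypes = 0, i;

	/* Collect only the grace-period wait styles this flavor supports. */
	if (have_cond)
		synctype[nsynctypes++] = RTWS_COND_GET;
	if (have_exp)
		synctype[nsynctypes++] = RTWS_EXP_SYNC;
	if (have_def_free)
		synctype[nsynctypes++] = RTWS_DEF_FREE;
	if (have_sync)
		synctype[nsynctypes++] = RTWS_SYNC;

	if (nsynctypes == 0) {
		fprintf(stderr, "no update-side primitives available\n");
		return 1;
	}

	/* Pick uniformly among the available styles on each update. */
	for (i = 0; i < 8; i++)
		printf("iteration %d uses synctype %d\n",
		       i, synctype[rand() % nsynctypes]);
	return 0;
}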
+		 */ +		rcu_torture_writer_state = RTWS_STOPPING; +		torture_kthread_stopping("rcu_torture_writer"); +	}  	do { +		rcu_torture_writer_state = RTWS_FIXED_DELAY;  		schedule_timeout_uninterruptible(1);  		rp = rcu_torture_alloc();  		if (rp == NULL)  			continue;  		rp->rtort_pipe_count = 0; +		rcu_torture_writer_state = RTWS_DELAY;  		udelay(torture_random(&rand) & 0x3ff); +		rcu_torture_writer_state = RTWS_REPLACE;  		old_rp = rcu_dereference_check(rcu_torture_current,  					       current == writer_task);  		rp->rtort_mbtest = 1; @@ -716,35 +810,42 @@ rcu_torture_writer(void *arg)  				i = RCU_TORTURE_PIPE_LEN;  			atomic_inc(&rcu_torture_wcount[i]);  			old_rp->rtort_pipe_count++; -			if (gp_normal == gp_exp) -				exp = !!(torture_random(&rand) & 0x80); -			else -				exp = gp_exp; -			if (!exp) { +			switch (synctype[torture_random(&rand) % nsynctypes]) { +			case RTWS_DEF_FREE: +				rcu_torture_writer_state = RTWS_DEF_FREE;  				cur_ops->deferred_free(old_rp); -			} else { +				break; +			case RTWS_EXP_SYNC: +				rcu_torture_writer_state = RTWS_EXP_SYNC;  				cur_ops->exp_sync(); -				list_add(&old_rp->rtort_free, -					 &rcu_torture_removed); -				list_for_each_entry_safe(rp, rp1, -							 &rcu_torture_removed, -							 rtort_free) { -					i = rp->rtort_pipe_count; -					if (i > RCU_TORTURE_PIPE_LEN) -						i = RCU_TORTURE_PIPE_LEN; -					atomic_inc(&rcu_torture_wcount[i]); -					if (++rp->rtort_pipe_count >= -					    RCU_TORTURE_PIPE_LEN) { -						rp->rtort_mbtest = 0; -						list_del(&rp->rtort_free); -						rcu_torture_free(rp); -					} -				 } +				rcu_torture_pipe_update(old_rp); +				break; +			case RTWS_COND_GET: +				rcu_torture_writer_state = RTWS_COND_GET; +				gp_snap = cur_ops->get_state(); +				i = torture_random(&rand) % 16; +				if (i != 0) +					schedule_timeout_interruptible(i); +				udelay(torture_random(&rand) % 1000); +				rcu_torture_writer_state = RTWS_COND_SYNC; +				cur_ops->cond_sync(gp_snap); +				rcu_torture_pipe_update(old_rp); +				break; +			case RTWS_SYNC: +				rcu_torture_writer_state = RTWS_SYNC; +				cur_ops->sync(); +				rcu_torture_pipe_update(old_rp); +				break; +			default: +				WARN_ON_ONCE(1); +				break;  			}  		}  		rcutorture_record_progress(++rcu_torture_current_version); +		rcu_torture_writer_state = RTWS_STUTTER;  		stutter_wait("rcu_torture_writer");  	} while (!torture_must_stop()); +	rcu_torture_writer_state = RTWS_STOPPING;  	torture_kthread_stopping("rcu_torture_writer");  	return 0;  } @@ -784,7 +885,7 @@ rcu_torture_fakewriter(void *arg)  	return 0;  } -void rcutorture_trace_dump(void) +static void rcutorture_trace_dump(void)  {  	static atomic_t beenhere = ATOMIC_INIT(0); @@ -918,11 +1019,13 @@ rcu_torture_reader(void *arg)  		__this_cpu_inc(rcu_torture_batch[completed]);  		preempt_enable();  		cur_ops->readunlock(idx); -		schedule(); +		cond_resched();  		stutter_wait("rcu_torture_reader");  	} while (!torture_must_stop()); -	if (irqreader && cur_ops->irq_capable) +	if (irqreader && cur_ops->irq_capable) {  		del_timer_sync(&t); +		destroy_timer_on_stack(&t); +	}  	torture_kthread_stopping("rcu_torture_reader");  	return 0;  } @@ -937,6 +1040,7 @@ rcu_torture_printk(char *page)  	int i;  	long pipesummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 };  	long batchsummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 }; +	static unsigned long rtcv_snap = ULONG_MAX;  	for_each_possible_cpu(cpu) {  		for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) { @@ -997,6 +1101,22 @@ rcu_torture_printk(char *page)  	page += sprintf(page, "\n");  	if (cur_ops->stats)  	
	cur_ops->stats(page); +	if (rtcv_snap == rcu_torture_current_version && +	    rcu_torture_current != NULL) { +		int __maybe_unused flags; +		unsigned long __maybe_unused gpnum; +		unsigned long __maybe_unused completed; + +		rcutorture_get_gp_data(cur_ops->ttype, +				       &flags, &gpnum, &completed); +		page += sprintf(page, +				"??? Writer stall state %d g%lu c%lu f%#x\n", +				rcu_torture_writer_state, +				gpnum, completed, flags); +		show_rcu_gp_kthreads(); +		rcutorture_trace_dump(); +	} +	rtcv_snap = rcu_torture_current_version;  }  /* @@ -1146,7 +1266,7 @@ static int __init rcu_torture_stall_init(void)  }  /* Callback function for RCU barrier testing. */ -void rcu_torture_barrier_cbf(struct rcu_head *rcu) +static void rcu_torture_barrier_cbf(struct rcu_head *rcu)  {  	atomic_inc(&barrier_cbs_invoked);  } @@ -1416,7 +1536,8 @@ rcu_torture_init(void)  		&rcu_ops, &rcu_bh_ops, &rcu_busted_ops, &srcu_ops, &sched_ops,  	}; -	torture_init_begin(torture_type, verbose, &rcutorture_runnable); +	if (!torture_init_begin(torture_type, verbose, &rcutorture_runnable)) +		return -EBUSY;  	/* Process args and tell the world that the torturer is on the job. */  	for (i = 0; i < ARRAY_SIZE(torture_ops); i++) { @@ -1441,10 +1562,13 @@ rcu_torture_init(void)  	if (cur_ops->init)  		cur_ops->init(); /* no "goto unwind" prior to this point!!! */ -	if (nreaders >= 0) +	if (nreaders >= 0) {  		nrealreaders = nreaders; -	else -		nrealreaders = 2 * num_online_cpus(); +	} else { +		nrealreaders = num_online_cpus() - 1; +		if (nrealreaders <= 0) +			nrealreaders = 1; +	}  	rcu_torture_print_module_parms(cur_ops, "Start of test");  	/* Set up the freelist. */ @@ -1533,7 +1657,8 @@ rcu_torture_init(void)  		fqs_duration = 0;  	if (fqs_duration) {  		/* Create the fqs thread */ -		torture_create_kthread(rcu_torture_fqs, NULL, fqs_task); +		firsterr = torture_create_kthread(rcu_torture_fqs, NULL, +						  fqs_task);  		if (firsterr)  			goto unwind;  	} diff --git a/kernel/rcu/tiny_plugin.h b/kernel/rcu/tiny_plugin.h index 431528520562..858c56569127 100644 --- a/kernel/rcu/tiny_plugin.h +++ b/kernel/rcu/tiny_plugin.h @@ -144,7 +144,7 @@ static void check_cpu_stall(struct rcu_ctrlblk *rcp)  		return;  	rcp->ticks_this_gp++;  	j = jiffies; -	js = rcp->jiffies_stall; +	js = ACCESS_ONCE(rcp->jiffies_stall);  	if (*rcp->curtail && ULONG_CMP_GE(j, js)) {  		pr_err("INFO: %s stall on CPU (%lu ticks this GP) idle=%llx (t=%lu jiffies q=%ld)\n",  		       rcp->name, rcp->ticks_this_gp, rcu_dynticks_nesting, @@ -152,17 +152,17 @@ static void check_cpu_stall(struct rcu_ctrlblk *rcp)  		dump_stack();  	}  	if (*rcp->curtail && ULONG_CMP_GE(j, js)) -		rcp->jiffies_stall = jiffies + +		ACCESS_ONCE(rcp->jiffies_stall) = jiffies +  			3 * rcu_jiffies_till_stall_check() + 3;  	else if (ULONG_CMP_GE(j, js)) -		rcp->jiffies_stall = jiffies + rcu_jiffies_till_stall_check(); +		ACCESS_ONCE(rcp->jiffies_stall) = jiffies + rcu_jiffies_till_stall_check();  }  static void reset_cpu_stall_ticks(struct rcu_ctrlblk *rcp)  {  	rcp->ticks_this_gp = 0;  	rcp->gp_start = jiffies; -	rcp->jiffies_stall = jiffies + rcu_jiffies_till_stall_check(); +	ACCESS_ONCE(rcp->jiffies_stall) = jiffies + rcu_jiffies_till_stall_check();  }  static void check_cpu_stalls(void) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 88b4a1dcb58c..f1ba77363fbb 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -101,7 +101,7 @@ DEFINE_PER_CPU(struct rcu_data, sname##_data)  RCU_STATE_INITIALIZER(rcu_sched, 's', call_rcu_sched);  
RCU_STATE_INITIALIZER(rcu_bh, 'b', call_rcu_bh); -static struct rcu_state *rcu_state; +static struct rcu_state *rcu_state_p;  LIST_HEAD(rcu_struct_flavors);  /* Increase (but not decrease) the CONFIG_RCU_FANOUT_LEAF at boot time. */ @@ -243,7 +243,7 @@ static ulong jiffies_till_next_fqs = ULONG_MAX;  module_param(jiffies_till_first_fqs, ulong, 0644);  module_param(jiffies_till_next_fqs, ulong, 0644); -static void rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp, +static bool rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp,  				  struct rcu_data *rdp);  static void force_qs_rnp(struct rcu_state *rsp,  			 int (*f)(struct rcu_data *rsp, bool *isidle, @@ -271,6 +271,15 @@ long rcu_batches_completed_bh(void)  EXPORT_SYMBOL_GPL(rcu_batches_completed_bh);  /* + * Force a quiescent state. + */ +void rcu_force_quiescent_state(void) +{ +	force_quiescent_state(rcu_state_p); +} +EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); + +/*   * Force a quiescent state for RCU BH.   */  void rcu_bh_force_quiescent_state(void) @@ -280,6 +289,21 @@ void rcu_bh_force_quiescent_state(void)  EXPORT_SYMBOL_GPL(rcu_bh_force_quiescent_state);  /* + * Show the state of the grace-period kthreads. + */ +void show_rcu_gp_kthreads(void) +{ +	struct rcu_state *rsp; + +	for_each_rcu_flavor(rsp) { +		pr_info("%s: wait state: %d ->state: %#lx\n", +			rsp->name, rsp->gp_state, rsp->gp_kthread->state); +		/* sched_show_task(rsp->gp_kthread); */ +	} +} +EXPORT_SYMBOL_GPL(show_rcu_gp_kthreads); + +/*   * Record the number of times rcutorture tests have been initiated and   * terminated.  This information allows the debugfs tracing stats to be   * correlated to the rcutorture messages, even when the rcutorture module @@ -294,6 +318,39 @@ void rcutorture_record_test_transition(void)  EXPORT_SYMBOL_GPL(rcutorture_record_test_transition);  /* + * Send along grace-period-related data for rcutorture diagnostics. + */ +void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags, +			    unsigned long *gpnum, unsigned long *completed) +{ +	struct rcu_state *rsp = NULL; + +	switch (test_type) { +	case RCU_FLAVOR: +		rsp = rcu_state_p; +		break; +	case RCU_BH_FLAVOR: +		rsp = &rcu_bh_state; +		break; +	case RCU_SCHED_FLAVOR: +		rsp = &rcu_sched_state; +		break; +	default: +		break; +	} +	if (rsp != NULL) { +		*flags = ACCESS_ONCE(rsp->gp_flags); +		*gpnum = ACCESS_ONCE(rsp->gpnum); +		*completed = ACCESS_ONCE(rsp->completed); +		return; +	} +	*flags = 0; +	*gpnum = 0; +	*completed = 0; +} +EXPORT_SYMBOL_GPL(rcutorture_get_gp_data); + +/*   * Record the number of writer passes through the current rcutorture test.   * This is also used to correlate debugfs tracing stats with the rcutorture   * messages. @@ -324,6 +381,28 @@ cpu_has_callbacks_ready_to_invoke(struct rcu_data *rdp)  }  /* + * Return the root node of the specified rcu_state structure. + */ +static struct rcu_node *rcu_get_root(struct rcu_state *rsp) +{ +	return &rsp->node[0]; +} + +/* + * Is there any need for future grace periods? + * Interrupts must be disabled.  If the caller does not hold the root + * rnp_node structure's ->lock, the results are advisory only. + */ +static int rcu_future_needs_gp(struct rcu_state *rsp) +{ +	struct rcu_node *rnp = rcu_get_root(rsp); +	int idx = (ACCESS_ONCE(rnp->completed) + 1) & 0x1; +	int *fp = &rnp->need_future_gp[idx]; + +	return ACCESS_ONCE(*fp); +} + +/*   * Does the current CPU require a not-yet-started grace period?   
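[Editor's note] rcu_future_needs_gp() above lets cpu_needs_another_gp() ask directly whether anyone has queued a request for a future grace period, replacing the rcu_nocb_needs_gp() hook. The toy model below shows the indexing trick: requests live in a two-slot array keyed by the low bit of the grace-period number being waited for; the structure and values are invented for the example.

#include <stdio.h>

struct rnp {
	unsigned long completed;	/* last completed grace period */
	int need_future_gp[2];		/* per-parity future-GP requests */
};

/* "Is a grace period after ->completed already requested?" */
static int future_needs_gp(const struct rnp *rnp)
{
	int idx = (rnp->completed + 1) & 0x1;

	return rnp->need_future_gp[idx];
}

int main(void)
{
	struct rnp rnp = { .completed = 41, .need_future_gp = { 1, 0 } };

	/* Grace period 42 is recorded in slot 42 & 0x1 == 0. */
	printf("need future GP? %d\n", future_needs_gp(&rnp));
	return 0;
}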
* The caller must have disabled interrupts to prevent races with   * normal callback registry. @@ -335,7 +414,7 @@ cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp)  	if (rcu_gp_in_progress(rsp))  		return 0;  /* No, a grace period is already in progress. */ -	if (rcu_nocb_needs_gp(rsp)) +	if (rcu_future_needs_gp(rsp))  		return 1;  /* Yes, a no-CBs CPU needs one. */  	if (!rdp->nxttail[RCU_NEXT_TAIL])  		return 0;  /* No, this is a no-CBs (or offline) CPU. */ @@ -350,14 +429,6 @@ cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp)  }  /* - * Return the root node of the specified rcu_state structure. - */ -static struct rcu_node *rcu_get_root(struct rcu_state *rsp) -{ -	return &rsp->node[0]; -} - -/*   * rcu_eqs_enter_common - current CPU is moving towards extended quiescent state   *   * If the new value of the ->dynticks_nesting counter now is zero, @@ -758,7 +829,12 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp,  {  	rdp->dynticks_snap = atomic_add_return(0, &rdp->dynticks->dynticks);  	rcu_sysidle_check_cpu(rdp, isidle, maxj); -	return (rdp->dynticks_snap & 0x1) == 0; +	if ((rdp->dynticks_snap & 0x1) == 0) { +		trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("dti")); +		return 1; +	} else { +		return 0; +	}  }  /* @@ -834,7 +910,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp,  	 * we will beat on the first one until it gets unstuck, then move  	 * to the next.  Only do this for the primary flavor of RCU.  	 */ -	if (rdp->rsp == rcu_state && +	if (rdp->rsp == rcu_state_p &&  	    ULONG_CMP_GE(jiffies, rdp->rsp->jiffies_resched)) {  		rdp->rsp->jiffies_resched += 5;  		resched_cpu(rdp->cpu); @@ -851,7 +927,7 @@ static void record_gp_stall_check_time(struct rcu_state *rsp)  	rsp->gp_start = j;  	smp_wmb(); /* Record start time before stall time. */  	j1 = rcu_jiffies_till_stall_check(); -	rsp->jiffies_stall = j + j1; +	ACCESS_ONCE(rsp->jiffies_stall) = j + j1;  	rsp->jiffies_resched = j + j1 / 2;  } @@ -890,12 +966,12 @@ static void print_other_cpu_stall(struct rcu_state *rsp)  	/* Only let one CPU complain about others per time interval. */  	raw_spin_lock_irqsave(&rnp->lock, flags); -	delta = jiffies - rsp->jiffies_stall; +	delta = jiffies - ACCESS_ONCE(rsp->jiffies_stall);  	if (delta < RCU_STALL_RAT_DELAY || !rcu_gp_in_progress(rsp)) {  		raw_spin_unlock_irqrestore(&rnp->lock, flags);  		return;  	} -	rsp->jiffies_stall = jiffies + 3 * rcu_jiffies_till_stall_check() + 3; +	ACCESS_ONCE(rsp->jiffies_stall) = jiffies + 3 * rcu_jiffies_till_stall_check() + 3;  	raw_spin_unlock_irqrestore(&rnp->lock, flags);  	/* @@ -932,9 +1008,9 @@ static void print_other_cpu_stall(struct rcu_state *rsp)  	print_cpu_stall_info_end();  	for_each_possible_cpu(cpu)  		totqlen += per_cpu_ptr(rsp->rda, cpu)->qlen; -	pr_cont("(detected by %d, t=%ld jiffies, g=%lu, c=%lu, q=%lu)\n", +	pr_cont("(detected by %d, t=%ld jiffies, g=%ld, c=%ld, q=%lu)\n",  	       smp_processor_id(), (long)(jiffies - rsp->gp_start), -	       rsp->gpnum, rsp->completed, totqlen); +	       (long)rsp->gpnum, (long)rsp->completed, totqlen);  	if (ndetected == 0)  		pr_err("INFO: Stall ended before state dump start\n");  	else if (!trigger_all_cpu_backtrace()) @@ -947,12 +1023,6 @@ static void print_other_cpu_stall(struct rcu_state *rsp)  	force_quiescent_state(rsp);  /* Kick them all. */  } -/* - * This function really isn't for public consumption, but RCU is special in - * that context switches can allow the state machine to make progress. 
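[Editor's note] Several hunks here wrap reads and writes of ->jiffies_stall (and, further down, ->gp_flags) in ACCESS_ONCE(), because those fields are accessed without the relevant lock held. The snippet below is only a reminder of what the annotation amounts to, a volatile-qualified access the compiler may not merge, tear or cache; the one-line macro is a simplified stand-in for the kernel's definition.

#include <stdio.h>

/* Simplified stand-in for the kernel's ACCESS_ONCE() annotation. */
#define ACCESS_ONCE(x) (*(volatile __typeof__(x) *)&(x))

static unsigned long jiffies_stall;

int main(void)
{
	ACCESS_ONCE(jiffies_stall) = 1021;	/* lockless writer side */
	printf("stall check at %lu\n", ACCESS_ONCE(jiffies_stall)); /* lockless reader */
	return 0;
}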
- */ -extern void resched_cpu(int cpu); -  static void print_cpu_stall(struct rcu_state *rsp)  {  	int cpu; @@ -971,14 +1041,15 @@ static void print_cpu_stall(struct rcu_state *rsp)  	print_cpu_stall_info_end();  	for_each_possible_cpu(cpu)  		totqlen += per_cpu_ptr(rsp->rda, cpu)->qlen; -	pr_cont(" (t=%lu jiffies g=%lu c=%lu q=%lu)\n", -		jiffies - rsp->gp_start, rsp->gpnum, rsp->completed, totqlen); +	pr_cont(" (t=%lu jiffies g=%ld c=%ld q=%lu)\n", +		jiffies - rsp->gp_start, +		(long)rsp->gpnum, (long)rsp->completed, totqlen);  	if (!trigger_all_cpu_backtrace())  		dump_stack();  	raw_spin_lock_irqsave(&rnp->lock, flags); -	if (ULONG_CMP_GE(jiffies, rsp->jiffies_stall)) -		rsp->jiffies_stall = jiffies + +	if (ULONG_CMP_GE(jiffies, ACCESS_ONCE(rsp->jiffies_stall))) +		ACCESS_ONCE(rsp->jiffies_stall) = jiffies +  				     3 * rcu_jiffies_till_stall_check() + 3;  	raw_spin_unlock_irqrestore(&rnp->lock, flags); @@ -1062,7 +1133,7 @@ void rcu_cpu_stall_reset(void)  	struct rcu_state *rsp;  	for_each_rcu_flavor(rsp) -		rsp->jiffies_stall = jiffies + ULONG_MAX / 2; +		ACCESS_ONCE(rsp->jiffies_stall) = jiffies + ULONG_MAX / 2;  }  /* @@ -1123,15 +1194,18 @@ static void trace_rcu_future_gp(struct rcu_node *rnp, struct rcu_data *rdp,  /*   * Start some future grace period, as needed to handle newly arrived   * callbacks.  The required future grace periods are recorded in each - * rcu_node structure's ->need_future_gp field. + * rcu_node structure's ->need_future_gp field.  Returns true if there + * is reason to awaken the grace-period kthread.   *   * The caller must hold the specified rcu_node structure's ->lock.   */ -static unsigned long __maybe_unused -rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp) +static bool __maybe_unused +rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp, +		    unsigned long *c_out)  {  	unsigned long c;  	int i; +	bool ret = false;  	struct rcu_node *rnp_root = rcu_get_root(rdp->rsp);  	/* @@ -1142,7 +1216,7 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp)  	trace_rcu_future_gp(rnp, rdp, c, TPS("Startleaf"));  	if (rnp->need_future_gp[c & 0x1]) {  		trace_rcu_future_gp(rnp, rdp, c, TPS("Prestartleaf")); -		return c; +		goto out;  	}  	/* @@ -1156,7 +1230,7 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp)  	    ACCESS_ONCE(rnp->gpnum) != ACCESS_ONCE(rnp->completed)) {  		rnp->need_future_gp[c & 0x1]++;  		trace_rcu_future_gp(rnp, rdp, c, TPS("Startedleaf")); -		return c; +		goto out;  	}  	/* @@ -1197,12 +1271,15 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp)  		trace_rcu_future_gp(rnp, rdp, c, TPS("Startedleafroot"));  	} else {  		trace_rcu_future_gp(rnp, rdp, c, TPS("Startedroot")); -		rcu_start_gp_advanced(rdp->rsp, rnp_root, rdp); +		ret = rcu_start_gp_advanced(rdp->rsp, rnp_root, rdp);  	}  unlock_out:  	if (rnp != rnp_root)  		raw_spin_unlock(&rnp_root->lock); -	return c; +out: +	if (c_out != NULL) +		*c_out = c; +	return ret;  }  /* @@ -1226,25 +1303,43 @@ static int rcu_future_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp)  }  /* + * Awaken the grace-period kthread for the specified flavor of RCU. + * Don't do a self-awaken, and don't bother awakening when there is + * nothing for the grace-period kthread to do (as in several CPUs + * raced to awaken, and we lost), and finally don't try to awaken + * a kthread that has not yet been created. 
+ */ +static void rcu_gp_kthread_wake(struct rcu_state *rsp) +{ +	if (current == rsp->gp_kthread || +	    !ACCESS_ONCE(rsp->gp_flags) || +	    !rsp->gp_kthread) +		return; +	wake_up(&rsp->gp_wq); +} + +/*   * If there is room, assign a ->completed number to any callbacks on   * this CPU that have not already been assigned.  Also accelerate any   * callbacks that were previously assigned a ->completed number that has   * since proven to be too conservative, which can happen if callbacks get   * assigned a ->completed number while RCU is idle, but with reference to   * a non-root rcu_node structure.  This function is idempotent, so it does - * not hurt to call it repeatedly. + * not hurt to call it repeatedly.  Returns an flag saying that we should + * awaken the RCU grace-period kthread.   *   * The caller must hold rnp->lock with interrupts disabled.   */ -static void rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp, +static bool rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp,  			       struct rcu_data *rdp)  {  	unsigned long c;  	int i; +	bool ret;  	/* If the CPU has no callbacks, nothing to do. */  	if (!rdp->nxttail[RCU_NEXT_TAIL] || !*rdp->nxttail[RCU_DONE_TAIL]) -		return; +		return false;  	/*  	 * Starting from the sublist containing the callbacks most @@ -1273,7 +1368,7 @@ static void rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp,  	 * be grouped into.  	 */  	if (++i >= RCU_NEXT_TAIL) -		return; +		return false;  	/*  	 * Assign all subsequent callbacks' ->completed number to the next @@ -1285,13 +1380,14 @@ static void rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp,  		rdp->nxtcompleted[i] = c;  	}  	/* Record any needed additional grace periods. */ -	rcu_start_future_gp(rnp, rdp); +	ret = rcu_start_future_gp(rnp, rdp, NULL);  	/* Trace depending on how much we were able to accelerate. */  	if (!*rdp->nxttail[RCU_WAIT_TAIL])  		trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("AccWaitCB"));  	else  		trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("AccReadyCB")); +	return ret;  }  /* @@ -1300,17 +1396,18 @@ static void rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp,   * assign ->completed numbers to any callbacks in the RCU_NEXT_TAIL   * sublist.  This function is idempotent, so it does not hurt to   * invoke it repeatedly.  As long as it is not invoked -too- often... + * Returns true if the RCU grace-period kthread needs to be awakened.   *   * The caller must hold rnp->lock with interrupts disabled.   */ -static void rcu_advance_cbs(struct rcu_state *rsp, struct rcu_node *rnp, +static bool rcu_advance_cbs(struct rcu_state *rsp, struct rcu_node *rnp,  			    struct rcu_data *rdp)  {  	int i, j;  	/* If the CPU has no callbacks, nothing to do. */  	if (!rdp->nxttail[RCU_NEXT_TAIL] || !*rdp->nxttail[RCU_DONE_TAIL]) -		return; +		return false;  	/*  	 * Find all callbacks whose ->completed numbers indicate that they @@ -1334,26 +1431,30 @@ static void rcu_advance_cbs(struct rcu_state *rsp, struct rcu_node *rnp,  	}  	/* Classify any remaining callbacks. */ -	rcu_accelerate_cbs(rsp, rnp, rdp); +	return rcu_accelerate_cbs(rsp, rnp, rdp);  }  /*   * Update CPU-local rcu_data state to record the beginnings and ends of   * grace periods.  The caller must hold the ->lock of the leaf rcu_node   * structure corresponding to the current CPU, and must have irqs disabled. + * Returns true if the grace-period kthread needs to be awakened.   
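[Editor's note] From here on, rcu_accelerate_cbs(), rcu_advance_cbs(), __note_gp_changes() and rcu_start_gp() all return a bool instead of waking the grace-period kthread themselves; the caller drops rnp->lock first and only then calls rcu_gp_kthread_wake(). The pthread sketch below shows the shape of that "decide under the lock, wake after unlock" pattern; the flag, mutex and condition variable are invented for the example.

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  wq   = PTHREAD_COND_INITIALIZER;
static bool gp_requested;

/* Decide, with the lock held, whether a wakeup will be needed. */
static bool request_gp_locked(void)
{
	if (gp_requested)
		return false;	/* already requested, no wakeup needed */
	gp_requested = true;
	return true;
}

int main(void)
{
	bool needwake;

	pthread_mutex_lock(&lock);
	needwake = request_gp_locked();
	pthread_mutex_unlock(&lock);

	/* The wakeup happens only after the lock is dropped. */
	if (needwake)
		pthread_cond_signal(&wq);

	printf("needwake=%d\n", needwake);
	return 0;
}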
*/ -static void __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp) +static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, +			      struct rcu_data *rdp)  { +	bool ret; +  	/* Handle the ends of any preceding grace periods first. */  	if (rdp->completed == rnp->completed) {  		/* No grace period end, so just accelerate recent callbacks. */ -		rcu_accelerate_cbs(rsp, rnp, rdp); +		ret = rcu_accelerate_cbs(rsp, rnp, rdp);  	} else {  		/* Advance callbacks. */ -		rcu_advance_cbs(rsp, rnp, rdp); +		ret = rcu_advance_cbs(rsp, rnp, rdp);  		/* Remember that we saw this grace-period completion. */  		rdp->completed = rnp->completed; @@ -1372,11 +1473,13 @@ static void __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, struc  		rdp->qs_pending = !!(rnp->qsmask & rdp->grpmask);  		zero_cpu_stall_ticks(rdp);  	} +	return ret;  }  static void note_gp_changes(struct rcu_state *rsp, struct rcu_data *rdp)  {  	unsigned long flags; +	bool needwake;  	struct rcu_node *rnp;  	local_irq_save(flags); @@ -1388,8 +1491,10 @@ static void note_gp_changes(struct rcu_state *rsp, struct rcu_data *rdp)  		return;  	}  	smp_mb__after_unlock_lock(); -	__note_gp_changes(rsp, rnp, rdp); +	needwake = __note_gp_changes(rsp, rnp, rdp);  	raw_spin_unlock_irqrestore(&rnp->lock, flags); +	if (needwake) +		rcu_gp_kthread_wake(rsp);  }  /* @@ -1403,12 +1508,12 @@ static int rcu_gp_init(struct rcu_state *rsp)  	rcu_bind_gp_kthread();  	raw_spin_lock_irq(&rnp->lock);  	smp_mb__after_unlock_lock(); -	if (rsp->gp_flags == 0) { +	if (!ACCESS_ONCE(rsp->gp_flags)) {  		/* Spurious wakeup, tell caller to go back to sleep.  */  		raw_spin_unlock_irq(&rnp->lock);  		return 0;  	} -	rsp->gp_flags = 0; /* Clear all flags: New grace period. */ +	ACCESS_ONCE(rsp->gp_flags) = 0; /* Clear all flags: New grace period. */  	if (WARN_ON_ONCE(rcu_gp_in_progress(rsp))) {  		/* @@ -1453,7 +1558,7 @@ static int rcu_gp_init(struct rcu_state *rsp)  		WARN_ON_ONCE(rnp->completed != rsp->completed);  		ACCESS_ONCE(rnp->completed) = rsp->completed;  		if (rnp == rdp->mynode) -			__note_gp_changes(rsp, rnp, rdp); +			(void)__note_gp_changes(rsp, rnp, rdp);  		rcu_preempt_boost_start_gp(rnp);  		trace_rcu_grace_period_init(rsp->name, rnp->gpnum,  					    rnp->level, rnp->grplo, @@ -1501,7 +1606,7 @@ static int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in)  	if (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) {  		raw_spin_lock_irq(&rnp->lock);  		smp_mb__after_unlock_lock(); -		rsp->gp_flags &= ~RCU_GP_FLAG_FQS; +		ACCESS_ONCE(rsp->gp_flags) &= ~RCU_GP_FLAG_FQS;  		raw_spin_unlock_irq(&rnp->lock);  	}  	return fqs_state; @@ -1513,6 +1618,7 @@ static int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in)  static void rcu_gp_cleanup(struct rcu_state *rsp)  {  	unsigned long gp_duration; +	bool needgp = false;  	int nocb = 0;  	struct rcu_data *rdp;  	struct rcu_node *rnp = rcu_get_root(rsp); @@ -1548,7 +1654,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)  		ACCESS_ONCE(rnp->completed) = rsp->gpnum;  		rdp = this_cpu_ptr(rsp->rda);  		if (rnp == rdp->mynode) -			__note_gp_changes(rsp, rnp, rdp); +			needgp = __note_gp_changes(rsp, rnp, rdp) || needgp;  		/* smp_mb() provided by prior unlock-lock pair. 
*/  		nocb += rcu_future_gp_cleanup(rsp, rnp);  		raw_spin_unlock_irq(&rnp->lock); @@ -1564,9 +1670,10 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)  	trace_rcu_grace_period(rsp->name, rsp->completed, TPS("end"));  	rsp->fqs_state = RCU_GP_IDLE;  	rdp = this_cpu_ptr(rsp->rda); -	rcu_advance_cbs(rsp, rnp, rdp);  /* Reduce false positives below. */ -	if (cpu_needs_another_gp(rsp, rdp)) { -		rsp->gp_flags = RCU_GP_FLAG_INIT; +	/* Advance CBs to reduce false positives below. */ +	needgp = rcu_advance_cbs(rsp, rnp, rdp) || needgp; +	if (needgp || cpu_needs_another_gp(rsp, rdp)) { +		ACCESS_ONCE(rsp->gp_flags) = RCU_GP_FLAG_INIT;  		trace_rcu_grace_period(rsp->name,  				       ACCESS_ONCE(rsp->gpnum),  				       TPS("newreq")); @@ -1593,6 +1700,7 @@ static int __noreturn rcu_gp_kthread(void *arg)  			trace_rcu_grace_period(rsp->name,  					       ACCESS_ONCE(rsp->gpnum),  					       TPS("reqwait")); +			rsp->gp_state = RCU_GP_WAIT_GPS;  			wait_event_interruptible(rsp->gp_wq,  						 ACCESS_ONCE(rsp->gp_flags) &  						 RCU_GP_FLAG_INIT); @@ -1620,6 +1728,7 @@ static int __noreturn rcu_gp_kthread(void *arg)  			trace_rcu_grace_period(rsp->name,  					       ACCESS_ONCE(rsp->gpnum),  					       TPS("fqswait")); +			rsp->gp_state = RCU_GP_WAIT_FQS;  			ret = wait_event_interruptible_timeout(rsp->gp_wq,  					((gf = ACCESS_ONCE(rsp->gp_flags)) &  					 RCU_GP_FLAG_FQS) || @@ -1665,14 +1774,6 @@ static int __noreturn rcu_gp_kthread(void *arg)  	}  } -static void rsp_wakeup(struct irq_work *work) -{ -	struct rcu_state *rsp = container_of(work, struct rcu_state, wakeup_work); - -	/* Wake up rcu_gp_kthread() to start the grace period. */ -	wake_up(&rsp->gp_wq); -} -  /*   * Start a new RCU grace period if warranted, re-initializing the hierarchy   * in preparation for detecting the next grace period.  The caller must hold @@ -1681,8 +1782,10 @@ static void rsp_wakeup(struct irq_work *work)   * Note that it is legal for a dying CPU (which is marked as offline) to   * invoke this function.  This can happen when the dying CPU reports its   * quiescent state. + * + * Returns true if the grace-period kthread must be awakened.   */ -static void +static bool  rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp,  		      struct rcu_data *rdp)  { @@ -1693,20 +1796,18 @@ rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp,  		 * or a grace period is already in progress.  		 * Either way, don't start a new grace period.  		 */ -		return; +		return false;  	} -	rsp->gp_flags = RCU_GP_FLAG_INIT; +	ACCESS_ONCE(rsp->gp_flags) = RCU_GP_FLAG_INIT;  	trace_rcu_grace_period(rsp->name, ACCESS_ONCE(rsp->gpnum),  			       TPS("newreq"));  	/*  	 * We can't do wakeups while holding the rnp->lock, as that  	 * could cause possible deadlocks with the rq->lock. Defer -	 * the wakeup to interrupt context.  And don't bother waking -	 * up the running kthread. +	 * the wakeup to our caller.  	 */ -	if (current != rsp->gp_kthread) -		irq_work_queue(&rsp->wakeup_work); +	return true;  }  /* @@ -1715,12 +1816,14 @@ rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp,   * is invoked indirectly from rcu_advance_cbs(), which would result in   * endless recursion -- or would do so if it wasn't for the self-deadlock   * that is encountered beforehand. + * + * Returns true if the grace-period kthread needs to be awakened.   
*/ -static void -rcu_start_gp(struct rcu_state *rsp) +static bool rcu_start_gp(struct rcu_state *rsp)  {  	struct rcu_data *rdp = this_cpu_ptr(rsp->rda);  	struct rcu_node *rnp = rcu_get_root(rsp); +	bool ret = false;  	/*  	 * If there is no grace period in progress right now, any @@ -1730,8 +1833,9 @@ rcu_start_gp(struct rcu_state *rsp)  	 * resulting in pointless grace periods.  So, advance callbacks  	 * then start the grace period!  	 */ -	rcu_advance_cbs(rsp, rnp, rdp); -	rcu_start_gp_advanced(rsp, rnp, rdp); +	ret = rcu_advance_cbs(rsp, rnp, rdp) || ret; +	ret = rcu_start_gp_advanced(rsp, rnp, rdp) || ret; +	return ret;  }  /* @@ -1820,6 +1924,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp)  {  	unsigned long flags;  	unsigned long mask; +	bool needwake;  	struct rcu_node *rnp;  	rnp = rdp->mynode; @@ -1848,9 +1953,11 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp)  		 * This GP can't end until cpu checks in, so all of our  		 * callbacks can be processed during the next GP.  		 */ -		rcu_accelerate_cbs(rsp, rnp, rdp); +		needwake = rcu_accelerate_cbs(rsp, rnp, rdp);  		rcu_report_qs_rnp(mask, rsp, rnp, flags); /* rlses rnp->lock */ +		if (needwake) +			rcu_gp_kthread_wake(rsp);  	}  } @@ -1951,7 +2058,7 @@ rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp,  static void rcu_adopt_orphan_cbs(struct rcu_state *rsp, unsigned long flags)  {  	int i; -	struct rcu_data *rdp = __this_cpu_ptr(rsp->rda); +	struct rcu_data *rdp = raw_cpu_ptr(rsp->rda);  	/* No-CBs CPUs are handled specially. */  	if (rcu_nocb_adopt_orphan_cbs(rsp, rdp, flags)) @@ -2320,7 +2427,7 @@ static void force_quiescent_state(struct rcu_state *rsp)  		raw_spin_unlock_irqrestore(&rnp_old->lock, flags);  		return;  /* Someone beat us to it. */  	} -	rsp->gp_flags |= RCU_GP_FLAG_FQS; +	ACCESS_ONCE(rsp->gp_flags) |= RCU_GP_FLAG_FQS;  	raw_spin_unlock_irqrestore(&rnp_old->lock, flags);  	wake_up(&rsp->gp_wq);  /* Memory barrier implied by wake_up() path. */  } @@ -2334,7 +2441,8 @@ static void  __rcu_process_callbacks(struct rcu_state *rsp)  {  	unsigned long flags; -	struct rcu_data *rdp = __this_cpu_ptr(rsp->rda); +	bool needwake; +	struct rcu_data *rdp = raw_cpu_ptr(rsp->rda);  	WARN_ON_ONCE(rdp->beenonline == 0); @@ -2345,8 +2453,10 @@ __rcu_process_callbacks(struct rcu_state *rsp)  	local_irq_save(flags);  	if (cpu_needs_another_gp(rsp, rdp)) {  		raw_spin_lock(&rcu_get_root(rsp)->lock); /* irqs disabled. */ -		rcu_start_gp(rsp); +		needwake = rcu_start_gp(rsp);  		raw_spin_unlock_irqrestore(&rcu_get_root(rsp)->lock, flags); +		if (needwake) +			rcu_gp_kthread_wake(rsp);  	} else {  		local_irq_restore(flags);  	} @@ -2404,6 +2514,8 @@ static void invoke_rcu_core(void)  static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp,  			    struct rcu_head *head, unsigned long flags)  { +	bool needwake; +  	/*  	 * If called from an extended quiescent state, invoke the RCU  	 * core in order to force a re-evaluation of RCU's idleness. @@ -2433,8 +2545,10 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp,  			raw_spin_lock(&rnp_root->lock);  			smp_mb__after_unlock_lock(); -			rcu_start_gp(rsp); +			needwake = rcu_start_gp(rsp);  			raw_spin_unlock(&rnp_root->lock); +			if (needwake) +				rcu_gp_kthread_wake(rsp);  		} else {  			/* Give the grace period a kick. 
*/  			rdp->blimit = LONG_MAX; @@ -2537,6 +2651,20 @@ void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu))  EXPORT_SYMBOL_GPL(call_rcu_bh);  /* + * Queue an RCU callback for lazy invocation after a grace period. + * This will likely be later named something like "call_rcu_lazy()", + * but this change will require some way of tagging the lazy RCU + * callbacks in the list of pending callbacks. Until then, this + * function may only be called from __kfree_rcu(). + */ +void kfree_call_rcu(struct rcu_head *head, +		    void (*func)(struct rcu_head *rcu)) +{ +	__call_rcu(head, func, rcu_state_p, -1, 1); +} +EXPORT_SYMBOL_GPL(kfree_call_rcu); + +/*   * Because a context switch is a grace period for RCU-sched and RCU-bh,   * any blocking grace-period wait automatically implies a grace period   * if there is only one CPU online at any point time during execution @@ -2659,7 +2787,7 @@ unsigned long get_state_synchronize_rcu(void)  	 * time-consuming work between get_state_synchronize_rcu()  	 * and cond_synchronize_rcu().  	 */ -	return smp_load_acquire(&rcu_state->gpnum); +	return smp_load_acquire(&rcu_state_p->gpnum);  }  EXPORT_SYMBOL_GPL(get_state_synchronize_rcu); @@ -2685,7 +2813,7 @@ void cond_synchronize_rcu(unsigned long oldstate)  	 * Ensure that this load happens before any RCU-destructive  	 * actions the caller might carry out after we return.  	 */ -	newstate = smp_load_acquire(&rcu_state->completed); +	newstate = smp_load_acquire(&rcu_state_p->completed);  	if (ULONG_CMP_GE(oldstate, newstate))  		synchronize_rcu();  } @@ -2988,7 +3116,7 @@ static void rcu_barrier_callback(struct rcu_head *rhp)  static void rcu_barrier_func(void *type)  {  	struct rcu_state *rsp = type; -	struct rcu_data *rdp = __this_cpu_ptr(rsp->rda); +	struct rcu_data *rdp = raw_cpu_ptr(rsp->rda);  	_rcu_barrier_trace(rsp, "IRQ", -1, rsp->n_barrier_done);  	atomic_inc(&rsp->barrier_cpu_count); @@ -3160,7 +3288,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)   * that this CPU cannot possibly have any RCU callbacks in flight yet.   */  static void -rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible) +rcu_init_percpu_data(int cpu, struct rcu_state *rsp)  {  	unsigned long flags;  	unsigned long mask; @@ -3173,7 +3301,6 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible)  	/* Set up local state, ensuring consistent view of global state. */  	raw_spin_lock_irqsave(&rnp->lock, flags);  	rdp->beenonline = 1;	 /* We have now been online. 
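[Editor's note] get_state_synchronize_rcu() and cond_synchronize_rcu() above now read their counters through rcu_state_p, which is what lets rcutorture's new RTWS_COND_GET/RTWS_COND_SYNC path exercise them for whichever flavor is primary. The model below captures the snapshot-then-compare idea, including a wrap-safe comparison in the style of ULONG_CMP_GE(); the counters and the printf standing in for synchronize_rcu() are not the kernel's.

#include <limits.h>
#include <stdio.h>

/* Wrap-safe "a >= b" on free-running counters. */
#define ULONG_CMP_GE(a, b)	(ULONG_MAX / 2 >= (a) - (b))

static unsigned long gpnum;	/* grace periods started */
static unsigned long completed;	/* grace periods finished */

/* Model of get_state_synchronize_rcu(): snapshot the current GP number. */
static unsigned long get_state(void)
{
	return gpnum;
}

/*
 * Model of cond_synchronize_rcu(): if no full grace period has elapsed
 * since the snapshot, fall back to a blocking wait (a message here).
 */
static void cond_sync(unsigned long oldstate)
{
	if (ULONG_CMP_GE(oldstate, completed))
		printf("no GP elapsed since snapshot %lu: must block\n", oldstate);
	else
		printf("GP elapsed since snapshot %lu: return at once\n", oldstate);
}

int main(void)
{
	unsigned long snap;

	gpnum = completed = 100;
	snap = get_state();
	cond_sync(snap);		/* must block */

	gpnum = completed = 102;	/* two grace periods later */
	cond_sync(snap);		/* returns immediately */
	return 0;
}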
*/ -	rdp->preemptible = preemptible;  	rdp->qlen_last_fqs_check = 0;  	rdp->n_force_qs_snap = rsp->n_force_qs;  	rdp->blimit = blimit; @@ -3217,8 +3344,7 @@ static void rcu_prepare_cpu(int cpu)  	struct rcu_state *rsp;  	for_each_rcu_flavor(rsp) -		rcu_init_percpu_data(cpu, rsp, -				     strcmp(rsp->name, "rcu_preempt") == 0); +		rcu_init_percpu_data(cpu, rsp);  }  /* @@ -3228,7 +3354,7 @@ static int rcu_cpu_notify(struct notifier_block *self,  				    unsigned long action, void *hcpu)  {  	long cpu = (long)hcpu; -	struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu); +	struct rcu_data *rdp = per_cpu_ptr(rcu_state_p->rda, cpu);  	struct rcu_node *rnp = rdp->mynode;  	struct rcu_state *rsp; @@ -3402,8 +3528,8 @@ static void __init rcu_init_one(struct rcu_state *rsp,  			rnp->qsmaskinit = 0;  			rnp->grplo = j * cpustride;  			rnp->grphi = (j + 1) * cpustride - 1; -			if (rnp->grphi >= NR_CPUS) -				rnp->grphi = NR_CPUS - 1; +			if (rnp->grphi >= nr_cpu_ids) +				rnp->grphi = nr_cpu_ids - 1;  			if (i == 0) {  				rnp->grpnum = 0;  				rnp->grpmask = 0; @@ -3422,7 +3548,6 @@ static void __init rcu_init_one(struct rcu_state *rsp,  	rsp->rda = rda;  	init_waitqueue_head(&rsp->gp_wq); -	init_irq_work(&rsp->wakeup_work, rsp_wakeup);  	rnp = rsp->level[rcu_num_lvls - 1];  	for_each_possible_cpu(i) {  		while (i > rnp->grphi) diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 75dc3c39a02a..bf2c1e669691 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -252,7 +252,6 @@ struct rcu_data {  	bool		passed_quiesce;	/* User-mode/idle loop etc. */  	bool		qs_pending;	/* Core waits for quiesc state. */  	bool		beenonline;	/* CPU online at least once. */ -	bool		preemptible;	/* Preemptible RCU? */  	struct rcu_node *mynode;	/* This CPU's leaf of hierarchy */  	unsigned long grpmask;		/* Mask to apply to leaf qsmask. */  #ifdef CONFIG_RCU_CPU_STALL_INFO @@ -406,7 +405,8 @@ struct rcu_state {  	unsigned long completed;		/* # of last completed gp. */  	struct task_struct *gp_kthread;		/* Task for grace periods. */  	wait_queue_head_t gp_wq;		/* Where GP task waits. */ -	int gp_flags;				/* Commands for GP task. */ +	short gp_flags;				/* Commands for GP task. */ +	short gp_state;				/* GP kthread sleep state. */  	/* End of fields guarded by root rcu_node's lock. */ @@ -462,13 +462,17 @@ struct rcu_state {  	const char *name;			/* Name of structure. */  	char abbr;				/* Abbreviated name. */  	struct list_head flavors;		/* List of RCU flavors. */ -	struct irq_work wakeup_work;		/* Postponed wakeups */  };  /* Values for rcu_state structure's gp_flags field. */  #define RCU_GP_FLAG_INIT 0x1	/* Need grace-period initialization. */  #define RCU_GP_FLAG_FQS  0x2	/* Need grace-period quiescent-state forcing. */ +/* Values for rcu_state structure's gp_flags field. */ +#define RCU_GP_WAIT_INIT 0	/* Initial state. */ +#define RCU_GP_WAIT_GPS  1	/* Wait for grace-period start. */ +#define RCU_GP_WAIT_FQS  2	/* Wait for force-quiescent-state time. */ +  extern struct list_head rcu_struct_flavors;  /* Sequence through rcu_state structures for each RCU flavor. 
*/ @@ -547,7 +551,6 @@ static void print_cpu_stall_info(struct rcu_state *rsp, int cpu);  static void print_cpu_stall_info_end(void);  static void zero_cpu_stall_ticks(struct rcu_data *rdp);  static void increment_cpu_stall_ticks(void); -static int rcu_nocb_needs_gp(struct rcu_state *rsp);  static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq);  static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp);  static void rcu_init_one_nocb(struct rcu_node *rnp); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 56db2f853e43..cbc2c45265e2 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -116,7 +116,7 @@ static void __init rcu_bootup_announce_oddness(void)  #ifdef CONFIG_TREE_PREEMPT_RCU  RCU_STATE_INITIALIZER(rcu_preempt, 'p', call_rcu); -static struct rcu_state *rcu_state = &rcu_preempt_state; +static struct rcu_state *rcu_state_p = &rcu_preempt_state;  static int rcu_preempted_readers_exp(struct rcu_node *rnp); @@ -149,15 +149,6 @@ long rcu_batches_completed(void)  EXPORT_SYMBOL_GPL(rcu_batches_completed);  /* - * Force a quiescent state for preemptible RCU. - */ -void rcu_force_quiescent_state(void) -{ -	force_quiescent_state(&rcu_preempt_state); -} -EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); - -/*   * Record a preemptible-RCU quiescent state for the specified CPU.  Note   * that this just means that the task currently running on the CPU is   * not in a quiescent state.  There might be any number of tasks blocked @@ -688,20 +679,6 @@ void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))  }  EXPORT_SYMBOL_GPL(call_rcu); -/* - * Queue an RCU callback for lazy invocation after a grace period. - * This will likely be later named something like "call_rcu_lazy()", - * but this change will require some way of tagging the lazy RCU - * callbacks in the list of pending callbacks.  Until then, this - * function may only be called from __kfree_rcu(). - */ -void kfree_call_rcu(struct rcu_head *head, -		    void (*func)(struct rcu_head *rcu)) -{ -	__call_rcu(head, func, &rcu_preempt_state, -1, 1); -} -EXPORT_SYMBOL_GPL(kfree_call_rcu); -  /**   * synchronize_rcu - wait until a grace period has elapsed.   * @@ -970,7 +947,7 @@ void exit_rcu(void)  #else /* #ifdef CONFIG_TREE_PREEMPT_RCU */ -static struct rcu_state *rcu_state = &rcu_sched_state; +static struct rcu_state *rcu_state_p = &rcu_sched_state;  /*   * Tell them what RCU they are running. @@ -991,16 +968,6 @@ long rcu_batches_completed(void)  EXPORT_SYMBOL_GPL(rcu_batches_completed);  /* - * Force a quiescent state for RCU, which, because there is no preemptible - * RCU, becomes the same as rcu-sched. - */ -void rcu_force_quiescent_state(void) -{ -	rcu_sched_force_quiescent_state(); -} -EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); - -/*   * Because preemptible RCU does not exist, we never have to check for   * CPUs being in quiescent states.   */ @@ -1080,22 +1047,6 @@ static void rcu_preempt_check_callbacks(int cpu)  }  /* - * Queue an RCU callback for lazy invocation after a grace period. - * This will likely be later named something like "call_rcu_lazy()", - * but this change will require some way of tagging the lazy RCU - * callbacks in the list of pending callbacks.  Until then, this - * function may only be called from __kfree_rcu(). - * - * Because there is no preemptible RCU, we use RCU-sched instead. 
- */ -void kfree_call_rcu(struct rcu_head *head, -		    void (*func)(struct rcu_head *rcu)) -{ -	__call_rcu(head, func, &rcu_sched_state, -1, 1); -} -EXPORT_SYMBOL_GPL(kfree_call_rcu); - -/*   * Wait for an rcu-preempt grace period, but make it happen quickly.   * But because preemptible RCU does not exist, map to rcu-sched.   */ @@ -1517,11 +1468,11 @@ static int __init rcu_spawn_kthreads(void)  	for_each_possible_cpu(cpu)  		per_cpu(rcu_cpu_has_work, cpu) = 0;  	BUG_ON(smpboot_register_percpu_thread(&rcu_cpu_thread_spec)); -	rnp = rcu_get_root(rcu_state); -	(void)rcu_spawn_one_boost_kthread(rcu_state, rnp); +	rnp = rcu_get_root(rcu_state_p); +	(void)rcu_spawn_one_boost_kthread(rcu_state_p, rnp);  	if (NUM_RCU_NODES > 1) { -		rcu_for_each_leaf_node(rcu_state, rnp) -			(void)rcu_spawn_one_boost_kthread(rcu_state, rnp); +		rcu_for_each_leaf_node(rcu_state_p, rnp) +			(void)rcu_spawn_one_boost_kthread(rcu_state_p, rnp);  	}  	return 0;  } @@ -1529,12 +1480,12 @@ early_initcall(rcu_spawn_kthreads);  static void rcu_prepare_kthreads(int cpu)  { -	struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu); +	struct rcu_data *rdp = per_cpu_ptr(rcu_state_p->rda, cpu);  	struct rcu_node *rnp = rdp->mynode;  	/* Fire up the incoming CPU's kthread and leaf rcu_node kthread. */  	if (rcu_scheduler_fully_active) -		(void)rcu_spawn_one_boost_kthread(rcu_state, rnp); +		(void)rcu_spawn_one_boost_kthread(rcu_state_p, rnp);  }  #else /* #ifdef CONFIG_RCU_BOOST */ @@ -1744,6 +1695,7 @@ int rcu_needs_cpu(int cpu, unsigned long *dj)  static void rcu_prepare_for_idle(int cpu)  {  #ifndef CONFIG_RCU_NOCB_CPU_ALL +	bool needwake;  	struct rcu_data *rdp;  	struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);  	struct rcu_node *rnp; @@ -1792,8 +1744,10 @@ static void rcu_prepare_for_idle(int cpu)  		rnp = rdp->mynode;  		raw_spin_lock(&rnp->lock); /* irqs already disabled. */  		smp_mb__after_unlock_lock(); -		rcu_accelerate_cbs(rsp, rnp, rdp); +		needwake = rcu_accelerate_cbs(rsp, rnp, rdp);  		raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ +		if (needwake) +			rcu_gp_kthread_wake(rsp);  	}  #endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */  } @@ -1855,7 +1809,7 @@ static void rcu_oom_notify_cpu(void *unused)  	struct rcu_data *rdp;  	for_each_rcu_flavor(rsp) { -		rdp = __this_cpu_ptr(rsp->rda); +		rdp = raw_cpu_ptr(rsp->rda);  		if (rdp->qlen_lazy != 0) {  			atomic_inc(&oom_callback_count);  			rsp->call(&rdp->oom_head, rcu_oom_callback); @@ -1997,7 +1951,7 @@ static void increment_cpu_stall_ticks(void)  	struct rcu_state *rsp;  	for_each_rcu_flavor(rsp) -		__this_cpu_ptr(rsp->rda)->ticks_this_gp++; +		raw_cpu_inc(rsp->rda->ticks_this_gp);  }  #else /* #ifdef CONFIG_RCU_CPU_STALL_INFO */ @@ -2068,19 +2022,6 @@ static int __init parse_rcu_nocb_poll(char *arg)  early_param("rcu_nocb_poll", parse_rcu_nocb_poll);  /* - * Do any no-CBs CPUs need another grace period? - * - * Interrupts must be disabled.  If the caller does not hold the root - * rnp_node structure's ->lock, the results are advisory only. - */ -static int rcu_nocb_needs_gp(struct rcu_state *rsp) -{ -	struct rcu_node *rnp = rcu_get_root(rsp); - -	return rnp->need_future_gp[(ACCESS_ONCE(rnp->completed) + 1) & 0x1]; -} - -/*   * Wake up any no-CBs CPUs' kthreads that were waiting on the just-ended   * grace period.   */ @@ -2109,7 +2050,7 @@ static void rcu_init_one_nocb(struct rcu_node *rnp)  }  #ifndef CONFIG_RCU_NOCB_CPU_ALL -/* Is the specified CPU a no-CPUs CPU? */ +/* Is the specified CPU a no-CBs CPU? 
*/  bool rcu_is_nocb_cpu(int cpu)  {  	if (have_rcu_nocb_mask) @@ -2243,12 +2184,15 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp)  	unsigned long c;  	bool d;  	unsigned long flags; +	bool needwake;  	struct rcu_node *rnp = rdp->mynode;  	raw_spin_lock_irqsave(&rnp->lock, flags);  	smp_mb__after_unlock_lock(); -	c = rcu_start_future_gp(rnp, rdp); +	needwake = rcu_start_future_gp(rnp, rdp, &c);  	raw_spin_unlock_irqrestore(&rnp->lock, flags); +	if (needwake) +		rcu_gp_kthread_wake(rdp->rsp);  	/*  	 * Wait for the grace period.  Do so interruptibly to avoid messing @@ -2402,11 +2346,6 @@ static bool init_nocb_callback_list(struct rcu_data *rdp)  #else /* #ifdef CONFIG_RCU_NOCB_CPU */ -static int rcu_nocb_needs_gp(struct rcu_state *rsp) -{ -	return 0; -} -  static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp)  {  } @@ -2657,20 +2596,6 @@ static bool is_sysidle_rcu_state(struct rcu_state *rsp)  }  /* - * Bind the grace-period kthread for the sysidle flavor of RCU to the - * timekeeping CPU. - */ -static void rcu_bind_gp_kthread(void) -{ -	int cpu = ACCESS_ONCE(tick_do_timer_cpu); - -	if (cpu < 0 || cpu >= nr_cpu_ids) -		return; -	if (raw_smp_processor_id() != cpu) -		set_cpus_allowed_ptr(current, cpumask_of(cpu)); -} - -/*   * Return a delay in jiffies based on the number of CPUs, rcu_node   * leaf fanout, and jiffies tick rate.  The idea is to allow larger   * systems more time to transition to full-idle state in order to @@ -2734,7 +2659,8 @@ static void rcu_sysidle(unsigned long j)  static void rcu_sysidle_cancel(void)  {  	smp_mb(); -	ACCESS_ONCE(full_sysidle_state) = RCU_SYSIDLE_NOT; +	if (full_sysidle_state > RCU_SYSIDLE_SHORT) +		ACCESS_ONCE(full_sysidle_state) = RCU_SYSIDLE_NOT;  }  /* @@ -2880,10 +2806,6 @@ static bool is_sysidle_rcu_state(struct rcu_state *rsp)  	return false;  } -static void rcu_bind_gp_kthread(void) -{ -} -  static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle,  				  unsigned long maxj)  { @@ -2914,3 +2836,19 @@ static bool rcu_nohz_full_cpu(struct rcu_state *rsp)  #endif /* #ifdef CONFIG_NO_HZ_FULL */  	return 0;  } + +/* + * Bind the grace-period kthread for the sysidle flavor of RCU to the + * timekeeping CPU. + */ +static void rcu_bind_gp_kthread(void) +{ +#ifdef CONFIG_NO_HZ_FULL +	int cpu = ACCESS_ONCE(tick_do_timer_cpu); + +	if (cpu < 0 || cpu >= nr_cpu_ids) +		return; +	if (raw_smp_processor_id() != cpu) +		set_cpus_allowed_ptr(current, cpumask_of(cpu)); +#endif /* #ifdef CONFIG_NO_HZ_FULL */ +} diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 4c0a9b0af469..a2aeb4df0f60 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -320,6 +320,18 @@ int rcu_jiffies_till_stall_check(void)  	return till_stall_check * HZ + RCU_STALL_DELAY_DELTA;  } +void rcu_sysrq_start(void) +{ +	if (!rcu_cpu_stall_suppress) +		rcu_cpu_stall_suppress = 2; +} + +void rcu_sysrq_end(void) +{ +	if (rcu_cpu_stall_suppress == 2) +		rcu_cpu_stall_suppress = 0; +} +  static int rcu_panic(struct notifier_block *this, unsigned long ev, void *ptr)  {  	rcu_cpu_stall_suppress = 1; @@ -338,3 +350,21 @@ static int __init check_cpu_stall_init(void)  early_initcall(check_cpu_stall_init);  #endif /* #ifdef CONFIG_RCU_STALL_COMMON */ + +/* + * Hooks for cond_resched() and friends to avoid RCU CPU stall warnings. + */ + +DEFINE_PER_CPU(int, rcu_cond_resched_count); + +/* + * Report a set of RCU quiescent states, for use by cond_resched() + * and friends.  Out of line due to being called infrequently. 
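rcu_sysrq_start()/rcu_sysrq_end() above suppress RCU CPU stall warnings for the duration of a sysrq dump, but only restore the flag when it still holds their own temporary value, so an explicit suppression set elsewhere is never clobbered. A compact sketch of that save-without-clobber idiom, with illustrative names:

#include <stdio.h>

/* 0 = warnings on, 1 = explicitly suppressed, 2 = temporarily suppressed */
static int stall_suppress;

static void sysrq_start(void)
{
        if (!stall_suppress)
                stall_suppress = 2;     /* only if nothing else suppressed */
}

static void sysrq_end(void)
{
        if (stall_suppress == 2)
                stall_suppress = 0;     /* restore only our own setting */
}

int main(void)
{
        sysrq_start();
        printf("during sysrq: %d\n", stall_suppress);
        sysrq_end();
        printf("after sysrq:  %d\n", stall_suppress);

        stall_suppress = 1;             /* user turned warnings off */
        sysrq_start();
        sysrq_end();
        printf("user setting preserved: %d\n", stall_suppress);
        return 0;
}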
+ */ +void rcu_resched(void) +{ +	preempt_disable(); +	__this_cpu_write(rcu_cond_resched_count, 0); +	rcu_note_context_switch(smp_processor_id()); +	preempt_enable(); +} diff --git a/kernel/reboot.c b/kernel/reboot.c index 662c83fc16b7..a3a9e240fcdb 100644 --- a/kernel/reboot.c +++ b/kernel/reboot.c @@ -388,15 +388,22 @@ static int __init reboot_setup(char *str)  			break;  		case 's': -			if (isdigit(*(str+1))) -				reboot_cpu = simple_strtoul(str+1, NULL, 0); -			else if (str[1] == 'm' && str[2] == 'p' && -							isdigit(*(str+3))) -				reboot_cpu = simple_strtoul(str+3, NULL, 0); -			else +		{ +			int rc; + +			if (isdigit(*(str+1))) { +				rc = kstrtoint(str+1, 0, &reboot_cpu); +				if (rc) +					return rc; +			} else if (str[1] == 'm' && str[2] == 'p' && +				   isdigit(*(str+3))) { +				rc = kstrtoint(str+3, 0, &reboot_cpu); +				if (rc) +					return rc; +			} else  				reboot_mode = REBOOT_SOFT;  			break; - +		}  		case 'g':  			reboot_mode = REBOOT_GPIO;  			break; diff --git a/kernel/res_counter.c b/kernel/res_counter.c index 51dbac6a3633..e791130f85a7 100644 --- a/kernel/res_counter.c +++ b/kernel/res_counter.c @@ -186,8 +186,11 @@ int res_counter_memparse_write_strategy(const char *buf,  	/* return RES_COUNTER_MAX(unlimited) if "-1" is specified */  	if (*buf == '-') { -		res = simple_strtoull(buf + 1, &end, 10); -		if (res != 1 || *end != '\0') +		int rc = kstrtoull(buf + 1, 10, &res); + +		if (rc) +			return rc; +		if (res != 1)  			return -EINVAL;  		*resp = RES_COUNTER_MAX;  		return 0; diff --git a/kernel/resource.c b/kernel/resource.c index 8957d686e29b..3c2237ac32db 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -1288,13 +1288,10 @@ int iomem_map_sanity_check(resource_size_t addr, unsigned long size)  		if (p->flags & IORESOURCE_BUSY)  			continue; -		printk(KERN_WARNING "resource map sanity check conflict: " -		       "0x%llx 0x%llx 0x%llx 0x%llx %s\n", +		printk(KERN_WARNING "resource sanity check: requesting [mem %#010llx-%#010llx], which spans more than %s %pR\n",  		       (unsigned long long)addr,  		       (unsigned long long)(addr + size - 1), -		       (unsigned long long)p->start, -		       (unsigned long long)p->end, -		       p->name); +		       p->name, p);  		err = -1;  		break;  	} diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 8a70ec091760..c6b98793d647 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -522,6 +522,39 @@ static inline void init_hrtick(void)  #endif	/* CONFIG_SCHED_HRTICK */  /* + * cmpxchg based fetch_or, macro so it works for different integer types + */ +#define fetch_or(ptr, val)						\ +({	typeof(*(ptr)) __old, __val = *(ptr);				\ + 	for (;;) {							\ + 		__old = cmpxchg((ptr), __val, __val | (val));		\ + 		if (__old == __val)					\ + 			break;						\ + 		__val = __old;						\ + 	}								\ + 	__old;								\ +}) + +#ifdef TIF_POLLING_NRFLAG +/* + * Atomically set TIF_NEED_RESCHED and test for TIF_POLLING_NRFLAG, + * this avoids any races wrt polling state changes and thereby avoids + * spurious IPIs. + */ +static bool set_nr_and_not_polling(struct task_struct *p) +{ +	struct thread_info *ti = task_thread_info(p); +	return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG); +} +#else +static bool set_nr_and_not_polling(struct task_struct *p) +{ +	set_tsk_need_resched(p); +	return true; +} +#endif + +/*   * resched_task - mark a task 'to be rescheduled now'.   
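The reboot_setup() and res_counter hunks above replace simple_strtoul()/simple_strtoull(), which silently ignore trailing junk, with kstrtoint()/kstrtoull(), which reject malformed input and return an error the caller must check. A userspace equivalent of that stricter parsing built on strtol(); the helper name is illustrative:

#include <errno.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>

/* Parse the whole string as an int in the given base; 0 on success. */
static int parse_int_strict(const char *s, int base, int *out)
{
        char *end;
        long val;

        errno = 0;
        val = strtol(s, &end, base);
        if (errno == ERANGE || val > INT_MAX || val < INT_MIN)
                return -ERANGE;
        if (end == s || *end != '\0')
                return -EINVAL;         /* empty or trailing garbage */
        *out = (int)val;
        return 0;
}

int main(void)
{
        int cpu;

        printf("\"3\"  -> %d\n", parse_int_strict("3", 0, &cpu));   /* 0, cpu == 3 */
        printf("\"3x\" -> %d\n", parse_int_strict("3x", 0, &cpu));  /* -EINVAL */
        return 0;
}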
*   * On UP this means the setting of the need_resched flag, on SMP it @@ -537,17 +570,15 @@ void resched_task(struct task_struct *p)  	if (test_tsk_need_resched(p))  		return; -	set_tsk_need_resched(p); -  	cpu = task_cpu(p); +  	if (cpu == smp_processor_id()) { +		set_tsk_need_resched(p);  		set_preempt_need_resched();  		return;  	} -	/* NEED_RESCHED must be visible before we test polling */ -	smp_mb(); -	if (!tsk_is_polling(p)) +	if (set_nr_and_not_polling(p))  		smp_send_reschedule(cpu);  } @@ -1336,7 +1367,7 @@ out:  		 * leave kernel.  		 */  		if (p->mm && printk_ratelimit()) { -			printk_sched("process %d (%s) no longer affine to cpu%d\n", +			printk_deferred("process %d (%s) no longer affine to cpu%d\n",  					task_pid_nr(p), p->comm, cpu);  		}  	} @@ -2208,7 +2239,7 @@ static inline void post_schedule(struct rq *rq)   * schedule_tail - first thing a freshly forked thread must call.   * @prev: the thread we just switched away from.   */ -asmlinkage void schedule_tail(struct task_struct *prev) +asmlinkage __visible void schedule_tail(struct task_struct *prev)  	__releases(rq->lock)  {  	struct rq *rq = this_rq(); @@ -2608,8 +2639,14 @@ pick_next_task(struct rq *rq, struct task_struct *prev)  	if (likely(prev->sched_class == class &&  		   rq->nr_running == rq->cfs.h_nr_running)) {  		p = fair_sched_class.pick_next_task(rq, prev); -		if (likely(p && p != RETRY_TASK)) -			return p; +		if (unlikely(p == RETRY_TASK)) +			goto again; + +		/* assumes fair_sched_class->next == idle_sched_class */ +		if (unlikely(!p)) +			p = idle_sched_class.pick_next_task(rq, prev); + +		return p;  	}  again: @@ -2757,7 +2794,7 @@ static inline void sched_submit_work(struct task_struct *tsk)  		blk_schedule_flush_plug(tsk);  } -asmlinkage void __sched schedule(void) +asmlinkage __visible void __sched schedule(void)  {  	struct task_struct *tsk = current; @@ -2767,7 +2804,7 @@ asmlinkage void __sched schedule(void)  EXPORT_SYMBOL(schedule);  #ifdef CONFIG_CONTEXT_TRACKING -asmlinkage void __sched schedule_user(void) +asmlinkage __visible void __sched schedule_user(void)  {  	/*  	 * If we come here after a random call to set_need_resched(), @@ -2799,7 +2836,7 @@ void __sched schedule_preempt_disabled(void)   * off of preempt_enable. Kernel preemptions off return from interrupt   * occur there and call schedule directly.   */ -asmlinkage void __sched notrace preempt_schedule(void) +asmlinkage __visible void __sched notrace preempt_schedule(void)  {  	/*  	 * If there is a non-zero preempt_count or interrupts are disabled, @@ -2829,7 +2866,7 @@ EXPORT_SYMBOL(preempt_schedule);   * Note, that this is called and return with irqs disabled. This will   * protect us against recursive calling from irq.   */ -asmlinkage void __sched preempt_schedule_irq(void) +asmlinkage __visible void __sched preempt_schedule_irq(void)  {  	enum ctx_state prev_state; @@ -3012,7 +3049,7 @@ EXPORT_SYMBOL(set_user_nice);  int can_nice(const struct task_struct *p, const int nice)  {  	/* convert nice value [19,-20] to rlimit style value [1,40] */ -	int nice_rlim = 20 - nice; +	int nice_rlim = nice_to_rlimit(nice);  	return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) ||  		capable(CAP_SYS_NICE)); @@ -3036,17 +3073,10 @@ SYSCALL_DEFINE1(nice, int, increment)  	 * We don't have to worry. Conceptually one call occurs first  	 * and we have a single winner.  	 
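set_nr_and_not_polling() above sets TIF_NEED_RESCHED and tests TIF_POLLING_NRFLAG in one atomic step, so the rewritten resched_task() can skip the IPI when the remote CPU is polling and will notice the flag by itself. A C11 sketch of the same cmpxchg-based fetch_or and the skip-the-IPI decision; the flag values and names are illustrative:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define NEED_RESCHED    0x1u
#define POLLING_NRFLAG  0x2u

/* Atomically OR @val into *@flags and return the previous value. */
static unsigned int fetch_or(_Atomic unsigned int *flags, unsigned int val)
{
        unsigned int old = atomic_load_explicit(flags, memory_order_relaxed);

        while (!atomic_compare_exchange_weak(flags, &old, old | val))
                ;       /* a failed CAS reloads old with the current value */
        return old;
}

/* True when the target was NOT polling, i.e. an IPI is required. */
static bool set_need_resched_and_not_polling(_Atomic unsigned int *flags)
{
        return !(fetch_or(flags, NEED_RESCHED) & POLLING_NRFLAG);
}

int main(void)
{
        _Atomic unsigned int ti_flags = POLLING_NRFLAG;

        if (set_need_resched_and_not_polling(&ti_flags))
                printf("send reschedule IPI\n");
        else
                printf("target is polling, setting the flag is enough\n");
        return 0;
}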
*/ -	if (increment < -40) -		increment = -40; -	if (increment > 40) -		increment = 40; - +	increment = clamp(increment, -NICE_WIDTH, NICE_WIDTH);  	nice = task_nice(current) + increment; -	if (nice < MIN_NICE) -		nice = MIN_NICE; -	if (nice > MAX_NICE) -		nice = MAX_NICE; +	nice = clamp_val(nice, MIN_NICE, MAX_NICE);  	if (increment < 0 && !can_nice(current, nice))  		return -EPERM; @@ -3140,6 +3170,7 @@ __setparam_dl(struct task_struct *p, const struct sched_attr *attr)  	dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime);  	dl_se->dl_throttled = 0;  	dl_se->dl_new = 1; +	dl_se->dl_yielded = 0;  }  static void __setscheduler_params(struct task_struct *p, @@ -3204,17 +3235,40 @@ __getparam_dl(struct task_struct *p, struct sched_attr *attr)   * We ask for the deadline not being zero, and greater or equal   * than the runtime, as well as the period of being zero or   * greater than deadline. Furthermore, we have to be sure that - * user parameters are above the internal resolution (1us); we - * check sched_runtime only since it is always the smaller one. + * user parameters are above the internal resolution of 1us (we + * check sched_runtime only since it is always the smaller one) and + * below 2^63 ns (we have to check both sched_deadline and + * sched_period, as the latter can be zero).   */  static bool  __checkparam_dl(const struct sched_attr *attr)  { -	return attr && attr->sched_deadline != 0 && -		(attr->sched_period == 0 || -		(s64)(attr->sched_period   - attr->sched_deadline) >= 0) && -		(s64)(attr->sched_deadline - attr->sched_runtime ) >= 0  && -		attr->sched_runtime >= (2 << (DL_SCALE - 1)); +	/* deadline != 0 */ +	if (attr->sched_deadline == 0) +		return false; + +	/* +	 * Since we truncate DL_SCALE bits, make sure we're at least +	 * that big. +	 */ +	if (attr->sched_runtime < (1ULL << DL_SCALE)) +		return false; + +	/* +	 * Since we use the MSB for wrap-around and sign issues, make +	 * sure it's not set (mind that period can be equal to zero). +	 */ +	if (attr->sched_deadline & (1ULL << 63) || +	    attr->sched_period & (1ULL << 63)) +		return false; + +	/* runtime <= deadline <= period (if period != 0) */ +	if ((attr->sched_period != 0 && +	     attr->sched_period < attr->sched_deadline) || +	    attr->sched_deadline < attr->sched_runtime) +		return false; + +	return true;  }  /* @@ -3612,13 +3666,11 @@ static int sched_copy_attr(struct sched_attr __user *uattr,  	 */  	attr->sched_nice = clamp(attr->sched_nice, MIN_NICE, MAX_NICE); -out: -	return ret; +	return 0;  err_size:  	put_user(sizeof(*attr), &uattr->size); -	ret = -E2BIG; -	goto out; +	return -E2BIG;  }  /** @@ -3655,6 +3707,7 @@ SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)   * sys_sched_setattr - same as above, but with extended sched_attr   * @pid: the pid in question.   * @uattr: structure containing the extended parameters. + * @flags: for future extension.   
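The rewritten __checkparam_dl() above replaces one dense boolean expression with a series of early-return checks: a non-zero deadline, a runtime of at least the internal resolution, clear top bits so the wrap-around arithmetic stays valid, and runtime <= deadline <= period. A standalone sketch of the same checks over plain 64-bit values; DL_SCALE and the struct are stand-ins for the kernel definitions:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define DL_SCALE 10     /* stand-in: roughly 1us resolution in nanoseconds */

struct dl_params {
        uint64_t runtime;
        uint64_t deadline;
        uint64_t period;        /* 0 means "same as deadline" */
};

static bool checkparam_dl(const struct dl_params *p)
{
        if (p->deadline == 0)
                return false;
        if (p->runtime < (1ULL << DL_SCALE))            /* below internal resolution */
                return false;
        if ((p->deadline | p->period) & (1ULL << 63))   /* MSB reserved for wrap/sign */
                return false;
        if (p->period != 0 && p->period < p->deadline)
                return false;
        if (p->deadline < p->runtime)
                return false;
        return true;
}

int main(void)
{
        struct dl_params ok  = { 10000000, 30000000, 100000000 };  /* 10ms/30ms/100ms */
        struct dl_params bad = { 30000000, 10000000, 100000000 };  /* runtime > deadline */

        printf("ok:  %d\nbad: %d\n", checkparam_dl(&ok), checkparam_dl(&bad));
        return 0;
}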
*/  SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr,  			       unsigned int, flags) @@ -3666,8 +3719,12 @@ SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr,  	if (!uattr || pid < 0 || flags)  		return -EINVAL; -	if (sched_copy_attr(uattr, &attr)) -		return -EFAULT; +	retval = sched_copy_attr(uattr, &attr); +	if (retval) +		return retval; + +	if ((int)attr.sched_policy < 0) +		return -EINVAL;  	rcu_read_lock();  	retval = -ESRCH; @@ -3717,7 +3774,7 @@ SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)   */  SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)  { -	struct sched_param lp; +	struct sched_param lp = { .sched_priority = 0 };  	struct task_struct *p;  	int retval; @@ -3734,11 +3791,8 @@ SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)  	if (retval)  		goto out_unlock; -	if (task_has_dl_policy(p)) { -		retval = -EINVAL; -		goto out_unlock; -	} -	lp.sched_priority = p->rt_priority; +	if (task_has_rt_policy(p)) +		lp.sched_priority = p->rt_priority;  	rcu_read_unlock();  	/* @@ -3776,7 +3830,7 @@ static int sched_read_attr(struct sched_attr __user *uattr,  		for (; addr < end; addr++) {  			if (*addr) -				goto err_size; +				return -EFBIG;  		}  		attr->size = usize; @@ -3786,12 +3840,7 @@ static int sched_read_attr(struct sched_attr __user *uattr,  	if (ret)  		return -EFAULT; -out: -	return ret; - -err_size: -	ret = -E2BIG; -	goto out; +	return 0;  }  /** @@ -3799,6 +3848,7 @@ err_size:   * @pid: the pid in question.   * @uattr: structure containing the extended parameters.   * @size: sizeof(attr) for fwd/bwd comp. + * @flags: for future extension.   */  SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,  		unsigned int, size, unsigned int, flags) @@ -4067,6 +4117,7 @@ static void __cond_resched(void)  int __sched _cond_resched(void)  { +	rcu_cond_resched();  	if (should_resched()) {  		__cond_resched();  		return 1; @@ -4085,15 +4136,18 @@ EXPORT_SYMBOL(_cond_resched);   */  int __cond_resched_lock(spinlock_t *lock)  { +	bool need_rcu_resched = rcu_should_resched();  	int resched = should_resched();  	int ret = 0;  	lockdep_assert_held(lock); -	if (spin_needbreak(lock) || resched) { +	if (spin_needbreak(lock) || resched || need_rcu_resched) {  		spin_unlock(lock);  		if (resched)  			__cond_resched(); +		else if (unlikely(need_rcu_resched)) +			rcu_resched();  		else  			cpu_relax();  		ret = 1; @@ -4107,6 +4161,7 @@ int __sched __cond_resched_softirq(void)  {  	BUG_ON(!in_softirq()); +	rcu_cond_resched();  /* BH disabled OK, just recording QSes. 
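The sched_copy_attr()/sched_read_attr() hunks above enforce the forward-compatibility rule for the size-versioned sched_attr: when one side's structure is larger than the other's, the extra bytes it does not understand must all be zero, otherwise the call fails instead of silently dropping fields. A userspace sketch of that rule for the user-to-kernel direction; the structs and names are invented for illustration:

#include <errno.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct attr_v1 {                /* what this "kernel" knows about */
        uint32_t size;
        uint32_t policy;
};

struct attr_v2 {                /* what a newer caller might pass in */
        uint32_t size;
        uint32_t policy;
        uint64_t future_field;
};

/* Accept a larger caller buffer only if the unknown tail is all zero. */
static int copy_attr(const void *ubuf, size_t usize, struct attr_v1 *out)
{
        const unsigned char *p = ubuf;
        size_t i;

        if (usize < sizeof(*out))
                return -EINVAL;         /* too small to be any known version */

        for (i = sizeof(*out); i < usize; i++)
                if (p[i] != 0)
                        return -E2BIG;  /* caller set a field we do not know */

        memcpy(out, ubuf, sizeof(*out));
        return 0;
}

int main(void)
{
        struct attr_v2 big = { .size = sizeof(big), .policy = 0 };
        struct attr_v1 attr;

        printf("zero tail:     %d\n", copy_attr(&big, sizeof(big), &attr));
        big.future_field = 1;
        printf("non-zero tail: %d\n", copy_attr(&big, sizeof(big), &attr));
        return 0;
}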
*/  	if (should_resched()) {  		local_bh_enable();  		__cond_resched(); @@ -5055,11 +5110,20 @@ static struct notifier_block migration_notifier = {  	.priority = CPU_PRI_MIGRATION,  }; +static void __cpuinit set_cpu_rq_start_time(void) +{ +	int cpu = smp_processor_id(); +	struct rq *rq = cpu_rq(cpu); +	rq->age_stamp = sched_clock_cpu(cpu); +} +  static int sched_cpu_active(struct notifier_block *nfb,  				      unsigned long action, void *hcpu)  {  	switch (action & ~CPU_TASKS_FROZEN) {  	case CPU_STARTING: +		set_cpu_rq_start_time(); +		return NOTIFY_OK;  	case CPU_DOWN_FAILED:  		set_cpu_active((long)hcpu, true);  		return NOTIFY_OK; @@ -5268,7 +5332,8 @@ static int sd_degenerate(struct sched_domain *sd)  			 SD_BALANCE_FORK |  			 SD_BALANCE_EXEC |  			 SD_SHARE_CPUPOWER | -			 SD_SHARE_PKG_RESOURCES)) { +			 SD_SHARE_PKG_RESOURCES | +			 SD_SHARE_POWERDOMAIN)) {  		if (sd->groups != sd->groups->next)  			return 0;  	} @@ -5299,7 +5364,8 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)  				SD_BALANCE_EXEC |  				SD_SHARE_CPUPOWER |  				SD_SHARE_PKG_RESOURCES | -				SD_PREFER_SIBLING); +				SD_PREFER_SIBLING | +				SD_SHARE_POWERDOMAIN);  		if (nr_node_ids == 1)  			pflags &= ~SD_SERIALIZE;  	} @@ -5573,17 +5639,6 @@ static int __init isolated_cpu_setup(char *str)  __setup("isolcpus=", isolated_cpu_setup); -static const struct cpumask *cpu_cpu_mask(int cpu) -{ -	return cpumask_of_node(cpu_to_node(cpu)); -} - -struct sd_data { -	struct sched_domain **__percpu sd; -	struct sched_group **__percpu sg; -	struct sched_group_power **__percpu sgp; -}; -  struct s_data {  	struct sched_domain ** __percpu sd;  	struct root_domain	*rd; @@ -5596,21 +5651,6 @@ enum s_alloc {  	sa_none,  }; -struct sched_domain_topology_level; - -typedef struct sched_domain *(*sched_domain_init_f)(struct sched_domain_topology_level *tl, int cpu); -typedef const struct cpumask *(*sched_domain_mask_f)(int cpu); - -#define SDTL_OVERLAP	0x01 - -struct sched_domain_topology_level { -	sched_domain_init_f init; -	sched_domain_mask_f mask; -	int		    flags; -	int		    numa_level; -	struct sd_data      data; -}; -  /*   * Build an iteration mask that can exclude certain CPUs from the upwards   * domain traversal. 
@@ -5778,8 +5818,6 @@ build_sched_groups(struct sched_domain *sd, int cpu)  			continue;  		group = get_group(i, sdd, &sg); -		cpumask_clear(sched_group_cpus(sg)); -		sg->sgp->power = 0;  		cpumask_setall(sched_group_mask(sg));  		for_each_cpu(j, span) { @@ -5829,44 +5867,11 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)  	atomic_set(&sg->sgp->nr_busy_cpus, sg->group_weight);  } -int __weak arch_sd_sibling_asym_packing(void) -{ -       return 0*SD_ASYM_PACKING; -} -  /*   * Initializers for schedule domains   * Non-inlined to reduce accumulated stack pressure in build_sched_domains()   */ -#ifdef CONFIG_SCHED_DEBUG -# define SD_INIT_NAME(sd, type)		sd->name = #type -#else -# define SD_INIT_NAME(sd, type)		do { } while (0) -#endif - -#define SD_INIT_FUNC(type)						\ -static noinline struct sched_domain *					\ -sd_init_##type(struct sched_domain_topology_level *tl, int cpu) 	\ -{									\ -	struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu);	\ -	*sd = SD_##type##_INIT;						\ -	SD_INIT_NAME(sd, type);						\ -	sd->private = &tl->data;					\ -	return sd;							\ -} - -SD_INIT_FUNC(CPU) -#ifdef CONFIG_SCHED_SMT - SD_INIT_FUNC(SIBLING) -#endif -#ifdef CONFIG_SCHED_MC - SD_INIT_FUNC(MC) -#endif -#ifdef CONFIG_SCHED_BOOK - SD_INIT_FUNC(BOOK) -#endif -  static int default_relax_domain_level = -1;  int sched_domain_level_max; @@ -5954,97 +5959,154 @@ static void claim_allocations(int cpu, struct sched_domain *sd)  		*per_cpu_ptr(sdd->sgp, cpu) = NULL;  } -#ifdef CONFIG_SCHED_SMT -static const struct cpumask *cpu_smt_mask(int cpu) -{ -	return topology_thread_cpumask(cpu); -} -#endif - -/* - * Topology list, bottom-up. - */ -static struct sched_domain_topology_level default_topology[] = { -#ifdef CONFIG_SCHED_SMT -	{ sd_init_SIBLING, cpu_smt_mask, }, -#endif -#ifdef CONFIG_SCHED_MC -	{ sd_init_MC, cpu_coregroup_mask, }, -#endif -#ifdef CONFIG_SCHED_BOOK -	{ sd_init_BOOK, cpu_book_mask, }, -#endif -	{ sd_init_CPU, cpu_cpu_mask, }, -	{ NULL, }, -}; - -static struct sched_domain_topology_level *sched_domain_topology = default_topology; - -#define for_each_sd_topology(tl)			\ -	for (tl = sched_domain_topology; tl->init; tl++) -  #ifdef CONFIG_NUMA -  static int sched_domains_numa_levels;  static int *sched_domains_numa_distance;  static struct cpumask ***sched_domains_numa_masks;  static int sched_domains_curr_level; +#endif -static inline int sd_local_flags(int level) -{ -	if (sched_domains_numa_distance[level] > RECLAIM_DISTANCE) -		return 0; - -	return SD_BALANCE_EXEC | SD_BALANCE_FORK | SD_WAKE_AFFINE; -} +/* + * SD_flags allowed in topology descriptions. + * + * SD_SHARE_CPUPOWER      - describes SMT topologies + * SD_SHARE_PKG_RESOURCES - describes shared caches + * SD_NUMA                - describes NUMA topologies + * SD_SHARE_POWERDOMAIN   - describes shared power domain + * + * Odd one out: + * SD_ASYM_PACKING        - describes SMT quirks + */ +#define TOPOLOGY_SD_FLAGS		\ +	(SD_SHARE_CPUPOWER |		\ +	 SD_SHARE_PKG_RESOURCES |	\ +	 SD_NUMA |			\ +	 SD_ASYM_PACKING |		\ +	 SD_SHARE_POWERDOMAIN)  static struct sched_domain * -sd_numa_init(struct sched_domain_topology_level *tl, int cpu) +sd_init(struct sched_domain_topology_level *tl, int cpu)  {  	struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu); -	int level = tl->numa_level; -	int sd_weight = cpumask_weight( -			sched_domains_numa_masks[level][cpu_to_node(cpu)]); +	int sd_weight, sd_flags = 0; + +#ifdef CONFIG_NUMA +	/* +	 * Ugly hack to pass state to sd_numa_mask()... 
+	 */ +	sched_domains_curr_level = tl->numa_level; +#endif + +	sd_weight = cpumask_weight(tl->mask(cpu)); + +	if (tl->sd_flags) +		sd_flags = (*tl->sd_flags)(); +	if (WARN_ONCE(sd_flags & ~TOPOLOGY_SD_FLAGS, +			"wrong sd_flags in topology description\n")) +		sd_flags &= ~TOPOLOGY_SD_FLAGS;  	*sd = (struct sched_domain){  		.min_interval		= sd_weight,  		.max_interval		= 2*sd_weight,  		.busy_factor		= 32,  		.imbalance_pct		= 125, -		.cache_nice_tries	= 2, -		.busy_idx		= 3, -		.idle_idx		= 2, + +		.cache_nice_tries	= 0, +		.busy_idx		= 0, +		.idle_idx		= 0,  		.newidle_idx		= 0,  		.wake_idx		= 0,  		.forkexec_idx		= 0,  		.flags			= 1*SD_LOAD_BALANCE  					| 1*SD_BALANCE_NEWIDLE -					| 0*SD_BALANCE_EXEC -					| 0*SD_BALANCE_FORK +					| 1*SD_BALANCE_EXEC +					| 1*SD_BALANCE_FORK  					| 0*SD_BALANCE_WAKE -					| 0*SD_WAKE_AFFINE +					| 1*SD_WAKE_AFFINE  					| 0*SD_SHARE_CPUPOWER  					| 0*SD_SHARE_PKG_RESOURCES -					| 1*SD_SERIALIZE +					| 0*SD_SERIALIZE  					| 0*SD_PREFER_SIBLING -					| 1*SD_NUMA -					| sd_local_flags(level) +					| 0*SD_NUMA +					| sd_flags  					, +  		.last_balance		= jiffies,  		.balance_interval	= sd_weight, +		.smt_gain		= 0, +		.max_newidle_lb_cost	= 0, +		.next_decay_max_lb_cost	= jiffies, +#ifdef CONFIG_SCHED_DEBUG +		.name			= tl->name, +#endif  	}; -	SD_INIT_NAME(sd, NUMA); -	sd->private = &tl->data;  	/* -	 * Ugly hack to pass state to sd_numa_mask()... +	 * Convert topological properties into behaviour.  	 */ -	sched_domains_curr_level = tl->numa_level; + +	if (sd->flags & SD_SHARE_CPUPOWER) { +		sd->imbalance_pct = 110; +		sd->smt_gain = 1178; /* ~15% */ + +	} else if (sd->flags & SD_SHARE_PKG_RESOURCES) { +		sd->imbalance_pct = 117; +		sd->cache_nice_tries = 1; +		sd->busy_idx = 2; + +#ifdef CONFIG_NUMA +	} else if (sd->flags & SD_NUMA) { +		sd->cache_nice_tries = 2; +		sd->busy_idx = 3; +		sd->idle_idx = 2; + +		sd->flags |= SD_SERIALIZE; +		if (sched_domains_numa_distance[tl->numa_level] > RECLAIM_DISTANCE) { +			sd->flags &= ~(SD_BALANCE_EXEC | +				       SD_BALANCE_FORK | +				       SD_WAKE_AFFINE); +		} + +#endif +	} else { +		sd->flags |= SD_PREFER_SIBLING; +		sd->cache_nice_tries = 1; +		sd->busy_idx = 2; +		sd->idle_idx = 1; +	} + +	sd->private = &tl->data;  	return sd;  } +/* + * Topology list, bottom-up. + */ +static struct sched_domain_topology_level default_topology[] = { +#ifdef CONFIG_SCHED_SMT +	{ cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) }, +#endif +#ifdef CONFIG_SCHED_MC +	{ cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) }, +#endif +	{ cpu_cpu_mask, SD_INIT_NAME(DIE) }, +	{ NULL, }, +}; + +struct sched_domain_topology_level *sched_domain_topology = default_topology; + +#define for_each_sd_topology(tl)			\ +	for (tl = sched_domain_topology; tl->mask; tl++) + +void set_sched_topology(struct sched_domain_topology_level *tl) +{ +	sched_domain_topology = tl; +} + +#ifdef CONFIG_NUMA +  static const struct cpumask *sd_numa_mask(int cpu)  {  	return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)]; @@ -6188,7 +6250,10 @@ static void sched_init_numa(void)  		}  	} -	tl = kzalloc((ARRAY_SIZE(default_topology) + level) * +	/* Compute default topology size */ +	for (i = 0; sched_domain_topology[i].mask; i++); + +	tl = kzalloc((i + level + 1) *  			sizeof(struct sched_domain_topology_level), GFP_KERNEL);  	if (!tl)  		return; @@ -6196,18 +6261,19 @@ static void sched_init_numa(void)  	/*  	 * Copy the default topology bits..  	 
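The topology rework above replaces the per-level SD_INIT_FUNC() macros with a single sd_init() driven by a NULL-terminated table of levels, each supplying a cpumask function, an optional flags function and a name, and lets architectures install their own table through set_sched_topology(). A minimal sketch of that table-driven shape; the level contents, and the masks returned as strings, are invented purely for illustration:

#include <stdio.h>

struct topology_level {
        const char *(*mask)(int cpu);   /* which CPUs share this level */
        int (*sd_flags)(void);          /* optional behavioural flags */
        const char *name;
};

static const char *smt_mask(int cpu)  { (void)cpu; return "thread siblings"; }
static const char *core_mask(int cpu) { (void)cpu; return "cores in package"; }
static const char *die_mask(int cpu)  { (void)cpu; return "all CPUs in node"; }

static int smt_flags(void) { return 0x1; }      /* e.g. "shares CPU power" */

static struct topology_level default_topology[] = {
        { smt_mask,  smt_flags, "SMT" },
        { core_mask, NULL,      "MC"  },
        { die_mask,  NULL,      "DIE" },
        { NULL, },                      /* terminator, like tl->mask == NULL */
};

static struct topology_level *topology = default_topology;

int main(void)
{
        struct topology_level *tl;

        for (tl = topology; tl->mask; tl++)
                printf("%-3s: %s, flags=%#x\n", tl->name, tl->mask(0),
                       tl->sd_flags ? tl->sd_flags() : 0);
        return 0;
}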
*/ -	for (i = 0; default_topology[i].init; i++) -		tl[i] = default_topology[i]; +	for (i = 0; sched_domain_topology[i].mask; i++) +		tl[i] = sched_domain_topology[i];  	/*  	 * .. and append 'j' levels of NUMA goodness.  	 */  	for (j = 0; j < level; i++, j++) {  		tl[i] = (struct sched_domain_topology_level){ -			.init = sd_numa_init,  			.mask = sd_numa_mask, +			.sd_flags = cpu_numa_flags,  			.flags = SDTL_OVERLAP,  			.numa_level = j, +			SD_INIT_NAME(NUMA)  		};  	} @@ -6365,7 +6431,7 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,  		const struct cpumask *cpu_map, struct sched_domain_attr *attr,  		struct sched_domain *child, int cpu)  { -	struct sched_domain *sd = tl->init(tl, cpu); +	struct sched_domain *sd = sd_init(tl, cpu);  	if (!sd)  		return child; @@ -6935,6 +7001,7 @@ void __init sched_init(void)  	if (cpu_isolated_map == NULL)  		zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);  	idle_thread_set_boot_cpu(); +	set_cpu_rq_start_time();  #endif  	init_sched_fair_class(); @@ -7602,7 +7669,7 @@ cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)  static int cpu_cgroup_css_online(struct cgroup_subsys_state *css)  {  	struct task_group *tg = css_tg(css); -	struct task_group *parent = css_tg(css_parent(css)); +	struct task_group *parent = css_tg(css->parent);  	if (parent)  		sched_online_group(tg, parent); @@ -7733,8 +7800,7 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)  	/* restart the period timer (if active) to handle new period expiry */  	if (runtime_enabled && cfs_b->timer_active) {  		/* force a reprogram */ -		cfs_b->timer_active = 0; -		__start_cfs_bandwidth(cfs_b); +		__start_cfs_bandwidth(cfs_b, true);  	}  	raw_spin_unlock_irq(&cfs_b->lock); diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c index c143ee380e3a..9cf350c94ec4 100644 --- a/kernel/sched/cpuacct.c +++ b/kernel/sched/cpuacct.c @@ -46,7 +46,7 @@ static inline struct cpuacct *task_ca(struct task_struct *tsk)  static inline struct cpuacct *parent_ca(struct cpuacct *ca)  { -	return css_ca(css_parent(&ca->css)); +	return css_ca(ca->css.parent);  }  static DEFINE_PER_CPU(u64, root_cpuacct_cpuusage); diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c index 5b9bb42b2d47..bd95963dae80 100644 --- a/kernel/sched/cpudeadline.c +++ b/kernel/sched/cpudeadline.c @@ -13,6 +13,7 @@  #include <linux/gfp.h>  #include <linux/kernel.h> +#include <linux/slab.h>  #include "cpudeadline.h"  static inline int parent(int i) @@ -39,8 +40,10 @@ static void cpudl_exchange(struct cpudl *cp, int a, int b)  {  	int cpu_a = cp->elements[a].cpu, cpu_b = cp->elements[b].cpu; -	swap(cp->elements[a], cp->elements[b]); -	swap(cp->cpu_to_idx[cpu_a], cp->cpu_to_idx[cpu_b]); +	swap(cp->elements[a].cpu, cp->elements[b].cpu); +	swap(cp->elements[a].dl , cp->elements[b].dl ); + +	swap(cp->elements[cpu_a].idx, cp->elements[cpu_b].idx);  }  static void cpudl_heapify(struct cpudl *cp, int idx) @@ -140,7 +143,7 @@ void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid)  	WARN_ON(!cpu_present(cpu));  	raw_spin_lock_irqsave(&cp->lock, flags); -	old_idx = cp->cpu_to_idx[cpu]; +	old_idx = cp->elements[cpu].idx;  	if (!is_valid) {  		/* remove item */  		if (old_idx == IDX_INVALID) { @@ -155,8 +158,8 @@ void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid)  		cp->elements[old_idx].dl = cp->elements[cp->size - 1].dl;  		cp->elements[old_idx].cpu = new_cpu;  		cp->size--; -		cp->cpu_to_idx[new_cpu] = old_idx; -		cp->cpu_to_idx[cpu] = 
IDX_INVALID; +		cp->elements[new_cpu].idx = old_idx; +		cp->elements[cpu].idx = IDX_INVALID;  		while (old_idx > 0 && dl_time_before(  				cp->elements[parent(old_idx)].dl,  				cp->elements[old_idx].dl)) { @@ -173,7 +176,7 @@ void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid)  		cp->size++;  		cp->elements[cp->size - 1].dl = 0;  		cp->elements[cp->size - 1].cpu = cpu; -		cp->cpu_to_idx[cpu] = cp->size - 1; +		cp->elements[cpu].idx = cp->size - 1;  		cpudl_change_key(cp, cp->size - 1, dl);  		cpumask_clear_cpu(cpu, cp->free_cpus);  	} else { @@ -195,10 +198,21 @@ int cpudl_init(struct cpudl *cp)  	memset(cp, 0, sizeof(*cp));  	raw_spin_lock_init(&cp->lock);  	cp->size = 0; -	for (i = 0; i < NR_CPUS; i++) -		cp->cpu_to_idx[i] = IDX_INVALID; -	if (!alloc_cpumask_var(&cp->free_cpus, GFP_KERNEL)) + +	cp->elements = kcalloc(nr_cpu_ids, +			       sizeof(struct cpudl_item), +			       GFP_KERNEL); +	if (!cp->elements) +		return -ENOMEM; + +	if (!alloc_cpumask_var(&cp->free_cpus, GFP_KERNEL)) { +		kfree(cp->elements);  		return -ENOMEM; +	} + +	for_each_possible_cpu(i) +		cp->elements[i].idx = IDX_INVALID; +  	cpumask_setall(cp->free_cpus);  	return 0; @@ -210,7 +224,6 @@ int cpudl_init(struct cpudl *cp)   */  void cpudl_cleanup(struct cpudl *cp)  { -	/* -	 * nothing to do for the moment -	 */ +	free_cpumask_var(cp->free_cpus); +	kfree(cp->elements);  } diff --git a/kernel/sched/cpudeadline.h b/kernel/sched/cpudeadline.h index a202789a412c..538c9796ad4a 100644 --- a/kernel/sched/cpudeadline.h +++ b/kernel/sched/cpudeadline.h @@ -5,17 +5,17 @@  #define IDX_INVALID     -1 -struct array_item { +struct cpudl_item {  	u64 dl;  	int cpu; +	int idx;  };  struct cpudl {  	raw_spinlock_t lock;  	int size; -	int cpu_to_idx[NR_CPUS]; -	struct array_item elements[NR_CPUS];  	cpumask_var_t free_cpus; +	struct cpudl_item *elements;  }; diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c index 746bc9344969..981fcd7dc394 100644 --- a/kernel/sched/cpupri.c +++ b/kernel/sched/cpupri.c @@ -30,6 +30,7 @@  #include <linux/gfp.h>  #include <linux/sched.h>  #include <linux/sched/rt.h> +#include <linux/slab.h>  #include "cpupri.h"  /* Convert between a 140 based task->prio, and our 102 based cpupri */ @@ -70,8 +71,7 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p,  	int idx = 0;  	int task_pri = convert_prio(p->prio); -	if (task_pri >= MAX_RT_PRIO) -		return 0; +	BUG_ON(task_pri >= CPUPRI_NR_PRIORITIES);  	for (idx = 0; idx < task_pri; idx++) {  		struct cpupri_vec *vec  = &cp->pri_to_cpu[idx]; @@ -219,8 +219,13 @@ int cpupri_init(struct cpupri *cp)  			goto cleanup;  	} +	cp->cpu_to_pri = kcalloc(nr_cpu_ids, sizeof(int), GFP_KERNEL); +	if (!cp->cpu_to_pri) +		goto cleanup; +  	for_each_possible_cpu(i)  		cp->cpu_to_pri[i] = CPUPRI_INVALID; +  	return 0;  cleanup: @@ -237,6 +242,7 @@ void cpupri_cleanup(struct cpupri *cp)  {  	int i; +	kfree(cp->cpu_to_pri);  	for (i = 0; i < CPUPRI_NR_PRIORITIES; i++)  		free_cpumask_var(cp->pri_to_cpu[i].mask);  } diff --git a/kernel/sched/cpupri.h b/kernel/sched/cpupri.h index f6d756173491..6b033347fdfd 100644 --- a/kernel/sched/cpupri.h +++ b/kernel/sched/cpupri.h @@ -17,7 +17,7 @@ struct cpupri_vec {  struct cpupri {  	struct cpupri_vec pri_to_cpu[CPUPRI_NR_PRIORITIES]; -	int               cpu_to_pri[NR_CPUS]; +	int *cpu_to_pri;  };  #ifdef CONFIG_SMP diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index a95097cb4591..72fdf06ef865 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -332,50 +332,50 @@ out:   * softirq as 
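The cpudeadline changes above fold the separate cpu_to_idx[NR_CPUS] array into a per-element idx back-pointer and size the storage with nr_cpu_ids at init time, so a heap swap must move only the payload fields and then fix up the two affected back-pointers. A trimmed-down sketch of that element layout and the exchange step, keeping just the pieces needed to show the index maintenance:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define IDX_INVALID -1

struct cpudl_item {
        uint64_t dl;    /* heap key: earliest deadline on this CPU */
        int cpu;        /* heap payload held at this slot */
        int idx;        /* indexed by CPU number: where that CPU sits in the heap */
};

static struct cpudl_item *elements;
static int nr_cpu_ids = 4;      /* stand-in for the kernel's runtime value */

static void cpudl_exchange(int a, int b)
{
        int cpu_a = elements[a].cpu, cpu_b = elements[b].cpu;
        uint64_t dl;
        int cpu;

        /* Swap only the payload fields, never the idx back-pointers... */
        dl = elements[a].dl;   elements[a].dl  = elements[b].dl;  elements[b].dl  = dl;
        cpu = elements[a].cpu; elements[a].cpu = elements[b].cpu; elements[b].cpu = cpu;

        /* ...then repoint the two affected CPUs at their new slots. */
        elements[cpu_a].idx = b;
        elements[cpu_b].idx = a;
}

int main(void)
{
        int i;

        elements = calloc(nr_cpu_ids, sizeof(*elements));
        if (!elements)
                return 1;
        for (i = 0; i < nr_cpu_ids; i++) {
                elements[i].dl = 100 + i;
                elements[i].cpu = i;
                elements[i].idx = i;
        }
        cpudl_exchange(0, 3);
        printf("cpu 0 now at slot %d, cpu 3 now at slot %d\n",
               elements[0].idx, elements[3].idx);
        free(elements);
        return 0;
}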
those do not count in task exec_runtime any more.   */  static void irqtime_account_process_tick(struct task_struct *p, int user_tick, -						struct rq *rq) +					 struct rq *rq, int ticks)  { -	cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); +	cputime_t scaled = cputime_to_scaled(cputime_one_jiffy); +	u64 cputime = (__force u64) cputime_one_jiffy;  	u64 *cpustat = kcpustat_this_cpu->cpustat;  	if (steal_account_process_tick())  		return; +	cputime *= ticks; +	scaled *= ticks; +  	if (irqtime_account_hi_update()) { -		cpustat[CPUTIME_IRQ] += (__force u64) cputime_one_jiffy; +		cpustat[CPUTIME_IRQ] += cputime;  	} else if (irqtime_account_si_update()) { -		cpustat[CPUTIME_SOFTIRQ] += (__force u64) cputime_one_jiffy; +		cpustat[CPUTIME_SOFTIRQ] += cputime;  	} else if (this_cpu_ksoftirqd() == p) {  		/*  		 * ksoftirqd time do not get accounted in cpu_softirq_time.  		 * So, we have to handle it separately here.  		 * Also, p->stime needs to be updated for ksoftirqd.  		 */ -		__account_system_time(p, cputime_one_jiffy, one_jiffy_scaled, -					CPUTIME_SOFTIRQ); +		__account_system_time(p, cputime, scaled, CPUTIME_SOFTIRQ);  	} else if (user_tick) { -		account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); +		account_user_time(p, cputime, scaled);  	} else if (p == rq->idle) { -		account_idle_time(cputime_one_jiffy); +		account_idle_time(cputime);  	} else if (p->flags & PF_VCPU) { /* System time or guest time */ -		account_guest_time(p, cputime_one_jiffy, one_jiffy_scaled); +		account_guest_time(p, cputime, scaled);  	} else { -		__account_system_time(p, cputime_one_jiffy, one_jiffy_scaled, -					CPUTIME_SYSTEM); +		__account_system_time(p, cputime, scaled,	CPUTIME_SYSTEM);  	}  }  static void irqtime_account_idle_ticks(int ticks)  { -	int i;  	struct rq *rq = this_rq(); -	for (i = 0; i < ticks; i++) -		irqtime_account_process_tick(current, 0, rq); +	irqtime_account_process_tick(current, 0, rq, ticks);  }  #else /* CONFIG_IRQ_TIME_ACCOUNTING */  static inline void irqtime_account_idle_ticks(int ticks) {}  static inline void irqtime_account_process_tick(struct task_struct *p, int user_tick, -						struct rq *rq) {} +						struct rq *rq, int nr_ticks) {}  #endif /* CONFIG_IRQ_TIME_ACCOUNTING */  /* @@ -464,7 +464,7 @@ void account_process_tick(struct task_struct *p, int user_tick)  		return;  	if (sched_clock_irqtime) { -		irqtime_account_process_tick(p, user_tick, rq); +		irqtime_account_process_tick(p, user_tick, rq, 1);  		return;  	} diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 27ef40925525..2b8cbf09d1a4 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -348,12 +348,7 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se,  	 * entity.  	 */  	if (dl_time_before(dl_se->deadline, rq_clock(rq))) { -		static bool lag_once = false; - -		if (!lag_once) { -			lag_once = true; -			printk_sched("sched: DL replenish lagged to much\n"); -		} +		printk_deferred_once("sched: DL replenish lagged to much\n");  		dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline;  		dl_se->runtime = pi_se->dl_runtime;  	} @@ -513,14 +508,22 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)  						     struct sched_dl_entity,  						     dl_timer);  	struct task_struct *p = dl_task_of(dl_se); -	struct rq *rq = task_rq(p); +	struct rq *rq; +again: +	rq = task_rq(p);  	raw_spin_lock(&rq->lock); +	if (rq != task_rq(p)) { +		/* Task was moved, retrying. 
*/ +		raw_spin_unlock(&rq->lock); +		goto again; +	} +  	/*  	 * We need to take care of a possible races here. In fact, the  	 * task might have changed its scheduling policy to something  	 * different from SCHED_DEADLINE or changed its reservation -	 * parameters (through sched_setscheduler()). +	 * parameters (through sched_setattr()).  	 */  	if (!dl_task(p) || dl_se->dl_new)  		goto unlock; @@ -528,6 +531,7 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)  	sched_clock_tick();  	update_rq_clock(rq);  	dl_se->dl_throttled = 0; +	dl_se->dl_yielded = 0;  	if (p->on_rq) {  		enqueue_task_dl(rq, p, ENQUEUE_REPLENISH);  		if (task_has_dl_policy(rq->curr)) @@ -740,7 +744,7 @@ void inc_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)  	WARN_ON(!dl_prio(prio));  	dl_rq->dl_nr_running++; -	inc_nr_running(rq_of_dl_rq(dl_rq)); +	add_nr_running(rq_of_dl_rq(dl_rq), 1);  	inc_dl_deadline(dl_rq, deadline);  	inc_dl_migration(dl_se, dl_rq); @@ -754,7 +758,7 @@ void dec_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)  	WARN_ON(!dl_prio(prio));  	WARN_ON(!dl_rq->dl_nr_running);  	dl_rq->dl_nr_running--; -	dec_nr_running(rq_of_dl_rq(dl_rq)); +	sub_nr_running(rq_of_dl_rq(dl_rq), 1);  	dec_dl_deadline(dl_rq, dl_se->deadline);  	dec_dl_migration(dl_se, dl_rq); @@ -893,10 +897,10 @@ static void yield_task_dl(struct rq *rq)  	 * We make the task go to sleep until its current deadline by  	 * forcing its runtime to zero. This way, update_curr_dl() stops  	 * it and the bandwidth timer will wake it up and will give it -	 * new scheduling parameters (thanks to dl_new=1). +	 * new scheduling parameters (thanks to dl_yielded=1).  	 */  	if (p->dl.runtime > 0) { -		rq->curr->dl.dl_new = 1; +		rq->curr->dl.dl_yielded = 1;  		p->dl.runtime = 0;  	}  	update_curr_dl(rq); @@ -1021,8 +1025,17 @@ struct task_struct *pick_next_task_dl(struct rq *rq, struct task_struct *prev)  	dl_rq = &rq->dl; -	if (need_pull_dl_task(rq, prev)) +	if (need_pull_dl_task(rq, prev)) {  		pull_dl_task(rq); +		/* +		 * pull_rt_task() can drop (and re-acquire) rq->lock; this +		 * means a stop task can slip in, in which case we need to +		 * re-start task selection. +		 */ +		if (rq->stop && rq->stop->on_rq) +			return RETRY_TASK; +	} +  	/*  	 * When prev is DL, we may throttle it in put_prev_task().  	 * So, we update time before we check for dl_nr_running. diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 7e9bd0b1fa9e..9855e87d671a 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1095,6 +1095,34 @@ static void task_numa_assign(struct task_numa_env *env,  	env->best_cpu = env->dst_cpu;  } +static bool load_too_imbalanced(long orig_src_load, long orig_dst_load, +				long src_load, long dst_load, +				struct task_numa_env *env) +{ +	long imb, old_imb; + +	/* We care about the slope of the imbalance, not the direction. */ +	if (dst_load < src_load) +		swap(dst_load, src_load); + +	/* Is the difference below the threshold? */ +	imb = dst_load * 100 - src_load * env->imbalance_pct; +	if (imb <= 0) +		return false; + +	/* +	 * The imbalance is above the allowed threshold. +	 * Compare it with the old imbalance. +	 */ +	if (orig_dst_load < orig_src_load) +		swap(orig_dst_load, orig_src_load); + +	old_imb = orig_dst_load * 100 - orig_src_load * env->imbalance_pct; + +	/* Would this change make things worse? 
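dl_task_timer() above cannot assume task_rq(p) stays stable before the lock is taken, so it locks the run-queue it saw, re-checks that the task still belongs to it, and retries otherwise. A small sketch of that lock-then-revalidate loop with pthreads; the task and queue types are invented, and a real concurrent version would also need the p->rq read itself to be properly synchronized:

#include <pthread.h>
#include <stdio.h>

struct rq {
        pthread_mutex_t lock;
        int id;
};

struct task {
        struct rq *rq;  /* may be changed by a concurrent migration */
};

/* Lock the rq the task is on right now, even if it is being migrated. */
static struct rq *lock_task_rq(struct task *p)
{
        struct rq *rq;

        for (;;) {
                rq = p->rq;                     /* snapshot */
                pthread_mutex_lock(&rq->lock);
                if (rq == p->rq)                /* still the same queue? */
                        return rq;              /* locked and valid */
                pthread_mutex_unlock(&rq->lock);
                /* task moved between the snapshot and the lock: retry */
        }
}

int main(void)
{
        struct rq rq0 = { PTHREAD_MUTEX_INITIALIZER, 0 };
        struct task p = { &rq0 };
        struct rq *rq = lock_task_rq(&p);

        printf("holding lock of rq %d\n", rq->id);
        pthread_mutex_unlock(&rq->lock);
        return 0;
}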
*/ +	return (imb > old_imb); +} +  /*   * This checks if the overall compute and NUMA accesses of the system would   * be improved if the source tasks was migrated to the target dst_cpu taking @@ -1107,7 +1135,8 @@ static void task_numa_compare(struct task_numa_env *env,  	struct rq *src_rq = cpu_rq(env->src_cpu);  	struct rq *dst_rq = cpu_rq(env->dst_cpu);  	struct task_struct *cur; -	long dst_load, src_load; +	long orig_src_load, src_load; +	long orig_dst_load, dst_load;  	long load;  	long imp = (groupimp > 0) ? groupimp : taskimp; @@ -1181,13 +1210,13 @@ static void task_numa_compare(struct task_numa_env *env,  	 * In the overloaded case, try and keep the load balanced.  	 */  balance: -	dst_load = env->dst_stats.load; -	src_load = env->src_stats.load; +	orig_dst_load = env->dst_stats.load; +	orig_src_load = env->src_stats.load;  	/* XXX missing power terms */  	load = task_h_load(env->p); -	dst_load += load; -	src_load -= load; +	dst_load = orig_dst_load + load; +	src_load = orig_src_load - load;  	if (cur) {  		load = task_h_load(cur); @@ -1195,11 +1224,8 @@ balance:  		src_load += load;  	} -	/* make src_load the smaller */ -	if (dst_load < src_load) -		swap(dst_load, src_load); - -	if (src_load * env->imbalance_pct < dst_load * 100) +	if (load_too_imbalanced(orig_src_load, orig_dst_load, +				src_load, dst_load, env))  		goto unlock;  assign: @@ -1301,7 +1327,16 @@ static int task_numa_migrate(struct task_struct *p)  	if (env.best_cpu == -1)  		return -EAGAIN; -	sched_setnuma(p, env.dst_nid); +	/* +	 * If the task is part of a workload that spans multiple NUMA nodes, +	 * and is migrating into one of the workload's active nodes, remember +	 * this node as the task's preferred numa node, so the workload can +	 * settle down. +	 * A task that migrated to a second choice node will be better off +	 * trying for a better one later. Do not set the preferred node here. +	 */ +	if (p->numa_group && node_isset(env.dst_nid, p->numa_group->active_nodes)) +		sched_setnuma(p, env.dst_nid);  	/*  	 * Reset the scan period if the task is being rescheduled on an @@ -1326,12 +1361,15 @@ static int task_numa_migrate(struct task_struct *p)  /* Attempt to migrate a task to a CPU on the preferred node. 
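load_too_imbalanced() above compares the imbalance that a proposed NUMA move would create with the imbalance that already existed, so a move is rejected only when it pushes a spread that already exceeds imbalance_pct further in the wrong direction. A standalone version of the arithmetic with made-up sample loads:

#include <stdbool.h>
#include <stdio.h>

static void swap_long(long *a, long *b) { long t = *a; *a = *b; *b = t; }

static bool load_too_imbalanced(long orig_src, long orig_dst,
                                long src, long dst, int imbalance_pct)
{
        long imb, old_imb;

        /* Only the magnitude of the imbalance matters, not its direction. */
        if (dst < src)
                swap_long(&dst, &src);

        imb = dst * 100 - src * imbalance_pct;
        if (imb <= 0)
                return false;           /* within the allowed threshold */

        if (orig_dst < orig_src)
                swap_long(&orig_dst, &orig_src);
        old_imb = orig_dst * 100 - orig_src * imbalance_pct;

        return imb > old_imb;           /* reject only if things get worse */
}

int main(void)
{
        /* src node loaded 1000, dst 1400; moving 300 of load src -> dst. */
        printf("%s\n", load_too_imbalanced(1000, 1400, 700, 1700, 125) ?
               "too imbalanced, skip" : "acceptable, migrate");
        return 0;
}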
*/  static void numa_migrate_preferred(struct task_struct *p)  { +	unsigned long interval = HZ; +  	/* This task has no NUMA fault statistics yet */  	if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults_memory))  		return;  	/* Periodically retry migrating the task to the preferred node */ -	p->numa_migrate_retry = jiffies + HZ; +	interval = min(interval, msecs_to_jiffies(p->numa_scan_period) / 16); +	p->numa_migrate_retry = jiffies + interval;  	/* Success if task is already running on preferred CPU */  	if (task_node(p) == p->numa_preferred_nid) @@ -1497,7 +1535,7 @@ static void task_numa_placement(struct task_struct *p)  	/* If the task is part of a group prevent parallel updates to group stats */  	if (p->numa_group) {  		group_lock = &p->numa_group->lock; -		spin_lock(group_lock); +		spin_lock_irq(group_lock);  	}  	/* Find the node with the highest number of faults */ @@ -1572,7 +1610,7 @@ static void task_numa_placement(struct task_struct *p)  			}  		} -		spin_unlock(group_lock); +		spin_unlock_irq(group_lock);  	}  	/* Preferred node as the node with the most faults */ @@ -1677,7 +1715,8 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags,  	if (!join)  		return; -	double_lock(&my_grp->lock, &grp->lock); +	BUG_ON(irqs_disabled()); +	double_lock_irq(&my_grp->lock, &grp->lock);  	for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) {  		my_grp->faults[i] -= p->numa_faults_memory[i]; @@ -1691,7 +1730,7 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags,  	grp->nr_tasks++;  	spin_unlock(&my_grp->lock); -	spin_unlock(&grp->lock); +	spin_unlock_irq(&grp->lock);  	rcu_assign_pointer(p->numa_group, grp); @@ -1706,18 +1745,19 @@ no_join:  void task_numa_free(struct task_struct *p)  {  	struct numa_group *grp = p->numa_group; -	int i;  	void *numa_faults = p->numa_faults_memory; +	unsigned long flags; +	int i;  	if (grp) { -		spin_lock(&grp->lock); +		spin_lock_irqsave(&grp->lock, flags);  		for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)  			grp->faults[i] -= p->numa_faults_memory[i];  		grp->total_faults -= p->total_numa_faults;  		list_del(&p->numa_entry);  		grp->nr_tasks--; -		spin_unlock(&grp->lock); +		spin_unlock_irqrestore(&grp->lock, flags);  		rcu_assign_pointer(p->numa_group, NULL);  		put_numa_group(grp);  	} @@ -1737,6 +1777,7 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)  	struct task_struct *p = current;  	bool migrated = flags & TNF_MIGRATED;  	int cpu_node = task_node(current); +	int local = !!(flags & TNF_FAULT_LOCAL);  	int priv;  	if (!numabalancing_enabled) @@ -1785,6 +1826,17 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)  			task_numa_group(p, last_cpupid, flags, &priv);  	} +	/* +	 * If a workload spans multiple NUMA nodes, a shared fault that +	 * occurs wholly within the set of nodes that the workload is +	 * actively using should be counted as local. This allows the +	 * scan rate to slow down when a workload has settled down. 
+	 */ +	if (!priv && !local && p->numa_group && +			node_isset(cpu_node, p->numa_group->active_nodes) && +			node_isset(mem_node, p->numa_group->active_nodes)) +		local = 1; +  	task_numa_placement(p);  	/* @@ -1799,7 +1851,7 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)  	p->numa_faults_buffer_memory[task_faults_idx(mem_node, priv)] += pages;  	p->numa_faults_buffer_cpu[task_faults_idx(cpu_node, priv)] += pages; -	p->numa_faults_locality[!!(flags & TNF_FAULT_LOCAL)] += pages; +	p->numa_faults_locality[local] += pages;  }  static void reset_ptenuma_scan(struct task_struct *p) @@ -3128,7 +3180,7 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)  		 */  		if (!cfs_b->timer_active) {  			__refill_cfs_bandwidth_runtime(cfs_b); -			__start_cfs_bandwidth(cfs_b); +			__start_cfs_bandwidth(cfs_b, false);  		}  		if (cfs_b->runtime > 0) { @@ -3300,14 +3352,14 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)  	}  	if (!se) -		rq->nr_running -= task_delta; +		sub_nr_running(rq, task_delta);  	cfs_rq->throttled = 1;  	cfs_rq->throttled_clock = rq_clock(rq);  	raw_spin_lock(&cfs_b->lock);  	list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);  	if (!cfs_b->timer_active) -		__start_cfs_bandwidth(cfs_b); +		__start_cfs_bandwidth(cfs_b, false);  	raw_spin_unlock(&cfs_b->lock);  } @@ -3351,7 +3403,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)  	}  	if (!se) -		rq->nr_running += task_delta; +		add_nr_running(rq, task_delta);  	/* determine whether we need to wake up potentially idle cpu */  	if (rq->curr == rq->idle && rq->cfs.nr_running) @@ -3689,7 +3741,7 @@ static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)  }  /* requires cfs_b->lock, may release to reprogram timer */ -void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b) +void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b, bool force)  {  	/*  	 * The timer may be active because we're trying to set a new bandwidth @@ -3704,7 +3756,7 @@ void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)  		cpu_relax();  		raw_spin_lock(&cfs_b->lock);  		/* if someone else restarted the timer then we're done */ -		if (cfs_b->timer_active) +		if (!force && cfs_b->timer_active)  			return;  	} @@ -3883,7 +3935,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)  	if (!se) {  		update_rq_runnable_avg(rq, rq->nr_running); -		inc_nr_running(rq); +		add_nr_running(rq, 1);  	}  	hrtick_update(rq);  } @@ -3943,7 +3995,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)  	}  	if (!se) { -		dec_nr_running(rq); +		sub_nr_running(rq, 1);  		update_rq_runnable_avg(rq, 1);  	}  	hrtick_update(rq); @@ -4014,7 +4066,7 @@ static void record_wakee(struct task_struct *p)  	 * about the loss.  	 
*/  	if (jiffies > current->wakee_flip_decay_ts + HZ) { -		current->wakee_flips = 0; +		current->wakee_flips >>= 1;  		current->wakee_flip_decay_ts = jiffies;  	} @@ -4448,10 +4500,10 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f  			sd = tmp;  	} -	if (affine_sd) { -		if (cpu != prev_cpu && wake_affine(affine_sd, p, sync)) -			prev_cpu = cpu; +	if (affine_sd && cpu != prev_cpu && wake_affine(affine_sd, p, sync)) +		prev_cpu = cpu; +	if (sd_flag & SD_BALANCE_WAKE) {  		new_cpu = select_idle_sibling(p, prev_cpu);  		goto unlock;  	} @@ -4519,6 +4571,9 @@ migrate_task_rq_fair(struct task_struct *p, int next_cpu)  		atomic_long_add(se->avg.load_avg_contrib,  						&cfs_rq->removed_load);  	} + +	/* We have migrated, no longer consider this task hot */ +	se->exec_start = 0;  }  #endif /* CONFIG_SMP */ @@ -5069,6 +5124,7 @@ task_hot(struct task_struct *p, u64 now)  /* Returns true if the destination node has incurred more faults */  static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env)  { +	struct numa_group *numa_group = rcu_dereference(p->numa_group);  	int src_nid, dst_nid;  	if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults_memory || @@ -5082,21 +5138,29 @@ static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env)  	if (src_nid == dst_nid)  		return false; -	/* Always encourage migration to the preferred node. */ -	if (dst_nid == p->numa_preferred_nid) -		return true; +	if (numa_group) { +		/* Task is already in the group's interleave set. */ +		if (node_isset(src_nid, numa_group->active_nodes)) +			return false; + +		/* Task is moving into the group's interleave set. */ +		if (node_isset(dst_nid, numa_group->active_nodes)) +			return true; + +		return group_faults(p, dst_nid) > group_faults(p, src_nid); +	} -	/* If both task and group weight improve, this move is a winner. */ -	if (task_weight(p, dst_nid) > task_weight(p, src_nid) && -	    group_weight(p, dst_nid) > group_weight(p, src_nid)) +	/* Encourage migration to the preferred node. */ +	if (dst_nid == p->numa_preferred_nid)  		return true; -	return false; +	return task_faults(p, dst_nid) > task_faults(p, src_nid);  }  static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env)  { +	struct numa_group *numa_group = rcu_dereference(p->numa_group);  	int src_nid, dst_nid;  	if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER)) @@ -5111,16 +5175,23 @@ static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env)  	if (src_nid == dst_nid)  		return false; +	if (numa_group) { +		/* Task is moving within/into the group's interleave set. */ +		if (node_isset(dst_nid, numa_group->active_nodes)) +			return false; + +		/* Task is moving out of the group's interleave set. */ +		if (node_isset(src_nid, numa_group->active_nodes)) +			return true; + +		return group_faults(p, dst_nid) < group_faults(p, src_nid); +	} +  	/* Migrating away from the preferred node is always bad. */  	if (src_nid == p->numa_preferred_nid)  		return true; -	/* If either task or group weight get worse, don't do it. 
*/ -	if (task_weight(p, dst_nid) < task_weight(p, src_nid) || -	    group_weight(p, dst_nid) < group_weight(p, src_nid)) -		return true; - -	return false; +	return task_faults(p, dst_nid) < task_faults(p, src_nid);  }  #else @@ -5563,6 +5634,7 @@ static unsigned long scale_rt_power(int cpu)  {  	struct rq *rq = cpu_rq(cpu);  	u64 total, available, age_stamp, avg; +	s64 delta;  	/*  	 * Since we're reading these variables without serialization make sure @@ -5571,7 +5643,11 @@ static unsigned long scale_rt_power(int cpu)  	age_stamp = ACCESS_ONCE(rq->age_stamp);  	avg = ACCESS_ONCE(rq->rt_avg); -	total = sched_avg_period() + (rq_clock(rq) - age_stamp); +	delta = rq_clock(rq) - age_stamp; +	if (unlikely(delta < 0)) +		delta = 0; + +	total = sched_avg_period() + delta;  	if (unlikely(total < avg)) {  		/* Ensures that power won't end up being negative */ @@ -6639,27 +6715,62 @@ out:  	return ld_moved;  } +static inline unsigned long +get_sd_balance_interval(struct sched_domain *sd, int cpu_busy) +{ +	unsigned long interval = sd->balance_interval; + +	if (cpu_busy) +		interval *= sd->busy_factor; + +	/* scale ms to jiffies */ +	interval = msecs_to_jiffies(interval); +	interval = clamp(interval, 1UL, max_load_balance_interval); + +	return interval; +} + +static inline void +update_next_balance(struct sched_domain *sd, int cpu_busy, unsigned long *next_balance) +{ +	unsigned long interval, next; + +	interval = get_sd_balance_interval(sd, cpu_busy); +	next = sd->last_balance + interval; + +	if (time_after(*next_balance, next)) +		*next_balance = next; +} +  /*   * idle_balance is called by schedule() if this_cpu is about to become   * idle. Attempts to pull tasks from other CPUs.   */  static int idle_balance(struct rq *this_rq)  { +	unsigned long next_balance = jiffies + HZ; +	int this_cpu = this_rq->cpu;  	struct sched_domain *sd;  	int pulled_task = 0; -	unsigned long next_balance = jiffies + HZ;  	u64 curr_cost = 0; -	int this_cpu = this_rq->cpu;  	idle_enter_fair(this_rq); +  	/*  	 * We must set idle_stamp _before_ calling idle_balance(), such that we  	 * measure the duration of idle_balance() as idle time.  	 */  	this_rq->idle_stamp = rq_clock(this_rq); -	if (this_rq->avg_idle < sysctl_sched_migration_cost) +	if (this_rq->avg_idle < sysctl_sched_migration_cost) { +		rcu_read_lock(); +		sd = rcu_dereference_check_sched_domain(this_rq->sd); +		if (sd) +			update_next_balance(sd, 0, &next_balance); +		rcu_read_unlock(); +  		goto out; +	}  	/*  	 * Drop the rq->lock, but keep IRQ/preempt disabled. 
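The two helpers factored out above, get_sd_balance_interval() and update_next_balance(), centralize the "scale the per-domain interval by a busy factor, clamp it, and keep the earliest next-balance time" logic that idle_balance() and rebalance_domains() previously duplicated. A minimal standalone sketch of that shape follows; it is plain userspace C, assumes HZ=1000 so the ms-to-jiffies step is a no-op, and ignores jiffies wraparound (the kernel uses time_after() for that), so read it as an illustration rather than the scheduler code itself.

	#include <stdio.h>

	#define MAX_LB_INTERVAL	100UL	/* stand-in for max_load_balance_interval */

	static unsigned long clamp_ul(unsigned long v, unsigned long lo, unsigned long hi)
	{
		if (v < lo)
			return lo;
		return v > hi ? hi : v;
	}

	/* Mirror of the helper's shape: use a longer interval while the CPU is busy. */
	static unsigned long balance_interval(unsigned long base_ms,
					      unsigned int busy_factor, int cpu_busy)
	{
		unsigned long interval = base_ms;

		if (cpu_busy)
			interval *= busy_factor;

		/* "msecs_to_jiffies" is a no-op under the HZ=1000 assumption */
		return clamp_ul(interval, 1, MAX_LB_INTERVAL);
	}

	/* Keep the earliest pending balance deadline (wraparound ignored here). */
	static void update_next(unsigned long last, unsigned long interval,
				unsigned long *next_balance)
	{
		unsigned long next = last + interval;

		if (next < *next_balance)
			*next_balance = next;
	}

	int main(void)
	{
		unsigned long next_balance = 1250;	/* e.g. jiffies + HZ/4 */

		update_next(900, balance_interval(8, 32, 1), &next_balance);
		printf("next balance at jiffy %lu\n", next_balance);	/* prints 1000 */
		return 0;
	}
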
@@ -6669,20 +6780,20 @@ static int idle_balance(struct rq *this_rq)  	update_blocked_averages(this_cpu);  	rcu_read_lock();  	for_each_domain(this_cpu, sd) { -		unsigned long interval;  		int continue_balancing = 1;  		u64 t0, domain_cost;  		if (!(sd->flags & SD_LOAD_BALANCE))  			continue; -		if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) +		if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) { +			update_next_balance(sd, 0, &next_balance);  			break; +		}  		if (sd->flags & SD_BALANCE_NEWIDLE) {  			t0 = sched_clock_cpu(this_cpu); -			/* If we've pulled tasks over stop searching: */  			pulled_task = load_balance(this_cpu, this_rq,  						   sd, CPU_NEWLY_IDLE,  						   &continue_balancing); @@ -6694,41 +6805,37 @@ static int idle_balance(struct rq *this_rq)  			curr_cost += domain_cost;  		} -		interval = msecs_to_jiffies(sd->balance_interval); -		if (time_after(next_balance, sd->last_balance + interval)) -			next_balance = sd->last_balance + interval; -		if (pulled_task) +		update_next_balance(sd, 0, &next_balance); + +		/* +		 * Stop searching for tasks to pull if there are +		 * now runnable tasks on this rq. +		 */ +		if (pulled_task || this_rq->nr_running > 0)  			break;  	}  	rcu_read_unlock();  	raw_spin_lock(&this_rq->lock); +	if (curr_cost > this_rq->max_idle_balance_cost) +		this_rq->max_idle_balance_cost = curr_cost; +  	/* -	 * While browsing the domains, we released the rq lock. -	 * A task could have be enqueued in the meantime +	 * While browsing the domains, we released the rq lock, a task could +	 * have been enqueued in the meantime. Since we're not going idle, +	 * pretend we pulled a task.  	 */ -	if (this_rq->cfs.h_nr_running && !pulled_task) { +	if (this_rq->cfs.h_nr_running && !pulled_task)  		pulled_task = 1; -		goto out; -	} -	if (pulled_task || time_after(jiffies, this_rq->next_balance)) { -		/* -		 * We are going idle. next_balance may be set based on -		 * a busy processor. So reset next_balance. -		 */ +out: +	/* Move the next balance forward */ +	if (time_after(this_rq->next_balance, next_balance))  		this_rq->next_balance = next_balance; -	} - -	if (curr_cost > this_rq->max_idle_balance_cost) -		this_rq->max_idle_balance_cost = curr_cost; -out:  	/* Is there a task of a high priority class? */ -	if (this_rq->nr_running != this_rq->cfs.h_nr_running && -	    (this_rq->dl.dl_nr_running || -	     (this_rq->rt.rt_nr_running && !rt_rq_throttled(&this_rq->rt)))) +	if (this_rq->nr_running != this_rq->cfs.h_nr_running)  		pulled_task = -1;  	if (pulled_task) { @@ -7009,16 +7116,9 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)  			break;  		} -		interval = sd->balance_interval; -		if (idle != CPU_IDLE) -			interval *= sd->busy_factor; - -		/* scale ms to jiffies */ -		interval = msecs_to_jiffies(interval); -		interval = clamp(interval, 1UL, max_load_balance_interval); +		interval = get_sd_balance_interval(sd, idle != CPU_IDLE);  		need_serialize = sd->flags & SD_SERIALIZE; -  		if (need_serialize) {  			if (!spin_trylock(&balancing))  				goto out; @@ -7034,6 +7134,7 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)  				idle = idle_cpu(cpu) ? 
CPU_IDLE : CPU_NOT_IDLE;  			}  			sd->last_balance = jiffies; +			interval = get_sd_balance_interval(sd, idle != CPU_IDLE);  		}  		if (need_serialize)  			spin_unlock(&balancing); diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 8f4390a079c7..25b9423abce9 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -67,24 +67,21 @@ void __weak arch_cpu_idle(void)   * cpuidle_idle_call - the main idle function   *   * NOTE: no locks or semaphores should be used here - * return non-zero on failure   */ -static int cpuidle_idle_call(void) +static void cpuidle_idle_call(void)  {  	struct cpuidle_device *dev = __this_cpu_read(cpuidle_devices);  	struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev); -	int next_state, entered_state, ret; +	int next_state, entered_state;  	bool broadcast;  	/*  	 * Check if the idle task must be rescheduled. If it is the -	 * case, exit the function after re-enabling the local irq and -	 * set again the polling flag +	 * case, exit the function after re-enabling the local irq.  	 */ -	if (current_clr_polling_and_test()) { +	if (need_resched()) {  		local_irq_enable(); -		__current_set_polling(); -		return 0; +		return;  	}  	/* @@ -101,96 +98,79 @@ static int cpuidle_idle_call(void)  	rcu_idle_enter();  	/* -	 * Check if the cpuidle framework is ready, otherwise fallback -	 * to the default arch specific idle method +	 * Ask the cpuidle framework to choose a convenient idle state. +	 * Fall back to the default arch idle method on errors.  	 */ -	ret = cpuidle_enabled(drv, dev); - -	if (!ret) { +	next_state = cpuidle_select(drv, dev); +	if (next_state < 0) { +use_default:  		/* -		 * Ask the governor to choose an idle state it thinks -		 * it is convenient to go to. There is *always* a -		 * convenient idle state +		 * We can't use the cpuidle framework, let's use the default +		 * idle routine.  		 */ -		next_state = cpuidle_select(drv, dev); - -		/* -		 * The idle task must be scheduled, it is pointless to -		 * go to idle, just update no idle residency and get -		 * out of this function -		 */ -		if (current_clr_polling_and_test()) { -			dev->last_residency = 0; -			entered_state = next_state; +		if (current_clr_polling_and_test())  			local_irq_enable(); -		} else { -			broadcast = !!(drv->states[next_state].flags & -				       CPUIDLE_FLAG_TIMER_STOP); - -			if (broadcast) -				/* -				 * Tell the time framework to switch -				 * to a broadcast timer because our -				 * local timer will be shutdown. If a -				 * local timer is used from another -				 * cpu as a broadcast timer, this call -				 * may fail if it is not available -				 */ -				ret = clockevents_notify( -					CLOCK_EVT_NOTIFY_BROADCAST_ENTER, -					&dev->cpu); - -			if (!ret) { -				trace_cpu_idle_rcuidle(next_state, dev->cpu); - -				/* -				 * Enter the idle state previously -				 * returned by the governor -				 * decision. 
This function will block -				 * until an interrupt occurs and will -				 * take care of re-enabling the local -				 * interrupts -				 */ -				entered_state = cpuidle_enter(drv, dev, -							      next_state); - -				trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, -						       dev->cpu); - -				if (broadcast) -					clockevents_notify( -						CLOCK_EVT_NOTIFY_BROADCAST_EXIT, -						&dev->cpu); - -				/* -				 * Give the governor an opportunity to reflect on the -				 * outcome -				 */ -				cpuidle_reflect(dev, entered_state); -			} -		} +		else +			arch_cpu_idle(); + +		goto exit_idle;  	} +  	/* -	 * We can't use the cpuidle framework, let's use the default -	 * idle routine +	 * The idle task must be scheduled, it is pointless to +	 * go to idle, just update no idle residency and get +	 * out of this function  	 */ -	if (ret) -		arch_cpu_idle(); +	if (current_clr_polling_and_test()) { +		dev->last_residency = 0; +		entered_state = next_state; +		local_irq_enable(); +		goto exit_idle; +	} + +	broadcast = !!(drv->states[next_state].flags & CPUIDLE_FLAG_TIMER_STOP); +	/* +	 * Tell the time framework to switch to a broadcast timer +	 * because our local timer will be shutdown. If a local timer +	 * is used from another cpu as a broadcast timer, this call may +	 * fail if it is not available +	 */ +	if (broadcast && +	    clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &dev->cpu)) +		goto use_default; + +	trace_cpu_idle_rcuidle(next_state, dev->cpu); + +	/* +	 * Enter the idle state previously returned by the governor decision. +	 * This function will block until an interrupt occurs and will take +	 * care of re-enabling the local interrupts +	 */ +	entered_state = cpuidle_enter(drv, dev, next_state); + +	trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, dev->cpu); + +	if (broadcast) +		clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &dev->cpu); + +	/* +	 * Give the governor an opportunity to reflect on the outcome +	 */ +	cpuidle_reflect(dev, entered_state); + +exit_idle:  	__current_set_polling();  	/* -	 * It is up to the idle functions to enable back the local -	 * interrupt +	 * It is up to the idle functions to reenable local interrupts  	 */  	if (WARN_ON_ONCE(irqs_disabled()))  		local_irq_enable();  	rcu_idle_exit();  	start_critical_timings(); - -	return 0;  }  /* diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index d8cdf1618551..b3512f1afce9 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -79,6 +79,8 @@ void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)  	rt_rq->overloaded = 0;  	plist_head_init(&rt_rq->pushable_tasks);  #endif +	/* We start is dequeued state, because no RT tasks are queued */ +	rt_rq->rt_queued = 0;  	rt_rq->rt_time = 0;  	rt_rq->rt_throttled = 0; @@ -112,6 +114,13 @@ static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)  	return rt_se->rt_rq;  } +static inline struct rq *rq_of_rt_se(struct sched_rt_entity *rt_se) +{ +	struct rt_rq *rt_rq = rt_se->rt_rq; + +	return rt_rq->rq; +} +  void free_rt_sched_group(struct task_group *tg)  {  	int i; @@ -211,10 +220,16 @@ static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)  	return container_of(rt_rq, struct rq, rt);  } -static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se) +static inline struct rq *rq_of_rt_se(struct sched_rt_entity *rt_se)  {  	struct task_struct *p = rt_task_of(rt_se); -	struct rq *rq = task_rq(p); + +	return task_rq(p); +} + +static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se) +{ +	struct rq *rq = rq_of_rt_se(rt_se);  	return &rq->rt;  } 
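The rewritten cpuidle_idle_call() above replaces the deeply nested error handling with a single use_default fallback path: if state selection fails, or the broadcast-timer switch required by the chosen state fails, the CPU drops back to the plain architecture idle routine. The sketch below models only that control flow; every function in it is a hypothetical stub standing in for the cpuidle/clockevents calls shown in the hunk, not the real API.

	#include <stdio.h>
	#include <stdbool.h>

	/* Hypothetical stand-ins for cpuidle_select()/cpuidle_enter()/arch idle. */
	static int  pick_state(void)            { return -1; /* pretend selection failed */ }
	static int  enter_state(int state)      { printf("enter state %d\n", state); return state; }
	static void default_idle(void)          { printf("default idle\n"); }
	static bool state_stops_timer(int state){ return state > 0; }
	static int  broadcast_enter(void)       { return 0; /* 0 == success */ }
	static void broadcast_exit(void)        { }

	static void idle_once(void)
	{
		int state = pick_state();

		if (state < 0)
			goto use_default;

		/* Only switch to a broadcast timer if this state shuts the local one down. */
		if (state_stops_timer(state) && broadcast_enter())
			goto use_default;

		enter_state(state);

		if (state_stops_timer(state))
			broadcast_exit();
		return;

	use_default:
		/* Can't use the framework: fall back to the simple idle routine. */
		default_idle();
	}

	int main(void)
	{
		idle_once();
		return 0;
	}
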
@@ -391,6 +406,9 @@ static inline void set_post_schedule(struct rq *rq)  }  #endif /* CONFIG_SMP */ +static void enqueue_top_rt_rq(struct rt_rq *rt_rq); +static void dequeue_top_rt_rq(struct rt_rq *rt_rq); +  static inline int on_rt_rq(struct sched_rt_entity *rt_se)  {  	return !list_empty(&rt_se->run_list); @@ -452,8 +470,11 @@ static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)  	rt_se = rt_rq->tg->rt_se[cpu];  	if (rt_rq->rt_nr_running) { -		if (rt_se && !on_rt_rq(rt_se)) +		if (!rt_se) +			enqueue_top_rt_rq(rt_rq); +		else if (!on_rt_rq(rt_se))  			enqueue_rt_entity(rt_se, false); +  		if (rt_rq->highest_prio.curr < curr->prio)  			resched_task(curr);  	} @@ -466,10 +487,17 @@ static void sched_rt_rq_dequeue(struct rt_rq *rt_rq)  	rt_se = rt_rq->tg->rt_se[cpu]; -	if (rt_se && on_rt_rq(rt_se)) +	if (!rt_se) +		dequeue_top_rt_rq(rt_rq); +	else if (on_rt_rq(rt_se))  		dequeue_rt_entity(rt_se);  } +static inline int rt_rq_throttled(struct rt_rq *rt_rq) +{ +	return rt_rq->rt_throttled && !rt_rq->rt_nr_boosted; +} +  static int rt_se_boosted(struct sched_rt_entity *rt_se)  {  	struct rt_rq *rt_rq = group_rt_rq(rt_se); @@ -532,12 +560,23 @@ static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)  static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq)  { -	if (rt_rq->rt_nr_running) -		resched_task(rq_of_rt_rq(rt_rq)->curr); +	struct rq *rq = rq_of_rt_rq(rt_rq); + +	if (!rt_rq->rt_nr_running) +		return; + +	enqueue_top_rt_rq(rt_rq); +	resched_task(rq->curr);  }  static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq)  { +	dequeue_top_rt_rq(rt_rq); +} + +static inline int rt_rq_throttled(struct rt_rq *rt_rq) +{ +	return rt_rq->rt_throttled;  }  static inline const struct cpumask *sched_rt_period_mask(void) @@ -851,14 +890,8 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq)  		 * but accrue some time due to boosting.  		 
*/  		if (likely(rt_b->rt_runtime)) { -			static bool once = false; -  			rt_rq->rt_throttled = 1; - -			if (!once) { -				once = true; -				printk_sched("sched: RT throttling activated\n"); -			} +			printk_deferred_once("sched: RT throttling activated\n");  		} else {  			/*  			 * In case we did anyway, make it go away, @@ -922,6 +955,38 @@ static void update_curr_rt(struct rq *rq)  	}  } +static void +dequeue_top_rt_rq(struct rt_rq *rt_rq) +{ +	struct rq *rq = rq_of_rt_rq(rt_rq); + +	BUG_ON(&rq->rt != rt_rq); + +	if (!rt_rq->rt_queued) +		return; + +	BUG_ON(!rq->nr_running); + +	sub_nr_running(rq, rt_rq->rt_nr_running); +	rt_rq->rt_queued = 0; +} + +static void +enqueue_top_rt_rq(struct rt_rq *rt_rq) +{ +	struct rq *rq = rq_of_rt_rq(rt_rq); + +	BUG_ON(&rq->rt != rt_rq); + +	if (rt_rq->rt_queued) +		return; +	if (rt_rq_throttled(rt_rq) || !rt_rq->rt_nr_running) +		return; + +	add_nr_running(rq, rt_rq->rt_nr_running); +	rt_rq->rt_queued = 1; +} +  #if defined CONFIG_SMP  static void @@ -1045,12 +1110,23 @@ void dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) {}  #endif /* CONFIG_RT_GROUP_SCHED */  static inline +unsigned int rt_se_nr_running(struct sched_rt_entity *rt_se) +{ +	struct rt_rq *group_rq = group_rt_rq(rt_se); + +	if (group_rq) +		return group_rq->rt_nr_running; +	else +		return 1; +} + +static inline  void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)  {  	int prio = rt_se_prio(rt_se);  	WARN_ON(!rt_prio(prio)); -	rt_rq->rt_nr_running++; +	rt_rq->rt_nr_running += rt_se_nr_running(rt_se);  	inc_rt_prio(rt_rq, prio);  	inc_rt_migration(rt_se, rt_rq); @@ -1062,7 +1138,7 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)  {  	WARN_ON(!rt_prio(rt_se_prio(rt_se)));  	WARN_ON(!rt_rq->rt_nr_running); -	rt_rq->rt_nr_running--; +	rt_rq->rt_nr_running -= rt_se_nr_running(rt_se);  	dec_rt_prio(rt_rq, rt_se_prio(rt_se));  	dec_rt_migration(rt_se, rt_rq); @@ -1119,6 +1195,8 @@ static void dequeue_rt_stack(struct sched_rt_entity *rt_se)  		back = rt_se;  	} +	dequeue_top_rt_rq(rt_rq_of_se(back)); +  	for (rt_se = back; rt_se; rt_se = rt_se->back) {  		if (on_rt_rq(rt_se))  			__dequeue_rt_entity(rt_se); @@ -1127,13 +1205,18 @@ static void dequeue_rt_stack(struct sched_rt_entity *rt_se)  static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head)  { +	struct rq *rq = rq_of_rt_se(rt_se); +  	dequeue_rt_stack(rt_se);  	for_each_sched_rt_entity(rt_se)  		__enqueue_rt_entity(rt_se, head); +	enqueue_top_rt_rq(&rq->rt);  }  static void dequeue_rt_entity(struct sched_rt_entity *rt_se)  { +	struct rq *rq = rq_of_rt_se(rt_se); +  	dequeue_rt_stack(rt_se);  	for_each_sched_rt_entity(rt_se) { @@ -1142,6 +1225,7 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se)  		if (rt_rq && rt_rq->rt_nr_running)  			__enqueue_rt_entity(rt_se, false);  	} +	enqueue_top_rt_rq(&rq->rt);  }  /* @@ -1159,8 +1243,6 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)  	if (!task_current(rq, p) && p->nr_cpus_allowed > 1)  		enqueue_pushable_task(rq, p); - -	inc_nr_running(rq);  }  static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags) @@ -1171,8 +1253,6 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)  	dequeue_rt_entity(rt_se);  	dequeue_pushable_task(rq, p); - -	dec_nr_running(rq);  }  /* @@ -1362,10 +1442,11 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev)  		pull_rt_task(rq);  		/*  		 * pull_rt_task() can drop (and re-acquire) rq->lock; this -		 * means a 
dl task can slip in, in which case we need to -		 * re-start task selection. +		 * means a dl or stop task can slip in, in which case we need +		 * to re-start task selection.  		 */ -		if (unlikely(rq->dl.dl_nr_running)) +		if (unlikely((rq->stop && rq->stop->on_rq) || +			     rq->dl.dl_nr_running))  			return RETRY_TASK;  	} @@ -1376,10 +1457,7 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev)  	if (prev->sched_class == &rt_sched_class)  		update_curr_rt(rq); -	if (!rt_rq->rt_nr_running) -		return NULL; - -	if (rt_rq_throttled(rt_rq)) +	if (!rt_rq->rt_queued)  		return NULL;  	put_prev_task(rq, prev); @@ -1891,9 +1969,9 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p)  	 */  	if (p->on_rq && rq->curr != p) {  #ifdef CONFIG_SMP -		if (rq->rt.overloaded && push_rt_task(rq) && +		if (p->nr_cpus_allowed > 1 && rq->rt.overloaded &&  		    /* Don't resched if we changed runqueues */ -		    rq != task_rq(p)) +		    push_rt_task(rq) && rq != task_rq(p))  			check_resched = 0;  #endif /* CONFIG_SMP */  		if (check_resched && p->prio < rq->curr->prio) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index c9007f28d3a2..e47679b04d16 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -278,7 +278,7 @@ extern void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b);  extern int sched_group_set_shares(struct task_group *tg, unsigned long shares);  extern void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b); -extern void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b); +extern void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b, bool force);  extern void unthrottle_cfs_rq(struct cfs_rq *cfs_rq);  extern void free_rt_sched_group(struct task_group *tg); @@ -409,6 +409,8 @@ struct rt_rq {  	int overloaded;  	struct plist_head pushable_tasks;  #endif +	int rt_queued; +  	int rt_throttled;  	u64 rt_time;  	u64 rt_runtime; @@ -423,18 +425,6 @@ struct rt_rq {  #endif  }; -#ifdef CONFIG_RT_GROUP_SCHED -static inline int rt_rq_throttled(struct rt_rq *rt_rq) -{ -	return rt_rq->rt_throttled && !rt_rq->rt_nr_boosted; -} -#else -static inline int rt_rq_throttled(struct rt_rq *rt_rq) -{ -	return rt_rq->rt_throttled; -} -#endif -  /* Deadline class' related fields in a runqueue */  struct dl_rq {  	/* runqueue is an rbtree, ordered by deadline */ @@ -1216,12 +1206,14 @@ extern void update_idle_cpu_load(struct rq *this_rq);  extern void init_task_runnable_average(struct task_struct *p); -static inline void inc_nr_running(struct rq *rq) +static inline void add_nr_running(struct rq *rq, unsigned count)  { -	rq->nr_running++; +	unsigned prev_nr = rq->nr_running; + +	rq->nr_running = prev_nr + count;  #ifdef CONFIG_NO_HZ_FULL -	if (rq->nr_running == 2) { +	if (prev_nr < 2 && rq->nr_running >= 2) {  		if (tick_nohz_full_cpu(rq->cpu)) {  			/* Order rq->nr_running write against the IPI */  			smp_wmb(); @@ -1231,9 +1223,9 @@ static inline void inc_nr_running(struct rq *rq)  #endif  } -static inline void dec_nr_running(struct rq *rq) +static inline void sub_nr_running(struct rq *rq, unsigned count)  { -	rq->nr_running--; +	rq->nr_running -= count;  }  static inline void rq_last_tick_reset(struct rq *rq) @@ -1385,6 +1377,15 @@ static inline void double_lock(spinlock_t *l1, spinlock_t *l2)  	spin_lock_nested(l2, SINGLE_DEPTH_NESTING);  } +static inline void double_lock_irq(spinlock_t *l1, spinlock_t *l2) +{ +	if (l1 > l2) +		swap(l1, l2); + +	spin_lock_irq(l1); +	spin_lock_nested(l2, SINGLE_DEPTH_NESTING); +} +  static inline void double_raw_lock(raw_spinlock_t 
*l1, raw_spinlock_t *l2)  {  	if (l1 > l2) diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index d6ce65dde541..bfe0edadbfbb 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c @@ -41,13 +41,13 @@ pick_next_task_stop(struct rq *rq, struct task_struct *prev)  static void  enqueue_task_stop(struct rq *rq, struct task_struct *p, int flags)  { -	inc_nr_running(rq); +	add_nr_running(rq, 1);  }  static void  dequeue_task_stop(struct rq *rq, struct task_struct *p, int flags)  { -	dec_nr_running(rq); +	sub_nr_running(rq, 1);  }  static void yield_task_stop(struct rq *rq) diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 590c37925084..301bbc24739c 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -39,7 +39,7 @@   *         is only needed for handling filters shared across tasks.   * @prev: points to a previously installed, or inherited, filter   * @len: the number of instructions in the program - * @insns: the BPF program instructions to evaluate + * @insnsi: the BPF program instructions to evaluate   *   * seccomp_filter objects are organized in a tree linked via the @prev   * pointer.  For any task, it appears to be a singly-linked list starting @@ -54,8 +54,7 @@  struct seccomp_filter {  	atomic_t usage;  	struct seccomp_filter *prev; -	unsigned short len;  /* Instruction count */ -	struct sock_filter_int insnsi[]; +	struct sk_filter *prog;  };  /* Limit any path through the tree to 256KB worth of instructions. */ @@ -104,60 +103,59 @@ static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen)  		u32 k = ftest->k;  		switch (code) { -		case BPF_S_LD_W_ABS: +		case BPF_LD | BPF_W | BPF_ABS:  			ftest->code = BPF_LDX | BPF_W | BPF_ABS;  			/* 32-bit aligned and not out of bounds. */  			if (k >= sizeof(struct seccomp_data) || k & 3)  				return -EINVAL;  			continue; -		case BPF_S_LD_W_LEN: +		case BPF_LD | BPF_W | BPF_LEN:  			ftest->code = BPF_LD | BPF_IMM;  			ftest->k = sizeof(struct seccomp_data);  			continue; -		case BPF_S_LDX_W_LEN: +		case BPF_LDX | BPF_W | BPF_LEN:  			ftest->code = BPF_LDX | BPF_IMM;  			ftest->k = sizeof(struct seccomp_data);  			continue;  		/* Explicitly include allowed calls. 
*/ -		case BPF_S_RET_K: -		case BPF_S_RET_A: -		case BPF_S_ALU_ADD_K: -		case BPF_S_ALU_ADD_X: -		case BPF_S_ALU_SUB_K: -		case BPF_S_ALU_SUB_X: -		case BPF_S_ALU_MUL_K: -		case BPF_S_ALU_MUL_X: -		case BPF_S_ALU_DIV_X: -		case BPF_S_ALU_AND_K: -		case BPF_S_ALU_AND_X: -		case BPF_S_ALU_OR_K: -		case BPF_S_ALU_OR_X: -		case BPF_S_ALU_XOR_K: -		case BPF_S_ALU_XOR_X: -		case BPF_S_ALU_LSH_K: -		case BPF_S_ALU_LSH_X: -		case BPF_S_ALU_RSH_K: -		case BPF_S_ALU_RSH_X: -		case BPF_S_ALU_NEG: -		case BPF_S_LD_IMM: -		case BPF_S_LDX_IMM: -		case BPF_S_MISC_TAX: -		case BPF_S_MISC_TXA: -		case BPF_S_ALU_DIV_K: -		case BPF_S_LD_MEM: -		case BPF_S_LDX_MEM: -		case BPF_S_ST: -		case BPF_S_STX: -		case BPF_S_JMP_JA: -		case BPF_S_JMP_JEQ_K: -		case BPF_S_JMP_JEQ_X: -		case BPF_S_JMP_JGE_K: -		case BPF_S_JMP_JGE_X: -		case BPF_S_JMP_JGT_K: -		case BPF_S_JMP_JGT_X: -		case BPF_S_JMP_JSET_K: -		case BPF_S_JMP_JSET_X: -			sk_decode_filter(ftest, ftest); +		case BPF_RET | BPF_K: +		case BPF_RET | BPF_A: +		case BPF_ALU | BPF_ADD | BPF_K: +		case BPF_ALU | BPF_ADD | BPF_X: +		case BPF_ALU | BPF_SUB | BPF_K: +		case BPF_ALU | BPF_SUB | BPF_X: +		case BPF_ALU | BPF_MUL | BPF_K: +		case BPF_ALU | BPF_MUL | BPF_X: +		case BPF_ALU | BPF_DIV | BPF_K: +		case BPF_ALU | BPF_DIV | BPF_X: +		case BPF_ALU | BPF_AND | BPF_K: +		case BPF_ALU | BPF_AND | BPF_X: +		case BPF_ALU | BPF_OR | BPF_K: +		case BPF_ALU | BPF_OR | BPF_X: +		case BPF_ALU | BPF_XOR | BPF_K: +		case BPF_ALU | BPF_XOR | BPF_X: +		case BPF_ALU | BPF_LSH | BPF_K: +		case BPF_ALU | BPF_LSH | BPF_X: +		case BPF_ALU | BPF_RSH | BPF_K: +		case BPF_ALU | BPF_RSH | BPF_X: +		case BPF_ALU | BPF_NEG: +		case BPF_LD | BPF_IMM: +		case BPF_LDX | BPF_IMM: +		case BPF_MISC | BPF_TAX: +		case BPF_MISC | BPF_TXA: +		case BPF_LD | BPF_MEM: +		case BPF_LDX | BPF_MEM: +		case BPF_ST: +		case BPF_STX: +		case BPF_JMP | BPF_JA: +		case BPF_JMP | BPF_JEQ | BPF_K: +		case BPF_JMP | BPF_JEQ | BPF_X: +		case BPF_JMP | BPF_JGE | BPF_K: +		case BPF_JMP | BPF_JGE | BPF_X: +		case BPF_JMP | BPF_JGT | BPF_K: +		case BPF_JMP | BPF_JGT | BPF_X: +		case BPF_JMP | BPF_JSET | BPF_K: +		case BPF_JMP | BPF_JSET | BPF_X:  			continue;  		default:  			return -EINVAL; @@ -189,7 +187,8 @@ static u32 seccomp_run_filters(int syscall)  	 * value always takes priority (ignoring the DATA).  	 */  	for (f = current->seccomp.filter; f; f = f->prev) { -		u32 cur_ret = sk_run_filter_int_seccomp(&sd, f->insnsi); +		u32 cur_ret = SK_RUN_FILTER(f->prog, (void *)&sd); +  		if ((cur_ret & SECCOMP_RET_ACTION) < (ret & SECCOMP_RET_ACTION))  			ret = cur_ret;  	} @@ -215,12 +214,12 @@ static long seccomp_attach_filter(struct sock_fprog *fprog)  		return -EINVAL;  	for (filter = current->seccomp.filter; filter; filter = filter->prev) -		total_insns += filter->len + 4;  /* include a 4 instr penalty */ +		total_insns += filter->prog->len + 4;  /* include a 4 instr penalty */  	if (total_insns > MAX_INSNS_PER_PATH)  		return -ENOMEM;  	/* -	 * Installing a seccomp filter requires that the task have +	 * Installing a seccomp filter requires that the task has  	 * CAP_SYS_ADMIN in its namespace or be running with no_new_privs.  	 * This avoids scenarios where unprivileged tasks can affect the  	 * behavior of privileged children. 
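The seccomp hunk above drops the BPF_S_* pseudo-opcodes in favour of composing the classic BPF code directly from its class, size and mode fields (e.g. BPF_LD | BPF_W | BPF_ABS). The toy program below shows that composition using locally defined XBPF_* macros whose values mirror the classic encoding; in real code you would include <linux/filter.h> and use the kernel-provided names rather than redefining them.

	#include <stdio.h>

	#define XBPF_CLASS(code)	((code) & 0x07)	/* instruction class field */
	#define XBPF_SIZE(code)		((code) & 0x18)	/* operand size field */
	#define XBPF_MODE(code)		((code) & 0xe0)	/* addressing mode field */

	#define XBPF_LD		0x00	/* class: load into accumulator */
	#define XBPF_W		0x00	/* size:  32-bit word */
	#define XBPF_ABS	0x20	/* mode:  absolute offset */

	int main(void)
	{
		unsigned int code = XBPF_LD | XBPF_W | XBPF_ABS;	/* "BPF_LD | BPF_W | BPF_ABS" */

		printf("code=0x%02x class=0x%02x size=0x%02x mode=0x%02x\n",
		       code, XBPF_CLASS(code), XBPF_SIZE(code), XBPF_MODE(code));
		return 0;
	}
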
@@ -255,18 +254,26 @@ static long seccomp_attach_filter(struct sock_fprog *fprog)  		goto free_prog;  	/* Allocate a new seccomp_filter */ -	filter = kzalloc(sizeof(struct seccomp_filter) + -			 sizeof(struct sock_filter_int) * new_len, +	ret = -ENOMEM; +	filter = kzalloc(sizeof(struct seccomp_filter),  			 GFP_KERNEL|__GFP_NOWARN);  	if (!filter)  		goto free_prog; -	ret = sk_convert_filter(fp, fprog->len, filter->insnsi, &new_len); -	if (ret) +	filter->prog = kzalloc(sk_filter_size(new_len), +			       GFP_KERNEL|__GFP_NOWARN); +	if (!filter->prog)  		goto free_filter; +	ret = sk_convert_filter(fp, fprog->len, filter->prog->insnsi, &new_len); +	if (ret) +		goto free_filter_prog; +	kfree(fp); +  	atomic_set(&filter->usage, 1); -	filter->len = new_len; +	filter->prog->len = new_len; + +	sk_filter_select_runtime(filter->prog);  	/*  	 * If there is an existing filter, make it the prev and don't drop its @@ -276,6 +283,8 @@ static long seccomp_attach_filter(struct sock_fprog *fprog)  	current->seccomp.filter = filter;  	return 0; +free_filter_prog: +	kfree(filter->prog);  free_filter:  	kfree(filter);  free_prog: @@ -328,6 +337,7 @@ void put_seccomp_filter(struct task_struct *tsk)  	while (orig && atomic_dec_and_test(&orig->usage)) {  		struct seccomp_filter *freeme = orig;  		orig = orig->prev; +		sk_filter_free(freeme->prog);  		kfree(freeme);  	}  } diff --git a/kernel/signal.c b/kernel/signal.c index 6ea13c09ae56..a4077e90f19f 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -277,6 +277,7 @@ void task_clear_jobctl_trapping(struct task_struct *task)  {  	if (unlikely(task->jobctl & JOBCTL_TRAPPING)) {  		task->jobctl &= ~JOBCTL_TRAPPING; +		smp_mb();	/* advised by wake_up_bit() */  		wake_up_bit(&task->jobctl, JOBCTL_TRAPPING_BIT);  	}  } @@ -705,11 +706,8 @@ void signal_wake_up_state(struct task_struct *t, unsigned int state)   * Returns 1 if any signals were found.   *   * All callers must be holding the siglock. - * - * This version takes a sigset mask and looks at all signals, - * not just those in the first mask word.   */ -static int rm_from_queue_full(sigset_t *mask, struct sigpending *s) +static int flush_sigqueue_mask(sigset_t *mask, struct sigpending *s)  {  	struct sigqueue *q, *n;  	sigset_t m; @@ -727,29 +725,6 @@ static int rm_from_queue_full(sigset_t *mask, struct sigpending *s)  	}  	return 1;  } -/* - * Remove signals in mask from the pending set and queue. - * Returns 1 if any signals were found. - * - * All callers must be holding the siglock. - */ -static int rm_from_queue(unsigned long mask, struct sigpending *s) -{ -	struct sigqueue *q, *n; - -	if (!sigtestsetmask(&s->signal, mask)) -		return 0; - -	sigdelsetmask(&s->signal, mask); -	list_for_each_entry_safe(q, n, &s->list, list) { -		if (q->info.si_signo < SIGRTMIN && -		    (mask & sigmask(q->info.si_signo))) { -			list_del_init(&q->list); -			__sigqueue_free(q); -		} -	} -	return 1; -}  static inline int is_si_special(const struct siginfo *info)  { @@ -861,6 +836,7 @@ static bool prepare_signal(int sig, struct task_struct *p, bool force)  {  	struct signal_struct *signal = p->signal;  	struct task_struct *t; +	sigset_t flush;  	if (signal->flags & (SIGNAL_GROUP_EXIT | SIGNAL_GROUP_COREDUMP)) {  		if (signal->flags & SIGNAL_GROUP_COREDUMP) @@ -872,26 +848,25 @@ static bool prepare_signal(int sig, struct task_struct *p, bool force)  		/*  		 * This is a stop signal.  Remove SIGCONT from all queues.  		 
*/ -		rm_from_queue(sigmask(SIGCONT), &signal->shared_pending); -		t = p; -		do { -			rm_from_queue(sigmask(SIGCONT), &t->pending); -		} while_each_thread(p, t); +		siginitset(&flush, sigmask(SIGCONT)); +		flush_sigqueue_mask(&flush, &signal->shared_pending); +		for_each_thread(p, t) +			flush_sigqueue_mask(&flush, &t->pending);  	} else if (sig == SIGCONT) {  		unsigned int why;  		/*  		 * Remove all stop signals from all queues, wake all threads.  		 */ -		rm_from_queue(SIG_KERNEL_STOP_MASK, &signal->shared_pending); -		t = p; -		do { +		siginitset(&flush, SIG_KERNEL_STOP_MASK); +		flush_sigqueue_mask(&flush, &signal->shared_pending); +		for_each_thread(p, t) { +			flush_sigqueue_mask(&flush, &t->pending);  			task_clear_jobctl_pending(t, JOBCTL_STOP_PENDING); -			rm_from_queue(SIG_KERNEL_STOP_MASK, &t->pending);  			if (likely(!(t->ptrace & PT_SEIZED)))  				wake_up_state(t, __TASK_STOPPED);  			else  				ptrace_trap_notify(t); -		} while_each_thread(p, t); +		}  		/*  		 * Notify the parent with CLD_CONTINUED if we were stopped. @@ -2854,7 +2829,7 @@ int do_sigtimedwait(const sigset_t *which, siginfo_t *info,  		spin_lock_irq(&tsk->sighand->siglock);  		__set_task_blocked(tsk, &tsk->real_blocked); -		siginitset(&tsk->real_blocked, 0); +		sigemptyset(&tsk->real_blocked);  		sig = dequeue_signal(tsk, &mask, info);  	}  	spin_unlock_irq(&tsk->sighand->siglock); @@ -3091,18 +3066,39 @@ COMPAT_SYSCALL_DEFINE4(rt_tgsigqueueinfo,  }  #endif +/* + * For kthreads only, must not be used if cloned with CLONE_SIGHAND + */ +void kernel_sigaction(int sig, __sighandler_t action) +{ +	spin_lock_irq(¤t->sighand->siglock); +	current->sighand->action[sig - 1].sa.sa_handler = action; +	if (action == SIG_IGN) { +		sigset_t mask; + +		sigemptyset(&mask); +		sigaddset(&mask, sig); + +		flush_sigqueue_mask(&mask, ¤t->signal->shared_pending); +		flush_sigqueue_mask(&mask, ¤t->pending); +		recalc_sigpending(); +	} +	spin_unlock_irq(¤t->sighand->siglock); +} +EXPORT_SYMBOL(kernel_sigaction); +  int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact)  { -	struct task_struct *t = current; +	struct task_struct *p = current, *t;  	struct k_sigaction *k;  	sigset_t mask;  	if (!valid_signal(sig) || sig < 1 || (act && sig_kernel_only(sig)))  		return -EINVAL; -	k = &t->sighand->action[sig-1]; +	k = &p->sighand->action[sig-1]; -	spin_lock_irq(¤t->sighand->siglock); +	spin_lock_irq(&p->sighand->siglock);  	if (oact)  		*oact = *k; @@ -3121,21 +3117,20 @@ int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact)  		 *   (for example, SIGCHLD), shall cause the pending signal to  		 *   be discarded, whether or not it is blocked"  		 */ -		if (sig_handler_ignored(sig_handler(t, sig), sig)) { +		if (sig_handler_ignored(sig_handler(p, sig), sig)) {  			sigemptyset(&mask);  			sigaddset(&mask, sig); -			rm_from_queue_full(&mask, &t->signal->shared_pending); -			do { -				rm_from_queue_full(&mask, &t->pending); -			} while_each_thread(current, t); +			flush_sigqueue_mask(&mask, &p->signal->shared_pending); +			for_each_thread(p, t) +				flush_sigqueue_mask(&mask, &t->pending);  		}  	} -	spin_unlock_irq(¤t->sighand->siglock); +	spin_unlock_irq(&p->sighand->siglock);  	return 0;  } -static int  +static int  do_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, unsigned long sp)  {  	stack_t oss; @@ -3496,7 +3491,7 @@ COMPAT_SYSCALL_DEFINE3(sigaction, int, sig,  }  #endif -#ifdef __ARCH_WANT_SYS_SGETMASK +#ifdef CONFIG_SGETMASK_SYSCALL  /*   * For backwards compatibility.  
Functionality superseded by sigprocmask. @@ -3517,7 +3512,7 @@ SYSCALL_DEFINE1(ssetmask, int, newmask)  	return old;  } -#endif /* __ARCH_WANT_SGETMASK */ +#endif /* CONFIG_SGETMASK_SYSCALL */  #ifdef __ARCH_WANT_SYS_SIGNAL  /* diff --git a/kernel/smp.c b/kernel/smp.c index 06d574e42c72..306f8180b0d5 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -185,14 +185,26 @@ void generic_smp_call_function_single_interrupt(void)  {  	struct llist_node *entry;  	struct call_single_data *csd, *csd_next; +	static bool warned; + +	entry = llist_del_all(&__get_cpu_var(call_single_queue)); +	entry = llist_reverse_order(entry);  	/*  	 * Shouldn't receive this interrupt on a cpu that is not yet online.  	 */ -	WARN_ON_ONCE(!cpu_online(smp_processor_id())); +	if (unlikely(!cpu_online(smp_processor_id()) && !warned)) { +		warned = true; +		WARN(1, "IPI on offline CPU %d\n", smp_processor_id()); -	entry = llist_del_all(&__get_cpu_var(call_single_queue)); -	entry = llist_reverse_order(entry); +		/* +		 * We don't have to use the _safe() variant here +		 * because we are not invoking the IPI handlers yet. +		 */ +		llist_for_each_entry(csd, entry, llist) +			pr_warn("IPI callback %pS sent to offline CPU\n", +				csd->func); +	}  	llist_for_each_entry_safe(csd, csd_next, entry, llist) {  		csd->func(csd->info); diff --git a/kernel/softirq.c b/kernel/softirq.c index b50990a5bea0..5918d227730f 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -223,7 +223,7 @@ static inline bool lockdep_softirq_start(void) { return false; }  static inline void lockdep_softirq_end(bool in_hardirq) { }  #endif -asmlinkage void __do_softirq(void) +asmlinkage __visible void __do_softirq(void)  {  	unsigned long end = jiffies + MAX_SOFTIRQ_TIME;  	unsigned long old_flags = current->flags; @@ -232,7 +232,6 @@ asmlinkage void __do_softirq(void)  	bool in_hardirq;  	__u32 pending;  	int softirq_bit; -	int cpu;  	/*  	 * Mask out PF_MEMALLOC s current task context is borrowed for the @@ -247,7 +246,6 @@ asmlinkage void __do_softirq(void)  	__local_bh_disable_ip(_RET_IP_, SOFTIRQ_OFFSET);  	in_hardirq = lockdep_softirq_start(); -	cpu = smp_processor_id();  restart:  	/* Reset the pending bitmask before enabling irqs */  	set_softirq_pending(0); @@ -276,11 +274,11 @@ restart:  			       prev_count, preempt_count());  			preempt_count_set(prev_count);  		} -		rcu_bh_qs(cpu);  		h++;  		pending >>= softirq_bit;  	} +	rcu_bh_qs(smp_processor_id());  	local_irq_disable();  	pending = local_softirq_pending(); @@ -299,7 +297,7 @@ restart:  	tsk_restore_flags(current, old_flags, PF_MEMALLOC);  } -asmlinkage void do_softirq(void) +asmlinkage __visible void do_softirq(void)  {  	__u32 pending;  	unsigned long flags; @@ -779,3 +777,8 @@ int __init __weak arch_early_irq_init(void)  {  	return 0;  } + +unsigned int __weak arch_dynirq_lower_bound(unsigned int from) +{ +	return from; +} diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 01fbae5b97b7..695f0c6cd169 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -307,6 +307,7 @@ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void *   * @cpu: cpu to stop   * @fn: function to execute   * @arg: argument to @fn + * @work_buf: pointer to cpu_stop_work structure   *   * Similar to stop_one_cpu() but doesn't wait for completion.  
The   * caller is responsible for ensuring @work_buf is currently unused diff --git a/kernel/sys.c b/kernel/sys.c index fba0f29401ea..66a751ebf9d9 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -250,7 +250,7 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who)  			else  				p = current;  			if (p) { -				niceval = 20 - task_nice(p); +				niceval = nice_to_rlimit(task_nice(p));  				if (niceval > retval)  					retval = niceval;  			} @@ -261,7 +261,7 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who)  			else  				pgrp = task_pgrp(current);  			do_each_pid_thread(pgrp, PIDTYPE_PGID, p) { -				niceval = 20 - task_nice(p); +				niceval = nice_to_rlimit(task_nice(p));  				if (niceval > retval)  					retval = niceval;  			} while_each_pid_thread(pgrp, PIDTYPE_PGID, p); @@ -277,7 +277,7 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who)  			do_each_thread(g, p) {  				if (uid_eq(task_uid(p), uid)) { -					niceval = 20 - task_nice(p); +					niceval = nice_to_rlimit(task_nice(p));  					if (niceval > retval)  						retval = niceval;  				} diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index bc8d1b74a6b9..36441b51b5df 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -135,6 +135,8 @@ cond_syscall(sys_setresgid16);  cond_syscall(sys_setresuid16);  cond_syscall(sys_setreuid16);  cond_syscall(sys_setuid16); +cond_syscall(sys_sgetmask); +cond_syscall(sys_ssetmask);  cond_syscall(sys_vm86old);  cond_syscall(sys_vm86);  cond_syscall(sys_ipc); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 74f5b580fe34..ba9ed453c4ed 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -173,6 +173,13 @@ extern int no_unaligned_warning;  #endif  #ifdef CONFIG_PROC_SYSCTL + +#define SYSCTL_WRITES_LEGACY	-1 +#define SYSCTL_WRITES_WARN	 0 +#define SYSCTL_WRITES_STRICT	 1 + +static int sysctl_writes_strict = SYSCTL_WRITES_WARN; +  static int proc_do_cad_pid(struct ctl_table *table, int write,  		  void __user *buffer, size_t *lenp, loff_t *ppos);  static int proc_taint(struct ctl_table *table, int write, @@ -195,7 +202,7 @@ static int proc_dostring_coredump(struct ctl_table *table, int write,  /* Note: sysrq code uses it's own private copy */  static int __sysrq_enabled = CONFIG_MAGIC_SYSRQ_DEFAULT_ENABLE; -static int sysrq_sysctl_handler(ctl_table *table, int write, +static int sysrq_sysctl_handler(struct ctl_table *table, int write,  				void __user *buffer, size_t *lenp,  				loff_t *ppos)  { @@ -495,6 +502,15 @@ static struct ctl_table kern_table[] = {  		.mode		= 0644,  		.proc_handler	= proc_taint,  	}, +	{ +		.procname	= "sysctl_writes_strict", +		.data		= &sysctl_writes_strict, +		.maxlen		= sizeof(int), +		.mode		= 0644, +		.proc_handler	= proc_dointvec_minmax, +		.extra1		= &neg_one, +		.extra2		= &one, +	},  #endif  #ifdef CONFIG_LATENCYTOP  	{ @@ -643,7 +659,7 @@ static struct ctl_table kern_table[] = {  		.extra2		= &one,  	},  #endif - +#ifdef CONFIG_UEVENT_HELPER  	{  		.procname	= "hotplug",  		.data		= &uevent_helper, @@ -651,7 +667,7 @@ static struct ctl_table kern_table[] = {  		.mode		= 0644,  		.proc_handler	= proc_dostring,  	}, - +#endif  #ifdef CONFIG_CHR_DEV_SG  	{  		.procname	= "sg-big-buff", @@ -1418,8 +1434,13 @@ static struct ctl_table vm_table[] = {     (defined(CONFIG_SUPERH) && defined(CONFIG_VSYSCALL))  	{  		.procname	= "vdso_enabled", +#ifdef CONFIG_X86_32 +		.data		= &vdso32_enabled, +		.maxlen		= sizeof(vdso32_enabled), +#else  		.data		= &vdso_enabled,  		.maxlen		= sizeof(vdso_enabled), +#endif  		.mode		= 0644,  		.proc_handler	= proc_dointvec,  		.extra1		= &zero, 
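The sysctl hunks above introduce a sysctl_writes_strict knob with three settings: -1 keeps the legacy behaviour, 0 warns about writes made at a non-zero file position, and 1 ignores such writes (the string and integer handlers later in the diff act on the same constants). A small userspace model of that decision follows, under the assumption that the kernel's warn-once behaviour can be represented by a local flag; it is a sketch of the policy, not the proc handler itself.

	#include <stdio.h>
	#include <stdbool.h>

	#define WRITES_LEGACY	-1
	#define WRITES_WARN	 0
	#define WRITES_STRICT	 1

	/* Returns true if the write is processed, false if it is ignored. */
	static bool sysctl_write_allowed(long pos, int mode, bool *warned)
	{
		if (pos == 0)
			return true;		/* writes starting at offset 0 always proceed */

		switch (mode) {
		case WRITES_STRICT:
			return false;		/* refuse writes at a non-zero offset */
		case WRITES_WARN:
			if (!*warned) {
				fprintf(stderr, "write at non-zero file position\n");
				*warned = true;
			}
			return true;		/* warned once, but still processed */
		default:			/* WRITES_LEGACY: old behaviour */
			return true;
		}
	}

	int main(void)
	{
		bool warned = false;

		printf("strict, pos=4: %d\n", sysctl_write_allowed(4, WRITES_STRICT, &warned));
		printf("warn,   pos=4: %d\n", sysctl_write_allowed(4, WRITES_WARN, &warned));
		return 0;
	}
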
@@ -1698,8 +1719,8 @@ int __init sysctl_init(void)  #ifdef CONFIG_PROC_SYSCTL -static int _proc_do_string(void* data, int maxlen, int write, -			   void __user *buffer, +static int _proc_do_string(char *data, int maxlen, int write, +			   char __user *buffer,  			   size_t *lenp, loff_t *ppos)  {  	size_t len; @@ -1712,21 +1733,30 @@ static int _proc_do_string(void* data, int maxlen, int write,  	}  	if (write) { -		len = 0; +		if (sysctl_writes_strict == SYSCTL_WRITES_STRICT) { +			/* Only continue writes not past the end of buffer. */ +			len = strlen(data); +			if (len > maxlen - 1) +				len = maxlen - 1; + +			if (*ppos > len) +				return 0; +			len = *ppos; +		} else { +			/* Start writing from beginning of buffer. */ +			len = 0; +		} + +		*ppos += *lenp;  		p = buffer; -		while (len < *lenp) { +		while ((p - buffer) < *lenp && len < maxlen - 1) {  			if (get_user(c, p++))  				return -EFAULT;  			if (c == 0 || c == '\n')  				break; -			len++; +			data[len++] = c;  		} -		if (len >= maxlen) -			len = maxlen-1; -		if(copy_from_user(data, buffer, len)) -			return -EFAULT; -		((char *) data)[len] = 0; -		*ppos += *lenp; +		data[len] = 0;  	} else {  		len = strlen(data);  		if (len > maxlen) @@ -1743,10 +1773,10 @@ static int _proc_do_string(void* data, int maxlen, int write,  		if (len > *lenp)  			len = *lenp;  		if (len) -			if(copy_to_user(buffer, data, len)) +			if (copy_to_user(buffer, data, len))  				return -EFAULT;  		if (len < *lenp) { -			if(put_user('\n', ((char __user *) buffer) + len)) +			if (put_user('\n', buffer + len))  				return -EFAULT;  			len++;  		} @@ -1756,6 +1786,14 @@ static int _proc_do_string(void* data, int maxlen, int write,  	return 0;  } +static void warn_sysctl_write(struct ctl_table *table) +{ +	pr_warn_once("%s wrote to %s when file position was not 0!\n" +		"This will not be supported in the future. To silence this\n" +		"warning, set kernel.sysctl_writes_strict = -1\n", +		current->comm, table->procname); +} +  /**   * proc_dostring - read a string sysctl   * @table: the sysctl table @@ -1776,8 +1814,11 @@ static int _proc_do_string(void* data, int maxlen, int write,  int proc_dostring(struct ctl_table *table, int write,  		  void __user *buffer, size_t *lenp, loff_t *ppos)  { -	return _proc_do_string(table->data, table->maxlen, write, -			       buffer, lenp, ppos); +	if (write && *ppos && sysctl_writes_strict == SYSCTL_WRITES_WARN) +		warn_sysctl_write(table); + +	return _proc_do_string((char *)(table->data), table->maxlen, write, +			       (char __user *)buffer, lenp, ppos);  }  static size_t proc_skip_spaces(char **buf) @@ -1951,6 +1992,18 @@ static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table,  		conv = do_proc_dointvec_conv;  	if (write) { +		if (*ppos) { +			switch (sysctl_writes_strict) { +			case SYSCTL_WRITES_STRICT: +				goto out; +			case SYSCTL_WRITES_WARN: +				warn_sysctl_write(table); +				break; +			default: +				break; +			} +		} +  		if (left > PAGE_SIZE - 1)  			left = PAGE_SIZE - 1;  		page = __get_free_page(GFP_TEMPORARY); @@ -2008,6 +2061,7 @@ free:  			return err ? 
: -EINVAL;  	}  	*lenp -= left; +out:  	*ppos += *lenp;  	return err;  } @@ -2200,6 +2254,18 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int  	left = *lenp;  	if (write) { +		if (*ppos) { +			switch (sysctl_writes_strict) { +			case SYSCTL_WRITES_STRICT: +				goto out; +			case SYSCTL_WRITES_WARN: +				warn_sysctl_write(table); +				break; +			default: +				break; +			} +		} +  		if (left > PAGE_SIZE - 1)  			left = PAGE_SIZE - 1;  		page = __get_free_page(GFP_TEMPORARY); @@ -2255,6 +2321,7 @@ free:  			return err ? : -EINVAL;  	}  	*lenp -= left; +out:  	*ppos += *lenp;  	return err;  } @@ -2501,11 +2568,11 @@ int proc_do_large_bitmap(struct ctl_table *table, int write,  	bool first = 1;  	size_t left = *lenp;  	unsigned long bitmap_len = table->maxlen; -	unsigned long *bitmap = (unsigned long *) table->data; +	unsigned long *bitmap = *(unsigned long **) table->data;  	unsigned long *tmp_bitmap = NULL;  	char tr_a[] = { '-', ',', '\n' }, tr_b[] = { ',', '\n', 0 }, c; -	if (!bitmap_len || !left || (*ppos && !write)) { +	if (!bitmap || !bitmap_len || !left || (*ppos && !write)) {  		*lenp = 0;  		return 0;  	} diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 419a52cecd20..33db43a39515 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -165,21 +165,21 @@ static inline void pps_set_freq(s64 freq)  static inline int is_error_status(int status)  { -	return (time_status & (STA_UNSYNC|STA_CLOCKERR)) +	return (status & (STA_UNSYNC|STA_CLOCKERR))  		/* PPS signal lost when either PPS time or  		 * PPS frequency synchronization requested  		 */ -		|| ((time_status & (STA_PPSFREQ|STA_PPSTIME)) -			&& !(time_status & STA_PPSSIGNAL)) +		|| ((status & (STA_PPSFREQ|STA_PPSTIME)) +			&& !(status & STA_PPSSIGNAL))  		/* PPS jitter exceeded when  		 * PPS time synchronization requested */ -		|| ((time_status & (STA_PPSTIME|STA_PPSJITTER)) +		|| ((status & (STA_PPSTIME|STA_PPSJITTER))  			== (STA_PPSTIME|STA_PPSJITTER))  		/* PPS wander exceeded or calibration error when  		 * PPS frequency synchronization requested  		 */ -		|| ((time_status & STA_PPSFREQ) -			&& (time_status & (STA_PPSWANDER|STA_PPSERROR))); +		|| ((status & STA_PPSFREQ) +			&& (status & (STA_PPSWANDER|STA_PPSERROR)));  }  static inline void pps_fill_timex(struct timex *txc) @@ -786,8 +786,9 @@ static long hardpps_update_freq(struct pps_normtime freq_norm)  		time_status |= STA_PPSERROR;  		pps_errcnt++;  		pps_dec_freq_interval(); -		pr_err("hardpps: PPSERROR: interval too long - %ld s\n", -				freq_norm.sec); +		printk_deferred(KERN_ERR +			"hardpps: PPSERROR: interval too long - %ld s\n", +			freq_norm.sec);  		return 0;  	} @@ -800,7 +801,8 @@ static long hardpps_update_freq(struct pps_normtime freq_norm)  	delta = shift_right(ftemp - pps_freq, NTP_SCALE_SHIFT);  	pps_freq = ftemp;  	if (delta > PPS_MAXWANDER || delta < -PPS_MAXWANDER) { -		pr_warning("hardpps: PPSWANDER: change=%ld\n", delta); +		printk_deferred(KERN_WARNING +				"hardpps: PPSWANDER: change=%ld\n", delta);  		time_status |= STA_PPSWANDER;  		pps_stbcnt++;  		pps_dec_freq_interval(); @@ -844,8 +846,9 @@ static void hardpps_update_phase(long error)  	 * the time offset is updated.  	 
*/  	if (jitter > (pps_jitter << PPS_POPCORN)) { -		pr_warning("hardpps: PPSJITTER: jitter=%ld, limit=%ld\n", -		       jitter, (pps_jitter << PPS_POPCORN)); +		printk_deferred(KERN_WARNING +				"hardpps: PPSJITTER: jitter=%ld, limit=%ld\n", +				jitter, (pps_jitter << PPS_POPCORN));  		time_status |= STA_PPSJITTER;  		pps_jitcnt++;  	} else if (time_status & STA_PPSTIME) { @@ -902,7 +905,7 @@ void __hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts)  		time_status |= STA_PPSJITTER;  		/* restart the frequency calibration interval */  		pps_fbase = *raw_ts; -		pr_err("hardpps: PPSJITTER: bad pulse\n"); +		printk_deferred(KERN_ERR "hardpps: PPSJITTER: bad pulse\n");  		return;  	} @@ -923,7 +926,10 @@ void __hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts)  static int __init ntp_tick_adj_setup(char *str)  { -	ntp_tick_adj = simple_strtol(str, NULL, 0); +	int rc = kstrtol(str, 0, (long *)&ntp_tick_adj); + +	if (rc) +		return rc;  	ntp_tick_adj <<= NTP_SCALE_SHIFT;  	return 1; diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c index 4d23dc4d8139..445106d2c729 100644 --- a/kernel/time/sched_clock.c +++ b/kernel/time/sched_clock.c @@ -49,13 +49,6 @@ static u64 notrace jiffy_sched_clock_read(void)  	return (u64)(jiffies - INITIAL_JIFFIES);  } -static u32 __read_mostly (*read_sched_clock_32)(void); - -static u64 notrace read_sched_clock_32_wrapper(void) -{ -	return read_sched_clock_32(); -} -  static u64 __read_mostly (*read_sched_clock)(void) = jiffy_sched_clock_read;  static inline u64 notrace cyc_to_ns(u64 cyc, u32 mult, u32 shift) @@ -176,12 +169,6 @@ void __init sched_clock_register(u64 (*read)(void), int bits,  	pr_debug("Registered %pF as sched_clock source\n", read);  } -void __init setup_sched_clock(u32 (*read)(void), int bits, unsigned long rate) -{ -	read_sched_clock_32 = read; -	sched_clock_register(read_sched_clock_32_wrapper, bits, rate); -} -  void __init sched_clock_postinit(void)  {  	/* diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index 015661279b68..0a0608edeb26 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -276,7 +276,7 @@ static bool tick_check_preferred(struct clock_event_device *curdev,  bool tick_check_replacement(struct clock_event_device *curdev,  			    struct clock_event_device *newdev)  { -	if (tick_check_percpu(curdev, newdev, smp_processor_id())) +	if (!tick_check_percpu(curdev, newdev, smp_processor_id()))  		return false;  	return tick_check_preferred(curdev, newdev); diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 9f8af69c67ec..6558b7ac112d 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -84,6 +84,9 @@ static void tick_do_update_jiffies64(ktime_t now)  		/* Keep the tick_next_period variable up to date */  		tick_next_period = ktime_add(last_jiffies_update, tick_period); +	} else { +		write_sequnlock(&jiffies_lock); +		return;  	}  	write_sequnlock(&jiffies_lock);  	update_wall_time(); @@ -967,7 +970,7 @@ static void tick_nohz_switch_to_nohz(void)  	struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);  	ktime_t next; -	if (!tick_nohz_active) +	if (!tick_nohz_enabled)  		return;  	local_irq_disable(); diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index f7df8ea21707..32d8d6aaedb8 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -852,8 +852,9 @@ static void __timekeeping_inject_sleeptime(struct timekeeper *tk,  							struct timespec *delta)  {  	if 
(!timespec_valid_strict(delta)) { -		printk(KERN_WARNING "__timekeeping_inject_sleeptime: Invalid " -					"sleep delta value!\n"); +		printk_deferred(KERN_WARNING +				"__timekeeping_inject_sleeptime: Invalid " +				"sleep delta value!\n");  		return;  	}  	tk_xtime_add(tk, delta); @@ -1157,7 +1158,7 @@ static void timekeeping_adjust(struct timekeeper *tk, s64 offset)  	if (unlikely(tk->clock->maxadj &&  		(tk->mult + adj > tk->clock->mult + tk->clock->maxadj))) { -		printk_once(KERN_WARNING +		printk_deferred_once(KERN_WARNING  			"Adjusting %s more than 11%% (%ld vs %ld)\n",  			tk->clock->name, (long)tk->mult + adj,  			(long)tk->clock->mult + tk->clock->maxadj); diff --git a/kernel/timer.c b/kernel/timer.c index 87bd529879c2..3bb01a323b2a 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -838,7 +838,7 @@ unsigned long apply_slack(struct timer_list *timer, unsigned long expires)  	bit = find_last_bit(&mask, BITS_PER_LONG); -	mask = (1 << bit) - 1; +	mask = (1UL << bit) - 1;  	expires_limit = expires_limit & ~(mask); diff --git a/kernel/torture.c b/kernel/torture.c index acc9afc2f26e..40bb511cca48 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -335,13 +335,8 @@ static void torture_shuffle_tasks(void)  	shuffle_idle_cpu = cpumask_next(shuffle_idle_cpu, shuffle_tmp_mask);  	if (shuffle_idle_cpu >= nr_cpu_ids)  		shuffle_idle_cpu = -1; -	if (shuffle_idle_cpu != -1) { +	else  		cpumask_clear_cpu(shuffle_idle_cpu, shuffle_tmp_mask); -		if (cpumask_empty(shuffle_tmp_mask)) { -			put_online_cpus(); -			return; -		} -	}  	mutex_lock(&shuffle_task_mutex);  	list_for_each_entry(stp, &shuffle_task_list, st_l) @@ -533,7 +528,11 @@ void stutter_wait(const char *title)  	while (ACCESS_ONCE(stutter_pause_test) ||  	       (torture_runnable && !ACCESS_ONCE(*torture_runnable))) {  		if (stutter_pause_test) -			schedule_timeout_interruptible(1); +			if (ACCESS_ONCE(stutter_pause_test) == 1) +				schedule_timeout_interruptible(1); +			else +				while (ACCESS_ONCE(stutter_pause_test)) +					cond_resched();  		else  			schedule_timeout_interruptible(round_jiffies_relative(HZ));  		torture_shutdown_absorb(title); @@ -550,7 +549,11 @@ static int torture_stutter(void *arg)  	VERBOSE_TOROUT_STRING("torture_stutter task started");  	do {  		if (!torture_must_stop()) { -			schedule_timeout_interruptible(stutter); +			if (stutter > 1) { +				schedule_timeout_interruptible(stutter - 1); +				ACCESS_ONCE(stutter_pause_test) = 2; +			} +			schedule_timeout_interruptible(1);  			ACCESS_ONCE(stutter_pause_test) = 1;  		}  		if (!torture_must_stop()) @@ -596,21 +599,27 @@ static void torture_stutter_cleanup(void)   * The runnable parameter points to a flag that controls whether or not   * the test is currently runnable.  If there is no such flag, pass in NULL.   */ -void __init torture_init_begin(char *ttype, bool v, int *runnable) +bool torture_init_begin(char *ttype, bool v, int *runnable)  {  	mutex_lock(&fullstop_mutex); +	if (torture_type != NULL) { +		pr_alert("torture_init_begin: refusing %s init: %s running", +			 ttype, torture_type); +		mutex_unlock(&fullstop_mutex); +		return false; +	}  	torture_type = ttype;  	verbose = v;  	torture_runnable = runnable;  	fullstop = FULLSTOP_DONTSTOP; - +	return true;  }  EXPORT_SYMBOL_GPL(torture_init_begin);  /*   * Tell the torture module that initialization is complete.   
*/ -void __init torture_init_end(void) +void torture_init_end(void)  {  	mutex_unlock(&fullstop_mutex);  	register_reboot_notifier(&torture_shutdown_nb); @@ -642,6 +651,9 @@ bool torture_cleanup(void)  	torture_shuffle_cleanup();  	torture_stutter_cleanup();  	torture_onoff_cleanup(); +	mutex_lock(&fullstop_mutex); +	torture_type = NULL; +	mutex_unlock(&fullstop_mutex);  	return false;  }  EXPORT_SYMBOL_GPL(torture_cleanup); @@ -674,8 +686,10 @@ EXPORT_SYMBOL_GPL(torture_must_stop_irq);   */  void torture_kthread_stopping(char *title)  { -	if (verbose) -		VERBOSE_TOROUT_STRING(title); +	char buf[128]; + +	snprintf(buf, sizeof(buf), "Stopping %s", title); +	VERBOSE_TOROUT_STRING(buf);  	while (!kthread_should_stop()) {  		torture_shutdown_absorb(title);  		schedule_timeout_uninterruptible(1); diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 8639819f6cef..d4409356f40d 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -535,6 +535,36 @@ config MMIOTRACE_TEST  	  Say N, unless you absolutely know what you are doing. +config TRACEPOINT_BENCHMARK +        bool "Add tracepoint that benchmarks tracepoints" +	help +	 This option creates the tracepoint "benchmark:benchmark_event". +	 When the tracepoint is enabled, it kicks off a kernel thread that +	 goes into an infinite loop (calling cond_sched() to let other tasks +	 run), and calls the tracepoint. Each iteration will record the time +	 it took to write to the tracepoint and the next iteration that +	 data will be passed to the tracepoint itself. That is, the tracepoint +	 will report the time it took to do the previous tracepoint. +	 The string written to the tracepoint is a static string of 128 bytes +	 to keep the time the same. The initial string is simply a write of +	 "START". The second string records the cold cache time of the first +	 write which is not added to the rest of the calculations. + +	 As it is a tight loop, it benchmarks as hot cache. That's fine because +	 we care most about hot paths that are probably in cache already. 
+ +	 An example of the output: + +	      START +	      first=3672 [COLD CACHED] +	      last=632 first=3672 max=632 min=632 avg=316 std=446 std^2=199712 +	      last=278 first=3672 max=632 min=278 avg=303 std=316 std^2=100337 +	      last=277 first=3672 max=632 min=277 avg=296 std=258 std^2=67064 +	      last=273 first=3672 max=632 min=273 avg=292 std=224 std^2=50411 +	      last=273 first=3672 max=632 min=273 avg=288 std=200 std^2=40389 +	      last=281 first=3672 max=632 min=273 avg=287 std=183 std^2=33666 + +  config RING_BUFFER_BENCHMARK  	tristate "Ring buffer benchmark stress tester"  	depends on RING_BUFFER diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 1378e84fbe39..2611613f14f1 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -17,6 +17,7 @@ ifdef CONFIG_TRACING_BRANCHES  KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING  endif +CFLAGS_trace_benchmark.o := -I$(src)  CFLAGS_trace_events_filter.o := -I$(src)  obj-$(CONFIG_TRACE_CLOCK) += trace_clock.o @@ -62,4 +63,6 @@ endif  obj-$(CONFIG_PROBE_EVENTS) += trace_probe.o  obj-$(CONFIG_UPROBE_EVENT) += trace_uprobe.o +obj-$(CONFIG_TRACEPOINT_BENCHMARK) += trace_benchmark.o +  libftrace-y := ftrace.o diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 1fd4b9479210..5b372e3ed675 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -62,7 +62,7 @@  #define FTRACE_HASH_DEFAULT_BITS 10  #define FTRACE_HASH_MAX_BITS 12 -#define FL_GLOBAL_CONTROL_MASK (FTRACE_OPS_FL_GLOBAL | FTRACE_OPS_FL_CONTROL) +#define FL_GLOBAL_CONTROL_MASK (FTRACE_OPS_FL_CONTROL)  #ifdef CONFIG_DYNAMIC_FTRACE  #define INIT_REGEX_LOCK(opsname)	\ @@ -103,7 +103,6 @@ static int ftrace_disabled __read_mostly;  static DEFINE_MUTEX(ftrace_lock); -static struct ftrace_ops *ftrace_global_list __read_mostly = &ftrace_list_end;  static struct ftrace_ops *ftrace_control_list __read_mostly = &ftrace_list_end;  static struct ftrace_ops *ftrace_ops_list __read_mostly = &ftrace_list_end;  ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub; @@ -171,23 +170,6 @@ int ftrace_nr_registered_ops(void)  	return cnt;  } -static void -ftrace_global_list_func(unsigned long ip, unsigned long parent_ip, -			struct ftrace_ops *op, struct pt_regs *regs) -{ -	int bit; - -	bit = trace_test_and_set_recursion(TRACE_GLOBAL_START, TRACE_GLOBAL_MAX); -	if (bit < 0) -		return; - -	do_for_each_ftrace_op(op, ftrace_global_list) { -		op->func(ip, parent_ip, op, regs); -	} while_for_each_ftrace_op(op); - -	trace_clear_recursion(bit); -} -  static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip,  			    struct ftrace_ops *op, struct pt_regs *regs)  { @@ -237,43 +219,6 @@ static int control_ops_alloc(struct ftrace_ops *ops)  	return 0;  } -static void update_global_ops(void) -{ -	ftrace_func_t func = ftrace_global_list_func; -	void *private = NULL; - -	/* The list has its own recursion protection. */ -	global_ops.flags |= FTRACE_OPS_FL_RECURSION_SAFE; - -	/* -	 * If there's only one function registered, then call that -	 * function directly. Otherwise, we need to iterate over the -	 * registered callers. -	 */ -	if (ftrace_global_list == &ftrace_list_end || -	    ftrace_global_list->next == &ftrace_list_end) { -		func = ftrace_global_list->func; -		private = ftrace_global_list->private; -		/* -		 * As we are calling the function directly. -		 * If it does not have recursion protection, -		 * the function_trace_op needs to be updated -		 * accordingly. 
-		 */ -		if (!(ftrace_global_list->flags & FTRACE_OPS_FL_RECURSION_SAFE)) -			global_ops.flags &= ~FTRACE_OPS_FL_RECURSION_SAFE; -	} - -	/* If we filter on pids, update to use the pid function */ -	if (!list_empty(&ftrace_pids)) { -		set_ftrace_pid_function(func); -		func = ftrace_pid_func; -	} - -	global_ops.func = func; -	global_ops.private = private; -} -  static void ftrace_sync(struct work_struct *work)  {  	/* @@ -301,8 +246,6 @@ static void update_ftrace_function(void)  {  	ftrace_func_t func; -	update_global_ops(); -  	/*  	 * If we are at the end of the list and this ops is  	 * recursion safe and not dynamic and the arch supports passing ops, @@ -314,10 +257,7 @@ static void update_ftrace_function(void)  	     (ftrace_ops_list->flags & FTRACE_OPS_FL_RECURSION_SAFE) &&  	     !FTRACE_FORCE_LIST_FUNC)) {  		/* Set the ftrace_ops that the arch callback uses */ -		if (ftrace_ops_list == &global_ops) -			set_function_trace_op = ftrace_global_list; -		else -			set_function_trace_op = ftrace_ops_list; +		set_function_trace_op = ftrace_ops_list;  		func = ftrace_ops_list->func;  	} else {  		/* Just use the default ftrace_ops */ @@ -373,6 +313,11 @@ static void update_ftrace_function(void)  	ftrace_trace_function = func;  } +int using_ftrace_ops_list_func(void) +{ +	return ftrace_trace_function == ftrace_ops_list_func; +} +  static void add_ftrace_ops(struct ftrace_ops **list, struct ftrace_ops *ops)  {  	ops->next = *list; @@ -434,16 +379,9 @@ static int __register_ftrace_function(struct ftrace_ops *ops)  	if (ops->flags & FTRACE_OPS_FL_DELETED)  		return -EINVAL; -	if (FTRACE_WARN_ON(ops == &global_ops)) -		return -EINVAL; -  	if (WARN_ON(ops->flags & FTRACE_OPS_FL_ENABLED))  		return -EBUSY; -	/* We don't support both control and global flags set. 
*/ -	if ((ops->flags & FL_GLOBAL_CONTROL_MASK) == FL_GLOBAL_CONTROL_MASK) -		return -EINVAL; -  #ifndef CONFIG_DYNAMIC_FTRACE_WITH_REGS  	/*  	 * If the ftrace_ops specifies SAVE_REGS, then it only can be used @@ -461,10 +399,7 @@ static int __register_ftrace_function(struct ftrace_ops *ops)  	if (!core_kernel_data((unsigned long)ops))  		ops->flags |= FTRACE_OPS_FL_DYNAMIC; -	if (ops->flags & FTRACE_OPS_FL_GLOBAL) { -		add_ftrace_list_ops(&ftrace_global_list, &global_ops, ops); -		ops->flags |= FTRACE_OPS_FL_ENABLED; -	} else if (ops->flags & FTRACE_OPS_FL_CONTROL) { +	if (ops->flags & FTRACE_OPS_FL_CONTROL) {  		if (control_ops_alloc(ops))  			return -ENOMEM;  		add_ftrace_list_ops(&ftrace_control_list, &control_ops, ops); @@ -484,15 +419,7 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops)  	if (WARN_ON(!(ops->flags & FTRACE_OPS_FL_ENABLED)))  		return -EBUSY; -	if (FTRACE_WARN_ON(ops == &global_ops)) -		return -EINVAL; - -	if (ops->flags & FTRACE_OPS_FL_GLOBAL) { -		ret = remove_ftrace_list_ops(&ftrace_global_list, -					     &global_ops, ops); -		if (!ret) -			ops->flags &= ~FTRACE_OPS_FL_ENABLED; -	} else if (ops->flags & FTRACE_OPS_FL_CONTROL) { +	if (ops->flags & FTRACE_OPS_FL_CONTROL) {  		ret = remove_ftrace_list_ops(&ftrace_control_list,  					     &control_ops, ops);  	} else @@ -895,7 +822,7 @@ function_profile_call(unsigned long ip, unsigned long parent_ip,  	local_irq_save(flags); -	stat = &__get_cpu_var(ftrace_profile_stats); +	stat = this_cpu_ptr(&ftrace_profile_stats);  	if (!stat->hash || !ftrace_profile_enabled)  		goto out; @@ -926,7 +853,7 @@ static void profile_graph_return(struct ftrace_graph_ret *trace)  	unsigned long flags;  	local_irq_save(flags); -	stat = &__get_cpu_var(ftrace_profile_stats); +	stat = this_cpu_ptr(&ftrace_profile_stats);  	if (!stat->hash || !ftrace_profile_enabled)  		goto out; @@ -1178,7 +1105,7 @@ struct ftrace_page {  static struct ftrace_page	*ftrace_pages_start;  static struct ftrace_page	*ftrace_pages; -static bool ftrace_hash_empty(struct ftrace_hash *hash) +static bool __always_inline ftrace_hash_empty(struct ftrace_hash *hash)  {  	return !hash || !hash->count;  } @@ -1625,7 +1552,14 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops,  			in_other_hash = !!ftrace_lookup_ip(other_hash, rec->ip);  			/* +			 * If filter_hash is set, we want to match all functions +			 * that are in the hash but not in the other hash.  			 * +			 * If filter_hash is not set, then we are decrementing. +			 * That means we match anything that is in the hash +			 * and also in the other_hash. That is, we need to turn +			 * off functions in the other hash because they are disabled +			 * by this hash.  			 */  			if (filter_hash && in_hash && !in_other_hash)  				match = 1; @@ -1767,19 +1701,15 @@ static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update)  		/*  		 * If this record is being updated from a nop, then  		 *   return UPDATE_MAKE_CALL. -		 * Otherwise, if the EN flag is set, then return -		 *   UPDATE_MODIFY_CALL_REGS to tell the caller to convert -		 *   from the non-save regs, to a save regs function.  		 * Otherwise,  		 *   return UPDATE_MODIFY_CALL to tell the caller to convert -		 *   from the save regs, to a non-save regs function. +		 *   from the save regs, to a non-save regs function or +		 *   vice versa.  		 
*/  		if (flag & FTRACE_FL_ENABLED)  			return FTRACE_UPDATE_MAKE_CALL; -		else if (rec->flags & FTRACE_FL_REGS_EN) -			return FTRACE_UPDATE_MODIFY_CALL_REGS; -		else -			return FTRACE_UPDATE_MODIFY_CALL; + +		return FTRACE_UPDATE_MODIFY_CALL;  	}  	if (update) { @@ -1821,6 +1751,42 @@ int ftrace_test_record(struct dyn_ftrace *rec, int enable)  	return ftrace_check_record(rec, enable, 0);  } +/** + * ftrace_get_addr_new - Get the call address to set to + * @rec:  The ftrace record descriptor + * + * If the record has the FTRACE_FL_REGS set, that means that it + * wants to convert to a callback that saves all regs. If FTRACE_FL_REGS + * is not not set, then it wants to convert to the normal callback. + * + * Returns the address of the trampoline to set to + */ +unsigned long ftrace_get_addr_new(struct dyn_ftrace *rec) +{ +	if (rec->flags & FTRACE_FL_REGS) +		return (unsigned long)FTRACE_REGS_ADDR; +	else +		return (unsigned long)FTRACE_ADDR; +} + +/** + * ftrace_get_addr_curr - Get the call address that is already there + * @rec:  The ftrace record descriptor + * + * The FTRACE_FL_REGS_EN is set when the record already points to + * a function that saves all the regs. Basically the '_EN' version + * represents the current state of the function. + * + * Returns the address of the trampoline that is currently being called + */ +unsigned long ftrace_get_addr_curr(struct dyn_ftrace *rec) +{ +	if (rec->flags & FTRACE_FL_REGS_EN) +		return (unsigned long)FTRACE_REGS_ADDR; +	else +		return (unsigned long)FTRACE_ADDR; +} +  static int  __ftrace_replace_code(struct dyn_ftrace *rec, int enable)  { @@ -1828,12 +1794,12 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable)  	unsigned long ftrace_addr;  	int ret; -	ret = ftrace_update_record(rec, enable); +	ftrace_addr = ftrace_get_addr_new(rec); -	if (rec->flags & FTRACE_FL_REGS) -		ftrace_addr = (unsigned long)FTRACE_REGS_ADDR; -	else -		ftrace_addr = (unsigned long)FTRACE_ADDR; +	/* This needs to be done before we call ftrace_update_record */ +	ftrace_old_addr = ftrace_get_addr_curr(rec); + +	ret = ftrace_update_record(rec, enable);  	switch (ret) {  	case FTRACE_UPDATE_IGNORE: @@ -1845,13 +1811,7 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable)  	case FTRACE_UPDATE_MAKE_NOP:  		return ftrace_make_nop(NULL, rec, ftrace_addr); -	case FTRACE_UPDATE_MODIFY_CALL_REGS:  	case FTRACE_UPDATE_MODIFY_CALL: -		if (rec->flags & FTRACE_FL_REGS) -			ftrace_old_addr = (unsigned long)FTRACE_ADDR; -		else -			ftrace_old_addr = (unsigned long)FTRACE_REGS_ADDR; -  		return ftrace_modify_call(rec, ftrace_old_addr, ftrace_addr);  	} @@ -2115,7 +2075,6 @@ static void ftrace_startup_enable(int command)  static int ftrace_startup(struct ftrace_ops *ops, int command)  { -	bool hash_enable = true;  	int ret;  	if (unlikely(ftrace_disabled)) @@ -2128,18 +2087,9 @@ static int ftrace_startup(struct ftrace_ops *ops, int command)  	ftrace_start_up++;  	command |= FTRACE_UPDATE_CALLS; -	/* ops marked global share the filter hashes */ -	if (ops->flags & FTRACE_OPS_FL_GLOBAL) { -		ops = &global_ops; -		/* Don't update hash if global is already set */ -		if (global_start_up) -			hash_enable = false; -		global_start_up++; -	} -  	ops->flags |= FTRACE_OPS_FL_ENABLED; -	if (hash_enable) -		ftrace_hash_rec_enable(ops, 1); + +	ftrace_hash_rec_enable(ops, 1);  	ftrace_startup_enable(command); @@ -2148,7 +2098,6 @@ static int ftrace_startup(struct ftrace_ops *ops, int command)  static int ftrace_shutdown(struct ftrace_ops *ops, int command)  { -	bool hash_disable = true;  
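As background for the FTRACE_FL_REGS handling above: the flag is set on records matched by an ftrace_ops that requested saved registers, which is why ftrace_get_addr_new() picks FTRACE_REGS_ADDR for them. A hedged sketch of such an ops (illustrative, not code from this patch; it needs an architecture with DYNAMIC_FTRACE_WITH_REGS, as the check in __register_ftrace_function() enforces):

	/*
	 * Illustration only, not part of this patch: an ftrace_ops asking for
	 * saved registers.  Records it matches get FTRACE_FL_REGS set.
	 */
	#include <linux/ftrace.h>
	#include <linux/init.h>
	#include <linux/module.h>
	#include <linux/ptrace.h>

	static void notrace my_regs_callback(unsigned long ip, unsigned long parent_ip,
					     struct ftrace_ops *op, struct pt_regs *regs)
	{
		/* regs is valid here because SAVE_REGS was requested */
	}

	static struct ftrace_ops my_regs_ops = {
		.func	= my_regs_callback,
		.flags	= FTRACE_OPS_FL_SAVE_REGS,
	};

	static int __init regs_demo_init(void)
	{
		return register_ftrace_function(&my_regs_ops);
	}
	module_init(regs_demo_init);
	MODULE_LICENSE("GPL");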
	int ret;  	if (unlikely(ftrace_disabled)) @@ -2166,21 +2115,9 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command)  	 */  	WARN_ON_ONCE(ftrace_start_up < 0); -	if (ops->flags & FTRACE_OPS_FL_GLOBAL) { -		ops = &global_ops; -		global_start_up--; -		WARN_ON_ONCE(global_start_up < 0); -		/* Don't update hash if global still has users */ -		if (global_start_up) { -			WARN_ON_ONCE(!ftrace_start_up); -			hash_disable = false; -		} -	} +	ftrace_hash_rec_disable(ops, 1); -	if (hash_disable) -		ftrace_hash_rec_disable(ops, 1); - -	if (ops != &global_ops || !global_start_up) +	if (!global_start_up)  		ops->flags &= ~FTRACE_OPS_FL_ENABLED;  	command |= FTRACE_UPDATE_CALLS; @@ -3524,10 +3461,6 @@ ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len,  	struct ftrace_hash *hash;  	int ret; -	/* All global ops uses the global ops filters */ -	if (ops->flags & FTRACE_OPS_FL_GLOBAL) -		ops = &global_ops; -  	if (unlikely(ftrace_disabled))  		return -ENODEV; @@ -3639,8 +3572,7 @@ int ftrace_set_notrace(struct ftrace_ops *ops, unsigned char *buf,  }  EXPORT_SYMBOL_GPL(ftrace_set_notrace);  /** - * ftrace_set_filter - set a function to filter on in ftrace - * @ops - the ops to set the filter with + * ftrace_set_global_filter - set a function to filter on with global tracers   * @buf - the string that holds the function filter text.   * @len - the length of the string.   * @reset - non zero to reset all filters before applying this filter. @@ -3655,8 +3587,7 @@ void ftrace_set_global_filter(unsigned char *buf, int len, int reset)  EXPORT_SYMBOL_GPL(ftrace_set_global_filter);  /** - * ftrace_set_notrace - set a function to not trace in ftrace - * @ops - the ops to set the notrace filter with + * ftrace_set_global_notrace - set a function to not trace with global tracers   * @buf - the string that holds the function notrace text.   * @len - the length of the string.   * @reset - non zero to reset all filters before applying this filter. 
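The kernel-doc above now names ftrace_set_global_filter()/ftrace_set_global_notrace(), which act on the implicit global ops instead of taking an @ops argument. A hedged in-kernel usage sketch; the module and the filter patterns are made up, only the API itself comes from the documentation above, and CONFIG_DYNAMIC_FTRACE is assumed:

	/*
	 * Illustration only, not part of this patch: restrict the global
	 * function tracers to sched* and keep mutex_lock out of the trace.
	 */
	#include <linux/ftrace.h>
	#include <linux/init.h>
	#include <linux/module.h>

	static int __init filter_demo_init(void)
	{
		unsigned char only[] = "sched*";
		unsigned char skip[] = "mutex_lock";

		/* reset = 1: drop any previous filter, then trace only sched* */
		ftrace_set_global_filter(only, sizeof(only) - 1, 1);
		/* reset = 0: append mutex_lock to the existing notrace list */
		ftrace_set_global_notrace(skip, sizeof(skip) - 1, 0);
		return 0;
	}
	module_init(filter_demo_init);
	MODULE_LICENSE("GPL");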
@@ -4330,16 +4261,11 @@ static void ftrace_init_module(struct module *mod,  	ftrace_process_locs(mod, start, end);  } -static int ftrace_module_notify_enter(struct notifier_block *self, -				      unsigned long val, void *data) +void ftrace_module_init(struct module *mod)  { -	struct module *mod = data; - -	if (val == MODULE_STATE_COMING) -		ftrace_init_module(mod, mod->ftrace_callsites, -				   mod->ftrace_callsites + -				   mod->num_ftrace_callsites); -	return 0; +	ftrace_init_module(mod, mod->ftrace_callsites, +			   mod->ftrace_callsites + +			   mod->num_ftrace_callsites);  }  static int ftrace_module_notify_exit(struct notifier_block *self, @@ -4353,11 +4279,6 @@ static int ftrace_module_notify_exit(struct notifier_block *self,  	return 0;  }  #else -static int ftrace_module_notify_enter(struct notifier_block *self, -				      unsigned long val, void *data) -{ -	return 0; -}  static int ftrace_module_notify_exit(struct notifier_block *self,  				     unsigned long val, void *data)  { @@ -4365,11 +4286,6 @@ static int ftrace_module_notify_exit(struct notifier_block *self,  }  #endif /* CONFIG_MODULES */ -struct notifier_block ftrace_module_enter_nb = { -	.notifier_call = ftrace_module_notify_enter, -	.priority = INT_MAX,	/* Run before anything that can use kprobes */ -}; -  struct notifier_block ftrace_module_exit_nb = {  	.notifier_call = ftrace_module_notify_exit,  	.priority = INT_MIN,	/* Run after anything that can remove kprobes */ @@ -4403,10 +4319,6 @@ void __init ftrace_init(void)  				  __start_mcount_loc,  				  __stop_mcount_loc); -	ret = register_module_notifier(&ftrace_module_enter_nb); -	if (ret) -		pr_warning("Failed to register trace ftrace module enter notifier\n"); -  	ret = register_module_notifier(&ftrace_module_exit_nb);  	if (ret)  		pr_warning("Failed to register trace ftrace module exit notifier\n"); @@ -4462,6 +4374,34 @@ ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip, void *regs)  #endif /* CONFIG_DYNAMIC_FTRACE */ +__init void ftrace_init_global_array_ops(struct trace_array *tr) +{ +	tr->ops = &global_ops; +	tr->ops->private = tr; +} + +void ftrace_init_array_ops(struct trace_array *tr, ftrace_func_t func) +{ +	/* If we filter on pids, update to use the pid function */ +	if (tr->flags & TRACE_ARRAY_FL_GLOBAL) { +		if (WARN_ON(tr->ops->func != ftrace_stub)) +			printk("ftrace ops had %pS for function\n", +			       tr->ops->func); +		/* Only the top level instance does pid tracing */ +		if (!list_empty(&ftrace_pids)) { +			set_ftrace_pid_function(func); +			func = ftrace_pid_func; +		} +	} +	tr->ops->func = func; +	tr->ops->private = tr; +} + +void ftrace_reset_array_ops(struct trace_array *tr) +{ +	tr->ops->func = ftrace_stub; +} +  static void  ftrace_ops_control_func(unsigned long ip, unsigned long parent_ip,  			struct ftrace_ops *op, struct pt_regs *regs) @@ -4520,9 +4460,16 @@ __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,  	 */  	preempt_disable_notrace();  	do_for_each_ftrace_op(op, ftrace_ops_list) { -		if (ftrace_ops_test(op, ip, regs)) +		if (ftrace_ops_test(op, ip, regs)) { +			if (WARN_ON(!op->func)) { +				function_trace_stop = 1; +				printk("op=%p %pS\n", op, op); +				goto out; +			}  			op->func(ip, parent_ip, op, regs); +		}  	} while_for_each_ftrace_op(op); +out:  	preempt_enable_notrace();  	trace_clear_recursion(bit);  } @@ -4927,7 +4874,6 @@ ftrace_enable_sysctl(struct ctl_table *table, int write,  #ifdef CONFIG_FUNCTION_GRAPH_TRACER  static int ftrace_graph_active; -static struct notifier_block 
ftrace_suspend_notifier;  int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace)  { @@ -5073,13 +5019,6 @@ ftrace_suspend_notifier_call(struct notifier_block *bl, unsigned long state,  	return NOTIFY_DONE;  } -/* Just a place holder for function graph */ -static struct ftrace_ops fgraph_ops __read_mostly = { -	.func		= ftrace_stub, -	.flags		= FTRACE_OPS_FL_STUB | FTRACE_OPS_FL_GLOBAL | -				FTRACE_OPS_FL_RECURSION_SAFE, -}; -  static int ftrace_graph_entry_test(struct ftrace_graph_ent *trace)  {  	if (!ftrace_ops_test(&global_ops, trace->func, NULL)) @@ -5104,6 +5043,10 @@ static void update_function_graph_func(void)  		ftrace_graph_entry = ftrace_graph_entry_test;  } +static struct notifier_block ftrace_suspend_notifier = { +	.notifier_call = ftrace_suspend_notifier_call, +}; +  int register_ftrace_graph(trace_func_graph_ret_t retfunc,  			trace_func_graph_ent_t entryfunc)  { @@ -5117,7 +5060,6 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc,  		goto out;  	} -	ftrace_suspend_notifier.notifier_call = ftrace_suspend_notifier_call;  	register_pm_notifier(&ftrace_suspend_notifier);  	ftrace_graph_active++; @@ -5139,7 +5081,10 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc,  	ftrace_graph_entry = ftrace_graph_entry_test;  	update_function_graph_func(); -	ret = ftrace_startup(&fgraph_ops, FTRACE_START_FUNC_RET); +	/* Function graph doesn't use the .func field of global_ops */ +	global_ops.flags |= FTRACE_OPS_FL_STUB; + +	ret = ftrace_startup(&global_ops, FTRACE_START_FUNC_RET);  out:  	mutex_unlock(&ftrace_lock); @@ -5157,7 +5102,8 @@ void unregister_ftrace_graph(void)  	ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub;  	ftrace_graph_entry = ftrace_graph_entry_stub;  	__ftrace_graph_entry = ftrace_graph_entry_stub; -	ftrace_shutdown(&fgraph_ops, FTRACE_STOP_FUNC_RET); +	ftrace_shutdown(&global_ops, FTRACE_STOP_FUNC_RET); +	global_ops.flags &= ~FTRACE_OPS_FL_STUB;  	unregister_pm_notifier(&ftrace_suspend_notifier);  	unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 737b0efa1a62..16f7038d1f4d 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -275,7 +275,7 @@ int call_filter_check_discard(struct ftrace_event_call *call, void *rec,  }  EXPORT_SYMBOL_GPL(call_filter_check_discard); -cycle_t buffer_ftrace_now(struct trace_buffer *buf, int cpu) +static cycle_t buffer_ftrace_now(struct trace_buffer *buf, int cpu)  {  	u64 ts; @@ -599,7 +599,7 @@ static int alloc_snapshot(struct trace_array *tr)  	return 0;  } -void free_snapshot(struct trace_array *tr) +static void free_snapshot(struct trace_array *tr)  {  	/*  	 * We don't free the ring buffer. instead, resize it because @@ -963,27 +963,9 @@ static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt)  	return cnt;  } -/* - * ftrace_max_lock is used to protect the swapping of buffers - * when taking a max snapshot. The buffers themselves are - * protected by per_cpu spinlocks. But the action of the swap - * needs its own lock. - * - * This is defined as a arch_spinlock_t in order to help - * with performance when lockdep debugging is enabled. - * - * It is also used in other places outside the update_max_tr - * so it needs to be defined outside of the - * CONFIG_TRACER_MAX_TRACE. 
- */ -static arch_spinlock_t ftrace_max_lock = -	(arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; -  unsigned long __read_mostly	tracing_thresh;  #ifdef CONFIG_TRACER_MAX_TRACE -unsigned long __read_mostly	tracing_max_latency; -  /*   * Copy the new maximum trace into the separate maximum-trace   * structure. (this way the maximum trace is permanently saved, @@ -1000,7 +982,7 @@ __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)  	max_buf->cpu = cpu;  	max_buf->time_start = data->preempt_timestamp; -	max_data->saved_latency = tracing_max_latency; +	max_data->saved_latency = tr->max_latency;  	max_data->critical_start = data->critical_start;  	max_data->critical_end = data->critical_end; @@ -1048,14 +1030,14 @@ update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)  		return;  	} -	arch_spin_lock(&ftrace_max_lock); +	arch_spin_lock(&tr->max_lock);  	buf = tr->trace_buffer.buffer;  	tr->trace_buffer.buffer = tr->max_buffer.buffer;  	tr->max_buffer.buffer = buf;  	__update_max_tr(tr, tsk, cpu); -	arch_spin_unlock(&ftrace_max_lock); +	arch_spin_unlock(&tr->max_lock);  }  /** @@ -1081,7 +1063,7 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)  		return;  	} -	arch_spin_lock(&ftrace_max_lock); +	arch_spin_lock(&tr->max_lock);  	ret = ring_buffer_swap_cpu(tr->max_buffer.buffer, tr->trace_buffer.buffer, cpu); @@ -1099,11 +1081,11 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)  	WARN_ON_ONCE(ret && ret != -EAGAIN && ret != -EBUSY);  	__update_max_tr(tr, tsk, cpu); -	arch_spin_unlock(&ftrace_max_lock); +	arch_spin_unlock(&tr->max_lock);  }  #endif /* CONFIG_TRACER_MAX_TRACE */ -static void default_wait_pipe(struct trace_iterator *iter) +static void wait_on_pipe(struct trace_iterator *iter)  {  	/* Iterators are static, they should be filled or empty */  	if (trace_buffer_iter(iter, iter->cpu_file)) @@ -1220,8 +1202,6 @@ int register_tracer(struct tracer *type)  	else  		if (!type->flags->opts)  			type->flags->opts = dummy_tracer_opt; -	if (!type->wait_pipe) -		type->wait_pipe = default_wait_pipe;  	ret = run_tracer_selftest(type);  	if (ret < 0) @@ -1305,22 +1285,71 @@ void tracing_reset_all_online_cpus(void)  	}  } -#define SAVED_CMDLINES 128 +#define SAVED_CMDLINES_DEFAULT 128  #define NO_CMDLINE_MAP UINT_MAX -static unsigned map_pid_to_cmdline[PID_MAX_DEFAULT+1]; -static unsigned map_cmdline_to_pid[SAVED_CMDLINES]; -static char saved_cmdlines[SAVED_CMDLINES][TASK_COMM_LEN]; -static int cmdline_idx;  static arch_spinlock_t trace_cmdline_lock = __ARCH_SPIN_LOCK_UNLOCKED; +struct saved_cmdlines_buffer { +	unsigned map_pid_to_cmdline[PID_MAX_DEFAULT+1]; +	unsigned *map_cmdline_to_pid; +	unsigned cmdline_num; +	int cmdline_idx; +	char *saved_cmdlines; +}; +static struct saved_cmdlines_buffer *savedcmd;  /* temporary disable recording */  static atomic_t trace_record_cmdline_disabled __read_mostly; -static void trace_init_cmdlines(void) +static inline char *get_saved_cmdlines(int idx)  { -	memset(&map_pid_to_cmdline, NO_CMDLINE_MAP, sizeof(map_pid_to_cmdline)); -	memset(&map_cmdline_to_pid, NO_CMDLINE_MAP, sizeof(map_cmdline_to_pid)); -	cmdline_idx = 0; +	return &savedcmd->saved_cmdlines[idx * TASK_COMM_LEN]; +} + +static inline void set_cmdline(int idx, const char *cmdline) +{ +	memcpy(get_saved_cmdlines(idx), cmdline, TASK_COMM_LEN); +} + +static int allocate_cmdlines_buffer(unsigned int val, +				    struct saved_cmdlines_buffer *s) +{ +	s->map_cmdline_to_pid = kmalloc(val * 
sizeof(*s->map_cmdline_to_pid), +					GFP_KERNEL); +	if (!s->map_cmdline_to_pid) +		return -ENOMEM; + +	s->saved_cmdlines = kmalloc(val * TASK_COMM_LEN, GFP_KERNEL); +	if (!s->saved_cmdlines) { +		kfree(s->map_cmdline_to_pid); +		return -ENOMEM; +	} + +	s->cmdline_idx = 0; +	s->cmdline_num = val; +	memset(&s->map_pid_to_cmdline, NO_CMDLINE_MAP, +	       sizeof(s->map_pid_to_cmdline)); +	memset(s->map_cmdline_to_pid, NO_CMDLINE_MAP, +	       val * sizeof(*s->map_cmdline_to_pid)); + +	return 0; +} + +static int trace_create_savedcmd(void) +{ +	int ret; + +	savedcmd = kmalloc(sizeof(struct saved_cmdlines_buffer), GFP_KERNEL); +	if (!savedcmd) +		return -ENOMEM; + +	ret = allocate_cmdlines_buffer(SAVED_CMDLINES_DEFAULT, savedcmd); +	if (ret < 0) { +		kfree(savedcmd); +		savedcmd = NULL; +		return -ENOMEM; +	} + +	return 0;  }  int is_tracing_stopped(void) @@ -1353,7 +1382,7 @@ void tracing_start(void)  	}  	/* Prevent the buffers from switching */ -	arch_spin_lock(&ftrace_max_lock); +	arch_spin_lock(&global_trace.max_lock);  	buffer = global_trace.trace_buffer.buffer;  	if (buffer) @@ -1365,7 +1394,7 @@ void tracing_start(void)  		ring_buffer_record_enable(buffer);  #endif -	arch_spin_unlock(&ftrace_max_lock); +	arch_spin_unlock(&global_trace.max_lock);  	ftrace_start();   out: @@ -1420,7 +1449,7 @@ void tracing_stop(void)  		goto out;  	/* Prevent the buffers from switching */ -	arch_spin_lock(&ftrace_max_lock); +	arch_spin_lock(&global_trace.max_lock);  	buffer = global_trace.trace_buffer.buffer;  	if (buffer) @@ -1432,7 +1461,7 @@ void tracing_stop(void)  		ring_buffer_record_disable(buffer);  #endif -	arch_spin_unlock(&ftrace_max_lock); +	arch_spin_unlock(&global_trace.max_lock);   out:  	raw_spin_unlock_irqrestore(&global_trace.start_lock, flags); @@ -1461,12 +1490,12 @@ static void tracing_stop_tr(struct trace_array *tr)  void trace_stop_cmdline_recording(void); -static void trace_save_cmdline(struct task_struct *tsk) +static int trace_save_cmdline(struct task_struct *tsk)  {  	unsigned pid, idx;  	if (!tsk->pid || unlikely(tsk->pid > PID_MAX_DEFAULT)) -		return; +		return 0;  	/*  	 * It's not the end of the world if we don't get @@ -1475,11 +1504,11 @@ static void trace_save_cmdline(struct task_struct *tsk)  	 * so if we miss here, then better luck next time.  	 */  	if (!arch_spin_trylock(&trace_cmdline_lock)) -		return; +		return 0; -	idx = map_pid_to_cmdline[tsk->pid]; +	idx = savedcmd->map_pid_to_cmdline[tsk->pid];  	if (idx == NO_CMDLINE_MAP) { -		idx = (cmdline_idx + 1) % SAVED_CMDLINES; +		idx = (savedcmd->cmdline_idx + 1) % savedcmd->cmdline_num;  		/*  		 * Check whether the cmdline buffer at idx has a pid @@ -1487,22 +1516,24 @@ static void trace_save_cmdline(struct task_struct *tsk)  		 * need to clear the map_pid_to_cmdline. Otherwise we  		 * would read the new comm for the old pid.  		 
*/ -		pid = map_cmdline_to_pid[idx]; +		pid = savedcmd->map_cmdline_to_pid[idx];  		if (pid != NO_CMDLINE_MAP) -			map_pid_to_cmdline[pid] = NO_CMDLINE_MAP; +			savedcmd->map_pid_to_cmdline[pid] = NO_CMDLINE_MAP; -		map_cmdline_to_pid[idx] = tsk->pid; -		map_pid_to_cmdline[tsk->pid] = idx; +		savedcmd->map_cmdline_to_pid[idx] = tsk->pid; +		savedcmd->map_pid_to_cmdline[tsk->pid] = idx; -		cmdline_idx = idx; +		savedcmd->cmdline_idx = idx;  	} -	memcpy(&saved_cmdlines[idx], tsk->comm, TASK_COMM_LEN); +	set_cmdline(idx, tsk->comm);  	arch_spin_unlock(&trace_cmdline_lock); + +	return 1;  } -void trace_find_cmdline(int pid, char comm[]) +static void __trace_find_cmdline(int pid, char comm[])  {  	unsigned map; @@ -1521,13 +1552,19 @@ void trace_find_cmdline(int pid, char comm[])  		return;  	} -	preempt_disable(); -	arch_spin_lock(&trace_cmdline_lock); -	map = map_pid_to_cmdline[pid]; +	map = savedcmd->map_pid_to_cmdline[pid];  	if (map != NO_CMDLINE_MAP) -		strcpy(comm, saved_cmdlines[map]); +		strcpy(comm, get_saved_cmdlines(map));  	else  		strcpy(comm, "<...>"); +} + +void trace_find_cmdline(int pid, char comm[]) +{ +	preempt_disable(); +	arch_spin_lock(&trace_cmdline_lock); + +	__trace_find_cmdline(pid, comm);  	arch_spin_unlock(&trace_cmdline_lock);  	preempt_enable(); @@ -1541,9 +1578,8 @@ void tracing_record_cmdline(struct task_struct *tsk)  	if (!__this_cpu_read(trace_cmdline_save))  		return; -	__this_cpu_write(trace_cmdline_save, false); - -	trace_save_cmdline(tsk); +	if (trace_save_cmdline(tsk)) +		__this_cpu_write(trace_cmdline_save, false);  }  void @@ -1746,7 +1782,7 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer,  	 */  	barrier();  	if (use_stack == 1) { -		trace.entries		= &__get_cpu_var(ftrace_stack).calls[0]; +		trace.entries		= this_cpu_ptr(ftrace_stack.calls);  		trace.max_entries	= FTRACE_STACK_MAX_ENTRIES;  		if (regs) @@ -1995,7 +2031,21 @@ void trace_printk_init_buffers(void)  	if (alloc_percpu_trace_buffer())  		return; -	pr_info("ftrace: Allocated trace_printk buffers\n"); +	/* trace_printk() is for debug use only. Don't use it in production. */ + +	pr_warning("\n**********************************************************\n"); +	pr_warning("**   NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE   **\n"); +	pr_warning("**                                                      **\n"); +	pr_warning("** trace_printk() being used. Allocating extra memory.  **\n"); +	pr_warning("**                                                      **\n"); +	pr_warning("** This means that this is a DEBUG kernel and it is     **\n"); +	pr_warning("** unsafe for produciton use.                           **\n"); +	pr_warning("**                                                      **\n"); +	pr_warning("** If you see this message and you are not debugging    **\n"); +	pr_warning("** the kernel, report this immediately to your vendor!  
**\n"); +	pr_warning("**                                                      **\n"); +	pr_warning("**   NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE   **\n"); +	pr_warning("**********************************************************\n");  	/* Expand the buffers to set size */  	tracing_update_buffers(); @@ -3333,7 +3383,7 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf,  	mutex_lock(&tracing_cpumask_update_lock);  	local_irq_disable(); -	arch_spin_lock(&ftrace_max_lock); +	arch_spin_lock(&tr->max_lock);  	for_each_tracing_cpu(cpu) {  		/*  		 * Increase/decrease the disabled counter if we are @@ -3350,7 +3400,7 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf,  			ring_buffer_record_enable_cpu(tr->trace_buffer.buffer, cpu);  		}  	} -	arch_spin_unlock(&ftrace_max_lock); +	arch_spin_unlock(&tr->max_lock);  	local_irq_enable();  	cpumask_copy(tr->tracing_cpumask, tracing_cpumask_new); @@ -3592,6 +3642,7 @@ static const char readme_msg[] =  	"  trace_options\t\t- Set format or modify how tracing happens\n"  	"\t\t\t  Disable an option by adding a suffix 'no' to the\n"  	"\t\t\t  option name\n" +	"  saved_cmdlines_size\t- echo command number in here to store comm-pid list\n"  #ifdef CONFIG_DYNAMIC_FTRACE  	"\n  available_filter_functions - list of functions that can be filtered on\n"  	"  set_ftrace_filter\t- echo function name in here to only trace these\n" @@ -3705,55 +3756,153 @@ static const struct file_operations tracing_readme_fops = {  	.llseek		= generic_file_llseek,  }; +static void *saved_cmdlines_next(struct seq_file *m, void *v, loff_t *pos) +{ +	unsigned int *ptr = v; + +	if (*pos || m->count) +		ptr++; + +	(*pos)++; + +	for (; ptr < &savedcmd->map_cmdline_to_pid[savedcmd->cmdline_num]; +	     ptr++) { +		if (*ptr == -1 || *ptr == NO_CMDLINE_MAP) +			continue; + +		return ptr; +	} + +	return NULL; +} + +static void *saved_cmdlines_start(struct seq_file *m, loff_t *pos) +{ +	void *v; +	loff_t l = 0; + +	preempt_disable(); +	arch_spin_lock(&trace_cmdline_lock); + +	v = &savedcmd->map_cmdline_to_pid[0]; +	while (l <= *pos) { +		v = saved_cmdlines_next(m, v, &l); +		if (!v) +			return NULL; +	} + +	return v; +} + +static void saved_cmdlines_stop(struct seq_file *m, void *v) +{ +	arch_spin_unlock(&trace_cmdline_lock); +	preempt_enable(); +} + +static int saved_cmdlines_show(struct seq_file *m, void *v) +{ +	char buf[TASK_COMM_LEN]; +	unsigned int *pid = v; + +	__trace_find_cmdline(*pid, buf); +	seq_printf(m, "%d %s\n", *pid, buf); +	return 0; +} + +static const struct seq_operations tracing_saved_cmdlines_seq_ops = { +	.start		= saved_cmdlines_start, +	.next		= saved_cmdlines_next, +	.stop		= saved_cmdlines_stop, +	.show		= saved_cmdlines_show, +}; + +static int tracing_saved_cmdlines_open(struct inode *inode, struct file *filp) +{ +	if (tracing_disabled) +		return -ENODEV; + +	return seq_open(filp, &tracing_saved_cmdlines_seq_ops); +} + +static const struct file_operations tracing_saved_cmdlines_fops = { +	.open		= tracing_saved_cmdlines_open, +	.read		= seq_read, +	.llseek		= seq_lseek, +	.release	= seq_release, +}; +  static ssize_t -tracing_saved_cmdlines_read(struct file *file, char __user *ubuf, -				size_t cnt, loff_t *ppos) +tracing_saved_cmdlines_size_read(struct file *filp, char __user *ubuf, +				 size_t cnt, loff_t *ppos)  { -	char *buf_comm; -	char *file_buf; -	char *buf; -	int len = 0; -	int pid; -	int i; +	char buf[64]; +	int r; + +	arch_spin_lock(&trace_cmdline_lock); +	r = sprintf(buf, "%u\n", savedcmd->cmdline_num); +	
arch_spin_unlock(&trace_cmdline_lock); + +	return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); +} + +static void free_saved_cmdlines_buffer(struct saved_cmdlines_buffer *s) +{ +	kfree(s->saved_cmdlines); +	kfree(s->map_cmdline_to_pid); +	kfree(s); +} + +static int tracing_resize_saved_cmdlines(unsigned int val) +{ +	struct saved_cmdlines_buffer *s, *savedcmd_temp; -	file_buf = kmalloc(SAVED_CMDLINES*(16+TASK_COMM_LEN), GFP_KERNEL); -	if (!file_buf) +	s = kmalloc(sizeof(struct saved_cmdlines_buffer), GFP_KERNEL); +	if (!s)  		return -ENOMEM; -	buf_comm = kmalloc(TASK_COMM_LEN, GFP_KERNEL); -	if (!buf_comm) { -		kfree(file_buf); +	if (allocate_cmdlines_buffer(val, s) < 0) { +		kfree(s);  		return -ENOMEM;  	} -	buf = file_buf; +	arch_spin_lock(&trace_cmdline_lock); +	savedcmd_temp = savedcmd; +	savedcmd = s; +	arch_spin_unlock(&trace_cmdline_lock); +	free_saved_cmdlines_buffer(savedcmd_temp); -	for (i = 0; i < SAVED_CMDLINES; i++) { -		int r; +	return 0; +} -		pid = map_cmdline_to_pid[i]; -		if (pid == -1 || pid == NO_CMDLINE_MAP) -			continue; +static ssize_t +tracing_saved_cmdlines_size_write(struct file *filp, const char __user *ubuf, +				  size_t cnt, loff_t *ppos) +{ +	unsigned long val; +	int ret; -		trace_find_cmdline(pid, buf_comm); -		r = sprintf(buf, "%d %s\n", pid, buf_comm); -		buf += r; -		len += r; -	} +	ret = kstrtoul_from_user(ubuf, cnt, 10, &val); +	if (ret) +		return ret; -	len = simple_read_from_buffer(ubuf, cnt, ppos, -				      file_buf, len); +	/* must have at least 1 entry or less than PID_MAX_DEFAULT */ +	if (!val || val > PID_MAX_DEFAULT) +		return -EINVAL; -	kfree(file_buf); -	kfree(buf_comm); +	ret = tracing_resize_saved_cmdlines((unsigned int)val); +	if (ret < 0) +		return ret; -	return len; +	*ppos += cnt; + +	return cnt;  } -static const struct file_operations tracing_saved_cmdlines_fops = { -    .open       = tracing_open_generic, -    .read       = tracing_saved_cmdlines_read, -    .llseek	= generic_file_llseek, +static const struct file_operations tracing_saved_cmdlines_size_fops = { +	.open		= tracing_open_generic, +	.read		= tracing_saved_cmdlines_size_read, +	.write		= tracing_saved_cmdlines_size_write,  };  static ssize_t @@ -4225,25 +4374,6 @@ tracing_poll_pipe(struct file *filp, poll_table *poll_table)  	return trace_poll(iter, filp, poll_table);  } -/* - * This is a make-shift waitqueue. - * A tracer might use this callback on some rare cases: - * - *  1) the current tracer might hold the runqueue lock when it wakes up - *     a reader, hence a deadlock (sched, function, and function graph tracers) - *  2) the function tracers, trace all functions, we don't want - *     the overhead of calling wake_up and friends - *     (and tracing them too) - * - *     Anyway, this is really very primitive wakeup. - */ -void poll_wait_pipe(struct trace_iterator *iter) -{ -	set_current_state(TASK_INTERRUPTIBLE); -	/* sleep for 100 msecs, and try again. */ -	schedule_timeout(HZ / 10); -} -  /* Must be called with trace_types_lock mutex held. */  static int tracing_wait_pipe(struct file *filp)  { @@ -4255,15 +4385,6 @@ static int tracing_wait_pipe(struct file *filp)  			return -EAGAIN;  		} -		mutex_unlock(&iter->mutex); - -		iter->trace->wait_pipe(iter); - -		mutex_lock(&iter->mutex); - -		if (signal_pending(current)) -			return -EINTR; -  		/*  		 * We block until we read something and tracing is disabled.  		 
* We still block if tracing is disabled, but we have never @@ -4275,6 +4396,15 @@ static int tracing_wait_pipe(struct file *filp)  		 */  		if (!tracing_is_on() && iter->pos)  			break; + +		mutex_unlock(&iter->mutex); + +		wait_on_pipe(iter); + +		mutex_lock(&iter->mutex); + +		if (signal_pending(current)) +			return -EINTR;  	}  	return 1; @@ -5197,7 +5327,7 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,  				goto out_unlock;  			}  			mutex_unlock(&trace_types_lock); -			iter->trace->wait_pipe(iter); +			wait_on_pipe(iter);  			mutex_lock(&trace_types_lock);  			if (signal_pending(current)) {  				size = -EINTR; @@ -5408,7 +5538,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,  			goto out;  		}  		mutex_unlock(&trace_types_lock); -		iter->trace->wait_pipe(iter); +		wait_on_pipe(iter);  		mutex_lock(&trace_types_lock);  		if (signal_pending(current)) {  			ret = -EINTR; @@ -6102,6 +6232,25 @@ static int allocate_trace_buffers(struct trace_array *tr, int size)  	return 0;  } +static void free_trace_buffers(struct trace_array *tr) +{ +	if (!tr) +		return; + +	if (tr->trace_buffer.buffer) { +		ring_buffer_free(tr->trace_buffer.buffer); +		tr->trace_buffer.buffer = NULL; +		free_percpu(tr->trace_buffer.data); +	} + +#ifdef CONFIG_TRACER_MAX_TRACE +	if (tr->max_buffer.buffer) { +		ring_buffer_free(tr->max_buffer.buffer); +		tr->max_buffer.buffer = NULL; +	} +#endif +} +  static int new_instance_create(const char *name)  {  	struct trace_array *tr; @@ -6131,6 +6280,8 @@ static int new_instance_create(const char *name)  	raw_spin_lock_init(&tr->start_lock); +	tr->max_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; +  	tr->current_trace = &nop_trace;  	INIT_LIST_HEAD(&tr->systems); @@ -6158,8 +6309,7 @@ static int new_instance_create(const char *name)  	return 0;   out_free_tr: -	if (tr->trace_buffer.buffer) -		ring_buffer_free(tr->trace_buffer.buffer); +	free_trace_buffers(tr);  	free_cpumask_var(tr->tracing_cpumask);  	kfree(tr->name);  	kfree(tr); @@ -6199,8 +6349,7 @@ static int instance_delete(const char *name)  	event_trace_del_tracer(tr);  	ftrace_destroy_function_files(tr);  	debugfs_remove_recursive(tr->dir); -	free_percpu(tr->trace_buffer.data); -	ring_buffer_free(tr->trace_buffer.buffer); +	free_trace_buffers(tr);  	kfree(tr->name);  	kfree(tr); @@ -6328,6 +6477,11 @@ init_tracer_debugfs(struct trace_array *tr, struct dentry *d_tracer)  	trace_create_file("tracing_on", 0644, d_tracer,  			  tr, &rb_simple_fops); +#ifdef CONFIG_TRACER_MAX_TRACE +	trace_create_file("tracing_max_latency", 0644, d_tracer, +			&tr->max_latency, &tracing_max_lat_fops); +#endif +  	if (ftrace_create_function_files(tr, d_tracer))  		WARN(1, "Could not allocate function filter files"); @@ -6353,11 +6507,6 @@ static __init int tracer_init_debugfs(void)  	init_tracer_debugfs(&global_trace, d_tracer); -#ifdef CONFIG_TRACER_MAX_TRACE -	trace_create_file("tracing_max_latency", 0644, d_tracer, -			&tracing_max_latency, &tracing_max_lat_fops); -#endif -  	trace_create_file("tracing_thresh", 0644, d_tracer,  			&tracing_thresh, &tracing_max_lat_fops); @@ -6367,6 +6516,9 @@ static __init int tracer_init_debugfs(void)  	trace_create_file("saved_cmdlines", 0444, d_tracer,  			NULL, &tracing_saved_cmdlines_fops); +	trace_create_file("saved_cmdlines_size", 0644, d_tracer, +			  NULL, &tracing_saved_cmdlines_size_fops); +  #ifdef CONFIG_DYNAMIC_FTRACE  	trace_create_file("dyn_ftrace_total_info", 0444, d_tracer,  			&ftrace_update_tot_cnt, &tracing_dyn_info_fops); @@ -6603,18 +6755,19 @@ 
__init static int tracer_alloc_buffers(void)  	if (!temp_buffer)  		goto out_free_cpumask; +	if (trace_create_savedcmd() < 0) +		goto out_free_temp_buffer; +  	/* TODO: make the number of buffers hot pluggable with CPUS */  	if (allocate_trace_buffers(&global_trace, ring_buf_size) < 0) {  		printk(KERN_ERR "tracer: failed to allocate ring buffer!\n");  		WARN_ON(1); -		goto out_free_temp_buffer; +		goto out_free_savedcmd;  	}  	if (global_trace.buffer_disabled)  		tracing_off(); -	trace_init_cmdlines(); -  	if (trace_boot_clock) {  		ret = tracing_set_clock(&global_trace, trace_boot_clock);  		if (ret < 0) @@ -6629,6 +6782,10 @@ __init static int tracer_alloc_buffers(void)  	 */  	global_trace.current_trace = &nop_trace; +	global_trace.max_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; + +	ftrace_init_global_array_ops(&global_trace); +  	register_tracer(&nop_trace);  	/* All seems OK, enable tracing */ @@ -6656,13 +6813,11 @@ __init static int tracer_alloc_buffers(void)  	return 0; +out_free_savedcmd: +	free_saved_cmdlines_buffer(savedcmd);  out_free_temp_buffer:  	ring_buffer_free(temp_buffer);  out_free_cpumask: -	free_percpu(global_trace.trace_buffer.data); -#ifdef CONFIG_TRACER_MAX_TRACE -	free_percpu(global_trace.max_buffer.data); -#endif  	free_cpumask_var(global_trace.tracing_cpumask);  out_free_buffer_mask:  	free_cpumask_var(tracing_buffer_mask); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 2e29d7ba5a52..9e82551dd566 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -190,7 +190,22 @@ struct trace_array {  	 */  	struct trace_buffer	max_buffer;  	bool			allocated_snapshot; +	unsigned long		max_latency;  #endif +	/* +	 * max_lock is used to protect the swapping of buffers +	 * when taking a max snapshot. The buffers themselves are +	 * protected by per_cpu spinlocks. But the action of the swap +	 * needs its own lock. +	 * +	 * This is defined as a arch_spinlock_t in order to help +	 * with performance when lockdep debugging is enabled. +	 * +	 * It is also used in other places outside the update_max_tr +	 * so it needs to be defined outside of the +	 * CONFIG_TRACER_MAX_TRACE. 
+	 */ +	arch_spinlock_t		max_lock;  	int			buffer_disabled;  #ifdef CONFIG_FTRACE_SYSCALLS  	int			sys_refcount_enter; @@ -237,6 +252,9 @@ static inline struct trace_array *top_trace_array(void)  {  	struct trace_array *tr; +	if (list_empty(ftrace_trace_arrays.prev)) +		return NULL; +  	tr = list_entry(ftrace_trace_arrays.prev,  			typeof(*tr), list);  	WARN_ON(!(tr->flags & TRACE_ARRAY_FL_GLOBAL)); @@ -323,7 +341,6 @@ struct tracer_flags {   * @stop: called when tracing is paused (echo 0 > tracing_enabled)   * @open: called when the trace file is opened   * @pipe_open: called when the trace_pipe file is opened - * @wait_pipe: override how the user waits for traces on trace_pipe   * @close: called when the trace file is released   * @pipe_close: called when the trace_pipe file is released   * @read: override the default read callback on trace_pipe @@ -342,7 +359,6 @@ struct tracer {  	void			(*stop)(struct trace_array *tr);  	void			(*open)(struct trace_iterator *iter);  	void			(*pipe_open)(struct trace_iterator *iter); -	void			(*wait_pipe)(struct trace_iterator *iter);  	void			(*close)(struct trace_iterator *iter);  	void			(*pipe_close)(struct trace_iterator *iter);  	ssize_t			(*read)(struct trace_iterator *iter, @@ -416,13 +432,7 @@ enum {  	TRACE_FTRACE_IRQ_BIT,  	TRACE_FTRACE_SIRQ_BIT, -	/* GLOBAL_BITs must be greater than FTRACE_BITs */ -	TRACE_GLOBAL_BIT, -	TRACE_GLOBAL_NMI_BIT, -	TRACE_GLOBAL_IRQ_BIT, -	TRACE_GLOBAL_SIRQ_BIT, - -	/* INTERNAL_BITs must be greater than GLOBAL_BITs */ +	/* INTERNAL_BITs must be greater than FTRACE_BITs */  	TRACE_INTERNAL_BIT,  	TRACE_INTERNAL_NMI_BIT,  	TRACE_INTERNAL_IRQ_BIT, @@ -449,9 +459,6 @@ enum {  #define TRACE_FTRACE_START	TRACE_FTRACE_BIT  #define TRACE_FTRACE_MAX	((1 << (TRACE_FTRACE_START + TRACE_CONTEXT_BITS)) - 1) -#define TRACE_GLOBAL_START	TRACE_GLOBAL_BIT -#define TRACE_GLOBAL_MAX	((1 << (TRACE_GLOBAL_START + TRACE_CONTEXT_BITS)) - 1) -  #define TRACE_LIST_START	TRACE_INTERNAL_BIT  #define TRACE_LIST_MAX		((1 << (TRACE_LIST_START + TRACE_CONTEXT_BITS)) - 1) @@ -560,8 +567,6 @@ void trace_init_global_iter(struct trace_iterator *iter);  void tracing_iter_reset(struct trace_iterator *iter, int cpu); -void poll_wait_pipe(struct trace_iterator *iter); -  void tracing_sched_switch_trace(struct trace_array *tr,  				struct task_struct *prev,  				struct task_struct *next, @@ -608,8 +613,6 @@ extern unsigned long nsecs_to_usecs(unsigned long nsecs);  extern unsigned long tracing_thresh;  #ifdef CONFIG_TRACER_MAX_TRACE -extern unsigned long tracing_max_latency; -  void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu);  void update_max_tr_single(struct trace_array *tr,  			  struct task_struct *tsk, int cpu); @@ -724,6 +727,8 @@ extern unsigned long trace_flags;  #define TRACE_GRAPH_PRINT_PROC          0x8  #define TRACE_GRAPH_PRINT_DURATION      0x10  #define TRACE_GRAPH_PRINT_ABS_TIME      0x20 +#define TRACE_GRAPH_PRINT_IRQS          0x40 +#define TRACE_GRAPH_PRINT_TAIL          0x80  #define TRACE_GRAPH_PRINT_FILL_SHIFT	28  #define TRACE_GRAPH_PRINT_FILL_MASK	(0x3 << TRACE_GRAPH_PRINT_FILL_SHIFT) @@ -823,6 +828,10 @@ extern int ftrace_is_dead(void);  int ftrace_create_function_files(struct trace_array *tr,  				 struct dentry *parent);  void ftrace_destroy_function_files(struct trace_array *tr); +void ftrace_init_global_array_ops(struct trace_array *tr); +void ftrace_init_array_ops(struct trace_array *tr, ftrace_func_t func); +void ftrace_reset_array_ops(struct trace_array *tr); +int 
using_ftrace_ops_list_func(void);  #else  static inline int ftrace_trace_task(struct task_struct *task)  { @@ -836,6 +845,11 @@ ftrace_create_function_files(struct trace_array *tr,  	return 0;  }  static inline void ftrace_destroy_function_files(struct trace_array *tr) { } +static inline __init void +ftrace_init_global_array_ops(struct trace_array *tr) { } +static inline void ftrace_reset_array_ops(struct trace_array *tr) { } +/* ftace_func_t type is not defined, use macro instead of static inline */ +#define ftrace_init_array_ops(tr, func) do { } while (0)  #endif /* CONFIG_FUNCTION_TRACER */  #if defined(CONFIG_FUNCTION_TRACER) && defined(CONFIG_DYNAMIC_FTRACE) diff --git a/kernel/trace/trace_benchmark.c b/kernel/trace/trace_benchmark.c new file mode 100644 index 000000000000..40a14cbcf8e0 --- /dev/null +++ b/kernel/trace/trace_benchmark.c @@ -0,0 +1,198 @@ +#include <linux/delay.h> +#include <linux/module.h> +#include <linux/kthread.h> +#include <linux/trace_clock.h> + +#define CREATE_TRACE_POINTS +#include "trace_benchmark.h" + +static struct task_struct *bm_event_thread; + +static char bm_str[BENCHMARK_EVENT_STRLEN] = "START"; + +static u64 bm_total; +static u64 bm_totalsq; +static u64 bm_last; +static u64 bm_max; +static u64 bm_min; +static u64 bm_first; +static u64 bm_cnt; +static u64 bm_stddev; +static unsigned int bm_avg; +static unsigned int bm_std; + +/* + * This gets called in a loop recording the time it took to write + * the tracepoint. What it writes is the time statistics of the last + * tracepoint write. As there is nothing to write the first time + * it simply writes "START". As the first write is cold cache and + * the rest is hot, we save off that time in bm_first and it is + * reported as "first", which is shown in the second write to the + * tracepoint. The "first" field is writen within the statics from + * then on but never changes. + */ +static void trace_do_benchmark(void) +{ +	u64 start; +	u64 stop; +	u64 delta; +	u64 stddev; +	u64 seed; +	u64 last_seed; +	unsigned int avg; +	unsigned int std = 0; + +	/* Only run if the tracepoint is actually active */ +	if (!trace_benchmark_event_enabled()) +		return; + +	local_irq_disable(); +	start = trace_clock_local(); +	trace_benchmark_event(bm_str); +	stop = trace_clock_local(); +	local_irq_enable(); + +	bm_cnt++; + +	delta = stop - start; + +	/* +	 * The first read is cold cached, keep it separate from the +	 * other calculations. +	 */ +	if (bm_cnt == 1) { +		bm_first = delta; +		scnprintf(bm_str, BENCHMARK_EVENT_STRLEN, +			  "first=%llu [COLD CACHED]", bm_first); +		return; +	} + +	bm_last = delta; + +	if (delta > bm_max) +		bm_max = delta; +	if (!bm_min || delta < bm_min) +		bm_min = delta; + +	/* +	 * When bm_cnt is greater than UINT_MAX, it breaks the statistics +	 * accounting. Freeze the statistics when that happens. +	 * We should have enough data for the avg and stddev anyway. 
+	 */ +	if (bm_cnt > UINT_MAX) { +		scnprintf(bm_str, BENCHMARK_EVENT_STRLEN, +		    "last=%llu first=%llu max=%llu min=%llu ** avg=%u std=%d std^2=%lld", +			  bm_last, bm_first, bm_max, bm_min, bm_avg, bm_std, bm_stddev); +		return; +	} + +	bm_total += delta; +	bm_totalsq += delta * delta; + + +	if (bm_cnt > 1) { +		/* +		 * Apply Welford's method to calculate standard deviation: +		 * s^2 = 1 / (n * (n-1)) * (n * \Sum (x_i)^2 - (\Sum x_i)^2) +		 */ +		stddev = (u64)bm_cnt * bm_totalsq - bm_total * bm_total; +		do_div(stddev, (u32)bm_cnt); +		do_div(stddev, (u32)bm_cnt - 1); +	} else +		stddev = 0; + +	delta = bm_total; +	do_div(delta, bm_cnt); +	avg = delta; + +	if (stddev > 0) { +		int i = 0; +		/* +		 * stddev is the square of standard deviation but +		 * we want the actualy number. Use the average +		 * as our seed to find the std. +		 * +		 * The next try is: +		 *  x = (x + N/x) / 2 +		 * +		 * Where N is the squared number to find the square +		 * root of. +		 */ +		seed = avg; +		do { +			last_seed = seed; +			seed = stddev; +			if (!last_seed) +				break; +			do_div(seed, last_seed); +			seed += last_seed; +			do_div(seed, 2); +		} while (i++ < 10 && last_seed != seed); + +		std = seed; +	} + +	scnprintf(bm_str, BENCHMARK_EVENT_STRLEN, +		  "last=%llu first=%llu max=%llu min=%llu avg=%u std=%d std^2=%lld", +		  bm_last, bm_first, bm_max, bm_min, avg, std, stddev); + +	bm_std = std; +	bm_avg = avg; +	bm_stddev = stddev; +} + +static int benchmark_event_kthread(void *arg) +{ +	/* sleep a bit to make sure the tracepoint gets activated */ +	msleep(100); + +	while (!kthread_should_stop()) { + +		trace_do_benchmark(); + +		/* +		 * We don't go to sleep, but let others +		 * run as well. +		 */ +		cond_resched(); +	} + +	return 0; +} + +/* + * When the benchmark tracepoint is enabled, it calls this + * function and the thread that calls the tracepoint is created. + */ +void trace_benchmark_reg(void) +{ +	bm_event_thread = kthread_run(benchmark_event_kthread, +				      NULL, "event_benchmark"); +	WARN_ON(!bm_event_thread); +} + +/* + * When the benchmark tracepoint is disabled, it calls this + * function and the thread that calls the tracepoint is deleted + * and all the numbers are reset. + */ +void trace_benchmark_unreg(void) +{ +	if (!bm_event_thread) +		return; + +	kthread_stop(bm_event_thread); + +	strcpy(bm_str, "START"); +	bm_total = 0; +	bm_totalsq = 0; +	bm_last = 0; +	bm_max = 0; +	bm_min = 0; +	bm_cnt = 0; +	/* These don't need to be reset but reset them anyway */ +	bm_first = 0; +	bm_std = 0; +	bm_avg = 0; +	bm_stddev = 0; +} diff --git a/kernel/trace/trace_benchmark.h b/kernel/trace/trace_benchmark.h new file mode 100644 index 000000000000..3c1df1df4e29 --- /dev/null +++ b/kernel/trace/trace_benchmark.h @@ -0,0 +1,41 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM benchmark + +#if !defined(_TRACE_BENCHMARK_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_BENCHMARK_H + +#include <linux/tracepoint.h> + +extern void trace_benchmark_reg(void); +extern void trace_benchmark_unreg(void); + +#define BENCHMARK_EVENT_STRLEN		128 + +TRACE_EVENT_FN(benchmark_event, + +	TP_PROTO(const char *str), + +	TP_ARGS(str), + +	TP_STRUCT__entry( +		__array(	char,	str,	BENCHMARK_EVENT_STRLEN	) +	), + +	TP_fast_assign( +		memcpy(__entry->str, str, BENCHMARK_EVENT_STRLEN); +	), + +	TP_printk("%s", __entry->str), + +	trace_benchmark_reg, trace_benchmark_unreg +); + +#endif /* _TRACE_BENCHMARK_H */ + +#undef TRACE_INCLUDE_FILE +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH . 
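The loop in trace_do_benchmark() above is the Newton/Babylonian square-root iteration its comment describes, seeded with the running average and capped at ten rounds. A stand-alone user-space rendering (isqrt64() is an illustration, not a function from this patch) that reproduces the std=446 value shown for std^2=199712 in the Kconfig example output:

	/*
	 * Illustration only: integer square root via x = (x + N/x) / 2,
	 * seeded with the running average, at most ten rounds.
	 */
	#include <stdint.h>
	#include <stdio.h>

	static uint64_t isqrt64(uint64_t n, uint64_t seed)
	{
		uint64_t x = seed ? seed : 1;
		int i;

		if (!n)
			return 0;

		for (i = 0; i < 10; i++) {
			uint64_t last = x;

			x = (x + n / x) / 2;
			if (x == last)
				break;
		}
		return x;
	}

	int main(void)
	{
		/* std^2 = (n * sum(x_i^2) - (sum x_i)^2) / (n * (n - 1)) */
		uint64_t var = 199712;	/* std^2 from the first line of the Kconfig example */

		/* seeded with avg=316, this converges to 446, matching std=446 */
		printf("std ~= %llu\n", (unsigned long long)isqrt64(var, 316));
		return 0;
	}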
+#define TRACE_INCLUDE_FILE trace_benchmark + +/* This part must be outside protection */ +#include <trace/define_trace.h> diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 3ddfd8f62c05..f99e0b3bca8c 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -574,6 +574,9 @@ int trace_set_clr_event(const char *system, const char *event, int set)  {  	struct trace_array *tr = top_trace_array(); +	if (!tr) +		return -ENODEV; +  	return __ftrace_set_clr_event(tr, NULL, system, event, set);  }  EXPORT_SYMBOL_GPL(trace_set_clr_event); @@ -2065,6 +2068,9 @@ event_enable_func(struct ftrace_hash *hash,  	bool enable;  	int ret; +	if (!tr) +		return -ENODEV; +  	/* hash funcs only work with set_ftrace_filter */  	if (!enabled || !param)  		return -EINVAL; @@ -2396,6 +2402,9 @@ static __init int event_trace_enable(void)  	char *token;  	int ret; +	if (!tr) +		return -ENODEV; +  	for_each_event(iter, __start_ftrace_events, __stop_ftrace_events) {  		call = *iter; @@ -2442,6 +2451,8 @@ static __init int event_trace_init(void)  	int ret;  	tr = top_trace_array(); +	if (!tr) +		return -ENODEV;  	d_tracer = tracing_init_dentry();  	if (!d_tracer) @@ -2535,6 +2546,8 @@ static __init void event_trace_self_tests(void)  	int ret;  	tr = top_trace_array(); +	if (!tr) +		return;  	pr_info("Running tests on trace events:\n"); diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index 925f537f07d1..4747b476a030 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -77,7 +77,7 @@ event_triggers_call(struct ftrace_event_file *file, void *rec)  			data->ops->func(data);  			continue;  		} -		filter = rcu_dereference(data->filter); +		filter = rcu_dereference_sched(data->filter);  		if (filter && !filter_match_preds(filter, rec))  			continue;  		if (data->cmd_ops->post_trigger) { diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 5b781d2be383..57f0ec962d2c 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -26,8 +26,6 @@ function_trace_call(unsigned long ip, unsigned long parent_ip,  static void  function_stack_trace_call(unsigned long ip, unsigned long parent_ip,  			  struct ftrace_ops *op, struct pt_regs *pt_regs); -static struct ftrace_ops trace_ops; -static struct ftrace_ops trace_stack_ops;  static struct tracer_flags func_flags;  /* Our option */ @@ -58,12 +56,16 @@ int ftrace_create_function_files(struct trace_array *tr,  {  	int ret; -	/* The top level array uses the "global_ops". */ -	if (!(tr->flags & TRACE_ARRAY_FL_GLOBAL)) { -		ret = allocate_ftrace_ops(tr); -		if (ret) -			return ret; -	} +	/* +	 * The top level array uses the "global_ops", and the files are +	 * created on boot up. 
+	 */ +	if (tr->flags & TRACE_ARRAY_FL_GLOBAL) +		return 0; + +	ret = allocate_ftrace_ops(tr); +	if (ret) +		return ret;  	ftrace_create_filter_files(tr->ops, parent); @@ -79,28 +81,24 @@ void ftrace_destroy_function_files(struct trace_array *tr)  static int function_trace_init(struct trace_array *tr)  { -	struct ftrace_ops *ops; - -	if (tr->flags & TRACE_ARRAY_FL_GLOBAL) { -		/* There's only one global tr */ -		if (!trace_ops.private) { -			trace_ops.private = tr; -			trace_stack_ops.private = tr; -		} +	ftrace_func_t func; -		if (func_flags.val & TRACE_FUNC_OPT_STACK) -			ops = &trace_stack_ops; -		else -			ops = &trace_ops; -		tr->ops = ops; -	} else if (!tr->ops) { -		/* -		 * Instance trace_arrays get their ops allocated -		 * at instance creation. Unless it failed -		 * the allocation. -		 */ +	/* +	 * Instance trace_arrays get their ops allocated +	 * at instance creation. Unless it failed +	 * the allocation. +	 */ +	if (!tr->ops)  		return -ENOMEM; -	} + +	/* Currently only the global instance can do stack tracing */ +	if (tr->flags & TRACE_ARRAY_FL_GLOBAL && +	    func_flags.val & TRACE_FUNC_OPT_STACK) +		func = function_stack_trace_call; +	else +		func = function_trace_call; + +	ftrace_init_array_ops(tr, func);  	tr->trace_buffer.cpu = get_cpu();  	put_cpu(); @@ -114,6 +112,7 @@ static void function_trace_reset(struct trace_array *tr)  {  	tracing_stop_function_trace(tr);  	tracing_stop_cmdline_record(); +	ftrace_reset_array_ops(tr);  }  static void function_trace_start(struct trace_array *tr) @@ -195,18 +194,6 @@ function_stack_trace_call(unsigned long ip, unsigned long parent_ip,  	local_irq_restore(flags);  } -static struct ftrace_ops trace_ops __read_mostly = -{ -	.func = function_trace_call, -	.flags = FTRACE_OPS_FL_GLOBAL | FTRACE_OPS_FL_RECURSION_SAFE, -}; - -static struct ftrace_ops trace_stack_ops __read_mostly = -{ -	.func = function_stack_trace_call, -	.flags = FTRACE_OPS_FL_GLOBAL | FTRACE_OPS_FL_RECURSION_SAFE, -}; -  static struct tracer_opt func_opts[] = {  #ifdef CONFIG_STACKTRACE  	{ TRACER_OPT(func_stack_trace, TRACE_FUNC_OPT_STACK) }, @@ -244,10 +231,10 @@ func_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set)  		unregister_ftrace_function(tr->ops);  		if (set) { -			tr->ops = &trace_stack_ops; +			tr->ops->func = function_stack_trace_call;  			register_ftrace_function(tr->ops);  		} else { -			tr->ops = &trace_ops; +			tr->ops->func = function_trace_call;  			register_ftrace_function(tr->ops);  		} @@ -265,7 +252,6 @@ static struct tracer function_trace __tracer_data =  	.init		= function_trace_init,  	.reset		= function_trace_reset,  	.start		= function_trace_start, -	.wait_pipe	= poll_wait_pipe,  	.flags		= &func_flags,  	.set_flag	= func_set_flag,  	.allow_instances = true, diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index deff11200261..4de3e57f723c 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -38,15 +38,6 @@ struct fgraph_data {  #define TRACE_GRAPH_INDENT	2 -/* Flag options */ -#define TRACE_GRAPH_PRINT_OVERRUN	0x1 -#define TRACE_GRAPH_PRINT_CPU		0x2 -#define TRACE_GRAPH_PRINT_OVERHEAD	0x4 -#define TRACE_GRAPH_PRINT_PROC		0x8 -#define TRACE_GRAPH_PRINT_DURATION	0x10 -#define TRACE_GRAPH_PRINT_ABS_TIME	0x20 -#define TRACE_GRAPH_PRINT_IRQS		0x40 -  static unsigned int max_depth;  static struct tracer_opt trace_opts[] = { @@ -64,11 +55,13 @@ static struct tracer_opt trace_opts[] = {  	{ TRACER_OPT(funcgraph-abstime, TRACE_GRAPH_PRINT_ABS_TIME) },  
	/* Display interrupts */  	{ TRACER_OPT(funcgraph-irqs, TRACE_GRAPH_PRINT_IRQS) }, +	/* Display function name after trailing } */ +	{ TRACER_OPT(funcgraph-tail, TRACE_GRAPH_PRINT_TAIL) },  	{ } /* Empty entry */  };  static struct tracer_flags tracer_flags = { -	/* Don't display overruns and proc by default */ +	/* Don't display overruns, proc, or tail by default */  	.val = TRACE_GRAPH_PRINT_CPU | TRACE_GRAPH_PRINT_OVERHEAD |  	       TRACE_GRAPH_PRINT_DURATION | TRACE_GRAPH_PRINT_IRQS,  	.opts = trace_opts @@ -1176,9 +1169,10 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,  	 * If the return function does not have a matching entry,  	 * then the entry was lost. Instead of just printing  	 * the '}' and letting the user guess what function this -	 * belongs to, write out the function name. +	 * belongs to, write out the function name. Always do +	 * that if the funcgraph-tail option is enabled.  	 */ -	if (func_match) { +	if (func_match && !(flags & TRACE_GRAPH_PRINT_TAIL)) {  		ret = trace_seq_puts(s, "}\n");  		if (!ret)  			return TRACE_TYPE_PARTIAL_LINE; @@ -1505,7 +1499,6 @@ static struct tracer graph_trace __tracer_data = {  	.pipe_open	= graph_trace_open,  	.close		= graph_trace_close,  	.pipe_close	= graph_trace_close, -	.wait_pipe	= poll_wait_pipe,  	.init		= graph_trace_init,  	.reset		= graph_trace_reset,  	.print_line	= print_graph_function, diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 8ff02cbb892f..9bb104f748d0 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -151,12 +151,6 @@ irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip,  	atomic_dec(&data->disabled);  } - -static struct ftrace_ops trace_ops __read_mostly = -{ -	.func = irqsoff_tracer_call, -	.flags = FTRACE_OPS_FL_GLOBAL | FTRACE_OPS_FL_RECURSION_SAFE, -};  #endif /* CONFIG_FUNCTION_TRACER */  #ifdef CONFIG_FUNCTION_GRAPH_TRACER @@ -176,7 +170,7 @@ irqsoff_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set)  	for_each_possible_cpu(cpu)  		per_cpu(tracing_cpu, cpu) = 0; -	tracing_max_latency = 0; +	tr->max_latency = 0;  	tracing_reset_online_cpus(&irqsoff_trace->trace_buffer);  	return start_irqsoff_tracer(irqsoff_trace, set); @@ -303,13 +297,13 @@ static void irqsoff_print_header(struct seq_file *s)  /*   * Should this new latency be reported/recorded?   
*/ -static int report_latency(cycle_t delta) +static int report_latency(struct trace_array *tr, cycle_t delta)  {  	if (tracing_thresh) {  		if (delta < tracing_thresh)  			return 0;  	} else { -		if (delta <= tracing_max_latency) +		if (delta <= tr->max_latency)  			return 0;  	}  	return 1; @@ -333,13 +327,13 @@ check_critical_timing(struct trace_array *tr,  	pc = preempt_count(); -	if (!report_latency(delta)) +	if (!report_latency(tr, delta))  		goto out;  	raw_spin_lock_irqsave(&max_trace_lock, flags);  	/* check if we are still the max latency */ -	if (!report_latency(delta)) +	if (!report_latency(tr, delta))  		goto out_unlock;  	__trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc); @@ -352,7 +346,7 @@ check_critical_timing(struct trace_array *tr,  	data->critical_end = parent_ip;  	if (likely(!is_tracing_stopped())) { -		tracing_max_latency = delta; +		tr->max_latency = delta;  		update_max_tr_single(tr, current, cpu);  	} @@ -531,7 +525,7 @@ void trace_preempt_off(unsigned long a0, unsigned long a1)  }  #endif /* CONFIG_PREEMPT_TRACER */ -static int register_irqsoff_function(int graph, int set) +static int register_irqsoff_function(struct trace_array *tr, int graph, int set)  {  	int ret; @@ -543,7 +537,7 @@ static int register_irqsoff_function(int graph, int set)  		ret = register_ftrace_graph(&irqsoff_graph_return,  					    &irqsoff_graph_entry);  	else -		ret = register_ftrace_function(&trace_ops); +		ret = register_ftrace_function(tr->ops);  	if (!ret)  		function_enabled = true; @@ -551,7 +545,7 @@ static int register_irqsoff_function(int graph, int set)  	return ret;  } -static void unregister_irqsoff_function(int graph) +static void unregister_irqsoff_function(struct trace_array *tr, int graph)  {  	if (!function_enabled)  		return; @@ -559,17 +553,17 @@ static void unregister_irqsoff_function(int graph)  	if (graph)  		unregister_ftrace_graph();  	else -		unregister_ftrace_function(&trace_ops); +		unregister_ftrace_function(tr->ops);  	function_enabled = false;  } -static void irqsoff_function_set(int set) +static void irqsoff_function_set(struct trace_array *tr, int set)  {  	if (set) -		register_irqsoff_function(is_graph(), 1); +		register_irqsoff_function(tr, is_graph(), 1);  	else -		unregister_irqsoff_function(is_graph()); +		unregister_irqsoff_function(tr, is_graph());  }  static int irqsoff_flag_changed(struct trace_array *tr, u32 mask, int set) @@ -577,7 +571,7 @@ static int irqsoff_flag_changed(struct trace_array *tr, u32 mask, int set)  	struct tracer *tracer = tr->current_trace;  	if (mask & TRACE_ITER_FUNCTION) -		irqsoff_function_set(set); +		irqsoff_function_set(tr, set);  	return trace_keep_overwrite(tracer, mask, set);  } @@ -586,7 +580,7 @@ static int start_irqsoff_tracer(struct trace_array *tr, int graph)  {  	int ret; -	ret = register_irqsoff_function(graph, 0); +	ret = register_irqsoff_function(tr, graph, 0);  	if (!ret && tracing_is_enabled())  		tracer_enabled = 1; @@ -600,25 +594,37 @@ static void stop_irqsoff_tracer(struct trace_array *tr, int graph)  {  	tracer_enabled = 0; -	unregister_irqsoff_function(graph); +	unregister_irqsoff_function(tr, graph);  } -static void __irqsoff_tracer_init(struct trace_array *tr) +static bool irqsoff_busy; + +static int __irqsoff_tracer_init(struct trace_array *tr)  { +	if (irqsoff_busy) +		return -EBUSY; +  	save_flags = trace_flags;  	/* non overwrite screws up the latency tracers */  	set_tracer_flag(tr, TRACE_ITER_OVERWRITE, 1);  	set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, 1); -	tracing_max_latency = 0; +	
tr->max_latency = 0;  	irqsoff_trace = tr;  	/* make sure that the tracer is visible */  	smp_wmb();  	tracing_reset_online_cpus(&tr->trace_buffer); -	if (start_irqsoff_tracer(tr, is_graph())) +	ftrace_init_array_ops(tr, irqsoff_tracer_call); + +	/* Only toplevel instance supports graph tracing */ +	if (start_irqsoff_tracer(tr, (tr->flags & TRACE_ARRAY_FL_GLOBAL && +				      is_graph())))  		printk(KERN_ERR "failed to start irqsoff tracer\n"); + +	irqsoff_busy = true; +	return 0;  }  static void irqsoff_tracer_reset(struct trace_array *tr) @@ -630,6 +636,9 @@ static void irqsoff_tracer_reset(struct trace_array *tr)  	set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, lat_flag);  	set_tracer_flag(tr, TRACE_ITER_OVERWRITE, overwrite_flag); +	ftrace_reset_array_ops(tr); + +	irqsoff_busy = false;  }  static void irqsoff_tracer_start(struct trace_array *tr) @@ -647,8 +656,7 @@ static int irqsoff_tracer_init(struct trace_array *tr)  {  	trace_type = TRACER_IRQS_OFF; -	__irqsoff_tracer_init(tr); -	return 0; +	return __irqsoff_tracer_init(tr);  }  static struct tracer irqsoff_tracer __read_mostly =  { @@ -668,6 +676,7 @@ static struct tracer irqsoff_tracer __read_mostly =  #endif  	.open           = irqsoff_trace_open,  	.close          = irqsoff_trace_close, +	.allow_instances = true,  	.use_max_tr	= true,  };  # define register_irqsoff(trace) register_tracer(&trace) @@ -680,8 +689,7 @@ static int preemptoff_tracer_init(struct trace_array *tr)  {  	trace_type = TRACER_PREEMPT_OFF; -	__irqsoff_tracer_init(tr); -	return 0; +	return __irqsoff_tracer_init(tr);  }  static struct tracer preemptoff_tracer __read_mostly = @@ -702,6 +710,7 @@ static struct tracer preemptoff_tracer __read_mostly =  #endif  	.open		= irqsoff_trace_open,  	.close		= irqsoff_trace_close, +	.allow_instances = true,  	.use_max_tr	= true,  };  # define register_preemptoff(trace) register_tracer(&trace) @@ -716,8 +725,7 @@ static int preemptirqsoff_tracer_init(struct trace_array *tr)  {  	trace_type = TRACER_IRQS_OFF | TRACER_PREEMPT_OFF; -	__irqsoff_tracer_init(tr); -	return 0; +	return __irqsoff_tracer_init(tr);  }  static struct tracer preemptirqsoff_tracer __read_mostly = @@ -738,6 +746,7 @@ static struct tracer preemptirqsoff_tracer __read_mostly =  #endif  	.open		= irqsoff_trace_open,  	.close		= irqsoff_trace_close, +	.allow_instances = true,  	.use_max_tr	= true,  }; diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 903ae28962be..ef2fba1f46b5 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1377,6 +1377,9 @@ static __init int kprobe_trace_self_tests_init(void)  	struct trace_kprobe *tk;  	struct ftrace_event_file *file; +	if (tracing_is_disabled()) +		return -ENODEV; +  	target = kprobe_trace_selftest_target;  	pr_info("Testing kprobe tracing: "); diff --git a/kernel/trace/trace_nop.c b/kernel/trace/trace_nop.c index 69a5cc94c01a..fcf0a9e48916 100644 --- a/kernel/trace/trace_nop.c +++ b/kernel/trace/trace_nop.c @@ -91,7 +91,6 @@ struct tracer nop_trace __read_mostly =  	.name		= "nop",  	.init		= nop_trace_init,  	.reset		= nop_trace_reset, -	.wait_pipe	= poll_wait_pipe,  #ifdef CONFIG_FTRACE_SELFTEST  	.selftest	= trace_selftest_startup_nop,  #endif diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index a436de18aa99..f3dad80c20b2 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -126,6 +126,34 @@ trace_seq_printf(struct trace_seq *s, const char *fmt, ...)  
EXPORT_SYMBOL_GPL(trace_seq_printf);  /** + * trace_seq_bitmask - put a list of longs as a bitmask print output + * @s:		trace sequence descriptor + * @maskp:	points to an array of unsigned longs that represent a bitmask + * @nmaskbits:	The number of bits that are valid in @maskp + * + * It returns 0 if the trace oversizes the buffer's free + * space, 1 otherwise. + * + * Writes a ASCII representation of a bitmask string into @s. + */ +int +trace_seq_bitmask(struct trace_seq *s, const unsigned long *maskp, +		  int nmaskbits) +{ +	int len = (PAGE_SIZE - 1) - s->len; +	int ret; + +	if (s->full || !len) +		return 0; + +	ret = bitmap_scnprintf(s->buffer, len, maskp, nmaskbits); +	s->len += ret; + +	return 1; +} +EXPORT_SYMBOL_GPL(trace_seq_bitmask); + +/**   * trace_seq_vprintf - sequence printing of trace information   * @s: trace sequence descriptor   * @fmt: printf format string @@ -399,6 +427,19 @@ EXPORT_SYMBOL(ftrace_print_symbols_seq_u64);  #endif  const char * +ftrace_print_bitmask_seq(struct trace_seq *p, void *bitmask_ptr, +			 unsigned int bitmask_size) +{ +	const char *ret = p->buffer + p->len; + +	trace_seq_bitmask(p, bitmask_ptr, bitmask_size * 8); +	trace_seq_putc(p, 0); + +	return ret; +} +EXPORT_SYMBOL_GPL(ftrace_print_bitmask_seq); + +const char *  ftrace_print_hex_seq(struct trace_seq *p, const unsigned char *buf, int buf_len)  {  	int i; diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index e14da5e97a69..19bd8928ce94 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -130,15 +130,9 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip,  	atomic_dec(&data->disabled);  	preempt_enable_notrace();  } - -static struct ftrace_ops trace_ops __read_mostly = -{ -	.func = wakeup_tracer_call, -	.flags = FTRACE_OPS_FL_GLOBAL | FTRACE_OPS_FL_RECURSION_SAFE, -};  #endif /* CONFIG_FUNCTION_TRACER */ -static int register_wakeup_function(int graph, int set) +static int register_wakeup_function(struct trace_array *tr, int graph, int set)  {  	int ret; @@ -150,7 +144,7 @@ static int register_wakeup_function(int graph, int set)  		ret = register_ftrace_graph(&wakeup_graph_return,  					    &wakeup_graph_entry);  	else -		ret = register_ftrace_function(&trace_ops); +		ret = register_ftrace_function(tr->ops);  	if (!ret)  		function_enabled = true; @@ -158,7 +152,7 @@ static int register_wakeup_function(int graph, int set)  	return ret;  } -static void unregister_wakeup_function(int graph) +static void unregister_wakeup_function(struct trace_array *tr, int graph)  {  	if (!function_enabled)  		return; @@ -166,17 +160,17 @@ static void unregister_wakeup_function(int graph)  	if (graph)  		unregister_ftrace_graph();  	else -		unregister_ftrace_function(&trace_ops); +		unregister_ftrace_function(tr->ops);  	function_enabled = false;  } -static void wakeup_function_set(int set) +static void wakeup_function_set(struct trace_array *tr, int set)  {  	if (set) -		register_wakeup_function(is_graph(), 1); +		register_wakeup_function(tr, is_graph(), 1);  	else -		unregister_wakeup_function(is_graph()); +		unregister_wakeup_function(tr, is_graph());  }  static int wakeup_flag_changed(struct trace_array *tr, u32 mask, int set) @@ -184,16 +178,16 @@ static int wakeup_flag_changed(struct trace_array *tr, u32 mask, int set)  	struct tracer *tracer = tr->current_trace;  	if (mask & TRACE_ITER_FUNCTION) -		wakeup_function_set(set); +		wakeup_function_set(tr, set);  	return trace_keep_overwrite(tracer, mask, set);  } -static int 
start_func_tracer(int graph) +static int start_func_tracer(struct trace_array *tr, int graph)  {  	int ret; -	ret = register_wakeup_function(graph, 0); +	ret = register_wakeup_function(tr, graph, 0);  	if (!ret && tracing_is_enabled())  		tracer_enabled = 1; @@ -203,11 +197,11 @@ static int start_func_tracer(int graph)  	return ret;  } -static void stop_func_tracer(int graph) +static void stop_func_tracer(struct trace_array *tr, int graph)  {  	tracer_enabled = 0; -	unregister_wakeup_function(graph); +	unregister_wakeup_function(tr, graph);  }  #ifdef CONFIG_FUNCTION_GRAPH_TRACER @@ -221,12 +215,12 @@ wakeup_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set)  	if (!(is_graph() ^ set))  		return 0; -	stop_func_tracer(!set); +	stop_func_tracer(tr, !set);  	wakeup_reset(wakeup_trace); -	tracing_max_latency = 0; +	tr->max_latency = 0; -	return start_func_tracer(set); +	return start_func_tracer(tr, set);  }  static int wakeup_graph_entry(struct ftrace_graph_ent *trace) @@ -350,13 +344,13 @@ static void wakeup_print_header(struct seq_file *s)  /*   * Should this new latency be reported/recorded?   */ -static int report_latency(cycle_t delta) +static int report_latency(struct trace_array *tr, cycle_t delta)  {  	if (tracing_thresh) {  		if (delta < tracing_thresh)  			return 0;  	} else { -		if (delta <= tracing_max_latency) +		if (delta <= tr->max_latency)  			return 0;  	}  	return 1; @@ -424,11 +418,11 @@ probe_wakeup_sched_switch(void *ignore,  	T1 = ftrace_now(cpu);  	delta = T1-T0; -	if (!report_latency(delta)) +	if (!report_latency(wakeup_trace, delta))  		goto out_unlock;  	if (likely(!is_tracing_stopped())) { -		tracing_max_latency = delta; +		wakeup_trace->max_latency = delta;  		update_max_tr(wakeup_trace, wakeup_task, wakeup_cpu);  	} @@ -587,7 +581,7 @@ static void start_wakeup_tracer(struct trace_array *tr)  	 */  	smp_wmb(); -	if (start_func_tracer(is_graph())) +	if (start_func_tracer(tr, is_graph()))  		printk(KERN_ERR "failed to start wakeup tracer\n");  	return; @@ -600,13 +594,15 @@ fail_deprobe:  static void stop_wakeup_tracer(struct trace_array *tr)  {  	tracer_enabled = 0; -	stop_func_tracer(is_graph()); +	stop_func_tracer(tr, is_graph());  	unregister_trace_sched_switch(probe_wakeup_sched_switch, NULL);  	unregister_trace_sched_wakeup_new(probe_wakeup, NULL);  	unregister_trace_sched_wakeup(probe_wakeup, NULL);  	unregister_trace_sched_migrate_task(probe_wakeup_migrate_task, NULL);  } +static bool wakeup_busy; +  static int __wakeup_tracer_init(struct trace_array *tr)  {  	save_flags = trace_flags; @@ -615,14 +611,20 @@ static int __wakeup_tracer_init(struct trace_array *tr)  	set_tracer_flag(tr, TRACE_ITER_OVERWRITE, 1);  	set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, 1); -	tracing_max_latency = 0; +	tr->max_latency = 0;  	wakeup_trace = tr; +	ftrace_init_array_ops(tr, wakeup_tracer_call);  	start_wakeup_tracer(tr); + +	wakeup_busy = true;  	return 0;  }  static int wakeup_tracer_init(struct trace_array *tr)  { +	if (wakeup_busy) +		return -EBUSY; +  	wakeup_dl = 0;  	wakeup_rt = 0;  	return __wakeup_tracer_init(tr); @@ -630,6 +632,9 @@ static int wakeup_tracer_init(struct trace_array *tr)  static int wakeup_rt_tracer_init(struct trace_array *tr)  { +	if (wakeup_busy) +		return -EBUSY; +  	wakeup_dl = 0;  	wakeup_rt = 1;  	return __wakeup_tracer_init(tr); @@ -637,6 +642,9 @@ static int wakeup_rt_tracer_init(struct trace_array *tr)  static int wakeup_dl_tracer_init(struct trace_array *tr)  { +	if (wakeup_busy) +		return -EBUSY; +  	wakeup_dl = 1;  	wakeup_rt = 
0;  	return __wakeup_tracer_init(tr); @@ -653,6 +661,8 @@ static void wakeup_tracer_reset(struct trace_array *tr)  	set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, lat_flag);  	set_tracer_flag(tr, TRACE_ITER_OVERWRITE, overwrite_flag); +	ftrace_reset_array_ops(tr); +	wakeup_busy = false;  }  static void wakeup_tracer_start(struct trace_array *tr) @@ -684,6 +694,7 @@ static struct tracer wakeup_tracer __read_mostly =  #endif  	.open		= wakeup_trace_open,  	.close		= wakeup_trace_close, +	.allow_instances = true,  	.use_max_tr	= true,  }; @@ -694,7 +705,6 @@ static struct tracer wakeup_rt_tracer __read_mostly =  	.reset		= wakeup_tracer_reset,  	.start		= wakeup_tracer_start,  	.stop		= wakeup_tracer_stop, -	.wait_pipe	= poll_wait_pipe,  	.print_max	= true,  	.print_header	= wakeup_print_header,  	.print_line	= wakeup_print_line, @@ -706,6 +716,7 @@ static struct tracer wakeup_rt_tracer __read_mostly =  #endif  	.open		= wakeup_trace_open,  	.close		= wakeup_trace_close, +	.allow_instances = true,  	.use_max_tr	= true,  }; @@ -716,7 +727,6 @@ static struct tracer wakeup_dl_tracer __read_mostly =  	.reset		= wakeup_tracer_reset,  	.start		= wakeup_tracer_start,  	.stop		= wakeup_tracer_stop, -	.wait_pipe	= poll_wait_pipe,  	.print_max	= true,  	.print_header	= wakeup_print_header,  	.print_line	= wakeup_print_line, diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index e98fca60974f..5ef60499dc8e 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -65,7 +65,7 @@ static int trace_test_buffer(struct trace_buffer *buf, unsigned long *count)  	/* Don't allow flipping of max traces now */  	local_irq_save(flags); -	arch_spin_lock(&ftrace_max_lock); +	arch_spin_lock(&buf->tr->max_lock);  	cnt = ring_buffer_entries(buf->buffer); @@ -83,7 +83,7 @@ static int trace_test_buffer(struct trace_buffer *buf, unsigned long *count)  			break;  	}  	tracing_on(); -	arch_spin_unlock(&ftrace_max_lock); +	arch_spin_unlock(&buf->tr->max_lock);  	local_irq_restore(flags);  	if (count) @@ -161,11 +161,6 @@ static struct ftrace_ops test_probe3 = {  	.flags			= FTRACE_OPS_FL_RECURSION_SAFE,  }; -static struct ftrace_ops test_global = { -	.func		= trace_selftest_test_global_func, -	.flags		= FTRACE_OPS_FL_GLOBAL | FTRACE_OPS_FL_RECURSION_SAFE, -}; -  static void print_counts(void)  {  	printk("(%d %d %d %d %d) ", @@ -185,7 +180,7 @@ static void reset_counts(void)  	trace_selftest_test_dyn_cnt = 0;  } -static int trace_selftest_ops(int cnt) +static int trace_selftest_ops(struct trace_array *tr, int cnt)  {  	int save_ftrace_enabled = ftrace_enabled;  	struct ftrace_ops *dyn_ops; @@ -220,7 +215,11 @@ static int trace_selftest_ops(int cnt)  	register_ftrace_function(&test_probe1);  	register_ftrace_function(&test_probe2);  	register_ftrace_function(&test_probe3); -	register_ftrace_function(&test_global); +	/* First time we are running with main function */ +	if (cnt > 1) { +		ftrace_init_array_ops(tr, trace_selftest_test_global_func); +		register_ftrace_function(tr->ops); +	}  	DYN_FTRACE_TEST_NAME(); @@ -232,8 +231,10 @@ static int trace_selftest_ops(int cnt)  		goto out;  	if (trace_selftest_test_probe3_cnt != 1)  		goto out; -	if (trace_selftest_test_global_cnt == 0) -		goto out; +	if (cnt > 1) { +		if (trace_selftest_test_global_cnt == 0) +			goto out; +	}  	DYN_FTRACE_TEST_NAME2(); @@ -269,8 +270,10 @@ static int trace_selftest_ops(int cnt)  		goto out_free;  	if (trace_selftest_test_probe3_cnt != 3)  		goto out_free; -	if (trace_selftest_test_global_cnt == 0) -		
goto out; +	if (cnt > 1) { +		if (trace_selftest_test_global_cnt == 0) +			goto out; +	}  	if (trace_selftest_test_dyn_cnt == 0)  		goto out_free; @@ -295,7 +298,9 @@ static int trace_selftest_ops(int cnt)  	unregister_ftrace_function(&test_probe1);  	unregister_ftrace_function(&test_probe2);  	unregister_ftrace_function(&test_probe3); -	unregister_ftrace_function(&test_global); +	if (cnt > 1) +		unregister_ftrace_function(tr->ops); +	ftrace_reset_array_ops(tr);  	/* Make sure everything is off */  	reset_counts(); @@ -315,9 +320,9 @@ static int trace_selftest_ops(int cnt)  }  /* Test dynamic code modification and ftrace filters */ -int trace_selftest_startup_dynamic_tracing(struct tracer *trace, -					   struct trace_array *tr, -					   int (*func)(void)) +static int trace_selftest_startup_dynamic_tracing(struct tracer *trace, +						  struct trace_array *tr, +						  int (*func)(void))  {  	int save_ftrace_enabled = ftrace_enabled;  	unsigned long count; @@ -388,7 +393,7 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace,  	}  	/* Test the ops with global tracing running */ -	ret = trace_selftest_ops(1); +	ret = trace_selftest_ops(tr, 1);  	trace->reset(tr);   out: @@ -399,7 +404,7 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace,  	/* Test the ops with global tracing off */  	if (!ret) -		ret = trace_selftest_ops(2); +		ret = trace_selftest_ops(tr, 2);  	return ret;  } @@ -802,7 +807,7 @@ out:  int  trace_selftest_startup_irqsoff(struct tracer *trace, struct trace_array *tr)  { -	unsigned long save_max = tracing_max_latency; +	unsigned long save_max = tr->max_latency;  	unsigned long count;  	int ret; @@ -814,7 +819,7 @@ trace_selftest_startup_irqsoff(struct tracer *trace, struct trace_array *tr)  	}  	/* reset the max latency */ -	tracing_max_latency = 0; +	tr->max_latency = 0;  	/* disable interrupts for a bit */  	local_irq_disable();  	udelay(100); @@ -841,7 +846,7 @@ trace_selftest_startup_irqsoff(struct tracer *trace, struct trace_array *tr)  		ret = -1;  	} -	tracing_max_latency = save_max; +	tr->max_latency = save_max;  	return ret;  } @@ -851,7 +856,7 @@ trace_selftest_startup_irqsoff(struct tracer *trace, struct trace_array *tr)  int  trace_selftest_startup_preemptoff(struct tracer *trace, struct trace_array *tr)  { -	unsigned long save_max = tracing_max_latency; +	unsigned long save_max = tr->max_latency;  	unsigned long count;  	int ret; @@ -876,7 +881,7 @@ trace_selftest_startup_preemptoff(struct tracer *trace, struct trace_array *tr)  	}  	/* reset the max latency */ -	tracing_max_latency = 0; +	tr->max_latency = 0;  	/* disable preemption for a bit */  	preempt_disable();  	udelay(100); @@ -903,7 +908,7 @@ trace_selftest_startup_preemptoff(struct tracer *trace, struct trace_array *tr)  		ret = -1;  	} -	tracing_max_latency = save_max; +	tr->max_latency = save_max;  	return ret;  } @@ -913,7 +918,7 @@ trace_selftest_startup_preemptoff(struct tracer *trace, struct trace_array *tr)  int  trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array *tr)  { -	unsigned long save_max = tracing_max_latency; +	unsigned long save_max = tr->max_latency;  	unsigned long count;  	int ret; @@ -938,7 +943,7 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array *  	}  	/* reset the max latency */ -	tracing_max_latency = 0; +	tr->max_latency = 0;  	/* disable preemption and interrupts for a bit */  	preempt_disable(); @@ -973,7 +978,7 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct 
trace_array *  	}  	/* do the test by disabling interrupts first this time */ -	tracing_max_latency = 0; +	tr->max_latency = 0;  	tracing_start();  	trace->start(tr); @@ -1004,7 +1009,7 @@ out:  	tracing_start();  out_no_start:  	trace->reset(tr); -	tracing_max_latency = save_max; +	tr->max_latency = save_max;  	return ret;  } @@ -1057,7 +1062,7 @@ static int trace_wakeup_test_thread(void *data)  int  trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr)  { -	unsigned long save_max = tracing_max_latency; +	unsigned long save_max = tr->max_latency;  	struct task_struct *p;  	struct completion is_ready;  	unsigned long count; @@ -1083,7 +1088,7 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr)  	}  	/* reset the max latency */ -	tracing_max_latency = 0; +	tr->max_latency = 0;  	while (p->on_rq) {  		/* @@ -1113,7 +1118,7 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr)  	trace->reset(tr);  	tracing_start(); -	tracing_max_latency = save_max; +	tr->max_latency = save_max;  	/* kill the thread */  	kthread_stop(p); diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 21b320e5d163..8a4e5cb66a4c 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -51,11 +51,33 @@ static DEFINE_MUTEX(stack_sysctl_mutex);  int stack_tracer_enabled;  static int last_stack_tracer_enabled; +static inline void print_max_stack(void) +{ +	long i; +	int size; + +	pr_emerg("        Depth    Size   Location    (%d entries)\n" +			   "        -----    ----   --------\n", +			   max_stack_trace.nr_entries - 1); + +	for (i = 0; i < max_stack_trace.nr_entries; i++) { +		if (stack_dump_trace[i] == ULONG_MAX) +			break; +		if (i+1 == max_stack_trace.nr_entries || +				stack_dump_trace[i+1] == ULONG_MAX) +			size = stack_dump_index[i]; +		else +			size = stack_dump_index[i] - stack_dump_index[i+1]; + +		pr_emerg("%3ld) %8d   %5d   %pS\n", i, stack_dump_index[i], +				size, (void *)stack_dump_trace[i]); +	} +} +  static inline void  check_stack(unsigned long ip, unsigned long *stack)  { -	unsigned long this_size, flags; -	unsigned long *p, *top, *start; +	unsigned long this_size, flags; unsigned long *p, *top, *start;  	static int tracer_frame;  	int frame_size = ACCESS_ONCE(tracer_frame);  	int i; @@ -85,8 +107,12 @@ check_stack(unsigned long ip, unsigned long *stack)  	max_stack_size = this_size; -	max_stack_trace.nr_entries	= 0; -	max_stack_trace.skip		= 3; +	max_stack_trace.nr_entries = 0; + +	if (using_ftrace_ops_list_func()) +		max_stack_trace.skip = 4; +	else +		max_stack_trace.skip = 3;  	save_stack_trace(&max_stack_trace); @@ -145,8 +171,12 @@ check_stack(unsigned long ip, unsigned long *stack)  			i++;  	} -	BUG_ON(current != &init_task && -		*(end_of_stack(current)) != STACK_END_MAGIC); +	if ((current != &init_task && +		*(end_of_stack(current)) != STACK_END_MAGIC)) { +		print_max_stack(); +		BUG(); +	} +   out:  	arch_spin_unlock(&max_stack_lock);  	local_irq_restore(flags); diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 930e51462dc8..c082a7441345 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -732,9 +732,15 @@ static int uprobe_buffer_enable(void)  static void uprobe_buffer_disable(void)  { +	int cpu; +  	BUG_ON(!mutex_is_locked(&event_mutex));  	if (--uprobe_buffer_refcnt == 0) { +		for_each_possible_cpu(cpu) +			free_page((unsigned long)per_cpu_ptr(uprobe_cpu_buffer, +							     cpu)->buf); +  		free_percpu(uprobe_cpu_buffer);  		
uprobe_cpu_buffer = NULL;  	} diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index ac5b23cf7212..33cbd8c203f8 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -188,7 +188,6 @@ static int tracepoint_add_func(struct tracepoint *tp,  		WARN_ON_ONCE(1);  		return PTR_ERR(old);  	} -	release_probes(old);  	/*  	 * rcu_assign_pointer has a smp_wmb() which makes sure that the new @@ -200,6 +199,7 @@ static int tracepoint_add_func(struct tracepoint *tp,  	rcu_assign_pointer(tp->funcs, tp_funcs);  	if (!static_key_enabled(&tp->key))  		static_key_slow_inc(&tp->key); +	release_probes(old);  	return 0;  } @@ -221,7 +221,6 @@ static int tracepoint_remove_func(struct tracepoint *tp,  		WARN_ON_ONCE(1);  		return PTR_ERR(old);  	} -	release_probes(old);  	if (!tp_funcs) {  		/* Removed last function */ @@ -232,6 +231,7 @@ static int tracepoint_remove_func(struct tracepoint *tp,  			static_key_slow_dec(&tp->key);  	}  	rcu_assign_pointer(tp->funcs, tp_funcs); +	release_probes(old);  	return 0;  } @@ -239,6 +239,7 @@ static int tracepoint_remove_func(struct tracepoint *tp,   * tracepoint_probe_register -  Connect a probe to a tracepoint   * @tp: tracepoint   * @probe: probe handler + * @data: tracepoint data   *   * Returns 0 if ok, error value on error.   * Note: if @tp is within a module, the caller is responsible for @@ -264,6 +265,7 @@ EXPORT_SYMBOL_GPL(tracepoint_probe_register);   * tracepoint_probe_unregister -  Disconnect a probe from a tracepoint   * @tp: tracepoint   * @probe: probe function pointer + * @data: tracepoint data   *   * Returns 0 if ok, error value on error.   */ diff --git a/kernel/user.c b/kernel/user.c index 294fc6a94168..4efa39350e44 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -87,7 +87,6 @@ static DEFINE_SPINLOCK(uidhash_lock);  struct user_struct root_user = {  	.__count	= ATOMIC_INIT(1),  	.processes	= ATOMIC_INIT(1), -	.files		= ATOMIC_INIT(0),  	.sigpending	= ATOMIC_INIT(0),  	.locked_shm     = 0,  	.uid		= GLOBAL_ROOT_UID, diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index bf71b4b2d632..fcc02560fd6b 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -286,7 +286,7 @@ EXPORT_SYMBOL(from_kuid_munged);  /**   *	make_kgid - Map a user-namespace gid pair into a kgid.   *	@ns:  User namespace that the gid is in - *	@uid: group identifier + *	@gid: group identifier   *   *	Maps a user-namespace gid pair into a kernel internal kgid,   *	and returns that kgid. 
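[Illustrative sketch, not part of the patch. The kernel/tracepoint.c hunks above move release_probes(old) so that it runs only after rcu_assign_pointer() has published the new funcs array (and, on removal, after the static key update); with the old ordering the previous array could be released while tp->funcs still pointed at it. The fragment below shows that publish-then-release ordering with made-up sketch_* names, using synchronize_rcu()+kfree() as a simplified stand-in for release_probes().]

#include <linux/mutex.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

struct sketch_probes {
	int	nr;
	void	*probes[];		/* readers may still be walking the old array */
};

static DEFINE_MUTEX(sketch_mutex);	/* serializes updaters, like tracepoints_mutex */
static struct sketch_probes __rcu *sketch_active;

/* Called with sketch_mutex held; @new was built as a copy of the old array. */
static void sketch_publish(struct sketch_probes *new)
{
	struct sketch_probes *old;

	old = rcu_dereference_protected(sketch_active,
					lockdep_is_held(&sketch_mutex));

	/* 1) make the new array visible to RCU readers first ... */
	rcu_assign_pointer(sketch_active, new);

	/* 2) ... only then wait out pre-existing readers and free the old one. */
	synchronize_rcu();
	kfree(old);
}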
@@ -482,7 +482,8 @@ static int projid_m_show(struct seq_file *seq, void *v)  	return 0;  } -static void *m_start(struct seq_file *seq, loff_t *ppos, struct uid_gid_map *map) +static void *m_start(struct seq_file *seq, loff_t *ppos, +		     struct uid_gid_map *map)  {  	struct uid_gid_extent *extent = NULL;  	loff_t pos = *ppos; @@ -546,7 +547,8 @@ struct seq_operations proc_projid_seq_operations = {  	.show = projid_m_show,  }; -static bool mappings_overlap(struct uid_gid_map *new_map, struct uid_gid_extent *extent) +static bool mappings_overlap(struct uid_gid_map *new_map, +			     struct uid_gid_extent *extent)  {  	u32 upper_first, lower_first, upper_last, lower_last;  	unsigned idx; @@ -653,7 +655,7 @@ static ssize_t map_write(struct file *file, const char __user *buf,  	ret = -EINVAL;  	pos = kbuf;  	new_map.nr_extents = 0; -	for (;pos; pos = next_line) { +	for (; pos; pos = next_line) {  		extent = &new_map.extent[new_map.nr_extents];  		/* Find the end of line and ensure I don't look past it */ @@ -687,13 +689,16 @@ static ssize_t map_write(struct file *file, const char __user *buf,  		/* Verify we have been given valid starting values */  		if ((extent->first == (u32) -1) || -		    (extent->lower_first == (u32) -1 )) +		    (extent->lower_first == (u32) -1))  			goto out; -		/* Verify count is not zero and does not cause the extent to wrap */ +		/* Verify count is not zero and does not cause the +		 * extent to wrap +		 */  		if ((extent->first + extent->count) <= extent->first)  			goto out; -		if ((extent->lower_first + extent->count) <= extent->lower_first) +		if ((extent->lower_first + extent->count) <= +		     extent->lower_first)  			goto out;  		/* Do the ranges in extent overlap any previous extents? */ @@ -751,7 +756,8 @@ out:  	return ret;  } -ssize_t proc_uid_map_write(struct file *file, const char __user *buf, size_t size, loff_t *ppos) +ssize_t proc_uid_map_write(struct file *file, const char __user *buf, +			   size_t size, loff_t *ppos)  {  	struct seq_file *seq = file->private_data;  	struct user_namespace *ns = seq->private; @@ -767,7 +773,8 @@ ssize_t proc_uid_map_write(struct file *file, const char __user *buf, size_t siz  			 &ns->uid_map, &ns->parent->uid_map);  } -ssize_t proc_gid_map_write(struct file *file, const char __user *buf, size_t size, loff_t *ppos) +ssize_t proc_gid_map_write(struct file *file, const char __user *buf, +			   size_t size, loff_t *ppos)  {  	struct seq_file *seq = file->private_data;  	struct user_namespace *ns = seq->private; @@ -783,7 +790,8 @@ ssize_t proc_gid_map_write(struct file *file, const char __user *buf, size_t siz  			 &ns->gid_map, &ns->parent->gid_map);  } -ssize_t proc_projid_map_write(struct file *file, const char __user *buf, size_t size, loff_t *ppos) +ssize_t proc_projid_map_write(struct file *file, const char __user *buf, +			      size_t size, loff_t *ppos)  {  	struct seq_file *seq = file->private_data;  	struct user_namespace *ns = seq->private; @@ -800,7 +808,7 @@ ssize_t proc_projid_map_write(struct file *file, const char __user *buf, size_t  			 &ns->projid_map, &ns->parent->projid_map);  } -static bool new_idmap_permitted(const struct file *file,  +static bool new_idmap_permitted(const struct file *file,  				struct user_namespace *ns, int cap_setid,  				struct uid_gid_map *new_map)  { @@ -811,8 +819,7 @@ static bool new_idmap_permitted(const struct file *file,  			kuid_t uid = make_kuid(ns->parent, id);  			if (uid_eq(uid, file->f_cred->fsuid))  				return true; -		} -		else if (cap_setid == CAP_SETGID) { 
+		} else if (cap_setid == CAP_SETGID) {  			kgid_t gid = make_kgid(ns->parent, id);  			if (gid_eq(gid, file->f_cred->fsgid))  				return true; diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c index 4f69f9a5e221..c8eac43267e9 100644 --- a/kernel/utsname_sysctl.c +++ b/kernel/utsname_sysctl.c @@ -17,7 +17,7 @@  #ifdef CONFIG_PROC_SYSCTL -static void *get_uts(ctl_table *table, int write) +static void *get_uts(struct ctl_table *table, int write)  {  	char *which = table->data;  	struct uts_namespace *uts_ns; @@ -32,7 +32,7 @@ static void *get_uts(ctl_table *table, int write)  	return which;  } -static void put_uts(ctl_table *table, int write, void *which) +static void put_uts(struct ctl_table *table, int write, void *which)  {  	if (!write)  		up_read(&uts_sem); @@ -44,14 +44,14 @@ static void put_uts(ctl_table *table, int write, void *which)   *	Special case of dostring for the UTS structure. This has locks   *	to observe. Should this be in kernel/sys.c ????   */ -static int proc_do_uts_string(ctl_table *table, int write, +static int proc_do_uts_string(struct ctl_table *table, int write,  		  void __user *buffer, size_t *lenp, loff_t *ppos)  {  	struct ctl_table uts_table;  	int r;  	memcpy(&uts_table, table, sizeof(uts_table));  	uts_table.data = get_uts(table, write); -	r = proc_dostring(&uts_table,write,buffer,lenp, ppos); +	r = proc_dostring(&uts_table, write, buffer, lenp, ppos);  	put_uts(table, write, uts_table.data);  	if (write) @@ -135,4 +135,4 @@ static int __init utsname_sysctl_init(void)  	return 0;  } -__initcall(utsname_sysctl_init); +device_initcall(utsname_sysctl_init); diff --git a/kernel/watchdog.c b/kernel/watchdog.c index e90089fd78e0..516203e665fc 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -138,7 +138,11 @@ static void __touch_watchdog(void)  void touch_softlockup_watchdog(void)  { -	__this_cpu_write(watchdog_touch_ts, 0); +	/* +	 * Preemption can be enabled.  It doesn't matter which CPU's timestamp +	 * gets zeroed here, so use the raw_ operation. +	 */ +	raw_cpu_write(watchdog_touch_ts, 0);  }  EXPORT_SYMBOL(touch_softlockup_watchdog); diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 0ee63af30bd1..6203d2900877 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -65,15 +65,12 @@ enum {  	 * be executing on any CPU.  The pool behaves as an unbound one.  	 *  	 * Note that DISASSOCIATED should be flipped only while holding -	 * manager_mutex to avoid changing binding state while -	 * create_worker() is in progress. +	 * attach_mutex to avoid changing binding state while +	 * worker_attach_to_pool() is in progress.  	 */ -	POOL_MANAGE_WORKERS	= 1 << 0,	/* need to manage workers */  	POOL_DISASSOCIATED	= 1 << 2,	/* cpu can't serve workers */ -	POOL_FREEZING		= 1 << 3,	/* freeze in progress */  	/* worker flags */ -	WORKER_STARTED		= 1 << 0,	/* started */  	WORKER_DIE		= 1 << 1,	/* die die die */  	WORKER_IDLE		= 1 << 2,	/* is idle */  	WORKER_PREP		= 1 << 3,	/* preparing to run works */ @@ -100,10 +97,10 @@ enum {  	/*  	 * Rescue workers are used only on emergencies and shared by -	 * all cpus.  Give -20. +	 * all cpus.  Give MIN_NICE.  	 */ -	RESCUER_NICE_LEVEL	= -20, -	HIGHPRI_NICE_LEVEL	= -20, +	RESCUER_NICE_LEVEL	= MIN_NICE, +	HIGHPRI_NICE_LEVEL	= MIN_NICE,  	WQ_NAME_LEN		= 24,  }; @@ -124,8 +121,7 @@ enum {   *    cpu or grabbing pool->lock is enough for read access.  If   *    POOL_DISASSOCIATED is set, it's identical to L.   * - * MG: pool->manager_mutex and pool->lock protected.  Writes require both - *     locks.  
Reads can happen under either lock. + * A: pool->attach_mutex protected.   *   * PL: wq_pool_mutex protected.   * @@ -163,8 +159,11 @@ struct worker_pool {  	/* see manage_workers() for details on the two manager mutexes */  	struct mutex		manager_arb;	/* manager arbitration */ -	struct mutex		manager_mutex;	/* manager exclusion */ -	struct idr		worker_idr;	/* MG: worker IDs and iteration */ +	struct mutex		attach_mutex;	/* attach/detach exclusion */ +	struct list_head	workers;	/* A: attached workers */ +	struct completion	*detach_completion; /* all workers detached */ + +	struct ida		worker_ida;	/* worker IDs for task name */  	struct workqueue_attrs	*attrs;		/* I: worker attributes */  	struct hlist_node	hash_node;	/* PL: unbound_pool_hash node */ @@ -340,16 +339,6 @@ static void copy_workqueue_attrs(struct workqueue_attrs *to,  			   lockdep_is_held(&wq->mutex),			\  			   "sched RCU or wq->mutex should be held") -#ifdef CONFIG_LOCKDEP -#define assert_manager_or_pool_lock(pool)				\ -	WARN_ONCE(debug_locks &&					\ -		  !lockdep_is_held(&(pool)->manager_mutex) &&		\ -		  !lockdep_is_held(&(pool)->lock),			\ -		  "pool->manager_mutex or ->lock should be held") -#else -#define assert_manager_or_pool_lock(pool)	do { } while (0) -#endif -  #define for_each_cpu_worker_pool(pool, cpu)				\  	for ((pool) = &per_cpu(cpu_worker_pools, cpu)[0];		\  	     (pool) < &per_cpu(cpu_worker_pools, cpu)[NR_STD_WORKER_POOLS]; \ @@ -375,17 +364,16 @@ static void copy_workqueue_attrs(struct workqueue_attrs *to,  /**   * for_each_pool_worker - iterate through all workers of a worker_pool   * @worker: iteration cursor - * @wi: integer used for iteration   * @pool: worker_pool to iterate workers of   * - * This must be called with either @pool->manager_mutex or ->lock held. + * This must be called with @pool->attach_mutex.   *   * The if/else clause exists only for the lockdep assertion and can be   * ignored.   */ -#define for_each_pool_worker(worker, wi, pool)				\ -	idr_for_each_entry(&(pool)->worker_idr, (worker), (wi))		\ -		if (({ assert_manager_or_pool_lock((pool)); false; })) { } \ +#define for_each_pool_worker(worker, pool)				\ +	list_for_each_entry((worker), &(pool)->workers, node)		\ +		if (({ lockdep_assert_held(&pool->attach_mutex); false; })) { } \  		else  /** @@ -763,13 +751,6 @@ static bool need_to_create_worker(struct worker_pool *pool)  	return need_more_worker(pool) && !may_start_working(pool);  } -/* Do I need to be the manager? */ -static bool need_to_manage_workers(struct worker_pool *pool) -{ -	return need_to_create_worker(pool) || -		(pool->flags & POOL_MANAGE_WORKERS); -} -  /* Do we have too many workers and should some go away? */  static bool too_many_workers(struct worker_pool *pool)  { @@ -791,8 +772,8 @@ static bool too_many_workers(struct worker_pool *pool)   * Wake up functions.   */ -/* Return the first worker.  Safe with preemption disabled */ -static struct worker *first_worker(struct worker_pool *pool) +/* Return the first idle worker.  
Safe with preemption disabled */ +static struct worker *first_idle_worker(struct worker_pool *pool)  {  	if (unlikely(list_empty(&pool->idle_list)))  		return NULL; @@ -811,7 +792,7 @@ static struct worker *first_worker(struct worker_pool *pool)   */  static void wake_up_worker(struct worker_pool *pool)  { -	struct worker *worker = first_worker(pool); +	struct worker *worker = first_idle_worker(pool);  	if (likely(worker))  		wake_up_process(worker->task); @@ -885,7 +866,7 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task, int cpu)  	 */  	if (atomic_dec_and_test(&pool->nr_running) &&  	    !list_empty(&pool->worklist)) -		to_wakeup = first_worker(pool); +		to_wakeup = first_idle_worker(pool);  	return to_wakeup ? to_wakeup->task : NULL;  } @@ -1621,70 +1602,6 @@ static void worker_leave_idle(struct worker *worker)  	list_del_init(&worker->entry);  } -/** - * worker_maybe_bind_and_lock - try to bind %current to worker_pool and lock it - * @pool: target worker_pool - * - * Bind %current to the cpu of @pool if it is associated and lock @pool. - * - * Works which are scheduled while the cpu is online must at least be - * scheduled to a worker which is bound to the cpu so that if they are - * flushed from cpu callbacks while cpu is going down, they are - * guaranteed to execute on the cpu. - * - * This function is to be used by unbound workers and rescuers to bind - * themselves to the target cpu and may race with cpu going down or - * coming online.  kthread_bind() can't be used because it may put the - * worker to already dead cpu and set_cpus_allowed_ptr() can't be used - * verbatim as it's best effort and blocking and pool may be - * [dis]associated in the meantime. - * - * This function tries set_cpus_allowed() and locks pool and verifies the - * binding against %POOL_DISASSOCIATED which is set during - * %CPU_DOWN_PREPARE and cleared during %CPU_ONLINE, so if the worker - * enters idle state or fetches works without dropping lock, it can - * guarantee the scheduling requirement described in the first paragraph. - * - * CONTEXT: - * Might sleep.  Called without any lock but returns with pool->lock - * held. - * - * Return: - * %true if the associated pool is online (@worker is successfully - * bound), %false if offline. - */ -static bool worker_maybe_bind_and_lock(struct worker_pool *pool) -__acquires(&pool->lock) -{ -	while (true) { -		/* -		 * The following call may fail, succeed or succeed -		 * without actually migrating the task to the cpu if -		 * it races with cpu hotunplug operation.  Verify -		 * against POOL_DISASSOCIATED. -		 */ -		if (!(pool->flags & POOL_DISASSOCIATED)) -			set_cpus_allowed_ptr(current, pool->attrs->cpumask); - -		spin_lock_irq(&pool->lock); -		if (pool->flags & POOL_DISASSOCIATED) -			return false; -		if (task_cpu(current) == pool->cpu && -		    cpumask_equal(¤t->cpus_allowed, pool->attrs->cpumask)) -			return true; -		spin_unlock_irq(&pool->lock); - -		/* -		 * We've raced with CPU hot[un]plug.  Give it a breather -		 * and retry migration.  cond_resched() is required here; -		 * otherwise, we might deadlock against cpu_stop trying to -		 * bring down the CPU on non-preemptive kernel. 
-		 */ -		cpu_relax(); -		cond_resched(); -	} -} -  static struct worker *alloc_worker(void)  {  	struct worker *worker; @@ -1693,6 +1610,7 @@ static struct worker *alloc_worker(void)  	if (worker) {  		INIT_LIST_HEAD(&worker->entry);  		INIT_LIST_HEAD(&worker->scheduled); +		INIT_LIST_HEAD(&worker->node);  		/* on creation a worker is in !idle && prep state */  		worker->flags = WORKER_PREP;  	} @@ -1700,12 +1618,68 @@ static struct worker *alloc_worker(void)  }  /** + * worker_attach_to_pool() - attach a worker to a pool + * @worker: worker to be attached + * @pool: the target pool + * + * Attach @worker to @pool.  Once attached, the %WORKER_UNBOUND flag and + * cpu-binding of @worker are kept coordinated with the pool across + * cpu-[un]hotplugs. + */ +static void worker_attach_to_pool(struct worker *worker, +				   struct worker_pool *pool) +{ +	mutex_lock(&pool->attach_mutex); + +	/* +	 * set_cpus_allowed_ptr() will fail if the cpumask doesn't have any +	 * online CPUs.  It'll be re-applied when any of the CPUs come up. +	 */ +	set_cpus_allowed_ptr(worker->task, pool->attrs->cpumask); + +	/* +	 * The pool->attach_mutex ensures %POOL_DISASSOCIATED remains +	 * stable across this function.  See the comments above the +	 * flag definition for details. +	 */ +	if (pool->flags & POOL_DISASSOCIATED) +		worker->flags |= WORKER_UNBOUND; + +	list_add_tail(&worker->node, &pool->workers); + +	mutex_unlock(&pool->attach_mutex); +} + +/** + * worker_detach_from_pool() - detach a worker from its pool + * @worker: worker which is attached to its pool + * @pool: the pool @worker is attached to + * + * Undo the attaching which had been done in worker_attach_to_pool().  The + * caller worker shouldn't access to the pool after detached except it has + * other reference to the pool. + */ +static void worker_detach_from_pool(struct worker *worker, +				    struct worker_pool *pool) +{ +	struct completion *detach_completion = NULL; + +	mutex_lock(&pool->attach_mutex); +	list_del(&worker->node); +	if (list_empty(&pool->workers)) +		detach_completion = pool->detach_completion; +	mutex_unlock(&pool->attach_mutex); + +	if (detach_completion) +		complete(detach_completion); +} + +/**   * create_worker - create a new workqueue worker   * @pool: pool the new worker will belong to   * - * Create a new worker which is bound to @pool.  The returned worker - * can be started by calling start_worker() or destroyed using - * destroy_worker(). + * Create a new worker which is attached to @pool.  The new worker must be + * started by start_worker().   *   * CONTEXT:   * Might sleep.  Does GFP_KERNEL allocations. @@ -1719,19 +1693,8 @@ static struct worker *create_worker(struct worker_pool *pool)  	int id = -1;  	char id_buf[16]; -	lockdep_assert_held(&pool->manager_mutex); - -	/* -	 * ID is needed to determine kthread name.  Allocate ID first -	 * without installing the pointer. -	 */ -	idr_preload(GFP_KERNEL); -	spin_lock_irq(&pool->lock); - -	id = idr_alloc(&pool->worker_idr, NULL, 0, 0, GFP_NOWAIT); - -	spin_unlock_irq(&pool->lock); -	idr_preload_end(); +	/* ID is needed to determine kthread name */ +	id = ida_simple_get(&pool->worker_ida, 0, 0, GFP_KERNEL);  	if (id < 0)  		goto fail; @@ -1758,33 +1721,14 @@ static struct worker *create_worker(struct worker_pool *pool)  	/* prevent userland from meddling with cpumask of workqueue workers */  	worker->task->flags |= PF_NO_SETAFFINITY; -	/* -	 * set_cpus_allowed_ptr() will fail if the cpumask doesn't have any -	 * online CPUs.  
It'll be re-applied when any of the CPUs come up. -	 */ -	set_cpus_allowed_ptr(worker->task, pool->attrs->cpumask); - -	/* -	 * The caller is responsible for ensuring %POOL_DISASSOCIATED -	 * remains stable across this function.  See the comments above the -	 * flag definition for details. -	 */ -	if (pool->flags & POOL_DISASSOCIATED) -		worker->flags |= WORKER_UNBOUND; - -	/* successful, commit the pointer to idr */ -	spin_lock_irq(&pool->lock); -	idr_replace(&pool->worker_idr, worker, worker->id); -	spin_unlock_irq(&pool->lock); +	/* successful, attach the worker to the pool */ +	worker_attach_to_pool(worker, pool);  	return worker;  fail: -	if (id >= 0) { -		spin_lock_irq(&pool->lock); -		idr_remove(&pool->worker_idr, id); -		spin_unlock_irq(&pool->lock); -	} +	if (id >= 0) +		ida_simple_remove(&pool->worker_ida, id);  	kfree(worker);  	return NULL;  } @@ -1800,7 +1744,6 @@ fail:   */  static void start_worker(struct worker *worker)  { -	worker->flags |= WORKER_STARTED;  	worker->pool->nr_workers++;  	worker_enter_idle(worker);  	wake_up_process(worker->task); @@ -1818,8 +1761,6 @@ static int create_and_start_worker(struct worker_pool *pool)  {  	struct worker *worker; -	mutex_lock(&pool->manager_mutex); -  	worker = create_worker(pool);  	if (worker) {  		spin_lock_irq(&pool->lock); @@ -1827,8 +1768,6 @@ static int create_and_start_worker(struct worker_pool *pool)  		spin_unlock_irq(&pool->lock);  	} -	mutex_unlock(&pool->manager_mutex); -  	return worker ? 0 : -ENOMEM;  } @@ -1836,46 +1775,30 @@ static int create_and_start_worker(struct worker_pool *pool)   * destroy_worker - destroy a workqueue worker   * @worker: worker to be destroyed   * - * Destroy @worker and adjust @pool stats accordingly. + * Destroy @worker and adjust @pool stats accordingly.  The worker should + * be idle.   *   * CONTEXT: - * spin_lock_irq(pool->lock) which is released and regrabbed. + * spin_lock_irq(pool->lock).   */  static void destroy_worker(struct worker *worker)  {  	struct worker_pool *pool = worker->pool; -	lockdep_assert_held(&pool->manager_mutex);  	lockdep_assert_held(&pool->lock);  	/* sanity check frenzy */  	if (WARN_ON(worker->current_work) || -	    WARN_ON(!list_empty(&worker->scheduled))) +	    WARN_ON(!list_empty(&worker->scheduled)) || +	    WARN_ON(!(worker->flags & WORKER_IDLE)))  		return; -	if (worker->flags & WORKER_STARTED) -		pool->nr_workers--; -	if (worker->flags & WORKER_IDLE) -		pool->nr_idle--; - -	/* -	 * Once WORKER_DIE is set, the kworker may destroy itself at any -	 * point.  Pin to ensure the task stays until we're done with it. 
-	 */ -	get_task_struct(worker->task); +	pool->nr_workers--; +	pool->nr_idle--;  	list_del_init(&worker->entry);  	worker->flags |= WORKER_DIE; - -	idr_remove(&pool->worker_idr, worker->id); - -	spin_unlock_irq(&pool->lock); - -	kthread_stop(worker->task); -	put_task_struct(worker->task); -	kfree(worker); - -	spin_lock_irq(&pool->lock); +	wake_up_process(worker->task);  }  static void idle_worker_timeout(unsigned long __pool) @@ -1884,7 +1807,7 @@ static void idle_worker_timeout(unsigned long __pool)  	spin_lock_irq(&pool->lock); -	if (too_many_workers(pool)) { +	while (too_many_workers(pool)) {  		struct worker *worker;  		unsigned long expires; @@ -1892,13 +1815,12 @@ static void idle_worker_timeout(unsigned long __pool)  		worker = list_entry(pool->idle_list.prev, struct worker, entry);  		expires = worker->last_active + IDLE_WORKER_TIMEOUT; -		if (time_before(jiffies, expires)) +		if (time_before(jiffies, expires)) {  			mod_timer(&pool->idle_timer, expires); -		else { -			/* it's been idle for too long, wake up manager */ -			pool->flags |= POOL_MANAGE_WORKERS; -			wake_up_worker(pool); +			break;  		} + +		destroy_worker(worker);  	}  	spin_unlock_irq(&pool->lock); @@ -1916,6 +1838,12 @@ static void send_mayday(struct work_struct *work)  	/* mayday mayday mayday */  	if (list_empty(&pwq->mayday_node)) { +		/* +		 * If @pwq is for an unbound wq, its base ref may be put at +		 * any time due to an attribute change.  Pin @pwq until the +		 * rescuer is done with it. +		 */ +		get_pwq(pwq);  		list_add_tail(&pwq->mayday_node, &wq->maydays);  		wake_up_process(wq->rescuer->task);  	} @@ -2011,44 +1939,6 @@ restart:  }  /** - * maybe_destroy_worker - destroy workers which have been idle for a while - * @pool: pool to destroy workers for - * - * Destroy @pool workers which have been idle for longer than - * IDLE_WORKER_TIMEOUT. - * - * LOCKING: - * spin_lock_irq(pool->lock) which may be released and regrabbed - * multiple times.  Called only from manager. - * - * Return: - * %false if no action was taken and pool->lock stayed locked, %true - * otherwise. - */ -static bool maybe_destroy_workers(struct worker_pool *pool) -{ -	bool ret = false; - -	while (too_many_workers(pool)) { -		struct worker *worker; -		unsigned long expires; - -		worker = list_entry(pool->idle_list.prev, struct worker, entry); -		expires = worker->last_active + IDLE_WORKER_TIMEOUT; - -		if (time_before(jiffies, expires)) { -			mod_timer(&pool->idle_timer, expires); -			break; -		} - -		destroy_worker(worker); -		ret = true; -	} - -	return ret; -} - -/**   * manage_workers - manage worker pool   * @worker: self   * @@ -2077,8 +1967,6 @@ static bool manage_workers(struct worker *worker)  	bool ret = false;  	/* -	 * Managership is governed by two mutexes - manager_arb and -	 * manager_mutex.  manager_arb handles arbitration of manager role.  	 * Anyone who successfully grabs manager_arb wins the arbitration  	 * and becomes the manager.  mutex_trylock() on pool->manager_arb  	 * failure while holding pool->lock reliably indicates that someone @@ -2087,40 +1975,12 @@ static bool manage_workers(struct worker *worker)  	 * grabbing manager_arb is responsible for actually performing  	 * manager duties.  If manager_arb is grabbed and released without  	 * actual management, the pool may stall indefinitely. -	 * -	 * manager_mutex is used for exclusion of actual management -	 * operations.  
The holder of manager_mutex can be sure that none -	 * of management operations, including creation and destruction of -	 * workers, won't take place until the mutex is released.  Because -	 * manager_mutex doesn't interfere with manager role arbitration, -	 * it is guaranteed that the pool's management, while may be -	 * delayed, won't be disturbed by someone else grabbing -	 * manager_mutex.  	 */  	if (!mutex_trylock(&pool->manager_arb))  		return ret; -	/* -	 * With manager arbitration won, manager_mutex would be free in -	 * most cases.  trylock first without dropping @pool->lock. -	 */ -	if (unlikely(!mutex_trylock(&pool->manager_mutex))) { -		spin_unlock_irq(&pool->lock); -		mutex_lock(&pool->manager_mutex); -		spin_lock_irq(&pool->lock); -		ret = true; -	} - -	pool->flags &= ~POOL_MANAGE_WORKERS; - -	/* -	 * Destroy and then create so that may_start_working() is true -	 * on return. -	 */ -	ret |= maybe_destroy_workers(pool);  	ret |= maybe_create_worker(pool); -	mutex_unlock(&pool->manager_mutex);  	mutex_unlock(&pool->manager_arb);  	return ret;  } @@ -2308,6 +2168,11 @@ woke_up:  		spin_unlock_irq(&pool->lock);  		WARN_ON_ONCE(!list_empty(&worker->entry));  		worker->task->flags &= ~PF_WQ_WORKER; + +		set_task_comm(worker->task, "kworker/dying"); +		ida_simple_remove(&pool->worker_ida, worker->id); +		worker_detach_from_pool(worker, pool); +		kfree(worker);  		return 0;  	} @@ -2355,9 +2220,6 @@ recheck:  	worker_set_flags(worker, WORKER_PREP, false);  sleep: -	if (unlikely(need_to_manage_workers(pool)) && manage_workers(worker)) -		goto recheck; -  	/*  	 * pool->lock is held and there's no work to process and no need to  	 * manage, sleep.  Workers are woken up only while holding @@ -2398,6 +2260,7 @@ static int rescuer_thread(void *__rescuer)  	struct worker *rescuer = __rescuer;  	struct workqueue_struct *wq = rescuer->rescue_wq;  	struct list_head *scheduled = &rescuer->scheduled; +	bool should_stop;  	set_user_nice(current, RESCUER_NICE_LEVEL); @@ -2409,11 +2272,15 @@ static int rescuer_thread(void *__rescuer)  repeat:  	set_current_state(TASK_INTERRUPTIBLE); -	if (kthread_should_stop()) { -		__set_current_state(TASK_RUNNING); -		rescuer->task->flags &= ~PF_WQ_WORKER; -		return 0; -	} +	/* +	 * By the time the rescuer is requested to stop, the workqueue +	 * shouldn't have any work pending, but @wq->maydays may still have +	 * pwq(s) queued.  This can happen by non-rescuer workers consuming +	 * all the work items before the rescuer got to them.  Go through +	 * @wq->maydays processing before acting on should_stop so that the +	 * list is always empty on exit. +	 */ +	should_stop = kthread_should_stop();  	/* see whether any pwq is asking for help */  	spin_lock_irq(&wq_mayday_lock); @@ -2429,8 +2296,9 @@ repeat:  		spin_unlock_irq(&wq_mayday_lock); -		/* migrate to the target cpu if possible */ -		worker_maybe_bind_and_lock(pool); +		worker_attach_to_pool(rescuer, pool); + +		spin_lock_irq(&pool->lock);  		rescuer->pool = pool;  		/* @@ -2443,6 +2311,17 @@ repeat:  				move_linked_works(work, scheduled, &n);  		process_scheduled_works(rescuer); +		spin_unlock_irq(&pool->lock); + +		worker_detach_from_pool(rescuer, pool); + +		spin_lock_irq(&pool->lock); + +		/* +		 * Put the reference grabbed by send_mayday().  @pool won't +		 * go away while we're holding its lock. +		 */ +		put_pwq(pwq);  		/*  		 * Leave this pool.  
If keep_working() is %true, notify a @@ -2459,6 +2338,12 @@ repeat:  	spin_unlock_irq(&wq_mayday_lock); +	if (should_stop) { +		__set_current_state(TASK_RUNNING); +		rescuer->task->flags &= ~PF_WQ_WORKER; +		return 0; +	} +  	/* rescuers should never participate in concurrency management */  	WARN_ON_ONCE(!(rescuer->flags & WORKER_NOT_RUNNING));  	schedule(); @@ -3527,9 +3412,10 @@ static int init_worker_pool(struct worker_pool *pool)  		    (unsigned long)pool);  	mutex_init(&pool->manager_arb); -	mutex_init(&pool->manager_mutex); -	idr_init(&pool->worker_idr); +	mutex_init(&pool->attach_mutex); +	INIT_LIST_HEAD(&pool->workers); +	ida_init(&pool->worker_ida);  	INIT_HLIST_NODE(&pool->hash_node);  	pool->refcnt = 1; @@ -3544,7 +3430,7 @@ static void rcu_free_pool(struct rcu_head *rcu)  {  	struct worker_pool *pool = container_of(rcu, struct worker_pool, rcu); -	idr_destroy(&pool->worker_idr); +	ida_destroy(&pool->worker_ida);  	free_workqueue_attrs(pool->attrs);  	kfree(pool);  } @@ -3562,6 +3448,7 @@ static void rcu_free_pool(struct rcu_head *rcu)   */  static void put_unbound_pool(struct worker_pool *pool)  { +	DECLARE_COMPLETION_ONSTACK(detach_completion);  	struct worker *worker;  	lockdep_assert_held(&wq_pool_mutex); @@ -3582,18 +3469,24 @@ static void put_unbound_pool(struct worker_pool *pool)  	/*  	 * Become the manager and destroy all workers.  Grabbing  	 * manager_arb prevents @pool's workers from blocking on -	 * manager_mutex. +	 * attach_mutex.  	 */  	mutex_lock(&pool->manager_arb); -	mutex_lock(&pool->manager_mutex); -	spin_lock_irq(&pool->lock); -	while ((worker = first_worker(pool))) +	spin_lock_irq(&pool->lock); +	while ((worker = first_idle_worker(pool)))  		destroy_worker(worker);  	WARN_ON(pool->nr_workers || pool->nr_idle); -  	spin_unlock_irq(&pool->lock); -	mutex_unlock(&pool->manager_mutex); + +	mutex_lock(&pool->attach_mutex); +	if (!list_empty(&pool->workers)) +		pool->detach_completion = &detach_completion; +	mutex_unlock(&pool->attach_mutex); + +	if (pool->detach_completion) +		wait_for_completion(pool->detach_completion); +  	mutex_unlock(&pool->manager_arb);  	/* shut down the timers */ @@ -3639,9 +3532,6 @@ static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs)  	if (!pool || init_worker_pool(pool) < 0)  		goto fail; -	if (workqueue_freezing) -		pool->flags |= POOL_FREEZING; -  	lockdep_set_subclass(&pool->lock, 1);	/* see put_pwq() */  	copy_workqueue_attrs(pool->attrs, attrs); @@ -3748,7 +3638,12 @@ static void pwq_adjust_max_active(struct pool_workqueue *pwq)  	spin_lock_irq(&pwq->pool->lock); -	if (!freezable || !(pwq->pool->flags & POOL_FREEZING)) { +	/* +	 * During [un]freezing, the caller is responsible for ensuring that +	 * this function is called at least once after @workqueue_freezing +	 * is updated and visible. +	 */ +	if (!freezable || !workqueue_freezing) {  		pwq->max_active = wq->saved_max_active;  		while (!list_empty(&pwq->delayed_works) && @@ -4080,17 +3975,13 @@ static void wq_update_unbound_numa(struct workqueue_struct *wq, int cpu,  	 * Let's determine what needs to be done.  If the target cpumask is  	 * different from wq's, we need to compare it to @pwq's and create  	 * a new one if they don't match.  If the target cpumask equals -	 * wq's, the default pwq should be used.  If @pwq is already the -	 * default one, nothing to do; otherwise, install the default one. +	 * wq's, the default pwq should be used.  	 
*/  	if (wq_calc_node_cpumask(wq->unbound_attrs, node, cpu_off, cpumask)) {  		if (cpumask_equal(cpumask, pwq->pool->attrs->cpumask))  			goto out_unlock;  	} else { -		if (pwq == wq->dfl_pwq) -			goto out_unlock; -		else -			goto use_dfl_pwq; +		goto use_dfl_pwq;  	}  	mutex_unlock(&wq->mutex); @@ -4098,9 +3989,10 @@ static void wq_update_unbound_numa(struct workqueue_struct *wq, int cpu,  	/* create a new pwq */  	pwq = alloc_unbound_pwq(wq, target_attrs);  	if (!pwq) { -		pr_warning("workqueue: allocation failed while updating NUMA affinity of \"%s\"\n", -			   wq->name); -		goto out_unlock; +		pr_warn("workqueue: allocation failed while updating NUMA affinity of \"%s\"\n", +			wq->name); +		mutex_lock(&wq->mutex); +		goto use_dfl_pwq;  	}  	/* @@ -4575,28 +4467,27 @@ static void wq_unbind_fn(struct work_struct *work)  	int cpu = smp_processor_id();  	struct worker_pool *pool;  	struct worker *worker; -	int wi;  	for_each_cpu_worker_pool(pool, cpu) {  		WARN_ON_ONCE(cpu != smp_processor_id()); -		mutex_lock(&pool->manager_mutex); +		mutex_lock(&pool->attach_mutex);  		spin_lock_irq(&pool->lock);  		/* -		 * We've blocked all manager operations.  Make all workers +		 * We've blocked all attach/detach operations. Make all workers  		 * unbound and set DISASSOCIATED.  Before this, all workers  		 * except for the ones which are still executing works from  		 * before the last CPU down must be on the cpu.  After  		 * this, they may become diasporas.  		 */ -		for_each_pool_worker(worker, wi, pool) +		for_each_pool_worker(worker, pool)  			worker->flags |= WORKER_UNBOUND;  		pool->flags |= POOL_DISASSOCIATED;  		spin_unlock_irq(&pool->lock); -		mutex_unlock(&pool->manager_mutex); +		mutex_unlock(&pool->attach_mutex);  		/*  		 * Call schedule() so that we cross rq->lock and thus can @@ -4636,9 +4527,8 @@ static void wq_unbind_fn(struct work_struct *work)  static void rebind_workers(struct worker_pool *pool)  {  	struct worker *worker; -	int wi; -	lockdep_assert_held(&pool->manager_mutex); +	lockdep_assert_held(&pool->attach_mutex);  	/*  	 * Restore CPU affinity of all workers.  As all idle workers should @@ -4647,13 +4537,13 @@ static void rebind_workers(struct worker_pool *pool)  	 * of all workers first and then clear UNBOUND.  As we're called  	 * from CPU_ONLINE, the following shouldn't fail.  	 */ -	for_each_pool_worker(worker, wi, pool) +	for_each_pool_worker(worker, pool)  		WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task,  						  pool->attrs->cpumask) < 0);  	spin_lock_irq(&pool->lock); -	for_each_pool_worker(worker, wi, pool) { +	for_each_pool_worker(worker, pool) {  		unsigned int worker_flags = worker->flags;  		/* @@ -4705,9 +4595,8 @@ static void restore_unbound_workers_cpumask(struct worker_pool *pool, int cpu)  {  	static cpumask_t cpumask;  	struct worker *worker; -	int wi; -	lockdep_assert_held(&pool->manager_mutex); +	lockdep_assert_held(&pool->attach_mutex);  	/* is @cpu allowed for @pool? 
*/  	if (!cpumask_test_cpu(cpu, pool->attrs->cpumask)) @@ -4719,7 +4608,7 @@ static void restore_unbound_workers_cpumask(struct worker_pool *pool, int cpu)  		return;  	/* as we're called from CPU_ONLINE, the following shouldn't fail */ -	for_each_pool_worker(worker, wi, pool) +	for_each_pool_worker(worker, pool)  		WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task,  						  pool->attrs->cpumask) < 0);  } @@ -4752,7 +4641,7 @@ static int workqueue_cpu_up_callback(struct notifier_block *nfb,  		mutex_lock(&wq_pool_mutex);  		for_each_pool(pool, pi) { -			mutex_lock(&pool->manager_mutex); +			mutex_lock(&pool->attach_mutex);  			if (pool->cpu == cpu) {  				spin_lock_irq(&pool->lock); @@ -4764,7 +4653,7 @@ static int workqueue_cpu_up_callback(struct notifier_block *nfb,  				restore_unbound_workers_cpumask(pool, cpu);  			} -			mutex_unlock(&pool->manager_mutex); +			mutex_unlock(&pool->attach_mutex);  		}  		/* update NUMA affinity of unbound workqueues */ @@ -4863,24 +4752,14 @@ EXPORT_SYMBOL_GPL(work_on_cpu);   */  void freeze_workqueues_begin(void)  { -	struct worker_pool *pool;  	struct workqueue_struct *wq;  	struct pool_workqueue *pwq; -	int pi;  	mutex_lock(&wq_pool_mutex);  	WARN_ON_ONCE(workqueue_freezing);  	workqueue_freezing = true; -	/* set FREEZING */ -	for_each_pool(pool, pi) { -		spin_lock_irq(&pool->lock); -		WARN_ON_ONCE(pool->flags & POOL_FREEZING); -		pool->flags |= POOL_FREEZING; -		spin_unlock_irq(&pool->lock); -	} -  	list_for_each_entry(wq, &workqueues, list) {  		mutex_lock(&wq->mutex);  		for_each_pwq(pwq, wq) @@ -4950,21 +4829,13 @@ void thaw_workqueues(void)  {  	struct workqueue_struct *wq;  	struct pool_workqueue *pwq; -	struct worker_pool *pool; -	int pi;  	mutex_lock(&wq_pool_mutex);  	if (!workqueue_freezing)  		goto out_unlock; -	/* clear FREEZING */ -	for_each_pool(pool, pi) { -		spin_lock_irq(&pool->lock); -		WARN_ON_ONCE(!(pool->flags & POOL_FREEZING)); -		pool->flags &= ~POOL_FREEZING; -		spin_unlock_irq(&pool->lock); -	} +	workqueue_freezing = false;  	/* restore max_active and repopulate worklist */  	list_for_each_entry(wq, &workqueues, list) { @@ -4974,7 +4845,6 @@ void thaw_workqueues(void)  		mutex_unlock(&wq->mutex);  	} -	workqueue_freezing = false;  out_unlock:  	mutex_unlock(&wq_pool_mutex);  } diff --git a/kernel/workqueue_internal.h b/kernel/workqueue_internal.h index 7e2204db0b1a..45215870ac6c 100644 --- a/kernel/workqueue_internal.h +++ b/kernel/workqueue_internal.h @@ -37,6 +37,8 @@ struct worker {  	struct task_struct	*task;		/* I: worker task */  	struct worker_pool	*pool;		/* I: the associated pool */  						/* L: for rescuers */ +	struct list_head	node;		/* A: anchored at pool->workers */ +						/* A: runs through worker->node */  	unsigned long		last_active;	/* L: last active timestamp */  	unsigned int		flags;		/* X: flags */  | 
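
The workqueue.c hunks above replace the broad manager_mutex with a narrower attach_mutex plus a pool->workers list, and put_unbound_pool() now waits on an on-stack completion until the last worker has detached before the pool is torn down. The sketch below shows only that attach/detach/completion handshake under assumed names (demo_pool, demo_worker and the demo_* functions are illustrative stand-ins, not the real structures); the detach side is inferred from the call sites, since worker_detach_from_pool() itself is not part of the hunks shown.

#include <linux/completion.h>
#include <linux/list.h>
#include <linux/mutex.h>

/* Trimmed-down stand-ins for the real worker_pool/worker structures. */
struct demo_pool {
	struct mutex		attach_mutex;		/* protects ->workers */
	struct list_head	workers;		/* attached workers */
	struct completion	*detach_completion;	/* NULL unless a destroyer waits */
};

struct demo_worker {
	struct list_head	node;			/* anchored at pool->workers */
};

static void demo_worker_attach(struct demo_worker *worker, struct demo_pool *pool)
{
	mutex_lock(&pool->attach_mutex);
	list_add_tail(&worker->node, &pool->workers);
	mutex_unlock(&pool->attach_mutex);
}

static void demo_worker_detach(struct demo_worker *worker, struct demo_pool *pool)
{
	struct completion *detach_completion = NULL;

	mutex_lock(&pool->attach_mutex);
	list_del(&worker->node);
	if (list_empty(&pool->workers))
		detach_completion = pool->detach_completion;
	mutex_unlock(&pool->attach_mutex);

	/* the last worker to leave wakes up a waiting destroyer, if any */
	if (detach_completion)
		complete(detach_completion);
}

/* caller must already exclude new attachers (the diff holds manager_arb here) */
static void demo_pool_wait_for_detach(struct demo_pool *pool)
{
	DECLARE_COMPLETION_ONSTACK(detach_completion);

	mutex_lock(&pool->attach_mutex);
	if (!list_empty(&pool->workers))
		pool->detach_completion = &detach_completion;
	mutex_unlock(&pool->attach_mutex);

	if (pool->detach_completion)
		wait_for_completion(pool->detach_completion);
}

Because attach_mutex guards only the workers list, a dying worker can detach without taking part in manager arbitration, and the destroyer gets a positive signal that every worker is gone before it frees the pool.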

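init_worker_pool() and rcu_free_pool() above also switch the pool from an idr to an ida for worker IDs: with workers kept on pool->workers, id-to-pointer lookup is no longer needed, and an ida only hands out unique IDs (the dying-worker path now calls ida_simple_remove()). A minimal sketch of that API follows, with demo_* names that are illustrative rather than kernel symbols; the matching allocation in create_worker() is assumed, as it is not in the hunks shown here.

#include <linux/gfp.h>
#include <linux/idr.h>

static DEFINE_IDA(demo_worker_ida);

/* allocate a unique worker id; returns the id or a negative errno */
static int demo_alloc_worker_id(void)
{
	/* end == 0 means "no upper bound" for ida_simple_get() */
	return ida_simple_get(&demo_worker_ida, 0, 0, GFP_KERNEL);
}

static void demo_free_worker_id(unsigned int id)
{
	ida_simple_remove(&demo_worker_ida, id);
}

An ida is cheaper than an idr for this purpose because it keeps no per-id pointers, only the allocation bitmap.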

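The rescuer_thread() hunks encode a reusable kthread shutdown rule: sample kthread_should_stop() once, drain whatever is queued, and only then act on the stop request, so work queued before kthread_stop() is never dropped. Below is a self-contained sketch of just that ordering with made-up types (demo_item, demo_queue and demo_drainer_thread are not workqueue code, and the real rescuer additionally attaches to and detaches from the pool around processing).

#include <linux/kthread.h>
#include <linux/list.h>
#include <linux/sched.h>
#include <linux/spinlock.h>

struct demo_item {
	struct list_head node;
};

static LIST_HEAD(demo_queue);
static DEFINE_SPINLOCK(demo_lock);

static void demo_process(struct demo_item *item)
{
	/* stand-in for real work on @item */
}

static int demo_drainer_thread(void *unused)
{
	for (;;) {
		struct demo_item *item;
		bool should_stop;

		set_current_state(TASK_INTERRUPTIBLE);

		/*
		 * Sample the stop request *before* draining, so anything
		 * queued before kthread_stop() is still processed.
		 */
		should_stop = kthread_should_stop();

		spin_lock_irq(&demo_lock);
		while ((item = list_first_entry_or_null(&demo_queue,
							struct demo_item, node))) {
			list_del_init(&item->node);
			spin_unlock_irq(&demo_lock);

			__set_current_state(TASK_RUNNING);
			demo_process(item);

			spin_lock_irq(&demo_lock);
		}
		spin_unlock_irq(&demo_lock);

		if (should_stop) {
			__set_current_state(TASK_RUNNING);
			return 0;
		}

		schedule();
	}
}

A producer would add items to demo_queue under demo_lock and wake_up_process() the thread; kthread_stop() wakes it one last time, and because the drain happens before should_stop is acted on, the queue is empty when the thread exits, assuming nothing new is queued after the stop request, which is the same assumption the comment in the diff states for the rescuer.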