Diffstat (limited to 'kernel')
32 files changed, 404 insertions, 238 deletions
diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 2cf0f79f1fc9..2c9eae6ad970 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -46,7 +46,6 @@  #include <linux/slab.h>  #include <linux/spinlock.h>  #include <linux/rwsem.h> -#include <linux/percpu-rwsem.h>  #include <linux/string.h>  #include <linux/sort.h>  #include <linux/kmod.h> @@ -104,8 +103,6 @@ static DEFINE_SPINLOCK(cgroup_idr_lock);   */  static DEFINE_SPINLOCK(release_agent_path_lock); -struct percpu_rw_semaphore cgroup_threadgroup_rwsem; -  #define cgroup_assert_mutex_or_rcu_locked()				\  	RCU_LOCKDEP_WARN(!rcu_read_lock_held() &&			\  			   !lockdep_is_held(&cgroup_mutex),		\ @@ -874,6 +871,48 @@ static struct css_set *find_css_set(struct css_set *old_cset,  	return cset;  } +void cgroup_threadgroup_change_begin(struct task_struct *tsk) +{ +	down_read(&tsk->signal->group_rwsem); +} + +void cgroup_threadgroup_change_end(struct task_struct *tsk) +{ +	up_read(&tsk->signal->group_rwsem); +} + +/** + * threadgroup_lock - lock threadgroup + * @tsk: member task of the threadgroup to lock + * + * Lock the threadgroup @tsk belongs to.  No new task is allowed to enter + * and member tasks aren't allowed to exit (as indicated by PF_EXITING) or + * change ->group_leader/pid.  This is useful for cases where the threadgroup + * needs to stay stable across blockable operations. + * + * fork and exit explicitly call threadgroup_change_{begin|end}() for + * synchronization.  While held, no new task will be added to threadgroup + * and no existing live task will have its PF_EXITING set. + * + * de_thread() does threadgroup_change_{begin|end}() when a non-leader + * sub-thread becomes a new leader. + */ +static void threadgroup_lock(struct task_struct *tsk) +{ +	down_write(&tsk->signal->group_rwsem); +} + +/** + * threadgroup_unlock - unlock threadgroup + * @tsk: member task of the threadgroup to unlock + * + * Reverse threadgroup_lock(). + */ +static inline void threadgroup_unlock(struct task_struct *tsk) +{ +	up_write(&tsk->signal->group_rwsem); +} +  static struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root)  {  	struct cgroup *root_cgrp = kf_root->kn->priv; @@ -2074,9 +2113,9 @@ static void cgroup_task_migrate(struct cgroup *old_cgrp,  	lockdep_assert_held(&css_set_rwsem);  	/* -	 * We are synchronized through cgroup_threadgroup_rwsem against -	 * PF_EXITING setting such that we can't race against cgroup_exit() -	 * changing the css_set to init_css_set and dropping the old one. +	 * We are synchronized through threadgroup_lock() against PF_EXITING +	 * setting such that we can't race against cgroup_exit() changing the +	 * css_set to init_css_set and dropping the old one.  	 */  	WARN_ON_ONCE(tsk->flags & PF_EXITING);  	old_cset = task_css_set(tsk); @@ -2133,11 +2172,10 @@ static void cgroup_migrate_finish(struct list_head *preloaded_csets)   * @src_cset and add it to @preloaded_csets, which should later be cleaned   * up by cgroup_migrate_finish().   * - * This function may be called without holding cgroup_threadgroup_rwsem - * even if the target is a process.  Threads may be created and destroyed - * but as long as cgroup_mutex is not dropped, no new css_set can be put - * into play and the preloaded css_sets are guaranteed to cover all - * migrations. + * This function may be called without holding threadgroup_lock even if the + * target is a process.  
Threads may be created and destroyed but as long + * as cgroup_mutex is not dropped, no new css_set can be put into play and + * the preloaded css_sets are guaranteed to cover all migrations.   */  static void cgroup_migrate_add_src(struct css_set *src_cset,  				   struct cgroup *dst_cgrp, @@ -2240,7 +2278,7 @@ err:   * @threadgroup: whether @leader points to the whole process or a single task   *   * Migrate a process or task denoted by @leader to @cgrp.  If migrating a - * process, the caller must be holding cgroup_threadgroup_rwsem.  The + * process, the caller must be holding threadgroup_lock of @leader.  The   * caller is also responsible for invoking cgroup_migrate_add_src() and   * cgroup_migrate_prepare_dst() on the targets before invoking this   * function and following up with cgroup_migrate_finish(). @@ -2368,7 +2406,7 @@ out_release_tset:   * @leader: the task or the leader of the threadgroup to be attached   * @threadgroup: attach the whole threadgroup?   * - * Call holding cgroup_mutex and cgroup_threadgroup_rwsem. + * Call holding cgroup_mutex and threadgroup_lock of @leader.   */  static int cgroup_attach_task(struct cgroup *dst_cgrp,  			      struct task_struct *leader, bool threadgroup) @@ -2460,13 +2498,14 @@ static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,  	if (!cgrp)  		return -ENODEV; -	percpu_down_write(&cgroup_threadgroup_rwsem); +retry_find_task:  	rcu_read_lock();  	if (pid) {  		tsk = find_task_by_vpid(pid);  		if (!tsk) { +			rcu_read_unlock();  			ret = -ESRCH; -			goto out_unlock_rcu; +			goto out_unlock_cgroup;  		}  	} else {  		tsk = current; @@ -2482,23 +2521,37 @@ static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,  	 */  	if (tsk == kthreadd_task || (tsk->flags & PF_NO_SETAFFINITY)) {  		ret = -EINVAL; -		goto out_unlock_rcu; +		rcu_read_unlock(); +		goto out_unlock_cgroup;  	}  	get_task_struct(tsk);  	rcu_read_unlock(); +	threadgroup_lock(tsk); +	if (threadgroup) { +		if (!thread_group_leader(tsk)) { +			/* +			 * a race with de_thread from another thread's exec() +			 * may strip us of our leadership, if this happens, +			 * there is no choice but to throw this task away and +			 * try again; this is +			 * "double-double-toil-and-trouble-check locking". +			 */ +			threadgroup_unlock(tsk); +			put_task_struct(tsk); +			goto retry_find_task; +		} +	} +  	ret = cgroup_procs_write_permission(tsk, cgrp, of);  	if (!ret)  		ret = cgroup_attach_task(cgrp, tsk, threadgroup); -	put_task_struct(tsk); -	goto out_unlock_threadgroup; +	threadgroup_unlock(tsk); -out_unlock_rcu: -	rcu_read_unlock(); -out_unlock_threadgroup: -	percpu_up_write(&cgroup_threadgroup_rwsem); +	put_task_struct(tsk); +out_unlock_cgroup:  	cgroup_kn_unlock(of->kn);  	return ret ?: nbytes;  } @@ -2643,8 +2696,6 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)  	lockdep_assert_held(&cgroup_mutex); -	percpu_down_write(&cgroup_threadgroup_rwsem); -  	/* look up all csses currently attached to @cgrp's subtree */  	down_read(&css_set_rwsem);  	css_for_each_descendant_pre(css, cgroup_css(cgrp, NULL)) { @@ -2700,8 +2751,17 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)  				goto out_finish;  			last_task = task; +			threadgroup_lock(task); +			/* raced against de_thread() from another thread? 
*/ +			if (!thread_group_leader(task)) { +				threadgroup_unlock(task); +				put_task_struct(task); +				continue; +			} +  			ret = cgroup_migrate(src_cset->dfl_cgrp, task, true); +			threadgroup_unlock(task);  			put_task_struct(task);  			if (WARN(ret, "cgroup: failed to update controllers for the default hierarchy (%d), further operations may crash or hang\n", ret)) @@ -2711,7 +2771,6 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)  out_finish:  	cgroup_migrate_finish(&preloaded_csets); -	percpu_up_write(&cgroup_threadgroup_rwsem);  	return ret;  } @@ -5024,7 +5083,6 @@ int __init cgroup_init(void)  	unsigned long key;  	int ssid, err; -	BUG_ON(percpu_init_rwsem(&cgroup_threadgroup_rwsem));  	BUG_ON(cgroup_init_cftypes(NULL, cgroup_dfl_base_files));  	BUG_ON(cgroup_init_cftypes(NULL, cgroup_legacy_base_files)); diff --git a/kernel/events/core.c b/kernel/events/core.c index f548f69c4299..b11756f9b6dc 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1243,11 +1243,7 @@ static inline void perf_event__state_init(struct perf_event *event)  					      PERF_EVENT_STATE_INACTIVE;  } -/* - * Called at perf_event creation and when events are attached/detached from a - * group. - */ -static void perf_event__read_size(struct perf_event *event) +static void __perf_event_read_size(struct perf_event *event, int nr_siblings)  {  	int entry = sizeof(u64); /* value */  	int size = 0; @@ -1263,7 +1259,7 @@ static void perf_event__read_size(struct perf_event *event)  		entry += sizeof(u64);  	if (event->attr.read_format & PERF_FORMAT_GROUP) { -		nr += event->group_leader->nr_siblings; +		nr += nr_siblings;  		size += sizeof(u64);  	} @@ -1271,14 +1267,11 @@ static void perf_event__read_size(struct perf_event *event)  	event->read_size = size;  } -static void perf_event__header_size(struct perf_event *event) +static void __perf_event_header_size(struct perf_event *event, u64 sample_type)  {  	struct perf_sample_data *data; -	u64 sample_type = event->attr.sample_type;  	u16 size = 0; -	perf_event__read_size(event); -  	if (sample_type & PERF_SAMPLE_IP)  		size += sizeof(data->ip); @@ -1303,6 +1296,17 @@ static void perf_event__header_size(struct perf_event *event)  	event->header_size = size;  } +/* + * Called at perf_event creation and when events are attached/detached from a + * group. + */ +static void perf_event__header_size(struct perf_event *event) +{ +	__perf_event_read_size(event, +			       event->group_leader->nr_siblings); +	__perf_event_header_size(event, event->attr.sample_type); +} +  static void perf_event__id_header_size(struct perf_event *event)  {  	struct perf_sample_data *data; @@ -1330,6 +1334,27 @@ static void perf_event__id_header_size(struct perf_event *event)  	event->id_header_size = size;  } +static bool perf_event_validate_size(struct perf_event *event) +{ +	/* +	 * The values computed here will be over-written when we actually +	 * attach the event. +	 */ +	__perf_event_read_size(event, event->group_leader->nr_siblings + 1); +	__perf_event_header_size(event, event->attr.sample_type & ~PERF_SAMPLE_READ); +	perf_event__id_header_size(event); + +	/* +	 * Sum the lot; should not exceed the 64k limit we have on records. +	 * Conservative limit to allow for callchains and other variable fields. 
+	 */ +	if (event->read_size + event->header_size + +	    event->id_header_size + sizeof(struct perf_event_header) >= 16*1024) +		return false; + +	return true; +} +  static void perf_group_attach(struct perf_event *event)  {  	struct perf_event *group_leader = event->group_leader, *pos; @@ -8297,13 +8322,35 @@ SYSCALL_DEFINE5(perf_event_open,  	if (move_group) {  		gctx = group_leader->ctx; +		mutex_lock_double(&gctx->mutex, &ctx->mutex); +	} else { +		mutex_lock(&ctx->mutex); +	} +	if (!perf_event_validate_size(event)) { +		err = -E2BIG; +		goto err_locked; +	} + +	/* +	 * Must be under the same ctx::mutex as perf_install_in_context(), +	 * because we need to serialize with concurrent event creation. +	 */ +	if (!exclusive_event_installable(event, ctx)) { +		/* exclusive and group stuff are assumed mutually exclusive */ +		WARN_ON_ONCE(move_group); + +		err = -EBUSY; +		goto err_locked; +	} + +	WARN_ON_ONCE(ctx->parent_ctx); + +	if (move_group) {  		/*  		 * See perf_event_ctx_lock() for comments on the details  		 * of swizzling perf_event::ctx.  		 */ -		mutex_lock_double(&gctx->mutex, &ctx->mutex); -  		perf_remove_from_context(group_leader, false);  		list_for_each_entry(sibling, &group_leader->sibling_list, @@ -8311,13 +8358,7 @@ SYSCALL_DEFINE5(perf_event_open,  			perf_remove_from_context(sibling, false);  			put_ctx(gctx);  		} -	} else { -		mutex_lock(&ctx->mutex); -	} -	WARN_ON_ONCE(ctx->parent_ctx); - -	if (move_group) {  		/*  		 * Wait for everybody to stop referencing the events through  		 * the old lists, before installing it on new lists. @@ -8349,22 +8390,29 @@ SYSCALL_DEFINE5(perf_event_open,  		perf_event__state_init(group_leader);  		perf_install_in_context(ctx, group_leader, group_leader->cpu);  		get_ctx(ctx); -	} -	if (!exclusive_event_installable(event, ctx)) { -		err = -EBUSY; -		mutex_unlock(&ctx->mutex); -		fput(event_file); -		goto err_context; +		/* +		 * Now that all events are installed in @ctx, nothing +		 * references @gctx anymore, so drop the last reference we have +		 * on it. +		 */ +		put_ctx(gctx);  	} +	/* +	 * Precalculate sample_data sizes; do while holding ctx::mutex such +	 * that we're serialized against further additions and before +	 * perf_install_in_context() which is the point the event is active and +	 * can use these values. +	 */ +	perf_event__header_size(event); +	perf_event__id_header_size(event); +  	perf_install_in_context(ctx, event, event->cpu);  	perf_unpin_context(ctx); -	if (move_group) { +	if (move_group)  		mutex_unlock(&gctx->mutex); -		put_ctx(gctx); -	}  	mutex_unlock(&ctx->mutex);  	put_online_cpus(); @@ -8376,12 +8424,6 @@ SYSCALL_DEFINE5(perf_event_open,  	mutex_unlock(¤t->perf_event_mutex);  	/* -	 * Precalculate sample_data sizes -	 */ -	perf_event__header_size(event); -	perf_event__id_header_size(event); - -	/*  	 * Drop the reference on the group_event after placing the  	 * new event on the sibling_list. 
This ensures destruction  	 * of the group leader will find the pointer to itself in @@ -8391,6 +8433,12 @@ SYSCALL_DEFINE5(perf_event_open,  	fd_install(event_fd, event_file);  	return event_fd; +err_locked: +	if (move_group) +		mutex_unlock(&gctx->mutex); +	mutex_unlock(&ctx->mutex); +/* err_file: */ +	fput(event_file);  err_context:  	perf_unpin_context(ctx);  	put_ctx(ctx); diff --git a/kernel/fork.c b/kernel/fork.c index 7d5f0f118a63..2845623fb582 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1149,6 +1149,10 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)  	tty_audit_fork(sig);  	sched_autogroup_fork(sig); +#ifdef CONFIG_CGROUPS +	init_rwsem(&sig->group_rwsem); +#endif +  	sig->oom_score_adj = current->signal->oom_score_adj;  	sig->oom_score_adj_min = current->signal->oom_score_adj_min; diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 6e40a9539763..e28169dd1c36 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -83,7 +83,7 @@ int irq_set_handler_data(unsigned int irq, void *data)  	if (!desc)  		return -EINVAL; -	desc->irq_data.handler_data = data; +	desc->irq_common_data.handler_data = data;  	irq_put_desc_unlock(desc, flags);  	return 0;  } @@ -105,7 +105,7 @@ int irq_set_msi_desc_off(unsigned int irq_base, unsigned int irq_offset,  	if (!desc)  		return -EINVAL; -	desc->irq_data.msi_desc = entry; +	desc->irq_common_data.msi_desc = entry;  	if (entry && !irq_offset)  		entry->irq = irq_base;  	irq_put_desc_unlock(desc, flags); @@ -372,7 +372,6 @@ static bool irq_may_run(struct irq_desc *desc)  /**   *	handle_simple_irq - Simple and software-decoded IRQs. - *	@irq:	the interrupt number   *	@desc:	the interrupt description structure for this irq   *   *	Simple interrupts are either sent from a demultiplexing interrupt @@ -382,8 +381,7 @@ static bool irq_may_run(struct irq_desc *desc)   *	Note: The caller is expected to handle the ack, clear, mask and   *	unmask issues if necessary.   */ -void -handle_simple_irq(unsigned int irq, struct irq_desc *desc) +void handle_simple_irq(struct irq_desc *desc)  {  	raw_spin_lock(&desc->lock); @@ -425,7 +423,6 @@ static void cond_unmask_irq(struct irq_desc *desc)  /**   *	handle_level_irq - Level type irq handler - *	@irq:	the interrupt number   *	@desc:	the interrupt description structure for this irq   *   *	Level type interrupts are active as long as the hardware line has @@ -433,8 +430,7 @@ static void cond_unmask_irq(struct irq_desc *desc)   *	it after the associated handler has acknowledged the device, so the   *	interrupt line is back to inactive.   */ -void -handle_level_irq(unsigned int irq, struct irq_desc *desc) +void handle_level_irq(struct irq_desc *desc)  {  	raw_spin_lock(&desc->lock);  	mask_ack_irq(desc); @@ -496,7 +492,6 @@ static void cond_unmask_eoi_irq(struct irq_desc *desc, struct irq_chip *chip)  /**   *	handle_fasteoi_irq - irq handler for transparent controllers - *	@irq:	the interrupt number   *	@desc:	the interrupt description structure for this irq   *   *	Only a single callback will be issued to the chip: an ->eoi() @@ -504,8 +499,7 @@ static void cond_unmask_eoi_irq(struct irq_desc *desc, struct irq_chip *chip)   *	for modern forms of interrupt handlers, which handle the flow   *	details in hardware, transparently.   
*/ -void -handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc) +void handle_fasteoi_irq(struct irq_desc *desc)  {  	struct irq_chip *chip = desc->irq_data.chip; @@ -546,7 +540,6 @@ EXPORT_SYMBOL_GPL(handle_fasteoi_irq);  /**   *	handle_edge_irq - edge type IRQ handler - *	@irq:	the interrupt number   *	@desc:	the interrupt description structure for this irq   *   *	Interrupt occures on the falling and/or rising edge of a hardware @@ -560,8 +553,7 @@ EXPORT_SYMBOL_GPL(handle_fasteoi_irq);   *	the handler was running. If all pending interrupts are handled, the   *	loop is left.   */ -void -handle_edge_irq(unsigned int irq, struct irq_desc *desc) +void handle_edge_irq(struct irq_desc *desc)  {  	raw_spin_lock(&desc->lock); @@ -618,13 +610,12 @@ EXPORT_SYMBOL(handle_edge_irq);  #ifdef CONFIG_IRQ_EDGE_EOI_HANDLER  /**   *	handle_edge_eoi_irq - edge eoi type IRQ handler - *	@irq:	the interrupt number   *	@desc:	the interrupt description structure for this irq   *   * Similar as the above handle_edge_irq, but using eoi and w/o the   * mask/unmask logic.   */ -void handle_edge_eoi_irq(unsigned int irq, struct irq_desc *desc) +void handle_edge_eoi_irq(struct irq_desc *desc)  {  	struct irq_chip *chip = irq_desc_get_chip(desc); @@ -665,13 +656,11 @@ out_eoi:  /**   *	handle_percpu_irq - Per CPU local irq handler - *	@irq:	the interrupt number   *	@desc:	the interrupt description structure for this irq   *   *	Per CPU interrupts on SMP machines without locking requirements   */ -void -handle_percpu_irq(unsigned int irq, struct irq_desc *desc) +void handle_percpu_irq(struct irq_desc *desc)  {  	struct irq_chip *chip = irq_desc_get_chip(desc); @@ -688,7 +677,6 @@ handle_percpu_irq(unsigned int irq, struct irq_desc *desc)  /**   * handle_percpu_devid_irq - Per CPU local irq handler with per cpu dev ids - * @irq:	the interrupt number   * @desc:	the interrupt description structure for this irq   *   * Per CPU interrupts on SMP machines without locking requirements. Same as @@ -698,11 +686,12 @@ handle_percpu_irq(unsigned int irq, struct irq_desc *desc)   * contain the real device id for the cpu on which this handler is   * called   */ -void handle_percpu_devid_irq(unsigned int irq, struct irq_desc *desc) +void handle_percpu_devid_irq(struct irq_desc *desc)  {  	struct irq_chip *chip = irq_desc_get_chip(desc);  	struct irqaction *action = desc->action;  	void *dev_id = raw_cpu_ptr(action->percpu_dev_id); +	unsigned int irq = irq_desc_get_irq(desc);  	irqreturn_t res;  	kstat_incr_irqs_this_cpu(desc); @@ -796,7 +785,7 @@ irq_set_chained_handler_and_data(unsigned int irq, irq_flow_handler_t handle,  		return;  	__irq_do_set_handler(desc, handle, 1, NULL); -	desc->irq_data.handler_data = data; +	desc->irq_common_data.handler_data = data;  	irq_put_desc_busunlock(desc, flags);  } diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index b6eeea8a80c5..e25a83b67cce 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -22,17 +22,19 @@  /**   * handle_bad_irq - handle spurious and unhandled irqs - * @irq:       the interrupt number   * @desc:      description of the interrupt   *   * Handles spurious and unhandled IRQ's. It also prints a debugmessage.   
*/ -void handle_bad_irq(unsigned int irq, struct irq_desc *desc) +void handle_bad_irq(struct irq_desc *desc)  { +	unsigned int irq = irq_desc_get_irq(desc); +  	print_irq_desc(irq, desc);  	kstat_incr_irqs_this_cpu(desc);  	ack_bad_irq(irq);  } +EXPORT_SYMBOL_GPL(handle_bad_irq);  /*   * Special, empty irq handler: diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index eee4b385cffb..5ef0c2dbe930 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -194,7 +194,7 @@ static inline void kstat_incr_irqs_this_cpu(struct irq_desc *desc)  static inline int irq_desc_get_node(struct irq_desc *desc)  { -	return irq_data_get_node(&desc->irq_data); +	return irq_common_data_get_node(&desc->irq_common_data);  }  #ifdef CONFIG_PM_SLEEP diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 0a2a4b697bcb..239e2ae2c947 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -38,12 +38,13 @@ static void __init init_irq_default_affinity(void)  #ifdef CONFIG_SMP  static int alloc_masks(struct irq_desc *desc, gfp_t gfp, int node)  { -	if (!zalloc_cpumask_var_node(&desc->irq_data.affinity, gfp, node)) +	if (!zalloc_cpumask_var_node(&desc->irq_common_data.affinity, +				     gfp, node))  		return -ENOMEM;  #ifdef CONFIG_GENERIC_PENDING_IRQ  	if (!zalloc_cpumask_var_node(&desc->pending_mask, gfp, node)) { -		free_cpumask_var(desc->irq_data.affinity); +		free_cpumask_var(desc->irq_common_data.affinity);  		return -ENOMEM;  	}  #endif @@ -52,11 +53,13 @@ static int alloc_masks(struct irq_desc *desc, gfp_t gfp, int node)  static void desc_smp_init(struct irq_desc *desc, int node)  { -	desc->irq_data.node = node; -	cpumask_copy(desc->irq_data.affinity, irq_default_affinity); +	cpumask_copy(desc->irq_common_data.affinity, irq_default_affinity);  #ifdef CONFIG_GENERIC_PENDING_IRQ  	cpumask_clear(desc->pending_mask);  #endif +#ifdef CONFIG_NUMA +	desc->irq_common_data.node = node; +#endif  }  #else @@ -70,12 +73,13 @@ static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node,  {  	int cpu; +	desc->irq_common_data.handler_data = NULL; +	desc->irq_common_data.msi_desc = NULL; +  	desc->irq_data.common = &desc->irq_common_data;  	desc->irq_data.irq = irq;  	desc->irq_data.chip = &no_irq_chip;  	desc->irq_data.chip_data = NULL; -	desc->irq_data.handler_data = NULL; -	desc->irq_data.msi_desc = NULL;  	irq_settings_clr_and_set(desc, ~0, _IRQ_DEFAULT_INIT_FLAGS);  	irqd_set(&desc->irq_data, IRQD_IRQ_DISABLED);  	desc->handle_irq = handle_bad_irq; @@ -121,7 +125,7 @@ static void free_masks(struct irq_desc *desc)  #ifdef CONFIG_GENERIC_PENDING_IRQ  	free_cpumask_var(desc->pending_mask);  #endif -	free_cpumask_var(desc->irq_data.affinity); +	free_cpumask_var(desc->irq_common_data.affinity);  }  #else  static inline void free_masks(struct irq_desc *desc) { } @@ -343,7 +347,7 @@ int generic_handle_irq(unsigned int irq)  	if (!desc)  		return -EINVAL; -	generic_handle_irq_desc(irq, desc); +	generic_handle_irq_desc(desc);  	return 0;  }  EXPORT_SYMBOL_GPL(generic_handle_irq); diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 79baaf8a7813..dc9d27c0c158 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -844,7 +844,6 @@ static struct irq_data *irq_domain_insert_irq_data(struct irq_domain *domain,  		child->parent_data = irq_data;  		irq_data->irq = child->irq;  		irq_data->common = child->common; -		irq_data->node = child->node;  		irq_data->domain = domain;  	} diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 
ad1b064f94fe..f9a59f6cabd2 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -192,7 +192,7 @@ int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask,  	switch (ret) {  	case IRQ_SET_MASK_OK:  	case IRQ_SET_MASK_OK_DONE: -		cpumask_copy(data->affinity, mask); +		cpumask_copy(desc->irq_common_data.affinity, mask);  	case IRQ_SET_MASK_OK_NOCOPY:  		irq_set_thread_affinity(desc);  		ret = 0; @@ -304,7 +304,7 @@ static void irq_affinity_notify(struct work_struct *work)  	if (irq_move_pending(&desc->irq_data))  		irq_get_pending(cpumask, desc);  	else -		cpumask_copy(cpumask, desc->irq_data.affinity); +		cpumask_copy(cpumask, desc->irq_common_data.affinity);  	raw_spin_unlock_irqrestore(&desc->lock, flags);  	notify->notify(notify, cpumask); @@ -375,9 +375,9 @@ static int setup_affinity(struct irq_desc *desc, struct cpumask *mask)  	 * one of the targets is online.  	 */  	if (irqd_has_set(&desc->irq_data, IRQD_AFFINITY_SET)) { -		if (cpumask_intersects(desc->irq_data.affinity, +		if (cpumask_intersects(desc->irq_common_data.affinity,  				       cpu_online_mask)) -			set = desc->irq_data.affinity; +			set = desc->irq_common_data.affinity;  		else  			irqd_clear(&desc->irq_data, IRQD_AFFINITY_SET);  	} @@ -829,8 +829,8 @@ irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action)  	 * This code is triggered unconditionally. Check the affinity  	 * mask pointer. For CPU_MASK_OFFSTACK=n this is optimized out.  	 */ -	if (desc->irq_data.affinity) -		cpumask_copy(mask, desc->irq_data.affinity); +	if (desc->irq_common_data.affinity) +		cpumask_copy(mask, desc->irq_common_data.affinity);  	else  		valid = false;  	raw_spin_unlock_irq(&desc->lock); diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index 7e6512b9dc1f..be9149f62eb8 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -228,11 +228,7 @@ static void msi_domain_update_chip_ops(struct msi_domain_info *info)  {  	struct irq_chip *chip = info->chip; -	BUG_ON(!chip); -	if (!chip->irq_mask) -		chip->irq_mask = pci_msi_mask_irq; -	if (!chip->irq_unmask) -		chip->irq_unmask = pci_msi_unmask_irq; +	BUG_ON(!chip || !chip->irq_mask || !chip->irq_unmask);  	if (!chip->irq_set_affinity)  		chip->irq_set_affinity = msi_domain_set_affinity;  } diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index 0e97c142ce40..a50ddc9417ff 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -12,6 +12,7 @@  #include <linux/seq_file.h>  #include <linux/interrupt.h>  #include <linux/kernel_stat.h> +#include <linux/mutex.h>  #include "internals.h" @@ -39,7 +40,7 @@ static struct proc_dir_entry *root_irq_dir;  static int show_irq_affinity(int type, struct seq_file *m, void *v)  {  	struct irq_desc *desc = irq_to_desc((long)m->private); -	const struct cpumask *mask = desc->irq_data.affinity; +	const struct cpumask *mask = desc->irq_common_data.affinity;  #ifdef CONFIG_GENERIC_PENDING_IRQ  	if (irqd_is_setaffinity_pending(&desc->irq_data)) @@ -323,18 +324,29 @@ void register_handler_proc(unsigned int irq, struct irqaction *action)  void register_irq_proc(unsigned int irq, struct irq_desc *desc)  { +	static DEFINE_MUTEX(register_lock);  	char name [MAX_NAMELEN]; -	if (!root_irq_dir || (desc->irq_data.chip == &no_irq_chip) || desc->dir) +	if (!root_irq_dir || (desc->irq_data.chip == &no_irq_chip))  		return; +	/* +	 * irq directories are registered only when a handler is +	 * added, not when the descriptor is created, so multiple +	 * tasks might try to register at the same time. 
+	 */ +	mutex_lock(®ister_lock); + +	if (desc->dir) +		goto out_unlock; +  	memset(name, 0, MAX_NAMELEN);  	sprintf(name, "%d", irq);  	/* create /proc/irq/1234 */  	desc->dir = proc_mkdir(name, root_irq_dir);  	if (!desc->dir) -		return; +		goto out_unlock;  #ifdef CONFIG_SMP  	/* create /proc/irq/<irq>/smp_affinity */ @@ -355,6 +367,9 @@ void register_irq_proc(unsigned int irq, struct irq_desc *desc)  	proc_create_data("spurious", 0444, desc->dir,  			 &irq_spurious_proc_fops, (void *)(long)irq); + +out_unlock: +	mutex_unlock(®ister_lock);  }  void unregister_irq_proc(unsigned int irq, struct irq_desc *desc) diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c index dd95f44f99b2..b86886beee4f 100644 --- a/kernel/irq/resend.c +++ b/kernel/irq/resend.c @@ -38,7 +38,7 @@ static void resend_irqs(unsigned long arg)  		clear_bit(irq, irqs_resend);  		desc = irq_to_desc(irq);  		local_irq_disable(); -		desc->handle_irq(irq, desc); +		desc->handle_irq(desc);  		local_irq_enable();  	}  } diff --git a/kernel/kmod.c b/kernel/kmod.c index da98d0593de2..0277d1216f80 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -327,9 +327,13 @@ static void call_usermodehelper_exec_work(struct work_struct *work)  		call_usermodehelper_exec_sync(sub_info);  	} else {  		pid_t pid; - +		/* +		 * Use CLONE_PARENT to reparent it to kthreadd; we do not +		 * want to pollute current->children, and we need a parent +		 * that always ignores SIGCHLD to ensure auto-reaping. +		 */  		pid = kernel_thread(call_usermodehelper_exec_async, sub_info, -				    SIGCHLD); +				    CLONE_PARENT | SIGCHLD);  		if (pid < 0) {  			sub_info->retval = pid;  			umh_complete(sub_info); diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 8acfbf773e06..4e49cc4c9952 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -3068,7 +3068,7 @@ static int __lock_is_held(struct lockdep_map *lock);  static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,  			  int trylock, int read, int check, int hardirqs_off,  			  struct lockdep_map *nest_lock, unsigned long ip, -			  int references) +			  int references, int pin_count)  {  	struct task_struct *curr = current;  	struct lock_class *class = NULL; @@ -3157,7 +3157,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,  	hlock->waittime_stamp = 0;  	hlock->holdtime_stamp = lockstat_clock();  #endif -	hlock->pin_count = 0; +	hlock->pin_count = pin_count;  	if (check && !mark_irqflags(curr, hlock))  		return 0; @@ -3343,7 +3343,7 @@ found_it:  			hlock_class(hlock)->subclass, hlock->trylock,  				hlock->read, hlock->check, hlock->hardirqs_off,  				hlock->nest_lock, hlock->acquire_ip, -				hlock->references)) +				hlock->references, hlock->pin_count))  			return 0;  	} @@ -3433,7 +3433,7 @@ found_it:  			hlock_class(hlock)->subclass, hlock->trylock,  				hlock->read, hlock->check, hlock->hardirqs_off,  				hlock->nest_lock, hlock->acquire_ip, -				hlock->references)) +				hlock->references, hlock->pin_count))  			return 0;  	} @@ -3583,7 +3583,7 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass,  	current->lockdep_recursion = 1;  	trace_lock_acquire(lock, subclass, trylock, read, check, nest_lock, ip);  	__lock_acquire(lock, subclass, trylock, read, check, -		       irqs_disabled_flags(flags), nest_lock, ip, 0); +		       irqs_disabled_flags(flags), nest_lock, ip, 0, 0);  	current->lockdep_recursion = 0;  	raw_local_irq_restore(flags);  } diff --git a/kernel/locking/qspinlock.c 
b/kernel/locking/qspinlock.c index 337c8818541d..87e9ce6a63c5 100644 --- a/kernel/locking/qspinlock.c +++ b/kernel/locking/qspinlock.c @@ -289,7 +289,7 @@ void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)  	if (pv_enabled())  		goto queue; -	if (virt_queued_spin_lock(lock)) +	if (virt_spin_lock(lock))  		return;  	/* diff --git a/kernel/memremap.c b/kernel/memremap.c index 72b0c66628b6..9d6b55587eaa 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -24,6 +24,16 @@ __weak void __iomem *ioremap_cache(resource_size_t offset, unsigned long size)  }  #endif +static void *try_ram_remap(resource_size_t offset, size_t size) +{ +	struct page *page = pfn_to_page(offset >> PAGE_SHIFT); + +	/* In the simple case just return the existing linear address */ +	if (!PageHighMem(page)) +		return __va(offset); +	return NULL; /* fallback to ioremap_cache */ +} +  /**   * memremap() - remap an iomem_resource as cacheable memory   * @offset: iomem resource start address @@ -66,8 +76,8 @@ void *memremap(resource_size_t offset, size_t size, unsigned long flags)  		 * the requested range is potentially in "System RAM"  		 */  		if (is_ram == REGION_INTERSECTS) -			addr = __va(offset); -		else +			addr = try_ram_remap(offset, size); +		if (!addr)  			addr = ioremap_cache(offset, size);  	} diff --git a/kernel/module.c b/kernel/module.c index b86b7bf1be38..8f051a106676 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1063,11 +1063,15 @@ void symbol_put_addr(void *addr)  	if (core_kernel_text(a))  		return; -	/* module_text_address is safe here: we're supposed to have reference -	 * to module from symbol_get, so it can't go away. */ +	/* +	 * Even though we hold a reference on the module; we still need to +	 * disable preemption in order to safely traverse the data structure. 
+	 */ +	preempt_disable();  	modaddr = __module_text_address(a);  	BUG_ON(!modaddr);  	module_put(modaddr); +	preempt_enable();  }  EXPORT_SYMBOL_GPL(symbol_put_addr); diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 9f75f25cc5d9..775d36cc0050 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3868,6 +3868,7 @@ static void rcu_init_new_rnp(struct rcu_node *rnp_leaf)  static void __init  rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)  { +	static struct lock_class_key rcu_exp_sched_rdp_class;  	unsigned long flags;  	struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);  	struct rcu_node *rnp = rcu_get_root(rsp); @@ -3883,6 +3884,10 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)  	mutex_init(&rdp->exp_funnel_mutex);  	rcu_boot_init_nocb_percpu_data(rdp);  	raw_spin_unlock_irqrestore(&rnp->lock, flags); +	if (rsp == &rcu_sched_state) +		lockdep_set_class_and_name(&rdp->exp_funnel_mutex, +					   &rcu_exp_sched_rdp_class, +					   "rcu_data_exp_sched");  }  /* diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 3595403921bd..bcd214e4b4d6 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -621,18 +621,21 @@ int get_nohz_timer_target(void)  	int i, cpu = smp_processor_id();  	struct sched_domain *sd; -	if (!idle_cpu(cpu)) +	if (!idle_cpu(cpu) && is_housekeeping_cpu(cpu))  		return cpu;  	rcu_read_lock();  	for_each_domain(cpu, sd) {  		for_each_cpu(i, sched_domain_span(sd)) { -			if (!idle_cpu(i)) { +			if (!idle_cpu(i) && is_housekeeping_cpu(cpu)) {  				cpu = i;  				goto unlock;  			}  		}  	} + +	if (!is_housekeeping_cpu(cpu)) +		cpu = housekeeping_any_cpu();  unlock:  	rcu_read_unlock();  	return cpu; @@ -2363,8 +2366,15 @@ void wake_up_new_task(struct task_struct *p)  	trace_sched_wakeup_new(p);  	check_preempt_curr(rq, p, WF_FORK);  #ifdef CONFIG_SMP -	if (p->sched_class->task_woken) +	if (p->sched_class->task_woken) { +		/* +		 * Nothing relies on rq->lock after this, so its fine to +		 * drop it. +		 */ +		lockdep_unpin_lock(&rq->lock);  		p->sched_class->task_woken(rq, p); +		lockdep_pin_lock(&rq->lock); +	}  #endif  	task_rq_unlock(rq, p, &flags);  } @@ -2514,11 +2524,11 @@ static struct rq *finish_task_switch(struct task_struct *prev)  	 * If a task dies, then it sets TASK_DEAD in tsk->state and calls  	 * schedule one last time. The schedule call will never return, and  	 * the scheduled task must drop that reference. -	 * The test for TASK_DEAD must occur while the runqueue locks are -	 * still held, otherwise prev could be scheduled on another cpu, die -	 * there before we look at prev->state, and then the reference would -	 * be dropped twice. -	 *		Manfred Spraul <manfred@colorfullife.com> +	 * +	 * We must observe prev->state before clearing prev->on_cpu (in +	 * finish_lock_switch), otherwise a concurrent wakeup can get prev +	 * running on another CPU and we could rave with its RUNNING -> DEAD +	 * transition, resulting in a double drop.  	 */  	prev_state = prev->state;  	vtime_task_switch(prev); @@ -2666,13 +2676,20 @@ unsigned long nr_running(void)  /*   * Check if only the current task is running on the cpu. + * + * Caution: this function does not check that the caller has disabled + * preemption, thus the result might have a time-of-check-to-time-of-use + * race.  The caller is responsible to use it correctly, for example: + * + * - from a non-preemptable section (of course) + * + * - from a thread that is bound to a single CPU + * + * - in a loop with very short iterations (e.g. 
a polling loop)   */  bool single_task_running(void)  { -	if (cpu_rq(smp_processor_id())->nr_running == 1) -		return true; -	else -		return false; +	return raw_rq()->nr_running == 1;  }  EXPORT_SYMBOL(single_task_running); @@ -4924,7 +4941,15 @@ void init_idle(struct task_struct *idle, int cpu)  	idle->state = TASK_RUNNING;  	idle->se.exec_start = sched_clock(); -	do_set_cpus_allowed(idle, cpumask_of(cpu)); +#ifdef CONFIG_SMP +	/* +	 * Its possible that init_idle() gets called multiple times on a task, +	 * in that case do_set_cpus_allowed() will not do the right thing. +	 * +	 * And since this is boot we can forgo the serialization. +	 */ +	set_cpus_allowed_common(idle, cpumask_of(cpu)); +#endif  	/*  	 * We're having a chicken and egg problem, even though we are  	 * holding rq->lock, the cpu isn't yet set to this cpu so the @@ -4941,7 +4966,7 @@ void init_idle(struct task_struct *idle, int cpu)  	rq->curr = rq->idle = idle;  	idle->on_rq = TASK_ON_RQ_QUEUED; -#if defined(CONFIG_SMP) +#ifdef CONFIG_SMP  	idle->on_cpu = 1;  #endif  	raw_spin_unlock(&rq->lock); @@ -4956,7 +4981,7 @@ void init_idle(struct task_struct *idle, int cpu)  	idle->sched_class = &idle_sched_class;  	ftrace_graph_init_idle_task(idle, cpu);  	vtime_init_idle(idle, cpu); -#if defined(CONFIG_SMP) +#ifdef CONFIG_SMP  	sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu);  #endif  } @@ -5178,24 +5203,47 @@ static void migrate_tasks(struct rq *dead_rq)  			break;  		/* -		 * Ensure rq->lock covers the entire task selection -		 * until the migration. +		 * pick_next_task assumes pinned rq->lock.  		 */  		lockdep_pin_lock(&rq->lock);  		next = pick_next_task(rq, &fake_task);  		BUG_ON(!next);  		next->sched_class->put_prev_task(rq, next); +		/* +		 * Rules for changing task_struct::cpus_allowed are holding +		 * both pi_lock and rq->lock, such that holding either +		 * stabilizes the mask. +		 * +		 * Drop rq->lock is not quite as disastrous as it usually is +		 * because !cpu_active at this point, which means load-balance +		 * will not interfere. Also, stop-machine. +		 */ +		lockdep_unpin_lock(&rq->lock); +		raw_spin_unlock(&rq->lock); +		raw_spin_lock(&next->pi_lock); +		raw_spin_lock(&rq->lock); + +		/* +		 * Since we're inside stop-machine, _nothing_ should have +		 * changed the task, WARN if weird stuff happened, because in +		 * that case the above rq->lock drop is a fail too. +		 */ +		if (WARN_ON(task_rq(next) != rq || !task_on_rq_queued(next))) { +			raw_spin_unlock(&next->pi_lock); +			continue; +		} +  		/* Find suitable destination for @next, with force if needed. */  		dest_cpu = select_fallback_rq(dead_rq->cpu, next); -		lockdep_unpin_lock(&rq->lock);  		rq = __migrate_task(rq, next, dest_cpu);  		if (rq != dead_rq) {  			raw_spin_unlock(&rq->lock);  			rq = dead_rq;  			raw_spin_lock(&rq->lock);  		} +		raw_spin_unlock(&next->pi_lock);  	}  	rq->stop = stop; @@ -7197,9 +7245,6 @@ void __init sched_init_smp(void)  	alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);  	alloc_cpumask_var(&fallback_doms, GFP_KERNEL); -	/* nohz_full won't take effect without isolating the cpus. */ -	tick_nohz_full_add_cpus_to(cpu_isolated_map); -  	sched_init_numa();  	/* diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index fc8f01083527..8b0a15e285f9 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -668,8 +668,15 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)  	 * Queueing this task back might have overloaded rq, check if we need  	 * to kick someone away.  	 
*/ -	if (has_pushable_dl_tasks(rq)) +	if (has_pushable_dl_tasks(rq)) { +		/* +		 * Nothing relies on rq->lock after this, so its safe to drop +		 * rq->lock. +		 */ +		lockdep_unpin_lock(&rq->lock);  		push_dl_task(rq); +		lockdep_pin_lock(&rq->lock); +	}  #endif  unlock: @@ -1066,8 +1073,9 @@ select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags)  		int target = find_later_rq(p);  		if (target != -1 && -				dl_time_before(p->dl.deadline, -					cpu_rq(target)->dl.earliest_dl.curr)) +				(dl_time_before(p->dl.deadline, +					cpu_rq(target)->dl.earliest_dl.curr) || +				(cpu_rq(target)->dl.dl_nr_running == 0)))  			cpu = target;  	}  	rcu_read_unlock(); @@ -1417,7 +1425,8 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq)  		later_rq = cpu_rq(cpu); -		if (!dl_time_before(task->dl.deadline, +		if (later_rq->dl.dl_nr_running && +		    !dl_time_before(task->dl.deadline,  					later_rq->dl.earliest_dl.curr)) {  			/*  			 * Target rq has tasks of equal or earlier deadline, diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 6e2e3483b1ec..9a5e60fe721a 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2363,7 +2363,7 @@ static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq)  	 */  	tg_weight = atomic_long_read(&tg->load_avg);  	tg_weight -= cfs_rq->tg_load_avg_contrib; -	tg_weight += cfs_rq_load_avg(cfs_rq); +	tg_weight += cfs_rq->load.weight;  	return tg_weight;  } @@ -2373,7 +2373,7 @@ static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)  	long tg_weight, load, shares;  	tg_weight = calc_tg_weight(tg, cfs_rq); -	load = cfs_rq_load_avg(cfs_rq); +	load = cfs_rq->load.weight;  	shares = (tg->shares * load);  	if (tg_weight) @@ -2664,13 +2664,14 @@ static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);  /* Group cfs_rq's load_avg is used for task_h_load and update_cfs_share */  static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)  { -	int decayed;  	struct sched_avg *sa = &cfs_rq->avg; +	int decayed, removed = 0;  	if (atomic_long_read(&cfs_rq->removed_load_avg)) {  		long r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0);  		sa->load_avg = max_t(long, sa->load_avg - r, 0);  		sa->load_sum = max_t(s64, sa->load_sum - r * LOAD_AVG_MAX, 0); +		removed = 1;  	}  	if (atomic_long_read(&cfs_rq->removed_util_avg)) { @@ -2688,7 +2689,7 @@ static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)  	cfs_rq->load_last_update_time_copy = sa->last_update_time;  #endif -	return decayed; +	return decayed || removed;  }  /* Update task and its cfs_rq load average */ diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 8f177c73ae19..4a2ef5a02fd3 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -57,9 +57,11 @@ static inline int cpu_idle_poll(void)  	rcu_idle_enter();  	trace_cpu_idle_rcuidle(0, smp_processor_id());  	local_irq_enable(); +	stop_critical_timings();  	while (!tif_need_resched() &&  		(cpu_idle_force_poll || tick_check_broadcast_expired()))  		cpu_relax(); +	start_critical_timings();  	trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());  	rcu_idle_exit();  	return 1; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 68cda117574c..6d2a119c7ad9 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1078,9 +1078,10 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)  	 * After ->on_cpu is cleared, the task can be moved to a different CPU.  	 
* We must ensure this doesn't happen until the switch is completely  	 * finished. +	 * +	 * Pairs with the control dependency and rmb in try_to_wake_up().  	 */ -	smp_wmb(); -	prev->on_cpu = 0; +	smp_store_release(&prev->on_cpu, 0);  #endif  #ifdef CONFIG_DEBUG_SPINLOCK  	/* this is a valid case when another task releases the spinlock */ diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index 272d9322bc5d..052e02672d12 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -106,10 +106,9 @@ void __wake_up_locked(wait_queue_head_t *q, unsigned int mode, int nr)  }  EXPORT_SYMBOL_GPL(__wake_up_locked); -void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, int nr, -			  void *key) +void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key)  { -	__wake_up_common(q, mode, nr, 0, key); +	__wake_up_common(q, mode, 1, 0, key);  }  EXPORT_SYMBOL_GPL(__wake_up_locked_key); @@ -284,7 +283,7 @@ void abort_exclusive_wait(wait_queue_head_t *q, wait_queue_t *wait,  	if (!list_empty(&wait->task_list))  		list_del_init(&wait->task_list);  	else if (waitqueue_active(q)) -		__wake_up_locked_key(q, mode, 1, key); +		__wake_up_locked_key(q, mode, key);  	spin_unlock_irqrestore(&q->lock, flags);  }  EXPORT_SYMBOL(abort_exclusive_wait); diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 50eb107f1198..a9b76a40319e 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -97,20 +97,6 @@ EXPORT_SYMBOL_GPL(clockevent_delta2ns);  static int __clockevents_switch_state(struct clock_event_device *dev,  				      enum clock_event_state state)  { -	/* Transition with legacy set_mode() callback */ -	if (dev->set_mode) { -		/* Legacy callback doesn't support new modes */ -		if (state > CLOCK_EVT_STATE_ONESHOT) -			return -ENOSYS; -		/* -		 * 'clock_event_state' and 'clock_event_mode' have 1-to-1 -		 * mapping until *_ONESHOT, and so a simple cast will work. 
-		 */ -		dev->set_mode((enum clock_event_mode)state, dev); -		dev->mode = (enum clock_event_mode)state; -		return 0; -	} -  	if (dev->features & CLOCK_EVT_FEAT_DUMMY)  		return 0; @@ -204,12 +190,8 @@ int clockevents_tick_resume(struct clock_event_device *dev)  {  	int ret = 0; -	if (dev->set_mode) { -		dev->set_mode(CLOCK_EVT_MODE_RESUME, dev); -		dev->mode = CLOCK_EVT_MODE_RESUME; -	} else if (dev->tick_resume) { +	if (dev->tick_resume)  		ret = dev->tick_resume(dev); -	}  	return ret;  } @@ -460,26 +442,6 @@ int clockevents_unbind_device(struct clock_event_device *ced, int cpu)  }  EXPORT_SYMBOL_GPL(clockevents_unbind_device); -/* Sanity check of state transition callbacks */ -static int clockevents_sanity_check(struct clock_event_device *dev) -{ -	/* Legacy set_mode() callback */ -	if (dev->set_mode) { -		/* We shouldn't be supporting new modes now */ -		WARN_ON(dev->set_state_periodic || dev->set_state_oneshot || -			dev->set_state_shutdown || dev->tick_resume || -			dev->set_state_oneshot_stopped); - -		BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED); -		return 0; -	} - -	if (dev->features & CLOCK_EVT_FEAT_DUMMY) -		return 0; - -	return 0; -} -  /**   * clockevents_register_device - register a clock event device   * @dev:	device to register @@ -488,8 +450,6 @@ void clockevents_register_device(struct clock_event_device *dev)  {  	unsigned long flags; -	BUG_ON(clockevents_sanity_check(dev)); -  	/* Initialize state to DETACHED */  	clockevent_set_state(dev, CLOCK_EVT_STATE_DETACHED); diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 841b72f720e8..3a38775b50c2 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -217,7 +217,7 @@ static void clocksource_watchdog(unsigned long data)  			continue;  		/* Check the deviation from the watchdog clocksource. */ -		if ((abs(cs_nsec - wd_nsec) > WATCHDOG_THRESHOLD)) { +		if (abs64(cs_nsec - wd_nsec) > WATCHDOG_THRESHOLD) {  			pr_warn("timekeeping watchdog: Marking clocksource '%s' as unstable because the skew is too large:\n",  				cs->name);  			pr_warn("                      '%s' wd_now: %llx wd_last: %llx mask: %llx\n", diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index d11c55b6ab7d..4fcd99e12aa0 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -398,7 +398,6 @@ void tick_shutdown(unsigned int cpu)  		 * the set mode function!  		 */  		clockevent_set_state(dev, CLOCK_EVT_STATE_DETACHED); -		dev->mode = CLOCK_EVT_MODE_UNUSED;  		clockevents_exchange_device(dev, NULL);  		dev->event_handler = clockevents_handle_noop;  		td->evtdev = NULL; diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 3319e16f31e5..7c7ec4515983 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -290,16 +290,17 @@ static int __init tick_nohz_full_setup(char *str)  __setup("nohz_full=", tick_nohz_full_setup);  static int tick_nohz_cpu_down_callback(struct notifier_block *nfb, -						 unsigned long action, -						 void *hcpu) +				       unsigned long action, +				       void *hcpu)  {  	unsigned int cpu = (unsigned long)hcpu;  	switch (action & ~CPU_TASKS_FROZEN) {  	case CPU_DOWN_PREPARE:  		/* -		 * If we handle the timekeeping duty for full dynticks CPUs, -		 * we can't safely shutdown that CPU. +		 * The boot CPU handles housekeeping duty (unbound timers, +		 * workqueues, timekeeping, ...) on behalf of full dynticks +		 * CPUs. It must remain online when nohz full is enabled.  		 
*/  		if (tick_nohz_full_running && tick_do_timer_cpu == cpu)  			return NOTIFY_BAD; @@ -370,6 +371,12 @@ void __init tick_nohz_init(void)  	cpu_notifier(tick_nohz_cpu_down_callback, 0);  	pr_info("NO_HZ: Full dynticks CPUs: %*pbl.\n",  		cpumask_pr_args(tick_nohz_full_mask)); + +	/* +	 * We need at least one CPU to handle housekeeping work such +	 * as timekeeping, unbound timers, workqueues, ... +	 */ +	WARN_ON_ONCE(cpumask_empty(housekeeping_mask));  }  #endif diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index f6ee2e6b6f5d..44d2cc0436f4 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1251,7 +1251,7 @@ void __init timekeeping_init(void)  	set_normalized_timespec64(&tmp, -boot.tv_sec, -boot.tv_nsec);  	tk_set_wall_to_mono(tk, tmp); -	timekeeping_update(tk, TK_MIRROR); +	timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET);  	write_seqcount_end(&tk_core.seq);  	raw_spin_unlock_irqrestore(&timekeeper_lock, flags); @@ -1614,7 +1614,7 @@ static __always_inline void timekeeping_freqadjust(struct timekeeper *tk,  	negative = (tick_error < 0);  	/* Sort out the magnitude of the correction */ -	tick_error = abs(tick_error); +	tick_error = abs64(tick_error);  	for (adj = 0; tick_error > interval; adj++)  		tick_error >>= 1; diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index 129c96033e46..f75e35b60149 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c @@ -225,7 +225,7 @@ print_tickdevice(struct seq_file *m, struct tick_device *td, int cpu)  		   (unsigned long long) dev->min_delta_ns);  	SEQ_printf(m, " mult:           %u\n", dev->mult);  	SEQ_printf(m, " shift:          %u\n", dev->shift); -	SEQ_printf(m, " mode:           %d\n", dev->mode); +	SEQ_printf(m, " mode:           %d\n", clockevent_get_state(dev));  	SEQ_printf(m, " next_event:     %Ld nsecs\n",  		   (unsigned long long) ktime_to_ns(dev->next_event)); @@ -233,40 +233,34 @@ print_tickdevice(struct seq_file *m, struct tick_device *td, int cpu)  	print_name_offset(m, dev->set_next_event);  	SEQ_printf(m, "\n"); -	if (dev->set_mode) { -		SEQ_printf(m, " set_mode:       "); -		print_name_offset(m, dev->set_mode); +	if (dev->set_state_shutdown) { +		SEQ_printf(m, " shutdown: "); +		print_name_offset(m, dev->set_state_shutdown);  		SEQ_printf(m, "\n"); -	} else { -		if (dev->set_state_shutdown) { -			SEQ_printf(m, " shutdown: "); -			print_name_offset(m, dev->set_state_shutdown); -			SEQ_printf(m, "\n"); -		} +	} -		if (dev->set_state_periodic) { -			SEQ_printf(m, " periodic: "); -			print_name_offset(m, dev->set_state_periodic); -			SEQ_printf(m, "\n"); -		} +	if (dev->set_state_periodic) { +		SEQ_printf(m, " periodic: "); +		print_name_offset(m, dev->set_state_periodic); +		SEQ_printf(m, "\n"); +	} -		if (dev->set_state_oneshot) { -			SEQ_printf(m, " oneshot:  "); -			print_name_offset(m, dev->set_state_oneshot); -			SEQ_printf(m, "\n"); -		} +	if (dev->set_state_oneshot) { +		SEQ_printf(m, " oneshot:  "); +		print_name_offset(m, dev->set_state_oneshot); +		SEQ_printf(m, "\n"); +	} -		if (dev->set_state_oneshot_stopped) { -			SEQ_printf(m, " oneshot stopped: "); -			print_name_offset(m, dev->set_state_oneshot_stopped); -			SEQ_printf(m, "\n"); -		} +	if (dev->set_state_oneshot_stopped) { +		SEQ_printf(m, " oneshot stopped: "); +		print_name_offset(m, dev->set_state_oneshot_stopped); +		SEQ_printf(m, "\n"); +	} -		if (dev->tick_resume) { -			SEQ_printf(m, " resume:   "); -			print_name_offset(m, dev->tick_resume); -			SEQ_printf(m, "\n"); -		} +	if 
(dev->tick_resume) { +		SEQ_printf(m, " resume:   "); +		print_name_offset(m, dev->tick_resume); +		SEQ_printf(m, "\n");  	}  	SEQ_printf(m, " event_handler:  "); diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index b746399ab59c..8abf1ba18085 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -85,9 +85,19 @@ check_stack(unsigned long ip, unsigned long *stack)  	if (!object_is_on_stack(stack))  		return; +	/* Can't do this from NMI context (can cause deadlocks) */ +	if (in_nmi()) +		return; +  	local_irq_save(flags);  	arch_spin_lock(&max_stack_lock); +	/* +	 * RCU may not be watching, make it see us. +	 * The stack trace code uses rcu_sched. +	 */ +	rcu_irq_enter(); +  	/* In case another CPU set the tracer_frame on us */  	if (unlikely(!frame_size))  		this_size -= tracer_frame; @@ -169,6 +179,7 @@ check_stack(unsigned long ip, unsigned long *stack)  	}   out: +	rcu_irq_exit();  	arch_spin_unlock(&max_stack_lock);  	local_irq_restore(flags);  } diff --git a/kernel/workqueue.c b/kernel/workqueue.c index ca71582fcfab..bcb14cafe007 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1458,13 +1458,13 @@ static void __queue_delayed_work(int cpu, struct workqueue_struct *wq,  	timer_stats_timer_set_start_info(&dwork->timer);  	dwork->wq = wq; +	/* timer isn't guaranteed to run in this cpu, record earlier */ +	if (cpu == WORK_CPU_UNBOUND) +		cpu = raw_smp_processor_id();  	dwork->cpu = cpu;  	timer->expires = jiffies + delay; -	if (unlikely(cpu != WORK_CPU_UNBOUND)) -		add_timer_on(timer, cpu); -	else -		add_timer(timer); +	add_timer_on(timer, cpu);  }  /**  | 