Diffstat (limited to 'kernel')
130 files changed, 6968 insertions, 3176 deletions
diff --git a/kernel/Makefile b/kernel/Makefile index ef0d95a190b4..daad787fb795 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -58,12 +58,14 @@ endif  obj-$(CONFIG_UID16) += uid16.o  obj-$(CONFIG_MODULES) += module.o  obj-$(CONFIG_MODULE_SIG) += module_signing.o +obj-$(CONFIG_MODULE_SIG_FORMAT) += module_signature.o  obj-$(CONFIG_KALLSYMS) += kallsyms.o  obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o  obj-$(CONFIG_CRASH_CORE) += crash_core.o  obj-$(CONFIG_KEXEC_CORE) += kexec_core.o  obj-$(CONFIG_KEXEC) += kexec.o  obj-$(CONFIG_KEXEC_FILE) += kexec_file.o +obj-$(CONFIG_KEXEC_ELF) += kexec_elf.o  obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o  obj-$(CONFIG_COMPAT) += compat.o  obj-$(CONFIG_CGROUPS) += cgroup/ @@ -126,7 +128,7 @@ $(obj)/config_data.gz: $(KCONFIG_CONFIG) FORCE  $(obj)/kheaders.o: $(obj)/kheaders_data.tar.xz  quiet_cmd_genikh = CHK     $(obj)/kheaders_data.tar.xz -cmd_genikh = $(CONFIG_SHELL) $(srctree)/kernel/gen_kheaders.sh $@ +      cmd_genikh = $(BASH) $(srctree)/kernel/gen_kheaders.sh $@  $(obj)/kheaders_data.tar.xz: FORCE  	$(call cmd,genikh) diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 29d781061cd5..e1d9adb212f9 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -22,3 +22,6 @@ obj-$(CONFIG_CGROUP_BPF) += cgroup.o  ifeq ($(CONFIG_INET),y)  obj-$(CONFIG_BPF_SYSCALL) += reuseport_array.o  endif +ifeq ($(CONFIG_SYSFS),y) +obj-$(CONFIG_DEBUG_INFO_BTF) += sysfs_btf.o +endif diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 5fcc7a17eb5a..29c7c06c6bd6 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -195,8 +195,8 @@  	     i < btf_type_vlen(struct_type);					\  	     i++, member++) -static DEFINE_IDR(btf_idr); -static DEFINE_SPINLOCK(btf_idr_lock); +DEFINE_IDR(btf_idr); +DEFINE_SPINLOCK(btf_idr_lock);  struct btf {  	void *data; @@ -2332,7 +2332,7 @@ static int btf_enum_check_kflag_member(struct btf_verifier_env *env,  		if (BITS_PER_BYTE_MASKED(struct_bits_off)) {  			btf_verifier_log_member(env, struct_type, member,  						"Member is not byte aligned"); -				return -EINVAL; +			return -EINVAL;  		}  		nr_bits = int_bitsize; @@ -2377,9 +2377,8 @@ static s32 btf_enum_check_meta(struct btf_verifier_env *env,  		return -EINVAL;  	} -	if (t->size != sizeof(int)) { -		btf_verifier_log_type(env, t, "Expected size:%zu", -				      sizeof(int)); +	if (t->size > 8 || !is_power_of_2(t->size)) { +		btf_verifier_log_type(env, t, "Unexpected size");  		return -EINVAL;  	} @@ -3376,6 +3375,15 @@ void btf_type_seq_show(const struct btf *btf, u32 type_id, void *obj,  	btf_type_ops(t)->seq_show(btf, t, type_id, obj, 0, m);  } +#ifdef CONFIG_PROC_FS +static void bpf_btf_show_fdinfo(struct seq_file *m, struct file *filp) +{ +	const struct btf *btf = filp->private_data; + +	seq_printf(m, "btf_id:\t%u\n", btf->id); +} +#endif +  static int btf_release(struct inode *inode, struct file *filp)  {  	btf_put(filp->private_data); @@ -3383,6 +3391,9 @@ static int btf_release(struct inode *inode, struct file *filp)  }  const struct file_operations btf_fops = { +#ifdef CONFIG_PROC_FS +	.show_fdinfo	= bpf_btf_show_fdinfo, +#endif  	.release	= btf_release,  }; diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 0a00eaca6fae..ddd8addcdb5c 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -964,7 +964,6 @@ static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen)  		return -ENOMEM;  	ctx->optval_end = ctx->optval + max_optlen; -	ctx->optlen = max_optlen;  	return 0;  } @@ -984,7 +983,7 @@ int 
__cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,  		.level = *level,  		.optname = *optname,  	}; -	int ret; +	int ret, max_optlen;  	/* Opportunistic check to see whether we have any BPF program  	 * attached to the hook so we don't waste time allocating @@ -994,10 +993,18 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,  	    __cgroup_bpf_prog_array_is_empty(cgrp, BPF_CGROUP_SETSOCKOPT))  		return 0; -	ret = sockopt_alloc_buf(&ctx, *optlen); +	/* Allocate a bit more than the initial user buffer for +	 * BPF program. The canonical use case is overriding +	 * TCP_CONGESTION(nv) to TCP_CONGESTION(cubic). +	 */ +	max_optlen = max_t(int, 16, *optlen); + +	ret = sockopt_alloc_buf(&ctx, max_optlen);  	if (ret)  		return ret; +	ctx.optlen = *optlen; +  	if (copy_from_user(ctx.optval, optval, *optlen) != 0) {  		ret = -EFAULT;  		goto out; @@ -1016,7 +1023,7 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,  	if (ctx.optlen == -1) {  		/* optlen set to -1, bypass kernel */  		ret = 1; -	} else if (ctx.optlen > *optlen || ctx.optlen < -1) { +	} else if (ctx.optlen > max_optlen || ctx.optlen < -1) {  		/* optlen is out of bounds */  		ret = -EFAULT;  	} else { @@ -1063,6 +1070,8 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,  	if (ret)  		return ret; +	ctx.optlen = max_optlen; +  	if (!retval) {  		/* If kernel getsockopt finished successfully,  		 * copy whatever was returned to the user back @@ -1325,6 +1334,7 @@ static u32 sysctl_convert_ctx_access(enum bpf_access_type type,  				     struct bpf_prog *prog, u32 *target_size)  {  	struct bpf_insn *insn = insn_buf; +	u32 read_size;  	switch (si->off) {  	case offsetof(struct bpf_sysctl, write): @@ -1356,7 +1366,9 @@ static u32 sysctl_convert_ctx_access(enum bpf_access_type type,  				treg, si->dst_reg,  				offsetof(struct bpf_sysctl_kern, ppos));  			*insn++ = BPF_STX_MEM( -				BPF_SIZEOF(u32), treg, si->src_reg, 0); +				BPF_SIZEOF(u32), treg, si->src_reg, +				bpf_ctx_narrow_access_offset( +					0, sizeof(u32), sizeof(loff_t)));  			*insn++ = BPF_LDX_MEM(  				BPF_DW, treg, si->dst_reg,  				offsetof(struct bpf_sysctl_kern, tmp_reg)); @@ -1365,8 +1377,11 @@ static u32 sysctl_convert_ctx_access(enum bpf_access_type type,  				BPF_FIELD_SIZEOF(struct bpf_sysctl_kern, ppos),  				si->dst_reg, si->src_reg,  				offsetof(struct bpf_sysctl_kern, ppos)); +			read_size = bpf_size_to_bytes(BPF_SIZE(si->code));  			*insn++ = BPF_LDX_MEM( -				BPF_SIZE(si->code), si->dst_reg, si->dst_reg, 0); +				BPF_SIZE(si->code), si->dst_reg, si->dst_reg, +				bpf_ctx_narrow_access_offset( +					0, read_size, sizeof(loff_t)));  		}  		*target_size = sizeof(u32);  		break; diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index d83cf8ccc872..d27f3b60ff6d 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -37,6 +37,12 @@   * notifier hook walks the map we know that new dev references can not be   * added by the user because core infrastructure ensures dev_get_by_index()   * calls will fail at this point. + * + * The devmap_hash type is a map type which interprets keys as ifindexes and + * indexes these using a hashmap. This allows maps that use ifindex as key to be + * densely packed instead of having holes in the lookup array for unused + * ifindexes. The setup and packet enqueue/send code is shared between the two + * types of devmap; only the lookup and insertion is different.   
*/  #include <linux/bpf.h>  #include <net/xdp.h> @@ -59,10 +65,11 @@ struct xdp_bulk_queue {  struct bpf_dtab_netdev {  	struct net_device *dev; /* must be first member, due to tracepoint */ +	struct hlist_node index_hlist;  	struct bpf_dtab *dtab; -	unsigned int bit;  	struct xdp_bulk_queue __percpu *bulkq;  	struct rcu_head rcu; +	unsigned int idx; /* keep track of map index for tracepoint */  };  struct bpf_dtab { @@ -70,33 +77,45 @@ struct bpf_dtab {  	struct bpf_dtab_netdev **netdev_map;  	struct list_head __percpu *flush_list;  	struct list_head list; + +	/* these are only used for DEVMAP_HASH type maps */ +	struct hlist_head *dev_index_head; +	spinlock_t index_lock; +	unsigned int items; +	u32 n_buckets;  };  static DEFINE_SPINLOCK(dev_map_lock);  static LIST_HEAD(dev_map_list); -static struct bpf_map *dev_map_alloc(union bpf_attr *attr) +static struct hlist_head *dev_map_create_hash(unsigned int entries) +{ +	int i; +	struct hlist_head *hash; + +	hash = kmalloc_array(entries, sizeof(*hash), GFP_KERNEL); +	if (hash != NULL) +		for (i = 0; i < entries; i++) +			INIT_HLIST_HEAD(&hash[i]); + +	return hash; +} + +static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr)  { -	struct bpf_dtab *dtab;  	int err, cpu;  	u64 cost; -	if (!capable(CAP_NET_ADMIN)) -		return ERR_PTR(-EPERM); -  	/* check sanity of attributes */  	if (attr->max_entries == 0 || attr->key_size != 4 ||  	    attr->value_size != 4 || attr->map_flags & ~DEV_CREATE_FLAG_MASK) -		return ERR_PTR(-EINVAL); +		return -EINVAL;  	/* Lookup returns a pointer straight to dev->ifindex, so make sure the  	 * verifier prevents writes from the BPF side  	 */  	attr->map_flags |= BPF_F_RDONLY_PROG; -	dtab = kzalloc(sizeof(*dtab), GFP_USER); -	if (!dtab) -		return ERR_PTR(-ENOMEM);  	bpf_map_init_from_attr(&dtab->map, attr); @@ -104,12 +123,18 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr)  	cost = (u64) dtab->map.max_entries * sizeof(struct bpf_dtab_netdev *);  	cost += sizeof(struct list_head) * num_possible_cpus(); +	if (attr->map_type == BPF_MAP_TYPE_DEVMAP_HASH) { +		dtab->n_buckets = roundup_pow_of_two(dtab->map.max_entries); + +		if (!dtab->n_buckets) /* Overflow check */ +			return -EINVAL; +		cost += sizeof(struct hlist_head) * dtab->n_buckets; +	} +  	/* if map size is larger than memlock limit, reject it */  	err = bpf_map_charge_init(&dtab->map.memory, cost);  	if (err) -		goto free_dtab; - -	err = -ENOMEM; +		return -EINVAL;  	dtab->flush_list = alloc_percpu(struct list_head);  	if (!dtab->flush_list) @@ -124,19 +149,48 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr)  	if (!dtab->netdev_map)  		goto free_percpu; -	spin_lock(&dev_map_lock); -	list_add_tail_rcu(&dtab->list, &dev_map_list); -	spin_unlock(&dev_map_lock); +	if (attr->map_type == BPF_MAP_TYPE_DEVMAP_HASH) { +		dtab->dev_index_head = dev_map_create_hash(dtab->n_buckets); +		if (!dtab->dev_index_head) +			goto free_map_area; -	return &dtab->map; +		spin_lock_init(&dtab->index_lock); +	} +	return 0; + +free_map_area: +	bpf_map_area_free(dtab->netdev_map);  free_percpu:  	free_percpu(dtab->flush_list);  free_charge:  	bpf_map_charge_finish(&dtab->map.memory); -free_dtab: -	kfree(dtab); -	return ERR_PTR(err); +	return -ENOMEM; +} + +static struct bpf_map *dev_map_alloc(union bpf_attr *attr) +{ +	struct bpf_dtab *dtab; +	int err; + +	if (!capable(CAP_NET_ADMIN)) +		return ERR_PTR(-EPERM); + +	dtab = kzalloc(sizeof(*dtab), GFP_USER); +	if (!dtab) +		return ERR_PTR(-ENOMEM); + +	err = dev_map_init_map(dtab, attr); +	if (err) 
{ +		kfree(dtab); +		return ERR_PTR(err); +	} + +	spin_lock(&dev_map_lock); +	list_add_tail_rcu(&dtab->list, &dev_map_list); +	spin_unlock(&dev_map_lock); + +	return &dtab->map;  }  static void dev_map_free(struct bpf_map *map) @@ -188,6 +242,7 @@ static void dev_map_free(struct bpf_map *map)  	free_percpu(dtab->flush_list);  	bpf_map_area_free(dtab->netdev_map); +	kfree(dtab->dev_index_head);  	kfree(dtab);  } @@ -208,6 +263,70 @@ static int dev_map_get_next_key(struct bpf_map *map, void *key, void *next_key)  	return 0;  } +static inline struct hlist_head *dev_map_index_hash(struct bpf_dtab *dtab, +						    int idx) +{ +	return &dtab->dev_index_head[idx & (dtab->n_buckets - 1)]; +} + +struct bpf_dtab_netdev *__dev_map_hash_lookup_elem(struct bpf_map *map, u32 key) +{ +	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); +	struct hlist_head *head = dev_map_index_hash(dtab, key); +	struct bpf_dtab_netdev *dev; + +	hlist_for_each_entry_rcu(dev, head, index_hlist) +		if (dev->idx == key) +			return dev; + +	return NULL; +} + +static int dev_map_hash_get_next_key(struct bpf_map *map, void *key, +				    void *next_key) +{ +	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); +	u32 idx, *next = next_key; +	struct bpf_dtab_netdev *dev, *next_dev; +	struct hlist_head *head; +	int i = 0; + +	if (!key) +		goto find_first; + +	idx = *(u32 *)key; + +	dev = __dev_map_hash_lookup_elem(map, idx); +	if (!dev) +		goto find_first; + +	next_dev = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(&dev->index_hlist)), +				    struct bpf_dtab_netdev, index_hlist); + +	if (next_dev) { +		*next = next_dev->idx; +		return 0; +	} + +	i = idx & (dtab->n_buckets - 1); +	i++; + + find_first: +	for (; i < dtab->n_buckets; i++) { +		head = dev_map_index_hash(dtab, i); + +		next_dev = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(head)), +					    struct bpf_dtab_netdev, +					    index_hlist); +		if (next_dev) { +			*next = next_dev->idx; +			return 0; +		} +	} + +	return -ENOENT; +} +  static int bq_xmit_all(struct xdp_bulk_queue *bq, u32 flags,  		       bool in_napi_ctx)  { @@ -235,7 +354,7 @@ static int bq_xmit_all(struct xdp_bulk_queue *bq, u32 flags,  out:  	bq->count = 0; -	trace_xdp_devmap_xmit(&obj->dtab->map, obj->bit, +	trace_xdp_devmap_xmit(&obj->dtab->map, obj->idx,  			      sent, drops, bq->dev_rx, dev, err);  	bq->dev_rx = NULL;  	__list_del_clearprev(&bq->flush_node); @@ -363,6 +482,15 @@ static void *dev_map_lookup_elem(struct bpf_map *map, void *key)  	return dev ? &dev->ifindex : NULL;  } +static void *dev_map_hash_lookup_elem(struct bpf_map *map, void *key) +{ +	struct bpf_dtab_netdev *obj = __dev_map_hash_lookup_elem(map, +								*(u32 *)key); +	struct net_device *dev = obj ? obj->dev : NULL; + +	return dev ? 
&dev->ifindex : NULL; +} +  static void dev_map_flush_old(struct bpf_dtab_netdev *dev)  {  	if (dev->dev->netdev_ops->ndo_xdp_xmit) { @@ -412,17 +540,74 @@ static int dev_map_delete_elem(struct bpf_map *map, void *key)  	return 0;  } -static int dev_map_update_elem(struct bpf_map *map, void *key, void *value, -				u64 map_flags) +static int dev_map_hash_delete_elem(struct bpf_map *map, void *key)  {  	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); -	struct net *net = current->nsproxy->net_ns; +	struct bpf_dtab_netdev *old_dev; +	int k = *(u32 *)key; +	unsigned long flags; +	int ret = -ENOENT; + +	spin_lock_irqsave(&dtab->index_lock, flags); + +	old_dev = __dev_map_hash_lookup_elem(map, k); +	if (old_dev) { +		dtab->items--; +		hlist_del_init_rcu(&old_dev->index_hlist); +		call_rcu(&old_dev->rcu, __dev_map_entry_free); +		ret = 0; +	} +	spin_unlock_irqrestore(&dtab->index_lock, flags); + +	return ret; +} + +static struct bpf_dtab_netdev *__dev_map_alloc_node(struct net *net, +						    struct bpf_dtab *dtab, +						    u32 ifindex, +						    unsigned int idx) +{  	gfp_t gfp = GFP_ATOMIC | __GFP_NOWARN; +	struct bpf_dtab_netdev *dev; +	struct xdp_bulk_queue *bq; +	int cpu; + +	dev = kmalloc_node(sizeof(*dev), gfp, dtab->map.numa_node); +	if (!dev) +		return ERR_PTR(-ENOMEM); + +	dev->bulkq = __alloc_percpu_gfp(sizeof(*dev->bulkq), +					sizeof(void *), gfp); +	if (!dev->bulkq) { +		kfree(dev); +		return ERR_PTR(-ENOMEM); +	} + +	for_each_possible_cpu(cpu) { +		bq = per_cpu_ptr(dev->bulkq, cpu); +		bq->obj = dev; +	} + +	dev->dev = dev_get_by_index(net, ifindex); +	if (!dev->dev) { +		free_percpu(dev->bulkq); +		kfree(dev); +		return ERR_PTR(-EINVAL); +	} + +	dev->idx = idx; +	dev->dtab = dtab; + +	return dev; +} + +static int __dev_map_update_elem(struct net *net, struct bpf_map *map, +				 void *key, void *value, u64 map_flags) +{ +	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);  	struct bpf_dtab_netdev *dev, *old_dev;  	u32 ifindex = *(u32 *)value; -	struct xdp_bulk_queue *bq;  	u32 i = *(u32 *)key; -	int cpu;  	if (unlikely(map_flags > BPF_EXIST))  		return -EINVAL; @@ -434,31 +619,9 @@ static int dev_map_update_elem(struct bpf_map *map, void *key, void *value,  	if (!ifindex) {  		dev = NULL;  	} else { -		dev = kmalloc_node(sizeof(*dev), gfp, map->numa_node); -		if (!dev) -			return -ENOMEM; - -		dev->bulkq = __alloc_percpu_gfp(sizeof(*dev->bulkq), -						sizeof(void *), gfp); -		if (!dev->bulkq) { -			kfree(dev); -			return -ENOMEM; -		} - -		for_each_possible_cpu(cpu) { -			bq = per_cpu_ptr(dev->bulkq, cpu); -			bq->obj = dev; -		} - -		dev->dev = dev_get_by_index(net, ifindex); -		if (!dev->dev) { -			free_percpu(dev->bulkq); -			kfree(dev); -			return -EINVAL; -		} - -		dev->bit = i; -		dev->dtab = dtab; +		dev = __dev_map_alloc_node(net, dtab, ifindex, i); +		if (IS_ERR(dev)) +			return PTR_ERR(dev);  	}  	/* Use call_rcu() here to ensure rcu critical sections have completed @@ -472,6 +635,70 @@ static int dev_map_update_elem(struct bpf_map *map, void *key, void *value,  	return 0;  } +static int dev_map_update_elem(struct bpf_map *map, void *key, void *value, +			       u64 map_flags) +{ +	return __dev_map_update_elem(current->nsproxy->net_ns, +				     map, key, value, map_flags); +} + +static int __dev_map_hash_update_elem(struct net *net, struct bpf_map *map, +				     void *key, void *value, u64 map_flags) +{ +	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); +	struct bpf_dtab_netdev *dev, *old_dev; +	u32 ifindex = *(u32 
*)value; +	u32 idx = *(u32 *)key; +	unsigned long flags; +	int err = -EEXIST; + +	if (unlikely(map_flags > BPF_EXIST || !ifindex)) +		return -EINVAL; + +	spin_lock_irqsave(&dtab->index_lock, flags); + +	old_dev = __dev_map_hash_lookup_elem(map, idx); +	if (old_dev && (map_flags & BPF_NOEXIST)) +		goto out_err; + +	dev = __dev_map_alloc_node(net, dtab, ifindex, idx); +	if (IS_ERR(dev)) { +		err = PTR_ERR(dev); +		goto out_err; +	} + +	if (old_dev) { +		hlist_del_rcu(&old_dev->index_hlist); +	} else { +		if (dtab->items >= dtab->map.max_entries) { +			spin_unlock_irqrestore(&dtab->index_lock, flags); +			call_rcu(&dev->rcu, __dev_map_entry_free); +			return -E2BIG; +		} +		dtab->items++; +	} + +	hlist_add_head_rcu(&dev->index_hlist, +			   dev_map_index_hash(dtab, idx)); +	spin_unlock_irqrestore(&dtab->index_lock, flags); + +	if (old_dev) +		call_rcu(&old_dev->rcu, __dev_map_entry_free); + +	return 0; + +out_err: +	spin_unlock_irqrestore(&dtab->index_lock, flags); +	return err; +} + +static int dev_map_hash_update_elem(struct bpf_map *map, void *key, void *value, +				   u64 map_flags) +{ +	return __dev_map_hash_update_elem(current->nsproxy->net_ns, +					 map, key, value, map_flags); +} +  const struct bpf_map_ops dev_map_ops = {  	.map_alloc = dev_map_alloc,  	.map_free = dev_map_free, @@ -482,6 +709,16 @@ const struct bpf_map_ops dev_map_ops = {  	.map_check_btf = map_check_no_btf,  }; +const struct bpf_map_ops dev_map_hash_ops = { +	.map_alloc = dev_map_alloc, +	.map_free = dev_map_free, +	.map_get_next_key = dev_map_hash_get_next_key, +	.map_lookup_elem = dev_map_hash_lookup_elem, +	.map_update_elem = dev_map_hash_update_elem, +	.map_delete_elem = dev_map_hash_delete_elem, +	.map_check_btf = map_check_no_btf, +}; +  static int dev_map_notification(struct notifier_block *notifier,  				ulong event, void *ptr)  { diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index cc0d0cf114e3..a70f7209cda3 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -14,8 +14,9 @@  #include <linux/mount.h>  #include <linux/namei.h>  #include <linux/fs.h> +#include <linux/fs_context.h> +#include <linux/fs_parser.h>  #include <linux/kdev_t.h> -#include <linux/parser.h>  #include <linux/filter.h>  #include <linux/bpf.h>  #include <linux/bpf_trace.h> @@ -583,58 +584,52 @@ static const struct super_operations bpf_super_ops = {  enum {  	OPT_MODE, -	OPT_ERR,  }; -static const match_table_t bpf_mount_tokens = { -	{ OPT_MODE, "mode=%o" }, -	{ OPT_ERR, NULL }, +static const struct fs_parameter_spec bpf_param_specs[] = { +	fsparam_u32oct	("mode",			OPT_MODE), +	{} +}; + +static const struct fs_parameter_description bpf_fs_parameters = { +	.name		= "bpf", +	.specs		= bpf_param_specs,  };  struct bpf_mount_opts {  	umode_t mode;  }; -static int bpf_parse_options(char *data, struct bpf_mount_opts *opts) +static int bpf_parse_param(struct fs_context *fc, struct fs_parameter *param)  { -	substring_t args[MAX_OPT_ARGS]; -	int option, token; -	char *ptr; +	struct bpf_mount_opts *opts = fc->fs_private; +	struct fs_parse_result result; +	int opt; -	opts->mode = S_IRWXUGO; - -	while ((ptr = strsep(&data, ",")) != NULL) { -		if (!*ptr) -			continue; - -		token = match_token(ptr, bpf_mount_tokens, args); -		switch (token) { -		case OPT_MODE: -			if (match_octal(&args[0], &option)) -				return -EINVAL; -			opts->mode = option & S_IALLUGO; -			break; +	opt = fs_parse(fc, &bpf_fs_parameters, param, &result); +	if (opt < 0)  		/* We might like to report bad mount options here, but  		 * traditionally we've ignored all mount 
options, so we'd  		 * better continue to ignore non-existing options for bpf.  		 */ -		} +		return opt == -ENOPARAM ? 0 : opt; + +	switch (opt) { +	case OPT_MODE: +		opts->mode = result.uint_32 & S_IALLUGO; +		break;  	}  	return 0;  } -static int bpf_fill_super(struct super_block *sb, void *data, int silent) +static int bpf_fill_super(struct super_block *sb, struct fs_context *fc)  {  	static const struct tree_descr bpf_rfiles[] = { { "" } }; -	struct bpf_mount_opts opts; +	struct bpf_mount_opts *opts = fc->fs_private;  	struct inode *inode;  	int ret; -	ret = bpf_parse_options(data, &opts); -	if (ret) -		return ret; -  	ret = simple_fill_super(sb, BPF_FS_MAGIC, bpf_rfiles);  	if (ret)  		return ret; @@ -644,21 +639,50 @@ static int bpf_fill_super(struct super_block *sb, void *data, int silent)  	inode = sb->s_root->d_inode;  	inode->i_op = &bpf_dir_iops;  	inode->i_mode &= ~S_IALLUGO; -	inode->i_mode |= S_ISVTX | opts.mode; +	inode->i_mode |= S_ISVTX | opts->mode;  	return 0;  } -static struct dentry *bpf_mount(struct file_system_type *type, int flags, -				const char *dev_name, void *data) +static int bpf_get_tree(struct fs_context *fc) +{ +	return get_tree_nodev(fc, bpf_fill_super); +} + +static void bpf_free_fc(struct fs_context *fc)  { -	return mount_nodev(type, flags, data, bpf_fill_super); +	kfree(fc->fs_private); +} + +static const struct fs_context_operations bpf_context_ops = { +	.free		= bpf_free_fc, +	.parse_param	= bpf_parse_param, +	.get_tree	= bpf_get_tree, +}; + +/* + * Set up the filesystem mount context. + */ +static int bpf_init_fs_context(struct fs_context *fc) +{ +	struct bpf_mount_opts *opts; + +	opts = kzalloc(sizeof(struct bpf_mount_opts), GFP_KERNEL); +	if (!opts) +		return -ENOMEM; + +	opts->mode = S_IRWXUGO; + +	fc->fs_private = opts; +	fc->ops = &bpf_context_ops; +	return 0;  }  static struct file_system_type bpf_fs_type = {  	.owner		= THIS_MODULE,  	.name		= "bpf", -	.mount		= bpf_mount, +	.init_fs_context = bpf_init_fs_context, +	.parameters	= &bpf_fs_parameters,  	.kill_sb	= kill_litter_super,  }; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 272071e9112f..82eabd4e38ad 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -683,8 +683,8 @@ struct bpf_map *bpf_map_get_with_uref(u32 ufd)  }  /* map_idr_lock should have been held */ -static struct bpf_map *bpf_map_inc_not_zero(struct bpf_map *map, -					    bool uref) +static struct bpf_map *__bpf_map_inc_not_zero(struct bpf_map *map, +					      bool uref)  {  	int refold; @@ -704,6 +704,16 @@ static struct bpf_map *bpf_map_inc_not_zero(struct bpf_map *map,  	return map;  } +struct bpf_map *bpf_map_inc_not_zero(struct bpf_map *map, bool uref) +{ +	spin_lock_bh(&map_idr_lock); +	map = __bpf_map_inc_not_zero(map, uref); +	spin_unlock_bh(&map_idr_lock); + +	return map; +} +EXPORT_SYMBOL_GPL(bpf_map_inc_not_zero); +  int __weak bpf_stackmap_copy(struct bpf_map *map, void *key, void *value)  {  	return -ENOTSUPP; @@ -1619,6 +1629,7 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr)  	if (attr->prog_flags & ~(BPF_F_STRICT_ALIGNMENT |  				 BPF_F_ANY_ALIGNMENT | +				 BPF_F_TEST_STATE_FREQ |  				 BPF_F_TEST_RND_HI32))  		return -EINVAL; @@ -2183,7 +2194,7 @@ static int bpf_map_get_fd_by_id(const union bpf_attr *attr)  	spin_lock_bh(&map_idr_lock);  	map = idr_find(&map_idr, id);  	if (map) -		map = bpf_map_inc_not_zero(map, true); +		map = __bpf_map_inc_not_zero(map, true);  	else  		map = ERR_PTR(-ENOENT);  	spin_unlock_bh(&map_idr_lock); @@ -2880,6 +2891,10 
@@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz  		err = bpf_obj_get_next_id(&attr, uattr,  					  &map_idr, &map_idr_lock);  		break; +	case BPF_BTF_GET_NEXT_ID: +		err = bpf_obj_get_next_id(&attr, uattr, +					  &btf_idr, &btf_idr_lock); +		break;  	case BPF_PROG_GET_FD_BY_ID:  		err = bpf_prog_get_fd_by_id(&attr);  		break; diff --git a/kernel/bpf/sysfs_btf.c b/kernel/bpf/sysfs_btf.c new file mode 100644 index 000000000000..7ae5dddd1fe6 --- /dev/null +++ b/kernel/bpf/sysfs_btf.c @@ -0,0 +1,46 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Provide kernel BTF information for introspection and use by eBPF tools. + */ +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/kobject.h> +#include <linux/init.h> +#include <linux/sysfs.h> + +/* See scripts/link-vmlinux.sh, gen_btf() func for details */ +extern char __weak _binary__btf_vmlinux_bin_start[]; +extern char __weak _binary__btf_vmlinux_bin_end[]; + +static ssize_t +btf_vmlinux_read(struct file *file, struct kobject *kobj, +		 struct bin_attribute *bin_attr, +		 char *buf, loff_t off, size_t len) +{ +	memcpy(buf, _binary__btf_vmlinux_bin_start + off, len); +	return len; +} + +static struct bin_attribute bin_attr_btf_vmlinux __ro_after_init = { +	.attr = { .name = "vmlinux", .mode = 0444, }, +	.read = btf_vmlinux_read, +}; + +static struct kobject *btf_kobj; + +static int __init btf_vmlinux_init(void) +{ +	if (!_binary__btf_vmlinux_bin_start) +		return 0; + +	btf_kobj = kobject_create_and_add("btf", kernel_kobj); +	if (!btf_kobj) +		return -ENOMEM; + +	bin_attr_btf_vmlinux.size = _binary__btf_vmlinux_bin_end - +				    _binary__btf_vmlinux_bin_start; + +	return sysfs_create_bin_file(btf_kobj, &bin_attr_btf_vmlinux); +} + +subsys_initcall(btf_vmlinux_init); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index b5c14c9d7b98..ffc3e53f5300 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1772,16 +1772,21 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno,  		bitmap_from_u64(mask, stack_mask);  		for_each_set_bit(i, mask, 64) {  			if (i >= func->allocated_stack / BPF_REG_SIZE) { -				/* This can happen if backtracking -				 * is propagating stack precision where -				 * caller has larger stack frame -				 * than callee, but backtrack_insn() should -				 * have returned -ENOTSUPP. +				/* the sequence of instructions: +				 * 2: (bf) r3 = r10 +				 * 3: (7b) *(u64 *)(r3 -8) = r0 +				 * 4: (79) r4 = *(u64 *)(r10 -8) +				 * doesn't contain jmps. It's backtracked +				 * as a single block. +				 * During backtracking insn 3 is not recognized as +				 * stack access, so at the end of backtracking +				 * stack slot fp-8 is still marked in stack_mask. +				 * However the parent state may not have accessed +				 * fp-8 and it's "unallocated" stack space. +				 * In such case fallback to conservative.  				 
*/ -				verbose(env, "BUG spi %d stack_size %d\n", -					i, func->allocated_stack); -				WARN_ONCE(1, "verifier backtracking bug"); -				return -EFAULT; +				mark_all_scalars_precise(env, st); +				return 0;  			}  			if (func->stack[i].slot_type[0] != STACK_SPILL) { @@ -3458,6 +3463,7 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,  			goto error;  		break;  	case BPF_MAP_TYPE_DEVMAP: +	case BPF_MAP_TYPE_DEVMAP_HASH:  		if (func_id != BPF_FUNC_redirect_map &&  		    func_id != BPF_FUNC_map_lookup_elem)  			goto error; @@ -3540,6 +3546,7 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,  		break;  	case BPF_FUNC_redirect_map:  		if (map->map_type != BPF_MAP_TYPE_DEVMAP && +		    map->map_type != BPF_MAP_TYPE_DEVMAP_HASH &&  		    map->map_type != BPF_MAP_TYPE_CPUMAP &&  		    map->map_type != BPF_MAP_TYPE_XSKMAP)  			goto error; @@ -7221,7 +7228,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)  	struct bpf_verifier_state_list *sl, **pprev;  	struct bpf_verifier_state *cur = env->cur_state, *new;  	int i, j, err, states_cnt = 0; -	bool add_new_state = false; +	bool add_new_state = env->test_state_freq ? true : false;  	cur->last_insn_idx = env->prev_insn_idx;  	if (!env->insn_aux_data[insn_idx].prune_point) @@ -8617,8 +8624,8 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)  		}  		if (is_narrower_load && size < target_size) { -			u8 shift = bpf_ctx_narrow_load_shift(off, size, -							     size_default); +			u8 shift = bpf_ctx_narrow_access_offset( +				off, size, size_default) * 8;  			if (ctx_field_size <= 4) {  				if (shift)  					insn_buf[cnt++] = BPF_ALU32_IMM(BPF_RSH, @@ -9261,6 +9268,9 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr,  	env->allow_ptr_leaks = is_priv; +	if (is_priv) +		env->test_state_freq = attr->prog_flags & BPF_F_TEST_STATE_FREQ; +  	ret = replace_map_fd_with_map_ptr(env);  	if (ret < 0)  		goto skip_full_check; diff --git a/kernel/bpf/xskmap.c b/kernel/bpf/xskmap.c index 9bb96ace9fa1..82a1ffe15dfa 100644 --- a/kernel/bpf/xskmap.c +++ b/kernel/bpf/xskmap.c @@ -13,8 +13,71 @@ struct xsk_map {  	struct bpf_map map;  	struct xdp_sock **xsk_map;  	struct list_head __percpu *flush_list; +	spinlock_t lock; /* Synchronize map updates */  }; +int xsk_map_inc(struct xsk_map *map) +{ +	struct bpf_map *m = &map->map; + +	m = bpf_map_inc(m, false); +	return PTR_ERR_OR_ZERO(m); +} + +void xsk_map_put(struct xsk_map *map) +{ +	bpf_map_put(&map->map); +} + +static struct xsk_map_node *xsk_map_node_alloc(struct xsk_map *map, +					       struct xdp_sock **map_entry) +{ +	struct xsk_map_node *node; +	int err; + +	node = kzalloc(sizeof(*node), GFP_ATOMIC | __GFP_NOWARN); +	if (!node) +		return ERR_PTR(-ENOMEM); + +	err = xsk_map_inc(map); +	if (err) { +		kfree(node); +		return ERR_PTR(err); +	} + +	node->map = map; +	node->map_entry = map_entry; +	return node; +} + +static void xsk_map_node_free(struct xsk_map_node *node) +{ +	xsk_map_put(node->map); +	kfree(node); +} + +static void xsk_map_sock_add(struct xdp_sock *xs, struct xsk_map_node *node) +{ +	spin_lock_bh(&xs->map_list_lock); +	list_add_tail(&node->node, &xs->map_list); +	spin_unlock_bh(&xs->map_list_lock); +} + +static void xsk_map_sock_delete(struct xdp_sock *xs, +				struct xdp_sock **map_entry) +{ +	struct xsk_map_node *n, *tmp; + +	spin_lock_bh(&xs->map_list_lock); +	list_for_each_entry_safe(n, tmp, &xs->map_list, node) { +		if (map_entry == n->map_entry) { +			list_del(&n->node); +			xsk_map_node_free(n); +		} +	
} +	spin_unlock_bh(&xs->map_list_lock); +} +  static struct bpf_map *xsk_map_alloc(union bpf_attr *attr)  {  	struct xsk_map *m; @@ -34,6 +97,7 @@ static struct bpf_map *xsk_map_alloc(union bpf_attr *attr)  		return ERR_PTR(-ENOMEM);  	bpf_map_init_from_attr(&m->map, attr); +	spin_lock_init(&m->lock);  	cost = (u64)m->map.max_entries * sizeof(struct xdp_sock *);  	cost += sizeof(struct list_head) * num_possible_cpus(); @@ -71,21 +135,9 @@ free_m:  static void xsk_map_free(struct bpf_map *map)  {  	struct xsk_map *m = container_of(map, struct xsk_map, map); -	int i;  	bpf_clear_redirect_map(map);  	synchronize_net(); - -	for (i = 0; i < map->max_entries; i++) { -		struct xdp_sock *xs; - -		xs = m->xsk_map[i]; -		if (!xs) -			continue; - -		sock_put((struct sock *)xs); -	} -  	free_percpu(m->flush_list);  	bpf_map_area_free(m->xsk_map);  	kfree(m); @@ -164,8 +216,9 @@ static int xsk_map_update_elem(struct bpf_map *map, void *key, void *value,  			       u64 map_flags)  {  	struct xsk_map *m = container_of(map, struct xsk_map, map); +	struct xdp_sock *xs, *old_xs, **map_entry;  	u32 i = *(u32 *)key, fd = *(u32 *)value; -	struct xdp_sock *xs, *old_xs; +	struct xsk_map_node *node;  	struct socket *sock;  	int err; @@ -173,8 +226,6 @@ static int xsk_map_update_elem(struct bpf_map *map, void *key, void *value,  		return -EINVAL;  	if (unlikely(i >= m->map.max_entries))  		return -E2BIG; -	if (unlikely(map_flags == BPF_NOEXIST)) -		return -EEXIST;  	sock = sockfd_lookup(fd, &err);  	if (!sock) @@ -192,32 +243,70 @@ static int xsk_map_update_elem(struct bpf_map *map, void *key, void *value,  		return -EOPNOTSUPP;  	} -	sock_hold(sock->sk); +	map_entry = &m->xsk_map[i]; +	node = xsk_map_node_alloc(m, map_entry); +	if (IS_ERR(node)) { +		sockfd_put(sock); +		return PTR_ERR(node); +	} -	old_xs = xchg(&m->xsk_map[i], xs); +	spin_lock_bh(&m->lock); +	old_xs = READ_ONCE(*map_entry); +	if (old_xs == xs) { +		err = 0; +		goto out; +	} else if (old_xs && map_flags == BPF_NOEXIST) { +		err = -EEXIST; +		goto out; +	} else if (!old_xs && map_flags == BPF_EXIST) { +		err = -ENOENT; +		goto out; +	} +	xsk_map_sock_add(xs, node); +	WRITE_ONCE(*map_entry, xs);  	if (old_xs) -		sock_put((struct sock *)old_xs); - +		xsk_map_sock_delete(old_xs, map_entry); +	spin_unlock_bh(&m->lock);  	sockfd_put(sock);  	return 0; + +out: +	spin_unlock_bh(&m->lock); +	sockfd_put(sock); +	xsk_map_node_free(node); +	return err;  }  static int xsk_map_delete_elem(struct bpf_map *map, void *key)  {  	struct xsk_map *m = container_of(map, struct xsk_map, map); -	struct xdp_sock *old_xs; +	struct xdp_sock *old_xs, **map_entry;  	int k = *(u32 *)key;  	if (k >= map->max_entries)  		return -EINVAL; -	old_xs = xchg(&m->xsk_map[k], NULL); +	spin_lock_bh(&m->lock); +	map_entry = &m->xsk_map[k]; +	old_xs = xchg(map_entry, NULL);  	if (old_xs) -		sock_put((struct sock *)old_xs); +		xsk_map_sock_delete(old_xs, map_entry); +	spin_unlock_bh(&m->lock);  	return 0;  } +void xsk_map_try_sock_delete(struct xsk_map *map, struct xdp_sock *xs, +			     struct xdp_sock **map_entry) +{ +	spin_lock_bh(&map->lock); +	if (READ_ONCE(*map_entry) == xs) { +		WRITE_ONCE(*map_entry, NULL); +		xsk_map_sock_delete(xs, map_entry); +	} +	spin_unlock_bh(&map->lock); +} +  const struct bpf_map_ops xsk_map_ops = {  	.map_alloc = xsk_map_alloc,  	.map_free = xsk_map_free, diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index 88006be40ea3..7f83f4121d8d 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -194,25 +194,6 @@ struct 
cgroup_pidlist {  };  /* - * The following two functions "fix" the issue where there are more pids - * than kmalloc will give memory for; in such cases, we use vmalloc/vfree. - * TODO: replace with a kernel-wide solution to this problem - */ -#define PIDLIST_TOO_LARGE(c) ((c) * sizeof(pid_t) > (PAGE_SIZE * 2)) -static void *pidlist_allocate(int count) -{ -	if (PIDLIST_TOO_LARGE(count)) -		return vmalloc(array_size(count, sizeof(pid_t))); -	else -		return kmalloc_array(count, sizeof(pid_t), GFP_KERNEL); -} - -static void pidlist_free(void *p) -{ -	kvfree(p); -} - -/*   * Used to destroy all pidlists lingering waiting for destroy timer.  None   * should be left afterwards.   */ @@ -244,7 +225,7 @@ static void cgroup_pidlist_destroy_work_fn(struct work_struct *work)  	 */  	if (!delayed_work_pending(dwork)) {  		list_del(&l->links); -		pidlist_free(l->list); +		kvfree(l->list);  		put_pid_ns(l->key.ns);  		tofree = l;  	} @@ -365,7 +346,7 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,  	 * show up until sometime later on.  	 */  	length = cgroup_task_count(cgrp); -	array = pidlist_allocate(length); +	array = kvmalloc_array(length, sizeof(pid_t), GFP_KERNEL);  	if (!array)  		return -ENOMEM;  	/* now, populate the array */ @@ -390,12 +371,12 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,  	l = cgroup_pidlist_find_create(cgrp, type);  	if (!l) { -		pidlist_free(array); +		kvfree(array);  		return -ENOMEM;  	}  	/* store array, freeing old if necessary */ -	pidlist_free(l->list); +	kvfree(l->list);  	l->list = array;  	l->length = length;  	*lp = l; diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 753afbca549f..080561bb8a4b 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -488,7 +488,7 @@ static struct cgroup_subsys_state *cgroup_tryget_css(struct cgroup *cgrp,  	rcu_read_lock();  	css = cgroup_css(cgrp, ss); -	if (!css || !css_tryget_online(css)) +	if (css && !css_tryget_online(css))  		css = NULL;  	rcu_read_unlock(); @@ -1891,7 +1891,7 @@ static int cgroup_reconfigure(struct fs_context *fc)   */  static bool use_task_css_set_links __read_mostly; -static void cgroup_enable_task_cg_lists(void) +void cgroup_enable_task_cg_lists(void)  {  	struct task_struct *p, *g; @@ -2894,7 +2894,7 @@ static void cgroup_print_ss_mask(struct seq_file *seq, u16 ss_mask)  	do_each_subsys_mask(ss, ssid, ss_mask) {  		if (printed)  			seq_putc(seq, ' '); -		seq_printf(seq, "%s", ss->name); +		seq_puts(seq, ss->name);  		printed = true;  	} while_each_subsys_mask();  	if (printed) @@ -5255,8 +5255,16 @@ static struct cgroup *cgroup_create(struct cgroup *parent)  	 * if the parent has to be frozen, the child has too.  	 */  	cgrp->freezer.e_freeze = parent->freezer.e_freeze; -	if (cgrp->freezer.e_freeze) +	if (cgrp->freezer.e_freeze) { +		/* +		 * Set the CGRP_FREEZE flag, so when a process will be +		 * attached to the child cgroup, it will become frozen. +		 * At this point the new cgroup is unpopulated, so we can +		 * consider it frozen immediately. 
+		 */ +		set_bit(CGRP_FREEZE, &cgrp->flags);  		set_bit(CGRP_FROZEN, &cgrp->flags); +	}  	spin_lock_irq(&css_set_lock);  	for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp)) { diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 5aa37531ce76..c52bc91f882b 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -45,6 +45,7 @@  #include <linux/proc_fs.h>  #include <linux/rcupdate.h>  #include <linux/sched.h> +#include <linux/sched/deadline.h>  #include <linux/sched/mm.h>  #include <linux/sched/task.h>  #include <linux/seq_file.h> @@ -332,7 +333,18 @@ static struct cpuset top_cpuset = {   * guidelines for accessing subsystem state in kernel/cgroup.c   */ -static DEFINE_MUTEX(cpuset_mutex); +DEFINE_STATIC_PERCPU_RWSEM(cpuset_rwsem); + +void cpuset_read_lock(void) +{ +	percpu_down_read(&cpuset_rwsem); +} + +void cpuset_read_unlock(void) +{ +	percpu_up_read(&cpuset_rwsem); +} +  static DEFINE_SPINLOCK(callback_lock);  static struct workqueue_struct *cpuset_migrate_mm_wq; @@ -894,6 +906,67 @@ done:  	return ndoms;  } +static void update_tasks_root_domain(struct cpuset *cs) +{ +	struct css_task_iter it; +	struct task_struct *task; + +	css_task_iter_start(&cs->css, 0, &it); + +	while ((task = css_task_iter_next(&it))) +		dl_add_task_root_domain(task); + +	css_task_iter_end(&it); +} + +static void rebuild_root_domains(void) +{ +	struct cpuset *cs = NULL; +	struct cgroup_subsys_state *pos_css; + +	percpu_rwsem_assert_held(&cpuset_rwsem); +	lockdep_assert_cpus_held(); +	lockdep_assert_held(&sched_domains_mutex); + +	cgroup_enable_task_cg_lists(); + +	rcu_read_lock(); + +	/* +	 * Clear default root domain DL accounting, it will be computed again +	 * if a task belongs to it. +	 */ +	dl_clear_root_domain(&def_root_domain); + +	cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) { + +		if (cpumask_empty(cs->effective_cpus)) { +			pos_css = css_rightmost_descendant(pos_css); +			continue; +		} + +		css_get(&cs->css); + +		rcu_read_unlock(); + +		update_tasks_root_domain(cs); + +		rcu_read_lock(); +		css_put(&cs->css); +	} +	rcu_read_unlock(); +} + +static void +partition_and_rebuild_sched_domains(int ndoms_new, cpumask_var_t doms_new[], +				    struct sched_domain_attr *dattr_new) +{ +	mutex_lock(&sched_domains_mutex); +	partition_sched_domains_locked(ndoms_new, doms_new, dattr_new); +	rebuild_root_domains(); +	mutex_unlock(&sched_domains_mutex); +} +  /*   * Rebuild scheduler domains.   * @@ -911,8 +984,8 @@ static void rebuild_sched_domains_locked(void)  	cpumask_var_t *doms;  	int ndoms; -	lockdep_assert_held(&cpuset_mutex); -	get_online_cpus(); +	lockdep_assert_cpus_held(); +	percpu_rwsem_assert_held(&cpuset_rwsem);  	/*  	 * We have raced with CPU hotplug. 
Don't do anything to avoid @@ -921,19 +994,17 @@ static void rebuild_sched_domains_locked(void)  	 */  	if (!top_cpuset.nr_subparts_cpus &&  	    !cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask)) -		goto out; +		return;  	if (top_cpuset.nr_subparts_cpus &&  	   !cpumask_subset(top_cpuset.effective_cpus, cpu_active_mask)) -		goto out; +		return;  	/* Generate domain masks and attrs */  	ndoms = generate_sched_domains(&doms, &attr);  	/* Have scheduler rebuild the domains */ -	partition_sched_domains(ndoms, doms, attr); -out: -	put_online_cpus(); +	partition_and_rebuild_sched_domains(ndoms, doms, attr);  }  #else /* !CONFIG_SMP */  static void rebuild_sched_domains_locked(void) @@ -943,9 +1014,11 @@ static void rebuild_sched_domains_locked(void)  void rebuild_sched_domains(void)  { -	mutex_lock(&cpuset_mutex); +	get_online_cpus(); +	percpu_down_write(&cpuset_rwsem);  	rebuild_sched_domains_locked(); -	mutex_unlock(&cpuset_mutex); +	percpu_up_write(&cpuset_rwsem); +	put_online_cpus();  }  /** @@ -1051,7 +1124,7 @@ static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd,  	int deleting;	/* Moving cpus from subparts_cpus to effective_cpus */  	bool part_error = false;	/* Partition error? */ -	lockdep_assert_held(&cpuset_mutex); +	percpu_rwsem_assert_held(&cpuset_rwsem);  	/*  	 * The parent must be a partition root. @@ -2039,7 +2112,7 @@ static int cpuset_can_attach(struct cgroup_taskset *tset)  	cpuset_attach_old_cs = task_cs(cgroup_taskset_first(tset, &css));  	cs = css_cs(css); -	mutex_lock(&cpuset_mutex); +	percpu_down_write(&cpuset_rwsem);  	/* allow moving tasks into an empty cpuset if on default hierarchy */  	ret = -ENOSPC; @@ -2063,7 +2136,7 @@ static int cpuset_can_attach(struct cgroup_taskset *tset)  	cs->attach_in_progress++;  	ret = 0;  out_unlock: -	mutex_unlock(&cpuset_mutex); +	percpu_up_write(&cpuset_rwsem);  	return ret;  } @@ -2073,9 +2146,9 @@ static void cpuset_cancel_attach(struct cgroup_taskset *tset)  	cgroup_taskset_first(tset, &css); -	mutex_lock(&cpuset_mutex); +	percpu_down_write(&cpuset_rwsem);  	css_cs(css)->attach_in_progress--; -	mutex_unlock(&cpuset_mutex); +	percpu_up_write(&cpuset_rwsem);  }  /* @@ -2098,7 +2171,7 @@ static void cpuset_attach(struct cgroup_taskset *tset)  	cgroup_taskset_first(tset, &css);  	cs = css_cs(css); -	mutex_lock(&cpuset_mutex); +	percpu_down_write(&cpuset_rwsem);  	/* prepare for attach */  	if (cs == &top_cpuset) @@ -2152,7 +2225,7 @@ static void cpuset_attach(struct cgroup_taskset *tset)  	if (!cs->attach_in_progress)  		wake_up(&cpuset_attach_wq); -	mutex_unlock(&cpuset_mutex); +	percpu_up_write(&cpuset_rwsem);  }  /* The various types of files and directories in a cpuset file system */ @@ -2183,7 +2256,8 @@ static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft,  	cpuset_filetype_t type = cft->private;  	int retval = 0; -	mutex_lock(&cpuset_mutex); +	get_online_cpus(); +	percpu_down_write(&cpuset_rwsem);  	if (!is_cpuset_online(cs)) {  		retval = -ENODEV;  		goto out_unlock; @@ -2219,7 +2293,8 @@ static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft,  		break;  	}  out_unlock: -	mutex_unlock(&cpuset_mutex); +	percpu_up_write(&cpuset_rwsem); +	put_online_cpus();  	return retval;  } @@ -2230,7 +2305,8 @@ static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft,  	cpuset_filetype_t type = cft->private;  	int retval = -ENODEV; -	mutex_lock(&cpuset_mutex); +	get_online_cpus(); +	percpu_down_write(&cpuset_rwsem);  	if 
(!is_cpuset_online(cs))  		goto out_unlock; @@ -2243,7 +2319,8 @@ static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft,  		break;  	}  out_unlock: -	mutex_unlock(&cpuset_mutex); +	percpu_up_write(&cpuset_rwsem); +	put_online_cpus();  	return retval;  } @@ -2282,7 +2359,8 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of,  	kernfs_break_active_protection(of->kn);  	flush_work(&cpuset_hotplug_work); -	mutex_lock(&cpuset_mutex); +	get_online_cpus(); +	percpu_down_write(&cpuset_rwsem);  	if (!is_cpuset_online(cs))  		goto out_unlock; @@ -2306,7 +2384,8 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of,  	free_cpuset(trialcs);  out_unlock: -	mutex_unlock(&cpuset_mutex); +	percpu_up_write(&cpuset_rwsem); +	put_online_cpus();  	kernfs_unbreak_active_protection(of->kn);  	css_put(&cs->css);  	flush_workqueue(cpuset_migrate_mm_wq); @@ -2437,13 +2516,15 @@ static ssize_t sched_partition_write(struct kernfs_open_file *of, char *buf,  		return -EINVAL;  	css_get(&cs->css); -	mutex_lock(&cpuset_mutex); +	get_online_cpus(); +	percpu_down_write(&cpuset_rwsem);  	if (!is_cpuset_online(cs))  		goto out_unlock;  	retval = update_prstate(cs, val);  out_unlock: -	mutex_unlock(&cpuset_mutex); +	percpu_up_write(&cpuset_rwsem); +	put_online_cpus();  	css_put(&cs->css);  	return retval ?: nbytes;  } @@ -2649,7 +2730,8 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)  	if (!parent)  		return 0; -	mutex_lock(&cpuset_mutex); +	get_online_cpus(); +	percpu_down_write(&cpuset_rwsem);  	set_bit(CS_ONLINE, &cs->flags);  	if (is_spread_page(parent)) @@ -2700,7 +2782,8 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)  	cpumask_copy(cs->effective_cpus, parent->cpus_allowed);  	spin_unlock_irq(&callback_lock);  out_unlock: -	mutex_unlock(&cpuset_mutex); +	percpu_up_write(&cpuset_rwsem); +	put_online_cpus();  	return 0;  } @@ -2719,7 +2802,8 @@ static void cpuset_css_offline(struct cgroup_subsys_state *css)  {  	struct cpuset *cs = css_cs(css); -	mutex_lock(&cpuset_mutex); +	get_online_cpus(); +	percpu_down_write(&cpuset_rwsem);  	if (is_partition_root(cs))  		update_prstate(cs, 0); @@ -2738,7 +2822,8 @@ static void cpuset_css_offline(struct cgroup_subsys_state *css)  	cpuset_dec();  	clear_bit(CS_ONLINE, &cs->flags); -	mutex_unlock(&cpuset_mutex); +	percpu_up_write(&cpuset_rwsem); +	put_online_cpus();  }  static void cpuset_css_free(struct cgroup_subsys_state *css) @@ -2750,7 +2835,7 @@ static void cpuset_css_free(struct cgroup_subsys_state *css)  static void cpuset_bind(struct cgroup_subsys_state *root_css)  { -	mutex_lock(&cpuset_mutex); +	percpu_down_write(&cpuset_rwsem);  	spin_lock_irq(&callback_lock);  	if (is_in_v2_mode()) { @@ -2763,7 +2848,7 @@ static void cpuset_bind(struct cgroup_subsys_state *root_css)  	}  	spin_unlock_irq(&callback_lock); -	mutex_unlock(&cpuset_mutex); +	percpu_up_write(&cpuset_rwsem);  }  /* @@ -2805,6 +2890,8 @@ struct cgroup_subsys cpuset_cgrp_subsys = {  int __init cpuset_init(void)  { +	BUG_ON(percpu_init_rwsem(&cpuset_rwsem)); +  	BUG_ON(!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL));  	BUG_ON(!alloc_cpumask_var(&top_cpuset.effective_cpus, GFP_KERNEL));  	BUG_ON(!zalloc_cpumask_var(&top_cpuset.subparts_cpus, GFP_KERNEL)); @@ -2876,7 +2963,7 @@ hotplug_update_tasks_legacy(struct cpuset *cs,  	is_empty = cpumask_empty(cs->cpus_allowed) ||  		   nodes_empty(cs->mems_allowed); -	mutex_unlock(&cpuset_mutex); +	percpu_up_write(&cpuset_rwsem);  	/*  	 * Move tasks to the nearest ancestor with 
execution resources, @@ -2886,7 +2973,7 @@ hotplug_update_tasks_legacy(struct cpuset *cs,  	if (is_empty)  		remove_tasks_in_empty_cpuset(cs); -	mutex_lock(&cpuset_mutex); +	percpu_down_write(&cpuset_rwsem);  }  static void @@ -2936,14 +3023,14 @@ static void cpuset_hotplug_update_tasks(struct cpuset *cs, struct tmpmasks *tmp)  retry:  	wait_event(cpuset_attach_wq, cs->attach_in_progress == 0); -	mutex_lock(&cpuset_mutex); +	percpu_down_write(&cpuset_rwsem);  	/*  	 * We have raced with task attaching. We wait until attaching  	 * is finished, so we won't attach a task to an empty cpuset.  	 */  	if (cs->attach_in_progress) { -		mutex_unlock(&cpuset_mutex); +		percpu_up_write(&cpuset_rwsem);  		goto retry;  	} @@ -3011,7 +3098,7 @@ update_tasks:  		hotplug_update_tasks_legacy(cs, &new_cpus, &new_mems,  					    cpus_updated, mems_updated); -	mutex_unlock(&cpuset_mutex); +	percpu_up_write(&cpuset_rwsem);  }  /** @@ -3041,7 +3128,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work)  	if (on_dfl && !alloc_cpumasks(NULL, &tmp))  		ptmp = &tmp; -	mutex_lock(&cpuset_mutex); +	percpu_down_write(&cpuset_rwsem);  	/* fetch the available cpus/mems and find out which changed how */  	cpumask_copy(&new_cpus, cpu_active_mask); @@ -3091,7 +3178,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work)  		update_tasks_nodemask(&top_cpuset);  	} -	mutex_unlock(&cpuset_mutex); +	percpu_up_write(&cpuset_rwsem);  	/* if cpus or mems changed, we need to propagate to descendants */  	if (cpus_updated || mems_updated) { diff --git a/kernel/cpu.c b/kernel/cpu.c index e84c0873559e..fc28e17940e0 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -62,7 +62,6 @@ struct cpuhp_cpu_state {  	bool			rollback;  	bool			single;  	bool			bringup; -	bool			booted_once;  	struct hlist_node	*node;  	struct hlist_node	*last;  	enum cpuhp_state	cb_state; @@ -76,6 +75,10 @@ static DEFINE_PER_CPU(struct cpuhp_cpu_state, cpuhp_state) = {  	.fail = CPUHP_INVALID,  }; +#ifdef CONFIG_SMP +cpumask_t cpus_booted_once_mask; +#endif +  #if defined(CONFIG_LOCKDEP) && defined(CONFIG_SMP)  static struct lockdep_map cpuhp_state_up_map =  	STATIC_LOCKDEP_MAP_INIT("cpuhp_state-up", &cpuhp_state_up_map); @@ -389,8 +392,7 @@ enum cpuhp_smt_control cpu_smt_control __read_mostly = CPU_SMT_ENABLED;  void __init cpu_smt_disable(bool force)  { -	if (cpu_smt_control == CPU_SMT_FORCE_DISABLED || -		cpu_smt_control == CPU_SMT_NOT_SUPPORTED) +	if (!cpu_smt_possible())  		return;  	if (force) { @@ -433,8 +435,16 @@ static inline bool cpu_smt_allowed(unsigned int cpu)  	 * CPU. Otherwise, a broadacasted MCE observing CR4.MCE=0b on any  	 * core will shutdown the machine.  	 */ -	return !per_cpu(cpuhp_state, cpu).booted_once; +	return !cpumask_test_cpu(cpu, &cpus_booted_once_mask); +} + +/* Returns true if SMT is not supported of forcefully (irreversibly) disabled */ +bool cpu_smt_possible(void) +{ +	return cpu_smt_control != CPU_SMT_FORCE_DISABLED && +		cpu_smt_control != CPU_SMT_NOT_SUPPORTED;  } +EXPORT_SYMBOL_GPL(cpu_smt_possible);  #else  static inline bool cpu_smt_allowed(unsigned int cpu) { return true; }  #endif @@ -1066,7 +1076,7 @@ void notify_cpu_starting(unsigned int cpu)  	int ret;  	rcu_cpu_starting(cpu);	/* Enables RCU usage on this CPU. 
*/ -	st->booted_once = true; +	cpumask_set_cpu(cpu, &cpus_booted_once_mask);  	while (st->state < target) {  		st->state++;  		ret = cpuhp_invoke_callback(cpu, st->state, true, NULL, NULL); @@ -2295,6 +2305,9 @@ EXPORT_SYMBOL(__cpu_present_mask);  struct cpumask __cpu_active_mask __read_mostly;  EXPORT_SYMBOL(__cpu_active_mask); +atomic_t __num_online_cpus __read_mostly; +EXPORT_SYMBOL(__num_online_cpus); +  void init_cpu_present(const struct cpumask *src)  {  	cpumask_copy(&__cpu_present_mask, src); @@ -2310,6 +2323,27 @@ void init_cpu_online(const struct cpumask *src)  	cpumask_copy(&__cpu_online_mask, src);  } +void set_cpu_online(unsigned int cpu, bool online) +{ +	/* +	 * atomic_inc/dec() is required to handle the horrid abuse of this +	 * function by the reboot and kexec code which invoke it from +	 * IPI/NMI broadcasts when shutting down CPUs. Invocation from +	 * regular CPU hotplug is properly serialized. +	 * +	 * Note, that the fact that __num_online_cpus is of type atomic_t +	 * does not protect readers which are not serialized against +	 * concurrent hotplug operations. +	 */ +	if (online) { +		if (!cpumask_test_and_set_cpu(cpu, &__cpu_online_mask)) +			atomic_inc(&__num_online_cpus); +	} else { +		if (cpumask_test_and_clear_cpu(cpu, &__cpu_online_mask)) +			atomic_dec(&__num_online_cpus); +	} +} +  /*   * Activate the first processor.   */ @@ -2334,7 +2368,7 @@ void __init boot_cpu_init(void)  void __init boot_cpu_hotplug_init(void)  {  #ifdef CONFIG_SMP -	this_cpu_write(cpuhp_state.booted_once, true); +	cpumask_set_cpu(smp_processor_id(), &cpus_booted_once_mask);  #endif  	this_cpu_write(cpuhp_state.state, CPUHP_ONLINE);  } diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index 5cc608de6883..f76d6f77dd5e 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c @@ -787,11 +787,8 @@ out:  }  /* - * GDB places a breakpoint at this function to know dynamically - * loaded objects. It's not defined static so that only one instance with this - * name exists in the kernel. + * GDB places a breakpoint at this function to know dynamically loaded objects.   */ -  static int module_event(struct notifier_block *self, unsigned long val,  	void *data)  { @@ -896,30 +893,25 @@ static struct sysrq_key_op sysrq_dbg_op = {  };  #endif -static int kgdb_panic_event(struct notifier_block *self, -			    unsigned long val, -			    void *data) +void kgdb_panic(const char *msg)  { +	if (!kgdb_io_module_registered) +		return; +  	/* -	 * Avoid entering the debugger if we were triggered due to a panic -	 * We don't want to get stuck waiting for input from user in such case. -	 * panic_timeout indicates the system should automatically +	 * We don't want to get stuck waiting for input from user if +	 * "panic_timeout" indicates the system should automatically  	 * reboot on panic.  	 
*/  	if (panic_timeout) -		return NOTIFY_DONE; +		return;  	if (dbg_kdb_mode) -		kdb_printf("PANIC: %s\n", (char *)data); +		kdb_printf("PANIC: %s\n", msg); +  	kgdb_breakpoint(); -	return NOTIFY_DONE;  } -static struct notifier_block kgdb_panic_event_nb = { -       .notifier_call	= kgdb_panic_event, -       .priority	= INT_MAX, -}; -  void __weak kgdb_arch_late(void)  {  } @@ -968,8 +960,6 @@ static void kgdb_register_callbacks(void)  			kgdb_arch_late();  		register_module_notifier(&dbg_module_load_nb);  		register_reboot_notifier(&dbg_reboot_notifier); -		atomic_notifier_chain_register(&panic_notifier_list, -					       &kgdb_panic_event_nb);  #ifdef CONFIG_MAGIC_SYSRQ  		register_sysrq_key('g', &sysrq_dbg_op);  #endif @@ -983,16 +973,14 @@ static void kgdb_register_callbacks(void)  static void kgdb_unregister_callbacks(void)  {  	/* -	 * When this routine is called KGDB should unregister from the -	 * panic handler and clean up, making sure it is not handling any +	 * When this routine is called KGDB should unregister from +	 * handlers and clean up, making sure it is not handling any  	 * break exceptions at the time.  	 */  	if (kgdb_io_module_registered) {  		kgdb_io_module_registered = 0;  		unregister_reboot_notifier(&dbg_reboot_notifier);  		unregister_module_notifier(&dbg_module_load_nb); -		atomic_notifier_chain_unregister(&panic_notifier_list, -					       &kgdb_panic_event_nb);  		kgdb_arch_exit();  #ifdef CONFIG_MAGIC_SYSRQ  		unregister_sysrq_key('g', &sysrq_dbg_op); diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index 9ecfa37c7fbf..4567fe998c30 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -830,7 +830,7 @@ static void parse_grep(const char *str)  	cp++;  	while (isspace(*cp))  		cp++; -	if (strncmp(cp, "grep ", 5)) { +	if (!str_has_prefix(cp, "grep ")) {  		kdb_printf("invalid 'pipe', see grephelp\n");  		return;  	} diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig index 9decbba255fc..73c5c2b8e824 100644 --- a/kernel/dma/Kconfig +++ b/kernel/dma/Kconfig @@ -20,6 +20,15 @@ config ARCH_HAS_DMA_COHERENCE_H  config ARCH_HAS_DMA_SET_MASK  	bool +# +# Select this option if the architecture needs special handling for +# DMA_ATTR_WRITE_COMBINE.  Normally the "uncached" mapping should be what +# people thing of when saying write combine, so very few platforms should +# need to enable this. 
+# +config ARCH_HAS_DMA_WRITE_COMBINE +	bool +  config DMA_DECLARE_COHERENT  	bool @@ -45,9 +54,6 @@ config ARCH_HAS_DMA_PREP_COHERENT  config ARCH_HAS_DMA_COHERENT_TO_PFN  	bool -config ARCH_HAS_DMA_MMAP_PGPROT -	bool -  config ARCH_HAS_FORCE_DMA_UNENCRYPTED  	bool diff --git a/kernel/dma/coherent.c b/kernel/dma/coherent.c index 29fd6590dc1e..545e3869b0e3 100644 --- a/kernel/dma/coherent.c +++ b/kernel/dma/coherent.c @@ -122,18 +122,6 @@ int dma_declare_coherent_memory(struct device *dev, phys_addr_t phys_addr,  		dma_release_coherent_memory(mem);  	return ret;  } -EXPORT_SYMBOL(dma_declare_coherent_memory); - -void dma_release_declared_memory(struct device *dev) -{ -	struct dma_coherent_mem *mem = dev->dma_mem; - -	if (!mem) -		return; -	dma_release_coherent_memory(mem); -	dev->dma_mem = NULL; -} -EXPORT_SYMBOL(dma_release_declared_memory);  static void *__dma_alloc_from_coherent(struct dma_coherent_mem *mem,  		ssize_t size, dma_addr_t *dma_handle) @@ -288,7 +276,6 @@ int dma_mmap_from_dev_coherent(struct device *dev, struct vm_area_struct *vma,  	return __dma_mmap_from_coherent(mem, vma, vaddr, size, ret);  } -EXPORT_SYMBOL(dma_mmap_from_dev_coherent);  int dma_mmap_from_global_coherent(struct vm_area_struct *vma, void *vaddr,  				   size_t size, int *ret) diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 706113c6bebc..8402b29c280f 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -305,7 +305,7 @@ void dma_direct_unmap_page(struct device *dev, dma_addr_t addr,  		dma_direct_sync_single_for_cpu(dev, addr, size, dir);  	if (unlikely(is_swiotlb_buffer(phys))) -		swiotlb_tbl_unmap_single(dev, phys, size, dir, attrs); +		swiotlb_tbl_unmap_single(dev, phys, size, size, dir, attrs);  }  EXPORT_SYMBOL(dma_direct_unmap_page); diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index b0038ca3aa92..d9334f31a5af 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -136,17 +136,29 @@ int dma_common_get_sgtable(struct device *dev, struct sg_table *sgt,  	return ret;  } +/* + * The whole dma_get_sgtable() idea is fundamentally unsafe - it seems + * that the intention is to allow exporting memory allocated via the + * coherent DMA APIs through the dma_buf API, which only accepts a + * scattertable.  This presents a couple of problems: + * 1. Not all memory allocated via the coherent DMA APIs is backed by + *    a struct page + * 2. Passing coherent DMA memory into the streaming APIs is not allowed + *    as we will try to flush the memory through a different alias to that + *    actually being used (and the flushes are redundant.) 
+ */  int dma_get_sgtable_attrs(struct device *dev, struct sg_table *sgt,  		void *cpu_addr, dma_addr_t dma_addr, size_t size,  		unsigned long attrs)  {  	const struct dma_map_ops *ops = get_dma_ops(dev); -	if (!dma_is_direct(ops) && ops->get_sgtable) -		return ops->get_sgtable(dev, sgt, cpu_addr, dma_addr, size, -					attrs); -	return dma_common_get_sgtable(dev, sgt, cpu_addr, dma_addr, size, -			attrs); +	if (dma_is_direct(ops)) +		return dma_common_get_sgtable(dev, sgt, cpu_addr, dma_addr, +				size, attrs); +	if (!ops->get_sgtable) +		return -ENXIO; +	return ops->get_sgtable(dev, sgt, cpu_addr, dma_addr, size, attrs);  }  EXPORT_SYMBOL(dma_get_sgtable_attrs); @@ -161,9 +173,11 @@ pgprot_t dma_pgprot(struct device *dev, pgprot_t prot, unsigned long attrs)  	    (IS_ENABLED(CONFIG_DMA_NONCOHERENT_CACHE_SYNC) &&               (attrs & DMA_ATTR_NON_CONSISTENT)))  		return prot; -	if (IS_ENABLED(CONFIG_ARCH_HAS_DMA_MMAP_PGPROT)) -		return arch_dma_mmap_pgprot(dev, prot, attrs); -	return pgprot_noncached(prot); +#ifdef CONFIG_ARCH_HAS_DMA_WRITE_COMBINE +	if (attrs & DMA_ATTR_WRITE_COMBINE) +		return pgprot_writecombine(prot); +#endif +	return pgprot_dmacoherent(prot);  }  #endif /* CONFIG_MMU */ @@ -174,7 +188,7 @@ int dma_common_mmap(struct device *dev, struct vm_area_struct *vma,  		void *cpu_addr, dma_addr_t dma_addr, size_t size,  		unsigned long attrs)  { -#ifndef CONFIG_ARCH_NO_COHERENT_DMA_MMAP +#ifdef CONFIG_MMU  	unsigned long user_count = vma_pages(vma);  	unsigned long count = PAGE_ALIGN(size) >> PAGE_SHIFT;  	unsigned long off = vma->vm_pgoff; @@ -205,8 +219,29 @@ int dma_common_mmap(struct device *dev, struct vm_area_struct *vma,  			user_count << PAGE_SHIFT, vma->vm_page_prot);  #else  	return -ENXIO; -#endif /* !CONFIG_ARCH_NO_COHERENT_DMA_MMAP */ +#endif /* CONFIG_MMU */ +} + +/** + * dma_can_mmap - check if a given device supports dma_mmap_* + * @dev: device to check + * + * Returns %true if @dev supports dma_mmap_coherent() and dma_mmap_attrs() to + * map DMA allocations to userspace. 
+ */ +bool dma_can_mmap(struct device *dev) +{ +	const struct dma_map_ops *ops = get_dma_ops(dev); + +	if (dma_is_direct(ops)) { +		return IS_ENABLED(CONFIG_MMU) && +		       (dev_is_dma_coherent(dev) || +			IS_ENABLED(CONFIG_ARCH_HAS_DMA_COHERENT_TO_PFN)); +	} + +	return ops->mmap != NULL;  } +EXPORT_SYMBOL_GPL(dma_can_mmap);  /**   * dma_mmap_attrs - map a coherent DMA allocation into user space @@ -227,31 +262,15 @@ int dma_mmap_attrs(struct device *dev, struct vm_area_struct *vma,  {  	const struct dma_map_ops *ops = get_dma_ops(dev); -	if (!dma_is_direct(ops) && ops->mmap) -		return ops->mmap(dev, vma, cpu_addr, dma_addr, size, attrs); -	return dma_common_mmap(dev, vma, cpu_addr, dma_addr, size, attrs); +	if (dma_is_direct(ops)) +		return dma_common_mmap(dev, vma, cpu_addr, dma_addr, size, +				attrs); +	if (!ops->mmap) +		return -ENXIO; +	return ops->mmap(dev, vma, cpu_addr, dma_addr, size, attrs);  }  EXPORT_SYMBOL(dma_mmap_attrs); -static u64 dma_default_get_required_mask(struct device *dev) -{ -	u32 low_totalram = ((max_pfn - 1) << PAGE_SHIFT); -	u32 high_totalram = ((max_pfn - 1) >> (32 - PAGE_SHIFT)); -	u64 mask; - -	if (!high_totalram) { -		/* convert to mask just covering totalram */ -		low_totalram = (1 << (fls(low_totalram) - 1)); -		low_totalram += low_totalram - 1; -		mask = low_totalram; -	} else { -		high_totalram = (1 << (fls(high_totalram) - 1)); -		high_totalram += high_totalram - 1; -		mask = (((u64)high_totalram) << 32) + 0xffffffff; -	} -	return mask; -} -  u64 dma_get_required_mask(struct device *dev)  {  	const struct dma_map_ops *ops = get_dma_ops(dev); @@ -260,7 +279,16 @@ u64 dma_get_required_mask(struct device *dev)  		return dma_direct_get_required_mask(dev);  	if (ops->get_required_mask)  		return ops->get_required_mask(dev); -	return dma_default_get_required_mask(dev); + +	/* +	 * We require every DMA ops implementation to at least support a 32-bit +	 * DMA mask (and use bounce buffering if that isn't supported in +	 * hardware).  As the direct mapping code has its own routine to +	 * actually report an optimal mask we default to 32-bit here as that +	 * is the right thing for most IOMMUs, and at least not actively +	 * harmful in general. 
+	 */ +	return DMA_BIT_MASK(32);  }  EXPORT_SYMBOL_GPL(dma_get_required_mask); @@ -317,12 +345,6 @@ void dma_free_attrs(struct device *dev, size_t size, void *cpu_addr,  }  EXPORT_SYMBOL(dma_free_attrs); -static inline void dma_check_mask(struct device *dev, u64 mask) -{ -	if (sme_active() && (mask < (((u64)sme_get_me_mask() << 1) - 1))) -		dev_warn(dev, "SME is active, device will require DMA bounce buffers\n"); -} -  int dma_supported(struct device *dev, u64 mask)  {  	const struct dma_map_ops *ops = get_dma_ops(dev); @@ -353,7 +375,6 @@ int dma_set_mask(struct device *dev, u64 mask)  		return -EIO;  	arch_dma_set_mask(dev, mask); -	dma_check_mask(dev, mask);  	*dev->dma_mask = mask;  	return 0;  } @@ -371,7 +392,6 @@ int dma_set_coherent_mask(struct device *dev, u64 mask)  	if (!dma_supported(dev, mask))  		return -EIO; -	dma_check_mask(dev, mask);  	dev->coherent_dma_mask = mask;  	return 0;  } @@ -405,3 +425,14 @@ size_t dma_max_mapping_size(struct device *dev)  	return size;  }  EXPORT_SYMBOL_GPL(dma_max_mapping_size); + +unsigned long dma_get_merge_boundary(struct device *dev) +{ +	const struct dma_map_ops *ops = get_dma_ops(dev); + +	if (!ops || !ops->get_merge_boundary) +		return 0;	/* can't merge */ + +	return ops->get_merge_boundary(dev); +} +EXPORT_SYMBOL_GPL(dma_get_merge_boundary); diff --git a/kernel/dma/remap.c b/kernel/dma/remap.c index ffe78f0b2fe4..ca4e5d44b571 100644 --- a/kernel/dma/remap.c +++ b/kernel/dma/remap.c @@ -11,13 +11,21 @@  #include <linux/slab.h>  #include <linux/vmalloc.h> +struct page **dma_common_find_pages(void *cpu_addr) +{ +	struct vm_struct *area = find_vm_area(cpu_addr); + +	if (!area || area->flags != VM_DMA_COHERENT) +		return NULL; +	return area->pages; +} +  static struct vm_struct *__dma_common_pages_remap(struct page **pages, -			size_t size, unsigned long vm_flags, pgprot_t prot, -			const void *caller) +			size_t size, pgprot_t prot, const void *caller)  {  	struct vm_struct *area; -	area = get_vm_area_caller(size, vm_flags, caller); +	area = get_vm_area_caller(size, VM_DMA_COHERENT, caller);  	if (!area)  		return NULL; @@ -34,12 +42,11 @@ static struct vm_struct *__dma_common_pages_remap(struct page **pages,   * Cannot be used in non-sleeping contexts   */  void *dma_common_pages_remap(struct page **pages, size_t size, -			unsigned long vm_flags, pgprot_t prot, -			const void *caller) +			 pgprot_t prot, const void *caller)  {  	struct vm_struct *area; -	area = __dma_common_pages_remap(pages, size, vm_flags, prot, caller); +	area = __dma_common_pages_remap(pages, size, prot, caller);  	if (!area)  		return NULL; @@ -53,7 +60,6 @@ void *dma_common_pages_remap(struct page **pages, size_t size,   * Cannot be used in non-sleeping contexts   */  void *dma_common_contiguous_remap(struct page *page, size_t size, -			unsigned long vm_flags,  			pgprot_t prot, const void *caller)  {  	int i; @@ -67,7 +73,7 @@ void *dma_common_contiguous_remap(struct page *page, size_t size,  	for (i = 0; i < (size >> PAGE_SHIFT); i++)  		pages[i] = nth_page(page, i); -	area = __dma_common_pages_remap(pages, size, vm_flags, prot, caller); +	area = __dma_common_pages_remap(pages, size, prot, caller);  	kfree(pages); @@ -79,11 +85,11 @@ void *dma_common_contiguous_remap(struct page *page, size_t size,  /*   * Unmaps a range previously mapped by dma_common_*_remap   */ -void dma_common_free_remap(void *cpu_addr, size_t size, unsigned long vm_flags) +void dma_common_free_remap(void *cpu_addr, size_t size)  { -	struct vm_struct *area = find_vm_area(cpu_addr); +	struct 
page **pages = dma_common_find_pages(cpu_addr); -	if (!area || (area->flags & vm_flags) != vm_flags) { +	if (!pages) {  		WARN(1, "trying to free invalid coherent area: %p\n", cpu_addr);  		return;  	} @@ -105,7 +111,16 @@ static int __init early_coherent_pool(char *p)  }  early_param("coherent_pool", early_coherent_pool); -int __init dma_atomic_pool_init(gfp_t gfp, pgprot_t prot) +static gfp_t dma_atomic_pool_gfp(void) +{ +	if (IS_ENABLED(CONFIG_ZONE_DMA)) +		return GFP_DMA; +	if (IS_ENABLED(CONFIG_ZONE_DMA32)) +		return GFP_DMA32; +	return GFP_KERNEL; +} + +static int __init dma_atomic_pool_init(void)  {  	unsigned int pool_size_order = get_order(atomic_pool_size);  	unsigned long nr_pages = atomic_pool_size >> PAGE_SHIFT; @@ -117,7 +132,7 @@ int __init dma_atomic_pool_init(gfp_t gfp, pgprot_t prot)  		page = dma_alloc_from_contiguous(NULL, nr_pages,  						 pool_size_order, false);  	else -		page = alloc_pages(gfp, pool_size_order); +		page = alloc_pages(dma_atomic_pool_gfp(), pool_size_order);  	if (!page)  		goto out; @@ -127,8 +142,9 @@ int __init dma_atomic_pool_init(gfp_t gfp, pgprot_t prot)  	if (!atomic_pool)  		goto free_page; -	addr = dma_common_contiguous_remap(page, atomic_pool_size, VM_USERMAP, -					   prot, __builtin_return_address(0)); +	addr = dma_common_contiguous_remap(page, atomic_pool_size, +					   pgprot_dmacoherent(PAGE_KERNEL), +					   __builtin_return_address(0));  	if (!addr)  		goto destroy_genpool; @@ -143,7 +159,7 @@ int __init dma_atomic_pool_init(gfp_t gfp, pgprot_t prot)  	return 0;  remove_mapping: -	dma_common_free_remap(addr, atomic_pool_size, VM_USERMAP); +	dma_common_free_remap(addr, atomic_pool_size);  destroy_genpool:  	gen_pool_destroy(atomic_pool);  	atomic_pool = NULL; @@ -155,6 +171,7 @@ out:  		atomic_pool_size / 1024);  	return -ENOMEM;  } +postcore_initcall(dma_atomic_pool_init);  bool dma_in_atomic_pool(void *start, size_t size)  { @@ -217,7 +234,7 @@ void *arch_dma_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle,  	arch_dma_prep_coherent(page, size);  	/* create a coherent mapping */ -	ret = dma_common_contiguous_remap(page, size, VM_USERMAP, +	ret = dma_common_contiguous_remap(page, size,  			dma_pgprot(dev, PAGE_KERNEL, attrs),  			__builtin_return_address(0));  	if (!ret) { diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 9de232229063..673a2cdb2656 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -444,7 +444,9 @@ static void swiotlb_bounce(phys_addr_t orig_addr, phys_addr_t tlb_addr,  phys_addr_t swiotlb_tbl_map_single(struct device *hwdev,  				   dma_addr_t tbl_dma_addr, -				   phys_addr_t orig_addr, size_t size, +				   phys_addr_t orig_addr, +				   size_t mapping_size, +				   size_t alloc_size,  				   enum dma_data_direction dir,  				   unsigned long attrs)  { @@ -461,8 +463,13 @@ phys_addr_t swiotlb_tbl_map_single(struct device *hwdev,  		panic("Can not allocate SWIOTLB buffer earlier and can't now provide you with the DMA bounce buffer");  	if (mem_encrypt_active()) -		pr_warn_once("%s is active and system is using DMA bounce buffers\n", -			     sme_active() ? 
"SME" : "SEV"); +		pr_warn_once("Memory encryption is active and system is using DMA bounce buffers\n"); + +	if (mapping_size > alloc_size) { +		dev_warn_once(hwdev, "Invalid sizes (mapping: %zd bytes, alloc: %zd bytes)", +			      mapping_size, alloc_size); +		return (phys_addr_t)DMA_MAPPING_ERROR; +	}  	mask = dma_get_seg_boundary(hwdev); @@ -471,8 +478,8 @@ phys_addr_t swiotlb_tbl_map_single(struct device *hwdev,  	offset_slots = ALIGN(tbl_dma_addr, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT;  	/* - 	 * Carefully handle integer overflow which can occur when mask == ~0UL. - 	 */ +	 * Carefully handle integer overflow which can occur when mask == ~0UL. +	 */  	max_slots = mask + 1  		    ? ALIGN(mask + 1, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT  		    : 1UL << (BITS_PER_LONG - IO_TLB_SHIFT); @@ -481,8 +488,8 @@ phys_addr_t swiotlb_tbl_map_single(struct device *hwdev,  	 * For mappings greater than or equal to a page, we limit the stride  	 * (and hence alignment) to a page size.  	 */ -	nslots = ALIGN(size, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT; -	if (size >= PAGE_SIZE) +	nslots = ALIGN(alloc_size, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT; +	if (alloc_size >= PAGE_SIZE)  		stride = (1 << (PAGE_SHIFT - IO_TLB_SHIFT));  	else  		stride = 1; @@ -547,7 +554,7 @@ not_found:  	spin_unlock_irqrestore(&io_tlb_lock, flags);  	if (!(attrs & DMA_ATTR_NO_WARN) && printk_ratelimit())  		dev_warn(hwdev, "swiotlb buffer is full (sz: %zd bytes), total %lu (slots), used %lu (slots)\n", -			 size, io_tlb_nslabs, tmp_io_tlb_used); +			 alloc_size, io_tlb_nslabs, tmp_io_tlb_used);  	return (phys_addr_t)DMA_MAPPING_ERROR;  found:  	io_tlb_used += nslots; @@ -562,7 +569,7 @@ found:  		io_tlb_orig_addr[index+i] = orig_addr + (i << IO_TLB_SHIFT);  	if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) &&  	    (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL)) -		swiotlb_bounce(orig_addr, tlb_addr, size, DMA_TO_DEVICE); +		swiotlb_bounce(orig_addr, tlb_addr, mapping_size, DMA_TO_DEVICE);  	return tlb_addr;  } @@ -571,11 +578,11 @@ found:   * tlb_addr is the physical address of the bounce buffer to unmap.   */  void swiotlb_tbl_unmap_single(struct device *hwdev, phys_addr_t tlb_addr, -			      size_t size, enum dma_data_direction dir, -			      unsigned long attrs) +			      size_t mapping_size, size_t alloc_size, +			      enum dma_data_direction dir, unsigned long attrs)  {  	unsigned long flags; -	int i, count, nslots = ALIGN(size, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT; +	int i, count, nslots = ALIGN(alloc_size, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT;  	int index = (tlb_addr - io_tlb_start) >> IO_TLB_SHIFT;  	phys_addr_t orig_addr = io_tlb_orig_addr[index]; @@ -585,7 +592,7 @@ void swiotlb_tbl_unmap_single(struct device *hwdev, phys_addr_t tlb_addr,  	if (orig_addr != INVALID_PHYS_ADDR &&  	    !(attrs & DMA_ATTR_SKIP_CPU_SYNC) &&  	    ((dir == DMA_FROM_DEVICE) || (dir == DMA_BIDIRECTIONAL))) -		swiotlb_bounce(orig_addr, tlb_addr, size, DMA_FROM_DEVICE); +		swiotlb_bounce(orig_addr, tlb_addr, mapping_size, DMA_FROM_DEVICE);  	/*  	 * Return the buffer to the free list by setting the corresponding @@ -665,14 +672,14 @@ bool swiotlb_map(struct device *dev, phys_addr_t *phys, dma_addr_t *dma_addr,  	/* Oh well, have to allocate and map a bounce buffer. 
*/  	*phys = swiotlb_tbl_map_single(dev, __phys_to_dma(dev, io_tlb_start), -			*phys, size, dir, attrs); +			*phys, size, size, dir, attrs);  	if (*phys == (phys_addr_t)DMA_MAPPING_ERROR)  		return false;  	/* Ensure that the address returned is DMA'ble */  	*dma_addr = __phys_to_dma(dev, *phys);  	if (unlikely(!dma_capable(dev, *dma_addr, size))) { -		swiotlb_tbl_unmap_single(dev, *phys, size, dir, +		swiotlb_tbl_unmap_single(dev, *phys, size, size, dir,  			attrs | DMA_ATTR_SKIP_CPU_SYNC);  		return false;  	} diff --git a/kernel/elfcore.c b/kernel/elfcore.c index fc482c8e0bd8..57fb4dcff434 100644 --- a/kernel/elfcore.c +++ b/kernel/elfcore.c @@ -3,6 +3,7 @@  #include <linux/fs.h>  #include <linux/mm.h>  #include <linux/binfmts.h> +#include <linux/elfcore.h>  Elf_Half __weak elf_core_extra_phdrs(void)  { diff --git a/kernel/events/core.c b/kernel/events/core.c index 0463c1151bae..4655adbbae10 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1103,7 +1103,7 @@ static void __perf_mux_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu)  	cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * interval);  	raw_spin_lock_init(&cpuctx->hrtimer_lock); -	hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED); +	hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED_HARD);  	timer->function = perf_mux_hrtimer_handler;  } @@ -1121,7 +1121,7 @@ static int perf_mux_hrtimer_restart(struct perf_cpu_context *cpuctx)  	if (!cpuctx->hrtimer_active) {  		cpuctx->hrtimer_active = 1;  		hrtimer_forward_now(timer, cpuctx->hrtimer_interval); -		hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED); +		hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED_HARD);  	}  	raw_spin_unlock_irqrestore(&cpuctx->hrtimer_lock, flags); @@ -1887,6 +1887,89 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)  	ctx->generation++;  } +static int +perf_aux_output_match(struct perf_event *event, struct perf_event *aux_event) +{ +	if (!has_aux(aux_event)) +		return 0; + +	if (!event->pmu->aux_output_match) +		return 0; + +	return event->pmu->aux_output_match(aux_event); +} + +static void put_event(struct perf_event *event); +static void event_sched_out(struct perf_event *event, +			    struct perf_cpu_context *cpuctx, +			    struct perf_event_context *ctx); + +static void perf_put_aux_event(struct perf_event *event) +{ +	struct perf_event_context *ctx = event->ctx; +	struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); +	struct perf_event *iter; + +	/* +	 * If event uses aux_event tear down the link +	 */ +	if (event->aux_event) { +		iter = event->aux_event; +		event->aux_event = NULL; +		put_event(iter); +		return; +	} + +	/* +	 * If the event is an aux_event, tear down all links to +	 * it from other events. +	 */ +	for_each_sibling_event(iter, event->group_leader) { +		if (iter->aux_event != event) +			continue; + +		iter->aux_event = NULL; +		put_event(event); + +		/* +		 * If it's ACTIVE, schedule it out and put it into ERROR +		 * state so that we don't try to schedule it again. Note +		 * that perf_event_enable() will clear the ERROR status. +		 */ +		event_sched_out(iter, cpuctx, ctx); +		perf_event_set_state(event, PERF_EVENT_STATE_ERROR); +	} +} + +static int perf_get_aux_event(struct perf_event *event, +			      struct perf_event *group_leader) +{ +	/* +	 * Our group leader must be an aux event if we want to be +	 * an aux_output. This way, the aux event will precede its +	 * aux_output events in the group, and therefore will always +	 * schedule first. 
+	 */ +	if (!group_leader) +		return 0; + +	if (!perf_aux_output_match(event, group_leader)) +		return 0; + +	if (!atomic_long_inc_not_zero(&group_leader->refcount)) +		return 0; + +	/* +	 * Link aux_outputs to their aux event; this is undone in +	 * perf_group_detach() by perf_put_aux_event(). When the +	 * group in torn down, the aux_output events loose their +	 * link to the aux_event and can't schedule any more. +	 */ +	event->aux_event = group_leader; + +	return 1; +} +  static void perf_group_detach(struct perf_event *event)  {  	struct perf_event *sibling, *tmp; @@ -1902,6 +1985,8 @@ static void perf_group_detach(struct perf_event *event)  	event->attach_state &= ~PERF_ATTACH_GROUP; +	perf_put_aux_event(event); +  	/*  	 * If this is a sibling, remove it from its group.  	 */ @@ -2154,7 +2239,7 @@ static void __perf_event_disable(struct perf_event *event,   *   * If event->ctx is a cloned context, callers must make sure that   * every task struct that event->ctx->task could possibly point to - * remains valid.  This condition is satisifed when called through + * remains valid.  This condition is satisfied when called through   * perf_event_for_each_child or perf_event_for_each because they   * hold the top-level event's child_mutex, so any descendant that   * goes to exit will block in perf_event_exit_event(). @@ -4089,10 +4174,8 @@ alloc_perf_context(struct pmu *pmu, struct task_struct *task)  		return NULL;  	__perf_event_init_context(ctx); -	if (task) { -		ctx->task = task; -		get_task_struct(task); -	} +	if (task) +		ctx->task = get_task_struct(task);  	ctx->pmu = pmu;  	return ctx; @@ -5971,7 +6054,7 @@ static void perf_sample_regs_intr(struct perf_regs *regs_intr,   * Get remaining task size from user stack pointer.   *   * It'd be better to take stack vma map and limit this more - * precisly, but there's no way to get it safely under interrupt, + * precisely, but there's no way to get it safely under interrupt,   * so using TASK_SIZE as limit.   */  static u64 perf_ustack_task_size(struct pt_regs *regs) @@ -6533,7 +6616,7 @@ void perf_prepare_sample(struct perf_event_header *header,  	if (sample_type & PERF_SAMPLE_STACK_USER) {  		/* -		 * Either we need PERF_SAMPLE_STACK_USER bit to be allways +		 * Either we need PERF_SAMPLE_STACK_USER bit to be always  		 * processed as the last one or have additional check added  		 * in case new sample type is added, because we could eat  		 * up the rest of the sample size. @@ -9491,7 +9574,7 @@ static void perf_swevent_start_hrtimer(struct perf_event *event)  		period = max_t(u64, 10000, hwc->sample_period);  	}  	hrtimer_start(&hwc->hrtimer, ns_to_ktime(period), -		      HRTIMER_MODE_REL_PINNED); +		      HRTIMER_MODE_REL_PINNED_HARD);  }  static void perf_swevent_cancel_hrtimer(struct perf_event *event) @@ -9513,7 +9596,7 @@ static void perf_swevent_init_hrtimer(struct perf_event *event)  	if (!is_sampling_event(event))  		return; -	hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); +	hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);  	hwc->hrtimer.function = perf_swevent_hrtimer;  	/* @@ -10355,8 +10438,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,  		 * and we cannot use the ctx information because we need the  		 * pmu before we get a ctx.  		 
*/ -		get_task_struct(task); -		event->hw.target = task; +		event->hw.target = get_task_struct(task);  	}  	event->clock = &local_clock; @@ -10426,6 +10508,12 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,  		goto err_ns;  	} +	if (event->attr.aux_output && +	    !(pmu->capabilities & PERF_PMU_CAP_AUX_OUTPUT)) { +		err = -EOPNOTSUPP; +		goto err_pmu; +	} +  	err = exclusive_event_init(event);  	if (err)  		goto err_pmu; @@ -10829,6 +10917,13 @@ SYSCALL_DEFINE5(perf_event_open,  	    perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))  		return -EACCES; +	err = security_locked_down(LOCKDOWN_PERF); +	if (err && (attr.sample_type & PERF_SAMPLE_REGS_INTR)) +		/* REGS_INTR can leak data, lockdown must prevent this */ +		return err; + +	err = 0; +  	/*  	 * In cgroup mode, the pid argument is used to pass the fd  	 * opened to the cgroup directory in cgroupfs. The cpu argument @@ -11082,6 +11177,8 @@ SYSCALL_DEFINE5(perf_event_open,  		}  	} +	if (event->attr.aux_output && !perf_get_aux_event(event, group_leader)) +		goto err_locked;  	/*  	 * Must be under the same ctx::mutex as perf_install_in_context(), diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c index c5cd852fe86b..3cc8416ec844 100644 --- a/kernel/events/hw_breakpoint.c +++ b/kernel/events/hw_breakpoint.c @@ -413,7 +413,7 @@ static int hw_breakpoint_parse(struct perf_event *bp,  int register_perf_hw_breakpoint(struct perf_event *bp)  { -	struct arch_hw_breakpoint hw; +	struct arch_hw_breakpoint hw = { };  	int err;  	err = reserve_bp_slot(bp); @@ -461,7 +461,7 @@ int  modify_user_hw_breakpoint_check(struct perf_event *bp, struct perf_event_attr *attr,  			        bool check)  { -	struct arch_hw_breakpoint hw; +	struct arch_hw_breakpoint hw = { };  	int err;  	err = hw_breakpoint_parse(bp, attr, &hw); diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 84fa00497c49..94d38a39d72e 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -26,6 +26,7 @@  #include <linux/percpu-rwsem.h>  #include <linux/task_work.h>  #include <linux/shmem_fs.h> +#include <linux/khugepaged.h>  #include <linux/uprobes.h> @@ -143,17 +144,19 @@ static loff_t vaddr_to_offset(struct vm_area_struct *vma, unsigned long vaddr)   *   * @vma:      vma that holds the pte pointing to page   * @addr:     address the old @page is mapped at - * @page:     the cowed page we are replacing by kpage - * @kpage:    the modified page we replace page by + * @old_page: the page we are replacing by new_page + * @new_page: the modified page we replace page by   * - * Returns 0 on success, -EFAULT on failure. + * If @new_page is NULL, only unmap @old_page. + * + * Returns 0 on success, negative error code otherwise.   
*/  static int __replace_page(struct vm_area_struct *vma, unsigned long addr,  				struct page *old_page, struct page *new_page)  {  	struct mm_struct *mm = vma->vm_mm;  	struct page_vma_mapped_walk pvmw = { -		.page = old_page, +		.page = compound_head(old_page),  		.vma = vma,  		.address = addr,  	}; @@ -164,12 +167,12 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,  	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, addr,  				addr + PAGE_SIZE); -	VM_BUG_ON_PAGE(PageTransHuge(old_page), old_page); - -	err = mem_cgroup_try_charge(new_page, vma->vm_mm, GFP_KERNEL, &memcg, -			false); -	if (err) -		return err; +	if (new_page) { +		err = mem_cgroup_try_charge(new_page, vma->vm_mm, GFP_KERNEL, +					    &memcg, false); +		if (err) +			return err; +	}  	/* For try_to_free_swap() and munlock_vma_page() below */  	lock_page(old_page); @@ -177,15 +180,20 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,  	mmu_notifier_invalidate_range_start(&range);  	err = -EAGAIN;  	if (!page_vma_mapped_walk(&pvmw)) { -		mem_cgroup_cancel_charge(new_page, memcg, false); +		if (new_page) +			mem_cgroup_cancel_charge(new_page, memcg, false);  		goto unlock;  	}  	VM_BUG_ON_PAGE(addr != pvmw.address, old_page); -	get_page(new_page); -	page_add_new_anon_rmap(new_page, vma, addr, false); -	mem_cgroup_commit_charge(new_page, memcg, false, false); -	lru_cache_add_active_or_unevictable(new_page, vma); +	if (new_page) { +		get_page(new_page); +		page_add_new_anon_rmap(new_page, vma, addr, false); +		mem_cgroup_commit_charge(new_page, memcg, false, false); +		lru_cache_add_active_or_unevictable(new_page, vma); +	} else +		/* no new page, just dec_mm_counter for old_page */ +		dec_mm_counter(mm, MM_ANONPAGES);  	if (!PageAnon(old_page)) {  		dec_mm_counter(mm, mm_counter_file(old_page)); @@ -194,8 +202,9 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,  	flush_cache_page(vma, addr, pte_pfn(*pvmw.pte));  	ptep_clear_flush_notify(vma, addr, pvmw.pte); -	set_pte_at_notify(mm, addr, pvmw.pte, -			mk_pte(new_page, vma->vm_page_prot)); +	if (new_page) +		set_pte_at_notify(mm, addr, pvmw.pte, +				  mk_pte(new_page, vma->vm_page_prot));  	page_remove_rmap(old_page, false);  	if (!page_mapped(old_page)) @@ -464,6 +473,7 @@ int uprobe_write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm,  	struct page *old_page, *new_page;  	struct vm_area_struct *vma;  	int ret, is_register, ref_ctr_updated = 0; +	bool orig_page_huge = false;  	is_register = is_swbp_insn(&opcode);  	uprobe = container_of(auprobe, struct uprobe, arch); @@ -471,7 +481,7 @@ int uprobe_write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm,  retry:  	/* Read the page with vaddr into memory */  	ret = get_user_pages_remote(NULL, mm, vaddr, 1, -			FOLL_FORCE | FOLL_SPLIT, &old_page, &vma, NULL); +			FOLL_FORCE | FOLL_SPLIT_PMD, &old_page, &vma, NULL);  	if (ret <= 0)  		return ret; @@ -488,6 +498,10 @@ retry:  		ref_ctr_updated = 1;  	} +	ret = 0; +	if (!is_register && !PageAnon(old_page)) +		goto put_old; +  	ret = anon_vma_prepare(vma);  	if (ret)  		goto put_old; @@ -501,8 +515,33 @@ retry:  	copy_highpage(new_page, old_page);  	copy_to_page(new_page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE); +	if (!is_register) { +		struct page *orig_page; +		pgoff_t index; + +		VM_BUG_ON_PAGE(!PageAnon(old_page), old_page); + +		index = vaddr_to_offset(vma, vaddr & PAGE_MASK) >> PAGE_SHIFT; +		orig_page = find_get_page(vma->vm_file->f_inode->i_mapping, +					  index); 
+ +		if (orig_page) { +			if (PageUptodate(orig_page) && +			    pages_identical(new_page, orig_page)) { +				/* let go new_page */ +				put_page(new_page); +				new_page = NULL; + +				if (PageCompound(orig_page)) +					orig_page_huge = true; +			} +			put_page(orig_page); +		} +	} +  	ret = __replace_page(vma, vaddr, old_page, new_page); -	put_page(new_page); +	if (new_page) +		put_page(new_page);  put_old:  	put_page(old_page); @@ -513,6 +552,10 @@ put_old:  	if (ret && is_register && ref_ctr_updated)  		update_ref_ctr(uprobe, mm, -1); +	/* try collapse pmd for compound page */ +	if (!ret && orig_page_huge) +		collapse_pte_mapped_thp(mm, vaddr); +  	return ret;  } diff --git a/kernel/exit.c b/kernel/exit.c index 5b4a5dcce8f8..a46a50d67002 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -182,6 +182,11 @@ static void delayed_put_task_struct(struct rcu_head *rhp)  	put_task_struct(tsk);  } +void put_task_struct_rcu_user(struct task_struct *task) +{ +	if (refcount_dec_and_test(&task->rcu_users)) +		call_rcu(&task->rcu, delayed_put_task_struct); +}  void release_task(struct task_struct *p)  { @@ -222,76 +227,13 @@ repeat:  	write_unlock_irq(&tasklist_lock);  	release_thread(p); -	call_rcu(&p->rcu, delayed_put_task_struct); +	put_task_struct_rcu_user(p);  	p = leader;  	if (unlikely(zap_leader))  		goto repeat;  } -/* - * Note that if this function returns a valid task_struct pointer (!NULL) - * task->usage must remain >0 for the duration of the RCU critical section. - */ -struct task_struct *task_rcu_dereference(struct task_struct **ptask) -{ -	struct sighand_struct *sighand; -	struct task_struct *task; - -	/* -	 * We need to verify that release_task() was not called and thus -	 * delayed_put_task_struct() can't run and drop the last reference -	 * before rcu_read_unlock(). We check task->sighand != NULL, -	 * but we can read the already freed and reused memory. -	 */ -retry: -	task = rcu_dereference(*ptask); -	if (!task) -		return NULL; - -	probe_kernel_address(&task->sighand, sighand); - -	/* -	 * Pairs with atomic_dec_and_test() in put_task_struct(). If this task -	 * was already freed we can not miss the preceding update of this -	 * pointer. -	 */ -	smp_rmb(); -	if (unlikely(task != READ_ONCE(*ptask))) -		goto retry; - -	/* -	 * We've re-checked that "task == *ptask", now we have two different -	 * cases: -	 * -	 * 1. This is actually the same task/task_struct. In this case -	 *    sighand != NULL tells us it is still alive. -	 * -	 * 2. This is another task which got the same memory for task_struct. -	 *    We can't know this of course, and we can not trust -	 *    sighand != NULL. -	 * -	 *    In this case we actually return a random value, but this is -	 *    correct. -	 * -	 *    If we return NULL - we can pretend that we actually noticed that -	 *    *ptask was updated when the previous task has exited. Or pretend -	 *    that probe_slab_address(&sighand) reads NULL. -	 * -	 *    If we return the new task (because sighand is not NULL for any -	 *    reason) - this is fine too. This (new) task can't go away before -	 *    another gp pass. -	 * -	 *    And note: We could even eliminate the false positive if re-read -	 *    task->sighand once again to avoid the falsely NULL. But this case -	 *    is very unlikely so we don't care. 
-	 */ -	if (!sighand) -		return NULL; - -	return task; -} -  void rcuwait_wake_up(struct rcuwait *w)  {  	struct task_struct *task; @@ -311,10 +253,6 @@ void rcuwait_wake_up(struct rcuwait *w)  	 */  	smp_mb(); /* (B) */ -	/* -	 * Avoid using task_rcu_dereference() magic as long as we are careful, -	 * see comment in rcuwait_wait_event() regarding ->exit_state. -	 */  	task = rcu_dereference(w->task);  	if (task)  		wake_up_process(task); @@ -1554,6 +1492,23 @@ end:  	return retval;  } +static struct pid *pidfd_get_pid(unsigned int fd) +{ +	struct fd f; +	struct pid *pid; + +	f = fdget(fd); +	if (!f.file) +		return ERR_PTR(-EBADF); + +	pid = pidfd_pid(f.file); +	if (!IS_ERR(pid)) +		get_pid(pid); + +	fdput(f); +	return pid; +} +  static long kernel_waitid(int which, pid_t upid, struct waitid_info *infop,  			  int options, struct rusage *ru)  { @@ -1576,19 +1531,32 @@ static long kernel_waitid(int which, pid_t upid, struct waitid_info *infop,  		type = PIDTYPE_PID;  		if (upid <= 0)  			return -EINVAL; + +		pid = find_get_pid(upid);  		break;  	case P_PGID:  		type = PIDTYPE_PGID; -		if (upid <= 0) +		if (upid < 0)  			return -EINVAL; + +		if (upid) +			pid = find_get_pid(upid); +		else +			pid = get_task_pid(current, PIDTYPE_PGID); +		break; +	case P_PIDFD: +		type = PIDTYPE_PID; +		if (upid < 0) +			return -EINVAL; + +		pid = pidfd_get_pid(upid); +		if (IS_ERR(pid)) +			return PTR_ERR(pid);  		break;  	default:  		return -EINVAL;  	} -	if (type < PIDTYPE_MAX) -		pid = find_get_pid(upid); -  	wo.wo_type	= type;  	wo.wo_pid	= pid;  	wo.wo_flags	= options; diff --git a/kernel/extable.c b/kernel/extable.c index e23cce6e6092..f6c9406eec7d 100644 --- a/kernel/extable.c +++ b/kernel/extable.c @@ -40,13 +40,20 @@ void __init sort_main_extable(void)  	}  } +/* Given an address, look for it in the kernel exception table */ +const +struct exception_table_entry *search_kernel_exception_table(unsigned long addr) +{ +	return search_extable(__start___ex_table, +			      __stop___ex_table - __start___ex_table, addr); +} +  /* Given an address, look for it in the exception tables. */  const struct exception_table_entry *search_exception_tables(unsigned long addr)  {  	const struct exception_table_entry *e; -	e = search_extable(__start___ex_table, -			   __stop___ex_table - __start___ex_table, addr); +	e = search_kernel_exception_table(addr);  	if (!e)  		e = search_module_extables(addr);  	return e; diff --git a/kernel/fork.c b/kernel/fork.c index 2852d0e76ea3..f9572f416126 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -125,6 +125,15 @@ int nr_threads;			/* The idle threads do not count.. 
*/  static int max_threads;		/* tunable limit on nr_threads */ +#define NAMED_ARRAY_INDEX(x)	[x] = __stringify(x) + +static const char * const resident_page_types[] = { +	NAMED_ARRAY_INDEX(MM_FILEPAGES), +	NAMED_ARRAY_INDEX(MM_ANONPAGES), +	NAMED_ARRAY_INDEX(MM_SWAPENTS), +	NAMED_ARRAY_INDEX(MM_SHMEMPAGES), +}; +  DEFINE_PER_CPU(unsigned long, process_counts) = 0;  __cacheline_aligned DEFINE_RWLOCK(tasklist_lock);  /* outer */ @@ -645,12 +654,15 @@ static void check_mm(struct mm_struct *mm)  {  	int i; +	BUILD_BUG_ON_MSG(ARRAY_SIZE(resident_page_types) != NR_MM_COUNTERS, +			 "Please make sure 'struct resident_page_types[]' is updated as well"); +  	for (i = 0; i < NR_MM_COUNTERS; i++) {  		long x = atomic_long_read(&mm->rss_stat.count[i]);  		if (unlikely(x)) -			printk(KERN_ALERT "BUG: Bad rss-counter state " -					  "mm:%p idx:%d val:%ld\n", mm, i, x); +			pr_alert("BUG: Bad rss-counter state mm:%p type:%s val:%ld\n", +				 mm, resident_page_types[i], x);  	}  	if (mm_pgtables_bytes(mm)) @@ -768,6 +780,7 @@ static void set_max_threads(unsigned int max_threads_suggested)  int arch_task_struct_size __read_mostly;  #endif +#ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR  static void task_struct_whitelist(unsigned long *offset, unsigned long *size)  {  	/* Fetch thread_struct whitelist for the architecture. */ @@ -782,6 +795,7 @@ static void task_struct_whitelist(unsigned long *offset, unsigned long *size)  	else  		*offset += offsetof(struct task_struct, thread);  } +#endif /* CONFIG_ARCH_TASK_STRUCT_ALLOCATOR */  void __init fork_init(void)  { @@ -901,10 +915,12 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)  		tsk->cpus_ptr = &tsk->cpus_mask;  	/* -	 * One for us, one for whoever does the "release_task()" (usually -	 * parent) +	 * One for the user space visible state that goes away when reaped. +	 * One for the scheduler.  	 */ -	refcount_set(&tsk->usage, 2); +	refcount_set(&tsk->rcu_users, 2); +	/* One for the rcu users */ +	refcount_set(&tsk->usage, 1);  #ifdef CONFIG_BLK_DEV_IO_TRACE  	tsk->btrace_seq = 0;  #endif @@ -1007,7 +1023,6 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,  	mm_init_owner(mm, p);  	RCU_INIT_POINTER(mm->exe_file, NULL);  	mmu_notifier_mm_init(mm); -	hmm_mm_init(mm);  	init_tlb_flush_pending(mm);  #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS  	mm->pmd_huge_pte = NULL; @@ -1517,28 +1532,17 @@ void __cleanup_sighand(struct sighand_struct *sighand)  	}  } -#ifdef CONFIG_POSIX_TIMERS  /*   * Initialize POSIX timer handling for a thread group.   */  static void posix_cpu_timers_init_group(struct signal_struct *sig)  { +	struct posix_cputimers *pct = &sig->posix_cputimers;  	unsigned long cpu_limit;  	cpu_limit = READ_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur); -	if (cpu_limit != RLIM_INFINITY) { -		sig->cputime_expires.prof_exp = cpu_limit * NSEC_PER_SEC; -		sig->cputimer.running = true; -	} - -	/* The timer lists. */ -	INIT_LIST_HEAD(&sig->cpu_timers[0]); -	INIT_LIST_HEAD(&sig->cpu_timers[1]); -	INIT_LIST_HEAD(&sig->cpu_timers[2]); +	posix_cputimers_group_init(pct, cpu_limit);  } -#else -static inline void posix_cpu_timers_init_group(struct signal_struct *sig) { } -#endif  static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)  { @@ -1640,23 +1644,6 @@ static void rt_mutex_init_task(struct task_struct *p)  #endif  } -#ifdef CONFIG_POSIX_TIMERS -/* - * Initialize POSIX timer handling for a single task. 
- */ -static void posix_cpu_timers_init(struct task_struct *tsk) -{ -	tsk->cputime_expires.prof_exp = 0; -	tsk->cputime_expires.virt_exp = 0; -	tsk->cputime_expires.sched_exp = 0; -	INIT_LIST_HEAD(&tsk->cpu_timers[0]); -	INIT_LIST_HEAD(&tsk->cpu_timers[1]); -	INIT_LIST_HEAD(&tsk->cpu_timers[2]); -} -#else -static inline void posix_cpu_timers_init(struct task_struct *tsk) { } -#endif -  static inline void init_task_pid_links(struct task_struct *task)  {  	enum pid_type type; @@ -1690,6 +1677,14 @@ static inline void rcu_copy_process(struct task_struct *p)  #endif /* #ifdef CONFIG_TASKS_RCU */  } +struct pid *pidfd_pid(const struct file *file) +{ +	if (file->f_op == &pidfd_fops) +		return file->private_data; + +	return ERR_PTR(-EBADF); +} +  static int pidfd_release(struct inode *inode, struct file *file)  {  	struct pid *pid = file->private_data; @@ -1935,7 +1930,7 @@ static __latent_entropy struct task_struct *copy_process(  	task_io_accounting_init(&p->ioac);  	acct_clear_integrals(p); -	posix_cpu_timers_init(p); +	posix_cputimers_init(&p->posix_cputimers);  	p->io_context = NULL;  	audit_set_context(p, NULL); @@ -2338,6 +2333,8 @@ struct mm_struct *copy_init_mm(void)   *   * It copies the process, and if successful kick-starts   * it and waits for it to finish using the VM if required. + * + * args->exit_signal is expected to be checked for sanity by the caller.   */  long _do_fork(struct kernel_clone_args *args)  { @@ -2562,6 +2559,14 @@ noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs,  	if (copy_from_user(&args, uargs, size))  		return -EFAULT; +	/* +	 * Verify that higher 32bits of exit_signal are unset and that +	 * it is a valid signal +	 */ +	if (unlikely((args.exit_signal & ~((u64)CSIGNAL)) || +		     !valid_signal(args.exit_signal))) +		return -EINVAL; +  	*kargs = (struct kernel_clone_args){  		.flags		= args.flags,  		.pidfd		= u64_to_user_ptr(args.pidfd), diff --git a/kernel/futex.c b/kernel/futex.c index 6d50728ef2e7..bd18f60e4c6c 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -487,11 +487,9 @@ futex_setup_timer(ktime_t *time, struct hrtimer_sleeper *timeout,  	if (!time)  		return NULL; -	hrtimer_init_on_stack(&timeout->timer, (flags & FLAGS_CLOCKRT) ? -			      CLOCK_REALTIME : CLOCK_MONOTONIC, -			      HRTIMER_MODE_ABS); -	hrtimer_init_sleeper(timeout, current); - +	hrtimer_init_sleeper_on_stack(timeout, (flags & FLAGS_CLOCKRT) ? +				      CLOCK_REALTIME : CLOCK_MONOTONIC, +				      HRTIMER_MODE_ABS);  	/*  	 * If range_ns is 0, calling hrtimer_set_expires_range_ns() is  	 * effectively the same as calling hrtimer_set_expires(). 
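
(Illustration only, not part of the patch: a minimal sketch of the consolidated sleeper setup that the futex_setup_timer() hunk above switches to, assuming the hrtimer_init_sleeper_on_stack()/hrtimer_sleeper_start_expires() signatures shown in these hunks; the function name wait_with_abs_timeout() and its surrounding flow are hypothetical.)

#include <linux/hrtimer.h>

static int wait_with_abs_timeout(ktime_t *abs_timeout)
{
	struct hrtimer_sleeper to;

	if (abs_timeout) {
		/*
		 * One call now replaces the old hrtimer_init_on_stack() +
		 * hrtimer_init_sleeper() pair, mirroring the change to
		 * futex_setup_timer() above.
		 */
		hrtimer_init_sleeper_on_stack(&to, CLOCK_MONOTONIC,
					      HRTIMER_MODE_ABS);
		hrtimer_set_expires(&to.timer, *abs_timeout);
		hrtimer_sleeper_start_expires(&to, HRTIMER_MODE_ABS);
	}

	/* ... sleep here until woken or until the sleeper expires ... */

	if (abs_timeout) {
		hrtimer_cancel(&to.timer);
		destroy_hrtimer_on_stack(&to.timer);
	}
	return 0;
}
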
@@ -2613,7 +2611,7 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q,  	/* Arm the timer */  	if (timeout) -		hrtimer_start_expires(&timeout->timer, HRTIMER_MODE_ABS); +		hrtimer_sleeper_start_expires(timeout, HRTIMER_MODE_ABS);  	/*  	 * If we have been removed from the hash list, then another task @@ -2899,7 +2897,7 @@ retry_private:  	}  	if (unlikely(to)) -		hrtimer_start_expires(&to->timer, HRTIMER_MODE_ABS); +		hrtimer_sleeper_start_expires(to, HRTIMER_MODE_ABS);  	ret = rt_mutex_wait_proxy_lock(&q.pi_state->pi_mutex, to, &rt_waiter); diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig index 3941a9c48f83..060e8e726755 100644 --- a/kernel/gcov/Kconfig +++ b/kernel/gcov/Kconfig @@ -4,7 +4,7 @@ menu "GCOV-based kernel profiling"  config GCOV_KERNEL  	bool "Enable gcov-based kernel profiling"  	depends on DEBUG_FS -	select CONSTRUCTORS if !UML +	select CONSTRUCTORS  	default n  	---help---  	This option enables gcov-based code profiling (e.g. for code coverage diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c index 6fef48033f96..4d89ad4fae3b 100644 --- a/kernel/irq/affinity.c +++ b/kernel/irq/affinity.c @@ -7,6 +7,7 @@  #include <linux/kernel.h>  #include <linux/slab.h>  #include <linux/cpu.h> +#include <linux/sort.h>  static void irq_spread_init_one(struct cpumask *irqmsk, struct cpumask *nmsk,  				unsigned int cpus_per_vec) @@ -94,6 +95,155 @@ static int get_nodes_in_cpumask(cpumask_var_t *node_to_cpumask,  	return nodes;  } +struct node_vectors { +	unsigned id; + +	union { +		unsigned nvectors; +		unsigned ncpus; +	}; +}; + +static int ncpus_cmp_func(const void *l, const void *r) +{ +	const struct node_vectors *ln = l; +	const struct node_vectors *rn = r; + +	return ln->ncpus - rn->ncpus; +} + +/* + * Allocate vector number for each node, so that for each node: + * + * 1) the allocated number is >= 1 + * + * 2) the allocated numbver is <= active CPU number of this node + * + * The actual allocated total vectors may be less than @numvecs when + * active total CPU number is less than @numvecs. + * + * Active CPUs means the CPUs in '@cpu_mask AND @node_to_cpumask[]' + * for each node. + */ +static void alloc_nodes_vectors(unsigned int numvecs, +				cpumask_var_t *node_to_cpumask, +				const struct cpumask *cpu_mask, +				const nodemask_t nodemsk, +				struct cpumask *nmsk, +				struct node_vectors *node_vectors) +{ +	unsigned n, remaining_ncpus = 0; + +	for (n = 0; n < nr_node_ids; n++) { +		node_vectors[n].id = n; +		node_vectors[n].ncpus = UINT_MAX; +	} + +	for_each_node_mask(n, nodemsk) { +		unsigned ncpus; + +		cpumask_and(nmsk, cpu_mask, node_to_cpumask[n]); +		ncpus = cpumask_weight(nmsk); + +		if (!ncpus) +			continue; +		remaining_ncpus += ncpus; +		node_vectors[n].ncpus = ncpus; +	} + +	numvecs = min_t(unsigned, remaining_ncpus, numvecs); + +	sort(node_vectors, nr_node_ids, sizeof(node_vectors[0]), +	     ncpus_cmp_func, NULL); + +	/* +	 * Allocate vectors for each node according to the ratio of this +	 * node's nr_cpus to remaining un-assigned ncpus. 'numvecs' is +	 * bigger than number of active numa nodes. Always start the +	 * allocation from the node with minimized nr_cpus. +	 * +	 * This way guarantees that each active node gets allocated at +	 * least one vector, and the theory is simple: over-allocation +	 * is only done when this node is assigned by one vector, so +	 * other nodes will be allocated >= 1 vector, since 'numvecs' is +	 * bigger than number of numa nodes. 
+	 * +	 * One perfect invariant is that number of allocated vectors for +	 * each node is <= CPU count of this node: +	 * +	 * 1) suppose there are two nodes: A and B +	 * 	ncpu(X) is CPU count of node X +	 * 	vecs(X) is the vector count allocated to node X via this +	 * 	algorithm +	 * +	 * 	ncpu(A) <= ncpu(B) +	 * 	ncpu(A) + ncpu(B) = N +	 * 	vecs(A) + vecs(B) = V +	 * +	 * 	vecs(A) = max(1, round_down(V * ncpu(A) / N)) +	 * 	vecs(B) = V - vecs(A) +	 * +	 * 	both N and V are integer, and 2 <= V <= N, suppose +	 * 	V = N - delta, and 0 <= delta <= N - 2 +	 * +	 * 2) obviously vecs(A) <= ncpu(A) because: +	 * +	 * 	if vecs(A) is 1, then vecs(A) <= ncpu(A) given +	 * 	ncpu(A) >= 1 +	 * +	 * 	otherwise, +	 * 		vecs(A) <= V * ncpu(A) / N <= ncpu(A), given V <= N +	 * +	 * 3) prove how vecs(B) <= ncpu(B): +	 * +	 * 	if round_down(V * ncpu(A) / N) == 0, vecs(B) won't be +	 * 	over-allocated, so vecs(B) <= ncpu(B), +	 * +	 * 	otherwise: +	 * +	 * 	vecs(A) = +	 * 		round_down(V * ncpu(A) / N) = +	 * 		round_down((N - delta) * ncpu(A) / N) = +	 * 		round_down((N * ncpu(A) - delta * ncpu(A)) / N)	 >= +	 * 		round_down((N * ncpu(A) - delta * N) / N)	 = +	 * 		cpu(A) - delta +	 * +	 * 	then: +	 * +	 * 	vecs(A) - V >= ncpu(A) - delta - V +	 * 	=> +	 * 	V - vecs(A) <= V + delta - ncpu(A) +	 * 	=> +	 * 	vecs(B) <= N - ncpu(A) +	 * 	=> +	 * 	vecs(B) <= cpu(B) +	 * +	 * For nodes >= 3, it can be thought as one node and another big +	 * node given that is exactly what this algorithm is implemented, +	 * and we always re-calculate 'remaining_ncpus' & 'numvecs', and +	 * finally for each node X: vecs(X) <= ncpu(X). +	 * +	 */ +	for (n = 0; n < nr_node_ids; n++) { +		unsigned nvectors, ncpus; + +		if (node_vectors[n].ncpus == UINT_MAX) +			continue; + +		WARN_ON_ONCE(numvecs == 0); + +		ncpus = node_vectors[n].ncpus; +		nvectors = max_t(unsigned, 1, +				 numvecs * ncpus / remaining_ncpus); +		WARN_ON_ONCE(nvectors > ncpus); + +		node_vectors[n].nvectors = nvectors; + +		remaining_ncpus -= ncpus; +		numvecs -= nvectors; +	} +} +  static int __irq_build_affinity_masks(unsigned int startvec,  				      unsigned int numvecs,  				      unsigned int firstvec, @@ -102,10 +252,11 @@ static int __irq_build_affinity_masks(unsigned int startvec,  				      struct cpumask *nmsk,  				      struct irq_affinity_desc *masks)  { -	unsigned int n, nodes, cpus_per_vec, extra_vecs, done = 0; +	unsigned int i, n, nodes, cpus_per_vec, extra_vecs, done = 0;  	unsigned int last_affv = firstvec + numvecs;  	unsigned int curvec = startvec;  	nodemask_t nodemsk = NODE_MASK_NONE; +	struct node_vectors *node_vectors;  	if (!cpumask_weight(cpu_mask))  		return 0; @@ -126,42 +277,56 @@ static int __irq_build_affinity_masks(unsigned int startvec,  		return numvecs;  	} -	for_each_node_mask(n, nodemsk) { -		unsigned int ncpus, v, vecs_to_assign, vecs_per_node; +	node_vectors = kcalloc(nr_node_ids, +			       sizeof(struct node_vectors), +			       GFP_KERNEL); +	if (!node_vectors) +		return -ENOMEM; -		/* Spread the vectors per node */ -		vecs_per_node = (numvecs - (curvec - firstvec)) / nodes; +	/* allocate vector number for each node */ +	alloc_nodes_vectors(numvecs, node_to_cpumask, cpu_mask, +			    nodemsk, nmsk, node_vectors); -		/* Get the cpus on this node which are in the mask */ -		cpumask_and(nmsk, cpu_mask, node_to_cpumask[n]); +	for (i = 0; i < nr_node_ids; i++) { +		unsigned int ncpus, v; +		struct node_vectors *nv = &node_vectors[i]; + +		if (nv->nvectors == UINT_MAX) +			continue; -		/* Calculate the number of cpus per 
vector */ +		/* Get the cpus on this node which are in the mask */ +		cpumask_and(nmsk, cpu_mask, node_to_cpumask[nv->id]);  		ncpus = cpumask_weight(nmsk); -		vecs_to_assign = min(vecs_per_node, ncpus); +		if (!ncpus) +			continue; + +		WARN_ON_ONCE(nv->nvectors > ncpus);  		/* Account for rounding errors */ -		extra_vecs = ncpus - vecs_to_assign * (ncpus / vecs_to_assign); +		extra_vecs = ncpus - nv->nvectors * (ncpus / nv->nvectors); -		for (v = 0; curvec < last_affv && v < vecs_to_assign; -		     curvec++, v++) { -			cpus_per_vec = ncpus / vecs_to_assign; +		/* Spread allocated vectors on CPUs of the current node */ +		for (v = 0; v < nv->nvectors; v++, curvec++) { +			cpus_per_vec = ncpus / nv->nvectors;  			/* Account for extra vectors to compensate rounding errors */  			if (extra_vecs) {  				cpus_per_vec++;  				--extra_vecs;  			} + +			/* +			 * wrapping has to be considered given 'startvec' +			 * may start anywhere +			 */ +			if (curvec >= last_affv) +				curvec = firstvec;  			irq_spread_init_one(&masks[curvec].mask, nmsk,  						cpus_per_vec);  		} - -		done += v; -		if (done >= numvecs) -			break; -		if (curvec >= last_affv) -			curvec = firstvec; -		--nodes; +		done += nv->nvectors;  	} +	kfree(node_vectors);  	return done;  } @@ -174,7 +339,7 @@ static int irq_build_affinity_masks(unsigned int startvec, unsigned int numvecs,  				    unsigned int firstvec,  				    struct irq_affinity_desc *masks)  { -	unsigned int curvec = startvec, nr_present, nr_others; +	unsigned int curvec = startvec, nr_present = 0, nr_others = 0;  	cpumask_var_t *node_to_cpumask;  	cpumask_var_t nmsk, npresmsk;  	int ret = -ENOMEM; @@ -189,15 +354,17 @@ static int irq_build_affinity_masks(unsigned int startvec, unsigned int numvecs,  	if (!node_to_cpumask)  		goto fail_npresmsk; -	ret = 0;  	/* Stabilize the cpumasks */  	get_online_cpus();  	build_node_to_cpumask(node_to_cpumask);  	/* Spread on present CPUs starting from affd->pre_vectors */ -	nr_present = __irq_build_affinity_masks(curvec, numvecs, -						firstvec, node_to_cpumask, -						cpu_present_mask, nmsk, masks); +	ret = __irq_build_affinity_masks(curvec, numvecs, firstvec, +					 node_to_cpumask, cpu_present_mask, +					 nmsk, masks); +	if (ret < 0) +		goto fail_build_affinity; +	nr_present = ret;  	/*  	 * Spread on non present CPUs starting from the next vector to be @@ -210,12 +377,16 @@ static int irq_build_affinity_masks(unsigned int startvec, unsigned int numvecs,  	else  		curvec = firstvec + nr_present;  	cpumask_andnot(npresmsk, cpu_possible_mask, cpu_present_mask); -	nr_others = __irq_build_affinity_masks(curvec, numvecs, -					       firstvec, node_to_cpumask, -					       npresmsk, nmsk, masks); +	ret = __irq_build_affinity_masks(curvec, numvecs, firstvec, +					 node_to_cpumask, npresmsk, nmsk, +					 masks); +	if (ret >= 0) +		nr_others = ret; + + fail_build_affinity:  	put_online_cpus(); -	if (nr_present < numvecs) +	if (ret >= 0)  		WARN_ON(nr_present + nr_others < numvecs);  	free_node_to_cpumask(node_to_cpumask); @@ -225,7 +396,7 @@ static int irq_build_affinity_masks(unsigned int startvec, unsigned int numvecs,   fail_nmsk:  	free_cpumask_var(nmsk); -	return ret; +	return ret < 0 ? 
ret : 0;  }  static void default_calc_sets(struct irq_affinity *affd, unsigned int affvecs) diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 3078d0e48bba..132672b74e4b 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -31,7 +31,7 @@ struct irqchip_fwid {  	struct fwnode_handle	fwnode;  	unsigned int		type;  	char			*name; -	void *data; +	phys_addr_t		*pa;  };  #ifdef CONFIG_GENERIC_IRQ_DEBUGFS @@ -62,7 +62,8 @@ EXPORT_SYMBOL_GPL(irqchip_fwnode_ops);   * domain struct.   */  struct fwnode_handle *__irq_domain_alloc_fwnode(unsigned int type, int id, -						const char *name, void *data) +						const char *name, +						phys_addr_t *pa)  {  	struct irqchip_fwid *fwid;  	char *n; @@ -77,7 +78,7 @@ struct fwnode_handle *__irq_domain_alloc_fwnode(unsigned int type, int id,  		n = kasprintf(GFP_KERNEL, "%s-%d", name, id);  		break;  	default: -		n = kasprintf(GFP_KERNEL, "irqchip@%p", data); +		n = kasprintf(GFP_KERNEL, "irqchip@%pa", pa);  		break;  	} @@ -89,7 +90,7 @@ struct fwnode_handle *__irq_domain_alloc_fwnode(unsigned int type, int id,  	fwid->type = type;  	fwid->name = n; -	fwid->data = data; +	fwid->pa = pa;  	fwid->fwnode.ops = &irqchip_fwnode_ops;  	return &fwid->fwnode;  } @@ -148,6 +149,7 @@ struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, int size,  		switch (fwid->type) {  		case IRQCHIP_FWNODE_NAMED:  		case IRQCHIP_FWNODE_NAMED_ID: +			domain->fwnode = fwnode;  			domain->name = kstrdup(fwid->name, GFP_KERNEL);  			if (!domain->name) {  				kfree(domain); diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index e8f7f179bf77..1753486b440c 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -23,7 +23,7 @@  #include "internals.h" -#ifdef CONFIG_IRQ_FORCED_THREADING +#if defined(CONFIG_IRQ_FORCED_THREADING) && !defined(CONFIG_PREEMPT_RT)  __read_mostly bool force_irqthreads;  EXPORT_SYMBOL_GPL(force_irqthreads); @@ -1255,8 +1255,7 @@ setup_irq_thread(struct irqaction *new, unsigned int irq, bool secondary)  	 * the thread dies to avoid that the interrupt code  	 * references an already freed task_struct.  	 */ -	get_task_struct(t); -	new->thread = t; +	new->thread = get_task_struct(t);  	/*  	 * Tell the thread to set its affinity. This is  	 * important for shared interrupt handlers as we do diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c index d6961d3c6f9e..8f557fa1f4fe 100644 --- a/kernel/irq/pm.c +++ b/kernel/irq/pm.c @@ -177,6 +177,26 @@ static void resume_irqs(bool want_early)  }  /** + * rearm_wake_irq - rearm a wakeup interrupt line after signaling wakeup + * @irq: Interrupt to rearm + */ +void rearm_wake_irq(unsigned int irq) +{ +	unsigned long flags; +	struct irq_desc *desc = irq_get_desc_buslock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL); + +	if (!desc || !(desc->istate & IRQS_SUSPENDED) || +	    !irqd_is_wakeup_set(&desc->irq_data)) +		return; + +	desc->istate &= ~IRQS_SUSPENDED; +	irqd_set(&desc->irq_data, IRQD_WAKEUP_ARMED); +	__enable_irq(desc); + +	irq_put_desc_busunlock(desc, flags); +} + +/**   * irq_pm_syscore_ops - enable interrupt lines early   *   * Enable all interrupt lines with %IRQF_EARLY_RESUME set. 
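
(Illustration only, not part of the patch: a hedged usage sketch for the rearm_wake_irq() helper added to kernel/irq/pm.c above; the driver function name and bookkeeping are hypothetical, and only the rearm_wake_irq() call itself reflects the new API.)

#include <linux/interrupt.h>

/*
 * After a wakeup interrupt has been handled while the line is still in the
 * IRQS_SUSPENDED state, re-arm it so that a further event can wake or abort
 * the suspend again.
 */
static void example_wakeup_event_done(unsigned int wake_irq)
{
	/* ... consume/acknowledge the event that caused the wakeup ... */

	rearm_wake_irq(wake_irq);
}
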
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index da9addb8d655..cfc4f088a0e7 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -100,10 +100,6 @@ static int irq_affinity_hint_proc_show(struct seq_file *m, void *v)  	return 0;  } -#ifndef is_affinity_mask_valid -#define is_affinity_mask_valid(val) 1 -#endif -  int no_irq_affinity;  static int irq_affinity_proc_show(struct seq_file *m, void *v)  { @@ -136,11 +132,6 @@ static ssize_t write_irq_affinity(int type, struct file *file,  	if (err)  		goto free_cpumask; -	if (!is_affinity_mask_valid(new_value)) { -		err = -EINVAL; -		goto free_cpumask; -	} -  	/*  	 * Do not allow disabling IRQs completely - it's a too easy  	 * way to make the system unusable accidentally :-) At least @@ -232,11 +223,6 @@ static ssize_t default_affinity_write(struct file *file,  	if (err)  		goto out; -	if (!is_affinity_mask_valid(new_value)) { -		err = -EINVAL; -		goto out; -	} -  	/*  	 * Do not allow disabling IRQs completely - it's a too easy  	 * way to make the system unusable accidentally :-) At least diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c index 95414ad3506a..98c04ca5fa43 100644 --- a/kernel/irq/resend.c +++ b/kernel/irq/resend.c @@ -36,6 +36,8 @@ static void resend_irqs(unsigned long arg)  		irq = find_first_bit(irqs_resend, nr_irqs);  		clear_bit(irq, irqs_resend);  		desc = irq_to_desc(irq); +		if (!desc) +			continue;  		local_irq_disable();  		desc->handle_irq(desc);  		local_irq_enable(); diff --git a/kernel/jump_label.c b/kernel/jump_label.c index df3008419a1d..cdb3ffab128b 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -407,7 +407,9 @@ static bool jump_label_can_update(struct jump_entry *entry, bool init)  		return false;  	if (!kernel_text_address(jump_entry_code(entry))) { -		WARN_ONCE(1, "can't patch jump_label at %pS", (void *)jump_entry_code(entry)); +		WARN_ONCE(!jump_entry_is_init(entry), +			  "can't patch jump_label at %pS", +			  (void *)jump_entry_code(entry));  		return false;  	} diff --git a/kernel/kexec.c b/kernel/kexec.c index 1b018f1a6e0d..bc933c0db9bf 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -206,6 +206,14 @@ static inline int kexec_load_check(unsigned long nr_segments,  		return result;  	/* +	 * kexec can be used to circumvent module loading restrictions, so +	 * prevent loading in that case +	 */ +	result = security_locked_down(LOCKDOWN_KEXEC); +	if (result) +		return result; + +	/*  	 * Verify we have a legal set of flags  	 * This leaves us room for future extensions.  	 */ diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index d5870723b8ad..15d70a90b50d 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -300,6 +300,8 @@ static struct page *kimage_alloc_pages(gfp_t gfp_mask, unsigned int order)  {  	struct page *pages; +	if (fatal_signal_pending(current)) +		return NULL;  	pages = alloc_pages(gfp_mask & ~__GFP_ZERO, order);  	if (pages) {  		unsigned int count, i; diff --git a/kernel/kexec_elf.c b/kernel/kexec_elf.c new file mode 100644 index 000000000000..d3689632e8b9 --- /dev/null +++ b/kernel/kexec_elf.c @@ -0,0 +1,430 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Load ELF vmlinux file for the kexec_file_load syscall. + * + * Copyright (C) 2004  Adam Litke (agl@us.ibm.com) + * Copyright (C) 2004  IBM Corp. + * Copyright (C) 2005  R Sharada (sharada@in.ibm.com) + * Copyright (C) 2006  Mohan Kumar M (mohan@in.ibm.com) + * Copyright (C) 2016  IBM Corporation + * + * Based on kexec-tools' kexec-elf-exec.c and kexec-elf-ppc64.c. 
+ * Heavily modified for the kernel by + * Thiago Jung Bauermann <bauerman@linux.vnet.ibm.com>. + */ + +#define pr_fmt(fmt)	"kexec_elf: " fmt + +#include <linux/elf.h> +#include <linux/kexec.h> +#include <linux/module.h> +#include <linux/slab.h> +#include <linux/types.h> + +static inline bool elf_is_elf_file(const struct elfhdr *ehdr) +{ +	return memcmp(ehdr->e_ident, ELFMAG, SELFMAG) == 0; +} + +static uint64_t elf64_to_cpu(const struct elfhdr *ehdr, uint64_t value) +{ +	if (ehdr->e_ident[EI_DATA] == ELFDATA2LSB) +		value = le64_to_cpu(value); +	else if (ehdr->e_ident[EI_DATA] == ELFDATA2MSB) +		value = be64_to_cpu(value); + +	return value; +} + +static uint32_t elf32_to_cpu(const struct elfhdr *ehdr, uint32_t value) +{ +	if (ehdr->e_ident[EI_DATA] == ELFDATA2LSB) +		value = le32_to_cpu(value); +	else if (ehdr->e_ident[EI_DATA] == ELFDATA2MSB) +		value = be32_to_cpu(value); + +	return value; +} + +static uint16_t elf16_to_cpu(const struct elfhdr *ehdr, uint16_t value) +{ +	if (ehdr->e_ident[EI_DATA] == ELFDATA2LSB) +		value = le16_to_cpu(value); +	else if (ehdr->e_ident[EI_DATA] == ELFDATA2MSB) +		value = be16_to_cpu(value); + +	return value; +} + +/** + * elf_is_ehdr_sane - check that it is safe to use the ELF header + * @buf_len:	size of the buffer in which the ELF file is loaded. + */ +static bool elf_is_ehdr_sane(const struct elfhdr *ehdr, size_t buf_len) +{ +	if (ehdr->e_phnum > 0 && ehdr->e_phentsize != sizeof(struct elf_phdr)) { +		pr_debug("Bad program header size.\n"); +		return false; +	} else if (ehdr->e_shnum > 0 && +		   ehdr->e_shentsize != sizeof(struct elf_shdr)) { +		pr_debug("Bad section header size.\n"); +		return false; +	} else if (ehdr->e_ident[EI_VERSION] != EV_CURRENT || +		   ehdr->e_version != EV_CURRENT) { +		pr_debug("Unknown ELF version.\n"); +		return false; +	} + +	if (ehdr->e_phoff > 0 && ehdr->e_phnum > 0) { +		size_t phdr_size; + +		/* +		 * e_phnum is at most 65535 so calculating the size of the +		 * program header cannot overflow. +		 */ +		phdr_size = sizeof(struct elf_phdr) * ehdr->e_phnum; + +		/* Sanity check the program header table location. */ +		if (ehdr->e_phoff + phdr_size < ehdr->e_phoff) { +			pr_debug("Program headers at invalid location.\n"); +			return false; +		} else if (ehdr->e_phoff + phdr_size > buf_len) { +			pr_debug("Program headers truncated.\n"); +			return false; +		} +	} + +	if (ehdr->e_shoff > 0 && ehdr->e_shnum > 0) { +		size_t shdr_size; + +		/* +		 * e_shnum is at most 65536 so calculating +		 * the size of the section header cannot overflow. +		 */ +		shdr_size = sizeof(struct elf_shdr) * ehdr->e_shnum; + +		/* Sanity check the section header table location. 
*/ +		if (ehdr->e_shoff + shdr_size < ehdr->e_shoff) { +			pr_debug("Section headers at invalid location.\n"); +			return false; +		} else if (ehdr->e_shoff + shdr_size > buf_len) { +			pr_debug("Section headers truncated.\n"); +			return false; +		} +	} + +	return true; +} + +static int elf_read_ehdr(const char *buf, size_t len, struct elfhdr *ehdr) +{ +	struct elfhdr *buf_ehdr; + +	if (len < sizeof(*buf_ehdr)) { +		pr_debug("Buffer is too small to hold ELF header.\n"); +		return -ENOEXEC; +	} + +	memset(ehdr, 0, sizeof(*ehdr)); +	memcpy(ehdr->e_ident, buf, sizeof(ehdr->e_ident)); +	if (!elf_is_elf_file(ehdr)) { +		pr_debug("No ELF header magic.\n"); +		return -ENOEXEC; +	} + +	if (ehdr->e_ident[EI_CLASS] != ELF_CLASS) { +		pr_debug("Not a supported ELF class.\n"); +		return -ENOEXEC; +	} else  if (ehdr->e_ident[EI_DATA] != ELFDATA2LSB && +		ehdr->e_ident[EI_DATA] != ELFDATA2MSB) { +		pr_debug("Not a supported ELF data format.\n"); +		return -ENOEXEC; +	} + +	buf_ehdr = (struct elfhdr *) buf; +	if (elf16_to_cpu(ehdr, buf_ehdr->e_ehsize) != sizeof(*buf_ehdr)) { +		pr_debug("Bad ELF header size.\n"); +		return -ENOEXEC; +	} + +	ehdr->e_type      = elf16_to_cpu(ehdr, buf_ehdr->e_type); +	ehdr->e_machine   = elf16_to_cpu(ehdr, buf_ehdr->e_machine); +	ehdr->e_version   = elf32_to_cpu(ehdr, buf_ehdr->e_version); +	ehdr->e_flags     = elf32_to_cpu(ehdr, buf_ehdr->e_flags); +	ehdr->e_phentsize = elf16_to_cpu(ehdr, buf_ehdr->e_phentsize); +	ehdr->e_phnum     = elf16_to_cpu(ehdr, buf_ehdr->e_phnum); +	ehdr->e_shentsize = elf16_to_cpu(ehdr, buf_ehdr->e_shentsize); +	ehdr->e_shnum     = elf16_to_cpu(ehdr, buf_ehdr->e_shnum); +	ehdr->e_shstrndx  = elf16_to_cpu(ehdr, buf_ehdr->e_shstrndx); + +	switch (ehdr->e_ident[EI_CLASS]) { +	case ELFCLASS64: +		ehdr->e_entry = elf64_to_cpu(ehdr, buf_ehdr->e_entry); +		ehdr->e_phoff = elf64_to_cpu(ehdr, buf_ehdr->e_phoff); +		ehdr->e_shoff = elf64_to_cpu(ehdr, buf_ehdr->e_shoff); +		break; + +	case ELFCLASS32: +		ehdr->e_entry = elf32_to_cpu(ehdr, buf_ehdr->e_entry); +		ehdr->e_phoff = elf32_to_cpu(ehdr, buf_ehdr->e_phoff); +		ehdr->e_shoff = elf32_to_cpu(ehdr, buf_ehdr->e_shoff); +		break; + +	default: +		pr_debug("Unknown ELF class.\n"); +		return -EINVAL; +	} + +	return elf_is_ehdr_sane(ehdr, len) ? 0 : -ENOEXEC; +} + +/** + * elf_is_phdr_sane - check that it is safe to use the program header + * @buf_len:	size of the buffer in which the ELF file is loaded. + */ +static bool elf_is_phdr_sane(const struct elf_phdr *phdr, size_t buf_len) +{ + +	if (phdr->p_offset + phdr->p_filesz < phdr->p_offset) { +		pr_debug("ELF segment location wraps around.\n"); +		return false; +	} else if (phdr->p_offset + phdr->p_filesz > buf_len) { +		pr_debug("ELF segment not in file.\n"); +		return false; +	} else if (phdr->p_paddr + phdr->p_memsz < phdr->p_paddr) { +		pr_debug("ELF segment address wraps around.\n"); +		return false; +	} + +	return true; +} + +static int elf_read_phdr(const char *buf, size_t len, +			 struct kexec_elf_info *elf_info, +			 int idx) +{ +	/* Override the const in proghdrs, we are the ones doing the loading. 
*/ +	struct elf_phdr *phdr = (struct elf_phdr *) &elf_info->proghdrs[idx]; +	const struct elfhdr *ehdr = elf_info->ehdr; +	const char *pbuf; +	struct elf_phdr *buf_phdr; + +	pbuf = buf + elf_info->ehdr->e_phoff + (idx * sizeof(*buf_phdr)); +	buf_phdr = (struct elf_phdr *) pbuf; + +	phdr->p_type   = elf32_to_cpu(elf_info->ehdr, buf_phdr->p_type); +	phdr->p_flags  = elf32_to_cpu(elf_info->ehdr, buf_phdr->p_flags); + +	switch (ehdr->e_ident[EI_CLASS]) { +	case ELFCLASS64: +		phdr->p_offset = elf64_to_cpu(ehdr, buf_phdr->p_offset); +		phdr->p_paddr  = elf64_to_cpu(ehdr, buf_phdr->p_paddr); +		phdr->p_vaddr  = elf64_to_cpu(ehdr, buf_phdr->p_vaddr); +		phdr->p_filesz = elf64_to_cpu(ehdr, buf_phdr->p_filesz); +		phdr->p_memsz  = elf64_to_cpu(ehdr, buf_phdr->p_memsz); +		phdr->p_align  = elf64_to_cpu(ehdr, buf_phdr->p_align); +		break; + +	case ELFCLASS32: +		phdr->p_offset = elf32_to_cpu(ehdr, buf_phdr->p_offset); +		phdr->p_paddr  = elf32_to_cpu(ehdr, buf_phdr->p_paddr); +		phdr->p_vaddr  = elf32_to_cpu(ehdr, buf_phdr->p_vaddr); +		phdr->p_filesz = elf32_to_cpu(ehdr, buf_phdr->p_filesz); +		phdr->p_memsz  = elf32_to_cpu(ehdr, buf_phdr->p_memsz); +		phdr->p_align  = elf32_to_cpu(ehdr, buf_phdr->p_align); +		break; + +	default: +		pr_debug("Unknown ELF class.\n"); +		return -EINVAL; +	} + +	return elf_is_phdr_sane(phdr, len) ? 0 : -ENOEXEC; +} + +/** + * elf_read_phdrs - read the program headers from the buffer + * + * This function assumes that the program header table was checked for sanity. + * Use elf_is_ehdr_sane() if it wasn't. + */ +static int elf_read_phdrs(const char *buf, size_t len, +			  struct kexec_elf_info *elf_info) +{ +	size_t phdr_size, i; +	const struct elfhdr *ehdr = elf_info->ehdr; + +	/* +	 * e_phnum is at most 65535 so calculating the size of the +	 * program header cannot overflow. +	 */ +	phdr_size = sizeof(struct elf_phdr) * ehdr->e_phnum; + +	elf_info->proghdrs = kzalloc(phdr_size, GFP_KERNEL); +	if (!elf_info->proghdrs) +		return -ENOMEM; + +	for (i = 0; i < ehdr->e_phnum; i++) { +		int ret; + +		ret = elf_read_phdr(buf, len, elf_info, i); +		if (ret) { +			kfree(elf_info->proghdrs); +			elf_info->proghdrs = NULL; +			return ret; +		} +	} + +	return 0; +} + +/** + * elf_read_from_buffer - read ELF file and sets up ELF header and ELF info + * @buf:	Buffer to read ELF file from. + * @len:	Size of @buf. + * @ehdr:	Pointer to existing struct which will be populated. + * @elf_info:	Pointer to existing struct which will be populated. + * + * This function allows reading ELF files with different byte order than + * the kernel, byte-swapping the fields as needed. + * + * Return: + * On success returns 0, and the caller should call + * kexec_free_elf_info(elf_info) to free the memory allocated for the section + * and program headers. 
+ */ +static int elf_read_from_buffer(const char *buf, size_t len, +				struct elfhdr *ehdr, +				struct kexec_elf_info *elf_info) +{ +	int ret; + +	ret = elf_read_ehdr(buf, len, ehdr); +	if (ret) +		return ret; + +	elf_info->buffer = buf; +	elf_info->ehdr = ehdr; +	if (ehdr->e_phoff > 0 && ehdr->e_phnum > 0) { +		ret = elf_read_phdrs(buf, len, elf_info); +		if (ret) +			return ret; +	} +	return 0; +} + +/** + * kexec_free_elf_info - free memory allocated by elf_read_from_buffer + */ +void kexec_free_elf_info(struct kexec_elf_info *elf_info) +{ +	kfree(elf_info->proghdrs); +	memset(elf_info, 0, sizeof(*elf_info)); +} +/** + * kexec_build_elf_info - read ELF executable and check that we can use it + */ +int kexec_build_elf_info(const char *buf, size_t len, struct elfhdr *ehdr, +			       struct kexec_elf_info *elf_info) +{ +	int i; +	int ret; + +	ret = elf_read_from_buffer(buf, len, ehdr, elf_info); +	if (ret) +		return ret; + +	/* Big endian vmlinux has type ET_DYN. */ +	if (ehdr->e_type != ET_EXEC && ehdr->e_type != ET_DYN) { +		pr_err("Not an ELF executable.\n"); +		goto error; +	} else if (!elf_info->proghdrs) { +		pr_err("No ELF program header.\n"); +		goto error; +	} + +	for (i = 0; i < ehdr->e_phnum; i++) { +		/* +		 * Kexec does not support loading interpreters. +		 * In addition, this check keeps us from attempting +		 * to kexec ordinary executables. +		 */ +		if (elf_info->proghdrs[i].p_type == PT_INTERP) { +			pr_err("Requires an ELF interpreter.\n"); +			goto error; +		} +	} + +	return 0; +error: +	kexec_free_elf_info(elf_info); +	return -ENOEXEC; +} + + +int kexec_elf_probe(const char *buf, unsigned long len) +{ +	struct elfhdr ehdr; +	struct kexec_elf_info elf_info; +	int ret; + +	ret = kexec_build_elf_info(buf, len, &ehdr, &elf_info); +	if (ret) +		return ret; + +	kexec_free_elf_info(&elf_info); + +	return elf_check_arch(&ehdr) ? 0 : -ENOEXEC; +} + +/** + * kexec_elf_load - load ELF executable image + * @lowest_load_addr:	On return, will be the address where the first PT_LOAD + *			section will be loaded in memory. + * + * Return: + * 0 on success, negative value on failure. + */ +int kexec_elf_load(struct kimage *image, struct elfhdr *ehdr, +			 struct kexec_elf_info *elf_info, +			 struct kexec_buf *kbuf, +			 unsigned long *lowest_load_addr) +{ +	unsigned long lowest_addr = UINT_MAX; +	int ret; +	size_t i; + +	/* Read in the PT_LOAD segments.
*/ +	for (i = 0; i < ehdr->e_phnum; i++) { +		unsigned long load_addr; +		size_t size; +		const struct elf_phdr *phdr; + +		phdr = &elf_info->proghdrs[i]; +		if (phdr->p_type != PT_LOAD) +			continue; + +		size = phdr->p_filesz; +		if (size > phdr->p_memsz) +			size = phdr->p_memsz; + +		kbuf->buffer = (void *) elf_info->buffer + phdr->p_offset; +		kbuf->bufsz = size; +		kbuf->memsz = phdr->p_memsz; +		kbuf->buf_align = phdr->p_align; +		kbuf->buf_min = phdr->p_paddr; +		kbuf->mem = KEXEC_BUF_MEM_UNKNOWN; +		ret = kexec_add_buffer(kbuf); +		if (ret) +			goto out; +		load_addr = kbuf->mem; + +		if (load_addr < lowest_addr) +			lowest_addr = load_addr; +	} + +	*lowest_load_addr = lowest_addr; +	ret = 0; + out: +	return ret; +} diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index b8cc032d5620..79f252af7dee 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -88,7 +88,7 @@ int __weak arch_kimage_file_post_load_cleanup(struct kimage *image)  	return kexec_image_post_load_cleanup_default(image);  } -#ifdef CONFIG_KEXEC_VERIFY_SIG +#ifdef CONFIG_KEXEC_SIG  static int kexec_image_verify_sig_default(struct kimage *image, void *buf,  					  unsigned long buf_len)  { @@ -177,6 +177,59 @@ void kimage_file_post_load_cleanup(struct kimage *image)  	image->image_loader_data = NULL;  } +#ifdef CONFIG_KEXEC_SIG +static int +kimage_validate_signature(struct kimage *image) +{ +	const char *reason; +	int ret; + +	ret = arch_kexec_kernel_verify_sig(image, image->kernel_buf, +					   image->kernel_buf_len); +	switch (ret) { +	case 0: +		break; + +		/* Certain verification errors are non-fatal if we're not +		 * checking errors, provided we aren't mandating that there +		 * must be a valid signature. +		 */ +	case -ENODATA: +		reason = "kexec of unsigned image"; +		goto decide; +	case -ENOPKG: +		reason = "kexec of image with unsupported crypto"; +		goto decide; +	case -ENOKEY: +		reason = "kexec of image with unavailable key"; +	decide: +		if (IS_ENABLED(CONFIG_KEXEC_SIG_FORCE)) { +			pr_notice("%s rejected\n", reason); +			return ret; +		} + +		/* If IMA is guaranteed to appraise a signature on the kexec +		 * image, permit it even if the kernel is otherwise locked +		 * down. +		 */ +		if (!ima_appraise_signature(READING_KEXEC_IMAGE) && +		    security_locked_down(LOCKDOWN_KEXEC)) +			return -EPERM; + +		return 0; + +		/* All other errors are fatal, including nomem, unparseable +		 * signatures and signature check failures - even if signatures +		 * aren't required. +		 */ +	default: +		pr_notice("kernel signature verification failed (%d).\n", ret); +	} + +	return ret; +} +#endif +  /*   * In file mode list of segments is prepared by kernel. 
Copy relevant   * data from user space, do error checking, prepare segment list @@ -186,7 +239,7 @@ kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd,  			     const char __user *cmdline_ptr,  			     unsigned long cmdline_len, unsigned flags)  { -	int ret = 0; +	int ret;  	void *ldata;  	loff_t size; @@ -202,14 +255,11 @@ kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd,  	if (ret)  		goto out; -#ifdef CONFIG_KEXEC_VERIFY_SIG -	ret = arch_kexec_kernel_verify_sig(image, image->kernel_buf, -					   image->kernel_buf_len); -	if (ret) { -		pr_debug("kernel signature verification failed.\n"); +#ifdef CONFIG_KEXEC_SIG +	ret = kimage_validate_signature(image); + +	if (ret)  		goto out; -	} -	pr_debug("kernel signature verification successful.\n");  #endif  	/* It is possible that there no initramfs is being loaded */  	if (!(flags & KEXEC_FILE_NO_INITRAMFS)) { diff --git a/kernel/kprobes.c b/kernel/kprobes.c index d9770a5393c8..53534aa258a6 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -962,8 +962,15 @@ static struct kprobe *alloc_aggr_kprobe(struct kprobe *p)  #ifdef CONFIG_KPROBES_ON_FTRACE  static struct ftrace_ops kprobe_ftrace_ops __read_mostly = {  	.func = kprobe_ftrace_handler, +	.flags = FTRACE_OPS_FL_SAVE_REGS, +}; + +static struct ftrace_ops kprobe_ipmodify_ops __read_mostly = { +	.func = kprobe_ftrace_handler,  	.flags = FTRACE_OPS_FL_SAVE_REGS | FTRACE_OPS_FL_IPMODIFY,  }; + +static int kprobe_ipmodify_enabled;  static int kprobe_ftrace_enabled;  /* Must ensure p->addr is really on ftrace */ @@ -976,58 +983,75 @@ static int prepare_kprobe(struct kprobe *p)  }  /* Caller must lock kprobe_mutex */ -static int arm_kprobe_ftrace(struct kprobe *p) +static int __arm_kprobe_ftrace(struct kprobe *p, struct ftrace_ops *ops, +			       int *cnt)  {  	int ret = 0; -	ret = ftrace_set_filter_ip(&kprobe_ftrace_ops, -				   (unsigned long)p->addr, 0, 0); +	ret = ftrace_set_filter_ip(ops, (unsigned long)p->addr, 0, 0);  	if (ret) {  		pr_debug("Failed to arm kprobe-ftrace at %pS (%d)\n",  			 p->addr, ret);  		return ret;  	} -	if (kprobe_ftrace_enabled == 0) { -		ret = register_ftrace_function(&kprobe_ftrace_ops); +	if (*cnt == 0) { +		ret = register_ftrace_function(ops);  		if (ret) {  			pr_debug("Failed to init kprobe-ftrace (%d)\n", ret);  			goto err_ftrace;  		}  	} -	kprobe_ftrace_enabled++; +	(*cnt)++;  	return ret;  err_ftrace:  	/* -	 * Note: Since kprobe_ftrace_ops has IPMODIFY set, and ftrace requires a -	 * non-empty filter_hash for IPMODIFY ops, we're safe from an accidental -	 * empty filter_hash which would undesirably trace all functions. +	 * At this point, since ops is not registered, we should be safe from +	 * registering an empty filter.  	 */ -	ftrace_set_filter_ip(&kprobe_ftrace_ops, (unsigned long)p->addr, 1, 0); +	ftrace_set_filter_ip(ops, (unsigned long)p->addr, 1, 0);  	return ret;  } +static int arm_kprobe_ftrace(struct kprobe *p) +{ +	bool ipmodify = (p->post_handler != NULL); + +	return __arm_kprobe_ftrace(p, +		ipmodify ? &kprobe_ipmodify_ops : &kprobe_ftrace_ops, +		ipmodify ?
&kprobe_ipmodify_enabled : &kprobe_ftrace_enabled); +} +  /* Caller must lock kprobe_mutex */ -static int disarm_kprobe_ftrace(struct kprobe *p) +static int __disarm_kprobe_ftrace(struct kprobe *p, struct ftrace_ops *ops, +				  int *cnt)  {  	int ret = 0; -	if (kprobe_ftrace_enabled == 1) { -		ret = unregister_ftrace_function(&kprobe_ftrace_ops); +	if (*cnt == 1) { +		ret = unregister_ftrace_function(ops);  		if (WARN(ret < 0, "Failed to unregister kprobe-ftrace (%d)\n", ret))  			return ret;  	} -	kprobe_ftrace_enabled--; +	(*cnt)--; -	ret = ftrace_set_filter_ip(&kprobe_ftrace_ops, -			   (unsigned long)p->addr, 1, 0); +	ret = ftrace_set_filter_ip(ops, (unsigned long)p->addr, 1, 0);  	WARN_ONCE(ret < 0, "Failed to disarm kprobe-ftrace at %pS (%d)\n",  		  p->addr, ret);  	return ret;  } + +static int disarm_kprobe_ftrace(struct kprobe *p) +{ +	bool ipmodify = (p->post_handler != NULL); + +	return __disarm_kprobe_ftrace(p, +		ipmodify ? &kprobe_ipmodify_ops : &kprobe_ftrace_ops, +		ipmodify ? &kprobe_ipmodify_enabled : &kprobe_ftrace_enabled); +}  #else	/* !CONFIG_KPROBES_ON_FTRACE */  #define prepare_kprobe(p)	arch_prepare_kprobe(p)  #define arm_kprobe_ftrace(p)	(-ENODEV) @@ -1514,7 +1538,8 @@ static int check_kprobe_address_safe(struct kprobe *p,  	/* Ensure it is not in reserved area nor out of text */  	if (!kernel_text_address((unsigned long) p->addr) ||  	    within_kprobe_blacklist((unsigned long) p->addr) || -	    jump_label_text_reserved(p->addr, p->addr)) { +	    jump_label_text_reserved(p->addr, p->addr) || +	    find_bug((unsigned long)p->addr)) {  		ret = -EINVAL;  		goto out;  	} @@ -1906,7 +1931,7 @@ int register_kretprobe(struct kretprobe *rp)  	/* Pre-allocate memory for max kretprobe instances */  	if (rp->maxactive <= 0) { -#ifdef CONFIG_PREEMPT +#ifdef CONFIG_PREEMPTION  		rp->maxactive = max_t(unsigned int, 10, 2*num_possible_cpus());  #else  		rp->maxactive = num_possible_cpus(); diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index c4ce08f43bd6..ab4a4606d19b 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -1175,6 +1175,7 @@ err:  	pr_warn("patch '%s' failed for module '%s', refusing to load module '%s'\n",  		patch->mod->name, obj->mod->name, obj->mod->name);  	mod->klp_alive = false; +	obj->mod = NULL;  	klp_cleanup_module_patches_limited(mod, patch);  	mutex_unlock(&klp_mutex); diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 4861cf8e274b..233459c03b5a 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -449,33 +449,101 @@ static void print_lockdep_off(const char *bug_msg)  unsigned long nr_stack_trace_entries;  #ifdef CONFIG_PROVE_LOCKING +/** + * struct lock_trace - single stack backtrace + * @hash_entry:	Entry in a stack_trace_hash[] list. + * @hash:	jhash() of @entries. + * @nr_entries:	Number of entries in @entries. + * @entries:	Actual stack backtrace. + */ +struct lock_trace { +	struct hlist_node	hash_entry; +	u32			hash; +	u32			nr_entries; +	unsigned long		entries[0] __aligned(sizeof(unsigned long)); +}; +#define LOCK_TRACE_SIZE_IN_LONGS				\ +	(sizeof(struct lock_trace) / sizeof(unsigned long))  /* - * Stack-trace: tightly packed array of stack backtrace - * addresses. Protected by the graph_lock. + * Stack-trace: sequence of lock_trace structures. Protected by the graph_lock.   
*/  static unsigned long stack_trace[MAX_STACK_TRACE_ENTRIES]; +static struct hlist_head stack_trace_hash[STACK_TRACE_HASH_SIZE]; + +static bool traces_identical(struct lock_trace *t1, struct lock_trace *t2) +{ +	return t1->hash == t2->hash && t1->nr_entries == t2->nr_entries && +		memcmp(t1->entries, t2->entries, +		       t1->nr_entries * sizeof(t1->entries[0])) == 0; +} -static int save_trace(struct lock_trace *trace) +static struct lock_trace *save_trace(void)  { -	unsigned long *entries = stack_trace + nr_stack_trace_entries; +	struct lock_trace *trace, *t2; +	struct hlist_head *hash_head; +	u32 hash;  	unsigned int max_entries; -	trace->offset = nr_stack_trace_entries; -	max_entries = MAX_STACK_TRACE_ENTRIES - nr_stack_trace_entries; -	trace->nr_entries = stack_trace_save(entries, max_entries, 3); -	nr_stack_trace_entries += trace->nr_entries; +	BUILD_BUG_ON_NOT_POWER_OF_2(STACK_TRACE_HASH_SIZE); +	BUILD_BUG_ON(LOCK_TRACE_SIZE_IN_LONGS >= MAX_STACK_TRACE_ENTRIES); + +	trace = (struct lock_trace *)(stack_trace + nr_stack_trace_entries); +	max_entries = MAX_STACK_TRACE_ENTRIES - nr_stack_trace_entries - +		LOCK_TRACE_SIZE_IN_LONGS; +	trace->nr_entries = stack_trace_save(trace->entries, max_entries, 3); -	if (nr_stack_trace_entries >= MAX_STACK_TRACE_ENTRIES-1) { +	if (nr_stack_trace_entries >= MAX_STACK_TRACE_ENTRIES - +	    LOCK_TRACE_SIZE_IN_LONGS - 1) {  		if (!debug_locks_off_graph_unlock()) -			return 0; +			return NULL;  		print_lockdep_off("BUG: MAX_STACK_TRACE_ENTRIES too low!");  		dump_stack(); -		return 0; +		return NULL;  	} -	return 1; +	hash = jhash(trace->entries, trace->nr_entries * +		     sizeof(trace->entries[0]), 0); +	trace->hash = hash; +	hash_head = stack_trace_hash + (hash & (STACK_TRACE_HASH_SIZE - 1)); +	hlist_for_each_entry(t2, hash_head, hash_entry) { +		if (traces_identical(trace, t2)) +			return t2; +	} +	nr_stack_trace_entries += LOCK_TRACE_SIZE_IN_LONGS + trace->nr_entries; +	hlist_add_head(&trace->hash_entry, hash_head); + +	return trace; +} + +/* Return the number of stack traces in the stack_trace[] array. */ +u64 lockdep_stack_trace_count(void) +{ +	struct lock_trace *trace; +	u64 c = 0; +	int i; + +	for (i = 0; i < ARRAY_SIZE(stack_trace_hash); i++) { +		hlist_for_each_entry(trace, &stack_trace_hash[i], hash_entry) { +			c++; +		} +	} + +	return c; +} + +/* Return the number of stack hash chains that have at least one stack trace. 
*/ +u64 lockdep_stack_hash_count(void) +{ +	u64 c = 0; +	int i; + +	for (i = 0; i < ARRAY_SIZE(stack_trace_hash); i++) +		if (!hlist_empty(&stack_trace_hash[i])) +			c++; + +	return c;  }  #endif @@ -511,7 +579,7 @@ static const char *usage_str[] =  };  #endif -const char * __get_key_name(struct lockdep_subclass_key *key, char *str) +const char *__get_key_name(const struct lockdep_subclass_key *key, char *str)  {  	return kallsyms_lookup((unsigned long)key, NULL, NULL, NULL, str);  } @@ -620,7 +688,7 @@ static void print_lock(struct held_lock *hlock)  		return;  	} -	printk(KERN_CONT "%p", hlock->instance); +	printk(KERN_CONT "%px", hlock->instance);  	print_lock_name(lock);  	printk(KERN_CONT ", at: %pS\n", (void *)hlock->acquire_ip);  } @@ -1235,7 +1303,7 @@ static struct lock_list *alloc_list_entry(void)  static int add_lock_to_list(struct lock_class *this,  			    struct lock_class *links_to, struct list_head *head,  			    unsigned long ip, int distance, -			    struct lock_trace *trace) +			    const struct lock_trace *trace)  {  	struct lock_list *entry;  	/* @@ -1249,7 +1317,7 @@ static int add_lock_to_list(struct lock_class *this,  	entry->class = this;  	entry->links_to = links_to;  	entry->distance = distance; -	entry->trace = *trace; +	entry->trace = trace;  	/*  	 * Both allocation and removal are done under the graph lock; but  	 * iteration is under RCU-sched; see look_up_lock_class() and @@ -1470,11 +1538,10 @@ static inline int __bfs_backwards(struct lock_list *src_entry,  } -static void print_lock_trace(struct lock_trace *trace, unsigned int spaces) +static void print_lock_trace(const struct lock_trace *trace, +			     unsigned int spaces)  { -	unsigned long *entries = stack_trace + trace->offset; - -	stack_trace_print(entries, trace->nr_entries, spaces); +	stack_trace_print(trace->entries, trace->nr_entries, spaces);  }  /* @@ -1489,7 +1556,7 @@ print_circular_bug_entry(struct lock_list *target, int depth)  	printk("\n-> #%u", depth);  	print_lock_name(target->class);  	printk(KERN_CONT ":\n"); -	print_lock_trace(&target->trace, 6); +	print_lock_trace(target->trace, 6);  }  static void @@ -1592,7 +1659,8 @@ static noinline void print_circular_bug(struct lock_list *this,  	if (!debug_locks_off_graph_unlock() || debug_locks_silent)  		return; -	if (!save_trace(&this->trace)) +	this->trace = save_trace(); +	if (!this->trace)  		return;  	depth = get_lock_depth(target); @@ -1715,7 +1783,7 @@ check_path(struct lock_class *target, struct lock_list *src_entry,   */  static noinline int  check_noncircular(struct held_lock *src, struct held_lock *target, -		  struct lock_trace *trace) +		  struct lock_trace **const trace)  {  	int ret;  	struct lock_list *uninitialized_var(target_entry); @@ -1729,13 +1797,13 @@ check_noncircular(struct held_lock *src, struct held_lock *target,  	ret = check_path(hlock_class(target), &src_entry, &target_entry);  	if (unlikely(!ret)) { -		if (!trace->nr_entries) { +		if (!*trace) {  			/*  			 * If save_trace fails here, the printing might  			 * trigger a WARN but because of the !nr_entries it  			 * should not do bad things.  			 
*/ -			save_trace(trace); +			*trace = save_trace();  		}  		print_circular_bug(&src_entry, target_entry, src, target); @@ -1859,7 +1927,7 @@ static void print_lock_class_header(struct lock_class *class, int depth)  			len += printk("%*s   %s", depth, "", usage_str[bit]);  			len += printk(KERN_CONT " at:\n"); -			print_lock_trace(class->usage_traces + bit, len); +			print_lock_trace(class->usage_traces[bit], len);  		}  	}  	printk("%*s }\n", depth, ""); @@ -1884,7 +1952,7 @@ print_shortest_lock_dependencies(struct lock_list *leaf,  	do {  		print_lock_class_header(entry->class, depth);  		printk("%*s ... acquired at:\n", depth, ""); -		print_lock_trace(&entry->trace, 2); +		print_lock_trace(entry->trace, 2);  		printk("\n");  		if (depth == 0 && (entry != root)) { @@ -1995,14 +2063,14 @@ print_bad_irq_dependency(struct task_struct *curr,  	print_lock_name(backwards_entry->class);  	pr_warn("\n... which became %s-irq-safe at:\n", irqclass); -	print_lock_trace(backwards_entry->class->usage_traces + bit1, 1); +	print_lock_trace(backwards_entry->class->usage_traces[bit1], 1);  	pr_warn("\nto a %s-irq-unsafe lock:\n", irqclass);  	print_lock_name(forwards_entry->class);  	pr_warn("\n... which became %s-irq-unsafe at:\n", irqclass);  	pr_warn("..."); -	print_lock_trace(forwards_entry->class->usage_traces + bit2, 1); +	print_lock_trace(forwards_entry->class->usage_traces[bit2], 1);  	pr_warn("\nother info that might help us debug this:\n\n");  	print_irq_lock_scenario(backwards_entry, forwards_entry, @@ -2011,13 +2079,15 @@ print_bad_irq_dependency(struct task_struct *curr,  	lockdep_print_held_locks(curr);  	pr_warn("\nthe dependencies between %s-irq-safe lock and the holding lock:\n", irqclass); -	if (!save_trace(&prev_root->trace)) +	prev_root->trace = save_trace(); +	if (!prev_root->trace)  		return;  	print_shortest_lock_dependencies(backwards_entry, prev_root);  	pr_warn("\nthe dependencies between the lock to be acquired");  	pr_warn(" and %s-irq-unsafe lock:\n", irqclass); -	if (!save_trace(&next_root->trace)) +	next_root->trace = save_trace(); +	if (!next_root->trace)  		return;  	print_shortest_lock_dependencies(forwards_entry, next_root); @@ -2369,7 +2439,8 @@ check_deadlock(struct task_struct *curr, struct held_lock *next)   */  static int  check_prev_add(struct task_struct *curr, struct held_lock *prev, -	       struct held_lock *next, int distance, struct lock_trace *trace) +	       struct held_lock *next, int distance, +	       struct lock_trace **const trace)  {  	struct lock_list *entry;  	int ret; @@ -2444,8 +2515,11 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,  		return ret;  #endif -	if (!trace->nr_entries && !save_trace(trace)) -		return 0; +	if (!*trace) { +		*trace = save_trace(); +		if (!*trace) +			return 0; +	}  	/*  	 * Ok, all validations passed, add the new lock @@ -2453,14 +2527,14 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,  	 */  	ret = add_lock_to_list(hlock_class(next), hlock_class(prev),  			       &hlock_class(prev)->locks_after, -			       next->acquire_ip, distance, trace); +			       next->acquire_ip, distance, *trace);  	if (!ret)  		return 0;  	ret = add_lock_to_list(hlock_class(prev), hlock_class(next),  			       &hlock_class(next)->locks_before, -			       next->acquire_ip, distance, trace); +			       next->acquire_ip, distance, *trace);  	if (!ret)  		return 0; @@ -2476,7 +2550,7 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,  static int  check_prevs_add(struct task_struct *curr, 
struct held_lock *next)  { -	struct lock_trace trace = { .nr_entries = 0 }; +	struct lock_trace *trace = NULL;  	int depth = curr->lockdep_depth;  	struct held_lock *hlock; @@ -3015,7 +3089,7 @@ print_usage_bug(struct task_struct *curr, struct held_lock *this,  	print_lock(this);  	pr_warn("{%s} state was registered at:\n", usage_str[prev_bit]); -	print_lock_trace(hlock_class(this)->usage_traces + prev_bit, 1); +	print_lock_trace(hlock_class(this)->usage_traces[prev_bit], 1);  	print_irqtrace_events(curr);  	pr_warn("\nother info that might help us debug this:\n"); @@ -3096,7 +3170,8 @@ print_irq_inversion_bug(struct task_struct *curr,  	lockdep_print_held_locks(curr);  	pr_warn("\nthe shortest dependencies between 2nd lock and 1st lock:\n"); -	if (!save_trace(&root->trace)) +	root->trace = save_trace(); +	if (!root->trace)  		return;  	print_shortest_lock_dependencies(other, root); @@ -3580,7 +3655,7 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this,  	hlock_class(this)->usage_mask |= new_mask; -	if (!save_trace(hlock_class(this)->usage_traces + new_bit)) +	if (!(hlock_class(this)->usage_traces[new_bit] = save_trace()))  		return 0;  	switch (new_bit) { @@ -5157,6 +5232,12 @@ void __init lockdep_init(void)  		) / 1024  		); +#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) +	printk(" memory used for stack traces: %zu kB\n", +	       (sizeof(stack_trace) + sizeof(stack_trace_hash)) / 1024 +	       ); +#endif +  	printk(" per task-struct memory footprint: %zu bytes\n",  	       sizeof(((struct task_struct *)NULL)->held_locks));  } diff --git a/kernel/locking/lockdep_internals.h b/kernel/locking/lockdep_internals.h index cc83568d5012..18d85aebbb57 100644 --- a/kernel/locking/lockdep_internals.h +++ b/kernel/locking/lockdep_internals.h @@ -92,6 +92,7 @@ static const unsigned long LOCKF_USED_IN_IRQ_READ =  #define MAX_LOCKDEP_ENTRIES	16384UL  #define MAX_LOCKDEP_CHAINS_BITS	15  #define MAX_STACK_TRACE_ENTRIES	262144UL +#define STACK_TRACE_HASH_SIZE	8192  #else  #define MAX_LOCKDEP_ENTRIES	32768UL @@ -102,6 +103,7 @@ static const unsigned long LOCKF_USED_IN_IRQ_READ =   * addresses. Protected by the hash_lock.   
*/  #define MAX_STACK_TRACE_ENTRIES	524288UL +#define STACK_TRACE_HASH_SIZE	16384  #endif  #define MAX_LOCKDEP_CHAINS	(1UL << MAX_LOCKDEP_CHAINS_BITS) @@ -116,7 +118,8 @@ extern struct lock_chain lock_chains[];  extern void get_usage_chars(struct lock_class *class,  			    char usage[LOCK_USAGE_CHARS]); -extern const char * __get_key_name(struct lockdep_subclass_key *key, char *str); +extern const char *__get_key_name(const struct lockdep_subclass_key *key, +				  char *str);  struct lock_class *lock_chain_get_class(struct lock_chain *chain, int i); @@ -137,6 +140,10 @@ extern unsigned int max_bfs_queue_depth;  #ifdef CONFIG_PROVE_LOCKING  extern unsigned long lockdep_count_forward_deps(struct lock_class *);  extern unsigned long lockdep_count_backward_deps(struct lock_class *); +#ifdef CONFIG_TRACE_IRQFLAGS +u64 lockdep_stack_trace_count(void); +u64 lockdep_stack_hash_count(void); +#endif  #else  static inline unsigned long  lockdep_count_forward_deps(struct lock_class *class) diff --git a/kernel/locking/lockdep_proc.c b/kernel/locking/lockdep_proc.c index bda006f8a88b..dadb7b7fba37 100644 --- a/kernel/locking/lockdep_proc.c +++ b/kernel/locking/lockdep_proc.c @@ -285,6 +285,12 @@ static int lockdep_stats_show(struct seq_file *m, void *v)  			nr_process_chains);  	seq_printf(m, " stack-trace entries:           %11lu [max: %lu]\n",  			nr_stack_trace_entries, MAX_STACK_TRACE_ENTRIES); +#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) +	seq_printf(m, " number of stack traces:        %llu\n", +		   lockdep_stack_trace_count()); +	seq_printf(m, " number of stack hash chains:   %llu\n", +		   lockdep_stack_hash_count()); +#endif  	seq_printf(m, " combined max dependencies:     %11u\n",  			(nr_hardirq_chains + 1) *  			(nr_softirq_chains + 1) * @@ -399,7 +405,7 @@ static void seq_lock_time(struct seq_file *m, struct lock_time *lt)  static void seq_stats(struct seq_file *m, struct lock_stat_data *data)  { -	struct lockdep_subclass_key *ckey; +	const struct lockdep_subclass_key *ckey;  	struct lock_class_stats *stats;  	struct lock_class *class;  	const char *cname; diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index 5e069734363c..468a9b8422e3 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -65,11 +65,37 @@ EXPORT_SYMBOL(__mutex_init);  #define MUTEX_FLAGS		0x07 +/* + * Internal helper function; C doesn't allow us to hide it :/ + * + * DO NOT USE (outside of mutex code). 
+ */ +static inline struct task_struct *__mutex_owner(struct mutex *lock) +{ +	return (struct task_struct *)(atomic_long_read(&lock->owner) & ~MUTEX_FLAGS); +} +  static inline struct task_struct *__owner_task(unsigned long owner)  {  	return (struct task_struct *)(owner & ~MUTEX_FLAGS);  } +bool mutex_is_locked(struct mutex *lock) +{ +	return __mutex_owner(lock) != NULL; +} +EXPORT_SYMBOL(mutex_is_locked); + +__must_check enum mutex_trylock_recursive_enum +mutex_trylock_recursive(struct mutex *lock) +{ +	if (unlikely(__mutex_owner(lock) == current)) +		return MUTEX_TRYLOCK_RECURSIVE; + +	return mutex_trylock(lock); +} +EXPORT_SYMBOL(mutex_trylock_recursive); +  static inline unsigned long __owner_flags(unsigned long owner)  {  	return owner & MUTEX_FLAGS; diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h index 89bab079e7a4..e84d21aa0722 100644 --- a/kernel/locking/qspinlock_paravirt.h +++ b/kernel/locking/qspinlock_paravirt.h @@ -269,7 +269,7 @@ pv_wait_early(struct pv_node *prev, int loop)  	if ((loop & PV_PREV_CHECK_MASK) != 0)  		return false; -	return READ_ONCE(prev->state) != vcpu_running || vcpu_is_preempted(prev->cpu); +	return READ_ONCE(prev->state) != vcpu_running;  }  /* diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index fa83d36e30c6..2874bf556162 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -628,8 +628,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,  		}  		/* [10] Grab the next task, i.e. owner of @lock */ -		task = rt_mutex_owner(lock); -		get_task_struct(task); +		task = get_task_struct(rt_mutex_owner(lock));  		raw_spin_lock(&task->pi_lock);  		/* @@ -709,8 +708,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,  	}  	/* [10] Grab the next task, i.e. the owner of @lock */ -	task = rt_mutex_owner(lock); -	get_task_struct(task); +	task = get_task_struct(rt_mutex_owner(lock));  	raw_spin_lock(&task->pi_lock);  	/* [11] requeue the pi waiters if necessary */ diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index bd0f0d05724c..eef04551eae7 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -105,8 +105,9 @@  #ifdef CONFIG_DEBUG_RWSEMS  # define DEBUG_RWSEMS_WARN_ON(c, sem)	do {			\  	if (!debug_locks_silent &&				\ -	    WARN_ONCE(c, "DEBUG_RWSEMS_WARN_ON(%s): count = 0x%lx, owner = 0x%lx, curr 0x%lx, list %sempty\n",\ +	    WARN_ONCE(c, "DEBUG_RWSEMS_WARN_ON(%s): count = 0x%lx, magic = 0x%lx, owner = 0x%lx, curr 0x%lx, list %sempty\n",\  		#c, atomic_long_read(&(sem)->count),		\ +		(unsigned long) sem->magic,			\  		atomic_long_read(&(sem)->owner), (long)current,	\  		list_empty(&(sem)->wait_list) ? "" : "not "))	\  			debug_locks_off();			\ @@ -330,6 +331,9 @@ void __init_rwsem(struct rw_semaphore *sem, const char *name,  	debug_check_no_locks_freed((void *)sem, sizeof(*sem));  	lockdep_init_map(&sem->dep_map, name, key, 0);  #endif +#ifdef CONFIG_DEBUG_RWSEMS +	sem->magic = sem; +#endif  	atomic_long_set(&sem->count, RWSEM_UNLOCKED_VALUE);  	raw_spin_lock_init(&sem->wait_lock);  	INIT_LIST_HEAD(&sem->wait_list); @@ -724,11 +728,12 @@ rwsem_spin_on_owner(struct rw_semaphore *sem, unsigned long nonspinnable)  	rcu_read_lock();  	for (;;) { -		if (atomic_long_read(&sem->count) & RWSEM_FLAG_HANDOFF) { -			state = OWNER_NONSPINNABLE; -			break; -		} - +		/* +		 * When a waiting writer set the handoff flag, it may spin +		 * on the owner as well. Once that writer acquires the lock, +		 * we can spin on it. 
So we don't need to quit even when the +		 * handoff bit is set. +		 */  		new = rwsem_owner_flags(sem, &new_flags);  		if ((new != owner) || (new_flags != flags)) {  			state = rwsem_owner_state(new, new_flags, nonspinnable); @@ -974,6 +979,13 @@ static inline bool rwsem_reader_phase_trylock(struct rw_semaphore *sem,  {  	return false;  } + +static inline int +rwsem_spin_on_owner(struct rw_semaphore *sem, unsigned long nonspinnable) +{ +	return 0; +} +#define OWNER_NULL	1  #endif  /* @@ -1206,6 +1218,18 @@ wait:  		raw_spin_unlock_irq(&sem->wait_lock); +		/* +		 * After setting the handoff bit and failing to acquire +		 * the lock, attempt to spin on owner to accelerate lock +		 * transfer. If the previous owner is an on-cpu writer and it +		 * has just released the lock, OWNER_NULL will be returned. +		 * In this case, we attempt to acquire the lock again +		 * without sleeping. +		 */ +		if ((wstate == WRITER_HANDOFF) && +		    (rwsem_spin_on_owner(sem, 0) == OWNER_NULL)) +			goto trylock_again; +  		/* Block until there are no active lockers. */  		for (;;) {  			if (signal_pending_state(state, current))  				break; @@ -1240,7 +1264,7 @@ wait:  				break;  			}  		} - +trylock_again:  		raw_spin_lock_irq(&sem->wait_lock);  	}  	__set_current_state(TASK_RUNNING); @@ -1338,11 +1362,14 @@ static inline int __down_read_killable(struct rw_semaphore *sem)  static inline int __down_read_trylock(struct rw_semaphore *sem)  { +	long tmp; + +	DEBUG_RWSEMS_WARN_ON(sem->magic != sem, sem); +  	/*  	 * Optimize for the case when the rwsem is not locked at all.  	 */ -	long tmp = RWSEM_UNLOCKED_VALUE; - +	tmp = RWSEM_UNLOCKED_VALUE;  	do {  		if (atomic_long_try_cmpxchg_acquire(&sem->count, &tmp,  					tmp + RWSEM_READER_BIAS)) { @@ -1383,8 +1410,11 @@ static inline int __down_write_killable(struct rw_semaphore *sem)  static inline int __down_write_trylock(struct rw_semaphore *sem)  { -	long tmp = RWSEM_UNLOCKED_VALUE; +	long tmp; +	DEBUG_RWSEMS_WARN_ON(sem->magic != sem, sem); + +	tmp  = RWSEM_UNLOCKED_VALUE;  	if (atomic_long_try_cmpxchg_acquire(&sem->count, &tmp,  					    RWSEM_WRITER_LOCKED)) {  		rwsem_set_owner(sem); @@ -1400,7 +1430,9 @@ inline void __up_read(struct rw_semaphore *sem)  {  	long tmp; +	DEBUG_RWSEMS_WARN_ON(sem->magic != sem, sem);  	DEBUG_RWSEMS_WARN_ON(!is_rwsem_reader_owned(sem), sem); +  	rwsem_clear_reader_owned(sem);  	tmp = atomic_long_add_return_release(-RWSEM_READER_BIAS, &sem->count);  	DEBUG_RWSEMS_WARN_ON(tmp < 0, sem); @@ -1418,12 +1450,14 @@ static inline void __up_write(struct rw_semaphore *sem)  {  	long tmp; +	DEBUG_RWSEMS_WARN_ON(sem->magic != sem, sem);  	/*  	 * sem->owner may differ from current if the ownership is transferred  	 * to an anonymous writer by setting the RWSEM_NONSPINNABLE bits.
*/  	DEBUG_RWSEMS_WARN_ON((rwsem_owner(sem) != current) &&  			    !rwsem_test_oflags(sem, RWSEM_NONSPINNABLE), sem); +  	rwsem_clear_owner(sem);  	tmp = atomic_long_fetch_add_release(-RWSEM_WRITER_LOCKED, &sem->count);  	if (unlikely(tmp & RWSEM_FLAG_WAITERS)) diff --git a/kernel/module.c b/kernel/module.c index 9ee93421269c..ff2d7359a418 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -7,6 +7,7 @@  #include <linux/export.h>  #include <linux/extable.h>  #include <linux/moduleloader.h> +#include <linux/module_signature.h>  #include <linux/trace_events.h>  #include <linux/init.h>  #include <linux/kallsyms.h> @@ -544,12 +545,20 @@ static const char *kernel_symbol_name(const struct kernel_symbol *sym)  #endif  } -static int cmp_name(const void *va, const void *vb) +static const char *kernel_symbol_namespace(const struct kernel_symbol *sym)  { -	const char *a; -	const struct kernel_symbol *b; -	a = va; b = vb; -	return strcmp(a, kernel_symbol_name(b)); +#ifdef CONFIG_HAVE_ARCH_PREL32_RELOCATIONS +	if (!sym->namespace_offset) +		return NULL; +	return offset_to_ptr(&sym->namespace_offset); +#else +	return sym->namespace; +#endif +} + +static int cmp_name(const void *name, const void *sym) +{ +	return strcmp(name, kernel_symbol_name(sym));  }  static bool find_exported_symbol_in_section(const struct symsearch *syms, @@ -1379,6 +1388,41 @@ static inline int same_magic(const char *amagic, const char *bmagic,  }  #endif /* CONFIG_MODVERSIONS */ +static char *get_modinfo(const struct load_info *info, const char *tag); +static char *get_next_modinfo(const struct load_info *info, const char *tag, +			      char *prev); + +static int verify_namespace_is_imported(const struct load_info *info, +					const struct kernel_symbol *sym, +					struct module *mod) +{ +	const char *namespace; +	char *imported_namespace; + +	namespace = kernel_symbol_namespace(sym); +	if (namespace) { +		imported_namespace = get_modinfo(info, "import_ns"); +		while (imported_namespace) { +			if (strcmp(namespace, imported_namespace) == 0) +				return 0; +			imported_namespace = get_next_modinfo( +				info, "import_ns", imported_namespace); +		} +#ifdef CONFIG_MODULE_ALLOW_MISSING_NAMESPACE_IMPORTS +		pr_warn( +#else +		pr_err( +#endif +			"%s: module uses symbol (%s) from namespace %s, but does not import it.\n", +			mod->name, kernel_symbol_name(sym), namespace); +#ifndef CONFIG_MODULE_ALLOW_MISSING_NAMESPACE_IMPORTS +		return -EINVAL; +#endif +	} +	return 0; +} + +  /* Resolve a symbol for this module.  I.e. if we find one, record usage. */  static const struct kernel_symbol *resolve_symbol(struct module *mod,  						  const struct load_info *info, @@ -1407,6 +1451,12 @@ static const struct kernel_symbol *resolve_symbol(struct module *mod,  		goto getname;  	} +	err = verify_namespace_is_imported(info, sym, mod); +	if (err) { +		sym = ERR_PTR(err); +		goto getname; +	} +  	err = ref_module(mod, owner);  	if (err) {  		sym = ERR_PTR(err); @@ -2481,7 +2531,8 @@ static char *next_string(char *string, unsigned long *secsize)  	return string;  } -static char *get_modinfo(struct load_info *info, const char *tag) +static char *get_next_modinfo(const struct load_info *info, const char *tag, +			      char *prev)  {  	char *p;  	unsigned int taglen = strlen(tag); @@ -2492,13 +2543,25 @@ static char *get_modinfo(struct load_info *info, const char *tag)  	 * get_modinfo() calls made before rewrite_section_headers()  	 * must use sh_offset, as sh_addr isn't set!  	 
*/ -	for (p = (char *)info->hdr + infosec->sh_offset; p; p = next_string(p, &size)) { +	char *modinfo = (char *)info->hdr + infosec->sh_offset; + +	if (prev) { +		size -= prev - modinfo; +		modinfo = next_string(prev, &size); +	} + +	for (p = modinfo; p; p = next_string(p, &size)) {  		if (strncmp(p, tag, taglen) == 0 && p[taglen] == '=')  			return p + taglen + 1;  	}  	return NULL;  } +static char *get_modinfo(const struct load_info *info, const char *tag) +{ +	return get_next_modinfo(info, tag, NULL); +} +  static void setup_modinfo(struct module *mod, struct load_info *info)  {  	struct module_attribute *attr; @@ -2776,8 +2839,9 @@ static inline void kmemleak_load_module(const struct module *mod,  #ifdef CONFIG_MODULE_SIG  static int module_sig_check(struct load_info *info, int flags)  { -	int err = -ENOKEY; +	int err = -ENODATA;  	const unsigned long markerlen = sizeof(MODULE_SIG_STRING) - 1; +	const char *reason;  	const void *mod = info->hdr;  	/* @@ -2792,16 +2856,38 @@ static int module_sig_check(struct load_info *info, int flags)  		err = mod_verify_sig(mod, info);  	} -	if (!err) { +	switch (err) { +	case 0:  		info->sig_ok = true;  		return 0; -	} -	/* Not having a signature is only an error if we're strict. */ -	if (err == -ENOKEY && !is_module_sig_enforced()) -		err = 0; +		/* We don't permit modules to be loaded into trusted kernels +		 * without a valid signature on them, but if we're not +		 * enforcing, certain errors are non-fatal. +		 */ +	case -ENODATA: +		reason = "Loading of unsigned module"; +		goto decide; +	case -ENOPKG: +		reason = "Loading of module with unsupported crypto"; +		goto decide; +	case -ENOKEY: +		reason = "Loading of module with unavailable key"; +	decide: +		if (is_module_sig_enforced()) { +			pr_notice("%s is rejected\n", reason); +			return -EKEYREJECTED; +		} -	return err; +		return security_locked_down(LOCKDOWN_MODULE_SIGNATURE); + +		/* All other errors are fatal, including nomem, unparseable +		 * signatures and signature check failures - even if signatures +		 * aren't required. +		 */ +	default: +		return err; +	}  }  #else /* !CONFIG_MODULE_SIG */  static int module_sig_check(struct load_info *info, int flags) diff --git a/kernel/module_signature.c b/kernel/module_signature.c new file mode 100644 index 000000000000..4224a1086b7d --- /dev/null +++ b/kernel/module_signature.c @@ -0,0 +1,46 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * Module signature checker + * + * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + */ + +#include <linux/errno.h> +#include <linux/printk.h> +#include <linux/module_signature.h> +#include <asm/byteorder.h> + +/** + * mod_check_sig - check that the given signature is sane + * + * @ms:		Signature to check. + * @file_len:	Size of the file to which @ms is appended. + * @name:	What is being checked. Used for error messages. 
+ */ +int mod_check_sig(const struct module_signature *ms, size_t file_len, +		  const char *name) +{ +	if (be32_to_cpu(ms->sig_len) >= file_len - sizeof(*ms)) +		return -EBADMSG; + +	if (ms->id_type != PKEY_ID_PKCS7) { +		pr_err("%s: Module is not signed with expected PKCS#7 message\n", +		       name); +		return -ENOPKG; +	} + +	if (ms->algo != 0 || +	    ms->hash != 0 || +	    ms->signer_len != 0 || +	    ms->key_id_len != 0 || +	    ms->__pad[0] != 0 || +	    ms->__pad[1] != 0 || +	    ms->__pad[2] != 0) { +		pr_err("%s: PKCS#7 signature info has unexpected non-zero params\n", +		       name); +		return -EBADMSG; +	} + +	return 0; +} diff --git a/kernel/module_signing.c b/kernel/module_signing.c index b10fb1986ca9..9d9fc678c91d 100644 --- a/kernel/module_signing.c +++ b/kernel/module_signing.c @@ -7,37 +7,13 @@  #include <linux/kernel.h>  #include <linux/errno.h> +#include <linux/module.h> +#include <linux/module_signature.h>  #include <linux/string.h>  #include <linux/verification.h>  #include <crypto/public_key.h>  #include "module-internal.h" -enum pkey_id_type { -	PKEY_ID_PGP,		/* OpenPGP generated key ID */ -	PKEY_ID_X509,		/* X.509 arbitrary subjectKeyIdentifier */ -	PKEY_ID_PKCS7,		/* Signature in PKCS#7 message */ -}; - -/* - * Module signature information block. - * - * The constituents of the signature section are, in order: - * - *	- Signer's name - *	- Key identifier - *	- Signature data - *	- Information block - */ -struct module_signature { -	u8	algo;		/* Public-key crypto algorithm [0] */ -	u8	hash;		/* Digest algorithm [0] */ -	u8	id_type;	/* Key identifier type [PKEY_ID_PKCS7] */ -	u8	signer_len;	/* Length of signer's name [0] */ -	u8	key_id_len;	/* Length of key identifier [0] */ -	u8	__pad[3]; -	__be32	sig_len;	/* Length of signature data */ -}; -  /*   * Verify the signature on a module.   
*/ @@ -45,6 +21,7 @@ int mod_verify_sig(const void *mod, struct load_info *info)  {  	struct module_signature ms;  	size_t sig_len, modlen = info->len; +	int ret;  	pr_devel("==>%s(,%zu)\n", __func__, modlen); @@ -52,32 +29,15 @@ int mod_verify_sig(const void *mod, struct load_info *info)  		return -EBADMSG;  	memcpy(&ms, mod + (modlen - sizeof(ms)), sizeof(ms)); -	modlen -= sizeof(ms); + +	ret = mod_check_sig(&ms, modlen, info->name); +	if (ret) +		return ret;  	sig_len = be32_to_cpu(ms.sig_len); -	if (sig_len >= modlen) -		return -EBADMSG; -	modlen -= sig_len; +	modlen -= sig_len + sizeof(ms);  	info->len = modlen; -	if (ms.id_type != PKEY_ID_PKCS7) { -		pr_err("%s: Module is not signed with expected PKCS#7 message\n", -		       info->name); -		return -ENOPKG; -	} - -	if (ms.algo != 0 || -	    ms.hash != 0 || -	    ms.signer_len != 0 || -	    ms.key_id_len != 0 || -	    ms.__pad[0] != 0 || -	    ms.__pad[1] != 0 || -	    ms.__pad[2] != 0) { -		pr_err("%s: PKCS#7 signature info has unexpected non-zero params\n", -		       info->name); -		return -EBADMSG; -	} -  	return verify_pkcs7_signature(mod, modlen, mod + modlen, sig_len,  				      VERIFY_USE_SECONDARY_KEYRING,  				      VERIFYING_MODULE_SIGNATURE, diff --git a/kernel/padata.c b/kernel/padata.c index 15a8ad63f4ff..c3fec1413295 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -46,18 +46,13 @@ static int padata_index_to_cpu(struct parallel_data *pd, int cpu_index)  	return target_cpu;  } -static int padata_cpu_hash(struct parallel_data *pd) +static int padata_cpu_hash(struct parallel_data *pd, unsigned int seq_nr)  { -	unsigned int seq_nr; -	int cpu_index; -  	/*  	 * Hash the sequence numbers to the cpus by taking  	 * seq_nr mod. number of cpus in use.  	 */ - -	seq_nr = atomic_inc_return(&pd->seq_nr); -	cpu_index = seq_nr % cpumask_weight(pd->cpumask.pcpu); +	int cpu_index = seq_nr % cpumask_weight(pd->cpumask.pcpu);  	return padata_index_to_cpu(pd, cpu_index);  } @@ -94,17 +89,19 @@ static void padata_parallel_worker(struct work_struct *parallel_work)   *   * @pinst: padata instance   * @padata: object to be parallelized - * @cb_cpu: cpu the serialization callback function will run on, - *          must be in the serial cpumask of padata(i.e. cpumask.cbcpu). + * @cb_cpu: pointer to the CPU that the serialization callback function should + *          run on.  If it's not in the serial cpumask of @pinst + *          (i.e. cpumask.cbcpu), this function selects a fallback CPU and if + *          none found, returns -EINVAL.   *   * The parallelization callback function will run with BHs off.   * Note: Every object which is parallelized by padata_do_parallel   * must be seen by padata_do_serial.   */  int padata_do_parallel(struct padata_instance *pinst, -		       struct padata_priv *padata, int cb_cpu) +		       struct padata_priv *padata, int *cb_cpu)  { -	int target_cpu, err; +	int i, cpu, cpu_index, target_cpu, err;  	struct padata_parallel_queue *queue;  	struct parallel_data *pd; @@ -116,8 +113,19 @@ int padata_do_parallel(struct padata_instance *pinst,  	if (!(pinst->flags & PADATA_INIT) || pinst->flags & PADATA_INVALID)  		goto out; -	if (!cpumask_test_cpu(cb_cpu, pd->cpumask.cbcpu)) -		goto out; +	if (!cpumask_test_cpu(*cb_cpu, pd->cpumask.cbcpu)) { +		if (!cpumask_weight(pd->cpumask.cbcpu)) +			goto out; + +		/* Select an alternate fallback CPU and notify the caller. 
*/ +		cpu_index = *cb_cpu % cpumask_weight(pd->cpumask.cbcpu); + +		cpu = cpumask_first(pd->cpumask.cbcpu); +		for (i = 0; i < cpu_index; i++) +			cpu = cpumask_next(cpu, pd->cpumask.cbcpu); + +		*cb_cpu = cpu; +	}  	err =  -EBUSY;  	if ((pinst->flags & PADATA_RESET)) @@ -129,9 +137,10 @@ int padata_do_parallel(struct padata_instance *pinst,  	err = 0;  	atomic_inc(&pd->refcnt);  	padata->pd = pd; -	padata->cb_cpu = cb_cpu; +	padata->cb_cpu = *cb_cpu; -	target_cpu = padata_cpu_hash(pd); +	padata->seq_nr = atomic_inc_return(&pd->seq_nr); +	target_cpu = padata_cpu_hash(pd, padata->seq_nr);  	padata->cpu = target_cpu;  	queue = per_cpu_ptr(pd->pqueue, target_cpu); @@ -139,7 +148,7 @@ int padata_do_parallel(struct padata_instance *pinst,  	list_add_tail(&padata->list, &queue->parallel.list);  	spin_unlock(&queue->parallel.lock); -	queue_work_on(target_cpu, pinst->wq, &queue->work); +	queue_work(pinst->parallel_wq, &queue->work);  out:  	rcu_read_unlock_bh(); @@ -149,63 +158,53 @@ out:  EXPORT_SYMBOL(padata_do_parallel);  /* - * padata_get_next - Get the next object that needs serialization. + * padata_find_next - Find the next object that needs serialization.   *   * Return values are:   *   * A pointer to the control struct of the next object that needs   * serialization, if present in one of the percpu reorder queues.   * - * -EINPROGRESS, if the next object that needs serialization will + * NULL, if the next object that needs serialization will   *  be parallel processed by another cpu and is not yet present in   *  the cpu's reorder queue. - * - * -ENODATA, if this cpu has to do the parallel processing for - *  the next object.   */ -static struct padata_priv *padata_get_next(struct parallel_data *pd) +static struct padata_priv *padata_find_next(struct parallel_data *pd, +					    bool remove_object)  { -	int cpu, num_cpus; -	unsigned int next_nr, next_index;  	struct padata_parallel_queue *next_queue;  	struct padata_priv *padata;  	struct padata_list *reorder; +	int cpu = pd->cpu; -	num_cpus = cpumask_weight(pd->cpumask.pcpu); - -	/* -	 * Calculate the percpu reorder queue and the sequence -	 * number of the next object. -	 */ -	next_nr = pd->processed; -	next_index = next_nr % num_cpus; -	cpu = padata_index_to_cpu(pd, next_index);  	next_queue = per_cpu_ptr(pd->pqueue, cpu); -  	reorder = &next_queue->reorder;  	spin_lock(&reorder->lock); -	if (!list_empty(&reorder->list)) { -		padata = list_entry(reorder->list.next, -				    struct padata_priv, list); - -		list_del_init(&padata->list); -		atomic_dec(&pd->reorder_objects); +	if (list_empty(&reorder->list)) { +		spin_unlock(&reorder->lock); +		return NULL; +	} -		pd->processed++; +	padata = list_entry(reorder->list.next, struct padata_priv, list); +	/* +	 * Checks the rare case where two or more parallel jobs have hashed to +	 * the same CPU and one of the later ones finishes first. 
+	 */ +	if (padata->seq_nr != pd->processed) {  		spin_unlock(&reorder->lock); -		goto out; +		return NULL;  	} -	spin_unlock(&reorder->lock); -	if (__this_cpu_read(pd->pqueue->cpu_index) == next_queue->cpu_index) { -		padata = ERR_PTR(-ENODATA); -		goto out; +	if (remove_object) { +		list_del_init(&padata->list); +		atomic_dec(&pd->reorder_objects); +		++pd->processed; +		pd->cpu = cpumask_next_wrap(cpu, pd->cpumask.pcpu, -1, false);  	} -	padata = ERR_PTR(-EINPROGRESS); -out: +	spin_unlock(&reorder->lock);  	return padata;  } @@ -215,6 +214,7 @@ static void padata_reorder(struct parallel_data *pd)  	struct padata_priv *padata;  	struct padata_serial_queue *squeue;  	struct padata_instance *pinst = pd->pinst; +	struct padata_parallel_queue *next_queue;  	/*  	 * We need to ensure that only one cpu can work on dequeueing of @@ -230,27 +230,16 @@ static void padata_reorder(struct parallel_data *pd)  		return;  	while (1) { -		padata = padata_get_next(pd); +		padata = padata_find_next(pd, true);  		/*  		 * If the next object that needs serialization is parallel  		 * processed by another cpu and is still on it's way to the  		 * cpu's reorder queue, nothing to do for now.  		 */ -		if (PTR_ERR(padata) == -EINPROGRESS) +		if (!padata)  			break; -		/* -		 * This cpu has to do the parallel processing of the next -		 * object. It's waiting in the cpu's parallelization queue, -		 * so exit immediately. -		 */ -		if (PTR_ERR(padata) == -ENODATA) { -			del_timer(&pd->timer); -			spin_unlock_bh(&pd->lock); -			return; -		} -  		cb_cpu = padata->cb_cpu;  		squeue = per_cpu_ptr(pd->squeue, cb_cpu); @@ -258,77 +247,37 @@ static void padata_reorder(struct parallel_data *pd)  		list_add_tail(&padata->list, &squeue->serial.list);  		spin_unlock(&squeue->serial.lock); -		queue_work_on(cb_cpu, pinst->wq, &squeue->work); +		queue_work_on(cb_cpu, pinst->serial_wq, &squeue->work);  	}  	spin_unlock_bh(&pd->lock);  	/*  	 * The next object that needs serialization might have arrived to -	 * the reorder queues in the meantime, we will be called again -	 * from the timer function if no one else cares for it. +	 * the reorder queues in the meantime.  	 * -	 * Ensure reorder_objects is read after pd->lock is dropped so we see -	 * an increment from another task in padata_do_serial.  Pairs with +	 * Ensure reorder queue is read after pd->lock is dropped so we see +	 * new objects from another task in padata_do_serial.  Pairs with  	 * smp_mb__after_atomic in padata_do_serial.  	 */  	smp_mb(); -	if (atomic_read(&pd->reorder_objects) -			&& !(pinst->flags & PADATA_RESET)) -		mod_timer(&pd->timer, jiffies + HZ); -	else -		del_timer(&pd->timer); -	return; +	next_queue = per_cpu_ptr(pd->pqueue, pd->cpu); +	if (!list_empty(&next_queue->reorder.list) && +	    padata_find_next(pd, false)) +		queue_work(pinst->serial_wq, &pd->reorder_work);  }  static void invoke_padata_reorder(struct work_struct *work)  { -	struct padata_parallel_queue *pqueue;  	struct parallel_data *pd;  	local_bh_disable(); -	pqueue = container_of(work, struct padata_parallel_queue, reorder_work); -	pd = pqueue->pd; +	pd = container_of(work, struct parallel_data, reorder_work);  	padata_reorder(pd);  	local_bh_enable();  } -static void padata_reorder_timer(struct timer_list *t) -{ -	struct parallel_data *pd = from_timer(pd, t, timer); -	unsigned int weight; -	int target_cpu, cpu; - -	cpu = get_cpu(); - -	/* We don't lock pd here to not interfere with parallel processing -	 * padata_reorder() calls on other CPUs. 
We just need any CPU out of -	 * the cpumask.pcpu set. It would be nice if it's the right one but -	 * it doesn't matter if we're off to the next one by using an outdated -	 * pd->processed value. -	 */ -	weight = cpumask_weight(pd->cpumask.pcpu); -	target_cpu = padata_index_to_cpu(pd, pd->processed % weight); - -	/* ensure to call the reorder callback on the correct CPU */ -	if (cpu != target_cpu) { -		struct padata_parallel_queue *pqueue; -		struct padata_instance *pinst; - -		/* The timer function is serialized wrt itself -- no locking -		 * needed. -		 */ -		pinst = pd->pinst; -		pqueue = per_cpu_ptr(pd->pqueue, target_cpu); -		queue_work_on(target_cpu, pinst->wq, &pqueue->reorder_work); -	} else { -		padata_reorder(pd); -	} - -	put_cpu(); -} -  static void padata_serial_worker(struct work_struct *serial_work)  {  	struct padata_serial_queue *squeue; @@ -367,47 +316,28 @@ static void padata_serial_worker(struct work_struct *serial_work)   */  void padata_do_serial(struct padata_priv *padata)  { -	int cpu; -	struct padata_parallel_queue *pqueue; -	struct parallel_data *pd; -	int reorder_via_wq = 0; - -	pd = padata->pd; - -	cpu = get_cpu(); - -	/* We need to run on the same CPU padata_do_parallel(.., padata, ..) -	 * was called on -- or, at least, enqueue the padata object into the -	 * correct per-cpu queue. -	 */ -	if (cpu != padata->cpu) { -		reorder_via_wq = 1; -		cpu = padata->cpu; -	} - -	pqueue = per_cpu_ptr(pd->pqueue, cpu); +	struct parallel_data *pd = padata->pd; +	struct padata_parallel_queue *pqueue = per_cpu_ptr(pd->pqueue, +							   padata->cpu); +	struct padata_priv *cur;  	spin_lock(&pqueue->reorder.lock); +	/* Sort in ascending order of sequence number. */ +	list_for_each_entry_reverse(cur, &pqueue->reorder.list, list) +		if (cur->seq_nr < padata->seq_nr) +			break; +	list_add(&padata->list, &cur->list);  	atomic_inc(&pd->reorder_objects); -	list_add_tail(&padata->list, &pqueue->reorder.list);  	spin_unlock(&pqueue->reorder.lock);  	/* -	 * Ensure the atomic_inc of reorder_objects above is ordered correctly +	 * Ensure the addition to the reorder list is ordered correctly  	 * with the trylock of pd->lock in padata_reorder.  Pairs with smp_mb  	 * in padata_reorder.  	 */  	smp_mb__after_atomic(); -	put_cpu(); - -	/* If we're running on the wrong CPU, call padata_reorder() via a -	 * kernel worker. -	 */ -	if (reorder_via_wq) -		queue_work_on(cpu, pd->pinst->wq, &pqueue->reorder_work); -	else -		padata_reorder(pd); +	padata_reorder(pd);  }  EXPORT_SYMBOL(padata_do_serial); @@ -415,17 +345,36 @@ static int padata_setup_cpumasks(struct parallel_data *pd,  				 const struct cpumask *pcpumask,  				 const struct cpumask *cbcpumask)  { -	if (!alloc_cpumask_var(&pd->cpumask.pcpu, GFP_KERNEL)) -		return -ENOMEM; +	struct workqueue_attrs *attrs; +	int err = -ENOMEM; +	if (!alloc_cpumask_var(&pd->cpumask.pcpu, GFP_KERNEL)) +		goto out;  	cpumask_and(pd->cpumask.pcpu, pcpumask, cpu_online_mask); -	if (!alloc_cpumask_var(&pd->cpumask.cbcpu, GFP_KERNEL)) { -		free_cpumask_var(pd->cpumask.pcpu); -		return -ENOMEM; -	} +	if (!alloc_cpumask_var(&pd->cpumask.cbcpu, GFP_KERNEL)) +		goto free_pcpu_mask;  	cpumask_and(pd->cpumask.cbcpu, cbcpumask, cpu_online_mask); + +	attrs = alloc_workqueue_attrs(); +	if (!attrs) +		goto free_cbcpu_mask; + +	/* Restrict parallel_wq workers to pd->cpumask.pcpu. 
*/ +	cpumask_copy(attrs->cpumask, pd->cpumask.pcpu); +	err = apply_workqueue_attrs(pd->pinst->parallel_wq, attrs); +	free_workqueue_attrs(attrs); +	if (err < 0) +		goto free_cbcpu_mask; +  	return 0; + +free_cbcpu_mask: +	free_cpumask_var(pd->cpumask.cbcpu); +free_pcpu_mask: +	free_cpumask_var(pd->cpumask.pcpu); +out: +	return err;  }  static void __padata_list_init(struct padata_list *pd_list) @@ -451,26 +400,15 @@ static void padata_init_squeues(struct parallel_data *pd)  /* Initialize all percpu queues used by parallel workers */  static void padata_init_pqueues(struct parallel_data *pd)  { -	int cpu_index, cpu; +	int cpu;  	struct padata_parallel_queue *pqueue; -	cpu_index = 0; -	for_each_possible_cpu(cpu) { +	for_each_cpu(cpu, pd->cpumask.pcpu) {  		pqueue = per_cpu_ptr(pd->pqueue, cpu); -		if (!cpumask_test_cpu(cpu, pd->cpumask.pcpu)) { -			pqueue->cpu_index = -1; -			continue; -		} - -		pqueue->pd = pd; -		pqueue->cpu_index = cpu_index; -		cpu_index++; -  		__padata_list_init(&pqueue->reorder);  		__padata_list_init(&pqueue->parallel);  		INIT_WORK(&pqueue->work, padata_parallel_worker); -		INIT_WORK(&pqueue->reorder_work, invoke_padata_reorder);  		atomic_set(&pqueue->num_obj, 0);  	}  } @@ -493,17 +431,19 @@ static struct parallel_data *padata_alloc_pd(struct padata_instance *pinst,  	pd->squeue = alloc_percpu(struct padata_serial_queue);  	if (!pd->squeue)  		goto err_free_pqueue; + +	pd->pinst = pinst;  	if (padata_setup_cpumasks(pd, pcpumask, cbcpumask) < 0)  		goto err_free_squeue;  	padata_init_pqueues(pd);  	padata_init_squeues(pd); -	timer_setup(&pd->timer, padata_reorder_timer, 0);  	atomic_set(&pd->seq_nr, -1);  	atomic_set(&pd->reorder_objects, 0);  	atomic_set(&pd->refcnt, 0); -	pd->pinst = pinst;  	spin_lock_init(&pd->lock); +	pd->cpu = cpumask_first(pd->cpumask.pcpu); +	INIT_WORK(&pd->reorder_work, invoke_padata_reorder);  	return pd; @@ -538,8 +478,6 @@ static void padata_flush_queues(struct parallel_data *pd)  		flush_work(&pqueue->work);  	} -	del_timer_sync(&pd->timer); -  	if (atomic_read(&pd->reorder_objects))  		padata_reorder(pd); @@ -883,6 +821,8 @@ static void __padata_free(struct padata_instance *pinst)  	padata_free_pd(pinst->pd);  	free_cpumask_var(pinst->cpumask.pcpu);  	free_cpumask_var(pinst->cpumask.cbcpu); +	destroy_workqueue(pinst->serial_wq); +	destroy_workqueue(pinst->parallel_wq);  	kfree(pinst);  } @@ -1016,13 +956,11 @@ static struct kobj_type padata_attr_type = {   * padata_alloc - allocate and initialize a padata instance and specify   *                cpumasks for serial and parallel workers.   
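For callers, the practical effect of the reworked allocation in the hunks below is that a padata user passes an instance name instead of its own workqueue, and no longer has to hold the CPU hotplug lock around the call. A hypothetical caller conversion might look roughly like this sketch; the "pencrypt" name and the example_* wrappers are made up for illustration and error handling is trimmed.

/* Sketch only, assuming the new padata_alloc_possible(name) form. */
static struct padata_instance *pinst;

static int example_init(void)
{
	pinst = padata_alloc_possible("pencrypt");	/* was: padata_alloc_possible(wq) */
	if (!pinst)
		return -ENOMEM;
	return 0;
}

static void example_exit(void)
{
	/* __padata_free() now also destroys the per-instance workqueues. */
	padata_free(pinst);
}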
* - * @wq: workqueue to use for the allocated padata instance + * @name: used to identify the instance   * @pcpumask: cpumask that will be used for padata parallelization   * @cbcpumask: cpumask that will be used for padata serialization - * - * Must be called from a cpus_read_lock() protected region   */ -static struct padata_instance *padata_alloc(struct workqueue_struct *wq, +static struct padata_instance *padata_alloc(const char *name,  					    const struct cpumask *pcpumask,  					    const struct cpumask *cbcpumask)  { @@ -1033,11 +971,23 @@ static struct padata_instance *padata_alloc(struct workqueue_struct *wq,  	if (!pinst)  		goto err; -	if (!alloc_cpumask_var(&pinst->cpumask.pcpu, GFP_KERNEL)) +	pinst->parallel_wq = alloc_workqueue("%s_parallel", WQ_UNBOUND, 0, +					     name); +	if (!pinst->parallel_wq)  		goto err_free_inst; + +	get_online_cpus(); + +	pinst->serial_wq = alloc_workqueue("%s_serial", WQ_MEM_RECLAIM | +					   WQ_CPU_INTENSIVE, 1, name); +	if (!pinst->serial_wq) +		goto err_put_cpus; + +	if (!alloc_cpumask_var(&pinst->cpumask.pcpu, GFP_KERNEL)) +		goto err_free_serial_wq;  	if (!alloc_cpumask_var(&pinst->cpumask.cbcpu, GFP_KERNEL)) {  		free_cpumask_var(pinst->cpumask.pcpu); -		goto err_free_inst; +		goto err_free_serial_wq;  	}  	if (!padata_validate_cpumask(pinst, pcpumask) ||  	    !padata_validate_cpumask(pinst, cbcpumask)) @@ -1049,8 +999,6 @@ static struct padata_instance *padata_alloc(struct workqueue_struct *wq,  	rcu_assign_pointer(pinst->pd, pd); -	pinst->wq = wq; -  	cpumask_copy(pinst->cpumask.pcpu, pcpumask);  	cpumask_copy(pinst->cpumask.cbcpu, cbcpumask); @@ -1063,11 +1011,19 @@ static struct padata_instance *padata_alloc(struct workqueue_struct *wq,  #ifdef CONFIG_HOTPLUG_CPU  	cpuhp_state_add_instance_nocalls_cpuslocked(hp_online, &pinst->node);  #endif + +	put_online_cpus(); +  	return pinst;  err_free_masks:  	free_cpumask_var(pinst->cpumask.pcpu);  	free_cpumask_var(pinst->cpumask.cbcpu); +err_free_serial_wq: +	destroy_workqueue(pinst->serial_wq); +err_put_cpus: +	put_online_cpus(); +	destroy_workqueue(pinst->parallel_wq);  err_free_inst:  	kfree(pinst);  err: @@ -1079,14 +1035,11 @@ err:   *                         Use the cpu_possible_mask for serial and   *                         parallel workers.   * - * @wq: workqueue to use for the allocated padata instance - * - * Must be called from a cpus_read_lock() protected region + * @name: used to identify the instance   */ -struct padata_instance *padata_alloc_possible(struct workqueue_struct *wq) +struct padata_instance *padata_alloc_possible(const char *name)  { -	lockdep_assert_cpus_held(); -	return padata_alloc(wq, cpu_possible_mask, cpu_possible_mask); +	return padata_alloc(name, cpu_possible_mask, cpu_possible_mask);  }  EXPORT_SYMBOL(padata_alloc_possible); diff --git a/kernel/panic.c b/kernel/panic.c index 057540b6eee9..47e8ebccc22b 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -12,6 +12,7 @@  #include <linux/debug_locks.h>  #include <linux/sched/debug.h>  #include <linux/interrupt.h> +#include <linux/kgdb.h>  #include <linux/kmsg_dump.h>  #include <linux/kallsyms.h>  #include <linux/notifier.h> @@ -220,6 +221,13 @@ void panic(const char *fmt, ...)  #endif  	/* +	 * If kgdb is enabled, give it a chance to run before we stop all +	 * the other CPUs or else we won't be able to debug processes left +	 * running on them. +	 */ +	kgdb_panic(buf); + +	/*  	 * If we have crashed and we have a crash kernel loaded let it handle  	 * everything else.  	 
* If we want to run this after calling panic_notifiers, pass @@ -551,9 +559,6 @@ void __warn(const char *file, int line, void *caller, unsigned taint,  {  	disable_trace_on_warning(); -	if (args) -		pr_warn(CUT_HERE); -  	if (file)  		pr_warn("WARNING: CPU: %d PID: %d at %s:%d %pS\n",  			raw_smp_processor_id(), current->pid, file, line, @@ -591,37 +596,26 @@ void __warn(const char *file, int line, void *caller, unsigned taint,  	add_taint(taint, LOCKDEP_STILL_OK);  } -#ifdef WANT_WARN_ON_SLOWPATH -void warn_slowpath_fmt(const char *file, int line, const char *fmt, ...) +#ifndef __WARN_FLAGS +void warn_slowpath_fmt(const char *file, int line, unsigned taint, +		       const char *fmt, ...)  {  	struct warn_args args; -	args.fmt = fmt; -	va_start(args.args, fmt); -	__warn(file, line, __builtin_return_address(0), TAINT_WARN, NULL, -	       &args); -	va_end(args.args); -} -EXPORT_SYMBOL(warn_slowpath_fmt); +	pr_warn(CUT_HERE); -void warn_slowpath_fmt_taint(const char *file, int line, -			     unsigned taint, const char *fmt, ...) -{ -	struct warn_args args; +	if (!fmt) { +		__warn(file, line, __builtin_return_address(0), taint, +		       NULL, NULL); +		return; +	}  	args.fmt = fmt;  	va_start(args.args, fmt);  	__warn(file, line, __builtin_return_address(0), taint, NULL, &args);  	va_end(args.args);  } -EXPORT_SYMBOL(warn_slowpath_fmt_taint); - -void warn_slowpath_null(const char *file, int line) -{ -	pr_warn(CUT_HERE); -	__warn(file, line, __builtin_return_address(0), TAINT_WARN, NULL, NULL); -} -EXPORT_SYMBOL(warn_slowpath_null); +EXPORT_SYMBOL(warn_slowpath_fmt);  #else  void __warn_printk(const char *fmt, ...)  { diff --git a/kernel/params.c b/kernel/params.c index cf448785d058..8e56f8b12d8f 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -12,6 +12,7 @@  #include <linux/err.h>  #include <linux/slab.h>  #include <linux/ctype.h> +#include <linux/security.h>  #ifdef CONFIG_SYSFS  /* Protects all built-in parameters, modules use their own param_lock */ @@ -96,13 +97,19 @@ bool parameq(const char *a, const char *b)  	return parameqn(a, b, strlen(a)+1);  } -static void param_check_unsafe(const struct kernel_param *kp) +static bool param_check_unsafe(const struct kernel_param *kp)  { +	if (kp->flags & KERNEL_PARAM_FL_HWPARAM && +	    security_locked_down(LOCKDOWN_MODULE_PARAMETERS)) +		return false; +  	if (kp->flags & KERNEL_PARAM_FL_UNSAFE) {  		pr_notice("Setting dangerous option %s - tainting kernel\n",  			  kp->name);  		add_taint(TAINT_USER, LOCKDEP_STILL_OK);  	} + +	return true;  }  static int parse_one(char *param, @@ -132,8 +139,10 @@ static int parse_one(char *param,  			pr_debug("handling %s with %p\n", param,  				params[i].ops->set);  			kernel_param_lock(params[i].mod); -			param_check_unsafe(¶ms[i]); -			err = params[i].ops->set(val, ¶ms[i]); +			if (param_check_unsafe(¶ms[i])) +				err = params[i].ops->set(val, ¶ms[i]); +			else +				err = -EPERM;  			kernel_param_unlock(params[i].mod);  			return err;  		} @@ -553,8 +562,10 @@ static ssize_t param_attr_store(struct module_attribute *mattr,  		return -EPERM;  	kernel_param_lock(mk->mod); -	param_check_unsafe(attribute->param); -	err = attribute->param->ops->set(buf, attribute->param); +	if (param_check_unsafe(attribute->param)) +		err = attribute->param->ops->set(buf, attribute->param); +	else +		err = -EPERM;  	kernel_param_unlock(mk->mod);  	if (!err)  		return len; diff --git a/kernel/power/autosleep.c b/kernel/power/autosleep.c index 41e83a779e19..9af5a50d3489 100644 --- a/kernel/power/autosleep.c +++ 
b/kernel/power/autosleep.c @@ -116,7 +116,7 @@ int pm_autosleep_set_state(suspend_state_t state)  int __init pm_autosleep_init(void)  { -	autosleep_ws = wakeup_source_register("autosleep"); +	autosleep_ws = wakeup_source_register(NULL, "autosleep");  	if (!autosleep_ws)  		return -ENOMEM; diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index cd7434e6000d..3c0a5a8170b0 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -30,6 +30,7 @@  #include <linux/ctype.h>  #include <linux/genhd.h>  #include <linux/ktime.h> +#include <linux/security.h>  #include <trace/events/power.h>  #include "power.h" @@ -68,7 +69,7 @@ static const struct platform_hibernation_ops *hibernation_ops;  bool hibernation_available(void)  { -	return (nohibernate == 0); +	return nohibernate == 0 && !security_locked_down(LOCKDOWN_HIBERNATION);  }  /** diff --git a/kernel/power/main.c b/kernel/power/main.c index bdbd605c4215..e8710d179b35 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -254,7 +254,6 @@ static ssize_t pm_test_store(struct kobject *kobj, struct kobj_attribute *attr,  power_attr(pm_test);  #endif /* CONFIG_PM_SLEEP_DEBUG */ -#ifdef CONFIG_DEBUG_FS  static char *suspend_step_name(enum suspend_stat_step step)  {  	switch (step) { @@ -275,6 +274,92 @@ static char *suspend_step_name(enum suspend_stat_step step)  	}  } +#define suspend_attr(_name)					\ +static ssize_t _name##_show(struct kobject *kobj,		\ +		struct kobj_attribute *attr, char *buf)		\ +{								\ +	return sprintf(buf, "%d\n", suspend_stats._name);	\ +}								\ +static struct kobj_attribute _name = __ATTR_RO(_name) + +suspend_attr(success); +suspend_attr(fail); +suspend_attr(failed_freeze); +suspend_attr(failed_prepare); +suspend_attr(failed_suspend); +suspend_attr(failed_suspend_late); +suspend_attr(failed_suspend_noirq); +suspend_attr(failed_resume); +suspend_attr(failed_resume_early); +suspend_attr(failed_resume_noirq); + +static ssize_t last_failed_dev_show(struct kobject *kobj, +		struct kobj_attribute *attr, char *buf) +{ +	int index; +	char *last_failed_dev = NULL; + +	index = suspend_stats.last_failed_dev + REC_FAILED_NUM - 1; +	index %= REC_FAILED_NUM; +	last_failed_dev = suspend_stats.failed_devs[index]; + +	return sprintf(buf, "%s\n", last_failed_dev); +} +static struct kobj_attribute last_failed_dev = __ATTR_RO(last_failed_dev); + +static ssize_t last_failed_errno_show(struct kobject *kobj, +		struct kobj_attribute *attr, char *buf) +{ +	int index; +	int last_failed_errno; + +	index = suspend_stats.last_failed_errno + REC_FAILED_NUM - 1; +	index %= REC_FAILED_NUM; +	last_failed_errno = suspend_stats.errno[index]; + +	return sprintf(buf, "%d\n", last_failed_errno); +} +static struct kobj_attribute last_failed_errno = __ATTR_RO(last_failed_errno); + +static ssize_t last_failed_step_show(struct kobject *kobj, +		struct kobj_attribute *attr, char *buf) +{ +	int index; +	enum suspend_stat_step step; +	char *last_failed_step = NULL; + +	index = suspend_stats.last_failed_step + REC_FAILED_NUM - 1; +	index %= REC_FAILED_NUM; +	step = suspend_stats.failed_steps[index]; +	last_failed_step = suspend_step_name(step); + +	return sprintf(buf, "%s\n", last_failed_step); +} +static struct kobj_attribute last_failed_step = __ATTR_RO(last_failed_step); + +static struct attribute *suspend_attrs[] = { +	&success.attr, +	&fail.attr, +	&failed_freeze.attr, +	&failed_prepare.attr, +	&failed_suspend.attr, +	&failed_suspend_late.attr, +	&failed_suspend_noirq.attr, +	&failed_resume.attr, +	
&failed_resume_early.attr, +	&failed_resume_noirq.attr, +	&last_failed_dev.attr, +	&last_failed_errno.attr, +	&last_failed_step.attr, +	NULL, +}; + +static struct attribute_group suspend_attr_group = { +	.name = "suspend_stats", +	.attrs = suspend_attrs, +}; + +#ifdef CONFIG_DEBUG_FS  static int suspend_stats_show(struct seq_file *s, void *unused)  {  	int i, index, last_dev, last_errno, last_step; @@ -495,7 +580,7 @@ static suspend_state_t decode_state(const char *buf, size_t n)  	len = p ? p - buf : n;  	/* Check hibernation first. */ -	if (len == 4 && !strncmp(buf, "disk", len)) +	if (len == 4 && str_has_prefix(buf, "disk"))  		return PM_SUSPEND_MAX;  #ifdef CONFIG_SUSPEND @@ -794,6 +879,14 @@ static const struct attribute_group attr_group = {  	.attrs = g,  }; +static const struct attribute_group *attr_groups[] = { +	&attr_group, +#ifdef CONFIG_PM_SLEEP +	&suspend_attr_group, +#endif +	NULL, +}; +  struct workqueue_struct *pm_wq;  EXPORT_SYMBOL_GPL(pm_wq); @@ -815,7 +908,7 @@ static int __init pm_init(void)  	power_kobj = kobject_create_and_add("power", NULL);  	if (!power_kobj)  		return -ENOMEM; -	error = sysfs_create_group(power_kobj, &attr_group); +	error = sysfs_create_groups(power_kobj, attr_groups);  	if (error)  		return error;  	pm_print_times_init(); diff --git a/kernel/power/qos.c b/kernel/power/qos.c index 33e3febaba53..9568a2fe7c11 100644 --- a/kernel/power/qos.c +++ b/kernel/power/qos.c @@ -78,57 +78,9 @@ static struct pm_qos_object cpu_dma_pm_qos = {  	.name = "cpu_dma_latency",  }; -static BLOCKING_NOTIFIER_HEAD(network_lat_notifier); -static struct pm_qos_constraints network_lat_constraints = { -	.list = PLIST_HEAD_INIT(network_lat_constraints.list), -	.target_value = PM_QOS_NETWORK_LAT_DEFAULT_VALUE, -	.default_value = PM_QOS_NETWORK_LAT_DEFAULT_VALUE, -	.no_constraint_value = PM_QOS_NETWORK_LAT_DEFAULT_VALUE, -	.type = PM_QOS_MIN, -	.notifiers = &network_lat_notifier, -}; -static struct pm_qos_object network_lat_pm_qos = { -	.constraints = &network_lat_constraints, -	.name = "network_latency", -}; - - -static BLOCKING_NOTIFIER_HEAD(network_throughput_notifier); -static struct pm_qos_constraints network_tput_constraints = { -	.list = PLIST_HEAD_INIT(network_tput_constraints.list), -	.target_value = PM_QOS_NETWORK_THROUGHPUT_DEFAULT_VALUE, -	.default_value = PM_QOS_NETWORK_THROUGHPUT_DEFAULT_VALUE, -	.no_constraint_value = PM_QOS_NETWORK_THROUGHPUT_DEFAULT_VALUE, -	.type = PM_QOS_MAX, -	.notifiers = &network_throughput_notifier, -}; -static struct pm_qos_object network_throughput_pm_qos = { -	.constraints = &network_tput_constraints, -	.name = "network_throughput", -}; - - -static BLOCKING_NOTIFIER_HEAD(memory_bandwidth_notifier); -static struct pm_qos_constraints memory_bw_constraints = { -	.list = PLIST_HEAD_INIT(memory_bw_constraints.list), -	.target_value = PM_QOS_MEMORY_BANDWIDTH_DEFAULT_VALUE, -	.default_value = PM_QOS_MEMORY_BANDWIDTH_DEFAULT_VALUE, -	.no_constraint_value = PM_QOS_MEMORY_BANDWIDTH_DEFAULT_VALUE, -	.type = PM_QOS_SUM, -	.notifiers = &memory_bandwidth_notifier, -}; -static struct pm_qos_object memory_bandwidth_pm_qos = { -	.constraints = &memory_bw_constraints, -	.name = "memory_bandwidth", -}; - -  static struct pm_qos_object *pm_qos_array[] = {  	&null_pm_qos,  	&cpu_dma_pm_qos, -	&network_lat_pm_qos, -	&network_throughput_pm_qos, -	&memory_bandwidth_pm_qos,  };  static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf, diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index c874a7026e24..f3b7239f1892 100644 --- 
a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -121,43 +121,25 @@ static void s2idle_loop(void)  {  	pm_pr_dbg("suspend-to-idle\n"); +	/* +	 * Suspend-to-idle equals: +	 * frozen processes + suspended devices + idle processors. +	 * Thus s2idle_enter() should be called right after all devices have +	 * been suspended. +	 * +	 * Wakeups during the noirq suspend of devices may be spurious, so try +	 * to avoid them upfront. +	 */  	for (;;) { -		int error; - -		dpm_noirq_begin(); - -		/* -		 * Suspend-to-idle equals -		 * frozen processes + suspended devices + idle processors. -		 * Thus s2idle_enter() should be called right after -		 * all devices have been suspended. -		 * -		 * Wakeups during the noirq suspend of devices may be spurious, -		 * so prevent them from terminating the loop right away. -		 */ -		error = dpm_noirq_suspend_devices(PMSG_SUSPEND); -		if (!error) -			s2idle_enter(); -		else if (error == -EBUSY && pm_wakeup_pending()) -			error = 0; - -		if (!error && s2idle_ops && s2idle_ops->wake) +		if (s2idle_ops && s2idle_ops->wake)  			s2idle_ops->wake(); -		dpm_noirq_resume_devices(PMSG_RESUME); - -		dpm_noirq_end(); - -		if (error) -			break; - -		if (s2idle_ops && s2idle_ops->sync) -			s2idle_ops->sync(); -  		if (pm_wakeup_pending())  			break;  		pm_wakeup_clear(false); + +		s2idle_enter();  	}  	pm_pr_dbg("resume from suspend-to-idle\n"); @@ -271,14 +253,21 @@ static int platform_suspend_prepare_late(suspend_state_t state)  static int platform_suspend_prepare_noirq(suspend_state_t state)  { -	return state != PM_SUSPEND_TO_IDLE && suspend_ops->prepare_late ? -		suspend_ops->prepare_late() : 0; +	if (state == PM_SUSPEND_TO_IDLE) +		return s2idle_ops && s2idle_ops->prepare_late ? +			s2idle_ops->prepare_late() : 0; + +	return suspend_ops->prepare_late ? 
suspend_ops->prepare_late() : 0;  }  static void platform_resume_noirq(suspend_state_t state)  { -	if (state != PM_SUSPEND_TO_IDLE && suspend_ops->wake) +	if (state == PM_SUSPEND_TO_IDLE) { +		if (s2idle_ops && s2idle_ops->restore_early) +			s2idle_ops->restore_early(); +	} else if (suspend_ops->wake) {  		suspend_ops->wake(); +	}  }  static void platform_resume_early(suspend_state_t state) @@ -415,11 +404,6 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)  	if (error)  		goto Devices_early_resume; -	if (state == PM_SUSPEND_TO_IDLE && pm_test_level != TEST_PLATFORM) { -		s2idle_loop(); -		goto Platform_early_resume; -	} -  	error = dpm_suspend_noirq(PMSG_SUSPEND);  	if (error) {  		pr_err("noirq suspend of devices failed\n"); @@ -432,6 +416,11 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)  	if (suspend_test(TEST_PLATFORM))  		goto Platform_wake; +	if (state == PM_SUSPEND_TO_IDLE) { +		s2idle_loop(); +		goto Platform_wake; +	} +  	error = suspend_disable_secondary_cpus();  	if (error || suspend_test(TEST_CPUS))  		goto Enable_cpus; diff --git a/kernel/power/wakelock.c b/kernel/power/wakelock.c index 4210152e56f0..105df4dfc783 100644 --- a/kernel/power/wakelock.c +++ b/kernel/power/wakelock.c @@ -27,7 +27,7 @@ static DEFINE_MUTEX(wakelocks_lock);  struct wakelock {  	char			*name;  	struct rb_node		node; -	struct wakeup_source	ws; +	struct wakeup_source	*ws;  #ifdef CONFIG_PM_WAKELOCKS_GC  	struct list_head	lru;  #endif @@ -46,7 +46,7 @@ ssize_t pm_show_wakelocks(char *buf, bool show_active)  	for (node = rb_first(&wakelocks_tree); node; node = rb_next(node)) {  		wl = rb_entry(node, struct wakelock, node); -		if (wl->ws.active == show_active) +		if (wl->ws->active == show_active)  			str += scnprintf(str, end - str, "%s ", wl->name);  	}  	if (str > buf) @@ -112,16 +112,16 @@ static void __wakelocks_gc(struct work_struct *work)  		u64 idle_time_ns;  		bool active; -		spin_lock_irq(&wl->ws.lock); -		idle_time_ns = ktime_to_ns(ktime_sub(now, wl->ws.last_time)); -		active = wl->ws.active; -		spin_unlock_irq(&wl->ws.lock); +		spin_lock_irq(&wl->ws->lock); +		idle_time_ns = ktime_to_ns(ktime_sub(now, wl->ws->last_time)); +		active = wl->ws->active; +		spin_unlock_irq(&wl->ws->lock);  		if (idle_time_ns < ((u64)WL_GC_TIME_SEC * NSEC_PER_SEC))  			break;  		if (!active) { -			wakeup_source_remove(&wl->ws); +			wakeup_source_unregister(wl->ws);  			rb_erase(&wl->node, &wakelocks_tree);  			list_del(&wl->lru);  			kfree(wl->name); @@ -187,9 +187,15 @@ static struct wakelock *wakelock_lookup_add(const char *name, size_t len,  		kfree(wl);  		return ERR_PTR(-ENOMEM);  	} -	wl->ws.name = wl->name; -	wl->ws.last_time = ktime_get(); -	wakeup_source_add(&wl->ws); + +	wl->ws = wakeup_source_register(NULL, wl->name); +	if (!wl->ws) { +		kfree(wl->name); +		kfree(wl); +		return ERR_PTR(-ENOMEM); +	} +	wl->ws->last_time = ktime_get(); +  	rb_link_node(&wl->node, parent, node);  	rb_insert_color(&wl->node, &wakelocks_tree);  	wakelocks_lru_add(wl); @@ -233,9 +239,9 @@ int pm_wake_lock(const char *buf)  		u64 timeout_ms = timeout_ns + NSEC_PER_MSEC - 1;  		do_div(timeout_ms, NSEC_PER_MSEC); -		__pm_wakeup_event(&wl->ws, timeout_ms); +		__pm_wakeup_event(wl->ws, timeout_ms);  	} else { -		__pm_stay_awake(&wl->ws); +		__pm_stay_awake(wl->ws);  	}  	wakelocks_lru_most_recent(wl); @@ -271,7 +277,7 @@ int pm_wake_unlock(const char *buf)  		ret = PTR_ERR(wl);  		goto out;  	} -	__pm_relax(&wl->ws); +	__pm_relax(wl->ws);  	wakelocks_lru_most_recent(wl);  	wakelocks_gc(); diff --git 
a/kernel/printk/braille.c b/kernel/printk/braille.c index 1d21ebacfdb8..17a9591e54ff 100644 --- a/kernel/printk/braille.c +++ b/kernel/printk/braille.c @@ -11,11 +11,18 @@  int _braille_console_setup(char **str, char **brl_options)  { -	if (!strncmp(*str, "brl,", 4)) { +	size_t len; + +	len = str_has_prefix(*str, "brl,"); +	if (len) {  		*brl_options = ""; -		*str += 4; -	} else if (!strncmp(*str, "brl=", 4)) { -		*brl_options = *str + 4; +		*str += len; +		return 0; +	} + +	len = str_has_prefix(*str, "brl="); +	if (len) { +		*brl_options = *str + len;  		*str = strchr(*brl_options, ',');  		if (!*str) {  			pr_err("need port name after brl=\n"); diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 1888f6a3b694..ca65327a6de8 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -118,19 +118,29 @@ static unsigned int __read_mostly devkmsg_log = DEVKMSG_LOG_MASK_DEFAULT;  static int __control_devkmsg(char *str)  { +	size_t len; +  	if (!str)  		return -EINVAL; -	if (!strncmp(str, "on", 2)) { +	len = str_has_prefix(str, "on"); +	if (len) {  		devkmsg_log = DEVKMSG_LOG_MASK_ON; -		return 2; -	} else if (!strncmp(str, "off", 3)) { +		return len; +	} + +	len = str_has_prefix(str, "off"); +	if (len) {  		devkmsg_log = DEVKMSG_LOG_MASK_OFF; -		return 3; -	} else if (!strncmp(str, "ratelimit", 9)) { +		return len; +	} + +	len = str_has_prefix(str, "ratelimit"); +	if (len) {  		devkmsg_log = DEVKMSG_LOG_MASK_DEFAULT; -		return 9; +		return len;  	} +  	return -EINVAL;  } @@ -3274,7 +3284,7 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog,  	/* move first record forward until length fits into the buffer */  	seq = dumper->cur_seq;  	idx = dumper->cur_idx; -	while (l > size && seq < dumper->next_seq) { +	while (l >= size && seq < dumper->next_seq) {  		struct printk_log *msg = log_from_idx(idx);  		l -= msg_print_text(msg, true, time, NULL, 0); diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig index 480edf328b51..7644eda17d62 100644 --- a/kernel/rcu/Kconfig +++ b/kernel/rcu/Kconfig @@ -7,7 +7,7 @@ menu "RCU Subsystem"  config TREE_RCU  	bool -	default y if !PREEMPT && SMP +	default y if !PREEMPTION && SMP  	help  	  This option selects the RCU implementation that is  	  designed for very large SMP system with hundreds or @@ -16,7 +16,7 @@ config TREE_RCU  config PREEMPT_RCU  	bool -	default y if PREEMPT +	default y if PREEMPTION  	help  	  This option selects the RCU implementation that is  	  designed for very large SMP systems with hundreds or @@ -28,7 +28,7 @@ config PREEMPT_RCU  config TINY_RCU  	bool -	default y if !PREEMPT && !SMP +	default y if !PREEMPTION && !SMP  	help  	  This option selects the RCU implementation that is  	  designed for UP systems from which real-time response @@ -70,7 +70,7 @@ config TREE_SRCU  	  This option selects the full-fledged version of SRCU.  config TASKS_RCU -	def_bool PREEMPT +	def_bool PREEMPTION  	select SRCU  	help  	  This option enables a task-based RCU implementation that uses diff --git a/kernel/rcu/Kconfig.debug b/kernel/rcu/Kconfig.debug index 5ec3ea4028e2..4aa02eee8f6c 100644 --- a/kernel/rcu/Kconfig.debug +++ b/kernel/rcu/Kconfig.debug @@ -8,6 +8,17 @@ menu "RCU Debugging"  config PROVE_RCU  	def_bool PROVE_LOCKING +config PROVE_RCU_LIST +	bool "RCU list lockdep debugging" +	depends on PROVE_RCU && RCU_EXPERT +	default n +	help +	  Enable RCU lockdep checking for list usages. 
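The "lockdep expression" conversions this option's help text goes on to mention look roughly like the sketch below: an RCU list traversal that may legitimately run under the update-side lock rather than rcu_read_lock() passes that condition to the iterator. The struct and lock names are invented, and this assumes the optional-condition form of list_for_each_entry_rcu() added in this same release.

struct demo_entry {			/* illustrative only */
	int val;
	struct list_head node;
};

static LIST_HEAD(demo_list);
static DEFINE_SPINLOCK(demo_lock);	/* update-side lock */

/* Callers must hold rcu_read_lock() or demo_lock. */
static bool demo_find(int val)
{
	struct demo_entry *e;

	list_for_each_entry_rcu(e, &demo_list, node,
				lockdep_is_held(&demo_lock))
		if (e->val == val)
			return true;
	return false;
}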
By default it is +	  turned off since there are several list RCU users that still +	  need to be converted to pass a lockdep expression. To prevent +	  false-positive splats, we keep it default disabled but once all +	  users are converted, we can remove this config option. +  config TORTURE_TEST  	tristate  	default n diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 5290b01de534..8fd4f82c9b3d 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -227,6 +227,7 @@ static inline bool __rcu_reclaim(const char *rn, struct rcu_head *head)  #ifdef CONFIG_RCU_STALL_COMMON +extern int rcu_cpu_stall_ftrace_dump;  extern int rcu_cpu_stall_suppress;  extern int rcu_cpu_stall_timeout;  int rcu_jiffies_till_stall_check(void); diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c index 9bd5f6023c21..495c58ce1640 100644 --- a/kernel/rcu/rcu_segcblist.c +++ b/kernel/rcu/rcu_segcblist.c @@ -24,6 +24,49 @@ void rcu_cblist_init(struct rcu_cblist *rclp)  }  /* + * Enqueue an rcu_head structure onto the specified callback list. + * This function assumes that the callback is non-lazy because it + * is intended for use by no-CBs CPUs, which do not distinguish + * between lazy and non-lazy RCU callbacks. + */ +void rcu_cblist_enqueue(struct rcu_cblist *rclp, struct rcu_head *rhp) +{ +	*rclp->tail = rhp; +	rclp->tail = &rhp->next; +	WRITE_ONCE(rclp->len, rclp->len + 1); +} + +/* + * Flush the second rcu_cblist structure onto the first one, obliterating + * any contents of the first.  If rhp is non-NULL, enqueue it as the sole + * element of the second rcu_cblist structure, but ensuring that the second + * rcu_cblist structure, if initially non-empty, always appears non-empty + * throughout the process.  If rdp is NULL, the second rcu_cblist structure + * is instead initialized to empty. + */ +void rcu_cblist_flush_enqueue(struct rcu_cblist *drclp, +			      struct rcu_cblist *srclp, +			      struct rcu_head *rhp) +{ +	drclp->head = srclp->head; +	if (drclp->head) +		drclp->tail = srclp->tail; +	else +		drclp->tail = &drclp->head; +	drclp->len = srclp->len; +	drclp->len_lazy = srclp->len_lazy; +	if (!rhp) { +		rcu_cblist_init(srclp); +	} else { +		rhp->next = NULL; +		srclp->head = rhp; +		srclp->tail = &rhp->next; +		WRITE_ONCE(srclp->len, 1); +		srclp->len_lazy = 0; +	} +} + +/*   * Dequeue the oldest rcu_head structure from the specified callback   * list.  This function assumes that the callback is non-lazy, but   * the caller can later invoke rcu_cblist_dequeued_lazy() if it @@ -44,6 +87,67 @@ struct rcu_head *rcu_cblist_dequeue(struct rcu_cblist *rclp)  	return rhp;  } +/* Set the length of an rcu_segcblist structure. */ +void rcu_segcblist_set_len(struct rcu_segcblist *rsclp, long v) +{ +#ifdef CONFIG_RCU_NOCB_CPU +	atomic_long_set(&rsclp->len, v); +#else +	WRITE_ONCE(rsclp->len, v); +#endif +} + +/* + * Increase the numeric length of an rcu_segcblist structure by the + * specified amount, which can be negative.  This can cause the ->len + * field to disagree with the actual number of callbacks on the structure. + * This increase is fully ordered with respect to the callers accesses + * both before and after. + */ +void rcu_segcblist_add_len(struct rcu_segcblist *rsclp, long v) +{ +#ifdef CONFIG_RCU_NOCB_CPU +	smp_mb__before_atomic(); /* Up to the caller! */ +	atomic_long_add(v, &rsclp->len); +	smp_mb__after_atomic(); /* Up to the caller! */ +#else +	smp_mb(); /* Up to the caller! */ +	WRITE_ONCE(rsclp->len, rsclp->len + v); +	smp_mb(); /* Up to the caller! 
*/ +#endif +} + +/* + * Increase the numeric length of an rcu_segcblist structure by one. + * This can cause the ->len field to disagree with the actual number of + * callbacks on the structure.  This increase is fully ordered with respect + * to the callers accesses both before and after. + */ +void rcu_segcblist_inc_len(struct rcu_segcblist *rsclp) +{ +	rcu_segcblist_add_len(rsclp, 1); +} + +/* + * Exchange the numeric length of the specified rcu_segcblist structure + * with the specified value.  This can cause the ->len field to disagree + * with the actual number of callbacks on the structure.  This exchange is + * fully ordered with respect to the callers accesses both before and after. + */ +long rcu_segcblist_xchg_len(struct rcu_segcblist *rsclp, long v) +{ +#ifdef CONFIG_RCU_NOCB_CPU +	return atomic_long_xchg(&rsclp->len, v); +#else +	long ret = rsclp->len; + +	smp_mb(); /* Up to the caller! */ +	WRITE_ONCE(rsclp->len, v); +	smp_mb(); /* Up to the caller! */ +	return ret; +#endif +} +  /*   * Initialize an rcu_segcblist structure.   */ @@ -56,8 +160,9 @@ void rcu_segcblist_init(struct rcu_segcblist *rsclp)  	rsclp->head = NULL;  	for (i = 0; i < RCU_CBLIST_NSEGS; i++)  		rsclp->tails[i] = &rsclp->head; -	rsclp->len = 0; +	rcu_segcblist_set_len(rsclp, 0);  	rsclp->len_lazy = 0; +	rsclp->enabled = 1;  }  /* @@ -69,7 +174,16 @@ void rcu_segcblist_disable(struct rcu_segcblist *rsclp)  	WARN_ON_ONCE(!rcu_segcblist_empty(rsclp));  	WARN_ON_ONCE(rcu_segcblist_n_cbs(rsclp));  	WARN_ON_ONCE(rcu_segcblist_n_lazy_cbs(rsclp)); -	rsclp->tails[RCU_NEXT_TAIL] = NULL; +	rsclp->enabled = 0; +} + +/* + * Mark the specified rcu_segcblist structure as offloaded.  This + * structure must be empty. + */ +void rcu_segcblist_offload(struct rcu_segcblist *rsclp) +{ +	rsclp->offloaded = 1;  }  /* @@ -118,6 +232,18 @@ struct rcu_head *rcu_segcblist_first_pend_cb(struct rcu_segcblist *rsclp)  }  /* + * Return false if there are no CBs awaiting grace periods, otherwise, + * return true and store the nearest waited-upon grace period into *lp. + */ +bool rcu_segcblist_nextgp(struct rcu_segcblist *rsclp, unsigned long *lp) +{ +	if (!rcu_segcblist_pend_cbs(rsclp)) +		return false; +	*lp = rsclp->gp_seq[RCU_WAIT_TAIL]; +	return true; +} + +/*   * Enqueue the specified callback onto the specified rcu_segcblist   * structure, updating accounting as needed.  Note that the ->len   * field may be accessed locklessly, hence the WRITE_ONCE(). @@ -129,13 +255,13 @@ struct rcu_head *rcu_segcblist_first_pend_cb(struct rcu_segcblist *rsclp)  void rcu_segcblist_enqueue(struct rcu_segcblist *rsclp,  			   struct rcu_head *rhp, bool lazy)  { -	WRITE_ONCE(rsclp->len, rsclp->len + 1); /* ->len sampled locklessly. */ +	rcu_segcblist_inc_len(rsclp);  	if (lazy)  		rsclp->len_lazy++;  	smp_mb(); /* Ensure counts are updated before callback is enqueued. */  	rhp->next = NULL; -	*rsclp->tails[RCU_NEXT_TAIL] = rhp; -	rsclp->tails[RCU_NEXT_TAIL] = &rhp->next; +	WRITE_ONCE(*rsclp->tails[RCU_NEXT_TAIL], rhp); +	WRITE_ONCE(rsclp->tails[RCU_NEXT_TAIL], &rhp->next);  }  /* @@ -155,7 +281,7 @@ bool rcu_segcblist_entrain(struct rcu_segcblist *rsclp,  	if (rcu_segcblist_n_cbs(rsclp) == 0)  		return false; -	WRITE_ONCE(rsclp->len, rsclp->len + 1); +	rcu_segcblist_inc_len(rsclp);  	if (lazy)  		rsclp->len_lazy++;  	smp_mb(); /* Ensure counts are updated before callback is entrained. 
*/ @@ -163,9 +289,9 @@ bool rcu_segcblist_entrain(struct rcu_segcblist *rsclp,  	for (i = RCU_NEXT_TAIL; i > RCU_DONE_TAIL; i--)  		if (rsclp->tails[i] != rsclp->tails[i - 1])  			break; -	*rsclp->tails[i] = rhp; +	WRITE_ONCE(*rsclp->tails[i], rhp);  	for (; i <= RCU_NEXT_TAIL; i++) -		rsclp->tails[i] = &rhp->next; +		WRITE_ONCE(rsclp->tails[i], &rhp->next);  	return true;  } @@ -182,9 +308,8 @@ void rcu_segcblist_extract_count(struct rcu_segcblist *rsclp,  					       struct rcu_cblist *rclp)  {  	rclp->len_lazy += rsclp->len_lazy; -	rclp->len += rsclp->len;  	rsclp->len_lazy = 0; -	WRITE_ONCE(rsclp->len, 0); /* ->len sampled locklessly. */ +	rclp->len = rcu_segcblist_xchg_len(rsclp, 0);  }  /* @@ -200,12 +325,12 @@ void rcu_segcblist_extract_done_cbs(struct rcu_segcblist *rsclp,  	if (!rcu_segcblist_ready_cbs(rsclp))  		return; /* Nothing to do. */  	*rclp->tail = rsclp->head; -	rsclp->head = *rsclp->tails[RCU_DONE_TAIL]; -	*rsclp->tails[RCU_DONE_TAIL] = NULL; +	WRITE_ONCE(rsclp->head, *rsclp->tails[RCU_DONE_TAIL]); +	WRITE_ONCE(*rsclp->tails[RCU_DONE_TAIL], NULL);  	rclp->tail = rsclp->tails[RCU_DONE_TAIL];  	for (i = RCU_CBLIST_NSEGS - 1; i >= RCU_DONE_TAIL; i--)  		if (rsclp->tails[i] == rsclp->tails[RCU_DONE_TAIL]) -			rsclp->tails[i] = &rsclp->head; +			WRITE_ONCE(rsclp->tails[i], &rsclp->head);  }  /* @@ -224,9 +349,9 @@ void rcu_segcblist_extract_pend_cbs(struct rcu_segcblist *rsclp,  		return; /* Nothing to do. */  	*rclp->tail = *rsclp->tails[RCU_DONE_TAIL];  	rclp->tail = rsclp->tails[RCU_NEXT_TAIL]; -	*rsclp->tails[RCU_DONE_TAIL] = NULL; +	WRITE_ONCE(*rsclp->tails[RCU_DONE_TAIL], NULL);  	for (i = RCU_DONE_TAIL + 1; i < RCU_CBLIST_NSEGS; i++) -		rsclp->tails[i] = rsclp->tails[RCU_DONE_TAIL]; +		WRITE_ONCE(rsclp->tails[i], rsclp->tails[RCU_DONE_TAIL]);  }  /* @@ -237,8 +362,7 @@ void rcu_segcblist_insert_count(struct rcu_segcblist *rsclp,  				struct rcu_cblist *rclp)  {  	rsclp->len_lazy += rclp->len_lazy; -	/* ->len sampled locklessly. */ -	WRITE_ONCE(rsclp->len, rsclp->len + rclp->len); +	rcu_segcblist_add_len(rsclp, rclp->len);  	rclp->len_lazy = 0;  	rclp->len = 0;  } @@ -255,10 +379,10 @@ void rcu_segcblist_insert_done_cbs(struct rcu_segcblist *rsclp,  	if (!rclp->head)  		return; /* No callbacks to move. */  	*rclp->tail = rsclp->head; -	rsclp->head = rclp->head; +	WRITE_ONCE(rsclp->head, rclp->head);  	for (i = RCU_DONE_TAIL; i < RCU_CBLIST_NSEGS; i++)  		if (&rsclp->head == rsclp->tails[i]) -			rsclp->tails[i] = rclp->tail; +			WRITE_ONCE(rsclp->tails[i], rclp->tail);  		else  			break;  	rclp->head = NULL; @@ -274,8 +398,8 @@ void rcu_segcblist_insert_pend_cbs(struct rcu_segcblist *rsclp,  {  	if (!rclp->head)  		return; /* Nothing to do. */ -	*rsclp->tails[RCU_NEXT_TAIL] = rclp->head; -	rsclp->tails[RCU_NEXT_TAIL] = rclp->tail; +	WRITE_ONCE(*rsclp->tails[RCU_NEXT_TAIL], rclp->head); +	WRITE_ONCE(rsclp->tails[RCU_NEXT_TAIL], rclp->tail);  	rclp->head = NULL;  	rclp->tail = &rclp->head;  } @@ -299,7 +423,7 @@ void rcu_segcblist_advance(struct rcu_segcblist *rsclp, unsigned long seq)  	for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++) {  		if (ULONG_CMP_LT(seq, rsclp->gp_seq[i]))  			break; -		rsclp->tails[RCU_DONE_TAIL] = rsclp->tails[i]; +		WRITE_ONCE(rsclp->tails[RCU_DONE_TAIL], rsclp->tails[i]);  	}  	/* If no callbacks moved, nothing more need be done. */ @@ -308,7 +432,7 @@ void rcu_segcblist_advance(struct rcu_segcblist *rsclp, unsigned long seq)  	/* Clean up tail pointers that might have been misordered above. 
*/  	for (j = RCU_WAIT_TAIL; j < i; j++) -		rsclp->tails[j] = rsclp->tails[RCU_DONE_TAIL]; +		WRITE_ONCE(rsclp->tails[j], rsclp->tails[RCU_DONE_TAIL]);  	/*  	 * Callbacks moved, so clean up the misordered ->tails[] pointers @@ -319,7 +443,7 @@ void rcu_segcblist_advance(struct rcu_segcblist *rsclp, unsigned long seq)  	for (j = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++, j++) {  		if (rsclp->tails[j] == rsclp->tails[RCU_NEXT_TAIL])  			break;  /* No more callbacks. */ -		rsclp->tails[j] = rsclp->tails[i]; +		WRITE_ONCE(rsclp->tails[j], rsclp->tails[i]);  		rsclp->gp_seq[j] = rsclp->gp_seq[i];  	}  } @@ -384,7 +508,7 @@ bool rcu_segcblist_accelerate(struct rcu_segcblist *rsclp, unsigned long seq)  	 * structure other than in the RCU_NEXT_TAIL segment.  	 */  	for (; i < RCU_NEXT_TAIL; i++) { -		rsclp->tails[i] = rsclp->tails[RCU_NEXT_TAIL]; +		WRITE_ONCE(rsclp->tails[i], rsclp->tails[RCU_NEXT_TAIL]);  		rsclp->gp_seq[i] = seq;  	}  	return true; diff --git a/kernel/rcu/rcu_segcblist.h b/kernel/rcu/rcu_segcblist.h index 71b64648464e..815c2fdd3fcc 100644 --- a/kernel/rcu/rcu_segcblist.h +++ b/kernel/rcu/rcu_segcblist.h @@ -9,6 +9,12 @@  #include <linux/rcu_segcblist.h> +/* Return number of callbacks in the specified callback list. */ +static inline long rcu_cblist_n_cbs(struct rcu_cblist *rclp) +{ +	return READ_ONCE(rclp->len); +} +  /*   * Account for the fact that a previously dequeued callback turned out   * to be marked as lazy. @@ -19,6 +25,10 @@ static inline void rcu_cblist_dequeued_lazy(struct rcu_cblist *rclp)  }  void rcu_cblist_init(struct rcu_cblist *rclp); +void rcu_cblist_enqueue(struct rcu_cblist *rclp, struct rcu_head *rhp); +void rcu_cblist_flush_enqueue(struct rcu_cblist *drclp, +			      struct rcu_cblist *srclp, +			      struct rcu_head *rhp);  struct rcu_head *rcu_cblist_dequeue(struct rcu_cblist *rclp);  /* @@ -36,13 +46,17 @@ struct rcu_head *rcu_cblist_dequeue(struct rcu_cblist *rclp);   */  static inline bool rcu_segcblist_empty(struct rcu_segcblist *rsclp)  { -	return !rsclp->head; +	return !READ_ONCE(rsclp->head);  }  /* Return number of callbacks in segmented callback list. */  static inline long rcu_segcblist_n_cbs(struct rcu_segcblist *rsclp)  { +#ifdef CONFIG_RCU_NOCB_CPU +	return atomic_long_read(&rsclp->len); +#else  	return READ_ONCE(rsclp->len); +#endif  }  /* Return number of lazy callbacks in segmented callback list. */ @@ -54,16 +68,22 @@ static inline long rcu_segcblist_n_lazy_cbs(struct rcu_segcblist *rsclp)  /* Return number of lazy callbacks in segmented callback list. */  static inline long rcu_segcblist_n_nonlazy_cbs(struct rcu_segcblist *rsclp)  { -	return rsclp->len - rsclp->len_lazy; +	return rcu_segcblist_n_cbs(rsclp) - rsclp->len_lazy;  }  /*   * Is the specified rcu_segcblist enabled, for example, not corresponding - * to an offline or callback-offloaded CPU? + * to an offline CPU?   */  static inline bool rcu_segcblist_is_enabled(struct rcu_segcblist *rsclp)  { -	return !!rsclp->tails[RCU_NEXT_TAIL]; +	return rsclp->enabled; +} + +/* Is the specified rcu_segcblist offloaded?  */ +static inline bool rcu_segcblist_is_offloaded(struct rcu_segcblist *rsclp) +{ +	return rsclp->offloaded;  }  /* @@ -73,36 +93,18 @@ static inline bool rcu_segcblist_is_enabled(struct rcu_segcblist *rsclp)   */  static inline bool rcu_segcblist_restempty(struct rcu_segcblist *rsclp, int seg)  { -	return !*rsclp->tails[seg]; -} - -/* - * Interim function to return rcu_segcblist head pointer.  
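The pattern behind the WRITE_ONCE()/READ_ONCE() conversions and the atomic ->len handling in this file: the callback-list length and tail pointers may now be sampled without holding the lock, so every access that can race with such sampling is a marked access. A rough userspace analogue using C11 atomics follows; the names are invented, only relaxed ordering is shown, and the kernel variants additionally insert the memory barriers visible in the hunks above.

#include <stdatomic.h>
#include <stdio.h>

static atomic_long seglist_len;		/* plays the role of rsclp->len */

/* Update side: serialized by the caller (e.g. a lock or single updater). */
static void seglist_add_len(long v)
{
	long old = atomic_load_explicit(&seglist_len, memory_order_relaxed);

	atomic_store_explicit(&seglist_len, old + v, memory_order_relaxed);
}

/* Lockless reader, in the spirit of rcu_segcblist_n_cbs(). */
static long seglist_n_cbs(void)
{
	return atomic_load_explicit(&seglist_len, memory_order_relaxed);
}

int main(void)
{
	seglist_add_len(3);
	seglist_add_len(-1);
	printf("%ld callbacks\n", seglist_n_cbs());
	return 0;
}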
Longer term, the - * rcu_segcblist will be used more pervasively, removing the need for this - * function. - */ -static inline struct rcu_head *rcu_segcblist_head(struct rcu_segcblist *rsclp) -{ -	return rsclp->head; -} - -/* - * Interim function to return rcu_segcblist head pointer.  Longer term, the - * rcu_segcblist will be used more pervasively, removing the need for this - * function. - */ -static inline struct rcu_head **rcu_segcblist_tail(struct rcu_segcblist *rsclp) -{ -	WARN_ON_ONCE(rcu_segcblist_empty(rsclp)); -	return rsclp->tails[RCU_NEXT_TAIL]; +	return !READ_ONCE(*READ_ONCE(rsclp->tails[seg]));  } +void rcu_segcblist_inc_len(struct rcu_segcblist *rsclp);  void rcu_segcblist_init(struct rcu_segcblist *rsclp);  void rcu_segcblist_disable(struct rcu_segcblist *rsclp); +void rcu_segcblist_offload(struct rcu_segcblist *rsclp);  bool rcu_segcblist_ready_cbs(struct rcu_segcblist *rsclp);  bool rcu_segcblist_pend_cbs(struct rcu_segcblist *rsclp);  struct rcu_head *rcu_segcblist_first_cb(struct rcu_segcblist *rsclp);  struct rcu_head *rcu_segcblist_first_pend_cb(struct rcu_segcblist *rsclp); +bool rcu_segcblist_nextgp(struct rcu_segcblist *rsclp, unsigned long *lp);  void rcu_segcblist_enqueue(struct rcu_segcblist *rsclp,  			   struct rcu_head *rhp, bool lazy);  bool rcu_segcblist_entrain(struct rcu_segcblist *rsclp, diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c index 7a6890b23c5f..5a879d073c1c 100644 --- a/kernel/rcu/rcuperf.c +++ b/kernel/rcu/rcuperf.c @@ -89,7 +89,7 @@ torture_param(int, writer_holdoff, 0, "Holdoff (us) between GPs, zero to disable  static char *perf_type = "rcu";  module_param(perf_type, charp, 0444); -MODULE_PARM_DESC(perf_type, "Type of RCU to performance-test (rcu, rcu_bh, ...)"); +MODULE_PARM_DESC(perf_type, "Type of RCU to performance-test (rcu, srcu, ...)");  static int nrealreaders;  static int nrealwriters; @@ -375,6 +375,14 @@ rcu_perf_writer(void *arg)  	if (holdoff)  		schedule_timeout_uninterruptible(holdoff * HZ); +	/* +	 * Wait until rcu_end_inkernel_boot() is called for normal GP tests +	 * so that RCU is not always expedited for normal GP tests. +	 * The system_state test is approximate, but works well in practice. +	 */ +	while (!gp_exp && system_state != SYSTEM_RUNNING) +		schedule_timeout_uninterruptible(1); +  	t = ktime_get_mono_fast_ns();  	if (atomic_inc_return(&n_rcu_perf_writer_started) >= nrealwriters) {  		t_rcu_perf_writer_started = t; diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index fce4e7e6f502..3c9feca1eab1 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -161,6 +161,7 @@ static atomic_long_t n_rcu_torture_timers;  static long n_barrier_attempts;  static long n_barrier_successes; /* did rcu_barrier test succeed? */  static struct list_head rcu_torture_removed; +static unsigned long shutdown_jiffies;  static int rcu_torture_writer_state;  #define RTWS_FIXED_DELAY	0 @@ -228,6 +229,15 @@ static u64 notrace rcu_trace_clock_local(void)  }  #endif /* #else #ifdef CONFIG_RCU_TRACE */ +/* + * Stop aggressive CPU-hog tests a bit before the end of the test in order + * to avoid interfering with test shutdown. + */ +static bool shutdown_time_arrived(void) +{ +	return shutdown_secs && time_after(jiffies, shutdown_jiffies - 30 * HZ); +} +  static unsigned long boost_starttime;	/* jiffies of next boost test start. */  static DEFINE_MUTEX(boost_mutex);	/* protect setting boost_starttime */  					/*  and boost task create/destroy. 
*/ @@ -1713,12 +1723,14 @@ static void rcu_torture_fwd_cb_cr(struct rcu_head *rhp)  }  // Give the scheduler a chance, even on nohz_full CPUs. -static void rcu_torture_fwd_prog_cond_resched(void) +static void rcu_torture_fwd_prog_cond_resched(unsigned long iter)  {  	if (IS_ENABLED(CONFIG_PREEMPT) && IS_ENABLED(CONFIG_NO_HZ_FULL)) { -		if (need_resched()) +		// Real call_rcu() floods hit userspace, so emulate that. +		if (need_resched() || (iter & 0xfff))  			schedule();  	} else { +		// No userspace emulation: CB invocation throttles call_rcu()  		cond_resched();  	}  } @@ -1746,7 +1758,7 @@ static unsigned long rcu_torture_fwd_prog_cbfree(void)  		spin_unlock_irqrestore(&rcu_fwd_lock, flags);  		kfree(rfcp);  		freed++; -		rcu_torture_fwd_prog_cond_resched(); +		rcu_torture_fwd_prog_cond_resched(freed);  	}  	return freed;  } @@ -1785,15 +1797,17 @@ static void rcu_torture_fwd_prog_nr(int *tested, int *tested_tries)  	WRITE_ONCE(rcu_fwd_startat, jiffies);  	stopat = rcu_fwd_startat + dur;  	while (time_before(jiffies, stopat) && +	       !shutdown_time_arrived() &&  	       !READ_ONCE(rcu_fwd_emergency_stop) && !torture_must_stop()) {  		idx = cur_ops->readlock();  		udelay(10);  		cur_ops->readunlock(idx);  		if (!fwd_progress_need_resched || need_resched()) -			rcu_torture_fwd_prog_cond_resched(); +			rcu_torture_fwd_prog_cond_resched(1);  	}  	(*tested_tries)++;  	if (!time_before(jiffies, stopat) && +	    !shutdown_time_arrived() &&  	    !READ_ONCE(rcu_fwd_emergency_stop) && !torture_must_stop()) {  		(*tested)++;  		cver = READ_ONCE(rcu_torture_current_version) - cver; @@ -1852,6 +1866,7 @@ static void rcu_torture_fwd_prog_cr(void)  	gps = cur_ops->get_gp_seq();  	rcu_launder_gp_seq_start = gps;  	while (time_before(jiffies, stopat) && +	       !shutdown_time_arrived() &&  	       !READ_ONCE(rcu_fwd_emergency_stop) && !torture_must_stop()) {  		rfcp = READ_ONCE(rcu_fwd_cb_head);  		rfcpn = NULL; @@ -1875,7 +1890,7 @@ static void rcu_torture_fwd_prog_cr(void)  			rfcp->rfc_gps = 0;  		}  		cur_ops->call(&rfcp->rh, rcu_torture_fwd_cb_cr); -		rcu_torture_fwd_prog_cond_resched(); +		rcu_torture_fwd_prog_cond_resched(n_launders + n_max_cbs);  	}  	stoppedat = jiffies;  	n_launders_cb_snap = READ_ONCE(n_launders_cb); @@ -1884,7 +1899,8 @@ static void rcu_torture_fwd_prog_cr(void)  	cur_ops->cb_barrier(); /* Wait for callbacks to be invoked. 
*/  	(void)rcu_torture_fwd_prog_cbfree(); -	if (!torture_must_stop() && !READ_ONCE(rcu_fwd_emergency_stop)) { +	if (!torture_must_stop() && !READ_ONCE(rcu_fwd_emergency_stop) && +	    !shutdown_time_arrived()) {  		WARN_ON(n_max_gps < MIN_FWD_CBS_LAUNDERED);  		pr_alert("%s Duration %lu barrier: %lu pending %ld n_launders: %ld n_launders_sa: %ld n_max_gps: %ld n_max_cbs: %ld cver %ld gps %ld\n",  			 __func__, @@ -2160,6 +2176,7 @@ rcu_torture_cleanup(void)  		return;  	} +	show_rcu_gp_kthreads();  	rcu_torture_barrier_cleanup();  	torture_stop_kthread(rcu_torture_fwd_prog, fwd_prog_task);  	torture_stop_kthread(rcu_torture_stall, stall_task); @@ -2465,6 +2482,7 @@ rcu_torture_init(void)  			goto unwind;  		rcutor_hp = firsterr;  	} +	shutdown_jiffies = jiffies + shutdown_secs * HZ;  	firsterr = torture_shutdown_init(shutdown_secs, rcu_torture_cleanup);  	if (firsterr)  		goto unwind; diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index cf0e886314f2..5dffade2d7cd 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -1279,8 +1279,9 @@ void srcu_torture_stats_print(struct srcu_struct *ssp, char *tt, char *tf)  		c0 = l0 - u0;  		c1 = l1 - u1; -		pr_cont(" %d(%ld,%ld %1p)", -			cpu, c0, c1, rcu_segcblist_head(&sdp->srcu_cblist)); +		pr_cont(" %d(%ld,%ld %c)", +			cpu, c0, c1, +			"C."[rcu_segcblist_empty(&sdp->srcu_cblist)]);  		s0 += c0;  		s1 += c1;  	} diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index a14e5fbbea46..81105141b6a8 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -56,6 +56,7 @@  #include <linux/smpboot.h>  #include <linux/jiffies.h>  #include <linux/sched/isolation.h> +#include <linux/sched/clock.h>  #include "../time/tick-internal.h"  #include "tree.h" @@ -210,9 +211,9 @@ static long rcu_get_n_cbs_cpu(int cpu)  {  	struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); -	if (rcu_segcblist_is_enabled(&rdp->cblist)) /* Online normal CPU? */ +	if (rcu_segcblist_is_enabled(&rdp->cblist))  		return rcu_segcblist_n_cbs(&rdp->cblist); -	return rcu_get_n_cbs_nocb_cpu(rdp); /* Works for offline, too. */ +	return 0;  }  void rcu_softirq_qs(void) @@ -416,6 +417,12 @@ module_param(qlowmark, long, 0444);  static ulong jiffies_till_first_fqs = ULONG_MAX;  static ulong jiffies_till_next_fqs = ULONG_MAX;  static bool rcu_kick_kthreads; +static int rcu_divisor = 7; +module_param(rcu_divisor, int, 0644); + +/* Force an exit from rcu_do_batch() after 3 milliseconds. */ +static long rcu_resched_ns = 3 * NSEC_PER_MSEC; +module_param(rcu_resched_ns, long, 0644);  /*   * How long the grace period must be before we start recruiting @@ -1251,6 +1258,7 @@ static bool rcu_accelerate_cbs(struct rcu_node *rnp, struct rcu_data *rdp)  	unsigned long gp_seq_req;  	bool ret = false; +	rcu_lockdep_assert_cblist_protected(rdp);  	raw_lockdep_assert_held_rcu_node(rnp);  	/* If no pending (not yet ready to invoke) callbacks, nothing to do. */ @@ -1292,7 +1300,7 @@ static void rcu_accelerate_cbs_unlocked(struct rcu_node *rnp,  	unsigned long c;  	bool needwake; -	lockdep_assert_irqs_disabled(); +	rcu_lockdep_assert_cblist_protected(rdp);  	c = rcu_seq_snap(&rcu_state.gp_seq);  	if (!rdp->gpwrap && ULONG_CMP_GE(rdp->gp_seq_needed, c)) {  		/* Old request still live, so mark recent callbacks. 
*/ @@ -1318,6 +1326,7 @@ static void rcu_accelerate_cbs_unlocked(struct rcu_node *rnp,   */  static bool rcu_advance_cbs(struct rcu_node *rnp, struct rcu_data *rdp)  { +	rcu_lockdep_assert_cblist_protected(rdp);  	raw_lockdep_assert_held_rcu_node(rnp);  	/* If no pending (not yet ready to invoke) callbacks, nothing to do. */ @@ -1335,6 +1344,21 @@ static bool rcu_advance_cbs(struct rcu_node *rnp, struct rcu_data *rdp)  }  /* + * Move and classify callbacks, but only if doing so won't require + * that the RCU grace-period kthread be awakened. + */ +static void __maybe_unused rcu_advance_cbs_nowake(struct rcu_node *rnp, +						  struct rcu_data *rdp) +{ +	rcu_lockdep_assert_cblist_protected(rdp); +	if (!rcu_seq_state(rcu_seq_current(&rnp->gp_seq)) || +	    !raw_spin_trylock_rcu_node(rnp)) +		return; +	WARN_ON_ONCE(rcu_advance_cbs(rnp, rdp)); +	raw_spin_unlock_rcu_node(rnp); +} + +/*   * Update CPU-local rcu_data state to record the beginnings and ends of   * grace periods.  The caller must hold the ->lock of the leaf rcu_node   * structure corresponding to the current CPU, and must have irqs disabled. @@ -1342,8 +1366,10 @@ static bool rcu_advance_cbs(struct rcu_node *rnp, struct rcu_data *rdp)   */  static bool __note_gp_changes(struct rcu_node *rnp, struct rcu_data *rdp)  { -	bool ret; +	bool ret = false;  	bool need_gp; +	const bool offloaded = IS_ENABLED(CONFIG_RCU_NOCB_CPU) && +			       rcu_segcblist_is_offloaded(&rdp->cblist);  	raw_lockdep_assert_held_rcu_node(rnp); @@ -1353,10 +1379,12 @@ static bool __note_gp_changes(struct rcu_node *rnp, struct rcu_data *rdp)  	/* Handle the ends of any preceding grace periods first. */  	if (rcu_seq_completed_gp(rdp->gp_seq, rnp->gp_seq) ||  	    unlikely(READ_ONCE(rdp->gpwrap))) { -		ret = rcu_advance_cbs(rnp, rdp); /* Advance callbacks. */ +		if (!offloaded) +			ret = rcu_advance_cbs(rnp, rdp); /* Advance CBs. */  		trace_rcu_grace_period(rcu_state.name, rdp->gp_seq, TPS("cpuend"));  	} else { -		ret = rcu_accelerate_cbs(rnp, rdp); /* Recent callbacks. */ +		if (!offloaded) +			ret = rcu_accelerate_cbs(rnp, rdp); /* Recent CBs. */  	}  	/* Now handle the beginnings of any new-to-this-CPU grace periods. */ @@ -1657,6 +1685,7 @@ static void rcu_gp_cleanup(void)  	unsigned long gp_duration;  	bool needgp = false;  	unsigned long new_gp_seq; +	bool offloaded;  	struct rcu_data *rdp;  	struct rcu_node *rnp = rcu_get_root();  	struct swait_queue_head *sq; @@ -1722,7 +1751,9 @@ static void rcu_gp_cleanup(void)  		needgp = true;  	}  	/* Advance CBs to reduce false positives below. 
*/ -	if (!rcu_accelerate_cbs(rnp, rdp) && needgp) { +	offloaded = IS_ENABLED(CONFIG_RCU_NOCB_CPU) && +		    rcu_segcblist_is_offloaded(&rdp->cblist); +	if ((offloaded || !rcu_accelerate_cbs(rnp, rdp)) && needgp) {  		WRITE_ONCE(rcu_state.gp_flags, RCU_GP_FLAG_INIT);  		rcu_state.gp_req_activity = jiffies;  		trace_rcu_grace_period(rcu_state.name, @@ -1881,7 +1912,7 @@ rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags)  	struct rcu_node *rnp_p;  	raw_lockdep_assert_held_rcu_node(rnp); -	if (WARN_ON_ONCE(!IS_ENABLED(CONFIG_PREEMPT)) || +	if (WARN_ON_ONCE(!IS_ENABLED(CONFIG_PREEMPTION)) ||  	    WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp)) ||  	    rnp->qsmask != 0) {  		raw_spin_unlock_irqrestore_rcu_node(rnp, flags); @@ -1916,7 +1947,9 @@ rcu_report_qs_rdp(int cpu, struct rcu_data *rdp)  {  	unsigned long flags;  	unsigned long mask; -	bool needwake; +	bool needwake = false; +	const bool offloaded = IS_ENABLED(CONFIG_RCU_NOCB_CPU) && +			       rcu_segcblist_is_offloaded(&rdp->cblist);  	struct rcu_node *rnp;  	rnp = rdp->mynode; @@ -1943,7 +1976,8 @@ rcu_report_qs_rdp(int cpu, struct rcu_data *rdp)  		 * This GP can't end until cpu checks in, so all of our  		 * callbacks can be processed during the next GP.  		 */ -		needwake = rcu_accelerate_cbs(rnp, rdp); +		if (!offloaded) +			needwake = rcu_accelerate_cbs(rnp, rdp);  		rcu_report_qs_rnp(mask, rnp, rnp->gp_seq, flags);  		/* ^^^ Released rnp->lock */ @@ -2077,9 +2111,12 @@ int rcutree_dead_cpu(unsigned int cpu)  static void rcu_do_batch(struct rcu_data *rdp)  {  	unsigned long flags; +	const bool offloaded = IS_ENABLED(CONFIG_RCU_NOCB_CPU) && +			       rcu_segcblist_is_offloaded(&rdp->cblist);  	struct rcu_head *rhp;  	struct rcu_cblist rcl = RCU_CBLIST_INITIALIZER(rcl);  	long bl, count; +	long pending, tlimit = 0;  	/* If no callbacks are ready, just return. */  	if (!rcu_segcblist_ready_cbs(&rdp->cblist)) { @@ -2099,13 +2136,19 @@ static void rcu_do_batch(struct rcu_data *rdp)  	 * callback counts, as rcu_barrier() needs to be conservative.  	 */  	local_irq_save(flags); +	rcu_nocb_lock(rdp);  	WARN_ON_ONCE(cpu_is_offline(smp_processor_id())); -	bl = rdp->blimit; +	pending = rcu_segcblist_n_cbs(&rdp->cblist); +	bl = max(rdp->blimit, pending >> rcu_divisor); +	if (unlikely(bl > 100)) +		tlimit = local_clock() + rcu_resched_ns;  	trace_rcu_batch_start(rcu_state.name,  			      rcu_segcblist_n_lazy_cbs(&rdp->cblist),  			      rcu_segcblist_n_cbs(&rdp->cblist), bl);  	rcu_segcblist_extract_done_cbs(&rdp->cblist, &rcl); -	local_irq_restore(flags); +	if (offloaded) +		rdp->qlen_last_fqs_check = rcu_segcblist_n_cbs(&rdp->cblist); +	rcu_nocb_unlock_irqrestore(rdp, flags);  	/* Invoke callbacks. */  	rhp = rcu_cblist_dequeue(&rcl); @@ -2117,13 +2160,29 @@ static void rcu_do_batch(struct rcu_data *rdp)  		 * Stop only if limit reached and CPU has something to do.  		 * Note: The rcl structure counts down from zero.  		 */ -		if (-rcl.len >= bl && +		if (-rcl.len >= bl && !offloaded &&  		    (need_resched() ||  		     (!is_idle_task(current) && !rcu_is_callbacks_kthread())))  			break; +		if (unlikely(tlimit)) { +			/* only call local_clock() every 32 callbacks */ +			if (likely((-rcl.len & 31) || local_clock() < tlimit)) +				continue; +			/* Exceeded the time limit, so leave. 
*/ +			break; +		} +		if (offloaded) { +			WARN_ON_ONCE(in_serving_softirq()); +			local_bh_enable(); +			lockdep_assert_irqs_enabled(); +			cond_resched_tasks_rcu_qs(); +			lockdep_assert_irqs_enabled(); +			local_bh_disable(); +		}  	}  	local_irq_save(flags); +	rcu_nocb_lock(rdp);  	count = -rcl.len;  	trace_rcu_batch_end(rcu_state.name, count, !!rcl.head, need_resched(),  			    is_idle_task(current), rcu_is_callbacks_kthread()); @@ -2149,12 +2208,14 @@ static void rcu_do_batch(struct rcu_data *rdp)  	 * The following usually indicates a double call_rcu().  To track  	 * this down, try building with CONFIG_DEBUG_OBJECTS_RCU_HEAD=y.  	 */ -	WARN_ON_ONCE(rcu_segcblist_empty(&rdp->cblist) != (count == 0)); +	WARN_ON_ONCE(count == 0 && !rcu_segcblist_empty(&rdp->cblist)); +	WARN_ON_ONCE(!IS_ENABLED(CONFIG_RCU_NOCB_CPU) && +		     count != 0 && rcu_segcblist_empty(&rdp->cblist)); -	local_irq_restore(flags); +	rcu_nocb_unlock_irqrestore(rdp, flags);  	/* Re-invoke RCU core processing if there are callbacks remaining. */ -	if (rcu_segcblist_ready_cbs(&rdp->cblist)) +	if (!offloaded && rcu_segcblist_ready_cbs(&rdp->cblist))  		invoke_rcu_core();  } @@ -2205,7 +2266,7 @@ static void force_qs_rnp(int (*f)(struct rcu_data *rdp))  		mask = 0;  		raw_spin_lock_irqsave_rcu_node(rnp, flags);  		if (rnp->qsmask == 0) { -			if (!IS_ENABLED(CONFIG_PREEMPT) || +			if (!IS_ENABLED(CONFIG_PREEMPTION) ||  			    rcu_preempt_blocked_readers_cgp(rnp)) {  				/*  				 * No point in scanning bits because they @@ -2280,6 +2341,8 @@ static __latent_entropy void rcu_core(void)  	unsigned long flags;  	struct rcu_data *rdp = raw_cpu_ptr(&rcu_data);  	struct rcu_node *rnp = rdp->mynode; +	const bool offloaded = IS_ENABLED(CONFIG_RCU_NOCB_CPU) && +			       rcu_segcblist_is_offloaded(&rdp->cblist);  	if (cpu_is_offline(smp_processor_id()))  		return; @@ -2299,7 +2362,7 @@ static __latent_entropy void rcu_core(void)  	/* No grace period and unregistered callbacks? */  	if (!rcu_gp_in_progress() && -	    rcu_segcblist_is_enabled(&rdp->cblist)) { +	    rcu_segcblist_is_enabled(&rdp->cblist) && !offloaded) {  		local_irq_save(flags);  		if (!rcu_segcblist_restempty(&rdp->cblist, RCU_NEXT_READY_TAIL))  			rcu_accelerate_cbs_unlocked(rnp, rdp); @@ -2309,7 +2372,7 @@ static __latent_entropy void rcu_core(void)  	rcu_check_gp_start_stall(rnp, rdp, rcu_jiffies_till_stall_check());  	/* If there are callbacks ready, invoke them. */ -	if (rcu_segcblist_ready_cbs(&rdp->cblist) && +	if (!offloaded && rcu_segcblist_ready_cbs(&rdp->cblist) &&  	    likely(READ_ONCE(rcu_scheduler_fully_active)))  		rcu_do_batch(rdp); @@ -2489,10 +2552,11 @@ static void rcu_leak_callback(struct rcu_head *rhp)   * is expected to specify a CPU.   */  static void -__call_rcu(struct rcu_head *head, rcu_callback_t func, int cpu, bool lazy) +__call_rcu(struct rcu_head *head, rcu_callback_t func, bool lazy)  {  	unsigned long flags;  	struct rcu_data *rdp; +	bool was_alldone;  	/* Misaligned rcu_head! */  	WARN_ON_ONCE((unsigned long)head & (sizeof(void *) - 1)); @@ -2514,28 +2578,18 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func, int cpu, bool lazy)  	rdp = this_cpu_ptr(&rcu_data);  	/* Add the callback to our list. */ -	if (unlikely(!rcu_segcblist_is_enabled(&rdp->cblist)) || cpu != -1) { -		int offline; - -		if (cpu != -1) -			rdp = per_cpu_ptr(&rcu_data, cpu); -		if (likely(rdp->mynode)) { -			/* Post-boot, so this should be for a no-CBs CPU. 
*/ -			offline = !__call_rcu_nocb(rdp, head, lazy, flags); -			WARN_ON_ONCE(offline); -			/* Offline CPU, _call_rcu() illegal, leak callback.  */ -			local_irq_restore(flags); -			return; -		} -		/* -		 * Very early boot, before rcu_init().  Initialize if needed -		 * and then drop through to queue the callback. -		 */ -		WARN_ON_ONCE(cpu != -1); +	if (unlikely(!rcu_segcblist_is_enabled(&rdp->cblist))) { +		// This can trigger due to call_rcu() from offline CPU: +		WARN_ON_ONCE(rcu_scheduler_active != RCU_SCHEDULER_INACTIVE);  		WARN_ON_ONCE(!rcu_is_watching()); +		// Very early boot, before rcu_init().  Initialize if needed +		// and then drop through to queue the callback.  		if (rcu_segcblist_empty(&rdp->cblist))  			rcu_segcblist_init(&rdp->cblist);  	} +	if (rcu_nocb_try_bypass(rdp, head, &was_alldone, flags)) +		return; // Enqueued onto ->nocb_bypass, so just leave. +	/* If we get here, rcu_nocb_try_bypass() acquired ->nocb_lock. */  	rcu_segcblist_enqueue(&rdp->cblist, head, lazy);  	if (__is_kfree_rcu_offset((unsigned long)func))  		trace_rcu_kfree_callback(rcu_state.name, head, @@ -2548,8 +2602,13 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func, int cpu, bool lazy)  				   rcu_segcblist_n_cbs(&rdp->cblist));  	/* Go handle any RCU core processing required. */ -	__call_rcu_core(rdp, head, flags); -	local_irq_restore(flags); +	if (IS_ENABLED(CONFIG_RCU_NOCB_CPU) && +	    unlikely(rcu_segcblist_is_offloaded(&rdp->cblist))) { +		__call_rcu_nocb_wake(rdp, was_alldone, flags); /* unlocks */ +	} else { +		__call_rcu_core(rdp, head, flags); +		local_irq_restore(flags); +	}  }  /** @@ -2589,7 +2648,7 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func, int cpu, bool lazy)   */  void call_rcu(struct rcu_head *head, rcu_callback_t func)  { -	__call_rcu(head, func, -1, 0); +	__call_rcu(head, func, 0);  }  EXPORT_SYMBOL_GPL(call_rcu); @@ -2602,7 +2661,7 @@ EXPORT_SYMBOL_GPL(call_rcu);   */  void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func)  { -	__call_rcu(head, func, -1, 1); +	__call_rcu(head, func, 1);  }  EXPORT_SYMBOL_GPL(kfree_call_rcu); @@ -2622,7 +2681,7 @@ static int rcu_blocking_is_gp(void)  {  	int ret; -	if (IS_ENABLED(CONFIG_PREEMPT)) +	if (IS_ENABLED(CONFIG_PREEMPTION))  		return rcu_scheduler_active == RCU_SCHEDULER_INACTIVE;  	might_sleep();  /* Check for RCU read-side critical section. */  	preempt_disable(); @@ -2735,6 +2794,10 @@ static int rcu_pending(void)  	/* Check for CPU stalls, if enabled. */  	check_cpu_stall(rdp); +	/* Does this CPU need a deferred NOCB wakeup? */ +	if (rcu_nocb_need_deferred_wakeup(rdp)) +		return 1; +  	/* Is this CPU a NO_HZ_FULL CPU that should ignore RCU? */  	if (rcu_nohz_full_cpu())  		return 0; @@ -2750,6 +2813,8 @@ static int rcu_pending(void)  	/* Has RCU gone idle with this CPU needing another grace period? */  	if (!rcu_gp_in_progress() &&  	    rcu_segcblist_is_enabled(&rdp->cblist) && +	    (!IS_ENABLED(CONFIG_RCU_NOCB_CPU) || +	     !rcu_segcblist_is_offloaded(&rdp->cblist)) &&  	    !rcu_segcblist_restempty(&rdp->cblist, RCU_NEXT_READY_TAIL))  		return 1; @@ -2758,10 +2823,6 @@ static int rcu_pending(void)  	    unlikely(READ_ONCE(rdp->gpwrap))) /* outside lock */  		return 1; -	/* Does this CPU need a deferred NOCB wakeup? 
*/ -	if (rcu_nocb_need_deferred_wakeup(rdp)) -		return 1; -  	/* nothing to do */  	return 0;  } @@ -2801,6 +2862,8 @@ static void rcu_barrier_func(void *unused)  	rcu_barrier_trace(TPS("IRQ"), -1, rcu_state.barrier_sequence);  	rdp->barrier_head.func = rcu_barrier_callback;  	debug_rcu_head_queue(&rdp->barrier_head); +	rcu_nocb_lock(rdp); +	WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, jiffies));  	if (rcu_segcblist_entrain(&rdp->cblist, &rdp->barrier_head, 0)) {  		atomic_inc(&rcu_state.barrier_cpu_count);  	} else { @@ -2808,6 +2871,7 @@ static void rcu_barrier_func(void *unused)  		rcu_barrier_trace(TPS("IRQNQ"), -1,  				   rcu_state.barrier_sequence);  	} +	rcu_nocb_unlock(rdp);  }  /** @@ -2858,22 +2922,11 @@ void rcu_barrier(void)  	 * corresponding CPU's preceding callbacks have been invoked.  	 */  	for_each_possible_cpu(cpu) { -		if (!cpu_online(cpu) && !rcu_is_nocb_cpu(cpu)) -			continue;  		rdp = per_cpu_ptr(&rcu_data, cpu); -		if (rcu_is_nocb_cpu(cpu)) { -			if (!rcu_nocb_cpu_needs_barrier(cpu)) { -				rcu_barrier_trace(TPS("OfflineNoCB"), cpu, -						   rcu_state.barrier_sequence); -			} else { -				rcu_barrier_trace(TPS("OnlineNoCB"), cpu, -						   rcu_state.barrier_sequence); -				smp_mb__before_atomic(); -				atomic_inc(&rcu_state.barrier_cpu_count); -				__call_rcu(&rdp->barrier_head, -					   rcu_barrier_callback, cpu, 0); -			} -		} else if (rcu_segcblist_n_cbs(&rdp->cblist)) { +		if (!cpu_online(cpu) && +		    !rcu_segcblist_is_offloaded(&rdp->cblist)) +			continue; +		if (rcu_segcblist_n_cbs(&rdp->cblist)) {  			rcu_barrier_trace(TPS("OnlineQ"), cpu,  					   rcu_state.barrier_sequence);  			smp_call_function_single(cpu, rcu_barrier_func, NULL, 1); @@ -2958,7 +3011,8 @@ rcu_boot_init_percpu_data(int cpu)   * Initializes a CPU's per-CPU RCU data.  Note that only one online or   * offline event can be happening at a given time.  Note also that we can   * accept some slop in the rsp->gp_seq access due to the fact that this - * CPU cannot possibly have any RCU callbacks in flight yet. + * CPU cannot possibly have any non-offloaded RCU callbacks in flight yet. + * And any offloaded callbacks are being numbered elsewhere.   */  int rcutree_prepare_cpu(unsigned int cpu)  { @@ -2972,7 +3026,7 @@ int rcutree_prepare_cpu(unsigned int cpu)  	rdp->n_force_qs_snap = rcu_state.n_force_qs;  	rdp->blimit = blimit;  	if (rcu_segcblist_empty(&rdp->cblist) && /* No early-boot CBs? */ -	    !init_nocb_callback_list(rdp)) +	    !rcu_segcblist_is_offloaded(&rdp->cblist))  		rcu_segcblist_init(&rdp->cblist);  /* Re-enable callbacks. */  	rdp->dynticks_nesting = 1;	/* CPU not up, no tearing. */  	rcu_dynticks_eqs_online(); @@ -3151,29 +3205,38 @@ void rcutree_migrate_callbacks(int cpu)  {  	unsigned long flags;  	struct rcu_data *my_rdp; +	struct rcu_node *my_rnp;  	struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); -	struct rcu_node *rnp_root = rcu_get_root();  	bool needwake; -	if (rcu_is_nocb_cpu(cpu) || rcu_segcblist_empty(&rdp->cblist)) +	if (rcu_segcblist_is_offloaded(&rdp->cblist) || +	    rcu_segcblist_empty(&rdp->cblist))  		return;  /* No callbacks to migrate. */  	local_irq_save(flags);  	my_rdp = this_cpu_ptr(&rcu_data); -	if (rcu_nocb_adopt_orphan_cbs(my_rdp, rdp, flags)) { -		local_irq_restore(flags); -		return; -	} -	raw_spin_lock_rcu_node(rnp_root); /* irqs already disabled. */ +	my_rnp = my_rdp->mynode; +	rcu_nocb_lock(my_rdp); /* irqs already disabled. 
*/ +	WARN_ON_ONCE(!rcu_nocb_flush_bypass(my_rdp, NULL, jiffies)); +	raw_spin_lock_rcu_node(my_rnp); /* irqs already disabled. */  	/* Leverage recent GPs and set GP for new callbacks. */ -	needwake = rcu_advance_cbs(rnp_root, rdp) || -		   rcu_advance_cbs(rnp_root, my_rdp); +	needwake = rcu_advance_cbs(my_rnp, rdp) || +		   rcu_advance_cbs(my_rnp, my_rdp);  	rcu_segcblist_merge(&my_rdp->cblist, &rdp->cblist); +	needwake = needwake || rcu_advance_cbs(my_rnp, my_rdp); +	rcu_segcblist_disable(&rdp->cblist);  	WARN_ON_ONCE(rcu_segcblist_empty(&my_rdp->cblist) !=  		     !rcu_segcblist_n_cbs(&my_rdp->cblist)); -	raw_spin_unlock_irqrestore_rcu_node(rnp_root, flags); +	if (rcu_segcblist_is_offloaded(&my_rdp->cblist)) { +		raw_spin_unlock_rcu_node(my_rnp); /* irqs remain disabled. */ +		__call_rcu_nocb_wake(my_rdp, true, flags); +	} else { +		rcu_nocb_unlock(my_rdp); /* irqs remain disabled. */ +		raw_spin_unlock_irqrestore_rcu_node(my_rnp, flags); +	}  	if (needwake)  		rcu_gp_kthread_wake(); +	lockdep_assert_irqs_enabled();  	WARN_ONCE(rcu_segcblist_n_cbs(&rdp->cblist) != 0 ||  		  !rcu_segcblist_empty(&rdp->cblist),  		  "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, 1stCB=%p\n", @@ -3234,13 +3297,13 @@ static int __init rcu_spawn_gp_kthread(void)  	t = kthread_create(rcu_gp_kthread, NULL, "%s", rcu_state.name);  	if (WARN_ONCE(IS_ERR(t), "%s: Could not start grace-period kthread, OOM is now expected behavior\n", __func__))  		return 0; -	rnp = rcu_get_root(); -	raw_spin_lock_irqsave_rcu_node(rnp, flags); -	rcu_state.gp_kthread = t;  	if (kthread_prio) {  		sp.sched_priority = kthread_prio;  		sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);  	} +	rnp = rcu_get_root(); +	raw_spin_lock_irqsave_rcu_node(rnp, flags); +	rcu_state.gp_kthread = t;  	raw_spin_unlock_irqrestore_rcu_node(rnp, flags);  	wake_up_process(t);  	rcu_spawn_nocb_kthreads(); diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 7acaf3a62d39..c612f306fe89 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -194,29 +194,38 @@ struct rcu_data {  	/* 5) Callback offloading. */  #ifdef CONFIG_RCU_NOCB_CPU -	struct rcu_head *nocb_head;	/* CBs waiting for kthread. */ -	struct rcu_head **nocb_tail; -	atomic_long_t nocb_q_count;	/* # CBs waiting for nocb */ -	atomic_long_t nocb_q_count_lazy; /*  invocation (all stages). */ -	struct rcu_head *nocb_follower_head; /* CBs ready to invoke. */ -	struct rcu_head **nocb_follower_tail; -	struct swait_queue_head nocb_wq; /* For nocb kthreads to sleep on. */ -	struct task_struct *nocb_kthread; +	struct swait_queue_head nocb_cb_wq; /* For nocb kthreads to sleep on. */ +	struct task_struct *nocb_gp_kthread;  	raw_spinlock_t nocb_lock;	/* Guard following pair of fields. */ +	atomic_t nocb_lock_contended;	/* Contention experienced. */  	int nocb_defer_wakeup;		/* Defer wakeup of nocb_kthread. */  	struct timer_list nocb_timer;	/* Enforce finite deferral. */ - -	/* The following fields are used by the leader, hence own cacheline. */ -	struct rcu_head *nocb_gp_head ____cacheline_internodealigned_in_smp; -					/* CBs waiting for GP. */ -	struct rcu_head **nocb_gp_tail; -	bool nocb_leader_sleep;		/* Is the nocb leader thread asleep? */ -	struct rcu_data *nocb_next_follower; -					/* Next follower in wakeup chain. */ - -	/* The following fields are used by the follower, hence new cachline. */ -	struct rcu_data *nocb_leader ____cacheline_internodealigned_in_smp; -					/* Leader CPU takes GP-end wakeups. */ +	unsigned long nocb_gp_adv_time;	/* Last call_rcu() CB adv (jiffies). 
*/ + +	/* The following fields are used by call_rcu, hence own cacheline. */ +	raw_spinlock_t nocb_bypass_lock ____cacheline_internodealigned_in_smp; +	struct rcu_cblist nocb_bypass;	/* Lock-contention-bypass CB list. */ +	unsigned long nocb_bypass_first; /* Time (jiffies) of first enqueue. */ +	unsigned long nocb_nobypass_last; /* Last ->cblist enqueue (jiffies). */ +	int nocb_nobypass_count;	/* # ->cblist enqueues at ^^^ time. */ + +	/* The following fields are used by GP kthread, hence own cacheline. */ +	raw_spinlock_t nocb_gp_lock ____cacheline_internodealigned_in_smp; +	struct timer_list nocb_bypass_timer; /* Force nocb_bypass flush. */ +	u8 nocb_gp_sleep;		/* Is the nocb GP thread asleep? */ +	u8 nocb_gp_bypass;		/* Found a bypass on last scan? */ +	u8 nocb_gp_gp;			/* GP to wait for on last scan? */ +	unsigned long nocb_gp_seq;	/*  If so, ->gp_seq to wait for. */ +	unsigned long nocb_gp_loops;	/* # passes through wait code. */ +	struct swait_queue_head nocb_gp_wq; /* For nocb kthreads to sleep on. */ +	bool nocb_cb_sleep;		/* Is the nocb CB thread asleep? */ +	struct task_struct *nocb_cb_kthread; +	struct rcu_data *nocb_next_cb_rdp; +					/* Next rcu_data in wakeup chain. */ + +	/* The following fields are used by CB kthread, hence new cacheline. */ +	struct rcu_data *nocb_gp_rdp ____cacheline_internodealigned_in_smp; +					/* GP rdp takes GP-end wakeups. */  #endif /* #ifdef CONFIG_RCU_NOCB_CPU */  	/* 6) RCU priority boosting. */ @@ -419,25 +428,39 @@ static bool rcu_preempt_has_tasks(struct rcu_node *rnp);  static bool rcu_preempt_need_deferred_qs(struct task_struct *t);  static void rcu_preempt_deferred_qs(struct task_struct *t);  static void zero_cpu_stall_ticks(struct rcu_data *rdp); -static bool rcu_nocb_cpu_needs_barrier(int cpu);  static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp);  static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq);  static void rcu_init_one_nocb(struct rcu_node *rnp); -static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, -			    bool lazy, unsigned long flags); -static bool rcu_nocb_adopt_orphan_cbs(struct rcu_data *my_rdp, -				      struct rcu_data *rdp, -				      unsigned long flags); +static bool rcu_nocb_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp, +				  unsigned long j); +static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp, +				bool *was_alldone, unsigned long flags); +static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_empty, +				 unsigned long flags);  static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp);  static void do_nocb_deferred_wakeup(struct rcu_data *rdp);  static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp);  static void rcu_spawn_cpu_nocb_kthread(int cpu);  static void __init rcu_spawn_nocb_kthreads(void); +static void show_rcu_nocb_state(struct rcu_data *rdp); +static void rcu_nocb_lock(struct rcu_data *rdp); +static void rcu_nocb_unlock(struct rcu_data *rdp); +static void rcu_nocb_unlock_irqrestore(struct rcu_data *rdp, +				       unsigned long flags); +static void rcu_lockdep_assert_cblist_protected(struct rcu_data *rdp);  #ifdef CONFIG_RCU_NOCB_CPU  static void __init rcu_organize_nocb_kthreads(void); -#endif /* #ifdef CONFIG_RCU_NOCB_CPU */ -static bool init_nocb_callback_list(struct rcu_data *rdp); -static unsigned long rcu_get_n_cbs_nocb_cpu(struct rcu_data *rdp); +#define rcu_nocb_lock_irqsave(rdp, flags)				\ +do {									\ +	if (!rcu_segcblist_is_offloaded(&(rdp)->cblist))		\ +		local_irq_save(flags);					
\ +	else								\ +		raw_spin_lock_irqsave(&(rdp)->nocb_lock, (flags));	\ +} while (0) +#else /* #ifdef CONFIG_RCU_NOCB_CPU */ +#define rcu_nocb_lock_irqsave(rdp, flags) local_irq_save(flags) +#endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */ +  static void rcu_bind_gp_kthread(void);  static bool rcu_nohz_full_cpu(void);  static void rcu_dynticks_task_enter(void); diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index af7e7b9c86af..d632cd019597 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -781,7 +781,7 @@ static int rcu_print_task_exp_stall(struct rcu_node *rnp)   * other hand, if the CPU is not in an RCU read-side critical section,   * the IPI handler reports the quiescent state immediately.   * - * Although this is a greate improvement over previous expedited + * Although this is a great improvement over previous expedited   * implementations, it is still unfriendly to real-time workloads, so is   * thus not recommended for any sort of common-case code.  In fact, if   * you are using synchronize_rcu_expedited() in a loop, please restructure @@ -792,6 +792,7 @@ static int rcu_print_task_exp_stall(struct rcu_node *rnp)   */  void synchronize_rcu_expedited(void)  { +	bool boottime = (rcu_scheduler_active == RCU_SCHEDULER_INIT);  	struct rcu_exp_work rew;  	struct rcu_node *rnp;  	unsigned long s; @@ -817,7 +818,7 @@ void synchronize_rcu_expedited(void)  		return;  /* Someone else did our work for us. */  	/* Ensure that load happens before action based on it. */ -	if (unlikely(rcu_scheduler_active == RCU_SCHEDULER_INIT)) { +	if (unlikely(boottime)) {  		/* Direct call during scheduler init and early_initcalls(). */  		rcu_exp_sel_wait_wake(s);  	} else { @@ -835,5 +836,8 @@ void synchronize_rcu_expedited(void)  	/* Let the next expedited grace period start. */  	mutex_unlock(&rcu_state.exp_mutex); + +	if (likely(!boottime)) +		destroy_work_on_stack(&rew.rew_work);  }  EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index acb225023ed1..2defc7fe74c3 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -288,7 +288,6 @@ void rcu_note_context_switch(bool preempt)  	struct rcu_data *rdp = this_cpu_ptr(&rcu_data);  	struct rcu_node *rnp; -	barrier(); /* Avoid RCU read-side critical sections leaking down. */  	trace_rcu_utilization(TPS("Start context switch"));  	lockdep_assert_irqs_disabled();  	WARN_ON_ONCE(!preempt && t->rcu_read_lock_nesting > 0); @@ -314,15 +313,6 @@ void rcu_note_context_switch(bool preempt)  				       ? rnp->gp_seq  				       : rcu_seq_snap(&rnp->gp_seq));  		rcu_preempt_ctxt_queue(rnp, rdp); -	} else if (t->rcu_read_lock_nesting < 0 && -		   t->rcu_read_unlock_special.s) { - -		/* -		 * Complete exit from RCU read-side critical section on -		 * behalf of preempted instance of __rcu_read_unlock(). -		 */ -		rcu_read_unlock_special(t); -		rcu_preempt_deferred_qs(t);  	} else {  		rcu_preempt_deferred_qs(t);  	} @@ -340,7 +330,6 @@ void rcu_note_context_switch(bool preempt)  	if (rdp->exp_deferred_qs)  		rcu_report_exp_rdp(rdp);  	trace_rcu_utilization(TPS("End context switch")); -	barrier(); /* Avoid RCU read-side critical sections leaking up. */  }  EXPORT_SYMBOL_GPL(rcu_note_context_switch); @@ -626,22 +615,18 @@ static void rcu_read_unlock_special(struct task_struct *t)  		      (rdp->grpmask & rnp->expmask) ||  		      tick_nohz_full_cpu(rdp->cpu);  		// Need to defer quiescent state until everything is enabled. 
-		if ((exp || in_irq()) && irqs_were_disabled && use_softirq && -		    (in_irq() || !t->rcu_read_unlock_special.b.deferred_qs)) { +		if (irqs_were_disabled && use_softirq && +		    (in_interrupt() || +		     (exp && !t->rcu_read_unlock_special.b.deferred_qs))) {  			// Using softirq, safe to awaken, and we get  			// no help from enabling irqs, unlike bh/preempt.  			raise_softirq_irqoff(RCU_SOFTIRQ); -		} else if (exp && irqs_were_disabled && !use_softirq && -			   !t->rcu_read_unlock_special.b.deferred_qs) { -			// Safe to awaken and we get no help from enabling -			// irqs, unlike bh/preempt. -			invoke_rcu_core();  		} else {  			// Enabling BH or preempt does reschedule, so...  			// Also if no expediting or NO_HZ_FULL, slow is OK.  			set_tsk_need_resched(current);  			set_preempt_need_resched(); -			if (IS_ENABLED(CONFIG_IRQ_WORK) && +			if (IS_ENABLED(CONFIG_IRQ_WORK) && irqs_were_disabled &&  			    !rdp->defer_qs_iw_pending && exp) {  				// Get scheduler to re-evaluate and call hooks.  				// If !IRQ_WORK, FQS scan will eventually IPI. @@ -828,11 +813,6 @@ static void rcu_qs(void)   * dyntick-idle quiescent state visible to other CPUs, which will in   * some cases serve for expedited as well as normal grace periods.   * Either way, register a lightweight quiescent state. - * - * The barrier() calls are redundant in the common case when this is - * called externally, but just in case this is called from within this - * file. - *   */  void rcu_all_qs(void)  { @@ -847,14 +827,12 @@ void rcu_all_qs(void)  		return;  	}  	this_cpu_write(rcu_data.rcu_urgent_qs, false); -	barrier(); /* Avoid RCU read-side critical sections leaking down. */  	if (unlikely(raw_cpu_read(rcu_data.rcu_need_heavy_qs))) {  		local_irq_save(flags);  		rcu_momentary_dyntick_idle();  		local_irq_restore(flags);  	}  	rcu_qs(); -	barrier(); /* Avoid RCU read-side critical sections leaking up. */  	preempt_enable();  }  EXPORT_SYMBOL_GPL(rcu_all_qs); @@ -864,7 +842,6 @@ EXPORT_SYMBOL_GPL(rcu_all_qs);   */  void rcu_note_context_switch(bool preempt)  { -	barrier(); /* Avoid RCU read-side critical sections leaking down. */  	trace_rcu_utilization(TPS("Start context switch"));  	rcu_qs();  	/* Load rcu_urgent_qs before other flags. */ @@ -877,7 +854,6 @@ void rcu_note_context_switch(bool preempt)  		rcu_tasks_qs(current);  out:  	trace_rcu_utilization(TPS("End context switch")); -	barrier(); /* Avoid RCU read-side critical sections leaking up. */  }  EXPORT_SYMBOL_GPL(rcu_note_context_switch); @@ -1134,7 +1110,7 @@ static void rcu_preempt_boost_start_gp(struct rcu_node *rnp)   * already exist.  We only create this kthread for preemptible RCU.   * Returns zero if all is well, a negated errno otherwise.   
*/ -static int rcu_spawn_one_boost_kthread(struct rcu_node *rnp) +static void rcu_spawn_one_boost_kthread(struct rcu_node *rnp)  {  	int rnp_index = rnp - rcu_get_root();  	unsigned long flags; @@ -1142,25 +1118,27 @@ static int rcu_spawn_one_boost_kthread(struct rcu_node *rnp)  	struct task_struct *t;  	if (!IS_ENABLED(CONFIG_PREEMPT_RCU)) -		return 0; +		return;  	if (!rcu_scheduler_fully_active || rcu_rnp_online_cpus(rnp) == 0) -		return 0; +		return;  	rcu_state.boost = 1; +  	if (rnp->boost_kthread_task != NULL) -		return 0; +		return; +  	t = kthread_create(rcu_boost_kthread, (void *)rnp,  			   "rcub/%d", rnp_index); -	if (IS_ERR(t)) -		return PTR_ERR(t); +	if (WARN_ON_ONCE(IS_ERR(t))) +		return; +  	raw_spin_lock_irqsave_rcu_node(rnp, flags);  	rnp->boost_kthread_task = t;  	raw_spin_unlock_irqrestore_rcu_node(rnp, flags);  	sp.sched_priority = kthread_prio;  	sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);  	wake_up_process(t); /* get to TASK_INTERRUPTIBLE quickly. */ -	return 0;  }  /* @@ -1201,7 +1179,7 @@ static void __init rcu_spawn_boost_kthreads(void)  	struct rcu_node *rnp;  	rcu_for_each_leaf_node(rnp) -		(void)rcu_spawn_one_boost_kthread(rnp); +		rcu_spawn_one_boost_kthread(rnp);  }  static void rcu_prepare_kthreads(int cpu) @@ -1211,7 +1189,7 @@ static void rcu_prepare_kthreads(int cpu)  	/* Fire up the incoming CPU's kthread and leaf rcu_node kthread. */  	if (rcu_scheduler_fully_active) -		(void)rcu_spawn_one_boost_kthread(rnp); +		rcu_spawn_one_boost_kthread(rnp);  }  #else /* #ifdef CONFIG_RCU_BOOST */ @@ -1248,10 +1226,10 @@ static void rcu_prepare_kthreads(int cpu)  #if !defined(CONFIG_RCU_FAST_NO_HZ)  /* - * Check to see if any future RCU-related work will need to be done - * by the current CPU, even if none need be done immediately, returning - * 1 if so.  This function is part of the RCU implementation; it is -not- - * an exported member of the RCU API. + * Check to see if any future non-offloaded RCU-related work will need + * to be done by the current CPU, even if none need be done immediately, + * returning 1 if so.  This function is part of the RCU implementation; + * it is -not- an exported member of the RCU API.   *   * Because we do not have RCU_FAST_NO_HZ, just check whether or not this   * CPU has RCU callbacks queued. @@ -1259,7 +1237,8 @@ static void rcu_prepare_kthreads(int cpu)  int rcu_needs_cpu(u64 basemono, u64 *nextevt)  {  	*nextevt = KTIME_MAX; -	return !rcu_segcblist_empty(&this_cpu_ptr(&rcu_data)->cblist); +	return !rcu_segcblist_empty(&this_cpu_ptr(&rcu_data)->cblist) && +	       !rcu_segcblist_is_offloaded(&this_cpu_ptr(&rcu_data)->cblist);  }  /* @@ -1360,8 +1339,9 @@ int rcu_needs_cpu(u64 basemono, u64 *nextevt)  	lockdep_assert_irqs_disabled(); -	/* If no callbacks, RCU doesn't need the CPU. */ -	if (rcu_segcblist_empty(&rdp->cblist)) { +	/* If no non-offloaded callbacks, RCU doesn't need the CPU. */ +	if (rcu_segcblist_empty(&rdp->cblist) || +	    rcu_segcblist_is_offloaded(&this_cpu_ptr(&rcu_data)->cblist)) {  		*nextevt = KTIME_MAX;  		return 0;  	} @@ -1404,7 +1384,7 @@ static void rcu_prepare_for_idle(void)  	int tne;  	lockdep_assert_irqs_disabled(); -	if (rcu_is_nocb_cpu(smp_processor_id())) +	if (rcu_segcblist_is_offloaded(&rdp->cblist))  		return;  	/* Handle nohz enablement switches conservatively. 
*/ @@ -1453,8 +1433,10 @@ static void rcu_prepare_for_idle(void)   */  static void rcu_cleanup_after_idle(void)  { +	struct rcu_data *rdp = this_cpu_ptr(&rcu_data); +  	lockdep_assert_irqs_disabled(); -	if (rcu_is_nocb_cpu(smp_processor_id())) +	if (rcu_segcblist_is_offloaded(&rdp->cblist))  		return;  	if (rcu_try_advance_all_cbs())  		invoke_rcu_core(); @@ -1469,10 +1451,10 @@ static void rcu_cleanup_after_idle(void)   * specified by rcu_nocb_mask.  For the CPUs in the set, there are kthreads   * created that pull the callbacks from the corresponding CPU, wait for   * a grace period to elapse, and invoke the callbacks.  These kthreads - * are organized into leaders, which manage incoming callbacks, wait for - * grace periods, and awaken followers, and the followers, which only - * invoke callbacks.  Each leader is its own follower.  The no-CBs CPUs - * do a wake_up() on their kthread when they insert a callback into any + * are organized into GP kthreads, which manage incoming callbacks, wait for + * grace periods, and awaken CB kthreads, and the CB kthreads, which only + * invoke callbacks.  Each GP kthread invokes its own CBs.  The no-CBs CPUs + * do a wake_up() on their GP kthread when they insert a callback into any   * empty list, unless the rcu_nocb_poll boot parameter has been specified,   * in which case each kthread actively polls its CPU.  (Which isn't so great   * for energy efficiency, but which does reduce RCU's overhead on that CPU.) @@ -1515,6 +1497,116 @@ static int __init parse_rcu_nocb_poll(char *arg)  early_param("rcu_nocb_poll", parse_rcu_nocb_poll);  /* + * Don't bother bypassing ->cblist if the call_rcu() rate is low. + * After all, the main point of bypassing is to avoid lock contention + * on ->nocb_lock, which only can happen at high call_rcu() rates. + */ +int nocb_nobypass_lim_per_jiffy = 16 * 1000 / HZ; +module_param(nocb_nobypass_lim_per_jiffy, int, 0); + +/* + * Acquire the specified rcu_data structure's ->nocb_bypass_lock.  If the + * lock isn't immediately available, increment ->nocb_lock_contended to + * flag the contention. + */ +static void rcu_nocb_bypass_lock(struct rcu_data *rdp) +{ +	lockdep_assert_irqs_disabled(); +	if (raw_spin_trylock(&rdp->nocb_bypass_lock)) +		return; +	atomic_inc(&rdp->nocb_lock_contended); +	WARN_ON_ONCE(smp_processor_id() != rdp->cpu); +	smp_mb__after_atomic(); /* atomic_inc() before lock. */ +	raw_spin_lock(&rdp->nocb_bypass_lock); +	smp_mb__before_atomic(); /* atomic_dec() after lock. */ +	atomic_dec(&rdp->nocb_lock_contended); +} + +/* + * Spinwait until the specified rcu_data structure's ->nocb_lock is + * not contended.  Please note that this is extremely special-purpose, + * relying on the fact that at most two kthreads and one CPU contend for + * this lock, and also that the two kthreads are guaranteed to have frequent + * grace-period-duration time intervals between successive acquisitions + * of the lock.  This allows us to use an extremely simple throttling + * mechanism, and further to apply it only to the CPU doing floods of + * call_rcu() invocations.  Don't try this at home! + */ +static void rcu_nocb_wait_contended(struct rcu_data *rdp) +{ +	WARN_ON_ONCE(smp_processor_id() != rdp->cpu); +	while (WARN_ON_ONCE(atomic_read(&rdp->nocb_lock_contended))) +		cpu_relax(); +} + +/* + * Conditionally acquire the specified rcu_data structure's + * ->nocb_bypass_lock. 
+ */ +static bool rcu_nocb_bypass_trylock(struct rcu_data *rdp) +{ +	lockdep_assert_irqs_disabled(); +	return raw_spin_trylock(&rdp->nocb_bypass_lock); +} + +/* + * Release the specified rcu_data structure's ->nocb_bypass_lock. + */ +static void rcu_nocb_bypass_unlock(struct rcu_data *rdp) +{ +	lockdep_assert_irqs_disabled(); +	raw_spin_unlock(&rdp->nocb_bypass_lock); +} + +/* + * Acquire the specified rcu_data structure's ->nocb_lock, but only + * if it corresponds to a no-CBs CPU. + */ +static void rcu_nocb_lock(struct rcu_data *rdp) +{ +	lockdep_assert_irqs_disabled(); +	if (!rcu_segcblist_is_offloaded(&rdp->cblist)) +		return; +	raw_spin_lock(&rdp->nocb_lock); +} + +/* + * Release the specified rcu_data structure's ->nocb_lock, but only + * if it corresponds to a no-CBs CPU. + */ +static void rcu_nocb_unlock(struct rcu_data *rdp) +{ +	if (rcu_segcblist_is_offloaded(&rdp->cblist)) { +		lockdep_assert_irqs_disabled(); +		raw_spin_unlock(&rdp->nocb_lock); +	} +} + +/* + * Release the specified rcu_data structure's ->nocb_lock and restore + * interrupts, but only if it corresponds to a no-CBs CPU. + */ +static void rcu_nocb_unlock_irqrestore(struct rcu_data *rdp, +				       unsigned long flags) +{ +	if (rcu_segcblist_is_offloaded(&rdp->cblist)) { +		lockdep_assert_irqs_disabled(); +		raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); +	} else { +		local_irq_restore(flags); +	} +} + +/* Lockdep check that ->cblist may be safely accessed. */ +static void rcu_lockdep_assert_cblist_protected(struct rcu_data *rdp) +{ +	lockdep_assert_irqs_disabled(); +	if (rcu_segcblist_is_offloaded(&rdp->cblist) && +	    cpu_online(rdp->cpu)) +		lockdep_assert_held(&rdp->nocb_lock); +} + +/*   * Wake up any no-CBs CPUs' kthreads that were waiting on the just-ended   * grace period.   */ @@ -1543,440 +1635,514 @@ bool rcu_is_nocb_cpu(int cpu)  }  /* - * Kick the leader kthread for this NOCB group.  Caller holds ->nocb_lock + * Kick the GP kthread for this NOCB group.  Caller holds ->nocb_lock   * and this function releases it.   */ -static void __wake_nocb_leader(struct rcu_data *rdp, bool force, -			       unsigned long flags) +static void wake_nocb_gp(struct rcu_data *rdp, bool force, +			   unsigned long flags)  	__releases(rdp->nocb_lock)  { -	struct rcu_data *rdp_leader = rdp->nocb_leader; +	bool needwake = false; +	struct rcu_data *rdp_gp = rdp->nocb_gp_rdp;  	lockdep_assert_held(&rdp->nocb_lock); -	if (!READ_ONCE(rdp_leader->nocb_kthread)) { -		raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); +	if (!READ_ONCE(rdp_gp->nocb_gp_kthread)) { +		trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, +				    TPS("AlreadyAwake")); +		rcu_nocb_unlock_irqrestore(rdp, flags);  		return;  	} -	if (rdp_leader->nocb_leader_sleep || force) { -		/* Prior smp_mb__after_atomic() orders against prior enqueue. */ -		WRITE_ONCE(rdp_leader->nocb_leader_sleep, false); -		del_timer(&rdp->nocb_timer); -		raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); -		smp_mb(); /* ->nocb_leader_sleep before swake_up_one(). 
*/ -		swake_up_one(&rdp_leader->nocb_wq); -	} else { -		raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); +	del_timer(&rdp->nocb_timer); +	rcu_nocb_unlock_irqrestore(rdp, flags); +	raw_spin_lock_irqsave(&rdp_gp->nocb_gp_lock, flags); +	if (force || READ_ONCE(rdp_gp->nocb_gp_sleep)) { +		WRITE_ONCE(rdp_gp->nocb_gp_sleep, false); +		needwake = true; +		trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("DoWake"));  	} +	raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags); +	if (needwake) +		wake_up_process(rdp_gp->nocb_gp_kthread);  }  /* - * Kick the leader kthread for this NOCB group, but caller has not - * acquired locks. + * Arrange to wake the GP kthread for this NOCB group at some future + * time when it is safe to do so.   */ -static void wake_nocb_leader(struct rcu_data *rdp, bool force) +static void wake_nocb_gp_defer(struct rcu_data *rdp, int waketype, +			       const char *reason)  { -	unsigned long flags; +	if (rdp->nocb_defer_wakeup == RCU_NOCB_WAKE_NOT) +		mod_timer(&rdp->nocb_timer, jiffies + 1); +	if (rdp->nocb_defer_wakeup < waketype) +		WRITE_ONCE(rdp->nocb_defer_wakeup, waketype); +	trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, reason); +} + +/* + * Flush the ->nocb_bypass queue into ->cblist, enqueuing rhp if non-NULL. + * However, if there is a callback to be enqueued and if ->nocb_bypass + * proves to be initially empty, just return false because the no-CB GP + * kthread may need to be awakened in this case. + * + * Note that this function always returns true if rhp is NULL. + */ +static bool rcu_nocb_do_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp, +				     unsigned long j) +{ +	struct rcu_cblist rcl; -	raw_spin_lock_irqsave(&rdp->nocb_lock, flags); -	__wake_nocb_leader(rdp, force, flags); +	WARN_ON_ONCE(!rcu_segcblist_is_offloaded(&rdp->cblist)); +	rcu_lockdep_assert_cblist_protected(rdp); +	lockdep_assert_held(&rdp->nocb_bypass_lock); +	if (rhp && !rcu_cblist_n_cbs(&rdp->nocb_bypass)) { +		raw_spin_unlock(&rdp->nocb_bypass_lock); +		return false; +	} +	/* Note: ->cblist.len already accounts for ->nocb_bypass contents. */ +	if (rhp) +		rcu_segcblist_inc_len(&rdp->cblist); /* Must precede enqueue. */ +	rcu_cblist_flush_enqueue(&rcl, &rdp->nocb_bypass, rhp); +	rcu_segcblist_insert_pend_cbs(&rdp->cblist, &rcl); +	WRITE_ONCE(rdp->nocb_bypass_first, j); +	rcu_nocb_bypass_unlock(rdp); +	return true;  }  /* - * Arrange to wake the leader kthread for this NOCB group at some - * future time when it is safe to do so. + * Flush the ->nocb_bypass queue into ->cblist, enqueuing rhp if non-NULL. + * However, if there is a callback to be enqueued and if ->nocb_bypass + * proves to be initially empty, just return false because the no-CB GP + * kthread may need to be awakened in this case. + * + * Note that this function always returns true if rhp is NULL.   
*/ -static void wake_nocb_leader_defer(struct rcu_data *rdp, int waketype, -				   const char *reason) +static bool rcu_nocb_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp, +				  unsigned long j)  { -	unsigned long flags; +	if (!rcu_segcblist_is_offloaded(&rdp->cblist)) +		return true; +	rcu_lockdep_assert_cblist_protected(rdp); +	rcu_nocb_bypass_lock(rdp); +	return rcu_nocb_do_flush_bypass(rdp, rhp, j); +} -	raw_spin_lock_irqsave(&rdp->nocb_lock, flags); -	if (rdp->nocb_defer_wakeup == RCU_NOCB_WAKE_NOT) -		mod_timer(&rdp->nocb_timer, jiffies + 1); -	WRITE_ONCE(rdp->nocb_defer_wakeup, waketype); -	trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, reason); -	raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); +/* + * If the ->nocb_bypass_lock is immediately available, flush the + * ->nocb_bypass queue into ->cblist. + */ +static void rcu_nocb_try_flush_bypass(struct rcu_data *rdp, unsigned long j) +{ +	rcu_lockdep_assert_cblist_protected(rdp); +	if (!rcu_segcblist_is_offloaded(&rdp->cblist) || +	    !rcu_nocb_bypass_trylock(rdp)) +		return; +	WARN_ON_ONCE(!rcu_nocb_do_flush_bypass(rdp, NULL, j));  } -/* Does rcu_barrier need to queue an RCU callback on the specified CPU?  */ -static bool rcu_nocb_cpu_needs_barrier(int cpu) +/* + * See whether it is appropriate to use the ->nocb_bypass list in order + * to control contention on ->nocb_lock.  A limited number of direct + * enqueues are permitted into ->cblist per jiffy.  If ->nocb_bypass + * is non-empty, further callbacks must be placed into ->nocb_bypass, + * otherwise rcu_barrier() breaks.  Use rcu_nocb_flush_bypass() to switch + * back to direct use of ->cblist.  However, ->nocb_bypass should not be + * used if ->cblist is empty, because otherwise callbacks can be stranded + * on ->nocb_bypass because we cannot count on the current CPU ever again + * invoking call_rcu().  The general rule is that if ->nocb_bypass is + * non-empty, the corresponding no-CBs grace-period kthread must not be + * in an indefinite sleep state. + * + * Finally, it is not permitted to use the bypass during early boot, + * as doing so would confuse the auto-initialization code.  Besides + * which, there is no point in worrying about lock contention while + * there is only one CPU in operation. + */ +static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp, +				bool *was_alldone, unsigned long flags)  { -	struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); -	unsigned long ret; -#ifdef CONFIG_PROVE_RCU -	struct rcu_head *rhp; -#endif /* #ifdef CONFIG_PROVE_RCU */ +	unsigned long c; +	unsigned long cur_gp_seq; +	unsigned long j = jiffies; +	long ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass); -	/* -	 * Check count of all no-CBs callbacks awaiting invocation. -	 * There needs to be a barrier before this function is called, -	 * but associated with a prior determination that no more -	 * callbacks would be posted.  In the worst case, the first -	 * barrier in rcu_barrier() suffices (but the caller cannot -	 * necessarily rely on this, not a substitute for the caller -	 * getting the concurrency design right!).  There must also be a -	 * barrier between the following load and posting of a callback -	 * (if a callback is in fact needed).  This is associated with an -	 * atomic_inc() in the caller. 
-	 */ -	ret = rcu_get_n_cbs_nocb_cpu(rdp); - -#ifdef CONFIG_PROVE_RCU -	rhp = READ_ONCE(rdp->nocb_head); -	if (!rhp) -		rhp = READ_ONCE(rdp->nocb_gp_head); -	if (!rhp) -		rhp = READ_ONCE(rdp->nocb_follower_head); - -	/* Having no rcuo kthread but CBs after scheduler starts is bad! */ -	if (!READ_ONCE(rdp->nocb_kthread) && rhp && -	    rcu_scheduler_fully_active) { -		/* RCU callback enqueued before CPU first came online??? */ -		pr_err("RCU: Never-onlined no-CBs CPU %d has CB %p\n", -		       cpu, rhp->func); -		WARN_ON_ONCE(1); +	if (!rcu_segcblist_is_offloaded(&rdp->cblist)) { +		*was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist); +		return false; /* Not offloaded, no bypassing. */ +	} +	lockdep_assert_irqs_disabled(); + +	// Don't use ->nocb_bypass during early boot. +	if (rcu_scheduler_active != RCU_SCHEDULER_RUNNING) { +		rcu_nocb_lock(rdp); +		WARN_ON_ONCE(rcu_cblist_n_cbs(&rdp->nocb_bypass)); +		*was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist); +		return false; +	} + +	// If we have advanced to a new jiffy, reset counts to allow +	// moving back from ->nocb_bypass to ->cblist. +	if (j == rdp->nocb_nobypass_last) { +		c = rdp->nocb_nobypass_count + 1; +	} else { +		WRITE_ONCE(rdp->nocb_nobypass_last, j); +		c = rdp->nocb_nobypass_count - nocb_nobypass_lim_per_jiffy; +		if (ULONG_CMP_LT(rdp->nocb_nobypass_count, +				 nocb_nobypass_lim_per_jiffy)) +			c = 0; +		else if (c > nocb_nobypass_lim_per_jiffy) +			c = nocb_nobypass_lim_per_jiffy; +	} +	WRITE_ONCE(rdp->nocb_nobypass_count, c); + +	// If there hasn't yet been all that many ->cblist enqueues +	// this jiffy, tell the caller to enqueue onto ->cblist.  But flush +	// ->nocb_bypass first. +	if (rdp->nocb_nobypass_count < nocb_nobypass_lim_per_jiffy) { +		rcu_nocb_lock(rdp); +		*was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist); +		if (*was_alldone) +			trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, +					    TPS("FirstQ")); +		WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, j)); +		WARN_ON_ONCE(rcu_cblist_n_cbs(&rdp->nocb_bypass)); +		return false; // Caller must enqueue the callback. +	} + +	// If ->nocb_bypass has been used too long or is too full, +	// flush ->nocb_bypass to ->cblist. +	if ((ncbs && j != READ_ONCE(rdp->nocb_bypass_first)) || +	    ncbs >= qhimark) { +		rcu_nocb_lock(rdp); +		if (!rcu_nocb_flush_bypass(rdp, rhp, j)) { +			*was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist); +			if (*was_alldone) +				trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, +						    TPS("FirstQ")); +			WARN_ON_ONCE(rcu_cblist_n_cbs(&rdp->nocb_bypass)); +			return false; // Caller must enqueue the callback. +		} +		if (j != rdp->nocb_gp_adv_time && +		    rcu_segcblist_nextgp(&rdp->cblist, &cur_gp_seq) && +		    rcu_seq_done(&rdp->mynode->gp_seq, cur_gp_seq)) { +			rcu_advance_cbs_nowake(rdp->mynode, rdp); +			rdp->nocb_gp_adv_time = j; +		} +		rcu_nocb_unlock_irqrestore(rdp, flags); +		return true; // Callback already enqueued.  	} -#endif /* #ifdef CONFIG_PROVE_RCU */ -	return !!ret; +	// We need to use the bypass. +	rcu_nocb_wait_contended(rdp); +	rcu_nocb_bypass_lock(rdp); +	ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass); +	rcu_segcblist_inc_len(&rdp->cblist); /* Must precede enqueue. */ +	rcu_cblist_enqueue(&rdp->nocb_bypass, rhp); +	if (!ncbs) { +		WRITE_ONCE(rdp->nocb_bypass_first, j); +		trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("FirstBQ")); +	} +	rcu_nocb_bypass_unlock(rdp); +	smp_mb(); /* Order enqueue before wake. 
*/ +	if (ncbs) { +		local_irq_restore(flags); +	} else { +		// No-CBs GP kthread might be indefinitely asleep, if so, wake. +		rcu_nocb_lock(rdp); // Rare during call_rcu() flood. +		if (!rcu_segcblist_pend_cbs(&rdp->cblist)) { +			trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, +					    TPS("FirstBQwake")); +			__call_rcu_nocb_wake(rdp, true, flags); +		} else { +			trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, +					    TPS("FirstBQnoWake")); +			rcu_nocb_unlock_irqrestore(rdp, flags); +		} +	} +	return true; // Callback already enqueued.  }  /* - * Enqueue the specified string of rcu_head structures onto the specified - * CPU's no-CBs lists.  The CPU is specified by rdp, the head of the - * string by rhp, and the tail of the string by rhtp.  The non-lazy/lazy - * counts are supplied by rhcount and rhcount_lazy. + * Awaken the no-CBs grace-period kthread if needed, either due to it + * legitimately being asleep or due to overload conditions.   *   * If warranted, also wake up the kthread servicing this CPU's queues.   */ -static void __call_rcu_nocb_enqueue(struct rcu_data *rdp, -				    struct rcu_head *rhp, -				    struct rcu_head **rhtp, -				    int rhcount, int rhcount_lazy, -				    unsigned long flags) +static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_alldone, +				 unsigned long flags) +				 __releases(rdp->nocb_lock)  { -	int len; -	struct rcu_head **old_rhpp; +	unsigned long cur_gp_seq; +	unsigned long j; +	long len;  	struct task_struct *t; -	/* Enqueue the callback on the nocb list and update counts. */ -	atomic_long_add(rhcount, &rdp->nocb_q_count); -	/* rcu_barrier() relies on ->nocb_q_count add before xchg. */ -	old_rhpp = xchg(&rdp->nocb_tail, rhtp); -	WRITE_ONCE(*old_rhpp, rhp); -	atomic_long_add(rhcount_lazy, &rdp->nocb_q_count_lazy); -	smp_mb__after_atomic(); /* Store *old_rhpp before _wake test. */ - -	/* If we are not being polled and there is a kthread, awaken it ... */ -	t = READ_ONCE(rdp->nocb_kthread); +	// If we are being polled or there is no kthread, just leave. +	t = READ_ONCE(rdp->nocb_gp_kthread);  	if (rcu_nocb_poll || !t) {  		trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,  				    TPS("WakeNotPoll")); +		rcu_nocb_unlock_irqrestore(rdp, flags);  		return;  	} -	len = rcu_get_n_cbs_nocb_cpu(rdp); -	if (old_rhpp == &rdp->nocb_head) { +	// Need to actually do a wakeup. +	len = rcu_segcblist_n_cbs(&rdp->cblist); +	if (was_alldone) { +		rdp->qlen_last_fqs_check = len;  		if (!irqs_disabled_flags(flags)) {  			/* ... if queue was empty ... */ -			wake_nocb_leader(rdp, false); +			wake_nocb_gp(rdp, false, flags);  			trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,  					    TPS("WakeEmpty"));  		} else { -			wake_nocb_leader_defer(rdp, RCU_NOCB_WAKE, -					       TPS("WakeEmptyIsDeferred")); +			wake_nocb_gp_defer(rdp, RCU_NOCB_WAKE, +					   TPS("WakeEmptyIsDeferred")); +			rcu_nocb_unlock_irqrestore(rdp, flags);  		} -		rdp->qlen_last_fqs_check = 0;  	} else if (len > rdp->qlen_last_fqs_check + qhimark) {  		/* ... or if many callbacks queued. 
*/ -		if (!irqs_disabled_flags(flags)) { -			wake_nocb_leader(rdp, true); -			trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, -					    TPS("WakeOvf")); -		} else { -			wake_nocb_leader_defer(rdp, RCU_NOCB_WAKE_FORCE, -					       TPS("WakeOvfIsDeferred")); +		rdp->qlen_last_fqs_check = len; +		j = jiffies; +		if (j != rdp->nocb_gp_adv_time && +		    rcu_segcblist_nextgp(&rdp->cblist, &cur_gp_seq) && +		    rcu_seq_done(&rdp->mynode->gp_seq, cur_gp_seq)) { +			rcu_advance_cbs_nowake(rdp->mynode, rdp); +			rdp->nocb_gp_adv_time = j;  		} -		rdp->qlen_last_fqs_check = LONG_MAX / 2; +		smp_mb(); /* Enqueue before timer_pending(). */ +		if ((rdp->nocb_cb_sleep || +		     !rcu_segcblist_ready_cbs(&rdp->cblist)) && +		    !timer_pending(&rdp->nocb_bypass_timer)) +			wake_nocb_gp_defer(rdp, RCU_NOCB_WAKE_FORCE, +					   TPS("WakeOvfIsDeferred")); +		rcu_nocb_unlock_irqrestore(rdp, flags);  	} else {  		trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WakeNot")); +		rcu_nocb_unlock_irqrestore(rdp, flags);  	}  	return;  } -/* - * This is a helper for __call_rcu(), which invokes this when the normal - * callback queue is inoperable.  If this is not a no-CBs CPU, this - * function returns failure back to __call_rcu(), which can complain - * appropriately. - * - * Otherwise, this function queues the callback where the corresponding - * "rcuo" kthread can find it. - */ -static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, -			    bool lazy, unsigned long flags) +/* Wake up the no-CBs GP kthread to flush ->nocb_bypass. */ +static void do_nocb_bypass_wakeup_timer(struct timer_list *t)  { +	unsigned long flags; +	struct rcu_data *rdp = from_timer(rdp, t, nocb_bypass_timer); -	if (!rcu_is_nocb_cpu(rdp->cpu)) -		return false; -	__call_rcu_nocb_enqueue(rdp, rhp, &rhp->next, 1, lazy, flags); -	if (__is_kfree_rcu_offset((unsigned long)rhp->func)) -		trace_rcu_kfree_callback(rcu_state.name, rhp, -					 (unsigned long)rhp->func, -					 -atomic_long_read(&rdp->nocb_q_count_lazy), -					 -rcu_get_n_cbs_nocb_cpu(rdp)); -	else -		trace_rcu_callback(rcu_state.name, rhp, -				   -atomic_long_read(&rdp->nocb_q_count_lazy), -				   -rcu_get_n_cbs_nocb_cpu(rdp)); - -	/* -	 * If called from an extended quiescent state with interrupts -	 * disabled, invoke the RCU core in order to allow the idle-entry -	 * deferred-wakeup check to function. -	 */ -	if (irqs_disabled_flags(flags) && -	    !rcu_is_watching() && -	    cpu_online(smp_processor_id())) -		invoke_rcu_core(); - -	return true; -} - -/* - * Adopt orphaned callbacks on a no-CBs CPU, or return 0 if this is - * not a no-CBs CPU. - */ -static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_data *my_rdp, -						     struct rcu_data *rdp, -						     unsigned long flags) -{ -	lockdep_assert_irqs_disabled(); -	if (!rcu_is_nocb_cpu(smp_processor_id())) -		return false; /* Not NOCBs CPU, caller must migrate CBs. */ -	__call_rcu_nocb_enqueue(my_rdp, rcu_segcblist_head(&rdp->cblist), -				rcu_segcblist_tail(&rdp->cblist), -				rcu_segcblist_n_cbs(&rdp->cblist), -				rcu_segcblist_n_lazy_cbs(&rdp->cblist), flags); -	rcu_segcblist_init(&rdp->cblist); -	rcu_segcblist_disable(&rdp->cblist); -	return true; +	trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("Timer")); +	rcu_nocb_lock_irqsave(rdp, flags); +	smp_mb__after_spinlock(); /* Timer expire before wakeup. */ +	__call_rcu_nocb_wake(rdp, true, flags);  }  /* - * If necessary, kick off a new grace period, and either way wait - * for a subsequent grace period to complete. 
+ * No-CBs GP kthreads come here to wait for additional callbacks to show up + * or for grace periods to end.   */ -static void rcu_nocb_wait_gp(struct rcu_data *rdp) +static void nocb_gp_wait(struct rcu_data *my_rdp)  { -	unsigned long c; -	bool d; +	bool bypass = false; +	long bypass_ncbs; +	int __maybe_unused cpu = my_rdp->cpu; +	unsigned long cur_gp_seq;  	unsigned long flags; +	bool gotcbs; +	unsigned long j = jiffies; +	bool needwait_gp = false; // This prevents actual uninitialized use.  	bool needwake; -	struct rcu_node *rnp = rdp->mynode; +	bool needwake_gp; +	struct rcu_data *rdp; +	struct rcu_node *rnp; +	unsigned long wait_gp_seq = 0; // Suppress "use uninitialized" warning. -	local_irq_save(flags); -	c = rcu_seq_snap(&rcu_state.gp_seq); -	if (!rdp->gpwrap && ULONG_CMP_GE(rdp->gp_seq_needed, c)) { -		local_irq_restore(flags); -	} else { -		raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */ -		needwake = rcu_start_this_gp(rnp, rdp, c); -		raw_spin_unlock_irqrestore_rcu_node(rnp, flags); -		if (needwake) +	/* +	 * Each pass through the following loop checks for CBs and for the +	 * nearest grace period (if any) to wait for next.  The CB kthreads +	 * and the global grace-period kthread are awakened if needed. +	 */ +	for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_cb_rdp) { +		trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("Check")); +		rcu_nocb_lock_irqsave(rdp, flags); +		bypass_ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass); +		if (bypass_ncbs && +		    (time_after(j, READ_ONCE(rdp->nocb_bypass_first) + 1) || +		     bypass_ncbs > 2 * qhimark)) { +			// Bypass full or old, so flush it. +			(void)rcu_nocb_try_flush_bypass(rdp, j); +			bypass_ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass); +		} else if (!bypass_ncbs && rcu_segcblist_empty(&rdp->cblist)) { +			rcu_nocb_unlock_irqrestore(rdp, flags); +			continue; /* No callbacks here, try next. */ +		} +		if (bypass_ncbs) { +			trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, +					    TPS("Bypass")); +			bypass = true; +		} +		rnp = rdp->mynode; +		if (bypass) {  // Avoid race with first bypass CB. +			WRITE_ONCE(my_rdp->nocb_defer_wakeup, +				   RCU_NOCB_WAKE_NOT); +			del_timer(&my_rdp->nocb_timer); +		} +		// Advance callbacks if helpful and low contention. +		needwake_gp = false; +		if (!rcu_segcblist_restempty(&rdp->cblist, +					     RCU_NEXT_READY_TAIL) || +		    (rcu_segcblist_nextgp(&rdp->cblist, &cur_gp_seq) && +		     rcu_seq_done(&rnp->gp_seq, cur_gp_seq))) { +			raw_spin_lock_rcu_node(rnp); /* irqs disabled. */ +			needwake_gp = rcu_advance_cbs(rnp, rdp); +			raw_spin_unlock_rcu_node(rnp); /* irqs disabled. */ +		} +		// Need to wait on some grace period? +		WARN_ON_ONCE(!rcu_segcblist_restempty(&rdp->cblist, +						      RCU_NEXT_READY_TAIL)); +		if (rcu_segcblist_nextgp(&rdp->cblist, &cur_gp_seq)) { +			if (!needwait_gp || +			    ULONG_CMP_LT(cur_gp_seq, wait_gp_seq)) +				wait_gp_seq = cur_gp_seq; +			needwait_gp = true; +			trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, +					    TPS("NeedWaitGP")); +		} +		if (rcu_segcblist_ready_cbs(&rdp->cblist)) { +			needwake = rdp->nocb_cb_sleep; +			WRITE_ONCE(rdp->nocb_cb_sleep, false); +			smp_mb(); /* CB invocation -after- GP end. */ +		} else { +			needwake = false; +		} +		rcu_nocb_unlock_irqrestore(rdp, flags); +		if (needwake) { +			swake_up_one(&rdp->nocb_cb_wq); +			gotcbs = true; +		} +		if (needwake_gp)  			rcu_gp_kthread_wake();  	} -	/* -	 * Wait for the grace period.  Do so interruptibly to avoid messing -	 * up the load average. 
-	 */ -	trace_rcu_this_gp(rnp, rdp, c, TPS("StartWait")); -	for (;;) { +	my_rdp->nocb_gp_bypass = bypass; +	my_rdp->nocb_gp_gp = needwait_gp; +	my_rdp->nocb_gp_seq = needwait_gp ? wait_gp_seq : 0; +	if (bypass && !rcu_nocb_poll) { +		// At least one child with non-empty ->nocb_bypass, so set +		// timer in order to avoid stranding its callbacks. +		raw_spin_lock_irqsave(&my_rdp->nocb_gp_lock, flags); +		mod_timer(&my_rdp->nocb_bypass_timer, j + 2); +		raw_spin_unlock_irqrestore(&my_rdp->nocb_gp_lock, flags); +	} +	if (rcu_nocb_poll) { +		/* Polling, so trace if first poll in the series. */ +		if (gotcbs) +			trace_rcu_nocb_wake(rcu_state.name, cpu, TPS("Poll")); +		schedule_timeout_interruptible(1); +	} else if (!needwait_gp) { +		/* Wait for callbacks to appear. */ +		trace_rcu_nocb_wake(rcu_state.name, cpu, TPS("Sleep")); +		swait_event_interruptible_exclusive(my_rdp->nocb_gp_wq, +				!READ_ONCE(my_rdp->nocb_gp_sleep)); +		trace_rcu_nocb_wake(rcu_state.name, cpu, TPS("EndSleep")); +	} else { +		rnp = my_rdp->mynode; +		trace_rcu_this_gp(rnp, my_rdp, wait_gp_seq, TPS("StartWait"));  		swait_event_interruptible_exclusive( -			rnp->nocb_gp_wq[rcu_seq_ctr(c) & 0x1], -			(d = rcu_seq_done(&rnp->gp_seq, c))); -		if (likely(d)) -			break; -		WARN_ON(signal_pending(current)); -		trace_rcu_this_gp(rnp, rdp, c, TPS("ResumeWait")); +			rnp->nocb_gp_wq[rcu_seq_ctr(wait_gp_seq) & 0x1], +			rcu_seq_done(&rnp->gp_seq, wait_gp_seq) || +			!READ_ONCE(my_rdp->nocb_gp_sleep)); +		trace_rcu_this_gp(rnp, my_rdp, wait_gp_seq, TPS("EndWait"));  	} -	trace_rcu_this_gp(rnp, rdp, c, TPS("EndWait")); -	smp_mb(); /* Ensure that CB invocation happens after GP end. */ +	if (!rcu_nocb_poll) { +		raw_spin_lock_irqsave(&my_rdp->nocb_gp_lock, flags); +		if (bypass) +			del_timer(&my_rdp->nocb_bypass_timer); +		WRITE_ONCE(my_rdp->nocb_gp_sleep, true); +		raw_spin_unlock_irqrestore(&my_rdp->nocb_gp_lock, flags); +	} +	my_rdp->nocb_gp_seq = -1; +	WARN_ON(signal_pending(current));  }  /* - * Leaders come here to wait for additional callbacks to show up. - * This function does not return until callbacks appear. + * No-CBs grace-period-wait kthread.  There is one of these per group + * of CPUs, but only once at least one CPU in that group has come online + * at least once since boot.  This kthread checks for newly posted + * callbacks from any of the CPUs it is responsible for, waits for a + * grace period, then awakens all of the rcu_nocb_cb_kthread() instances + * that then have callback-invocation work to do.   */ -static void nocb_leader_wait(struct rcu_data *my_rdp) +static int rcu_nocb_gp_kthread(void *arg)  { -	bool firsttime = true; -	unsigned long flags; -	bool gotcbs; -	struct rcu_data *rdp; -	struct rcu_head **tail; - -wait_again: - -	/* Wait for callbacks to appear. */ -	if (!rcu_nocb_poll) { -		trace_rcu_nocb_wake(rcu_state.name, my_rdp->cpu, TPS("Sleep")); -		swait_event_interruptible_exclusive(my_rdp->nocb_wq, -				!READ_ONCE(my_rdp->nocb_leader_sleep)); -		raw_spin_lock_irqsave(&my_rdp->nocb_lock, flags); -		my_rdp->nocb_leader_sleep = true; -		WRITE_ONCE(my_rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT); -		del_timer(&my_rdp->nocb_timer); -		raw_spin_unlock_irqrestore(&my_rdp->nocb_lock, flags); -	} else if (firsttime) { -		firsttime = false; /* Don't drown trace log with "Poll"! */ -		trace_rcu_nocb_wake(rcu_state.name, my_rdp->cpu, TPS("Poll")); -	} - -	/* -	 * Each pass through the following loop checks a follower for CBs. -	 * We are our own first follower.  
Any CBs found are moved to -	 * nocb_gp_head, where they await a grace period. -	 */ -	gotcbs = false; -	smp_mb(); /* wakeup and _sleep before ->nocb_head reads. */ -	for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower) { -		rdp->nocb_gp_head = READ_ONCE(rdp->nocb_head); -		if (!rdp->nocb_gp_head) -			continue;  /* No CBs here, try next follower. */ - -		/* Move callbacks to wait-for-GP list, which is empty. */ -		WRITE_ONCE(rdp->nocb_head, NULL); -		rdp->nocb_gp_tail = xchg(&rdp->nocb_tail, &rdp->nocb_head); -		gotcbs = true; -	} - -	/* No callbacks?  Sleep a bit if polling, and go retry.  */ -	if (unlikely(!gotcbs)) { -		WARN_ON(signal_pending(current)); -		if (rcu_nocb_poll) { -			schedule_timeout_interruptible(1); -		} else { -			trace_rcu_nocb_wake(rcu_state.name, my_rdp->cpu, -					    TPS("WokeEmpty")); -		} -		goto wait_again; -	} +	struct rcu_data *rdp = arg; -	/* Wait for one grace period. */ -	rcu_nocb_wait_gp(my_rdp); - -	/* Each pass through the following loop wakes a follower, if needed. */ -	for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower) { -		if (!rcu_nocb_poll && -		    READ_ONCE(rdp->nocb_head) && -		    READ_ONCE(my_rdp->nocb_leader_sleep)) { -			raw_spin_lock_irqsave(&my_rdp->nocb_lock, flags); -			my_rdp->nocb_leader_sleep = false;/* No need to sleep.*/ -			raw_spin_unlock_irqrestore(&my_rdp->nocb_lock, flags); -		} -		if (!rdp->nocb_gp_head) -			continue; /* No CBs, so no need to wake follower. */ - -		/* Append callbacks to follower's "done" list. */ -		raw_spin_lock_irqsave(&rdp->nocb_lock, flags); -		tail = rdp->nocb_follower_tail; -		rdp->nocb_follower_tail = rdp->nocb_gp_tail; -		*tail = rdp->nocb_gp_head; -		raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); -		if (rdp != my_rdp && tail == &rdp->nocb_follower_head) { -			/* List was empty, so wake up the follower.  */ -			swake_up_one(&rdp->nocb_wq); -		} +	for (;;) { +		WRITE_ONCE(rdp->nocb_gp_loops, rdp->nocb_gp_loops + 1); +		nocb_gp_wait(rdp); +		cond_resched_tasks_rcu_qs();  	} - -	/* If we (the leader) don't have CBs, go wait some more. */ -	if (!my_rdp->nocb_follower_head) -		goto wait_again; +	return 0;  }  /* - * Followers come here to wait for additional callbacks to show up. - * This function does not return until callbacks appear. + * Invoke any ready callbacks from the corresponding no-CBs CPU, + * then, if there are no more, wait for more to appear.   */ -static void nocb_follower_wait(struct rcu_data *rdp) +static void nocb_cb_wait(struct rcu_data *rdp)  { -	for (;;) { -		trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("FollowerSleep")); -		swait_event_interruptible_exclusive(rdp->nocb_wq, -					 READ_ONCE(rdp->nocb_follower_head)); -		if (smp_load_acquire(&rdp->nocb_follower_head)) { -			/* ^^^ Ensure CB invocation follows _head test. */ -			return; -		} -		WARN_ON(signal_pending(current)); -		trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WokeEmpty")); +	unsigned long cur_gp_seq; +	unsigned long flags; +	bool needwake_gp = false; +	struct rcu_node *rnp = rdp->mynode; + +	local_irq_save(flags); +	rcu_momentary_dyntick_idle(); +	local_irq_restore(flags); +	local_bh_disable(); +	rcu_do_batch(rdp); +	local_bh_enable(); +	lockdep_assert_irqs_enabled(); +	rcu_nocb_lock_irqsave(rdp, flags); +	if (rcu_segcblist_nextgp(&rdp->cblist, &cur_gp_seq) && +	    rcu_seq_done(&rnp->gp_seq, cur_gp_seq) && +	    raw_spin_trylock_rcu_node(rnp)) { /* irqs already disabled. */ +		needwake_gp = rcu_advance_cbs(rdp->mynode, rdp); +		raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. 
*/ +	} +	if (rcu_segcblist_ready_cbs(&rdp->cblist)) { +		rcu_nocb_unlock_irqrestore(rdp, flags); +		if (needwake_gp) +			rcu_gp_kthread_wake(); +		return; +	} + +	trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("CBSleep")); +	WRITE_ONCE(rdp->nocb_cb_sleep, true); +	rcu_nocb_unlock_irqrestore(rdp, flags); +	if (needwake_gp) +		rcu_gp_kthread_wake(); +	swait_event_interruptible_exclusive(rdp->nocb_cb_wq, +				 !READ_ONCE(rdp->nocb_cb_sleep)); +	if (!smp_load_acquire(&rdp->nocb_cb_sleep)) { /* VVV */ +		/* ^^^ Ensure CB invocation follows _sleep test. */ +		return;  	} +	WARN_ON(signal_pending(current)); +	trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WokeEmpty"));  }  /* - * Per-rcu_data kthread, but only for no-CBs CPUs.  Each kthread invokes - * callbacks queued by the corresponding no-CBs CPU, however, there is - * an optional leader-follower relationship so that the grace-period - * kthreads don't have to do quite so many wakeups. + * Per-rcu_data kthread, but only for no-CBs CPUs.  Repeatedly invoke + * nocb_cb_wait() to do the dirty work.   */ -static int rcu_nocb_kthread(void *arg) +static int rcu_nocb_cb_kthread(void *arg)  { -	int c, cl; -	unsigned long flags; -	struct rcu_head *list; -	struct rcu_head *next; -	struct rcu_head **tail;  	struct rcu_data *rdp = arg; -	/* Each pass through this loop invokes one batch of callbacks */ +	// Each pass through this loop does one callback batch, and, +	// if there are no more ready callbacks, waits for them.  	for (;;) { -		/* Wait for callbacks. */ -		if (rdp->nocb_leader == rdp) -			nocb_leader_wait(rdp); -		else -			nocb_follower_wait(rdp); - -		/* Pull the ready-to-invoke callbacks onto local list. */ -		raw_spin_lock_irqsave(&rdp->nocb_lock, flags); -		list = rdp->nocb_follower_head; -		rdp->nocb_follower_head = NULL; -		tail = rdp->nocb_follower_tail; -		rdp->nocb_follower_tail = &rdp->nocb_follower_head; -		raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); -		if (WARN_ON_ONCE(!list)) -			continue; -		trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WokeNonEmpty")); - -		/* Each pass through the following loop invokes a callback. */ -		trace_rcu_batch_start(rcu_state.name, -				      atomic_long_read(&rdp->nocb_q_count_lazy), -				      rcu_get_n_cbs_nocb_cpu(rdp), -1); -		c = cl = 0; -		while (list) { -			next = list->next; -			/* Wait for enqueuing to complete, if needed. */ -			while (next == NULL && &list->next != tail) { -				trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, -						    TPS("WaitQueue")); -				schedule_timeout_interruptible(1); -				trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, -						    TPS("WokeQueue")); -				next = list->next; -			} -			debug_rcu_head_unqueue(list); -			local_bh_disable(); -			if (__rcu_reclaim(rcu_state.name, list)) -				cl++; -			c++; -			local_bh_enable(); -			cond_resched_tasks_rcu_qs(); -			list = next; -		} -		trace_rcu_batch_end(rcu_state.name, c, !!list, 0, 0, 1); -		smp_mb__before_atomic();  /* _add after CB invocation. 
*/ -		atomic_long_add(-c, &rdp->nocb_q_count); -		atomic_long_add(-cl, &rdp->nocb_q_count_lazy); +		nocb_cb_wait(rdp); +		cond_resched_tasks_rcu_qs();  	}  	return 0;  } @@ -1993,14 +2159,14 @@ static void do_nocb_deferred_wakeup_common(struct rcu_data *rdp)  	unsigned long flags;  	int ndw; -	raw_spin_lock_irqsave(&rdp->nocb_lock, flags); +	rcu_nocb_lock_irqsave(rdp, flags);  	if (!rcu_nocb_need_deferred_wakeup(rdp)) { -		raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); +		rcu_nocb_unlock_irqrestore(rdp, flags);  		return;  	}  	ndw = READ_ONCE(rdp->nocb_defer_wakeup);  	WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT); -	__wake_nocb_leader(rdp, ndw == RCU_NOCB_WAKE_FORCE, flags); +	wake_nocb_gp(rdp, ndw == RCU_NOCB_WAKE_FORCE, flags);  	trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("DeferredWake"));  } @@ -2027,6 +2193,7 @@ void __init rcu_init_nohz(void)  {  	int cpu;  	bool need_rcu_nocb_mask = false; +	struct rcu_data *rdp;  #if defined(CONFIG_NO_HZ_FULL)  	if (tick_nohz_full_running && cpumask_weight(tick_nohz_full_mask)) @@ -2060,67 +2227,63 @@ void __init rcu_init_nohz(void)  	if (rcu_nocb_poll)  		pr_info("\tPoll for callbacks from no-CBs CPUs.\n"); -	for_each_cpu(cpu, rcu_nocb_mask) -		init_nocb_callback_list(per_cpu_ptr(&rcu_data, cpu)); +	for_each_cpu(cpu, rcu_nocb_mask) { +		rdp = per_cpu_ptr(&rcu_data, cpu); +		if (rcu_segcblist_empty(&rdp->cblist)) +			rcu_segcblist_init(&rdp->cblist); +		rcu_segcblist_offload(&rdp->cblist); +	}  	rcu_organize_nocb_kthreads();  }  /* Initialize per-rcu_data variables for no-CBs CPUs. */  static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp)  { -	rdp->nocb_tail = &rdp->nocb_head; -	init_swait_queue_head(&rdp->nocb_wq); -	rdp->nocb_follower_tail = &rdp->nocb_follower_head; +	init_swait_queue_head(&rdp->nocb_cb_wq); +	init_swait_queue_head(&rdp->nocb_gp_wq);  	raw_spin_lock_init(&rdp->nocb_lock); +	raw_spin_lock_init(&rdp->nocb_bypass_lock); +	raw_spin_lock_init(&rdp->nocb_gp_lock);  	timer_setup(&rdp->nocb_timer, do_nocb_deferred_wakeup_timer, 0); +	timer_setup(&rdp->nocb_bypass_timer, do_nocb_bypass_wakeup_timer, 0); +	rcu_cblist_init(&rdp->nocb_bypass);  }  /*   * If the specified CPU is a no-CBs CPU that does not already have its - * rcuo kthread, spawn it.  If the CPUs are brought online out of order, - * this can require re-organizing the leader-follower relationships. + * rcuo CB kthread, spawn it.  Additionally, if the rcuo GP kthread + * for this CPU's group has not yet been created, spawn it as well.   */  static void rcu_spawn_one_nocb_kthread(int cpu)  { -	struct rcu_data *rdp; -	struct rcu_data *rdp_last; -	struct rcu_data *rdp_old_leader; -	struct rcu_data *rdp_spawn = per_cpu_ptr(&rcu_data, cpu); +	struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); +	struct rcu_data *rdp_gp;  	struct task_struct *t;  	/*  	 * If this isn't a no-CBs CPU or if it already has an rcuo kthread,  	 * then nothing to do.  	 */ -	if (!rcu_is_nocb_cpu(cpu) || rdp_spawn->nocb_kthread) +	if (!rcu_is_nocb_cpu(cpu) || rdp->nocb_cb_kthread)  		return; -	/* If we didn't spawn the leader first, reorganize! 
*/ -	rdp_old_leader = rdp_spawn->nocb_leader; -	if (rdp_old_leader != rdp_spawn && !rdp_old_leader->nocb_kthread) { -		rdp_last = NULL; -		rdp = rdp_old_leader; -		do { -			rdp->nocb_leader = rdp_spawn; -			if (rdp_last && rdp != rdp_spawn) -				rdp_last->nocb_next_follower = rdp; -			if (rdp == rdp_spawn) { -				rdp = rdp->nocb_next_follower; -			} else { -				rdp_last = rdp; -				rdp = rdp->nocb_next_follower; -				rdp_last->nocb_next_follower = NULL; -			} -		} while (rdp); -		rdp_spawn->nocb_next_follower = rdp_old_leader; +	/* If we didn't spawn the GP kthread first, reorganize! */ +	rdp_gp = rdp->nocb_gp_rdp; +	if (!rdp_gp->nocb_gp_kthread) { +		t = kthread_run(rcu_nocb_gp_kthread, rdp_gp, +				"rcuog/%d", rdp_gp->cpu); +		if (WARN_ONCE(IS_ERR(t), "%s: Could not start rcuo GP kthread, OOM is now expected behavior\n", __func__)) +			return; +		WRITE_ONCE(rdp_gp->nocb_gp_kthread, t);  	}  	/* Spawn the kthread for this CPU. */ -	t = kthread_run(rcu_nocb_kthread, rdp_spawn, +	t = kthread_run(rcu_nocb_cb_kthread, rdp,  			"rcuo%c/%d", rcu_state.abbr, cpu); -	if (WARN_ONCE(IS_ERR(t), "%s: Could not start rcuo kthread, OOM is now expected behavior\n", __func__)) +	if (WARN_ONCE(IS_ERR(t), "%s: Could not start rcuo CB kthread, OOM is now expected behavior\n", __func__))  		return; -	WRITE_ONCE(rdp_spawn->nocb_kthread, t); +	WRITE_ONCE(rdp->nocb_cb_kthread, t); +	WRITE_ONCE(rdp->nocb_gp_kthread, rdp_gp->nocb_gp_kthread);  }  /* @@ -2147,27 +2310,28 @@ static void __init rcu_spawn_nocb_kthreads(void)  		rcu_spawn_cpu_nocb_kthread(cpu);  } -/* How many follower CPU IDs per leader?  Default of -1 for sqrt(nr_cpu_ids). */ -static int rcu_nocb_leader_stride = -1; -module_param(rcu_nocb_leader_stride, int, 0444); +/* How many CB CPU IDs per GP kthread?  Default of -1 for sqrt(nr_cpu_ids). */ +static int rcu_nocb_gp_stride = -1; +module_param(rcu_nocb_gp_stride, int, 0444);  /* - * Initialize leader-follower relationships for all no-CBs CPU. + * Initialize GP-CB relationships for all no-CBs CPU.   */  static void __init rcu_organize_nocb_kthreads(void)  {  	int cpu; -	int ls = rcu_nocb_leader_stride; -	int nl = 0;  /* Next leader. */ +	bool firsttime = true; +	int ls = rcu_nocb_gp_stride; +	int nl = 0;  /* Next GP kthread. */  	struct rcu_data *rdp; -	struct rcu_data *rdp_leader = NULL;  /* Suppress misguided gcc warn. */ +	struct rcu_data *rdp_gp = NULL;  /* Suppress misguided gcc warn. */  	struct rcu_data *rdp_prev = NULL;  	if (!cpumask_available(rcu_nocb_mask))  		return;  	if (ls == -1) { -		ls = int_sqrt(nr_cpu_ids); -		rcu_nocb_leader_stride = ls; +		ls = nr_cpu_ids / int_sqrt(nr_cpu_ids); +		rcu_nocb_gp_stride = ls;  	}  	/* @@ -2178,39 +2342,24 @@ static void __init rcu_organize_nocb_kthreads(void)  	for_each_cpu(cpu, rcu_nocb_mask) {  		rdp = per_cpu_ptr(&rcu_data, cpu);  		if (rdp->cpu >= nl) { -			/* New leader, set up for followers & next leader. */ +			/* New GP kthread, set up for CBs & next GP. */  			nl = DIV_ROUND_UP(rdp->cpu + 1, ls) * ls; -			rdp->nocb_leader = rdp; -			rdp_leader = rdp; +			rdp->nocb_gp_rdp = rdp; +			rdp_gp = rdp; +			if (!firsttime && dump_tree) +				pr_cont("\n"); +			firsttime = false; +			pr_alert("%s: No-CB GP kthread CPU %d:", __func__, cpu);  		} else { -			/* Another follower, link to previous leader. */ -			rdp->nocb_leader = rdp_leader; -			rdp_prev->nocb_next_follower = rdp; +			/* Another CB kthread, link to previous GP kthread. 
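+			 * E.g., with nr_cpu_ids == 16 the default stride is
+			 * 16 / int_sqrt(16) = 4, so offloaded CPUs 0-3 share
+			 * one rcuog GP kthread, CPUs 4-7 the next, and so on.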
*/ +			rdp->nocb_gp_rdp = rdp_gp; +			rdp_prev->nocb_next_cb_rdp = rdp; +			pr_alert(" %d", cpu);  		}  		rdp_prev = rdp;  	}  } -/* Prevent __call_rcu() from enqueuing callbacks on no-CBs CPUs */ -static bool init_nocb_callback_list(struct rcu_data *rdp) -{ -	if (!rcu_is_nocb_cpu(rdp->cpu)) -		return false; - -	/* If there are early-boot callbacks, move them to nocb lists. */ -	if (!rcu_segcblist_empty(&rdp->cblist)) { -		rdp->nocb_head = rcu_segcblist_head(&rdp->cblist); -		rdp->nocb_tail = rcu_segcblist_tail(&rdp->cblist); -		atomic_long_set(&rdp->nocb_q_count, -				rcu_segcblist_n_cbs(&rdp->cblist)); -		atomic_long_set(&rdp->nocb_q_count_lazy, -				rcu_segcblist_n_lazy_cbs(&rdp->cblist)); -		rcu_segcblist_init(&rdp->cblist); -	} -	rcu_segcblist_disable(&rdp->cblist); -	return true; -} -  /*   * Bind the current task to the offloaded CPUs.  If there are no offloaded   * CPUs, leave the task unbound.  Splat if the bind attempt fails. @@ -2223,20 +2372,101 @@ void rcu_bind_current_to_nocb(void)  EXPORT_SYMBOL_GPL(rcu_bind_current_to_nocb);  /* - * Return the number of RCU callbacks still queued from the specified - * CPU, which must be a nocbs CPU. + * Dump out nocb grace-period kthread state for the specified rcu_data + * structure.   */ -static unsigned long rcu_get_n_cbs_nocb_cpu(struct rcu_data *rdp) +static void show_rcu_nocb_gp_state(struct rcu_data *rdp)  { -	return atomic_long_read(&rdp->nocb_q_count); +	struct rcu_node *rnp = rdp->mynode; + +	pr_info("nocb GP %d %c%c%c%c%c%c %c[%c%c] %c%c:%ld rnp %d:%d %lu\n", +		rdp->cpu, +		"kK"[!!rdp->nocb_gp_kthread], +		"lL"[raw_spin_is_locked(&rdp->nocb_gp_lock)], +		"dD"[!!rdp->nocb_defer_wakeup], +		"tT"[timer_pending(&rdp->nocb_timer)], +		"bB"[timer_pending(&rdp->nocb_bypass_timer)], +		"sS"[!!rdp->nocb_gp_sleep], +		".W"[swait_active(&rdp->nocb_gp_wq)], +		".W"[swait_active(&rnp->nocb_gp_wq[0])], +		".W"[swait_active(&rnp->nocb_gp_wq[1])], +		".B"[!!rdp->nocb_gp_bypass], +		".G"[!!rdp->nocb_gp_gp], +		(long)rdp->nocb_gp_seq, +		rnp->grplo, rnp->grphi, READ_ONCE(rdp->nocb_gp_loops)); +} + +/* Dump out nocb kthread state for the specified rcu_data structure. */ +static void show_rcu_nocb_state(struct rcu_data *rdp) +{ +	struct rcu_segcblist *rsclp = &rdp->cblist; +	bool waslocked; +	bool wastimer; +	bool wassleep; + +	if (rdp->nocb_gp_rdp == rdp) +		show_rcu_nocb_gp_state(rdp); + +	pr_info("   CB %d->%d %c%c%c%c%c%c F%ld L%ld C%d %c%c%c%c%c q%ld\n", +		rdp->cpu, rdp->nocb_gp_rdp->cpu, +		"kK"[!!rdp->nocb_cb_kthread], +		"bB"[raw_spin_is_locked(&rdp->nocb_bypass_lock)], +		"cC"[!!atomic_read(&rdp->nocb_lock_contended)], +		"lL"[raw_spin_is_locked(&rdp->nocb_lock)], +		"sS"[!!rdp->nocb_cb_sleep], +		".W"[swait_active(&rdp->nocb_cb_wq)], +		jiffies - rdp->nocb_bypass_first, +		jiffies - rdp->nocb_nobypass_last, +		rdp->nocb_nobypass_count, +		".D"[rcu_segcblist_ready_cbs(rsclp)], +		".W"[!rcu_segcblist_restempty(rsclp, RCU_DONE_TAIL)], +		".R"[!rcu_segcblist_restempty(rsclp, RCU_WAIT_TAIL)], +		".N"[!rcu_segcblist_restempty(rsclp, RCU_NEXT_READY_TAIL)], +		".B"[!!rcu_cblist_n_cbs(&rdp->nocb_bypass)], +		rcu_segcblist_n_cbs(&rdp->cblist)); + +	/* It is OK for GP kthreads to have GP state. */ +	if (rdp->nocb_gp_rdp == rdp) +		return; + +	waslocked = raw_spin_is_locked(&rdp->nocb_gp_lock); +	wastimer = timer_pending(&rdp->nocb_timer); +	wassleep = swait_active(&rdp->nocb_gp_wq); +	if (!rdp->nocb_defer_wakeup && !rdp->nocb_gp_sleep && +	    !waslocked && !wastimer && !wassleep) +		return;  /* Nothing untowards. */ + +	pr_info("   !!! 
%c%c%c%c %c\n", +		"lL"[waslocked], +		"dD"[!!rdp->nocb_defer_wakeup], +		"tT"[wastimer], +		"sS"[!!rdp->nocb_gp_sleep], +		".W"[wassleep]);  }  #else /* #ifdef CONFIG_RCU_NOCB_CPU */ -static bool rcu_nocb_cpu_needs_barrier(int cpu) +/* No ->nocb_lock to acquire.  */ +static void rcu_nocb_lock(struct rcu_data *rdp) +{ +} + +/* No ->nocb_lock to release.  */ +static void rcu_nocb_unlock(struct rcu_data *rdp)  { -	WARN_ON_ONCE(1); /* Should be dead code. */ -	return false; +} + +/* No ->nocb_lock to release.  */ +static void rcu_nocb_unlock_irqrestore(struct rcu_data *rdp, +				       unsigned long flags) +{ +	local_irq_restore(flags); +} + +/* Lockdep check that ->cblist may be safely accessed. */ +static void rcu_lockdep_assert_cblist_protected(struct rcu_data *rdp) +{ +	lockdep_assert_irqs_disabled();  }  static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq) @@ -2252,19 +2482,24 @@ static void rcu_init_one_nocb(struct rcu_node *rnp)  {  } -static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, -			    bool lazy, unsigned long flags) +static bool rcu_nocb_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp, +				  unsigned long j)  { -	return false; +	return true;  } -static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_data *my_rdp, -						     struct rcu_data *rdp, -						     unsigned long flags) +static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp, +				bool *was_alldone, unsigned long flags)  {  	return false;  } +static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_empty, +				 unsigned long flags) +{ +	WARN_ON_ONCE(1);  /* Should be dead code! */ +} +  static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp)  {  } @@ -2286,14 +2521,8 @@ static void __init rcu_spawn_nocb_kthreads(void)  {  } -static bool init_nocb_callback_list(struct rcu_data *rdp) -{ -	return false; -} - -static unsigned long rcu_get_n_cbs_nocb_cpu(struct rcu_data *rdp) +static void show_rcu_nocb_state(struct rcu_data *rdp)  { -	return 0;  }  #endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */ diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index 065183391f75..c0b8c458d8a6 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -163,7 +163,7 @@ static void rcu_iw_handler(struct irq_work *iwp)  //  // Printing RCU CPU stall warnings -#ifdef CONFIG_PREEMPT +#ifdef CONFIG_PREEMPTION  /*   * Dump detailed information for all tasks blocking the current RCU @@ -215,7 +215,7 @@ static int rcu_print_task_stall(struct rcu_node *rnp)  	return ndetected;  } -#else /* #ifdef CONFIG_PREEMPT */ +#else /* #ifdef CONFIG_PREEMPTION */  /*   * Because preemptible RCU does not exist, we never have to check for @@ -233,7 +233,7 @@ static int rcu_print_task_stall(struct rcu_node *rnp)  {  	return 0;  } -#endif /* #else #ifdef CONFIG_PREEMPT */ +#endif /* #else #ifdef CONFIG_PREEMPTION */  /*   * Dump stacks of all tasks running on stalled CPUs.  First try using @@ -527,6 +527,8 @@ static void check_cpu_stall(struct rcu_data *rdp)  		/* We haven't checked in, so go dump stack. */  		print_cpu_stall(); +		if (rcu_cpu_stall_ftrace_dump) +			rcu_ftrace_dump(DUMP_ALL);  	} else if (rcu_gp_in_progress() &&  		   ULONG_CMP_GE(j, js + RCU_STALL_RAT_DELAY) && @@ -534,6 +536,8 @@ static void check_cpu_stall(struct rcu_data *rdp)  		/* They had a few time units to dump stack, so complain. 
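 		 * If the rcu_cpu_stall_ftrace_dump module parameter is set,
 		 * the ftrace buffer is dumped as well to help debugging.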
*/  		print_other_cpu_stall(gs2); +		if (rcu_cpu_stall_ftrace_dump) +			rcu_ftrace_dump(DUMP_ALL);  	}  } @@ -585,6 +589,11 @@ void show_rcu_gp_kthreads(void)  				cpu, (long)rdp->gp_seq_needed);  		}  	} +	for_each_possible_cpu(cpu) { +		rdp = per_cpu_ptr(&rcu_data, cpu); +		if (rcu_segcblist_is_offloaded(&rdp->cblist)) +			show_rcu_nocb_state(rdp); +	}  	/* sched_show_task(rcu_state.gp_kthread); */  }  EXPORT_SYMBOL_GPL(show_rcu_gp_kthreads); diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 61df2bf08563..1861103662db 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -61,9 +61,15 @@ module_param(rcu_normal_after_boot, int, 0);  #ifdef CONFIG_DEBUG_LOCK_ALLOC  /** - * rcu_read_lock_sched_held() - might we be in RCU-sched read-side critical section? + * rcu_read_lock_held_common() - might we be in RCU-sched read-side critical section? + * @ret:	Best guess answer if lockdep cannot be relied on   * - * If CONFIG_DEBUG_LOCK_ALLOC is selected, returns nonzero iff in an + * Returns true if lockdep must be ignored, in which case *ret contains + * the best guess described below.  Otherwise returns false, in which + * case *ret tells the caller nothing and the caller should instead + * consult lockdep. + * + * If CONFIG_DEBUG_LOCK_ALLOC is selected, set *ret to nonzero iff in an   * RCU-sched read-side critical section.  In absence of   * CONFIG_DEBUG_LOCK_ALLOC, this assumes we are in an RCU-sched read-side   * critical section unless it can prove otherwise.  Note that disabling @@ -75,35 +81,45 @@ module_param(rcu_normal_after_boot, int, 0);   * Check debug_lockdep_rcu_enabled() to prevent false positives during boot   * and while lockdep is disabled.   * - * Note that if the CPU is in the idle loop from an RCU point of - * view (ie: that we are in the section between rcu_idle_enter() and - * rcu_idle_exit()) then rcu_read_lock_held() returns false even if the CPU - * did an rcu_read_lock().  The reason for this is that RCU ignores CPUs - * that are in such a section, considering these as in extended quiescent - * state, so such a CPU is effectively never in an RCU read-side critical - * section regardless of what RCU primitives it invokes.  This state of - * affairs is required --- we need to keep an RCU-free window in idle - * where the CPU may possibly enter into low power mode. This way we can - * notice an extended quiescent state to other CPUs that started a grace - * period. Otherwise we would delay any grace period as long as we run in - * the idle task. + * Note that if the CPU is in the idle loop from an RCU point of view (ie: + * that we are in the section between rcu_idle_enter() and rcu_idle_exit()) + * then rcu_read_lock_held() sets *ret to false even if the CPU did an + * rcu_read_lock().  The reason for this is that RCU ignores CPUs that are + * in such a section, considering these as in extended quiescent state, + * so such a CPU is effectively never in an RCU read-side critical section + * regardless of what RCU primitives it invokes.  This state of affairs is + * required --- we need to keep an RCU-free window in idle where the CPU may + * possibly enter into low power mode. This way we can notice an extended + * quiescent state to other CPUs that started a grace period. Otherwise + * we would delay any grace period as long as we run in the idle task.   * - * Similarly, we avoid claiming an SRCU read lock held if the current + * Similarly, we avoid claiming an RCU read lock held if the current   * CPU is offline.   
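 *
 * A typical caller checks this helper first and falls back to lockdep
 * only when it returns false, roughly:
 *
 *	bool ret;
 *
 *	if (rcu_read_lock_held_common(&ret))
 *		return ret;
 *	return lock_is_held(&rcu_lock_map);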
*/ +static bool rcu_read_lock_held_common(bool *ret) +{ +	if (!debug_lockdep_rcu_enabled()) { +		*ret = 1; +		return true; +	} +	if (!rcu_is_watching()) { +		*ret = 0; +		return true; +	} +	if (!rcu_lockdep_current_cpu_online()) { +		*ret = 0; +		return true; +	} +	return false; +} +  int rcu_read_lock_sched_held(void)  { -	int lockdep_opinion = 0; +	bool ret; -	if (!debug_lockdep_rcu_enabled()) -		return 1; -	if (!rcu_is_watching()) -		return 0; -	if (!rcu_lockdep_current_cpu_online()) -		return 0; -	if (debug_locks) -		lockdep_opinion = lock_is_held(&rcu_sched_lock_map); -	return lockdep_opinion || !preemptible(); +	if (rcu_read_lock_held_common(&ret)) +		return ret; +	return lock_is_held(&rcu_sched_lock_map) || !preemptible();  }  EXPORT_SYMBOL(rcu_read_lock_sched_held);  #endif @@ -136,8 +152,7 @@ static atomic_t rcu_expedited_nesting = ATOMIC_INIT(1);   */  bool rcu_gp_is_expedited(void)  { -	return rcu_expedited || atomic_read(&rcu_expedited_nesting) || -	       rcu_scheduler_active == RCU_SCHEDULER_INIT; +	return rcu_expedited || atomic_read(&rcu_expedited_nesting);  }  EXPORT_SYMBOL_GPL(rcu_gp_is_expedited); @@ -261,12 +276,10 @@ NOKPROBE_SYMBOL(debug_lockdep_rcu_enabled);   */  int rcu_read_lock_held(void)  { -	if (!debug_lockdep_rcu_enabled()) -		return 1; -	if (!rcu_is_watching()) -		return 0; -	if (!rcu_lockdep_current_cpu_online()) -		return 0; +	bool ret; + +	if (rcu_read_lock_held_common(&ret)) +		return ret;  	return lock_is_held(&rcu_lock_map);  }  EXPORT_SYMBOL_GPL(rcu_read_lock_held); @@ -288,16 +301,28 @@ EXPORT_SYMBOL_GPL(rcu_read_lock_held);   */  int rcu_read_lock_bh_held(void)  { -	if (!debug_lockdep_rcu_enabled()) -		return 1; -	if (!rcu_is_watching()) -		return 0; -	if (!rcu_lockdep_current_cpu_online()) -		return 0; +	bool ret; + +	if (rcu_read_lock_held_common(&ret)) +		return ret;  	return in_softirq() || irqs_disabled();  }  EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held); +int rcu_read_lock_any_held(void) +{ +	bool ret; + +	if (rcu_read_lock_held_common(&ret)) +		return ret; +	if (lock_is_held(&rcu_lock_map) || +	    lock_is_held(&rcu_bh_lock_map) || +	    lock_is_held(&rcu_sched_lock_map)) +		return 1; +	return !preemptible(); +} +EXPORT_SYMBOL_GPL(rcu_read_lock_any_held); +  #endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */  /** @@ -437,6 +462,8 @@ EXPORT_SYMBOL_GPL(rcutorture_sched_setaffinity);  #endif  #ifdef CONFIG_RCU_STALL_COMMON +int rcu_cpu_stall_ftrace_dump __read_mostly; +module_param(rcu_cpu_stall_ftrace_dump, int, 0644);  int rcu_cpu_stall_suppress __read_mostly; /* 1 = suppress stall warnings. 
*/  EXPORT_SYMBOL_GPL(rcu_cpu_stall_suppress);  module_param(rcu_cpu_stall_suppress, int, 0644); diff --git a/kernel/resource.c b/kernel/resource.c index 7ea4306503c5..76036a41143b 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -487,8 +487,8 @@ int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages,  	while (start < end &&  	       !find_next_iomem_res(start, end, flags, IORES_DESC_NONE,  				    false, &res)) { -		pfn = (res.start + PAGE_SIZE - 1) >> PAGE_SHIFT; -		end_pfn = (res.end + 1) >> PAGE_SHIFT; +		pfn = PFN_UP(res.start); +		end_pfn = PFN_DOWN(res.end + 1);  		if (end_pfn > pfn)  			ret = (*func)(pfn, end_pfn - pfn, arg);  		if (ret) @@ -1644,19 +1644,8 @@ void resource_list_free(struct list_head *head)  EXPORT_SYMBOL(resource_list_free);  #ifdef CONFIG_DEVICE_PRIVATE -/** - * devm_request_free_mem_region - find free region for device private memory - * - * @dev: device struct to bind the resource to - * @size: size in bytes of the device memory to add - * @base: resource tree to look in - * - * This function tries to find an empty range of physical address big enough to - * contain the new resource, so that it can later be hotplugged as ZONE_DEVICE - * memory, which in turn allocates struct pages. - */ -struct resource *devm_request_free_mem_region(struct device *dev, -		struct resource *base, unsigned long size) +static struct resource *__request_free_mem_region(struct device *dev, +		struct resource *base, unsigned long size, const char *name)  {  	resource_size_t end, addr;  	struct resource *res; @@ -1670,7 +1659,10 @@ struct resource *devm_request_free_mem_region(struct device *dev,  				REGION_DISJOINT)  			continue; -		res = devm_request_mem_region(dev, addr, size, dev_name(dev)); +		if (dev) +			res = devm_request_mem_region(dev, addr, size, name); +		else +			res = request_mem_region(addr, size, name);  		if (!res)  			return ERR_PTR(-ENOMEM);  		res->desc = IORES_DESC_DEVICE_PRIVATE_MEMORY; @@ -1679,7 +1671,32 @@ struct resource *devm_request_free_mem_region(struct device *dev,  	return ERR_PTR(-ERANGE);  } + +/** + * devm_request_free_mem_region - find free region for device private memory + * + * @dev: device struct to bind the resource to + * @size: size in bytes of the device memory to add + * @base: resource tree to look in + * + * This function tries to find an empty range of physical address big enough to + * contain the new resource, so that it can later be hotplugged as ZONE_DEVICE + * memory, which in turn allocates struct pages. 
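+ *
+ * Return: the newly requested resource on success, or an ERR_PTR() on
+ * failure (-ENOMEM if the region cannot be reserved, -ERANGE if no
+ * suitable gap is found).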
+ */ +struct resource *devm_request_free_mem_region(struct device *dev, +		struct resource *base, unsigned long size) +{ +	return __request_free_mem_region(dev, base, size, dev_name(dev)); +}  EXPORT_SYMBOL_GPL(devm_request_free_mem_region); + +struct resource *request_free_mem_region(struct resource *base, +		unsigned long size, const char *name) +{ +	return __request_free_mem_region(NULL, base, size, name); +} +EXPORT_SYMBOL_GPL(request_free_mem_region); +  #endif /* CONFIG_DEVICE_PRIVATE */  static int __init strict_iomem(char *str) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index df9f1fe5689b..7880f4f64d0e 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -255,7 +255,7 @@ static void __hrtick_restart(struct rq *rq)  {  	struct hrtimer *timer = &rq->hrtick_timer; -	hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED); +	hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED_HARD);  }  /* @@ -314,7 +314,7 @@ void hrtick_start(struct rq *rq, u64 delay)  	 */  	delay = max_t(u64, delay, 10000LL);  	hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay), -		      HRTIMER_MODE_REL_PINNED); +		      HRTIMER_MODE_REL_PINNED_HARD);  }  #endif /* CONFIG_SMP */ @@ -328,7 +328,7 @@ static void hrtick_rq_init(struct rq *rq)  	rq->hrtick_csd.info = rq;  #endif -	hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); +	hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);  	rq->hrtick_timer.function = hrtick;  }  #else	/* CONFIG_SCHED_HRTICK */ @@ -773,6 +773,18 @@ static void set_load_weight(struct task_struct *p, bool update_load)  }  #ifdef CONFIG_UCLAMP_TASK +/* + * Serializes updates of utilization clamp values + * + * The (slow-path) user-space triggers utilization clamp value updates which + * can require updates on (fast-path) scheduler's data structures used to + * support enqueue/dequeue operations. + * While the per-CPU rq lock protects fast-path update operations, user-space + * requests are serialized using a mutex to reduce the risk of conflicting + * updates or API abuses. 
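+ *
+ * It is taken, e.g., by sysctl_sched_uclamp_handler() and cpu_uclamp_write()
+ * before any clamp value is updated.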
+ */ +static DEFINE_MUTEX(uclamp_mutex); +  /* Max allowed minimum utilization */  unsigned int sysctl_sched_uclamp_util_min = SCHED_CAPACITY_SCALE; @@ -798,7 +810,7 @@ static inline unsigned int uclamp_bucket_base_value(unsigned int clamp_value)  	return UCLAMP_BUCKET_DELTA * uclamp_bucket_id(clamp_value);  } -static inline unsigned int uclamp_none(int clamp_id) +static inline enum uclamp_id uclamp_none(enum uclamp_id clamp_id)  {  	if (clamp_id == UCLAMP_MIN)  		return 0; @@ -814,7 +826,7 @@ static inline void uclamp_se_set(struct uclamp_se *uc_se,  }  static inline unsigned int -uclamp_idle_value(struct rq *rq, unsigned int clamp_id, +uclamp_idle_value(struct rq *rq, enum uclamp_id clamp_id,  		  unsigned int clamp_value)  {  	/* @@ -830,7 +842,7 @@ uclamp_idle_value(struct rq *rq, unsigned int clamp_id,  	return uclamp_none(UCLAMP_MIN);  } -static inline void uclamp_idle_reset(struct rq *rq, unsigned int clamp_id, +static inline void uclamp_idle_reset(struct rq *rq, enum uclamp_id clamp_id,  				     unsigned int clamp_value)  {  	/* Reset max-clamp retention only on idle exit */ @@ -841,8 +853,8 @@ static inline void uclamp_idle_reset(struct rq *rq, unsigned int clamp_id,  }  static inline -unsigned int uclamp_rq_max_value(struct rq *rq, unsigned int clamp_id, -				 unsigned int clamp_value) +enum uclamp_id uclamp_rq_max_value(struct rq *rq, enum uclamp_id clamp_id, +				   unsigned int clamp_value)  {  	struct uclamp_bucket *bucket = rq->uclamp[clamp_id].bucket;  	int bucket_id = UCLAMP_BUCKETS - 1; @@ -861,16 +873,42 @@ unsigned int uclamp_rq_max_value(struct rq *rq, unsigned int clamp_id,  	return uclamp_idle_value(rq, clamp_id, clamp_value);  } +static inline struct uclamp_se +uclamp_tg_restrict(struct task_struct *p, enum uclamp_id clamp_id) +{ +	struct uclamp_se uc_req = p->uclamp_req[clamp_id]; +#ifdef CONFIG_UCLAMP_TASK_GROUP +	struct uclamp_se uc_max; + +	/* +	 * Tasks in autogroups or root task group will be +	 * restricted by system defaults. +	 */ +	if (task_group_is_autogroup(task_group(p))) +		return uc_req; +	if (task_group(p) == &root_task_group) +		return uc_req; + +	uc_max = task_group(p)->uclamp[clamp_id]; +	if (uc_req.value > uc_max.value || !uc_req.user_defined) +		return uc_max; +#endif + +	return uc_req; +} +  /*   * The effective clamp bucket index of a task depends on, by increasing   * priority:   * - the task specific clamp value, when explicitly requested from userspace + * - the task group effective clamp value, for tasks not either in the root + *   group or in an autogroup   * - the system default clamp value, defined by the sysadmin   */  static inline struct uclamp_se -uclamp_eff_get(struct task_struct *p, unsigned int clamp_id) +uclamp_eff_get(struct task_struct *p, enum uclamp_id clamp_id)  { -	struct uclamp_se uc_req = p->uclamp_req[clamp_id]; +	struct uclamp_se uc_req = uclamp_tg_restrict(p, clamp_id);  	struct uclamp_se uc_max = uclamp_default[clamp_id];  	/* System default restrictions always apply */ @@ -880,7 +918,7 @@ uclamp_eff_get(struct task_struct *p, unsigned int clamp_id)  	return uc_req;  } -unsigned int uclamp_eff_value(struct task_struct *p, unsigned int clamp_id) +enum uclamp_id uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id)  {  	struct uclamp_se uc_eff; @@ -904,7 +942,7 @@ unsigned int uclamp_eff_value(struct task_struct *p, unsigned int clamp_id)   * for each bucket when all its RUNNABLE tasks require the same clamp.   
*/  static inline void uclamp_rq_inc_id(struct rq *rq, struct task_struct *p, -				    unsigned int clamp_id) +				    enum uclamp_id clamp_id)  {  	struct uclamp_rq *uc_rq = &rq->uclamp[clamp_id];  	struct uclamp_se *uc_se = &p->uclamp[clamp_id]; @@ -942,7 +980,7 @@ static inline void uclamp_rq_inc_id(struct rq *rq, struct task_struct *p,   * enforce the expected state and warn.   */  static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p, -				    unsigned int clamp_id) +				    enum uclamp_id clamp_id)  {  	struct uclamp_rq *uc_rq = &rq->uclamp[clamp_id];  	struct uclamp_se *uc_se = &p->uclamp[clamp_id]; @@ -981,7 +1019,7 @@ static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p,  static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p)  { -	unsigned int clamp_id; +	enum uclamp_id clamp_id;  	if (unlikely(!p->sched_class->uclamp_enabled))  		return; @@ -996,7 +1034,7 @@ static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p)  static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p)  { -	unsigned int clamp_id; +	enum uclamp_id clamp_id;  	if (unlikely(!p->sched_class->uclamp_enabled))  		return; @@ -1005,15 +1043,82 @@ static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p)  		uclamp_rq_dec_id(rq, p, clamp_id);  } +static inline void +uclamp_update_active(struct task_struct *p, enum uclamp_id clamp_id) +{ +	struct rq_flags rf; +	struct rq *rq; + +	/* +	 * Lock the task and the rq where the task is (or was) queued. +	 * +	 * We might lock the (previous) rq of a !RUNNABLE task, but that's the +	 * price to pay to safely serialize util_{min,max} updates with +	 * enqueues, dequeues and migration operations. +	 * This is the same locking schema used by __set_cpus_allowed_ptr(). +	 */ +	rq = task_rq_lock(p, &rf); + +	/* +	 * Setting the clamp bucket is serialized by task_rq_lock(). +	 * If the task is not yet RUNNABLE and its task_struct is not +	 * affecting a valid clamp bucket, the next time it's enqueued, +	 * it will already see the updated clamp bucket value. 
+	 */ +	if (!p->uclamp[clamp_id].active) { +		uclamp_rq_dec_id(rq, p, clamp_id); +		uclamp_rq_inc_id(rq, p, clamp_id); +	} + +	task_rq_unlock(rq, p, &rf); +} + +static inline void +uclamp_update_active_tasks(struct cgroup_subsys_state *css, +			   unsigned int clamps) +{ +	enum uclamp_id clamp_id; +	struct css_task_iter it; +	struct task_struct *p; + +	css_task_iter_start(css, 0, &it); +	while ((p = css_task_iter_next(&it))) { +		for_each_clamp_id(clamp_id) { +			if ((0x1 << clamp_id) & clamps) +				uclamp_update_active(p, clamp_id); +		} +	} +	css_task_iter_end(&it); +} + +#ifdef CONFIG_UCLAMP_TASK_GROUP +static void cpu_util_update_eff(struct cgroup_subsys_state *css); +static void uclamp_update_root_tg(void) +{ +	struct task_group *tg = &root_task_group; + +	uclamp_se_set(&tg->uclamp_req[UCLAMP_MIN], +		      sysctl_sched_uclamp_util_min, false); +	uclamp_se_set(&tg->uclamp_req[UCLAMP_MAX], +		      sysctl_sched_uclamp_util_max, false); + +	rcu_read_lock(); +	cpu_util_update_eff(&root_task_group.css); +	rcu_read_unlock(); +} +#else +static void uclamp_update_root_tg(void) { } +#endif +  int sysctl_sched_uclamp_handler(struct ctl_table *table, int write,  				void __user *buffer, size_t *lenp,  				loff_t *ppos)  { +	bool update_root_tg = false;  	int old_min, old_max; -	static DEFINE_MUTEX(mutex);  	int result; -	mutex_lock(&mutex); +	mutex_lock(&uclamp_mutex);  	old_min = sysctl_sched_uclamp_util_min;  	old_max = sysctl_sched_uclamp_util_max; @@ -1032,23 +1137,30 @@ int sysctl_sched_uclamp_handler(struct ctl_table *table, int write,  	if (old_min != sysctl_sched_uclamp_util_min) {  		uclamp_se_set(&uclamp_default[UCLAMP_MIN],  			      sysctl_sched_uclamp_util_min, false); +		update_root_tg = true;  	}  	if (old_max != sysctl_sched_uclamp_util_max) {  		uclamp_se_set(&uclamp_default[UCLAMP_MAX],  			      sysctl_sched_uclamp_util_max, false); +		update_root_tg = true;  	} +	if (update_root_tg) +		uclamp_update_root_tg(); +  	/* -	 * Updating all the RUNNABLE task is expensive, keep it simple and do -	 * just a lazy update at each next enqueue time. +	 * We update all RUNNABLE tasks only when task groups are in use. +	 * Otherwise, keep it simple and do just a lazy update at each next +	 * task enqueue time.  	 
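+	 * (When they are, uclamp_update_root_tg() above propagates the new
+	 * system defaults down the task-group hierarchy and refreshes the
+	 * affected RUNNABLE tasks.)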
*/ +  	goto done;  undo:  	sysctl_sched_uclamp_util_min = old_min;  	sysctl_sched_uclamp_util_max = old_max;  done: -	mutex_unlock(&mutex); +	mutex_unlock(&uclamp_mutex);  	return result;  } @@ -1075,7 +1187,7 @@ static int uclamp_validate(struct task_struct *p,  static void __setscheduler_uclamp(struct task_struct *p,  				  const struct sched_attr *attr)  { -	unsigned int clamp_id; +	enum uclamp_id clamp_id;  	/*  	 * On scheduling class change, reset to default clamps for tasks @@ -1112,7 +1224,7 @@ static void __setscheduler_uclamp(struct task_struct *p,  static void uclamp_fork(struct task_struct *p)  { -	unsigned int clamp_id; +	enum uclamp_id clamp_id;  	for_each_clamp_id(clamp_id)  		p->uclamp[clamp_id].active = false; @@ -1134,9 +1246,11 @@ static void uclamp_fork(struct task_struct *p)  static void __init init_uclamp(void)  {  	struct uclamp_se uc_max = {}; -	unsigned int clamp_id; +	enum uclamp_id clamp_id;  	int cpu; +	mutex_init(&uclamp_mutex); +  	for_each_possible_cpu(cpu) {  		memset(&cpu_rq(cpu)->uclamp, 0, sizeof(struct uclamp_rq));  		cpu_rq(cpu)->uclamp_flags = 0; @@ -1149,8 +1263,13 @@ static void __init init_uclamp(void)  	/* System defaults allow max clamp values for both indexes */  	uclamp_se_set(&uc_max, uclamp_none(UCLAMP_MAX), false); -	for_each_clamp_id(clamp_id) +	for_each_clamp_id(clamp_id) {  		uclamp_default[clamp_id] = uc_max; +#ifdef CONFIG_UCLAMP_TASK_GROUP +		root_task_group.uclamp_req[clamp_id] = uc_max; +		root_task_group.uclamp[clamp_id] = uc_max; +#endif +	}  }  #else /* CONFIG_UCLAMP_TASK */ @@ -1494,7 +1613,7 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)  	if (queued)  		enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);  	if (running) -		set_curr_task(rq, p); +		set_next_task(rq, p);  }  /* @@ -1537,7 +1656,8 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,  	if (cpumask_equal(p->cpus_ptr, new_mask))  		goto out; -	if (!cpumask_intersects(new_mask, cpu_valid_mask)) { +	dest_cpu = cpumask_any_and(cpu_valid_mask, new_mask); +	if (dest_cpu >= nr_cpu_ids) {  		ret = -EINVAL;  		goto out;  	} @@ -1558,7 +1678,6 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,  	if (cpumask_test_cpu(task_cpu(p), new_mask))  		goto out; -	dest_cpu = cpumask_any_and(cpu_valid_mask, new_mask);  	if (task_running(rq, p) || p->state == TASK_WAKING) {  		struct migration_arg arg = { p, dest_cpu };  		/* Need help from migration thread: drop lock and wait. */ @@ -3135,7 +3254,7 @@ static struct rq *finish_task_switch(struct task_struct *prev)  		/* Task is done with its stack. */  		put_task_stack(prev); -		put_task_struct(prev); +		put_task_struct_rcu_user(prev);  	}  	tick_nohz_task_switch(); @@ -3214,12 +3333,8 @@ static __always_inline struct rq *  context_switch(struct rq *rq, struct task_struct *prev,  	       struct task_struct *next, struct rq_flags *rf)  { -	struct mm_struct *mm, *oldmm; -  	prepare_task_switch(rq, prev, next); -	mm = next->mm; -	oldmm = prev->active_mm;  	/*  	 * For paravirt, this is coupled with an exit in switch_to to  	 * combine the page table reload and the switch backend into @@ -3228,22 +3343,37 @@ context_switch(struct rq *rq, struct task_struct *prev,  	arch_start_context_switch(prev);  	/* -	 * If mm is non-NULL, we pass through switch_mm(). If mm is -	 * NULL, we will pass through mmdrop() in finish_task_switch(). -	 * Both of these contain the full memory barrier required by -	 * membarrier after storing to rq->curr, before returning to -	 * user-space. 
+	 * kernel -> kernel   lazy + transfer active +	 *   user -> kernel   lazy + mmgrab() active +	 * +	 * kernel ->   user   switch + mmdrop() active +	 *   user ->   user   switch  	 */ -	if (!mm) { -		next->active_mm = oldmm; -		mmgrab(oldmm); -		enter_lazy_tlb(oldmm, next); -	} else -		switch_mm_irqs_off(oldmm, mm, next); +	if (!next->mm) {                                // to kernel +		enter_lazy_tlb(prev->active_mm, next); -	if (!prev->mm) { -		prev->active_mm = NULL; -		rq->prev_mm = oldmm; +		next->active_mm = prev->active_mm; +		if (prev->mm)                           // from user +			mmgrab(prev->active_mm); +		else +			prev->active_mm = NULL; +	} else {                                        // to user +		membarrier_switch_mm(rq, prev->active_mm, next->mm); +		/* +		 * sys_membarrier() requires an smp_mb() between setting +		 * rq->curr / membarrier_switch_mm() and returning to userspace. +		 * +		 * The below provides this either through switch_mm(), or in +		 * case 'prev->active_mm == next->mm' through +		 * finish_task_switch()'s mmdrop(). +		 */ +		switch_mm_irqs_off(prev->active_mm, next->mm, next); + +		if (!prev->mm) {                        // from kernel +			/* will mmdrop() in finish_task_switch(). */ +			rq->prev_mm = prev->active_mm; +			prev->active_mm = NULL; +		}  	}  	rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP); @@ -3486,8 +3616,36 @@ void scheduler_tick(void)  struct tick_work {  	int			cpu; +	atomic_t		state;  	struct delayed_work	work;  }; +/* Values for ->state, see diagram below. */ +#define TICK_SCHED_REMOTE_OFFLINE	0 +#define TICK_SCHED_REMOTE_OFFLINING	1 +#define TICK_SCHED_REMOTE_RUNNING	2 + +/* + * State diagram for ->state: + * + * + *          TICK_SCHED_REMOTE_OFFLINE + *                    |   ^ + *                    |   | + *                    |   | sched_tick_remote() + *                    |   | + *                    |   | + *                    +--TICK_SCHED_REMOTE_OFFLINING + *                    |   ^ + *                    |   | + * sched_tick_start() |   | sched_tick_stop() + *                    |   | + *                    V   | + *          TICK_SCHED_REMOTE_RUNNING + * + * + * Other transitions get WARN_ON_ONCE(), except that sched_tick_remote() + * and sched_tick_start() are happy to leave the state in RUNNING. + */  static struct tick_work __percpu *tick_work_cpu; @@ -3500,6 +3658,7 @@ static void sched_tick_remote(struct work_struct *work)  	struct task_struct *curr;  	struct rq_flags rf;  	u64 delta; +	int os;  	/*  	 * Handle the tick only if it appears the remote CPU is running in full @@ -3513,7 +3672,7 @@ static void sched_tick_remote(struct work_struct *work)  	rq_lock_irq(rq, &rf);  	curr = rq->curr; -	if (is_idle_task(curr)) +	if (is_idle_task(curr) || cpu_is_offline(cpu))  		goto out_unlock;  	update_rq_clock(rq); @@ -3533,13 +3692,18 @@ out_requeue:  	/*  	 * Run the remote tick once per second (1Hz). This arbitrary  	 * frequency is large enough to avoid overload but short enough -	 * to keep scheduler internal stats reasonably up to date. +	 * to keep scheduler internal stats reasonably up to date.  But +	 * first update state to reflect hotplug activity if required.  	 
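+	 * (atomic_fetch_add_unless() below leaves RUNNING untouched and steps
+	 * OFFLINING down to OFFLINE, so the work is only rearmed while the
+	 * CPU is still online.)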
*/ -	queue_delayed_work(system_unbound_wq, dwork, HZ); +	os = atomic_fetch_add_unless(&twork->state, -1, TICK_SCHED_REMOTE_RUNNING); +	WARN_ON_ONCE(os == TICK_SCHED_REMOTE_OFFLINE); +	if (os == TICK_SCHED_REMOTE_RUNNING) +		queue_delayed_work(system_unbound_wq, dwork, HZ);  }  static void sched_tick_start(int cpu)  { +	int os;  	struct tick_work *twork;  	if (housekeeping_cpu(cpu, HK_FLAG_TICK)) @@ -3548,15 +3712,20 @@ static void sched_tick_start(int cpu)  	WARN_ON_ONCE(!tick_work_cpu);  	twork = per_cpu_ptr(tick_work_cpu, cpu); -	twork->cpu = cpu; -	INIT_DELAYED_WORK(&twork->work, sched_tick_remote); -	queue_delayed_work(system_unbound_wq, &twork->work, HZ); +	os = atomic_xchg(&twork->state, TICK_SCHED_REMOTE_RUNNING); +	WARN_ON_ONCE(os == TICK_SCHED_REMOTE_RUNNING); +	if (os == TICK_SCHED_REMOTE_OFFLINE) { +		twork->cpu = cpu; +		INIT_DELAYED_WORK(&twork->work, sched_tick_remote); +		queue_delayed_work(system_unbound_wq, &twork->work, HZ); +	}  }  #ifdef CONFIG_HOTPLUG_CPU  static void sched_tick_stop(int cpu)  {  	struct tick_work *twork; +	int os;  	if (housekeeping_cpu(cpu, HK_FLAG_TICK))  		return; @@ -3564,7 +3733,10 @@ static void sched_tick_stop(int cpu)  	WARN_ON_ONCE(!tick_work_cpu);  	twork = per_cpu_ptr(tick_work_cpu, cpu); -	cancel_delayed_work_sync(&twork->work); +	/* There cannot be competing actions, but don't rely on stop-machine. */ +	os = atomic_xchg(&twork->state, TICK_SCHED_REMOTE_OFFLINING); +	WARN_ON_ONCE(os != TICK_SCHED_REMOTE_RUNNING); +	/* Don't cancel, as this would mess up the state machine. */  }  #endif /* CONFIG_HOTPLUG_CPU */ @@ -3572,7 +3744,6 @@ int __init sched_tick_offload_init(void)  {  	tick_work_cpu = alloc_percpu(struct tick_work);  	BUG_ON(!tick_work_cpu); -  	return 0;  } @@ -3581,7 +3752,7 @@ static inline void sched_tick_start(int cpu) { }  static inline void sched_tick_stop(int cpu) { }  #endif -#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ +#if defined(CONFIG_PREEMPTION) && (defined(CONFIG_DEBUG_PREEMPT) || \  				defined(CONFIG_TRACE_PREEMPT_TOGGLE))  /*   * If the value passed in is equal to the current preempt count @@ -3700,13 +3871,22 @@ static noinline void __schedule_bug(struct task_struct *prev)  /*   * Various schedule()-time debugging checks and statistics:   */ -static inline void schedule_debug(struct task_struct *prev) +static inline void schedule_debug(struct task_struct *prev, bool preempt)  {  #ifdef CONFIG_SCHED_STACK_END_CHECK  	if (task_stack_end_corrupted(prev))  		panic("corrupted stack end detected inside scheduler\n");  #endif +#ifdef CONFIG_DEBUG_ATOMIC_SLEEP +	if (!preempt && prev->state && prev->non_block_count) { +		printk(KERN_ERR "BUG: scheduling in a non-blocking section: %s/%d/%i\n", +			prev->comm, prev->pid, prev->non_block_count); +		dump_stack(); +		add_taint(TAINT_WARN, LOCKDEP_STILL_OK); +	} +#endif +  	if (unlikely(in_atomic_preempt_off())) {  		__schedule_bug(prev);  		preempt_count_set(PREEMPT_DISABLED); @@ -3739,7 +3919,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)  		p = fair_sched_class.pick_next_task(rq, prev, rf);  		if (unlikely(p == RETRY_TASK)) -			goto again; +			goto restart;  		/* Assumes fair_sched_class->next == idle_sched_class */  		if (unlikely(!p)) @@ -3748,14 +3928,19 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)  		return p;  	} -again: +restart: +	/* +	 * Ensure that we put DL/RT tasks before the pick loop, such that they +	 * can PULL higher prio tasks when we lower the RQ 'priority'. 
+	 */ +	prev->sched_class->put_prev_task(rq, prev, rf); +	if (!rq->nr_running) +		newidle_balance(rq, rf); +  	for_each_class(class) { -		p = class->pick_next_task(rq, prev, rf); -		if (p) { -			if (unlikely(p == RETRY_TASK)) -				goto again; +		p = class->pick_next_task(rq, NULL, NULL); +		if (p)  			return p; -		}  	}  	/* The idle class should always have a runnable task: */ @@ -3782,7 +3967,7 @@ again:   *      task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets   *      called on the nearest possible occasion:   * - *       - If the kernel is preemptible (CONFIG_PREEMPT=y): + *       - If the kernel is preemptible (CONFIG_PREEMPTION=y):   *   *         - in syscall or exception context, at the next outmost   *           preempt_enable(). (this might be as soon as the wake_up()'s @@ -3791,7 +3976,7 @@ again:   *         - in IRQ context, return from interrupt-handler to   *           preemptible context   * - *       - If the kernel is not preemptible (CONFIG_PREEMPT is not set) + *       - If the kernel is not preemptible (CONFIG_PREEMPTION is not set)   *         then at the next:   *   *          - cond_resched() call @@ -3813,7 +3998,7 @@ static void __sched notrace __schedule(bool preempt)  	rq = cpu_rq(cpu);  	prev = rq->curr; -	schedule_debug(prev); +	schedule_debug(prev, preempt);  	if (sched_feat(HRTICK))  		hrtick_clear(rq); @@ -3857,7 +4042,11 @@ static void __sched notrace __schedule(bool preempt)  	if (likely(prev != next)) {  		rq->nr_switches++; -		rq->curr = next; +		/* +		 * RCU users of rcu_dereference(rq->curr) may not see +		 * changes to task_struct made by pick_next_task(). +		 */ +		RCU_INIT_POINTER(rq->curr, next);  		/*  		 * The membarrier system call requires each architecture  		 * to have a full memory barrier after updating @@ -4036,11 +4225,10 @@ static void __sched notrace preempt_schedule_common(void)  	} while (need_resched());  } -#ifdef CONFIG_PREEMPT +#ifdef CONFIG_PREEMPTION  /* - * this is the entry point to schedule() from in-kernel preemption - * off of preempt_enable. Kernel preemptions off return from interrupt - * occur there and call schedule directly. + * This is the entry point to schedule() from in-kernel preemption + * off of preempt_enable.   */  asmlinkage __visible void __sched notrace preempt_schedule(void)  { @@ -4108,10 +4296,10 @@ asmlinkage __visible void __sched notrace preempt_schedule_notrace(void)  }  EXPORT_SYMBOL_GPL(preempt_schedule_notrace); -#endif /* CONFIG_PREEMPT */ +#endif /* CONFIG_PREEMPTION */  /* - * this is the entry point to schedule() from kernel preemption + * This is the entry point to schedule() from kernel preemption   * off of irq context.   * Note, that this is called and return with irqs disabled. This will   * protect us against recursive calling from irq. 
@@ -4276,7 +4464,7 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)  	if (queued)  		enqueue_task(rq, p, queue_flag);  	if (running) -		set_curr_task(rq, p); +		set_next_task(rq, p);  	check_class_changed(rq, p, prev_class, oldprio);  out_unlock: @@ -4343,7 +4531,7 @@ void set_user_nice(struct task_struct *p, long nice)  			resched_curr(rq);  	}  	if (running) -		set_curr_task(rq, p); +		set_next_task(rq, p);  out_unlock:  	task_rq_unlock(rq, p, &rf);  } @@ -4660,6 +4848,9 @@ recheck:  			return retval;  	} +	if (pi) +		cpuset_read_lock(); +  	/*  	 * Make sure no PI-waiters arrive (or leave) while we are  	 * changing the priority of the task: @@ -4674,8 +4865,8 @@ recheck:  	 * Changing the policy of the stop threads its a very bad idea:  	 */  	if (p == rq->stop) { -		task_rq_unlock(rq, p, &rf); -		return -EINVAL; +		retval = -EINVAL; +		goto unlock;  	}  	/* @@ -4693,8 +4884,8 @@ recheck:  			goto change;  		p->sched_reset_on_fork = reset_on_fork; -		task_rq_unlock(rq, p, &rf); -		return 0; +		retval = 0; +		goto unlock;  	}  change: @@ -4707,8 +4898,8 @@ change:  		if (rt_bandwidth_enabled() && rt_policy(policy) &&  				task_group(p)->rt_bandwidth.rt_runtime == 0 &&  				!task_group_is_autogroup(task_group(p))) { -			task_rq_unlock(rq, p, &rf); -			return -EPERM; +			retval = -EPERM; +			goto unlock;  		}  #endif  #ifdef CONFIG_SMP @@ -4723,8 +4914,8 @@ change:  			 */  			if (!cpumask_subset(span, p->cpus_ptr) ||  			    rq->rd->dl_bw.bw == 0) { -				task_rq_unlock(rq, p, &rf); -				return -EPERM; +				retval = -EPERM; +				goto unlock;  			}  		}  #endif @@ -4734,6 +4925,8 @@ change:  	if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {  		policy = oldpolicy = -1;  		task_rq_unlock(rq, p, &rf); +		if (pi) +			cpuset_read_unlock();  		goto recheck;  	} @@ -4743,8 +4936,8 @@ change:  	 * is available.  	 */  	if ((dl_policy(policy) || dl_task(p)) && sched_dl_overflow(p, policy, attr)) { -		task_rq_unlock(rq, p, &rf); -		return -EBUSY; +		retval = -EBUSY; +		goto unlock;  	}  	p->sched_reset_on_fork = reset_on_fork; @@ -4786,7 +4979,7 @@ change:  		enqueue_task(rq, p, queue_flags);  	}  	if (running) -		set_curr_task(rq, p); +		set_next_task(rq, p);  	check_class_changed(rq, p, prev_class, oldprio); @@ -4794,14 +4987,22 @@ change:  	preempt_disable();  	task_rq_unlock(rq, p, &rf); -	if (pi) +	if (pi) { +		cpuset_read_unlock();  		rt_mutex_adjust_pi(p); +	}  	/* Run balance callbacks after we've adjusted the PI chain: */  	balance_callback(rq);  	preempt_enable();  	return 0; + +unlock: +	task_rq_unlock(rq, p, &rf); +	if (pi) +		cpuset_read_unlock(); +	return retval;  }  static int _sched_setscheduler(struct task_struct *p, int policy, @@ -4885,10 +5086,15 @@ do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)  	rcu_read_lock();  	retval = -ESRCH;  	p = find_process_by_pid(pid); -	if (p != NULL) -		retval = sched_setscheduler(p, policy, &lparam); +	if (likely(p)) +		get_task_struct(p);  	rcu_read_unlock(); +	if (likely(p)) { +		retval = sched_setscheduler(p, policy, &lparam); +		put_task_struct(p); +	} +  	return retval;  } @@ -5419,7 +5625,7 @@ SYSCALL_DEFINE0(sched_yield)  	return 0;  } -#ifndef CONFIG_PREEMPT +#ifndef CONFIG_PREEMPTION  int __sched _cond_resched(void)  {  	if (should_resched(0)) { @@ -5436,7 +5642,7 @@ EXPORT_SYMBOL(_cond_resched);   * __cond_resched_lock() - if a reschedule is pending, drop the given lock,   * call schedule, and on return reacquire the lock.   
* - * This works OK both with and without CONFIG_PREEMPT. We do strange low-level + * This works OK both with and without CONFIG_PREEMPTION. We do strange low-level   * operations here to prevent schedule() from being called twice (once via   * spin_unlock(), once by hand).   */ @@ -5866,7 +6072,8 @@ void init_idle(struct task_struct *idle, int cpu)  	__set_task_cpu(idle, cpu);  	rcu_read_unlock(); -	rq->curr = rq->idle = idle; +	rq->idle = idle; +	rcu_assign_pointer(rq->curr, idle);  	idle->on_rq = TASK_ON_RQ_QUEUED;  #ifdef CONFIG_SMP  	idle->on_cpu = 1; @@ -5975,7 +6182,7 @@ void sched_setnuma(struct task_struct *p, int nid)  	if (queued)  		enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);  	if (running) -		set_curr_task(rq, p); +		set_next_task(rq, p);  	task_rq_unlock(rq, p, &rf);  }  #endif /* CONFIG_NUMA_BALANCING */ @@ -6015,21 +6222,22 @@ static void calc_load_migrate(struct rq *rq)  		atomic_long_add(delta, &calc_load_tasks);  } -static void put_prev_task_fake(struct rq *rq, struct task_struct *prev) +static struct task_struct *__pick_migrate_task(struct rq *rq)  { -} +	const struct sched_class *class; +	struct task_struct *next; -static const struct sched_class fake_sched_class = { -	.put_prev_task = put_prev_task_fake, -}; +	for_each_class(class) { +		next = class->pick_next_task(rq, NULL, NULL); +		if (next) { +			next->sched_class->put_prev_task(rq, next, NULL); +			return next; +		} +	} -static struct task_struct fake_task = { -	/* -	 * Avoid pull_{rt,dl}_task() -	 */ -	.prio = MAX_PRIO + 1, -	.sched_class = &fake_sched_class, -}; +	/* The idle class should always have a runnable task */ +	BUG(); +}  /*   * Migrate all tasks from the rq, sleeping tasks will be migrated by @@ -6072,12 +6280,7 @@ static void migrate_tasks(struct rq *dead_rq, struct rq_flags *rf)  		if (rq->nr_running == 1)  			break; -		/* -		 * pick_next_task() assumes pinned rq->lock: -		 */ -		next = pick_next_task(rq, &fake_task, rf); -		BUG_ON(!next); -		put_prev_task(rq, next); +		next = __pick_migrate_task(rq);  		/*  		 * Rules for changing task_struct::cpus_mask are holding @@ -6231,8 +6434,6 @@ int sched_cpu_activate(unsigned int cpu)  	}  	rq_unlock_irqrestore(rq, &rf); -	update_max_interval(); -  	return 0;  } @@ -6374,19 +6575,19 @@ DECLARE_PER_CPU(cpumask_var_t, select_idle_mask);  void __init sched_init(void)  { -	unsigned long alloc_size = 0, ptr; +	unsigned long ptr = 0;  	int i;  	wait_bit_init();  #ifdef CONFIG_FAIR_GROUP_SCHED -	alloc_size += 2 * nr_cpu_ids * sizeof(void **); +	ptr += 2 * nr_cpu_ids * sizeof(void **);  #endif  #ifdef CONFIG_RT_GROUP_SCHED -	alloc_size += 2 * nr_cpu_ids * sizeof(void **); +	ptr += 2 * nr_cpu_ids * sizeof(void **);  #endif -	if (alloc_size) { -		ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT); +	if (ptr) { +		ptr = (unsigned long)kzalloc(ptr, GFP_NOWAIT);  #ifdef CONFIG_FAIR_GROUP_SCHED  		root_task_group.se = (struct sched_entity **)ptr; @@ -6573,7 +6774,7 @@ void ___might_sleep(const char *file, int line, int preempt_offset)  	rcu_sleep_check();  	if ((preempt_count_equals(preempt_offset) && !irqs_disabled() && -	     !is_idle_task(current)) || +	     !is_idle_task(current) && !current->non_block_count) ||  	    system_state == SYSTEM_BOOTING || system_state > SYSTEM_RUNNING ||  	    oops_in_progress)  		return; @@ -6589,8 +6790,8 @@ void ___might_sleep(const char *file, int line, int preempt_offset)  		"BUG: sleeping function called from invalid context at %s:%d\n",  			file, line);  	printk(KERN_ERR -		"in_atomic(): %d, irqs_disabled(): %d, 
pid: %d, name: %s\n", -			in_atomic(), irqs_disabled(), +		"in_atomic(): %d, irqs_disabled(): %d, non_block: %d, pid: %d, name: %s\n", +			in_atomic(), irqs_disabled(), current->non_block_count,  			current->pid, current->comm);  	if (task_stack_end_corrupted(current)) @@ -6705,7 +6906,7 @@ struct task_struct *curr_task(int cpu)  #ifdef CONFIG_IA64  /** - * set_curr_task - set the current task for a given CPU. + * ia64_set_curr_task - set the current task for a given CPU.   * @cpu: the processor in question.   * @p: the task pointer to set.   * @@ -6730,6 +6931,20 @@ void ia64_set_curr_task(int cpu, struct task_struct *p)  /* task_group_lock serializes the addition/removal of task groups */  static DEFINE_SPINLOCK(task_group_lock); +static inline void alloc_uclamp_sched_group(struct task_group *tg, +					    struct task_group *parent) +{ +#ifdef CONFIG_UCLAMP_TASK_GROUP +	enum uclamp_id clamp_id; + +	for_each_clamp_id(clamp_id) { +		uclamp_se_set(&tg->uclamp_req[clamp_id], +			      uclamp_none(clamp_id), false); +		tg->uclamp[clamp_id] = parent->uclamp[clamp_id]; +	} +#endif +} +  static void sched_free_group(struct task_group *tg)  {  	free_fair_sched_group(tg); @@ -6753,6 +6968,8 @@ struct task_group *sched_create_group(struct task_group *parent)  	if (!alloc_rt_sched_group(tg, parent))  		goto err; +	alloc_uclamp_sched_group(tg, parent); +  	return tg;  err: @@ -6856,7 +7073,7 @@ void sched_move_task(struct task_struct *tsk)  	if (queued)  		enqueue_task(rq, tsk, queue_flags);  	if (running) -		set_curr_task(rq, tsk); +		set_next_task(rq, tsk);  	task_rq_unlock(rq, tsk, &rf);  } @@ -6939,10 +7156,6 @@ static int cpu_cgroup_can_attach(struct cgroup_taskset *tset)  #ifdef CONFIG_RT_GROUP_SCHED  		if (!sched_rt_can_attach(css_tg(css), task))  			return -EINVAL; -#else -		/* We don't support RT-tasks being in separate groups */ -		if (task->sched_class != &fair_sched_class) -			return -EINVAL;  #endif  		/*  		 * Serialize against wake_up_new_task() such that if its @@ -6973,6 +7186,178 @@ static void cpu_cgroup_attach(struct cgroup_taskset *tset)  		sched_move_task(task);  } +#ifdef CONFIG_UCLAMP_TASK_GROUP +static void cpu_util_update_eff(struct cgroup_subsys_state *css) +{ +	struct cgroup_subsys_state *top_css = css; +	struct uclamp_se *uc_parent = NULL; +	struct uclamp_se *uc_se = NULL; +	unsigned int eff[UCLAMP_CNT]; +	enum uclamp_id clamp_id; +	unsigned int clamps; + +	css_for_each_descendant_pre(css, top_css) { +		uc_parent = css_tg(css)->parent +			? 
css_tg(css)->parent->uclamp : NULL; + +		for_each_clamp_id(clamp_id) { +			/* Assume effective clamps matches requested clamps */ +			eff[clamp_id] = css_tg(css)->uclamp_req[clamp_id].value; +			/* Cap effective clamps with parent's effective clamps */ +			if (uc_parent && +			    eff[clamp_id] > uc_parent[clamp_id].value) { +				eff[clamp_id] = uc_parent[clamp_id].value; +			} +		} +		/* Ensure protection is always capped by limit */ +		eff[UCLAMP_MIN] = min(eff[UCLAMP_MIN], eff[UCLAMP_MAX]); + +		/* Propagate most restrictive effective clamps */ +		clamps = 0x0; +		uc_se = css_tg(css)->uclamp; +		for_each_clamp_id(clamp_id) { +			if (eff[clamp_id] == uc_se[clamp_id].value) +				continue; +			uc_se[clamp_id].value = eff[clamp_id]; +			uc_se[clamp_id].bucket_id = uclamp_bucket_id(eff[clamp_id]); +			clamps |= (0x1 << clamp_id); +		} +		if (!clamps) { +			css = css_rightmost_descendant(css); +			continue; +		} + +		/* Immediately update descendants RUNNABLE tasks */ +		uclamp_update_active_tasks(css, clamps); +	} +} + +/* + * Integer 10^N with a given N exponent by casting to integer the literal "1eN" + * C expression. Since there is no way to convert a macro argument (N) into a + * character constant, use two levels of macros. + */ +#define _POW10(exp) ((unsigned int)1e##exp) +#define POW10(exp) _POW10(exp) + +struct uclamp_request { +#define UCLAMP_PERCENT_SHIFT	2 +#define UCLAMP_PERCENT_SCALE	(100 * POW10(UCLAMP_PERCENT_SHIFT)) +	s64 percent; +	u64 util; +	int ret; +}; + +static inline struct uclamp_request +capacity_from_percent(char *buf) +{ +	struct uclamp_request req = { +		.percent = UCLAMP_PERCENT_SCALE, +		.util = SCHED_CAPACITY_SCALE, +		.ret = 0, +	}; + +	buf = strim(buf); +	if (strcmp(buf, "max")) { +		req.ret = cgroup_parse_float(buf, UCLAMP_PERCENT_SHIFT, +					     &req.percent); +		if (req.ret) +			return req; +		if (req.percent > UCLAMP_PERCENT_SCALE) { +			req.ret = -ERANGE; +			return req; +		} + +		req.util = req.percent << SCHED_CAPACITY_SHIFT; +		req.util = DIV_ROUND_CLOSEST_ULL(req.util, UCLAMP_PERCENT_SCALE); +	} + +	return req; +} + +static ssize_t cpu_uclamp_write(struct kernfs_open_file *of, char *buf, +				size_t nbytes, loff_t off, +				enum uclamp_id clamp_id) +{ +	struct uclamp_request req; +	struct task_group *tg; + +	req = capacity_from_percent(buf); +	if (req.ret) +		return req.ret; + +	mutex_lock(&uclamp_mutex); +	rcu_read_lock(); + +	tg = css_tg(of_css(of)); +	if (tg->uclamp_req[clamp_id].value != req.util) +		uclamp_se_set(&tg->uclamp_req[clamp_id], req.util, false); + +	/* +	 * Because of not recoverable conversion rounding we keep track of the +	 * exact requested value +	 */ +	tg->uclamp_pct[clamp_id] = req.percent; + +	/* Update effective clamps to track the most restrictive value */ +	cpu_util_update_eff(of_css(of)); + +	rcu_read_unlock(); +	mutex_unlock(&uclamp_mutex); + +	return nbytes; +} + +static ssize_t cpu_uclamp_min_write(struct kernfs_open_file *of, +				    char *buf, size_t nbytes, +				    loff_t off) +{ +	return cpu_uclamp_write(of, buf, nbytes, off, UCLAMP_MIN); +} + +static ssize_t cpu_uclamp_max_write(struct kernfs_open_file *of, +				    char *buf, size_t nbytes, +				    loff_t off) +{ +	return cpu_uclamp_write(of, buf, nbytes, off, UCLAMP_MAX); +} + +static inline void cpu_uclamp_print(struct seq_file *sf, +				    enum uclamp_id clamp_id) +{ +	struct task_group *tg; +	u64 util_clamp; +	u64 percent; +	u32 rem; + +	rcu_read_lock(); +	tg = css_tg(seq_css(sf)); +	util_clamp = tg->uclamp_req[clamp_id].value; +	rcu_read_unlock(); + +	
if (util_clamp == SCHED_CAPACITY_SCALE) { +		seq_puts(sf, "max\n"); +		return; +	} + +	percent = tg->uclamp_pct[clamp_id]; +	percent = div_u64_rem(percent, POW10(UCLAMP_PERCENT_SHIFT), &rem); +	seq_printf(sf, "%llu.%0*u\n", percent, UCLAMP_PERCENT_SHIFT, rem); +} + +static int cpu_uclamp_min_show(struct seq_file *sf, void *v) +{ +	cpu_uclamp_print(sf, UCLAMP_MIN); +	return 0; +} + +static int cpu_uclamp_max_show(struct seq_file *sf, void *v) +{ +	cpu_uclamp_print(sf, UCLAMP_MAX); +	return 0; +} +#endif /* CONFIG_UCLAMP_TASK_GROUP */ +  #ifdef CONFIG_FAIR_GROUP_SCHED  static int cpu_shares_write_u64(struct cgroup_subsys_state *css,  				struct cftype *cftype, u64 shareval) @@ -7318,6 +7703,20 @@ static struct cftype cpu_legacy_files[] = {  		.write_u64 = cpu_rt_period_write_uint,  	},  #endif +#ifdef CONFIG_UCLAMP_TASK_GROUP +	{ +		.name = "uclamp.min", +		.flags = CFTYPE_NOT_ON_ROOT, +		.seq_show = cpu_uclamp_min_show, +		.write = cpu_uclamp_min_write, +	}, +	{ +		.name = "uclamp.max", +		.flags = CFTYPE_NOT_ON_ROOT, +		.seq_show = cpu_uclamp_max_show, +		.write = cpu_uclamp_max_write, +	}, +#endif  	{ }	/* Terminate */  }; @@ -7485,6 +7884,20 @@ static struct cftype cpu_files[] = {  		.write = cpu_max_write,  	},  #endif +#ifdef CONFIG_UCLAMP_TASK_GROUP +	{ +		.name = "uclamp.min", +		.flags = CFTYPE_NOT_ON_ROOT, +		.seq_show = cpu_uclamp_min_show, +		.write = cpu_uclamp_min_write, +	}, +	{ +		.name = "uclamp.max", +		.flags = CFTYPE_NOT_ON_ROOT, +		.seq_show = cpu_uclamp_max_show, +		.write = cpu_uclamp_max_write, +	}, +#endif  	{ }	/* terminate */  }; diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 867b4bb6d4be..86800b4d5453 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -117,6 +117,7 @@ static void sugov_fast_switch(struct sugov_policy *sg_policy, u64 time,  			      unsigned int next_freq)  {  	struct cpufreq_policy *policy = sg_policy->policy; +	int cpu;  	if (!sugov_update_next_freq(sg_policy, time, next_freq))  		return; @@ -126,7 +127,11 @@ static void sugov_fast_switch(struct sugov_policy *sg_policy, u64 time,  		return;  	policy->cur = next_freq; -	trace_cpu_frequency(next_freq, smp_processor_id()); + +	if (trace_cpu_frequency_enabled()) { +		for_each_cpu(cpu, policy->cpus) +			trace_cpu_frequency(next_freq, cpu); +	}  }  static void sugov_deferred_update(struct sugov_policy *sg_policy, u64 time, @@ -263,9 +268,9 @@ unsigned long schedutil_cpu_util(int cpu, unsigned long util_cfs,  	 * irq metric. 
Because IRQ/steal time is hidden from the task clock we  	 * need to scale the task numbers:  	 * -	 *              1 - irq -	 *   U' = irq + ------- * U -	 *                max +	 *              max - irq +	 *   U' = irq + --------- * U +	 *                 max  	 */  	util = scale_irq_capacity(util, irq, max);  	util += irq; diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 46122edd8552..2dc48720f189 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -287,7 +287,7 @@ static void task_non_contending(struct task_struct *p)  	dl_se->dl_non_contending = 1;  	get_task_struct(p); -	hrtimer_start(timer, ns_to_ktime(zerolag_time), HRTIMER_MODE_REL); +	hrtimer_start(timer, ns_to_ktime(zerolag_time), HRTIMER_MODE_REL_HARD);  }  static void task_contending(struct sched_dl_entity *dl_se, int flags) @@ -529,6 +529,7 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq);  static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p)  {  	struct rq *later_rq = NULL; +	struct dl_bw *dl_b;  	later_rq = find_lock_later_rq(p, rq);  	if (!later_rq) { @@ -557,6 +558,38 @@ static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p  		double_lock_balance(rq, later_rq);  	} +	if (p->dl.dl_non_contending || p->dl.dl_throttled) { +		/* +		 * Inactive timer is armed (or callback is running, but +		 * waiting for us to release rq locks). In any case, when it +		 * will fire (or continue), it will see running_bw of this +		 * task migrated to later_rq (and correctly handle it). +		 */ +		sub_running_bw(&p->dl, &rq->dl); +		sub_rq_bw(&p->dl, &rq->dl); + +		add_rq_bw(&p->dl, &later_rq->dl); +		add_running_bw(&p->dl, &later_rq->dl); +	} else { +		sub_rq_bw(&p->dl, &rq->dl); +		add_rq_bw(&p->dl, &later_rq->dl); +	} + +	/* +	 * And we finally need to fixup root_domain(s) bandwidth accounting, +	 * since p is still hanging out in the old (now moved to default) root +	 * domain. 
+	 */ +	dl_b = &rq->rd->dl_bw; +	raw_spin_lock(&dl_b->lock); +	__dl_sub(dl_b, p->dl.dl_bw, cpumask_weight(rq->rd->span)); +	raw_spin_unlock(&dl_b->lock); + +	dl_b = &later_rq->rd->dl_bw; +	raw_spin_lock(&dl_b->lock); +	__dl_add(dl_b, p->dl.dl_bw, cpumask_weight(later_rq->rd->span)); +	raw_spin_unlock(&dl_b->lock); +  	set_task_cpu(p, later_rq->cpu);  	double_unlock_balance(later_rq, rq); @@ -923,7 +956,7 @@ static int start_dl_timer(struct task_struct *p)  	 */  	if (!hrtimer_is_queued(timer)) {  		get_task_struct(p); -		hrtimer_start(timer, act, HRTIMER_MODE_ABS); +		hrtimer_start(timer, act, HRTIMER_MODE_ABS_HARD);  	}  	return 1; @@ -1053,7 +1086,7 @@ void init_dl_task_timer(struct sched_dl_entity *dl_se)  {  	struct hrtimer *timer = &dl_se->dl_timer; -	hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); +	hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);  	timer->function = dl_task_timer;  } @@ -1292,7 +1325,7 @@ void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se)  {  	struct hrtimer *timer = &dl_se->inactive_timer; -	hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); +	hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);  	timer->function = inactive_task_timer;  } @@ -1694,12 +1727,20 @@ static void start_hrtick_dl(struct rq *rq, struct task_struct *p)  }  #endif -static inline void set_next_task(struct rq *rq, struct task_struct *p) +static void set_next_task_dl(struct rq *rq, struct task_struct *p)  {  	p->se.exec_start = rq_clock_task(rq);  	/* You can't push away the running task */  	dequeue_pushable_dl_task(rq, p); + +	if (hrtick_enabled(rq)) +		start_hrtick_dl(rq, p); + +	if (rq->curr->sched_class != &dl_sched_class) +		update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 0); + +	deadline_queue_push_tasks(rq);  }  static struct sched_dl_entity *pick_next_dl_entity(struct rq *rq, @@ -1720,64 +1761,42 @@ pick_next_task_dl(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)  	struct task_struct *p;  	struct dl_rq *dl_rq; -	dl_rq = &rq->dl; +	WARN_ON_ONCE(prev || rf); -	if (need_pull_dl_task(rq, prev)) { -		/* -		 * This is OK, because current is on_cpu, which avoids it being -		 * picked for load-balance and preemption/IRQs are still -		 * disabled avoiding further scheduler activity on it and we're -		 * being very careful to re-start the picking loop. -		 */ -		rq_unpin_lock(rq, rf); -		pull_dl_task(rq); -		rq_repin_lock(rq, rf); -		/* -		 * pull_dl_task() can drop (and re-acquire) rq->lock; this -		 * means a stop task can slip in, in which case we need to -		 * re-start task selection. -		 */ -		if (rq->stop && task_on_rq_queued(rq->stop)) -			return RETRY_TASK; -	} - -	/* -	 * When prev is DL, we may throttle it in put_prev_task(). -	 * So, we update time before we check for dl_nr_running. 
-	 */ -	if (prev->sched_class == &dl_sched_class) -		update_curr_dl(rq); +	dl_rq = &rq->dl;  	if (unlikely(!dl_rq->dl_nr_running))  		return NULL; -	put_prev_task(rq, prev); -  	dl_se = pick_next_dl_entity(rq, dl_rq);  	BUG_ON(!dl_se);  	p = dl_task_of(dl_se); -	set_next_task(rq, p); - -	if (hrtick_enabled(rq)) -		start_hrtick_dl(rq, p); - -	deadline_queue_push_tasks(rq); - -	if (rq->curr->sched_class != &dl_sched_class) -		update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 0); +	set_next_task_dl(rq, p);  	return p;  } -static void put_prev_task_dl(struct rq *rq, struct task_struct *p) +static void put_prev_task_dl(struct rq *rq, struct task_struct *p, struct rq_flags *rf)  {  	update_curr_dl(rq);  	update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 1);  	if (on_dl_rq(&p->dl) && p->nr_cpus_allowed > 1)  		enqueue_pushable_dl_task(rq, p); + +	if (rf && !on_dl_rq(&p->dl) && need_pull_dl_task(rq, p)) { +		/* +		 * This is OK, because current is on_cpu, which avoids it being +		 * picked for load-balance and preemption/IRQs are still +		 * disabled avoiding further scheduler activity on it and we've +		 * not yet started the picking loop. +		 */ +		rq_unpin_lock(rq, rf); +		pull_dl_task(rq); +		rq_repin_lock(rq, rf); +	}  }  /* @@ -1811,11 +1830,6 @@ static void task_fork_dl(struct task_struct *p)  	 */  } -static void set_curr_task_dl(struct rq *rq) -{ -	set_next_task(rq, rq->curr); -} -  #ifdef CONFIG_SMP  /* Only try algorithms three times */ @@ -2275,6 +2289,36 @@ void __init init_sched_dl_class(void)  					GFP_KERNEL, cpu_to_node(i));  } +void dl_add_task_root_domain(struct task_struct *p) +{ +	struct rq_flags rf; +	struct rq *rq; +	struct dl_bw *dl_b; + +	rq = task_rq_lock(p, &rf); +	if (!dl_task(p)) +		goto unlock; + +	dl_b = &rq->rd->dl_bw; +	raw_spin_lock(&dl_b->lock); + +	__dl_add(dl_b, p->dl.dl_bw, cpumask_weight(rq->rd->span)); + +	raw_spin_unlock(&dl_b->lock); + +unlock: +	task_rq_unlock(rq, p, &rf); +} + +void dl_clear_root_domain(struct root_domain *rd) +{ +	unsigned long flags; + +	raw_spin_lock_irqsave(&rd->dl_bw.lock, flags); +	rd->dl_bw.total_bw = 0; +	raw_spin_unlock_irqrestore(&rd->dl_bw.lock, flags); +} +  #endif /* CONFIG_SMP */  static void switched_from_dl(struct rq *rq, struct task_struct *p) @@ -2395,6 +2439,7 @@ const struct sched_class dl_sched_class = {  	.pick_next_task		= pick_next_task_dl,  	.put_prev_task		= put_prev_task_dl, +	.set_next_task		= set_next_task_dl,  #ifdef CONFIG_SMP  	.select_task_rq		= select_task_rq_dl, @@ -2405,7 +2450,6 @@ const struct sched_class dl_sched_class = {  	.task_woken		= task_woken_dl,  #endif -	.set_curr_task		= set_curr_task_dl,  	.task_tick		= task_tick_dl,  	.task_fork              = task_fork_dl, diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 500f5db0de0b..83ab35e2374f 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -96,12 +96,12 @@ int __weak arch_asym_cpu_priority(int cpu)  }  /* - * The margin used when comparing utilization with CPU capacity: - * util * margin < capacity * 1024 + * The margin used when comparing utilization with CPU capacity.   
*   * (default: ~20%)   */ -static unsigned int capacity_margin			= 1280; +#define fits_capacity(cap, max)	((cap) * 1280 < (max) * 1024) +  #endif  #ifdef CONFIG_CFS_BANDWIDTH @@ -749,7 +749,6 @@ void init_entity_runnable_average(struct sched_entity *se)  	/* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */  } -static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);  static void attach_entity_cfs_rq(struct sched_entity *se);  /* @@ -1188,47 +1187,6 @@ static unsigned int task_scan_max(struct task_struct *p)  	return max(smin, smax);  } -void init_numa_balancing(unsigned long clone_flags, struct task_struct *p) -{ -	int mm_users = 0; -	struct mm_struct *mm = p->mm; - -	if (mm) { -		mm_users = atomic_read(&mm->mm_users); -		if (mm_users == 1) { -			mm->numa_next_scan = jiffies + msecs_to_jiffies(sysctl_numa_balancing_scan_delay); -			mm->numa_scan_seq = 0; -		} -	} -	p->node_stamp			= 0; -	p->numa_scan_seq		= mm ? mm->numa_scan_seq : 0; -	p->numa_scan_period		= sysctl_numa_balancing_scan_delay; -	p->numa_work.next		= &p->numa_work; -	p->numa_faults			= NULL; -	RCU_INIT_POINTER(p->numa_group, NULL); -	p->last_task_numa_placement	= 0; -	p->last_sum_exec_runtime	= 0; - -	/* New address space, reset the preferred nid */ -	if (!(clone_flags & CLONE_VM)) { -		p->numa_preferred_nid = NUMA_NO_NODE; -		return; -	} - -	/* -	 * New thread, keep existing numa_preferred_nid which should be copied -	 * already by arch_dup_task_struct but stagger when scans start. -	 */ -	if (mm) { -		unsigned int delay; - -		delay = min_t(unsigned int, task_scan_max(current), -			current->numa_scan_period * mm_users * NSEC_PER_MSEC); -		delay += 2 * TICK_NSEC; -		p->node_stamp = delay; -	} -} -  static void account_numa_enqueue(struct rq *rq, struct task_struct *p)  {  	rq->nr_numa_running += (p->numa_preferred_nid != NUMA_NO_NODE); @@ -1644,7 +1602,7 @@ static void task_numa_compare(struct task_numa_env *env,  		return;  	rcu_read_lock(); -	cur = task_rcu_dereference(&dst_rq->curr); +	cur = rcu_dereference(dst_rq->curr);  	if (cur && ((cur->flags & PF_EXITING) || is_idle_task(cur)))  		cur = NULL; @@ -2523,7 +2481,7 @@ static void reset_ptenuma_scan(struct task_struct *p)   * The expensive part of numa migration is done from task_work context.   * Triggered from task_tick_numa().   */ -void task_numa_work(struct callback_head *work) +static void task_numa_work(struct callback_head *work)  {  	unsigned long migrate, next_scan, now = jiffies;  	struct task_struct *p = current; @@ -2536,7 +2494,7 @@ void task_numa_work(struct callback_head *work)  	SCHED_WARN_ON(p != container_of(work, struct task_struct, numa_work)); -	work->next = work; /* protect against double add */ +	work->next = work;  	/*  	 * Who cares about NUMA placement when they're dying.  	 * @@ -2665,6 +2623,50 @@ out:  	}  } +void init_numa_balancing(unsigned long clone_flags, struct task_struct *p) +{ +	int mm_users = 0; +	struct mm_struct *mm = p->mm; + +	if (mm) { +		mm_users = atomic_read(&mm->mm_users); +		if (mm_users == 1) { +			mm->numa_next_scan = jiffies + msecs_to_jiffies(sysctl_numa_balancing_scan_delay); +			mm->numa_scan_seq = 0; +		} +	} +	p->node_stamp			= 0; +	p->numa_scan_seq		= mm ? 
mm->numa_scan_seq : 0; +	p->numa_scan_period		= sysctl_numa_balancing_scan_delay; +	/* Protect against double add, see task_tick_numa and task_numa_work */ +	p->numa_work.next		= &p->numa_work; +	p->numa_faults			= NULL; +	RCU_INIT_POINTER(p->numa_group, NULL); +	p->last_task_numa_placement	= 0; +	p->last_sum_exec_runtime	= 0; + +	init_task_work(&p->numa_work, task_numa_work); + +	/* New address space, reset the preferred nid */ +	if (!(clone_flags & CLONE_VM)) { +		p->numa_preferred_nid = NUMA_NO_NODE; +		return; +	} + +	/* +	 * New thread, keep existing numa_preferred_nid which should be copied +	 * already by arch_dup_task_struct but stagger when scans start. +	 */ +	if (mm) { +		unsigned int delay; + +		delay = min_t(unsigned int, task_scan_max(current), +			current->numa_scan_period * mm_users * NSEC_PER_MSEC); +		delay += 2 * TICK_NSEC; +		p->node_stamp = delay; +	} +} +  /*   * Drive the periodic memory faults..   */ @@ -2693,10 +2695,8 @@ static void task_tick_numa(struct rq *rq, struct task_struct *curr)  			curr->numa_scan_period = task_scan_start(curr);  		curr->node_stamp += period; -		if (!time_before(jiffies, curr->mm->numa_next_scan)) { -			init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */ +		if (!time_before(jiffies, curr->mm->numa_next_scan))  			task_work_add(curr, work, true); -		}  	}  } @@ -3689,8 +3689,6 @@ static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq)  	return cfs_rq->avg.load_avg;  } -static int idle_balance(struct rq *this_rq, struct rq_flags *rf); -  static inline unsigned long task_util(struct task_struct *p)  {  	return READ_ONCE(p->se.avg.util_avg); @@ -3807,7 +3805,7 @@ util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep)  static inline int task_fits_capacity(struct task_struct *p, long capacity)  { -	return capacity * 1024 > task_util_est(p) * capacity_margin; +	return fits_capacity(task_util_est(p), capacity);  }  static inline void update_misfit_status(struct task_struct *p, struct rq *rq) @@ -4355,23 +4353,16 @@ static inline u64 sched_cfs_bandwidth_slice(void)  }  /* - * Replenish runtime according to assigned quota and update expiration time. - * We use sched_clock_cpu directly instead of rq->clock to avoid adding - * additional synchronization around rq->lock. + * Replenish runtime according to assigned quota. We use sched_clock_cpu + * directly instead of rq->clock to avoid adding additional synchronization + * around rq->lock.   
*   * requires cfs_b->lock   */  void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)  { -	u64 now; - -	if (cfs_b->quota == RUNTIME_INF) -		return; - -	now = sched_clock_cpu(smp_processor_id()); -	cfs_b->runtime = cfs_b->quota; -	cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period); -	cfs_b->expires_seq++; +	if (cfs_b->quota != RUNTIME_INF) +		cfs_b->runtime = cfs_b->quota;  }  static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg) @@ -4379,22 +4370,12 @@ static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)  	return &tg->cfs_bandwidth;  } -/* rq->task_clock normalized against any time this cfs_rq has spent throttled */ -static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq) -{ -	if (unlikely(cfs_rq->throttle_count)) -		return cfs_rq->throttled_clock_task - cfs_rq->throttled_clock_task_time; - -	return rq_clock_task(rq_of(cfs_rq)) - cfs_rq->throttled_clock_task_time; -} -  /* returns 0 on failure to allocate runtime */  static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)  {  	struct task_group *tg = cfs_rq->tg;  	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg); -	u64 amount = 0, min_amount, expires; -	int expires_seq; +	u64 amount = 0, min_amount;  	/* note: this is a positive sum as runtime_remaining <= 0 */  	min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining; @@ -4411,61 +4392,17 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)  			cfs_b->idle = 0;  		}  	} -	expires_seq = cfs_b->expires_seq; -	expires = cfs_b->runtime_expires;  	raw_spin_unlock(&cfs_b->lock);  	cfs_rq->runtime_remaining += amount; -	/* -	 * we may have advanced our local expiration to account for allowed -	 * spread between our sched_clock and the one on which runtime was -	 * issued. -	 */ -	if (cfs_rq->expires_seq != expires_seq) { -		cfs_rq->expires_seq = expires_seq; -		cfs_rq->runtime_expires = expires; -	}  	return cfs_rq->runtime_remaining > 0;  } -/* - * Note: This depends on the synchronization provided by sched_clock and the - * fact that rq->clock snapshots this value. - */ -static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq) -{ -	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); - -	/* if the deadline is ahead of our clock, nothing to do */ -	if (likely((s64)(rq_clock(rq_of(cfs_rq)) - cfs_rq->runtime_expires) < 0)) -		return; - -	if (cfs_rq->runtime_remaining < 0) -		return; - -	/* -	 * If the local deadline has passed we have to consider the -	 * possibility that our sched_clock is 'fast' and the global deadline -	 * has not truly expired. -	 * -	 * Fortunately we can check determine whether this the case by checking -	 * whether the global deadline(cfs_b->expires_seq) has advanced. 
-	 */ -	if (cfs_rq->expires_seq == cfs_b->expires_seq) { -		/* extend local deadline, drift is bounded above by 2 ticks */ -		cfs_rq->runtime_expires += TICK_NSEC; -	} else { -		/* global deadline is ahead, expiration has passed */ -		cfs_rq->runtime_remaining = 0; -	} -} -  static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)  {  	/* dock delta_exec before expiring quota (as it could span periods) */  	cfs_rq->runtime_remaining -= delta_exec; -	expire_cfs_rq_runtime(cfs_rq);  	if (likely(cfs_rq->runtime_remaining > 0))  		return; @@ -4524,7 +4461,6 @@ static int tg_unthrottle_up(struct task_group *tg, void *data)  	cfs_rq->throttle_count--;  	if (!cfs_rq->throttle_count) { -		/* adjust cfs_rq_clock_task() */  		cfs_rq->throttled_clock_task_time += rq_clock_task(rq) -  					     cfs_rq->throttled_clock_task; @@ -4556,7 +4492,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)  	struct rq *rq = rq_of(cfs_rq);  	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);  	struct sched_entity *se; -	long task_delta, dequeue = 1; +	long task_delta, idle_task_delta, dequeue = 1;  	bool empty;  	se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))]; @@ -4567,6 +4503,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)  	rcu_read_unlock();  	task_delta = cfs_rq->h_nr_running; +	idle_task_delta = cfs_rq->idle_h_nr_running;  	for_each_sched_entity(se) {  		struct cfs_rq *qcfs_rq = cfs_rq_of(se);  		/* throttled entity or throttle-on-deactivate */ @@ -4576,6 +4513,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)  		if (dequeue)  			dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);  		qcfs_rq->h_nr_running -= task_delta; +		qcfs_rq->idle_h_nr_running -= idle_task_delta;  		if (qcfs_rq->load.weight)  			dequeue = 0; @@ -4615,7 +4553,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)  	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);  	struct sched_entity *se;  	int enqueue = 1; -	long task_delta; +	long task_delta, idle_task_delta;  	se = cfs_rq->tg->se[cpu_of(rq)]; @@ -4635,6 +4573,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)  		return;  	task_delta = cfs_rq->h_nr_running; +	idle_task_delta = cfs_rq->idle_h_nr_running;  	for_each_sched_entity(se) {  		if (se->on_rq)  			enqueue = 0; @@ -4643,6 +4582,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)  		if (enqueue)  			enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP);  		cfs_rq->h_nr_running += task_delta; +		cfs_rq->idle_h_nr_running += idle_task_delta;  		if (cfs_rq_throttled(cfs_rq))  			break; @@ -4658,8 +4598,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)  		resched_curr(rq);  } -static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b, -		u64 remaining, u64 expires) +static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b, u64 remaining)  {  	struct cfs_rq *cfs_rq;  	u64 runtime; @@ -4684,7 +4623,6 @@ static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,  		remaining -= runtime;  		cfs_rq->runtime_remaining += runtime; -		cfs_rq->runtime_expires = expires;  		/* we check whether we're throttled above */  		if (cfs_rq->runtime_remaining > 0) @@ -4709,7 +4647,7 @@ next:   */  static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, unsigned long flags)  { -	u64 runtime, runtime_expires; +	u64 runtime;  	int throttled;  	/* no need to continue the timer with no bandwidth constraint */ @@ -4737,8 +4675,6 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, u  	/* account preceding periods in which throttling occurred */  	cfs_b->nr_throttled 
+= overrun; -	runtime_expires = cfs_b->runtime_expires; -  	/*  	 * This check is repeated as we are holding onto the new bandwidth while  	 * we unthrottle. This can potentially race with an unthrottled group @@ -4751,8 +4687,7 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, u  		cfs_b->distribute_running = 1;  		raw_spin_unlock_irqrestore(&cfs_b->lock, flags);  		/* we can't nest cfs_b->lock while distributing bandwidth */ -		runtime = distribute_cfs_runtime(cfs_b, runtime, -						 runtime_expires); +		runtime = distribute_cfs_runtime(cfs_b, runtime);  		raw_spin_lock_irqsave(&cfs_b->lock, flags);  		cfs_b->distribute_running = 0; @@ -4834,8 +4769,7 @@ static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq)  		return;  	raw_spin_lock(&cfs_b->lock); -	if (cfs_b->quota != RUNTIME_INF && -	    cfs_rq->runtime_expires == cfs_b->runtime_expires) { +	if (cfs_b->quota != RUNTIME_INF) {  		cfs_b->runtime += slack_runtime;  		/* we are under rq->lock, defer unthrottling using a timer */ @@ -4868,7 +4802,6 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)  {  	u64 runtime = 0, slice = sched_cfs_bandwidth_slice();  	unsigned long flags; -	u64 expires;  	/* confirm we're still not at a refresh boundary */  	raw_spin_lock_irqsave(&cfs_b->lock, flags); @@ -4886,7 +4819,6 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)  	if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice)  		runtime = cfs_b->runtime; -	expires = cfs_b->runtime_expires;  	if (runtime)  		cfs_b->distribute_running = 1; @@ -4895,11 +4827,10 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)  	if (!runtime)  		return; -	runtime = distribute_cfs_runtime(cfs_b, runtime, expires); +	runtime = distribute_cfs_runtime(cfs_b, runtime);  	raw_spin_lock_irqsave(&cfs_b->lock, flags); -	if (expires == cfs_b->runtime_expires) -		lsub_positive(&cfs_b->runtime, runtime); +	lsub_positive(&cfs_b->runtime, runtime);  	cfs_b->distribute_running = 0;  	raw_spin_unlock_irqrestore(&cfs_b->lock, flags);  } @@ -5047,17 +4978,13 @@ static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)  void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)  { -	u64 overrun; -  	lockdep_assert_held(&cfs_b->lock);  	if (cfs_b->period_active)  		return;  	cfs_b->period_active = 1; -	overrun = hrtimer_forward_now(&cfs_b->period_timer, cfs_b->period); -	cfs_b->runtime_expires += (overrun + 1) * ktime_to_ns(cfs_b->period); -	cfs_b->expires_seq++; +	hrtimer_forward_now(&cfs_b->period_timer, cfs_b->period);  	hrtimer_start_expires(&cfs_b->period_timer, HRTIMER_MODE_ABS_PINNED);  } @@ -5135,11 +5062,6 @@ static inline bool cfs_bandwidth_used(void)  	return false;  } -static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq) -{ -	return rq_clock_task(rq_of(cfs_rq)); -} -  static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {}  static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; }  static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {} @@ -5235,7 +5157,7 @@ static inline unsigned long cpu_util(int cpu);  static inline bool cpu_overutilized(int cpu)  { -	return (capacity_of(cpu) * 1024) < (cpu_util(cpu) * capacity_margin); +	return !fits_capacity(cpu_util(cpu), capacity_of(cpu));  }  static inline void update_overutilized_status(struct rq *rq) @@ -5259,6 +5181,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)  {  	struct cfs_rq *cfs_rq;  	struct sched_entity *se = &p->se; +	int idle_h_nr_running = task_has_idle_policy(p); 
 	/*  	 * The code below (indirectly) updates schedutil which looks at @@ -5291,6 +5214,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)  		if (cfs_rq_throttled(cfs_rq))  			break;  		cfs_rq->h_nr_running++; +		cfs_rq->idle_h_nr_running += idle_h_nr_running;  		flags = ENQUEUE_WAKEUP;  	} @@ -5298,6 +5222,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)  	for_each_sched_entity(se) {  		cfs_rq = cfs_rq_of(se);  		cfs_rq->h_nr_running++; +		cfs_rq->idle_h_nr_running += idle_h_nr_running;  		if (cfs_rq_throttled(cfs_rq))  			break; @@ -5359,6 +5284,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)  	struct cfs_rq *cfs_rq;  	struct sched_entity *se = &p->se;  	int task_sleep = flags & DEQUEUE_SLEEP; +	int idle_h_nr_running = task_has_idle_policy(p);  	for_each_sched_entity(se) {  		cfs_rq = cfs_rq_of(se); @@ -5373,6 +5299,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)  		if (cfs_rq_throttled(cfs_rq))  			break;  		cfs_rq->h_nr_running--; +		cfs_rq->idle_h_nr_running -= idle_h_nr_running;  		/* Don't dequeue parent if it has other entities besides us */  		if (cfs_rq->load.weight) { @@ -5392,6 +5319,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)  	for_each_sched_entity(se) {  		cfs_rq = cfs_rq_of(se);  		cfs_rq->h_nr_running--; +		cfs_rq->idle_h_nr_running -= idle_h_nr_running;  		if (cfs_rq_throttled(cfs_rq))  			break; @@ -5425,6 +5353,15 @@ static struct {  #endif /* CONFIG_NO_HZ_COMMON */ +/* CPU only has SCHED_IDLE tasks enqueued */ +static int sched_idle_cpu(int cpu) +{ +	struct rq *rq = cpu_rq(cpu); + +	return unlikely(rq->nr_running == rq->cfs.idle_h_nr_running && +			rq->nr_running); +} +  static unsigned long cpu_runnable_load(struct rq *rq)  {  	return cfs_rq_runnable_load_avg(&rq->cfs); @@ -5747,7 +5684,7 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this  	unsigned int min_exit_latency = UINT_MAX;  	u64 latest_idle_timestamp = 0;  	int least_loaded_cpu = this_cpu; -	int shallowest_idle_cpu = -1; +	int shallowest_idle_cpu = -1, si_cpu = -1;  	int i;  	/* Check if we have any choice: */ @@ -5778,7 +5715,12 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this  				latest_idle_timestamp = rq->idle_stamp;  				shallowest_idle_cpu = i;  			} -		} else if (shallowest_idle_cpu == -1) { +		} else if (shallowest_idle_cpu == -1 && si_cpu == -1) { +			if (sched_idle_cpu(i)) { +				si_cpu = i; +				continue; +			} +  			load = cpu_runnable_load(cpu_rq(i));  			if (load < min_load) {  				min_load = load; @@ -5787,7 +5729,11 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this  		}  	} -	return shallowest_idle_cpu != -1 ? 
shallowest_idle_cpu : least_loaded_cpu; +	if (shallowest_idle_cpu != -1) +		return shallowest_idle_cpu; +	if (si_cpu != -1) +		return si_cpu; +	return least_loaded_cpu;  }  static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p, @@ -5940,7 +5886,7 @@ static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int   */  static int select_idle_smt(struct task_struct *p, int target)  { -	int cpu; +	int cpu, si_cpu = -1;  	if (!static_branch_likely(&sched_smt_present))  		return -1; @@ -5950,9 +5896,11 @@ static int select_idle_smt(struct task_struct *p, int target)  			continue;  		if (available_idle_cpu(cpu))  			return cpu; +		if (si_cpu == -1 && sched_idle_cpu(cpu)) +			si_cpu = cpu;  	} -	return -1; +	return si_cpu;  }  #else /* CONFIG_SCHED_SMT */ @@ -5980,8 +5928,8 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t  	u64 avg_cost, avg_idle;  	u64 time, cost;  	s64 delta; -	int cpu, nr = INT_MAX;  	int this = smp_processor_id(); +	int cpu, nr = INT_MAX, si_cpu = -1;  	this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc));  	if (!this_sd) @@ -6009,11 +5957,13 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t  	for_each_cpu_wrap(cpu, sched_domain_span(sd), target) {  		if (!--nr) -			return -1; +			return si_cpu;  		if (!cpumask_test_cpu(cpu, p->cpus_ptr))  			continue;  		if (available_idle_cpu(cpu))  			break; +		if (si_cpu == -1 && sched_idle_cpu(cpu)) +			si_cpu = cpu;  	}  	time = cpu_clock(this) - time; @@ -6032,13 +5982,14 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)  	struct sched_domain *sd;  	int i, recent_used_cpu; -	if (available_idle_cpu(target)) +	if (available_idle_cpu(target) || sched_idle_cpu(target))  		return target;  	/*  	 * If the previous CPU is cache affine and idle, don't be stupid:  	 */ -	if (prev != target && cpus_share_cache(prev, target) && available_idle_cpu(prev)) +	if (prev != target && cpus_share_cache(prev, target) && +	    (available_idle_cpu(prev) || sched_idle_cpu(prev)))  		return prev;  	/* Check a recently used CPU as a potential idle candidate: */ @@ -6046,7 +5997,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)  	if (recent_used_cpu != prev &&  	    recent_used_cpu != target &&  	    cpus_share_cache(recent_used_cpu, target) && -	    available_idle_cpu(recent_used_cpu) && +	    (available_idle_cpu(recent_used_cpu) || sched_idle_cpu(recent_used_cpu)) &&  	    cpumask_test_cpu(p->recent_used_cpu, p->cpus_ptr)) {  		/*  		 * Replace recent_used_cpu with prev as it is a potential @@ -6282,69 +6233,55 @@ static unsigned long cpu_util_next(int cpu, struct task_struct *p, int dst_cpu)  }  /* - * compute_energy(): Estimates the energy that would be consumed if @p was + * compute_energy(): Estimates the energy that @pd would consume if @p was   * migrated to @dst_cpu. compute_energy() predicts what will be the utilization - * landscape of the * CPUs after the task migration, and uses the Energy Model + * landscape of @pd's CPUs after the task migration, and uses the Energy Model   * to compute what would be the energy if we decided to actually migrate that   * task.   
*/  static long  compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd)  { -	unsigned int max_util, util_cfs, cpu_util, cpu_cap; -	unsigned long sum_util, energy = 0; -	struct task_struct *tsk; +	struct cpumask *pd_mask = perf_domain_span(pd); +	unsigned long cpu_cap = arch_scale_cpu_capacity(cpumask_first(pd_mask)); +	unsigned long max_util = 0, sum_util = 0;  	int cpu; -	for (; pd; pd = pd->next) { -		struct cpumask *pd_mask = perf_domain_span(pd); +	/* +	 * The capacity state of CPUs of the current rd can be driven by CPUs +	 * of another rd if they belong to the same pd. So, account for the +	 * utilization of these CPUs too by masking pd with cpu_online_mask +	 * instead of the rd span. +	 * +	 * If an entire pd is outside of the current rd, it will not appear in +	 * its pd list and will not be accounted by compute_energy(). +	 */ +	for_each_cpu_and(cpu, pd_mask, cpu_online_mask) { +		unsigned long cpu_util, util_cfs = cpu_util_next(cpu, p, dst_cpu); +		struct task_struct *tsk = cpu == dst_cpu ? p : NULL;  		/* -		 * The energy model mandates all the CPUs of a performance -		 * domain have the same capacity. +		 * Busy time computation: utilization clamping is not +		 * required since the ratio (sum_util / cpu_capacity) +		 * is already enough to scale the EM reported power +		 * consumption at the (eventually clamped) cpu_capacity.  		 */ -		cpu_cap = arch_scale_cpu_capacity(cpumask_first(pd_mask)); -		max_util = sum_util = 0; +		sum_util += schedutil_cpu_util(cpu, util_cfs, cpu_cap, +					       ENERGY_UTIL, NULL);  		/* -		 * The capacity state of CPUs of the current rd can be driven by -		 * CPUs of another rd if they belong to the same performance -		 * domain. So, account for the utilization of these CPUs too -		 * by masking pd with cpu_online_mask instead of the rd span. -		 * -		 * If an entire performance domain is outside of the current rd, -		 * it will not appear in its pd list and will not be accounted -		 * by compute_energy(). +		 * Performance domain frequency: utilization clamping +		 * must be considered since it affects the selection +		 * of the performance domain frequency. +		 * NOTE: in case RT tasks are running, by default the +		 * FREQUENCY_UTIL's utilization can be max OPP.  		 */ -		for_each_cpu_and(cpu, pd_mask, cpu_online_mask) { -			util_cfs = cpu_util_next(cpu, p, dst_cpu); - -			/* -			 * Busy time computation: utilization clamping is not -			 * required since the ratio (sum_util / cpu_capacity) -			 * is already enough to scale the EM reported power -			 * consumption at the (eventually clamped) cpu_capacity. -			 */ -			sum_util += schedutil_cpu_util(cpu, util_cfs, cpu_cap, -						       ENERGY_UTIL, NULL); - -			/* -			 * Performance domain frequency: utilization clamping -			 * must be considered since it affects the selection -			 * of the performance domain frequency. -			 * NOTE: in case RT tasks are running, by default the -			 * FREQUENCY_UTIL's utilization can be max OPP. -			 */ -			tsk = cpu == dst_cpu ? 
p : NULL; -			cpu_util = schedutil_cpu_util(cpu, util_cfs, cpu_cap, -						      FREQUENCY_UTIL, tsk); -			max_util = max(max_util, cpu_util); -		} - -		energy += em_pd_energy(pd->em_pd, max_util, sum_util); +		cpu_util = schedutil_cpu_util(cpu, util_cfs, cpu_cap, +					      FREQUENCY_UTIL, tsk); +		max_util = max(max_util, cpu_util);  	} -	return energy; +	return em_pd_energy(pd->em_pd, max_util, sum_util);  }  /* @@ -6386,21 +6323,19 @@ compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd)   * other use-cases too. So, until someone finds a better way to solve this,   * let's keep things simple by re-using the existing slow path.   */ -  static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)  { -	unsigned long prev_energy = ULONG_MAX, best_energy = ULONG_MAX; +	unsigned long prev_delta = ULONG_MAX, best_delta = ULONG_MAX;  	struct root_domain *rd = cpu_rq(smp_processor_id())->rd; +	unsigned long cpu_cap, util, base_energy = 0;  	int cpu, best_energy_cpu = prev_cpu; -	struct perf_domain *head, *pd; -	unsigned long cpu_cap, util;  	struct sched_domain *sd; +	struct perf_domain *pd;  	rcu_read_lock();  	pd = rcu_dereference(rd->pd);  	if (!pd || READ_ONCE(rd->overutilized))  		goto fail; -	head = pd;  	/*  	 * Energy-aware wake-up happens on the lowest sched_domain starting @@ -6417,9 +6352,14 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)  		goto unlock;  	for (; pd; pd = pd->next) { -		unsigned long cur_energy, spare_cap, max_spare_cap = 0; +		unsigned long cur_delta, spare_cap, max_spare_cap = 0; +		unsigned long base_energy_pd;  		int max_spare_cap_cpu = -1; +		/* Compute the 'base' energy of the pd, without @p */ +		base_energy_pd = compute_energy(p, -1, pd); +		base_energy += base_energy_pd; +  		for_each_cpu_and(cpu, perf_domain_span(pd), sched_domain_span(sd)) {  			if (!cpumask_test_cpu(cpu, p->cpus_ptr))  				continue; @@ -6427,14 +6367,14 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)  			/* Skip CPUs that will be overutilized. */  			util = cpu_util_next(cpu, p, cpu);  			cpu_cap = capacity_of(cpu); -			if (cpu_cap * 1024 < util * capacity_margin) +			if (!fits_capacity(util, cpu_cap))  				continue;  			/* Always use prev_cpu as a candidate. */  			if (cpu == prev_cpu) { -				prev_energy = compute_energy(p, prev_cpu, head); -				best_energy = min(best_energy, prev_energy); -				continue; +				prev_delta = compute_energy(p, prev_cpu, pd); +				prev_delta -= base_energy_pd; +				best_delta = min(best_delta, prev_delta);  			}  			/* @@ -6449,10 +6389,11 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)  		}  		/* Evaluate the energy impact of using this CPU. */ -		if (max_spare_cap_cpu >= 0) { -			cur_energy = compute_energy(p, max_spare_cap_cpu, head); -			if (cur_energy < best_energy) { -				best_energy = cur_energy; +		if (max_spare_cap_cpu >= 0 && max_spare_cap_cpu != prev_cpu) { +			cur_delta = compute_energy(p, max_spare_cap_cpu, pd); +			cur_delta -= base_energy_pd; +			if (cur_delta < best_delta) { +				best_delta = cur_delta;  				best_energy_cpu = max_spare_cap_cpu;  			}  		} @@ -6464,10 +6405,10 @@ unlock:  	 * Pick the best CPU if prev_cpu cannot be used, or if it saves at  	 * least 6% of the energy used by prev_cpu.  	 
*/ -	if (prev_energy == ULONG_MAX) +	if (prev_delta == ULONG_MAX)  		return best_energy_cpu; -	if ((prev_energy - best_energy) > (prev_energy >> 4)) +	if ((prev_delta - best_delta) > ((prev_delta + base_energy) >> 4))  		return best_energy_cpu;  	return prev_cpu; @@ -6801,7 +6742,7 @@ again:  		goto idle;  #ifdef CONFIG_FAIR_GROUP_SCHED -	if (prev->sched_class != &fair_sched_class) +	if (!prev || prev->sched_class != &fair_sched_class)  		goto simple;  	/* @@ -6878,8 +6819,8 @@ again:  	goto done;  simple:  #endif - -	put_prev_task(rq, prev); +	if (prev) +		put_prev_task(rq, prev);  	do {  		se = pick_next_entity(cfs_rq, NULL); @@ -6907,11 +6848,13 @@ done: __maybe_unused;  	return p;  idle: -	update_misfit_status(NULL, rq); -	new_tasks = idle_balance(rq, rf); +	if (!rf) +		return NULL; + +	new_tasks = newidle_balance(rq, rf);  	/* -	 * Because idle_balance() releases (and re-acquires) rq->lock, it is +	 * Because newidle_balance() releases (and re-acquires) rq->lock, it is  	 * possible for any higher priority task to appear. In that case we  	 * must re-start the pick_next_entity() loop.  	 */ @@ -6933,7 +6876,7 @@ idle:  /*   * Account for a descheduled task:   */ -static void put_prev_task_fair(struct rq *rq, struct task_struct *prev) +static void put_prev_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)  {  	struct sched_entity *se = &prev->se;  	struct cfs_rq *cfs_rq; @@ -7435,7 +7378,7 @@ static int detach_tasks(struct lb_env *env)  		detached++;  		env->imbalance -= load; -#ifdef CONFIG_PREEMPT +#ifdef CONFIG_PREEMPTION  		/*  		 * NEWIDLE balancing is a source of latency, so preemptible  		 * kernels will stop after the first task is detached to minimize @@ -7982,8 +7925,7 @@ group_is_overloaded(struct lb_env *env, struct sg_lb_stats *sgs)  static inline bool  group_smaller_min_cpu_capacity(struct sched_group *sg, struct sched_group *ref)  { -	return sg->sgc->min_capacity * capacity_margin < -						ref->sgc->min_capacity * 1024; +	return fits_capacity(sg->sgc->min_capacity, ref->sgc->min_capacity);  }  /* @@ -7993,8 +7935,7 @@ group_smaller_min_cpu_capacity(struct sched_group *sg, struct sched_group *ref)  static inline bool  group_smaller_max_cpu_capacity(struct sched_group *sg, struct sched_group *ref)  { -	return sg->sgc->max_capacity * capacity_margin < -						ref->sgc->max_capacity * 1024; +	return fits_capacity(sg->sgc->max_capacity, ref->sgc->max_capacity);  }  static inline enum @@ -9052,9 +8993,10 @@ more_balance:  out_balanced:  	/*  	 * We reach balance although we may have faced some affinity -	 * constraints. Clear the imbalance flag if it was set. +	 * constraints. Clear the imbalance flag only if other tasks got +	 * a chance to move and fix the imbalance.  	 */ -	if (sd_parent) { +	if (sd_parent && !(env.flags & LBF_ALL_PINNED)) {  		int *group_imbalance = &sd_parent->groups->sgc->imbalance;  		if (*group_imbalance) @@ -9075,10 +9017,10 @@ out_one_pinned:  	ld_moved = 0;  	/* -	 * idle_balance() disregards balance intervals, so we could repeatedly -	 * reach this code, which would lead to balance_interval skyrocketting -	 * in a short amount of time. Skip the balance_interval increase logic -	 * to avoid that. +	 * newidle_balance() disregards balance intervals, so we could +	 * repeatedly reach this code, which would lead to balance_interval +	 * skyrocketting in a short amount of time. Skip the balance_interval +	 * increase logic to avoid that.  	 
*/  	if (env.idle == CPU_NEWLY_IDLE)  		goto out; @@ -9788,7 +9730,7 @@ static inline void nohz_newidle_balance(struct rq *this_rq) { }   * idle_balance is called by schedule() if this_cpu is about to become   * idle. Attempts to pull tasks from other CPUs.   */ -static int idle_balance(struct rq *this_rq, struct rq_flags *rf) +int newidle_balance(struct rq *this_rq, struct rq_flags *rf)  {  	unsigned long next_balance = jiffies + HZ;  	int this_cpu = this_rq->cpu; @@ -9796,6 +9738,7 @@ static int idle_balance(struct rq *this_rq, struct rq_flags *rf)  	int pulled_task = 0;  	u64 curr_cost = 0; +	update_misfit_status(NULL, this_rq);  	/*  	 * We must set idle_stamp _before_ calling idle_balance(), such that we  	 * measure the duration of idle_balance() as idle time. @@ -10180,9 +10123,19 @@ static void switched_to_fair(struct rq *rq, struct task_struct *p)   * This routine is mostly called to set cfs_rq->curr field when a task   * migrates between groups/classes.   */ -static void set_curr_task_fair(struct rq *rq) +static void set_next_task_fair(struct rq *rq, struct task_struct *p)  { -	struct sched_entity *se = &rq->curr->se; +	struct sched_entity *se = &p->se; + +#ifdef CONFIG_SMP +	if (task_on_rq_queued(p)) { +		/* +		 * Move the next running task to the front of the list, so our +		 * cfs_tasks list becomes MRU one. +		 */ +		list_move(&se->group_node, &rq->cfs_tasks); +	} +#endif  	for_each_sched_entity(se) {  		struct cfs_rq *cfs_rq = cfs_rq_of(se); @@ -10300,18 +10253,18 @@ err:  void online_fair_sched_group(struct task_group *tg)  {  	struct sched_entity *se; +	struct rq_flags rf;  	struct rq *rq;  	int i;  	for_each_possible_cpu(i) {  		rq = cpu_rq(i);  		se = tg->se[i]; - -		raw_spin_lock_irq(&rq->lock); +		rq_lock_irq(rq, &rf);  		update_rq_clock(rq);  		attach_entity_cfs_rq(se);  		sync_throttle(tg, i); -		raw_spin_unlock_irq(&rq->lock); +		rq_unlock_irq(rq, &rf);  	}  } @@ -10453,7 +10406,9 @@ const struct sched_class fair_sched_class = {  	.check_preempt_curr	= check_preempt_wakeup,  	.pick_next_task		= pick_next_task_fair, +  	.put_prev_task		= put_prev_task_fair, +	.set_next_task          = set_next_task_fair,  #ifdef CONFIG_SMP  	.select_task_rq		= select_task_rq_fair, @@ -10466,7 +10421,6 @@ const struct sched_class fair_sched_class = {  	.set_cpus_allowed	= set_cpus_allowed_common,  #endif -	.set_curr_task          = set_curr_task_fair,  	.task_tick		= task_tick_fair,  	.task_fork		= task_fork_fair, diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 80940939b733..8dad5aa600ea 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -238,16 +238,16 @@ static void do_idle(void)  	tick_nohz_idle_enter();  	while (!need_resched()) { -		check_pgt_cache();  		rmb(); +		local_irq_disable(); +  		if (cpu_is_offline(cpu)) { -			tick_nohz_idle_stop_tick_protected(); +			tick_nohz_idle_stop_tick();  			cpuhp_report_idle_dead();  			arch_cpu_idle_dead();  		} -		local_irq_disable();  		arch_cpu_idle_enter();  		/* @@ -311,7 +311,7 @@ static enum hrtimer_restart idle_inject_timer_fn(struct hrtimer *timer)  	return HRTIMER_NORESTART;  } -void play_idle(unsigned long duration_ms) +void play_idle(unsigned long duration_us)  {  	struct idle_timer it; @@ -323,7 +323,7 @@ void play_idle(unsigned long duration_ms)  	WARN_ON_ONCE(current->nr_cpus_allowed != 1);  	WARN_ON_ONCE(!(current->flags & PF_KTHREAD));  	WARN_ON_ONCE(!(current->flags & PF_NO_SETAFFINITY)); -	WARN_ON_ONCE(!duration_ms); +	WARN_ON_ONCE(!duration_us);  	rcu_sleep_check();  	preempt_disable(); @@ -333,7 
+333,8 @@ void play_idle(unsigned long duration_ms)  	it.done = 0;  	hrtimer_init_on_stack(&it.timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);  	it.timer.function = idle_inject_timer_fn; -	hrtimer_start(&it.timer, ms_to_ktime(duration_ms), HRTIMER_MODE_REL_PINNED); +	hrtimer_start(&it.timer, ns_to_ktime(duration_us * NSEC_PER_USEC), +		      HRTIMER_MODE_REL_PINNED);  	while (!READ_ONCE(it.done))  		do_idle(); @@ -374,14 +375,27 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int fl  	resched_curr(rq);  } -static struct task_struct * -pick_next_task_idle(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) +static void put_prev_task_idle(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) +{ +} + +static void set_next_task_idle(struct rq *rq, struct task_struct *next)  { -	put_prev_task(rq, prev);  	update_idle_core(rq);  	schedstat_inc(rq->sched_goidle); +} + +static struct task_struct * +pick_next_task_idle(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) +{ +	struct task_struct *next = rq->idle; -	return rq->idle; +	if (prev) +		put_prev_task(rq, prev); + +	set_next_task_idle(rq, next); + +	return next;  }  /* @@ -397,10 +411,6 @@ dequeue_task_idle(struct rq *rq, struct task_struct *p, int flags)  	raw_spin_lock_irq(&rq->lock);  } -static void put_prev_task_idle(struct rq *rq, struct task_struct *prev) -{ -} -  /*   * scheduler tick hitting a task of our scheduling class.   * @@ -413,10 +423,6 @@ static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued)  {  } -static void set_curr_task_idle(struct rq *rq) -{ -} -  static void switched_to_idle(struct rq *rq, struct task_struct *p)  {  	BUG(); @@ -451,13 +457,13 @@ const struct sched_class idle_sched_class = {  	.pick_next_task		= pick_next_task_idle,  	.put_prev_task		= put_prev_task_idle, +	.set_next_task          = set_next_task_idle,  #ifdef CONFIG_SMP  	.select_task_rq		= select_task_rq_idle,  	.set_cpus_allowed	= set_cpus_allowed_common,  #endif -	.set_curr_task          = set_curr_task_idle,  	.task_tick		= task_tick_idle,  	.get_rr_interval	= get_rr_interval_idle, diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c index ccb28085b114..9fcb2a695a41 100644 --- a/kernel/sched/isolation.c +++ b/kernel/sched/isolation.c @@ -22,9 +22,17 @@ EXPORT_SYMBOL_GPL(housekeeping_enabled);  int housekeeping_any_cpu(enum hk_flags flags)  { -	if (static_branch_unlikely(&housekeeping_overridden)) -		if (housekeeping_flags & flags) +	int cpu; + +	if (static_branch_unlikely(&housekeeping_overridden)) { +		if (housekeeping_flags & flags) { +			cpu = sched_numa_find_closest(housekeeping_mask, smp_processor_id()); +			if (cpu < nr_cpu_ids) +				return cpu; +  			return cpumask_any_and(housekeeping_mask, cpu_online_mask); +		} +	}  	return smp_processor_id();  }  EXPORT_SYMBOL_GPL(housekeeping_any_cpu); diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c index aa8d75804108..a39bed2c784f 100644 --- a/kernel/sched/membarrier.c +++ b/kernel/sched/membarrier.c @@ -30,10 +30,42 @@ static void ipi_mb(void *info)  	smp_mb();	/* IPIs should be serializing but paranoid. 
*/  } +static void ipi_sync_rq_state(void *info) +{ +	struct mm_struct *mm = (struct mm_struct *) info; + +	if (current->mm != mm) +		return; +	this_cpu_write(runqueues.membarrier_state, +		       atomic_read(&mm->membarrier_state)); +	/* +	 * Issue a memory barrier after setting +	 * MEMBARRIER_STATE_GLOBAL_EXPEDITED in the current runqueue to +	 * guarantee that no memory access following registration is reordered +	 * before registration. +	 */ +	smp_mb(); +} + +void membarrier_exec_mmap(struct mm_struct *mm) +{ +	/* +	 * Issue a memory barrier before clearing membarrier_state to +	 * guarantee that no memory access prior to exec is reordered after +	 * clearing this state. +	 */ +	smp_mb(); +	atomic_set(&mm->membarrier_state, 0); +	/* +	 * Keep the runqueue membarrier_state in sync with this mm +	 * membarrier_state. +	 */ +	this_cpu_write(runqueues.membarrier_state, 0); +} +  static int membarrier_global_expedited(void)  {  	int cpu; -	bool fallback = false;  	cpumask_var_t tmpmask;  	if (num_online_cpus() == 1) @@ -45,17 +77,11 @@ static int membarrier_global_expedited(void)  	 */  	smp_mb();	/* system call entry is not a mb. */ -	/* -	 * Expedited membarrier commands guarantee that they won't -	 * block, hence the GFP_NOWAIT allocation flag and fallback -	 * implementation. -	 */ -	if (!zalloc_cpumask_var(&tmpmask, GFP_NOWAIT)) { -		/* Fallback for OOM. */ -		fallback = true; -	} +	if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL)) +		return -ENOMEM;  	cpus_read_lock(); +	rcu_read_lock();  	for_each_online_cpu(cpu) {  		struct task_struct *p; @@ -70,23 +96,28 @@ static int membarrier_global_expedited(void)  		if (cpu == raw_smp_processor_id())  			continue; -		rcu_read_lock(); -		p = task_rcu_dereference(&cpu_rq(cpu)->curr); -		if (p && p->mm && (atomic_read(&p->mm->membarrier_state) & -				   MEMBARRIER_STATE_GLOBAL_EXPEDITED)) { -			if (!fallback) -				__cpumask_set_cpu(cpu, tmpmask); -			else -				smp_call_function_single(cpu, ipi_mb, NULL, 1); -		} -		rcu_read_unlock(); -	} -	if (!fallback) { -		preempt_disable(); -		smp_call_function_many(tmpmask, ipi_mb, NULL, 1); -		preempt_enable(); -		free_cpumask_var(tmpmask); +		if (!(READ_ONCE(cpu_rq(cpu)->membarrier_state) & +		    MEMBARRIER_STATE_GLOBAL_EXPEDITED)) +			continue; + +		/* +		 * Skip the CPU if it runs a kernel thread. The scheduler +		 * leaves the prior task mm in place as an optimization when +		 * scheduling a kthread. 
+		 */ +		p = rcu_dereference(cpu_rq(cpu)->curr); +		if (p->flags & PF_KTHREAD) +			continue; + +		__cpumask_set_cpu(cpu, tmpmask);  	} +	rcu_read_unlock(); + +	preempt_disable(); +	smp_call_function_many(tmpmask, ipi_mb, NULL, 1); +	preempt_enable(); + +	free_cpumask_var(tmpmask);  	cpus_read_unlock();  	/* @@ -101,22 +132,22 @@ static int membarrier_global_expedited(void)  static int membarrier_private_expedited(int flags)  {  	int cpu; -	bool fallback = false;  	cpumask_var_t tmpmask; +	struct mm_struct *mm = current->mm;  	if (flags & MEMBARRIER_FLAG_SYNC_CORE) {  		if (!IS_ENABLED(CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE))  			return -EINVAL; -		if (!(atomic_read(&current->mm->membarrier_state) &  		      MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY))  			return -EPERM;  	} else { -		if (!(atomic_read(&current->mm->membarrier_state) &  		      MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY))  			return -EPERM;  	} -	if (num_online_cpus() == 1) +	if (atomic_read(&mm->mm_users) == 1 || num_online_cpus() == 1)  		return 0;  	/* @@ -125,17 +156,11 @@ static int membarrier_private_expedited(int flags)  	 */  	smp_mb();	/* system call entry is not a mb. */ -	/* -	 * Expedited membarrier commands guarantee that they won't -	 * block, hence the GFP_NOWAIT allocation flag and fallback -	 * implementation. -	 */ -	if (!zalloc_cpumask_var(&tmpmask, GFP_NOWAIT)) { -		/* Fallback for OOM. */ -		fallback = true; -	} +	if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL)) +		return -ENOMEM;  	cpus_read_lock(); +	rcu_read_lock();  	for_each_online_cpu(cpu) {  		struct task_struct *p; @@ -150,21 +175,17 @@ static int membarrier_private_expedited(int flags)  		if (cpu == raw_smp_processor_id())  			continue;  		rcu_read_lock(); -		p = task_rcu_dereference(&cpu_rq(cpu)->curr); -		if (p && p->mm == current->mm) { -			if (!fallback) -				__cpumask_set_cpu(cpu, tmpmask); -			else -				smp_call_function_single(cpu, ipi_mb, NULL, 1); -		} -		rcu_read_unlock(); -	} -	if (!fallback) { -		preempt_disable(); -		smp_call_function_many(tmpmask, ipi_mb, NULL, 1); -		preempt_enable(); -		free_cpumask_var(tmpmask); +		p = rcu_dereference(cpu_rq(cpu)->curr); +		if (p && p->mm == mm) +			__cpumask_set_cpu(cpu, tmpmask);  	} +	rcu_read_unlock(); + +	preempt_disable(); +	smp_call_function_many(tmpmask, ipi_mb, NULL, 1); +	preempt_enable(); + +	free_cpumask_var(tmpmask);  	cpus_read_unlock();  	/* @@ -177,32 +198,78 @@ static int membarrier_private_expedited(int flags)  	return 0;  } +static int sync_runqueues_membarrier_state(struct mm_struct *mm) +{ +	int membarrier_state = atomic_read(&mm->membarrier_state); +	cpumask_var_t tmpmask; +	int cpu; + +	if (atomic_read(&mm->mm_users) == 1 || num_online_cpus() == 1) { +		this_cpu_write(runqueues.membarrier_state, membarrier_state); + +		/* +		 * For single mm user, we can simply issue a memory barrier +		 * after setting MEMBARRIER_STATE_GLOBAL_EXPEDITED in the +		 * mm and in the current runqueue to guarantee that no memory +		 * access following registration is reordered before +		 * registration. +		 */ +		smp_mb(); +		return 0; +	} + +	if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL)) +		return -ENOMEM; + +	/* +	 * For mm with multiple users, we need to ensure all future +	 * scheduler executions will observe @mm's new membarrier +	 * state. 
+	 */ +	synchronize_rcu(); + +	/* +	 * For each cpu runqueue, if the task's mm match @mm, ensure that all +	 * @mm's membarrier state set bits are also set in in the runqueue's +	 * membarrier state. This ensures that a runqueue scheduling +	 * between threads which are users of @mm has its membarrier state +	 * updated. +	 */ +	cpus_read_lock(); +	rcu_read_lock(); +	for_each_online_cpu(cpu) { +		struct rq *rq = cpu_rq(cpu); +		struct task_struct *p; + +		p = rcu_dereference(rq->curr); +		if (p && p->mm == mm) +			__cpumask_set_cpu(cpu, tmpmask); +	} +	rcu_read_unlock(); + +	preempt_disable(); +	smp_call_function_many(tmpmask, ipi_sync_rq_state, mm, 1); +	preempt_enable(); + +	free_cpumask_var(tmpmask); +	cpus_read_unlock(); + +	return 0; +} +  static int membarrier_register_global_expedited(void)  {  	struct task_struct *p = current;  	struct mm_struct *mm = p->mm; +	int ret;  	if (atomic_read(&mm->membarrier_state) &  	    MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY)  		return 0;  	atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED, &mm->membarrier_state); -	if (atomic_read(&mm->mm_users) == 1 && get_nr_threads(p) == 1) { -		/* -		 * For single mm user, single threaded process, we can -		 * simply issue a memory barrier after setting -		 * MEMBARRIER_STATE_GLOBAL_EXPEDITED to guarantee that -		 * no memory access following registration is reordered -		 * before registration. -		 */ -		smp_mb(); -	} else { -		/* -		 * For multi-mm user threads, we need to ensure all -		 * future scheduler executions will observe the new -		 * thread flag state for this mm. -		 */ -		synchronize_rcu(); -	} +	ret = sync_runqueues_membarrier_state(mm); +	if (ret) +		return ret;  	atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY,  		  &mm->membarrier_state); @@ -213,12 +280,15 @@ static int membarrier_register_private_expedited(int flags)  {  	struct task_struct *p = current;  	struct mm_struct *mm = p->mm; -	int state = MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY; +	int ready_state = MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY, +	    set_state = MEMBARRIER_STATE_PRIVATE_EXPEDITED, +	    ret;  	if (flags & MEMBARRIER_FLAG_SYNC_CORE) {  		if (!IS_ENABLED(CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE))  			return -EINVAL; -		state = MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY; +		ready_state = +			MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY;  	}  	/* @@ -226,20 +296,15 @@ static int membarrier_register_private_expedited(int flags)  	 * groups, which use the same mm. (CLONE_VM but not  	 * CLONE_THREAD).  	 */ -	if (atomic_read(&mm->membarrier_state) & state) +	if ((atomic_read(&mm->membarrier_state) & ready_state) == ready_state)  		return 0; -	atomic_or(MEMBARRIER_STATE_PRIVATE_EXPEDITED, &mm->membarrier_state);  	if (flags & MEMBARRIER_FLAG_SYNC_CORE) -		atomic_or(MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE, -			  &mm->membarrier_state); -	if (!(atomic_read(&mm->mm_users) == 1 && get_nr_threads(p) == 1)) { -		/* -		 * Ensure all future scheduler executions will observe the -		 * new thread flag state for this process. 
-		 */ -		synchronize_rcu(); -	} -	atomic_or(state, &mm->membarrier_state); +		set_state |= MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE; +	atomic_or(set_state, &mm->membarrier_state); +	ret = sync_runqueues_membarrier_state(mm); +	if (ret) +		return ret; +	atomic_or(ready_state, &mm->membarrier_state);  	return 0;  } @@ -253,8 +318,10 @@ static int membarrier_register_private_expedited(int flags)   * command specified does not exist, not available on the running   * kernel, or if the command argument is invalid, this system call   * returns -EINVAL. For a given command, with flags argument set to 0, - * this system call is guaranteed to always return the same value until - * reboot. + * if this system call returns -ENOSYS or -EINVAL, it is guaranteed to + * always return the same value until reboot. In addition, it can return + * -ENOMEM if there is not enough memory available to perform the system + * call.   *   * All memory accesses performed in program order from each targeted thread   * is guaranteed to be ordered with respect to sys_membarrier(). If we use diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 6e52b67b420e..517e3719027e 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -1198,7 +1198,7 @@ static ssize_t psi_write(struct file *file, const char __user *user_buf,  	if (static_branch_likely(&psi_disabled))  		return -EOPNOTSUPP; -	buf_size = min(nbytes, (sizeof(buf) - 1)); +	buf_size = min(nbytes, sizeof(buf));  	if (copy_from_user(buf, user_buf, buf_size))  		return -EFAULT; diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index a532558a5176..ebaa4e619684 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -45,8 +45,8 @@ void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)  	raw_spin_lock_init(&rt_b->rt_runtime_lock); -	hrtimer_init(&rt_b->rt_period_timer, -			CLOCK_MONOTONIC, HRTIMER_MODE_REL); +	hrtimer_init(&rt_b->rt_period_timer, CLOCK_MONOTONIC, +		     HRTIMER_MODE_REL_HARD);  	rt_b->rt_period_timer.function = sched_rt_period_timer;  } @@ -67,7 +67,8 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b)  		 * to update the period.  		 */  		hrtimer_forward_now(&rt_b->rt_period_timer, ns_to_ktime(0)); -		hrtimer_start_expires(&rt_b->rt_period_timer, HRTIMER_MODE_ABS_PINNED); +		hrtimer_start_expires(&rt_b->rt_period_timer, +				      HRTIMER_MODE_ABS_PINNED_HARD);  	}  	raw_spin_unlock(&rt_b->rt_runtime_lock);  } @@ -1498,12 +1499,22 @@ static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flag  #endif  } -static inline void set_next_task(struct rq *rq, struct task_struct *p) +static inline void set_next_task_rt(struct rq *rq, struct task_struct *p)  {  	p->se.exec_start = rq_clock_task(rq);  	/* The running task is never eligible for pushing */  	dequeue_pushable_task(rq, p); + +	/* +	 * If prev task was rt, put_prev_task() has already updated the +	 * utilization. 
We only care of the case where we start to schedule a +	 * rt task +	 */ +	if (rq->curr->sched_class != &rt_sched_class) +		update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 0); + +	rt_queue_push_tasks(rq);  }  static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq, @@ -1543,56 +1554,19 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)  	struct task_struct *p;  	struct rt_rq *rt_rq = &rq->rt; -	if (need_pull_rt_task(rq, prev)) { -		/* -		 * This is OK, because current is on_cpu, which avoids it being -		 * picked for load-balance and preemption/IRQs are still -		 * disabled avoiding further scheduler activity on it and we're -		 * being very careful to re-start the picking loop. -		 */ -		rq_unpin_lock(rq, rf); -		pull_rt_task(rq); -		rq_repin_lock(rq, rf); -		/* -		 * pull_rt_task() can drop (and re-acquire) rq->lock; this -		 * means a dl or stop task can slip in, in which case we need -		 * to re-start task selection. -		 */ -		if (unlikely((rq->stop && task_on_rq_queued(rq->stop)) || -			     rq->dl.dl_nr_running)) -			return RETRY_TASK; -	} - -	/* -	 * We may dequeue prev's rt_rq in put_prev_task(). -	 * So, we update time before rt_queued check. -	 */ -	if (prev->sched_class == &rt_sched_class) -		update_curr_rt(rq); +	WARN_ON_ONCE(prev || rf);  	if (!rt_rq->rt_queued)  		return NULL; -	put_prev_task(rq, prev); -  	p = _pick_next_task_rt(rq); -	set_next_task(rq, p); - -	rt_queue_push_tasks(rq); - -	/* -	 * If prev task was rt, put_prev_task() has already updated the -	 * utilization. We only care of the case where we start to schedule a -	 * rt task -	 */ -	if (rq->curr->sched_class != &rt_sched_class) -		update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 0); +	set_next_task_rt(rq, p);  	return p;  } -static void put_prev_task_rt(struct rq *rq, struct task_struct *p) +static void put_prev_task_rt(struct rq *rq, struct task_struct *p, struct rq_flags *rf)  {  	update_curr_rt(rq); @@ -1604,6 +1578,18 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)  	 */  	if (on_rt_rq(&p->rt) && p->nr_cpus_allowed > 1)  		enqueue_pushable_task(rq, p); + +	if (rf && !on_rt_rq(&p->rt) && need_pull_rt_task(rq, p)) { +		/* +		 * This is OK, because current is on_cpu, which avoids it being +		 * picked for load-balance and preemption/IRQs are still +		 * disabled avoiding further scheduler activity on it and we've +		 * not yet started the picking loop. 
+		 */ +		rq_unpin_lock(rq, rf); +		pull_rt_task(rq); +		rq_repin_lock(rq, rf); +	}  }  #ifdef CONFIG_SMP @@ -2304,8 +2290,10 @@ static void watchdog(struct rq *rq, struct task_struct *p)  		}  		next = DIV_ROUND_UP(min(soft, hard), USEC_PER_SEC/HZ); -		if (p->rt.timeout > next) -			p->cputime_expires.sched_exp = p->se.sum_exec_runtime; +		if (p->rt.timeout > next) { +			posix_cputimers_rt_watchdog(&p->posix_cputimers, +						    p->se.sum_exec_runtime); +		}  	}  }  #else @@ -2354,11 +2342,6 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)  	}  } -static void set_curr_task_rt(struct rq *rq) -{ -	set_next_task(rq, rq->curr); -} -  static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task)  {  	/* @@ -2380,6 +2363,7 @@ const struct sched_class rt_sched_class = {  	.pick_next_task		= pick_next_task_rt,  	.put_prev_task		= put_prev_task_rt, +	.set_next_task          = set_next_task_rt,  #ifdef CONFIG_SMP  	.select_task_rq		= select_task_rq_rt, @@ -2391,7 +2375,6 @@ const struct sched_class rt_sched_class = {  	.switched_from		= switched_from_rt,  #endif -	.set_curr_task          = set_curr_task_rt,  	.task_tick		= task_tick_rt,  	.get_rr_interval	= get_rr_interval_rt, diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 802b1f3405f2..0db2c1b3361e 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -335,8 +335,6 @@ struct cfs_bandwidth {  	u64			quota;  	u64			runtime;  	s64			hierarchical_quota; -	u64			runtime_expires; -	int			expires_seq;  	u8			idle;  	u8			period_active; @@ -393,6 +391,16 @@ struct task_group {  #endif  	struct cfs_bandwidth	cfs_bandwidth; + +#ifdef CONFIG_UCLAMP_TASK_GROUP +	/* The two decimal precision [%] value requested from user-space */ +	unsigned int		uclamp_pct[UCLAMP_CNT]; +	/* Clamp values requested for a task group */ +	struct uclamp_se	uclamp_req[UCLAMP_CNT]; +	/* Effective clamp values used for a task group */ +	struct uclamp_se	uclamp[UCLAMP_CNT]; +#endif +  };  #ifdef CONFIG_FAIR_GROUP_SCHED @@ -483,7 +491,8 @@ struct cfs_rq {  	struct load_weight	load;  	unsigned long		runnable_weight;  	unsigned int		nr_running; -	unsigned int		h_nr_running; +	unsigned int		h_nr_running;      /* SCHED_{NORMAL,BATCH,IDLE} */ +	unsigned int		idle_h_nr_running; /* SCHED_IDLE */  	u64			exec_clock;  	u64			min_vruntime; @@ -556,8 +565,6 @@ struct cfs_rq {  #ifdef CONFIG_CFS_BANDWIDTH  	int			runtime_enabled; -	int			expires_seq; -	u64			runtime_expires;  	s64			runtime_remaining;  	u64			throttled_clock; @@ -777,9 +784,6 @@ struct root_domain {  	struct perf_domain __rcu *pd;  }; -extern struct root_domain def_root_domain; -extern struct mutex sched_domains_mutex; -  extern void init_defrootdomain(void);  extern int sched_init_domains(const struct cpumask *cpu_map);  extern void rq_attach_root(struct rq *rq, struct root_domain *rd); @@ -907,6 +911,10 @@ struct rq {  	atomic_t		nr_iowait; +#ifdef CONFIG_MEMBARRIER +	int membarrier_state; +#endif +  #ifdef CONFIG_SMP  	struct root_domain		*rd;  	struct sched_domain __rcu	*sd; @@ -1261,16 +1269,18 @@ enum numa_topology_type {  extern enum numa_topology_type sched_numa_topology_type;  extern int sched_max_numa_distance;  extern bool find_numa_distance(int distance); -#endif - -#ifdef CONFIG_NUMA  extern void sched_init_numa(void);  extern void sched_domains_numa_masks_set(unsigned int cpu);  extern void sched_domains_numa_masks_clear(unsigned int cpu); +extern int sched_numa_find_closest(const struct cpumask *cpus, int cpu);  #else  static inline void 
sched_init_numa(void) { }  static inline void sched_domains_numa_masks_set(unsigned int cpu) { }  static inline void sched_domains_numa_masks_clear(unsigned int cpu) { } +static inline int sched_numa_find_closest(const struct cpumask *cpus, int cpu) +{ +	return nr_cpu_ids; +}  #endif  #ifdef CONFIG_NUMA_BALANCING @@ -1449,10 +1459,14 @@ static inline void unregister_sched_domain_sysctl(void)  }  #endif +extern int newidle_balance(struct rq *this_rq, struct rq_flags *rf); +  #else  static inline void sched_ttwu_pending(void) { } +static inline int newidle_balance(struct rq *this_rq, struct rq_flags *rf) { return 0; } +  #endif /* CONFIG_SMP */  #include "stats.h" @@ -1700,17 +1714,21 @@ struct sched_class {  	void (*check_preempt_curr)(struct rq *rq, struct task_struct *p, int flags);  	/* -	 * It is the responsibility of the pick_next_task() method that will -	 * return the next task to call put_prev_task() on the @prev task or -	 * something equivalent. +	 * Both @prev and @rf are optional and may be NULL, in which case the +	 * caller must already have invoked put_prev_task(rq, prev, rf). +	 * +	 * Otherwise it is the responsibility of the pick_next_task() to call +	 * put_prev_task() on the @prev task or something equivalent, IFF it +	 * returns a next task.  	 * -	 * May return RETRY_TASK when it finds a higher prio class has runnable -	 * tasks. +	 * In that case (@rf != NULL) it may return RETRY_TASK when it finds a +	 * higher prio class has runnable tasks.  	 */  	struct task_struct * (*pick_next_task)(struct rq *rq,  					       struct task_struct *prev,  					       struct rq_flags *rf); -	void (*put_prev_task)(struct rq *rq, struct task_struct *p); +	void (*put_prev_task)(struct rq *rq, struct task_struct *p, struct rq_flags *rf); +	void (*set_next_task)(struct rq *rq, struct task_struct *p);  #ifdef CONFIG_SMP  	int  (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags); @@ -1725,7 +1743,6 @@ struct sched_class {  	void (*rq_offline)(struct rq *rq);  #endif -	void (*set_curr_task)(struct rq *rq);  	void (*task_tick)(struct rq *rq, struct task_struct *p, int queued);  	void (*task_fork)(struct task_struct *p);  	void (*task_dead)(struct task_struct *p); @@ -1755,12 +1772,14 @@ struct sched_class {  static inline void put_prev_task(struct rq *rq, struct task_struct *prev)  { -	prev->sched_class->put_prev_task(rq, prev); +	WARN_ON_ONCE(rq->curr != prev); +	prev->sched_class->put_prev_task(rq, prev, NULL);  } -static inline void set_curr_task(struct rq *rq, struct task_struct *curr) +static inline void set_next_task(struct rq *rq, struct task_struct *next)  { -	curr->sched_class->set_curr_task(rq); +	WARN_ON_ONCE(rq->curr != next); +	next->sched_class->set_next_task(rq, next);  }  #ifdef CONFIG_SMP @@ -1943,7 +1962,7 @@ unsigned long arch_scale_freq_capacity(int cpu)  #endif  #ifdef CONFIG_SMP -#ifdef CONFIG_PREEMPT +#ifdef CONFIG_PREEMPTION  static inline void double_rq_lock(struct rq *rq1, struct rq *rq2); @@ -1995,7 +2014,7 @@ static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)  	return ret;  } -#endif /* CONFIG_PREEMPT */ +#endif /* CONFIG_PREEMPTION */  /*   * double_lock_balance - lock the busiest runqueue, this_rq is locked already. 
@@ -2266,7 +2285,7 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {}  #endif /* CONFIG_CPU_FREQ */  #ifdef CONFIG_UCLAMP_TASK -unsigned int uclamp_eff_value(struct task_struct *p, unsigned int clamp_id); +enum uclamp_id uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id);  static __always_inline  unsigned int uclamp_util_with(struct rq *rq, unsigned int util, @@ -2423,3 +2442,33 @@ static inline bool sched_energy_enabled(void)  static inline bool sched_energy_enabled(void) { return false; }  #endif /* CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL */ + +#ifdef CONFIG_MEMBARRIER +/* + * The scheduler provides memory barriers required by membarrier between: + * - prior user-space memory accesses and store to rq->membarrier_state, + * - store to rq->membarrier_state and following user-space memory accesses. + * In the same way it provides those guarantees around store to rq->curr. + */ +static inline void membarrier_switch_mm(struct rq *rq, +					struct mm_struct *prev_mm, +					struct mm_struct *next_mm) +{ +	int membarrier_state; + +	if (prev_mm == next_mm) +		return; + +	membarrier_state = atomic_read(&next_mm->membarrier_state); +	if (READ_ONCE(rq->membarrier_state) == membarrier_state) +		return; + +	WRITE_ONCE(rq->membarrier_state, membarrier_state); +} +#else +static inline void membarrier_switch_mm(struct rq *rq, +					struct mm_struct *prev_mm, +					struct mm_struct *next_mm) +{ +} +#endif diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index aa0de240fb41..ba683fe81a6e 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h @@ -157,9 +157,10 @@ static inline void sched_info_dequeued(struct rq *rq, struct task_struct *t)  {  	unsigned long long now = rq_clock(rq), delta = 0; -	if (unlikely(sched_info_on())) +	if (sched_info_on()) {  		if (t->sched_info.last_queued)  			delta = now - t->sched_info.last_queued; +	}  	sched_info_reset_dequeued(t);  	t->sched_info.run_delay += delta; @@ -192,7 +193,7 @@ static void sched_info_arrive(struct rq *rq, struct task_struct *t)   */  static inline void sched_info_queued(struct rq *rq, struct task_struct *t)  { -	if (unlikely(sched_info_on())) { +	if (sched_info_on()) {  		if (!t->sched_info.last_queued)  			t->sched_info.last_queued = rq_clock(rq);  	} @@ -239,7 +240,7 @@ __sched_info_switch(struct rq *rq, struct task_struct *prev, struct task_struct  static inline void  sched_info_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next)  { -	if (unlikely(sched_info_on())) +	if (sched_info_on())  		__sched_info_switch(rq, prev, next);  } diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index c183b790ca54..7e1cee4e65b2 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c @@ -23,17 +23,22 @@ check_preempt_curr_stop(struct rq *rq, struct task_struct *p, int flags)  	/* we're never preempted */  } +static void set_next_task_stop(struct rq *rq, struct task_struct *stop) +{ +	stop->se.exec_start = rq_clock_task(rq); +} +  static struct task_struct *  pick_next_task_stop(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)  {  	struct task_struct *stop = rq->stop; +	WARN_ON_ONCE(prev || rf); +  	if (!stop || !task_on_rq_queued(stop))  		return NULL; -	put_prev_task(rq, prev); - -	stop->se.exec_start = rq_clock_task(rq); +	set_next_task_stop(rq, stop);  	return stop;  } @@ -55,7 +60,7 @@ static void yield_task_stop(struct rq *rq)  	BUG(); /* the stop task should never yield, its pointless. 
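
/*
 * Editor's standalone model (plain C, not kernel code; every identifier below
 * is invented for this sketch): the hand-off order implied by the reworked
 * sched_class hooks above. The core now calls put_prev_task() itself,
 * pick_next_task() may be invoked with prev == NULL and rf == NULL, and
 * set_next_task() (which replaces set_curr_task()) installs the pick.
 */
#include <stdio.h>

struct rq_m { const char *curr; };

struct sched_class_m {
	void        (*put_prev_task)(struct rq_m *rq, const char *prev);
	const char *(*pick_next_task)(struct rq_m *rq);
	void        (*set_next_task)(struct rq_m *rq, const char *next);
};

static void put_prev(struct rq_m *rq, const char *prev)
{
	(void)rq;
	printf("put_prev_task(%s)\n", prev);
}

static const char *pick_next(struct rq_m *rq)
{
	(void)rq;
	return "rt-task";	/* highest-priority queued task in this model */
}

static void set_next(struct rq_m *rq, const char *next)
{
	rq->curr = next;
	printf("set_next_task(%s)\n", next);
}

int main(void)
{
	struct sched_class_m class = { put_prev, pick_next, set_next };
	struct rq_m rq = { "prev-task" };

	/* New order: put the previous task back first, then ask the class to
	 * pick, then finish the switch via set_next_task(). */
	class.put_prev_task(&rq, rq.curr);
	class.set_next_task(&rq, class.pick_next_task(&rq));
	return 0;
}
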
*/  } -static void put_prev_task_stop(struct rq *rq, struct task_struct *prev) +static void put_prev_task_stop(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)  {  	struct task_struct *curr = rq->curr;  	u64 delta_exec; @@ -86,13 +91,6 @@ static void task_tick_stop(struct rq *rq, struct task_struct *curr, int queued)  {  } -static void set_curr_task_stop(struct rq *rq) -{ -	struct task_struct *stop = rq->stop; - -	stop->se.exec_start = rq_clock_task(rq); -} -  static void switched_to_stop(struct rq *rq, struct task_struct *p)  {  	BUG(); /* its impossible to change to this class */ @@ -128,13 +126,13 @@ const struct sched_class stop_sched_class = {  	.pick_next_task		= pick_next_task_stop,  	.put_prev_task		= put_prev_task_stop, +	.set_next_task          = set_next_task_stop,  #ifdef CONFIG_SMP  	.select_task_rq		= select_task_rq_stop,  	.set_cpus_allowed	= set_cpus_allowed_common,  #endif -	.set_curr_task          = set_curr_task_stop,  	.task_tick		= task_tick_stop,  	.get_rr_interval	= get_rr_interval_stop, diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index f751ce0b783e..b5667a273bf6 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -1284,6 +1284,7 @@ static int			sched_domains_curr_level;  int				sched_max_numa_distance;  static int			*sched_domains_numa_distance;  static struct cpumask		***sched_domains_numa_masks; +int __read_mostly		node_reclaim_distance = RECLAIM_DISTANCE;  #endif  /* @@ -1402,7 +1403,7 @@ sd_init(struct sched_domain_topology_level *tl,  		sd->flags &= ~SD_PREFER_SIBLING;  		sd->flags |= SD_SERIALIZE; -		if (sched_domains_numa_distance[tl->numa_level] > RECLAIM_DISTANCE) { +		if (sched_domains_numa_distance[tl->numa_level] > node_reclaim_distance) {  			sd->flags &= ~(SD_BALANCE_EXEC |  				       SD_BALANCE_FORK |  				       SD_WAKE_AFFINE); @@ -1724,6 +1725,26 @@ void sched_domains_numa_masks_clear(unsigned int cpu)  	}  } +/* + * sched_numa_find_closest() - given the NUMA topology, find the cpu + *                             closest to @cpu from @cpumask. + * cpumask: cpumask to find a cpu from + * cpu: cpu to be close to + * + * returns: cpu, or nr_cpu_ids when nothing found. + */ +int sched_numa_find_closest(const struct cpumask *cpus, int cpu) +{ +	int i, j = cpu_to_node(cpu); + +	for (i = 0; i < sched_domains_numa_levels; i++) { +		cpu = cpumask_any_and(cpus, sched_domains_numa_masks[i][j]); +		if (cpu < nr_cpu_ids) +			return cpu; +	} +	return nr_cpu_ids; +} +  #endif /* CONFIG_NUMA */  static int __sdt_alloc(const struct cpumask *cpu_map) @@ -2149,16 +2170,16 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,   * ndoms_new == 0 is a special case for destroying existing domains,   * and it will not create the default domain.   
* - * Call with hotplug lock held + * Call with hotplug lock and sched_domains_mutex held   */ -void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], -			     struct sched_domain_attr *dattr_new) +void partition_sched_domains_locked(int ndoms_new, cpumask_var_t doms_new[], +				    struct sched_domain_attr *dattr_new)  {  	bool __maybe_unused has_eas = false;  	int i, j, n;  	int new_topology; -	mutex_lock(&sched_domains_mutex); +	lockdep_assert_held(&sched_domains_mutex);  	/* Always unregister in case we don't destroy any domains: */  	unregister_sched_domain_sysctl(); @@ -2183,8 +2204,19 @@ void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],  	for (i = 0; i < ndoms_cur; i++) {  		for (j = 0; j < n && !new_topology; j++) {  			if (cpumask_equal(doms_cur[i], doms_new[j]) && -			    dattrs_equal(dattr_cur, i, dattr_new, j)) +			    dattrs_equal(dattr_cur, i, dattr_new, j)) { +				struct root_domain *rd; + +				/* +				 * This domain won't be destroyed and as such +				 * its dl_bw->total_bw needs to be cleared.  It +				 * will be recomputed in function +				 * update_tasks_root_domain(). +				 */ +				rd = cpu_rq(cpumask_any(doms_cur[i]))->rd; +				dl_clear_root_domain(rd);  				goto match1; +			}  		}  		/* No match - a current sched domain not in new doms_new[] */  		detach_destroy_domains(doms_cur[i]); @@ -2241,6 +2273,15 @@ match3:  	ndoms_cur = ndoms_new;  	register_sched_domain_sysctl(); +} +/* + * Call with hotplug lock held + */ +void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], +			     struct sched_domain_attr *dattr_new) +{ +	mutex_lock(&sched_domains_mutex); +	partition_sched_domains_locked(ndoms_new, doms_new, dattr_new);  	mutex_unlock(&sched_domains_mutex);  } diff --git a/kernel/signal.c b/kernel/signal.c index 534fec266a33..c4da1ef56fdf 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -3678,8 +3678,11 @@ static int copy_siginfo_from_user_any(kernel_siginfo_t *kinfo, siginfo_t *info)  static struct pid *pidfd_to_pid(const struct file *file)  { -	if (file->f_op == &pidfd_fops) -		return file->private_data; +	struct pid *pid; + +	pid = pidfd_pid(file); +	if (!IS_ERR(pid)) +		return pid;  	return tgid_pidfd_to_pid(file);  } diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c index f5440abb7532..6d1f68b7e528 100644 --- a/kernel/stacktrace.c +++ b/kernel/stacktrace.c @@ -20,7 +20,7 @@   * @nr_entries:	Number of entries in the storage array   * @spaces:	Number of leading spaces to print   */ -void stack_trace_print(unsigned long *entries, unsigned int nr_entries, +void stack_trace_print(const unsigned long *entries, unsigned int nr_entries,  		       int spaces)  {  	unsigned int i; @@ -43,7 +43,7 @@ EXPORT_SYMBOL_GPL(stack_trace_print);   *   * Return: Number of bytes printed.   
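
/*
 * Illustrative userspace sketch for the pidfd_to_pid()/pidfd_pid() change
 * above: pidfd_send_signal() is the user-visible path that resolves a pidfd
 * back to a struct pid. No glibc wrappers are assumed; the fallback syscall
 * numbers below are the x86_64 values (assumption).
 */
#define _GNU_SOURCE
#include <sys/syscall.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <signal.h>
#include <unistd.h>
#include <stdio.h>

#ifndef __NR_pidfd_open
#define __NR_pidfd_open		434	/* x86_64 value (assumption) */
#endif
#ifndef __NR_pidfd_send_signal
#define __NR_pidfd_send_signal	424	/* x86_64 value (assumption) */
#endif

int main(void)
{
	pid_t child = fork();

	if (child == 0) {	/* child: wait to be signalled */
		pause();
		_exit(0);
	}

	int pidfd = syscall(__NR_pidfd_open, child, 0);

	if (pidfd < 0 || syscall(__NR_pidfd_send_signal, pidfd, SIGTERM, NULL, 0))
		perror("pidfd");

	waitpid(child, NULL, 0);
	return 0;
}
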
*/ -int stack_trace_snprint(char *buf, size_t size, unsigned long *entries, +int stack_trace_snprint(char *buf, size_t size, const unsigned long *entries,  			unsigned int nr_entries, int spaces)  {  	unsigned int generated, i, total = 0; diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index b4f83f7bdf86..c7031a22aa7b 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -383,6 +383,7 @@ static bool queue_stop_cpus_work(const struct cpumask *cpumask,  	 */  	preempt_disable();  	stop_cpus_in_progress = true; +	barrier();  	for_each_cpu(cpu, cpumask) {  		work = &per_cpu(cpu_stopper.stop_work, cpu);  		work->fn = fn; @@ -391,6 +392,7 @@ static bool queue_stop_cpus_work(const struct cpumask *cpumask,  		if (cpu_stop_queue_work(cpu, work))  			queued = true;  	} +	barrier();  	stop_cpus_in_progress = false;  	preempt_enable(); diff --git a/kernel/sys.c b/kernel/sys.c index 2969304c29fe..a611d1d58c7d 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -103,12 +103,6 @@  #ifndef SET_TSC_CTL  # define SET_TSC_CTL(a)		(-EINVAL)  #endif -#ifndef MPX_ENABLE_MANAGEMENT -# define MPX_ENABLE_MANAGEMENT()	(-EINVAL) -#endif -#ifndef MPX_DISABLE_MANAGEMENT -# define MPX_DISABLE_MANAGEMENT()	(-EINVAL) -#endif  #ifndef GET_FP_MODE  # define GET_FP_MODE(a)		(-EINVAL)  #endif @@ -124,6 +118,12 @@  #ifndef PAC_RESET_KEYS  # define PAC_RESET_KEYS(a, b)	(-EINVAL)  #endif +#ifndef SET_TAGGED_ADDR_CTRL +# define SET_TAGGED_ADDR_CTRL(a)	(-EINVAL) +#endif +#ifndef GET_TAGGED_ADDR_CTRL +# define GET_TAGGED_ADDR_CTRL()		(-EINVAL) +#endif  /*   * this is where the system-wide overflow UID and GID are defined, for @@ -1557,15 +1557,6 @@ int do_prlimit(struct task_struct *tsk, unsigned int resource,  			retval = -EPERM;  		if (!retval)  			retval = security_task_setrlimit(tsk, resource, new_rlim); -		if (resource == RLIMIT_CPU && new_rlim->rlim_cur == 0) { -			/* -			 * The caller is asking for an immediate RLIMIT_CPU -			 * expiry.  But we use the zero value to mean "it was -			 * never set".  So let's cheat and make it one second -			 * instead -			 */ -			new_rlim->rlim_cur = 1; -		}  	}  	if (!retval) {  		if (old_rlim) @@ -1576,10 +1567,9 @@ int do_prlimit(struct task_struct *tsk, unsigned int resource,  	task_unlock(tsk->group_leader);  	/* -	 * RLIMIT_CPU handling.   Note that the kernel fails to return an error -	 * code if it rejected the user's attempt to set RLIMIT_CPU.  This is a -	 * very long-standing error, and fixing it now risks breakage of -	 * applications, so we live with it +	 * RLIMIT_CPU handling. Arm the posix CPU timer if the limit is not +	 * infite. In case of RLIM_INFINITY the posix CPU timer code +	 * ignores the rlimit.  	 
*/  	 if (!retval && new_rlim && resource == RLIMIT_CPU &&  	     new_rlim->rlim_cur != RLIM_INFINITY && @@ -2456,15 +2446,9 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,  		up_write(&me->mm->mmap_sem);  		break;  	case PR_MPX_ENABLE_MANAGEMENT: -		if (arg2 || arg3 || arg4 || arg5) -			return -EINVAL; -		error = MPX_ENABLE_MANAGEMENT(); -		break;  	case PR_MPX_DISABLE_MANAGEMENT: -		if (arg2 || arg3 || arg4 || arg5) -			return -EINVAL; -		error = MPX_DISABLE_MANAGEMENT(); -		break; +		/* No longer implemented: */ +		return -EINVAL;  	case PR_SET_FP_MODE:  		error = SET_FP_MODE(me, arg2);  		break; @@ -2492,6 +2476,16 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,  			return -EINVAL;  		error = PAC_RESET_KEYS(me, arg2);  		break; +	case PR_SET_TAGGED_ADDR_CTRL: +		if (arg3 || arg4 || arg5) +			return -EINVAL; +		error = SET_TAGGED_ADDR_CTRL(arg2); +		break; +	case PR_GET_TAGGED_ADDR_CTRL: +		if (arg2 || arg3 || arg4 || arg5) +			return -EINVAL; +		error = GET_TAGGED_ADDR_CTRL(); +		break;  	default:  		error = -EINVAL;  		break; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 078950d9605b..00fcea236eba 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -264,7 +264,8 @@ extern struct ctl_table epoll_table[];  extern struct ctl_table firmware_config_table[];  #endif -#ifdef HAVE_ARCH_PICK_MMAP_LAYOUT +#if defined(HAVE_ARCH_PICK_MMAP_LAYOUT) || \ +    defined(CONFIG_ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT)  int sysctl_legacy_va_layout;  #endif @@ -1573,7 +1574,8 @@ static struct ctl_table vm_table[] = {  		.proc_handler	= proc_dointvec,  		.extra1		= SYSCTL_ZERO,  	}, -#ifdef HAVE_ARCH_PICK_MMAP_LAYOUT +#if defined(HAVE_ARCH_PICK_MMAP_LAYOUT) || \ +    defined(CONFIG_ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT)  	{  		.procname	= "legacy_va_layout",  		.data		= &sysctl_legacy_va_layout, diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 57518efc3810..451f9d05ccfe 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -97,7 +97,7 @@ static int alarmtimer_rtc_add_device(struct device *dev,  	if (!device_may_wakeup(rtc->dev.parent))  		return -1; -	__ws = wakeup_source_register("alarmtimer"); +	__ws = wakeup_source_register(dev, "alarmtimer");  	spin_lock_irqsave(&rtcdev_lock, flags);  	if (!rtcdev) { @@ -432,7 +432,7 @@ int alarm_cancel(struct alarm *alarm)  		int ret = alarm_try_to_cancel(alarm);  		if (ret >= 0)  			return ret; -		cpu_relax(); +		hrtimer_cancel_wait_running(&alarm->timer);  	}  }  EXPORT_SYMBOL_GPL(alarm_cancel); @@ -606,6 +606,19 @@ static int alarm_timer_try_to_cancel(struct k_itimer *timr)  }  /** + * alarm_timer_wait_running - Posix timer callback to wait for a timer + * @timr:	Pointer to the posixtimer data struct + * + * Called from the core code when timer cancel detected that the callback + * is running. @timr is unlocked and rcu read lock is held to prevent it + * from being freed. 
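
/*
 * Illustrative userspace sketch for the new PR_SET/GET_TAGGED_ADDR_CTRL
 * prctl() options above. The fallback constants are the uapi values
 * (assumption, for <sys/prctl.h> headers that predate them). Only
 * architectures providing SET_TAGGED_ADDR_CTRL (the arm64 tagged address
 * ABI) accept the request; elsewhere the default stubs return -EINVAL.
 */
#define _GNU_SOURCE
#include <sys/prctl.h>
#include <stdio.h>

#ifndef PR_SET_TAGGED_ADDR_CTRL
#define PR_SET_TAGGED_ADDR_CTRL	55
#define PR_GET_TAGGED_ADDR_CTRL	56
#define PR_TAGGED_ADDR_ENABLE	(1UL << 0)
#endif

int main(void)
{
	if (prctl(PR_SET_TAGGED_ADDR_CTRL, PR_TAGGED_ADDR_ENABLE, 0, 0, 0))
		perror("PR_SET_TAGGED_ADDR_CTRL");

	printf("tagged addr ctrl: %d\n",
	       prctl(PR_GET_TAGGED_ADDR_CTRL, 0, 0, 0, 0));
	return 0;
}
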
+ */ +static void alarm_timer_wait_running(struct k_itimer *timr) +{ +	hrtimer_cancel_wait_running(&timr->it.alarm.alarmtimer.timer); +} + +/**   * alarm_timer_arm - Posix timer callback to arm a timer   * @timr:	Pointer to the posixtimer data struct   * @expires:	The new expiry time @@ -672,7 +685,7 @@ static int alarm_timer_create(struct k_itimer *new_timer)  	enum  alarmtimer_type type;  	if (!alarmtimer_get_rtcdev()) -		return -ENOTSUPP; +		return -EOPNOTSUPP;  	if (!capable(CAP_WAKE_ALARM))  		return -EPERM; @@ -790,7 +803,7 @@ static int alarm_timer_nsleep(const clockid_t which_clock, int flags,  	int ret = 0;  	if (!alarmtimer_get_rtcdev()) -		return -ENOTSUPP; +		return -EOPNOTSUPP;  	if (flags & ~TIMER_ABSTIME)  		return -EINVAL; @@ -834,6 +847,7 @@ const struct k_clock alarm_clock = {  	.timer_forward		= alarm_timer_forward,  	.timer_remaining	= alarm_timer_remaining,  	.timer_try_to_cancel	= alarm_timer_try_to_cancel, +	.timer_wait_running	= alarm_timer_wait_running,  	.nsleep			= alarm_timer_nsleep,  };  #endif /* CONFIG_POSIX_TIMERS */ diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 5ee77f1a8a92..0d4dc241c0fb 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -140,6 +140,11 @@ static struct hrtimer_cpu_base migration_cpu_base = {  #define migration_base	migration_cpu_base.clock_base[0] +static inline bool is_migration_base(struct hrtimer_clock_base *base) +{ +	return base == &migration_base; +} +  /*   * We are using hashed locking: holding per_cpu(hrtimer_bases)[n].lock   * means that all timers which are tied to this base via timer->base are @@ -264,6 +269,11 @@ again:  #else /* CONFIG_SMP */ +static inline bool is_migration_base(struct hrtimer_clock_base *base) +{ +	return false; +} +  static inline struct hrtimer_clock_base *  lock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags)  { @@ -427,6 +437,17 @@ void hrtimer_init_on_stack(struct hrtimer *timer, clockid_t clock_id,  }  EXPORT_SYMBOL_GPL(hrtimer_init_on_stack); +static void __hrtimer_init_sleeper(struct hrtimer_sleeper *sl, +				   clockid_t clock_id, enum hrtimer_mode mode); + +void hrtimer_init_sleeper_on_stack(struct hrtimer_sleeper *sl, +				   clockid_t clock_id, enum hrtimer_mode mode) +{ +	debug_object_init_on_stack(&sl->timer, &hrtimer_debug_descr); +	__hrtimer_init_sleeper(sl, clock_id, mode); +} +EXPORT_SYMBOL_GPL(hrtimer_init_sleeper_on_stack); +  void destroy_hrtimer_on_stack(struct hrtimer *timer)  {  	debug_object_free(timer, &hrtimer_debug_descr); @@ -1096,9 +1117,13 @@ void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,  	/*  	 * Check whether the HRTIMER_MODE_SOFT bit and hrtimer.is_soft -	 * match. +	 * match on CONFIG_PREEMPT_RT = n. With PREEMPT_RT check the hard +	 * expiry mode because unmarked timers are moved to softirq expiry.  	 
*/ -	WARN_ON_ONCE(!(mode & HRTIMER_MODE_SOFT) ^ !timer->is_soft); +	if (!IS_ENABLED(CONFIG_PREEMPT_RT)) +		WARN_ON_ONCE(!(mode & HRTIMER_MODE_SOFT) ^ !timer->is_soft); +	else +		WARN_ON_ONCE(!(mode & HRTIMER_MODE_HARD) ^ !timer->is_hard);  	base = lock_hrtimer_base(timer, &flags); @@ -1147,6 +1172,93 @@ int hrtimer_try_to_cancel(struct hrtimer *timer)  }  EXPORT_SYMBOL_GPL(hrtimer_try_to_cancel); +#ifdef CONFIG_PREEMPT_RT +static void hrtimer_cpu_base_init_expiry_lock(struct hrtimer_cpu_base *base) +{ +	spin_lock_init(&base->softirq_expiry_lock); +} + +static void hrtimer_cpu_base_lock_expiry(struct hrtimer_cpu_base *base) +{ +	spin_lock(&base->softirq_expiry_lock); +} + +static void hrtimer_cpu_base_unlock_expiry(struct hrtimer_cpu_base *base) +{ +	spin_unlock(&base->softirq_expiry_lock); +} + +/* + * The counterpart to hrtimer_cancel_wait_running(). + * + * If there is a waiter for cpu_base->expiry_lock, then it was waiting for + * the timer callback to finish. Drop expiry_lock and reaquire it. That + * allows the waiter to acquire the lock and make progress. + */ +static void hrtimer_sync_wait_running(struct hrtimer_cpu_base *cpu_base, +				      unsigned long flags) +{ +	if (atomic_read(&cpu_base->timer_waiters)) { +		raw_spin_unlock_irqrestore(&cpu_base->lock, flags); +		spin_unlock(&cpu_base->softirq_expiry_lock); +		spin_lock(&cpu_base->softirq_expiry_lock); +		raw_spin_lock_irq(&cpu_base->lock); +	} +} + +/* + * This function is called on PREEMPT_RT kernels when the fast path + * deletion of a timer failed because the timer callback function was + * running. + * + * This prevents priority inversion: if the soft irq thread is preempted + * in the middle of a timer callback, then calling del_timer_sync() can + * lead to two issues: + * + *  - If the caller is on a remote CPU then it has to spin wait for the timer + *    handler to complete. This can result in unbound priority inversion. + * + *  - If the caller originates from the task which preempted the timer + *    handler on the same CPU, then spin waiting for the timer handler to + *    complete is never going to end. + */ +void hrtimer_cancel_wait_running(const struct hrtimer *timer) +{ +	/* Lockless read. Prevent the compiler from reloading it below */ +	struct hrtimer_clock_base *base = READ_ONCE(timer->base); + +	/* +	 * Just relax if the timer expires in hard interrupt context or if +	 * it is currently on the migration base. +	 */ +	if (!timer->is_soft || is_migration_base(base)) { +		cpu_relax(); +		return; +	} + +	/* +	 * Mark the base as contended and grab the expiry lock, which is +	 * held by the softirq across the timer callback. Drop the lock +	 * immediately so the softirq can expire the next timer. In theory +	 * the timer could already be running again, but that's more than +	 * unlikely and just causes another wait loop. +	 */ +	atomic_inc(&base->cpu_base->timer_waiters); +	spin_lock_bh(&base->cpu_base->softirq_expiry_lock); +	atomic_dec(&base->cpu_base->timer_waiters); +	spin_unlock_bh(&base->cpu_base->softirq_expiry_lock); +} +#else +static inline void +hrtimer_cpu_base_init_expiry_lock(struct hrtimer_cpu_base *base) { } +static inline void +hrtimer_cpu_base_lock_expiry(struct hrtimer_cpu_base *base) { } +static inline void +hrtimer_cpu_base_unlock_expiry(struct hrtimer_cpu_base *base) { } +static inline void hrtimer_sync_wait_running(struct hrtimer_cpu_base *base, +					     unsigned long flags) { } +#endif +  /**   * hrtimer_cancel - cancel a timer and wait for the handler to finish.   
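
/*
 * Editor's userspace analogue (pthreads, build with -pthread; not kernel
 * code) of the expiry-lock handshake in hrtimer_cancel_wait_running() above:
 * the expiring side holds a lock across callback execution, so a canceller
 * that observes a running callback blocks on that lock instead of
 * spin-waiting, which is what avoids the priority inversion described above.
 */
#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t expiry_lock = PTHREAD_MUTEX_INITIALIZER;

static void *expiry_side(void *arg)
{
	(void)arg;
	pthread_mutex_lock(&expiry_lock);	/* "softirq" expiry section */
	puts("callback running");
	sleep(1);				/* long-running callback */
	pthread_mutex_unlock(&expiry_lock);
	return NULL;
}

static void cancel_wait_running(void)
{
	/* Block until the callback section ends, then release immediately,
	 * mirroring the lock/unlock pair in hrtimer_cancel_wait_running(). */
	pthread_mutex_lock(&expiry_lock);
	pthread_mutex_unlock(&expiry_lock);
	puts("cancel may proceed");
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, expiry_side, NULL);
	usleep(100 * 1000);	/* give the "callback" a head start */
	cancel_wait_running();
	pthread_join(t, NULL);
	return 0;
}
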
* @timer:	the timer to be cancelled @@ -1157,13 +1269,15 @@ EXPORT_SYMBOL_GPL(hrtimer_try_to_cancel);   */  int hrtimer_cancel(struct hrtimer *timer)  { -	for (;;) { -		int ret = hrtimer_try_to_cancel(timer); +	int ret; -		if (ret >= 0) -			return ret; -		cpu_relax(); -	} +	do { +		ret = hrtimer_try_to_cancel(timer); + +		if (ret < 0) +			hrtimer_cancel_wait_running(timer); +	} while (ret < 0); +	return ret;  }  EXPORT_SYMBOL_GPL(hrtimer_cancel); @@ -1260,8 +1374,17 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id,  			   enum hrtimer_mode mode)  {  	bool softtimer = !!(mode & HRTIMER_MODE_SOFT); -	int base = softtimer ? HRTIMER_MAX_CLOCK_BASES / 2 : 0;  	struct hrtimer_cpu_base *cpu_base; +	int base; + +	/* +	 * On PREEMPT_RT enabled kernels hrtimers which are not explicitely +	 * marked for hard interrupt expiry mode are moved into soft +	 * interrupt context for latency reasons and because the callbacks +	 * can invoke functions which might sleep on RT, e.g. spin_lock(). +	 */ +	if (IS_ENABLED(CONFIG_PREEMPT_RT) && !(mode & HRTIMER_MODE_HARD)) +		softtimer = true;  	memset(timer, 0, sizeof(struct hrtimer)); @@ -1275,8 +1398,10 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id,  	if (clock_id == CLOCK_REALTIME && mode & HRTIMER_MODE_REL)  		clock_id = CLOCK_MONOTONIC; +	base = softtimer ? HRTIMER_MAX_CLOCK_BASES / 2 : 0;  	base += hrtimer_clockid_to_base(clock_id);  	timer->is_soft = softtimer; +	timer->is_hard = !softtimer;  	timer->base = &cpu_base->clock_base[base];  	timerqueue_init(&timer->node);  } @@ -1449,6 +1574,8 @@ static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now,  				break;  			__run_hrtimer(cpu_base, base, timer, &basenow, flags); +			if (active_mask == HRTIMER_ACTIVE_SOFT) +				hrtimer_sync_wait_running(cpu_base, flags);  		}  	}  } @@ -1459,6 +1586,7 @@ static __latent_entropy void hrtimer_run_softirq(struct softirq_action *h)  	unsigned long flags;  	ktime_t now; +	hrtimer_cpu_base_lock_expiry(cpu_base);  	raw_spin_lock_irqsave(&cpu_base->lock, flags);  	now = hrtimer_update_base(cpu_base); @@ -1468,6 +1596,7 @@ static __latent_entropy void hrtimer_run_softirq(struct softirq_action *h)  	hrtimer_update_softirq_timer(cpu_base, true);  	raw_spin_unlock_irqrestore(&cpu_base->lock, flags); +	hrtimer_cpu_base_unlock_expiry(cpu_base);  }  #ifdef CONFIG_HIGH_RES_TIMERS @@ -1639,10 +1768,75 @@ static enum hrtimer_restart hrtimer_wakeup(struct hrtimer *timer)  	return HRTIMER_NORESTART;  } -void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, struct task_struct *task) +/** + * hrtimer_sleeper_start_expires - Start a hrtimer sleeper timer + * @sl:		sleeper to be started + * @mode:	timer mode abs/rel + * + * Wrapper around hrtimer_start_expires() for hrtimer_sleeper based timers + * to allow PREEMPT_RT to tweak the delivery mode (soft/hardirq context) + */ +void hrtimer_sleeper_start_expires(struct hrtimer_sleeper *sl, +				   enum hrtimer_mode mode) +{ +	/* +	 * Make the enqueue delivery mode check work on RT. If the sleeper +	 * was initialized for hard interrupt delivery, force the mode bit. +	 * This is a special case for hrtimer_sleepers because +	 * hrtimer_init_sleeper() determines the delivery mode on RT so the +	 * fiddling with this decision is avoided at the call sites. 
+	 */ +	if (IS_ENABLED(CONFIG_PREEMPT_RT) && sl->timer.is_hard) +		mode |= HRTIMER_MODE_HARD; + +	hrtimer_start_expires(&sl->timer, mode); +} +EXPORT_SYMBOL_GPL(hrtimer_sleeper_start_expires); + +static void __hrtimer_init_sleeper(struct hrtimer_sleeper *sl, +				   clockid_t clock_id, enum hrtimer_mode mode)  { +	/* +	 * On PREEMPT_RT enabled kernels hrtimers which are not explicitely +	 * marked for hard interrupt expiry mode are moved into soft +	 * interrupt context either for latency reasons or because the +	 * hrtimer callback takes regular spinlocks or invokes other +	 * functions which are not suitable for hard interrupt context on +	 * PREEMPT_RT. +	 * +	 * The hrtimer_sleeper callback is RT compatible in hard interrupt +	 * context, but there is a latency concern: Untrusted userspace can +	 * spawn many threads which arm timers for the same expiry time on +	 * the same CPU. That causes a latency spike due to the wakeup of +	 * a gazillion threads. +	 * +	 * OTOH, priviledged real-time user space applications rely on the +	 * low latency of hard interrupt wakeups. If the current task is in +	 * a real-time scheduling class, mark the mode for hard interrupt +	 * expiry. +	 */ +	if (IS_ENABLED(CONFIG_PREEMPT_RT)) { +		if (task_is_realtime(current) && !(mode & HRTIMER_MODE_SOFT)) +			mode |= HRTIMER_MODE_HARD; +	} + +	__hrtimer_init(&sl->timer, clock_id, mode);  	sl->timer.function = hrtimer_wakeup; -	sl->task = task; +	sl->task = current; +} + +/** + * hrtimer_init_sleeper - initialize sleeper to the given clock + * @sl:		sleeper to be initialized + * @clock_id:	the clock to be used + * @mode:	timer mode abs/rel + */ +void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, clockid_t clock_id, +			  enum hrtimer_mode mode) +{ +	debug_init(&sl->timer, clock_id, mode); +	__hrtimer_init_sleeper(sl, clock_id, mode); +  }  EXPORT_SYMBOL_GPL(hrtimer_init_sleeper); @@ -1669,11 +1863,9 @@ static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mod  {  	struct restart_block *restart; -	hrtimer_init_sleeper(t, current); -  	do {  		set_current_state(TASK_INTERRUPTIBLE); -		hrtimer_start_expires(&t->timer, mode); +		hrtimer_sleeper_start_expires(t, mode);  		if (likely(t->task))  			freezable_schedule(); @@ -1707,10 +1899,9 @@ static long __sched hrtimer_nanosleep_restart(struct restart_block *restart)  	struct hrtimer_sleeper t;  	int ret; -	hrtimer_init_on_stack(&t.timer, restart->nanosleep.clockid, -				HRTIMER_MODE_ABS); +	hrtimer_init_sleeper_on_stack(&t, restart->nanosleep.clockid, +				      HRTIMER_MODE_ABS);  	hrtimer_set_expires_tv64(&t.timer, restart->nanosleep.expires); -  	ret = do_nanosleep(&t, HRTIMER_MODE_ABS);  	destroy_hrtimer_on_stack(&t.timer);  	return ret; @@ -1728,7 +1919,7 @@ long hrtimer_nanosleep(const struct timespec64 *rqtp,  	if (dl_task(current) || rt_task(current))  		slack = 0; -	hrtimer_init_on_stack(&t.timer, clockid, mode); +	hrtimer_init_sleeper_on_stack(&t, clockid, mode);  	hrtimer_set_expires_range_ns(&t.timer, timespec64_to_ktime(*rqtp), slack);  	ret = do_nanosleep(&t, mode);  	if (ret != -ERESTART_RESTARTBLOCK) @@ -1809,6 +2000,7 @@ int hrtimers_prepare_cpu(unsigned int cpu)  	cpu_base->softirq_next_timer = NULL;  	cpu_base->expires_next = KTIME_MAX;  	cpu_base->softirq_expires_next = KTIME_MAX; +	hrtimer_cpu_base_init_expiry_lock(cpu_base);  	return 0;  } @@ -1927,12 +2119,9 @@ schedule_hrtimeout_range_clock(ktime_t *expires, u64 delta,  		return -EINTR;  	} -	hrtimer_init_on_stack(&t.timer, clock_id, mode); +	
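
/*
 * Illustrative userspace sketch: an absolute clock_nanosleep(), whose kernel
 * path (hrtimer_nanosleep() / do_nanosleep() above) now goes through
 * hrtimer_init_sleeper_on_stack() and hrtimer_sleeper_start_expires().
 */
#include <stdio.h>
#include <string.h>
#include <time.h>

int main(void)
{
	struct timespec deadline;
	int err;

	clock_gettime(CLOCK_MONOTONIC, &deadline);
	deadline.tv_sec += 1;			/* sleep until now + 1s */

	/* Relative sleeps interrupted by a signal resume through
	 * hrtimer_nanosleep_restart(); TIMER_ABSTIME sleeps simply restart
	 * with the same absolute expiry. */
	err = clock_nanosleep(CLOCK_MONOTONIC, TIMER_ABSTIME, &deadline, NULL);
	if (err)
		fprintf(stderr, "clock_nanosleep: %s\n", strerror(err));
	return 0;
}
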
hrtimer_init_sleeper_on_stack(&t, clock_id, mode);  	hrtimer_set_expires_range_ns(&t.timer, *expires, delta); - -	hrtimer_init_sleeper(&t, current); - -	hrtimer_start_expires(&t.timer, mode); +	hrtimer_sleeper_start_expires(&t, mode);  	if (likely(t.task))  		schedule(); diff --git a/kernel/time/itimer.c b/kernel/time/itimer.c index 02068b2d5862..77f1e5635cc1 100644 --- a/kernel/time/itimer.c +++ b/kernel/time/itimer.c @@ -55,15 +55,10 @@ static void get_cpu_itimer(struct task_struct *tsk, unsigned int clock_id,  	val = it->expires;  	interval = it->incr;  	if (val) { -		struct task_cputime cputime; -		u64 t; +		u64 t, samples[CPUCLOCK_MAX]; -		thread_group_cputimer(tsk, &cputime); -		if (clock_id == CPUCLOCK_PROF) -			t = cputime.utime + cputime.stime; -		else -			/* CPUCLOCK_VIRT */ -			t = cputime.utime; +		thread_group_sample_cputime(tsk, samples); +		t = samples[clock_id];  		if (val < t)  			/* about to fire */ @@ -213,6 +208,7 @@ again:  		/* We are sharing ->siglock with it_real_fn() */  		if (hrtimer_try_to_cancel(timer) < 0) {  			spin_unlock_irq(&tsk->sighand->siglock); +			hrtimer_cancel_wait_running(timer);  			goto again;  		}  		expires = timeval_to_ktime(value->it_value); diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 0a426f4e3125..92a431981b1c 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -20,11 +20,20 @@  static void posix_cpu_timer_rearm(struct k_itimer *timer); +void posix_cputimers_group_init(struct posix_cputimers *pct, u64 cpu_limit) +{ +	posix_cputimers_init(pct); +	if (cpu_limit != RLIM_INFINITY) { +		pct->bases[CPUCLOCK_PROF].nextevt = cpu_limit * NSEC_PER_SEC; +		pct->timers_active = true; +	} +} +  /*   * Called after updating RLIMIT_CPU to run cpu timer and update - * tsk->signal->cputime_expires expiration cache if necessary. Needs - * siglock protection since other code may update expiration cache as - * well. + * tsk->signal->posix_cputimers.bases[clock].nextevt expiration cache if + * necessary. Needs siglock protection since other code may update the + * expiration cache as well.   */  void update_rlimit_cpu(struct task_struct *task, unsigned long rlim_new)  { @@ -35,46 +44,97 @@ void update_rlimit_cpu(struct task_struct *task, unsigned long rlim_new)  	spin_unlock_irq(&task->sighand->siglock);  } -static int check_clock(const clockid_t which_clock) +/* + * Functions for validating access to tasks. + */ +static struct task_struct *lookup_task(const pid_t pid, bool thread, +				       bool gettime)  { -	int error = 0;  	struct task_struct *p; -	const pid_t pid = CPUCLOCK_PID(which_clock); - -	if (CPUCLOCK_WHICH(which_clock) >= CPUCLOCK_MAX) -		return -EINVAL; -	if (pid == 0) -		return 0; +	/* +	 * If the encoded PID is 0, then the timer is targeted at current +	 * or the process to which current belongs. +	 */ +	if (!pid) +		return thread ? current : current->group_leader; -	rcu_read_lock();  	p = find_task_by_vpid(pid); -	if (!p || !(CPUCLOCK_PERTHREAD(which_clock) ? -		   same_thread_group(p, current) : has_group_leader_pid(p))) { -		error = -EINVAL; +	if (!p) +		return p; + +	if (thread) +		return same_thread_group(p, current) ? p : NULL; + +	if (gettime) { +		/* +		 * For clock_gettime(PROCESS) the task does not need to be +		 * the actual group leader. tsk->sighand gives +		 * access to the group's clock. +		 * +		 * Timers need the group leader because they take a +		 * reference on it and store the task pointer until the +		 * timer is destroyed. 
+		 */ +		return (p == current || thread_group_leader(p)) ? p : NULL;  	} + +	/* +	 * For processes require that p is group leader. +	 */ +	return has_group_leader_pid(p) ? p : NULL; +} + +static struct task_struct *__get_task_for_clock(const clockid_t clock, +						bool getref, bool gettime) +{ +	const bool thread = !!CPUCLOCK_PERTHREAD(clock); +	const pid_t pid = CPUCLOCK_PID(clock); +	struct task_struct *p; + +	if (CPUCLOCK_WHICH(clock) >= CPUCLOCK_MAX) +		return NULL; + +	rcu_read_lock(); +	p = lookup_task(pid, thread, gettime); +	if (p && getref) +		get_task_struct(p);  	rcu_read_unlock(); +	return p; +} -	return error; +static inline struct task_struct *get_task_for_clock(const clockid_t clock) +{ +	return __get_task_for_clock(clock, true, false); +} + +static inline struct task_struct *get_task_for_clock_get(const clockid_t clock) +{ +	return __get_task_for_clock(clock, true, true); +} + +static inline int validate_clock_permissions(const clockid_t clock) +{ +	return __get_task_for_clock(clock, false, false) ? 0 : -EINVAL;  }  /*   * Update expiry time from increment, and increase overrun count,   * given the current clock sample.   */ -static void bump_cpu_timer(struct k_itimer *timer, u64 now) +static u64 bump_cpu_timer(struct k_itimer *timer, u64 now)  { +	u64 delta, incr, expires = timer->it.cpu.node.expires;  	int i; -	u64 delta, incr;  	if (!timer->it_interval) -		return; +		return expires; -	if (now < timer->it.cpu.expires) -		return; +	if (now < expires) +		return expires;  	incr = timer->it_interval; -	delta = now + incr - timer->it.cpu.expires; +	delta = now + incr - expires;  	/* Don't use (incr*2 < delta), incr*2 might overflow. */  	for (i = 0; incr < delta - incr; i++) @@ -84,48 +144,26 @@ static void bump_cpu_timer(struct k_itimer *timer, u64 now)  		if (delta < incr)  			continue; -		timer->it.cpu.expires += incr; +		timer->it.cpu.node.expires += incr;  		timer->it_overrun += 1LL << i;  		delta -= incr;  	} +	return timer->it.cpu.node.expires;  } -/** - * task_cputime_zero - Check a task_cputime struct for all zero fields. - * - * @cputime:	The struct to compare. - * - * Checks @cputime to see if all fields are zero.  Returns true if all fields - * are zero, false if any field is nonzero. - */ -static inline int task_cputime_zero(const struct task_cputime *cputime) +/* Check whether all cache entries contain U64_MAX, i.e. 
eternal expiry time */ +static inline bool expiry_cache_is_inactive(const struct posix_cputimers *pct)  { -	if (!cputime->utime && !cputime->stime && !cputime->sum_exec_runtime) -		return 1; -	return 0; -} - -static inline u64 prof_ticks(struct task_struct *p) -{ -	u64 utime, stime; - -	task_cputime(p, &utime, &stime); - -	return utime + stime; -} -static inline u64 virt_ticks(struct task_struct *p) -{ -	u64 utime, stime; - -	task_cputime(p, &utime, &stime); - -	return utime; +	return !(~pct->bases[CPUCLOCK_PROF].nextevt | +		 ~pct->bases[CPUCLOCK_VIRT].nextevt | +		 ~pct->bases[CPUCLOCK_SCHED].nextevt);  }  static int  posix_cpu_clock_getres(const clockid_t which_clock, struct timespec64 *tp)  { -	int error = check_clock(which_clock); +	int error = validate_clock_permissions(which_clock); +  	if (!error) {  		tp->tv_sec = 0;  		tp->tv_nsec = ((NSEC_PER_SEC + HZ - 1) / HZ); @@ -142,42 +180,66 @@ posix_cpu_clock_getres(const clockid_t which_clock, struct timespec64 *tp)  }  static int -posix_cpu_clock_set(const clockid_t which_clock, const struct timespec64 *tp) +posix_cpu_clock_set(const clockid_t clock, const struct timespec64 *tp)  { +	int error = validate_clock_permissions(clock); +  	/*  	 * You can never reset a CPU clock, but we check for other errors  	 * in the call before failing with EPERM.  	 */ -	int error = check_clock(which_clock); -	if (error == 0) { -		error = -EPERM; -	} -	return error; +	return error ? : -EPERM;  } -  /* - * Sample a per-thread clock for the given task. + * Sample a per-thread clock for the given task. clkid is validated.   */ -static int cpu_clock_sample(const clockid_t which_clock, -			    struct task_struct *p, u64 *sample) +static u64 cpu_clock_sample(const clockid_t clkid, struct task_struct *p)  { -	switch (CPUCLOCK_WHICH(which_clock)) { -	default: -		return -EINVAL; +	u64 utime, stime; + +	if (clkid == CPUCLOCK_SCHED) +		return task_sched_runtime(p); + +	task_cputime(p, &utime, &stime); + +	switch (clkid) {  	case CPUCLOCK_PROF: -		*sample = prof_ticks(p); -		break; +		return utime + stime;  	case CPUCLOCK_VIRT: -		*sample = virt_ticks(p); -		break; -	case CPUCLOCK_SCHED: -		*sample = task_sched_runtime(p); -		break; +		return utime; +	default: +		WARN_ON_ONCE(1);  	}  	return 0;  } +static inline void store_samples(u64 *samples, u64 stime, u64 utime, u64 rtime) +{ +	samples[CPUCLOCK_PROF] = stime + utime; +	samples[CPUCLOCK_VIRT] = utime; +	samples[CPUCLOCK_SCHED] = rtime; +} + +static void task_sample_cputime(struct task_struct *p, u64 *samples) +{ +	u64 stime, utime; + +	task_cputime(p, &utime, &stime); +	store_samples(samples, stime, utime, p->se.sum_exec_runtime); +} + +static void proc_sample_cputime_atomic(struct task_cputime_atomic *at, +				       u64 *samples) +{ +	u64 stime, utime, rtime; + +	utime = atomic64_read(&at->utime); +	stime = atomic64_read(&at->stime); +	rtime = atomic64_read(&at->sum_exec_runtime); +	store_samples(samples, stime, utime, rtime); +} +  /*   * Set cputime to sum_cputime if sum_cputime > cputime. Use cmpxchg   * to avoid race conditions with concurrent updates to cputime. 
@@ -193,29 +255,56 @@ retry:  	}  } -static void update_gt_cputime(struct task_cputime_atomic *cputime_atomic, struct task_cputime *sum) +static void update_gt_cputime(struct task_cputime_atomic *cputime_atomic, +			      struct task_cputime *sum)  {  	__update_gt_cputime(&cputime_atomic->utime, sum->utime);  	__update_gt_cputime(&cputime_atomic->stime, sum->stime);  	__update_gt_cputime(&cputime_atomic->sum_exec_runtime, sum->sum_exec_runtime);  } -/* Sample task_cputime_atomic values in "atomic_timers", store results in "times". */ -static inline void sample_cputime_atomic(struct task_cputime *times, -					 struct task_cputime_atomic *atomic_times) +/** + * thread_group_sample_cputime - Sample cputime for a given task + * @tsk:	Task for which cputime needs to be started + * @iimes:	Storage for time samples + * + * Called from sys_getitimer() to calculate the expiry time of an active + * timer. That means group cputime accounting is already active. Called + * with task sighand lock held. + * + * Updates @times with an uptodate sample of the thread group cputimes. + */ +void thread_group_sample_cputime(struct task_struct *tsk, u64 *samples)  { -	times->utime = atomic64_read(&atomic_times->utime); -	times->stime = atomic64_read(&atomic_times->stime); -	times->sum_exec_runtime = atomic64_read(&atomic_times->sum_exec_runtime); +	struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; +	struct posix_cputimers *pct = &tsk->signal->posix_cputimers; + +	WARN_ON_ONCE(!pct->timers_active); + +	proc_sample_cputime_atomic(&cputimer->cputime_atomic, samples);  } -void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times) +/** + * thread_group_start_cputime - Start cputime and return a sample + * @tsk:	Task for which cputime needs to be started + * @samples:	Storage for time samples + * + * The thread group cputime accouting is avoided when there are no posix + * CPU timers armed. Before starting a timer it's required to check whether + * the time accounting is active. If not, a full update of the atomic + * accounting store needs to be done and the accounting enabled. + * + * Updates @times with an uptodate sample of the thread group cputimes. + */ +static void thread_group_start_cputime(struct task_struct *tsk, u64 *samples)  {  	struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; -	struct task_cputime sum; +	struct posix_cputimers *pct = &tsk->signal->posix_cputimers;  	/* Check if cputimer isn't running. This is accessed without locking. */ -	if (!READ_ONCE(cputimer->running)) { +	if (!READ_ONCE(pct->timers_active)) { +		struct task_cputime sum; +  		/*  		 * The POSIX timer interface allows for absolute time expiry  		 * values through the TIMER_ABSTIME flag, therefore we have @@ -225,94 +314,69 @@ void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times)  		update_gt_cputime(&cputimer->cputime_atomic, &sum);  		/* -		 * We're setting cputimer->running without a lock. Ensure -		 * this only gets written to in one operation. We set -		 * running after update_gt_cputime() as a small optimization, -		 * but barriers are not required because update_gt_cputime() +		 * We're setting timers_active without a lock. Ensure this +		 * only gets written to in one operation. We set it after +		 * update_gt_cputime() as a small optimization, but +		 * barriers are not required because update_gt_cputime()  		 * can handle concurrent updates.  		 
*/ -		WRITE_ONCE(cputimer->running, true); +		WRITE_ONCE(pct->timers_active, true);  	} -	sample_cputime_atomic(times, &cputimer->cputime_atomic); +	proc_sample_cputime_atomic(&cputimer->cputime_atomic, samples);  } -/* - * Sample a process (thread group) clock for the given group_leader task. - * Must be called with task sighand lock held for safe while_each_thread() - * traversal. - */ -static int cpu_clock_sample_group(const clockid_t which_clock, -				  struct task_struct *p, -				  u64 *sample) +static void __thread_group_cputime(struct task_struct *tsk, u64 *samples)  { -	struct task_cputime cputime; +	struct task_cputime ct; -	switch (CPUCLOCK_WHICH(which_clock)) { -	default: -		return -EINVAL; -	case CPUCLOCK_PROF: -		thread_group_cputime(p, &cputime); -		*sample = cputime.utime + cputime.stime; -		break; -	case CPUCLOCK_VIRT: -		thread_group_cputime(p, &cputime); -		*sample = cputime.utime; -		break; -	case CPUCLOCK_SCHED: -		thread_group_cputime(p, &cputime); -		*sample = cputime.sum_exec_runtime; -		break; -	} -	return 0; +	thread_group_cputime(tsk, &ct); +	store_samples(samples, ct.stime, ct.utime, ct.sum_exec_runtime);  } -static int posix_cpu_clock_get_task(struct task_struct *tsk, -				    const clockid_t which_clock, -				    struct timespec64 *tp) +/* + * Sample a process (thread group) clock for the given task clkid. If the + * group's cputime accounting is already enabled, read the atomic + * store. Otherwise a full update is required.  Task's sighand lock must be + * held to protect the task traversal on a full update. clkid is already + * validated. + */ +static u64 cpu_clock_sample_group(const clockid_t clkid, struct task_struct *p, +				  bool start)  { -	int err = -EINVAL; -	u64 rtn; +	struct thread_group_cputimer *cputimer = &p->signal->cputimer; +	struct posix_cputimers *pct = &p->signal->posix_cputimers; +	u64 samples[CPUCLOCK_MAX]; -	if (CPUCLOCK_PERTHREAD(which_clock)) { -		if (same_thread_group(tsk, current)) -			err = cpu_clock_sample(which_clock, tsk, &rtn); +	if (!READ_ONCE(pct->timers_active)) { +		if (start) +			thread_group_start_cputime(p, samples); +		else +			__thread_group_cputime(p, samples);  	} else { -		if (tsk == current || thread_group_leader(tsk)) -			err = cpu_clock_sample_group(which_clock, tsk, &rtn); +		proc_sample_cputime_atomic(&cputimer->cputime_atomic, samples);  	} -	if (!err) -		*tp = ns_to_timespec64(rtn); - -	return err; +	return samples[clkid];  } - -static int posix_cpu_clock_get(const clockid_t which_clock, struct timespec64 *tp) +static int posix_cpu_clock_get(const clockid_t clock, struct timespec64 *tp)  { -	const pid_t pid = CPUCLOCK_PID(which_clock); -	int err = -EINVAL; +	const clockid_t clkid = CPUCLOCK_WHICH(clock); +	struct task_struct *tsk; +	u64 t; -	if (pid == 0) { -		/* -		 * Special case constant value for our own clocks. -		 * We don't have to do any lookup to find ourselves. -		 */ -		err = posix_cpu_clock_get_task(current, which_clock, tp); -	} else { -		/* -		 * Find the given PID, and validate that the caller -		 * should be able to see it. 
-		 */ -		struct task_struct *p; -		rcu_read_lock(); -		p = find_task_by_vpid(pid); -		if (p) -			err = posix_cpu_clock_get_task(p, which_clock, tp); -		rcu_read_unlock(); -	} +	tsk = get_task_for_clock_get(clock); +	if (!tsk) +		return -EINVAL; -	return err; +	if (CPUCLOCK_PERTHREAD(clock)) +		t = cpu_clock_sample(clkid, tsk); +	else +		t = cpu_clock_sample_group(clkid, tsk, false); +	put_task_struct(tsk); + +	*tp = ns_to_timespec64(t); +	return 0;  }  /* @@ -322,44 +386,15 @@ static int posix_cpu_clock_get(const clockid_t which_clock, struct timespec64 *t   */  static int posix_cpu_timer_create(struct k_itimer *new_timer)  { -	int ret = 0; -	const pid_t pid = CPUCLOCK_PID(new_timer->it_clock); -	struct task_struct *p; +	struct task_struct *p = get_task_for_clock(new_timer->it_clock); -	if (CPUCLOCK_WHICH(new_timer->it_clock) >= CPUCLOCK_MAX) +	if (!p)  		return -EINVAL;  	new_timer->kclock = &clock_posix_cpu; - -	INIT_LIST_HEAD(&new_timer->it.cpu.entry); - -	rcu_read_lock(); -	if (CPUCLOCK_PERTHREAD(new_timer->it_clock)) { -		if (pid == 0) { -			p = current; -		} else { -			p = find_task_by_vpid(pid); -			if (p && !same_thread_group(p, current)) -				p = NULL; -		} -	} else { -		if (pid == 0) { -			p = current->group_leader; -		} else { -			p = find_task_by_vpid(pid); -			if (p && !has_group_leader_pid(p)) -				p = NULL; -		} -	} +	timerqueue_init(&new_timer->it.cpu.node);  	new_timer->it.cpu.task = p; -	if (p) { -		get_task_struct(p); -	} else { -		ret = -EINVAL; -	} -	rcu_read_unlock(); - -	return ret; +	return 0;  }  /* @@ -370,12 +405,14 @@ static int posix_cpu_timer_create(struct k_itimer *new_timer)   */  static int posix_cpu_timer_del(struct k_itimer *timer)  { -	int ret = 0; -	unsigned long flags; +	struct cpu_timer *ctmr = &timer->it.cpu; +	struct task_struct *p = ctmr->task;  	struct sighand_struct *sighand; -	struct task_struct *p = timer->it.cpu.task; +	unsigned long flags; +	int ret = 0; -	WARN_ON_ONCE(p == NULL); +	if (WARN_ON_ONCE(!p)) +		return -EINVAL;  	/*  	 * Protect against sighand release/switch in exit/exec and process/ @@ -384,15 +421,15 @@ static int posix_cpu_timer_del(struct k_itimer *timer)  	sighand = lock_task_sighand(p, &flags);  	if (unlikely(sighand == NULL)) {  		/* -		 * We raced with the reaping of the task. -		 * The deletion should have cleared us off the list. +		 * This raced with the reaping of the task. The exit cleanup +		 * should have removed this timer from the timer queue.  		 */ -		WARN_ON_ONCE(!list_empty(&timer->it.cpu.entry)); +		WARN_ON_ONCE(ctmr->head || timerqueue_node_queued(&ctmr->node));  	} else {  		if (timer->it.cpu.firing)  			ret = TIMER_RETRY;  		else -			list_del(&timer->it.cpu.entry); +			cpu_timer_dequeue(ctmr);  		unlock_task_sighand(p, &flags);  	} @@ -403,25 +440,30 @@ static int posix_cpu_timer_del(struct k_itimer *timer)  	return ret;  } -static void cleanup_timers_list(struct list_head *head) +static void cleanup_timerqueue(struct timerqueue_head *head)  { -	struct cpu_timer_list *timer, *next; +	struct timerqueue_node *node; +	struct cpu_timer *ctmr; -	list_for_each_entry_safe(timer, next, head, entry) -		list_del_init(&timer->entry); +	while ((node = timerqueue_getnext(head))) { +		timerqueue_del(head, node); +		ctmr = container_of(node, struct cpu_timer, node); +		ctmr->head = NULL; +	}  }  /* - * Clean out CPU timers still ticking when a thread exited.  The task - * pointer is cleared, and the expiry time is replaced with the residual - * time for later timer_gettime calls to return. 
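
/*
 * Illustrative userspace sketch for the reworked posix_cpu_clock_get() /
 * get_task_for_clock_get() above: reading the caller's own thread and
 * process CPU clocks, and a specific process's clock via
 * clock_getcpuclockid() (older glibc may need -lrt).
 */
#include <stdio.h>
#include <time.h>
#include <unistd.h>

int main(void)
{
	struct timespec ts;
	clockid_t clk;

	/* Per-thread and per-process clocks of the caller itself. */
	clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts);
	clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &ts);
	printf("process cpu: %ld.%09lds\n", (long)ts.tv_sec, ts.tv_nsec);

	/* Clock of a specific process; the kernel validates visibility in
	 * get_task_for_clock_get() before sampling. */
	if (clock_getcpuclockid(getpid(), &clk) == 0 &&
	    clock_gettime(clk, &ts) == 0)
		printf("via clockid:  %ld.%09lds\n", (long)ts.tv_sec, ts.tv_nsec);
	return 0;
}
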
+ * Clean out CPU timers which are still armed when a thread exits. The + * timers are only removed from the list. No other updates are done. The + * corresponding posix timers are still accessible, but cannot be rearmed. + *   * This must be called with the siglock held.   */ -static void cleanup_timers(struct list_head *head) +static void cleanup_timers(struct posix_cputimers *pct)  { -	cleanup_timers_list(head); -	cleanup_timers_list(++head); -	cleanup_timers_list(++head); +	cleanup_timerqueue(&pct->bases[CPUCLOCK_PROF].tqhead); +	cleanup_timerqueue(&pct->bases[CPUCLOCK_VIRT].tqhead); +	cleanup_timerqueue(&pct->bases[CPUCLOCK_SCHED].tqhead);  }  /* @@ -431,16 +473,11 @@ static void cleanup_timers(struct list_head *head)   */  void posix_cpu_timers_exit(struct task_struct *tsk)  { -	cleanup_timers(tsk->cpu_timers); +	cleanup_timers(&tsk->posix_cputimers);  }  void posix_cpu_timers_exit_group(struct task_struct *tsk)  { -	cleanup_timers(tsk->signal->cpu_timers); -} - -static inline int expires_gt(u64 expires, u64 new_exp) -{ -	return expires == 0 || expires > new_exp; +	cleanup_timers(&tsk->signal->posix_cputimers);  }  /* @@ -449,58 +486,33 @@ static inline int expires_gt(u64 expires, u64 new_exp)   */  static void arm_timer(struct k_itimer *timer)  { -	struct task_struct *p = timer->it.cpu.task; -	struct list_head *head, *listpos; -	struct task_cputime *cputime_expires; -	struct cpu_timer_list *const nt = &timer->it.cpu; -	struct cpu_timer_list *next; - -	if (CPUCLOCK_PERTHREAD(timer->it_clock)) { -		head = p->cpu_timers; -		cputime_expires = &p->cputime_expires; -	} else { -		head = p->signal->cpu_timers; -		cputime_expires = &p->signal->cputime_expires; -	} -	head += CPUCLOCK_WHICH(timer->it_clock); - -	listpos = head; -	list_for_each_entry(next, head, entry) { -		if (nt->expires < next->expires) -			break; -		listpos = &next->entry; -	} -	list_add(&nt->entry, listpos); - -	if (listpos == head) { -		u64 exp = nt->expires; +	int clkidx = CPUCLOCK_WHICH(timer->it_clock); +	struct cpu_timer *ctmr = &timer->it.cpu; +	u64 newexp = cpu_timer_getexpires(ctmr); +	struct task_struct *p = ctmr->task; +	struct posix_cputimer_base *base; + +	if (CPUCLOCK_PERTHREAD(timer->it_clock)) +		base = p->posix_cputimers.bases + clkidx; +	else +		base = p->signal->posix_cputimers.bases + clkidx; + +	if (!cpu_timer_enqueue(&base->tqhead, ctmr)) +		return; -		/* -		 * We are the new earliest-expiring POSIX 1.b timer, hence -		 * need to update expiration cache. Take into account that -		 * for process timers we share expiration cache with itimers -		 * and RLIMIT_CPU and for thread timers with RLIMIT_RTTIME. -		 */ +	/* +	 * We are the new earliest-expiring POSIX 1.b timer, hence +	 * need to update expiration cache. Take into account that +	 * for process timers we share expiration cache with itimers +	 * and RLIMIT_CPU and for thread timers with RLIMIT_RTTIME. 
+	 */ +	if (newexp < base->nextevt) +		base->nextevt = newexp; -		switch (CPUCLOCK_WHICH(timer->it_clock)) { -		case CPUCLOCK_PROF: -			if (expires_gt(cputime_expires->prof_exp, exp)) -				cputime_expires->prof_exp = exp; -			break; -		case CPUCLOCK_VIRT: -			if (expires_gt(cputime_expires->virt_exp, exp)) -				cputime_expires->virt_exp = exp; -			break; -		case CPUCLOCK_SCHED: -			if (expires_gt(cputime_expires->sched_exp, exp)) -				cputime_expires->sched_exp = exp; -			break; -		} -		if (CPUCLOCK_PERTHREAD(timer->it_clock)) -			tick_dep_set_task(p, TICK_DEP_BIT_POSIX_TIMER); -		else -			tick_dep_set_signal(p->signal, TICK_DEP_BIT_POSIX_TIMER); -	} +	if (CPUCLOCK_PERTHREAD(timer->it_clock)) +		tick_dep_set_task(p, TICK_DEP_BIT_POSIX_TIMER); +	else +		tick_dep_set_signal(p->signal, TICK_DEP_BIT_POSIX_TIMER);  }  /* @@ -508,24 +520,26 @@ static void arm_timer(struct k_itimer *timer)   */  static void cpu_timer_fire(struct k_itimer *timer)  { +	struct cpu_timer *ctmr = &timer->it.cpu; +  	if ((timer->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE) {  		/*  		 * User don't want any signal.  		 */ -		timer->it.cpu.expires = 0; +		cpu_timer_setexpires(ctmr, 0);  	} else if (unlikely(timer->sigq == NULL)) {  		/*  		 * This a special case for clock_nanosleep,  		 * not a normal timer from sys_timer_create.  		 */  		wake_up_process(timer->it_process); -		timer->it.cpu.expires = 0; +		cpu_timer_setexpires(ctmr, 0);  	} else if (!timer->it_interval) {  		/*  		 * One-shot timer.  Clear it as soon as it's fired.  		 */  		posix_timer_event(timer, 0); -		timer->it.cpu.expires = 0; +		cpu_timer_setexpires(ctmr, 0);  	} else if (posix_timer_event(timer, ++timer->it_requeue_pending)) {  		/*  		 * The signal did not get queued because the signal @@ -539,33 +553,6 @@ static void cpu_timer_fire(struct k_itimer *timer)  }  /* - * Sample a process (thread group) timer for the given group_leader task. - * Must be called with task sighand lock held for safe while_each_thread() - * traversal. - */ -static int cpu_timer_sample_group(const clockid_t which_clock, -				  struct task_struct *p, u64 *sample) -{ -	struct task_cputime cputime; - -	thread_group_cputimer(p, &cputime); -	switch (CPUCLOCK_WHICH(which_clock)) { -	default: -		return -EINVAL; -	case CPUCLOCK_PROF: -		*sample = cputime.utime + cputime.stime; -		break; -	case CPUCLOCK_VIRT: -		*sample = cputime.utime; -		break; -	case CPUCLOCK_SCHED: -		*sample = cputime.sum_exec_runtime; -		break; -	} -	return 0; -} - -/*   * Guts of sys_timer_settime for CPU timers.   * This is called with the timer locked and interrupts disabled.   
* If we return TIMER_RETRY, it's necessary to release the timer's lock @@ -574,13 +561,16 @@ static int cpu_timer_sample_group(const clockid_t which_clock,  static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags,  			       struct itimerspec64 *new, struct itimerspec64 *old)  { -	unsigned long flags; -	struct sighand_struct *sighand; -	struct task_struct *p = timer->it.cpu.task; +	clockid_t clkid = CPUCLOCK_WHICH(timer->it_clock);  	u64 old_expires, new_expires, old_incr, val; -	int ret; +	struct cpu_timer *ctmr = &timer->it.cpu; +	struct task_struct *p = ctmr->task; +	struct sighand_struct *sighand; +	unsigned long flags; +	int ret = 0; -	WARN_ON_ONCE(p == NULL); +	if (WARN_ON_ONCE(!p)) +		return -EINVAL;  	/*  	 * Use the to_ktime conversion because that clamps the maximum @@ -597,22 +587,21 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags,  	 * If p has just been reaped, we can no  	 * longer get any information about it at all.  	 */ -	if (unlikely(sighand == NULL)) { +	if (unlikely(sighand == NULL))  		return -ESRCH; -	}  	/*  	 * Disarm any old timer after extracting its expiry time.  	 */ - -	ret = 0;  	old_incr = timer->it_interval; -	old_expires = timer->it.cpu.expires; +	old_expires = cpu_timer_getexpires(ctmr); +  	if (unlikely(timer->it.cpu.firing)) {  		timer->it.cpu.firing = -1;  		ret = TIMER_RETRY; -	} else -		list_del_init(&timer->it.cpu.entry); +	} else { +		cpu_timer_dequeue(ctmr); +	}  	/*  	 * We need to sample the current value to convert the new @@ -622,11 +611,10 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags,  	 * times (in arm_timer).  With an absolute time, we must  	 * check if it's already passed.  In short, we need a sample.  	 */ -	if (CPUCLOCK_PERTHREAD(timer->it_clock)) { -		cpu_clock_sample(timer->it_clock, p, &val); -	} else { -		cpu_timer_sample_group(timer->it_clock, p, &val); -	} +	if (CPUCLOCK_PERTHREAD(timer->it_clock)) +		val = cpu_clock_sample(clkid, p); +	else +		val = cpu_clock_sample_group(clkid, p, true);  	if (old) {  		if (old_expires == 0) { @@ -634,18 +622,16 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags,  			old->it_value.tv_nsec = 0;  		} else {  			/* -			 * Update the timer in case it has -			 * overrun already.  If it has, -			 * we'll report it as having overrun -			 * and with the next reloaded timer -			 * already ticking, though we are -			 * swallowing that pending -			 * notification here to install the -			 * new setting. +			 * Update the timer in case it has overrun already. +			 * If it has, we'll report it as having overrun and +			 * with the next reloaded timer already ticking, +			 * though we are swallowing that pending +			 * notification here to install the new setting.  			 */ -			bump_cpu_timer(timer, val); -			if (val < timer->it.cpu.expires) { -				old_expires = timer->it.cpu.expires - val; +			u64 exp = bump_cpu_timer(timer, val); + +			if (val < exp) { +				old_expires = exp - val;  				old->it_value = ns_to_timespec64(old_expires);  			} else {  				old->it_value.tv_nsec = 1; @@ -674,7 +660,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags,  	 * For a timer with no notification action, we don't actually  	 * arm the timer (we'll just fake it for timer_gettime).  	 
*/ -	timer->it.cpu.expires = new_expires; +	cpu_timer_setexpires(ctmr, new_expires);  	if (new_expires != 0 && val < new_expires) {  		arm_timer(timer);  	} @@ -715,24 +701,27 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags,  static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec64 *itp)  { -	u64 now; -	struct task_struct *p = timer->it.cpu.task; +	clockid_t clkid = CPUCLOCK_WHICH(timer->it_clock); +	struct cpu_timer *ctmr = &timer->it.cpu; +	u64 now, expires = cpu_timer_getexpires(ctmr); +	struct task_struct *p = ctmr->task; -	WARN_ON_ONCE(p == NULL); +	if (WARN_ON_ONCE(!p)) +		return;  	/*  	 * Easy part: convert the reload time.  	 */  	itp->it_interval = ktime_to_timespec64(timer->it_interval); -	if (!timer->it.cpu.expires) +	if (!expires)  		return;  	/*  	 * Sample the clock to take the difference with the expiry time.  	 */  	if (CPUCLOCK_PERTHREAD(timer->it_clock)) { -		cpu_clock_sample(timer->it_clock, p, &now); +		now = cpu_clock_sample(clkid, p);  	} else {  		struct sighand_struct *sighand;  		unsigned long flags; @@ -747,18 +736,18 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec64 *itp  			/*  			 * The process has been reaped.  			 * We can't even collect a sample any more. -			 * Call the timer disarmed, nothing else to do. +			 * Disarm the timer, nothing else to do.  			 */ -			timer->it.cpu.expires = 0; +			cpu_timer_setexpires(ctmr, 0);  			return;  		} else { -			cpu_timer_sample_group(timer->it_clock, p, &now); +			now = cpu_clock_sample_group(clkid, p, false);  			unlock_task_sighand(p, &flags);  		}  	} -	if (now < timer->it.cpu.expires) { -		itp->it_value = ns_to_timespec64(timer->it.cpu.expires - now); +	if (now < expires) { +		itp->it_value = ns_to_timespec64(expires - now);  	} else {  		/*  		 * The timer should have expired already, but the firing @@ -769,26 +758,42 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec64 *itp  	}  } -static unsigned long long -check_timers_list(struct list_head *timers, -		  struct list_head *firing, -		  unsigned long long curr) -{ -	int maxfire = 20; +#define MAX_COLLECTED	20 -	while (!list_empty(timers)) { -		struct cpu_timer_list *t; +static u64 collect_timerqueue(struct timerqueue_head *head, +			      struct list_head *firing, u64 now) +{ +	struct timerqueue_node *next; +	int i = 0; + +	while ((next = timerqueue_getnext(head))) { +		struct cpu_timer *ctmr; +		u64 expires; + +		ctmr = container_of(next, struct cpu_timer, node); +		expires = cpu_timer_getexpires(ctmr); +		/* Limit the number of timers to expire at once */ +		if (++i == MAX_COLLECTED || now < expires) +			return expires; + +		ctmr->firing = 1; +		cpu_timer_dequeue(ctmr); +		list_add_tail(&ctmr->elist, firing); +	} -		t = list_first_entry(timers, struct cpu_timer_list, entry); +	return U64_MAX; +} -		if (!--maxfire || curr < t->expires) -			return t->expires; +static void collect_posix_cputimers(struct posix_cputimers *pct, u64 *samples, +				    struct list_head *firing) +{ +	struct posix_cputimer_base *base = pct->bases; +	int i; -		t->firing = 1; -		list_move_tail(&t->entry, firing); +	for (i = 0; i < CPUCLOCK_MAX; i++, base++) { +		base->nextevt = collect_timerqueue(&base->tqhead, firing, +						    samples[i]);  	} - -	return 0;  }  static inline void check_dl_overrun(struct task_struct *tsk) @@ -799,6 +804,20 @@ static inline void check_dl_overrun(struct task_struct *tsk)  	}  } +static bool check_rlimit(u64 time, u64 limit, int signo, bool rt, bool hard) +{ 
+	if (time < limit) +		return false; + +	if (print_fatal_signals) { +		pr_info("%s Watchdog Timeout (%s): %s[%d]\n", +			rt ? "RT" : "CPU", hard ? "hard" : "soft", +			current->comm, task_pid_nr(current)); +	} +	__group_send_sig_info(signo, SEND_SIG_PRIV, current); +	return true; +} +  /*   * Check for any per-thread CPU timers that have fired and move them off   * the tsk->cpu_timers[N] list onto the firing list.  Here we update the @@ -807,76 +826,50 @@ static inline void check_dl_overrun(struct task_struct *tsk)  static void check_thread_timers(struct task_struct *tsk,  				struct list_head *firing)  { -	struct list_head *timers = tsk->cpu_timers; -	struct task_cputime *tsk_expires = &tsk->cputime_expires; -	u64 expires; +	struct posix_cputimers *pct = &tsk->posix_cputimers; +	u64 samples[CPUCLOCK_MAX];  	unsigned long soft;  	if (dl_task(tsk))  		check_dl_overrun(tsk); -	/* -	 * If cputime_expires is zero, then there are no active -	 * per thread CPU timers. -	 */ -	if (task_cputime_zero(&tsk->cputime_expires)) +	if (expiry_cache_is_inactive(pct))  		return; -	expires = check_timers_list(timers, firing, prof_ticks(tsk)); -	tsk_expires->prof_exp = expires; - -	expires = check_timers_list(++timers, firing, virt_ticks(tsk)); -	tsk_expires->virt_exp = expires; - -	tsk_expires->sched_exp = check_timers_list(++timers, firing, -						   tsk->se.sum_exec_runtime); +	task_sample_cputime(tsk, samples); +	collect_posix_cputimers(pct, samples, firing);  	/*  	 * Check for the special case thread timers.  	 */  	soft = task_rlimit(tsk, RLIMIT_RTTIME);  	if (soft != RLIM_INFINITY) { +		/* Task RT timeout is accounted in jiffies. RTTIME is usec */ +		unsigned long rttime = tsk->rt.timeout * (USEC_PER_SEC / HZ);  		unsigned long hard = task_rlimit_max(tsk, RLIMIT_RTTIME); +		/* At the hard limit, send SIGKILL. No further action. */  		if (hard != RLIM_INFINITY && -		    tsk->rt.timeout > DIV_ROUND_UP(hard, USEC_PER_SEC/HZ)) { -			/* -			 * At the hard limit, we just die. -			 * No need to calculate anything else now. -			 */ -			if (print_fatal_signals) { -				pr_info("CPU Watchdog Timeout (hard): %s[%d]\n", -					tsk->comm, task_pid_nr(tsk)); -			} -			__group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk); +		    check_rlimit(rttime, hard, SIGKILL, true, true))  			return; -		} -		if (tsk->rt.timeout > DIV_ROUND_UP(soft, USEC_PER_SEC/HZ)) { -			/* -			 * At the soft limit, send a SIGXCPU every second. -			 */ -			if (soft < hard) { -				soft += USEC_PER_SEC; -				tsk->signal->rlim[RLIMIT_RTTIME].rlim_cur = -					soft; -			} -			if (print_fatal_signals) { -				pr_info("RT Watchdog Timeout (soft): %s[%d]\n", -					tsk->comm, task_pid_nr(tsk)); -			} -			__group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk); + +		/* At the soft limit, send a SIGXCPU every second */ +		if (check_rlimit(rttime, soft, SIGXCPU, true, false)) { +			soft += USEC_PER_SEC; +			tsk->signal->rlim[RLIMIT_RTTIME].rlim_cur = soft;  		}  	} -	if (task_cputime_zero(tsk_expires)) + +	if (expiry_cache_is_inactive(pct))  		tick_dep_clear_task(tsk, TICK_DEP_BIT_POSIX_TIMER);  }  static inline void stop_process_timers(struct signal_struct *sig)  { -	struct thread_group_cputimer *cputimer = &sig->cputimer; +	struct posix_cputimers *pct = &sig->posix_cputimers; -	/* Turn off cputimer->running. This is done without locking. */ -	WRITE_ONCE(cputimer->running, false); +	/* Turn off the active flag. This is done without locking. 
*/ +	WRITE_ONCE(pct->timers_active, false);  	tick_dep_clear_signal(sig, TICK_DEP_BIT_POSIX_TIMER);  } @@ -898,7 +891,7 @@ static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it,  		__group_send_sig_info(signo, SEND_SIG_PRIV, tsk);  	} -	if (it->expires && (!*expires || it->expires < *expires)) +	if (it->expires && it->expires < *expires)  		*expires = it->expires;  } @@ -911,87 +904,69 @@ static void check_process_timers(struct task_struct *tsk,  				 struct list_head *firing)  {  	struct signal_struct *const sig = tsk->signal; -	u64 utime, ptime, virt_expires, prof_expires; -	u64 sum_sched_runtime, sched_expires; -	struct list_head *timers = sig->cpu_timers; -	struct task_cputime cputime; +	struct posix_cputimers *pct = &sig->posix_cputimers; +	u64 samples[CPUCLOCK_MAX];  	unsigned long soft;  	/* -	 * If cputimer is not running, then there are no active -	 * process wide timers (POSIX 1.b, itimers, RLIMIT_CPU). +	 * If there are no active process wide timers (POSIX 1.b, itimers, +	 * RLIMIT_CPU) nothing to check. Also skip the process wide timer +	 * processing when there is already another task handling them.  	 */ -	if (!READ_ONCE(tsk->signal->cputimer.running)) +	if (!READ_ONCE(pct->timers_active) || pct->expiry_active)  		return; -        /* +	/*  	 * Signify that a thread is checking for process timers.  	 * Write access to this field is protected by the sighand lock.  	 */ -	sig->cputimer.checking_timer = true; +	pct->expiry_active = true;  	/* -	 * Collect the current process totals. +	 * Collect the current process totals. Group accounting is active +	 * so the sample can be taken directly.  	 */ -	thread_group_cputimer(tsk, &cputime); -	utime = cputime.utime; -	ptime = utime + cputime.stime; -	sum_sched_runtime = cputime.sum_exec_runtime; - -	prof_expires = check_timers_list(timers, firing, ptime); -	virt_expires = check_timers_list(++timers, firing, utime); -	sched_expires = check_timers_list(++timers, firing, sum_sched_runtime); +	proc_sample_cputime_atomic(&sig->cputimer.cputime_atomic, samples); +	collect_posix_cputimers(pct, samples, firing);  	/*  	 * Check for the special case process timers.  	 */ -	check_cpu_itimer(tsk, &sig->it[CPUCLOCK_PROF], &prof_expires, ptime, -			 SIGPROF); -	check_cpu_itimer(tsk, &sig->it[CPUCLOCK_VIRT], &virt_expires, utime, -			 SIGVTALRM); +	check_cpu_itimer(tsk, &sig->it[CPUCLOCK_PROF], +			 &pct->bases[CPUCLOCK_PROF].nextevt, +			 samples[CPUCLOCK_PROF], SIGPROF); +	check_cpu_itimer(tsk, &sig->it[CPUCLOCK_VIRT], +			 &pct->bases[CPUCLOCK_VIRT].nextevt, +			 samples[CPUCLOCK_VIRT], SIGVTALRM); +  	soft = task_rlimit(tsk, RLIMIT_CPU);  	if (soft != RLIM_INFINITY) { -		unsigned long psecs = div_u64(ptime, NSEC_PER_SEC); +		/* RLIMIT_CPU is in seconds. Samples are nanoseconds */  		unsigned long hard = task_rlimit_max(tsk, RLIMIT_CPU); -		u64 x; -		if (psecs >= hard) { -			/* -			 * At the hard limit, we just die. -			 * No need to calculate anything else now. -			 */ -			if (print_fatal_signals) { -				pr_info("RT Watchdog Timeout (hard): %s[%d]\n", -					tsk->comm, task_pid_nr(tsk)); -			} -			__group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk); +		u64 ptime = samples[CPUCLOCK_PROF]; +		u64 softns = (u64)soft * NSEC_PER_SEC; +		u64 hardns = (u64)hard * NSEC_PER_SEC; + +		/* At the hard limit, send SIGKILL. No further action. 
*/ +		if (hard != RLIM_INFINITY && +		    check_rlimit(ptime, hardns, SIGKILL, false, true))  			return; + +		/* At the soft limit, send a SIGXCPU every second */ +		if (check_rlimit(ptime, softns, SIGXCPU, false, false)) { +			sig->rlim[RLIMIT_CPU].rlim_cur = soft + 1; +			softns += NSEC_PER_SEC;  		} -		if (psecs >= soft) { -			/* -			 * At the soft limit, send a SIGXCPU every second. -			 */ -			if (print_fatal_signals) { -				pr_info("CPU Watchdog Timeout (soft): %s[%d]\n", -					tsk->comm, task_pid_nr(tsk)); -			} -			__group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk); -			if (soft < hard) { -				soft++; -				sig->rlim[RLIMIT_CPU].rlim_cur = soft; -			} -		} -		x = soft * NSEC_PER_SEC; -		if (!prof_expires || x < prof_expires) -			prof_expires = x; + +		/* Update the expiry cache */ +		if (softns < pct->bases[CPUCLOCK_PROF].nextevt) +			pct->bases[CPUCLOCK_PROF].nextevt = softns;  	} -	sig->cputime_expires.prof_exp = prof_expires; -	sig->cputime_expires.virt_exp = virt_expires; -	sig->cputime_expires.sched_exp = sched_expires; -	if (task_cputime_zero(&sig->cputime_expires)) +	if (expiry_cache_is_inactive(pct))  		stop_process_timers(sig); -	sig->cputimer.checking_timer = false; +	pct->expiry_active = false;  }  /* @@ -1000,18 +975,21 @@ static void check_process_timers(struct task_struct *tsk,   */  static void posix_cpu_timer_rearm(struct k_itimer *timer)  { +	clockid_t clkid = CPUCLOCK_WHICH(timer->it_clock); +	struct cpu_timer *ctmr = &timer->it.cpu; +	struct task_struct *p = ctmr->task;  	struct sighand_struct *sighand;  	unsigned long flags; -	struct task_struct *p = timer->it.cpu.task;  	u64 now; -	WARN_ON_ONCE(p == NULL); +	if (WARN_ON_ONCE(!p)) +		return;  	/*  	 * Fetch the current sample and update the timer's expiry time.  	 */  	if (CPUCLOCK_PERTHREAD(timer->it_clock)) { -		cpu_clock_sample(timer->it_clock, p, &now); +		now = cpu_clock_sample(clkid, p);  		bump_cpu_timer(timer, now);  		if (unlikely(p->exit_state))  			return; @@ -1031,13 +1009,13 @@ static void posix_cpu_timer_rearm(struct k_itimer *timer)  			 * The process has been reaped.  			 * We can't even collect a sample any more.  			 */ -			timer->it.cpu.expires = 0; +			cpu_timer_setexpires(ctmr, 0);  			return;  		} else if (unlikely(p->exit_state) && thread_group_empty(p)) {  			/* If the process is dying, no need to rearm */  			goto unlock;  		} -		cpu_timer_sample_group(timer->it_clock, p, &now); +		now = cpu_clock_sample_group(clkid, p, true);  		bump_cpu_timer(timer, now);  		/* Leave the sighand locked for the call below.  */  	} @@ -1051,26 +1029,24 @@ unlock:  }  /** - * task_cputime_expired - Compare two task_cputime entities. + * task_cputimers_expired - Check whether posix CPU timers are expired   * - * @sample:	The task_cputime structure to be checked for expiration. - * @expires:	Expiration times, against which @sample will be checked. + * @samples:	Array of current samples for the CPUCLOCK clocks + * @pct:	Pointer to a posix_cputimers container   * - * Checks @sample against @expires to see if any field of @sample has expired. - * Returns true if any field of the former is greater than the corresponding - * field of the latter if the latter field is set.  Otherwise returns false. + * Returns true if any member of @samples is greater than the corresponding + * member of @pct->bases[CLK].nextevt. 
False otherwise   */ -static inline int task_cputime_expired(const struct task_cputime *sample, -					const struct task_cputime *expires) +static inline bool +task_cputimers_expired(const u64 *sample, struct posix_cputimers *pct)  { -	if (expires->utime && sample->utime >= expires->utime) -		return 1; -	if (expires->stime && sample->utime + sample->stime >= expires->stime) -		return 1; -	if (expires->sum_exec_runtime != 0 && -	    sample->sum_exec_runtime >= expires->sum_exec_runtime) -		return 1; -	return 0; +	int i; + +	for (i = 0; i < CPUCLOCK_MAX; i++) { +		if (sample[i] >= pct->bases[i].nextevt) +			return true; +	} +	return false;  }  /** @@ -1083,48 +1059,50 @@ static inline int task_cputime_expired(const struct task_cputime *sample,   * timers and compare them with the corresponding expiration times.  Return   * true if a timer has expired, else return false.   */ -static inline int fastpath_timer_check(struct task_struct *tsk) +static inline bool fastpath_timer_check(struct task_struct *tsk)  { +	struct posix_cputimers *pct = &tsk->posix_cputimers;  	struct signal_struct *sig; -	if (!task_cputime_zero(&tsk->cputime_expires)) { -		struct task_cputime task_sample; +	if (!expiry_cache_is_inactive(pct)) { +		u64 samples[CPUCLOCK_MAX]; -		task_cputime(tsk, &task_sample.utime, &task_sample.stime); -		task_sample.sum_exec_runtime = tsk->se.sum_exec_runtime; -		if (task_cputime_expired(&task_sample, &tsk->cputime_expires)) -			return 1; +		task_sample_cputime(tsk, samples); +		if (task_cputimers_expired(samples, pct)) +			return true;  	}  	sig = tsk->signal; +	pct = &sig->posix_cputimers;  	/* -	 * Check if thread group timers expired when the cputimer is -	 * running and no other thread in the group is already checking -	 * for thread group cputimers. These fields are read without the -	 * sighand lock. However, this is fine because this is meant to -	 * be a fastpath heuristic to determine whether we should try to -	 * acquire the sighand lock to check/handle timers. +	 * Check if thread group timers expired when timers are active and +	 * no other thread in the group is already handling expiry for +	 * thread group cputimers. These fields are read without the +	 * sighand lock. However, this is fine because this is meant to be +	 * a fastpath heuristic to determine whether we should try to +	 * acquire the sighand lock to handle timer expiry.  	 * -	 * In the worst case scenario, if 'running' or 'checking_timer' gets -	 * set but the current thread doesn't see the change yet, we'll wait -	 * until the next thread in the group gets a scheduler interrupt to -	 * handle the timer. This isn't an issue in practice because these -	 * types of delays with signals actually getting sent are expected. +	 * In the worst case scenario, if concurrently timers_active is set +	 * or expiry_active is cleared, but the current thread doesn't see +	 * the change yet, the timer checks are delayed until the next +	 * thread in the group gets a scheduler interrupt to handle the +	 * timer. This isn't an issue in practice because these types of +	 * delays with signals actually getting sent are expected.  	 
*/ -	if (READ_ONCE(sig->cputimer.running) && -	    !READ_ONCE(sig->cputimer.checking_timer)) { -		struct task_cputime group_sample; +	if (READ_ONCE(pct->timers_active) && !READ_ONCE(pct->expiry_active)) { +		u64 samples[CPUCLOCK_MAX]; -		sample_cputime_atomic(&group_sample, &sig->cputimer.cputime_atomic); +		proc_sample_cputime_atomic(&sig->cputimer.cputime_atomic, +					   samples); -		if (task_cputime_expired(&group_sample, &sig->cputime_expires)) -			return 1; +		if (task_cputimers_expired(samples, pct)) +			return true;  	}  	if (dl_task(tsk) && tsk->dl.dl_overrun) -		return 1; +		return true; -	return 0; +	return false;  }  /* @@ -1132,11 +1110,12 @@ static inline int fastpath_timer_check(struct task_struct *tsk)   * already updated our counts.  We need to check if any timers fire now.   * Interrupts are disabled.   */ -void run_posix_cpu_timers(struct task_struct *tsk) +void run_posix_cpu_timers(void)  { -	LIST_HEAD(firing); +	struct task_struct *tsk = current;  	struct k_itimer *timer, *next;  	unsigned long flags; +	LIST_HEAD(firing);  	lockdep_assert_irqs_disabled(); @@ -1174,11 +1153,11 @@ void run_posix_cpu_timers(struct task_struct *tsk)  	 * each timer's lock before clearing its firing flag, so no  	 * timer call will interfere.  	 */ -	list_for_each_entry_safe(timer, next, &firing, it.cpu.entry) { +	list_for_each_entry_safe(timer, next, &firing, it.cpu.elist) {  		int cpu_firing;  		spin_lock(&timer->it_lock); -		list_del_init(&timer->it.cpu.entry); +		list_del_init(&timer->it.cpu.elist);  		cpu_firing = timer->it.cpu.firing;  		timer->it.cpu.firing = 0;  		/* @@ -1196,16 +1175,18 @@ void run_posix_cpu_timers(struct task_struct *tsk)   * Set one of the process-wide special case CPU timers or RLIMIT_CPU.   * The tsk->sighand->siglock must be held by the caller.   */ -void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx, +void set_process_cpu_timer(struct task_struct *tsk, unsigned int clkid,  			   u64 *newval, u64 *oldval)  { -	u64 now; -	int ret; +	u64 now, *nextevt; + +	if (WARN_ON_ONCE(clkid >= CPUCLOCK_SCHED)) +		return; -	WARN_ON_ONCE(clock_idx == CPUCLOCK_SCHED); -	ret = cpu_timer_sample_group(clock_idx, tsk, &now); +	nextevt = &tsk->signal->posix_cputimers.bases[clkid].nextevt; +	now = cpu_clock_sample_group(clkid, tsk, true); -	if (oldval && ret != -EINVAL) { +	if (oldval) {  		/*  		 * We are setting itimer. The *oldval is absolute and we update  		 * it to be relative, *newval argument is relative and we update @@ -1226,19 +1207,11 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,  	}  	/* -	 * Update expiration cache if we are the earliest timer, or eventually -	 * RLIMIT_CPU limit is earlier than prof_exp cpu timer expire. +	 * Update expiration cache if this is the earliest timer. CPUCLOCK_PROF +	 * expiry cache is also used by RLIMIT_CPU!.  	 
*/ -	switch (clock_idx) { -	case CPUCLOCK_PROF: -		if (expires_gt(tsk->signal->cputime_expires.prof_exp, *newval)) -			tsk->signal->cputime_expires.prof_exp = *newval; -		break; -	case CPUCLOCK_VIRT: -		if (expires_gt(tsk->signal->cputime_expires.virt_exp, *newval)) -			tsk->signal->cputime_expires.virt_exp = *newval; -		break; -	} +	if (*newval < *nextevt) +		*nextevt = *newval;  	tick_dep_set_signal(tsk->signal, TICK_DEP_BIT_POSIX_TIMER);  } @@ -1260,6 +1233,7 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags,  	timer.it_overrun = -1;  	error = posix_cpu_timer_create(&timer);  	timer.it_process = current; +  	if (!error) {  		static struct itimerspec64 zero_it;  		struct restart_block *restart; @@ -1275,7 +1249,7 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags,  		}  		while (!signal_pending(current)) { -			if (timer.it.cpu.expires == 0) { +			if (!cpu_timer_getexpires(&timer.it.cpu)) {  				/*  				 * Our timer fired and was reset, below  				 * deletion can not fail. @@ -1297,7 +1271,7 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags,  		/*  		 * We were interrupted by a signal.  		 */ -		expires = timer.it.cpu.expires; +		expires = cpu_timer_getexpires(&timer.it.cpu);  		error = posix_cpu_timer_set(&timer, 0, &zero_it, &it);  		if (!error) {  			/* diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index d7f2d91acdac..0ec5b7a1d769 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -442,7 +442,7 @@ static struct k_itimer * alloc_posix_timer(void)  static void k_itimer_rcu_free(struct rcu_head *head)  { -	struct k_itimer *tmr = container_of(head, struct k_itimer, it.rcu); +	struct k_itimer *tmr = container_of(head, struct k_itimer, rcu);  	kmem_cache_free(posix_timers_cache, tmr);  } @@ -459,7 +459,7 @@ static void release_posix_timer(struct k_itimer *tmr, int it_id_set)  	}  	put_pid(tmr->it_pid);  	sigqueue_free(tmr->sigq); -	call_rcu(&tmr->it.rcu, k_itimer_rcu_free); +	call_rcu(&tmr->rcu, k_itimer_rcu_free);  }  static int common_timer_create(struct k_itimer *new_timer) @@ -805,6 +805,35 @@ static int common_hrtimer_try_to_cancel(struct k_itimer *timr)  	return hrtimer_try_to_cancel(&timr->it.real.timer);  } +static void common_timer_wait_running(struct k_itimer *timer) +{ +	hrtimer_cancel_wait_running(&timer->it.real.timer); +} + +/* + * On PREEMPT_RT this prevents priority inversion against the softirq kthread + * in case it gets preempted while executing a timer callback. See comments in + * hrtimer_cancel_wait_running. For PREEMPT_RT=n this just results in a + * cpu_relax(). + */ +static struct k_itimer *timer_wait_running(struct k_itimer *timer, +					   unsigned long *flags) +{ +	const struct k_clock *kc = READ_ONCE(timer->kclock); +	timer_t timer_id = READ_ONCE(timer->it_id); + +	/* Prevent kfree(timer) after dropping the lock */ +	rcu_read_lock(); +	unlock_timer(timer, *flags); + +	if (!WARN_ON_ONCE(!kc->timer_wait_running)) +		kc->timer_wait_running(timer); + +	rcu_read_unlock(); +	/* Relock the timer. It might no longer be hashed. */ +	return lock_timer(timer_id, flags); +} +  /* Set a POSIX.1b interval timer. 
*/  int common_timer_set(struct k_itimer *timr, int flags,  		     struct itimerspec64 *new_setting, @@ -844,13 +873,13 @@ int common_timer_set(struct k_itimer *timr, int flags,  	return 0;  } -static int do_timer_settime(timer_t timer_id, int flags, +static int do_timer_settime(timer_t timer_id, int tmr_flags,  			    struct itimerspec64 *new_spec64,  			    struct itimerspec64 *old_spec64)  {  	const struct k_clock *kc;  	struct k_itimer *timr; -	unsigned long flag; +	unsigned long flags;  	int error = 0;  	if (!timespec64_valid(&new_spec64->it_interval) || @@ -859,8 +888,9 @@ static int do_timer_settime(timer_t timer_id, int flags,  	if (old_spec64)  		memset(old_spec64, 0, sizeof(*old_spec64)); + +	timr = lock_timer(timer_id, &flags);  retry: -	timr = lock_timer(timer_id, &flag);  	if (!timr)  		return -EINVAL; @@ -868,13 +898,16 @@ retry:  	if (WARN_ON_ONCE(!kc || !kc->timer_set))  		error = -EINVAL;  	else -		error = kc->timer_set(timr, flags, new_spec64, old_spec64); +		error = kc->timer_set(timr, tmr_flags, new_spec64, old_spec64); -	unlock_timer(timr, flag);  	if (error == TIMER_RETRY) { -		old_spec64 = NULL;	// We already got the old time... +		// We already got the old time... +		old_spec64 = NULL; +		/* Unlocks and relocks the timer if it still exists */ +		timr = timer_wait_running(timr, &flags);  		goto retry;  	} +	unlock_timer(timr, flags);  	return error;  } @@ -951,13 +984,15 @@ SYSCALL_DEFINE1(timer_delete, timer_t, timer_id)  	struct k_itimer *timer;  	unsigned long flags; -retry_delete:  	timer = lock_timer(timer_id, &flags); + +retry_delete:  	if (!timer)  		return -EINVAL; -	if (timer_delete_hook(timer) == TIMER_RETRY) { -		unlock_timer(timer, flags); +	if (unlikely(timer_delete_hook(timer) == TIMER_RETRY)) { +		/* Unlocks and relocks the timer if it still exists */ +		timer = timer_wait_running(timer, &flags);  		goto retry_delete;  	} @@ -1238,6 +1273,7 @@ static const struct k_clock clock_realtime = {  	.timer_forward		= common_hrtimer_forward,  	.timer_remaining	= common_hrtimer_remaining,  	.timer_try_to_cancel	= common_hrtimer_try_to_cancel, +	.timer_wait_running	= common_timer_wait_running,  	.timer_arm		= common_hrtimer_arm,  }; @@ -1253,6 +1289,7 @@ static const struct k_clock clock_monotonic = {  	.timer_forward		= common_hrtimer_forward,  	.timer_remaining	= common_hrtimer_remaining,  	.timer_try_to_cancel	= common_hrtimer_try_to_cancel, +	.timer_wait_running	= common_timer_wait_running,  	.timer_arm		= common_hrtimer_arm,  }; @@ -1283,6 +1320,7 @@ static const struct k_clock clock_tai = {  	.timer_forward		= common_hrtimer_forward,  	.timer_remaining	= common_hrtimer_remaining,  	.timer_try_to_cancel	= common_hrtimer_try_to_cancel, +	.timer_wait_running	= common_timer_wait_running,  	.timer_arm		= common_hrtimer_arm,  }; @@ -1298,6 +1336,7 @@ static const struct k_clock clock_boottime = {  	.timer_forward		= common_hrtimer_forward,  	.timer_remaining	= common_hrtimer_remaining,  	.timer_try_to_cancel	= common_hrtimer_try_to_cancel, +	.timer_wait_running	= common_timer_wait_running,  	.timer_arm		= common_hrtimer_arm,  }; diff --git a/kernel/time/posix-timers.h b/kernel/time/posix-timers.h index de5daa6d975a..897c29e162b9 100644 --- a/kernel/time/posix-timers.h +++ b/kernel/time/posix-timers.h @@ -24,6 +24,7 @@ struct k_clock {  	int	(*timer_try_to_cancel)(struct k_itimer *timr);  	void	(*timer_arm)(struct k_itimer *timr, ktime_t expires,  			     bool absolute, bool sigev_none); +	void	(*timer_wait_running)(struct k_itimer *timr);  };  extern const struct 
k_clock clock_posix_cpu; diff --git a/kernel/time/tick-broadcast-hrtimer.c b/kernel/time/tick-broadcast-hrtimer.c index 5be6154e2fd2..c1f5bb590b5e 100644 --- a/kernel/time/tick-broadcast-hrtimer.c +++ b/kernel/time/tick-broadcast-hrtimer.c @@ -59,11 +59,16 @@ static int bc_set_next(ktime_t expires, struct clock_event_device *bc)  	 * hrtimer_{start/cancel} functions call into tracing,  	 * calls to these functions must be bound within RCU_NONIDLE.  	 */ -	RCU_NONIDLE({ +	RCU_NONIDLE( +		{  			bc_moved = hrtimer_try_to_cancel(&bctimer) >= 0; -			if (bc_moved) +			if (bc_moved) {  				hrtimer_start(&bctimer, expires, -					      HRTIMER_MODE_ABS_PINNED);}); +					      HRTIMER_MODE_ABS_PINNED_HARD); +			} +		} +	); +  	if (bc_moved) {  		/* Bind the "device" to the cpu */  		bc->bound_on = smp_processor_id(); @@ -104,7 +109,7 @@ static enum hrtimer_restart bc_handler(struct hrtimer *t)  void tick_setup_hrtimer_broadcast(void)  { -	hrtimer_init(&bctimer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); +	hrtimer_init(&bctimer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD);  	bctimer.function = bc_handler;  	clockevents_register_device(&ce_broadcast_hrtimer);  } diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index be9707f68024..955851748dc3 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -634,10 +634,12 @@ static void tick_nohz_restart(struct tick_sched *ts, ktime_t now)  	/* Forward the time to expire in the future */  	hrtimer_forward(&ts->sched_timer, now, tick_period); -	if (ts->nohz_mode == NOHZ_MODE_HIGHRES) -		hrtimer_start_expires(&ts->sched_timer, HRTIMER_MODE_ABS_PINNED); -	else +	if (ts->nohz_mode == NOHZ_MODE_HIGHRES) { +		hrtimer_start_expires(&ts->sched_timer, +				      HRTIMER_MODE_ABS_PINNED_HARD); +	} else {  		tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1); +	}  	/*  	 * Reset to make sure next tick stop doesn't get fooled by past @@ -802,7 +804,8 @@ static void tick_nohz_stop_tick(struct tick_sched *ts, int cpu)  	}  	if (ts->nohz_mode == NOHZ_MODE_HIGHRES) { -		hrtimer_start(&ts->sched_timer, tick, HRTIMER_MODE_ABS_PINNED); +		hrtimer_start(&ts->sched_timer, tick, +			      HRTIMER_MODE_ABS_PINNED_HARD);  	} else {  		hrtimer_set_expires(&ts->sched_timer, tick);  		tick_program_event(tick, 1); @@ -1230,7 +1233,7 @@ static void tick_nohz_switch_to_nohz(void)  	 * Recycle the hrtimer in ts, so we can share the  	 * hrtimer_forward with the highres code.  	 
*/ -	hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); +	hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD);  	/* Get the next period */  	next = tick_init_jiffy_update(); @@ -1327,7 +1330,7 @@ void tick_setup_sched_timer(void)  	/*  	 * Emulate tick processing via per-CPU hrtimers:  	 */ -	hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); +	hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD);  	ts->sched_timer.function = tick_sched_timer;  	/* Get the next period (per-CPU) */ @@ -1342,7 +1345,7 @@ void tick_setup_sched_timer(void)  	}  	hrtimer_forward(&ts->sched_timer, now, tick_period); -	hrtimer_start_expires(&ts->sched_timer, HRTIMER_MODE_ABS_PINNED); +	hrtimer_start_expires(&ts->sched_timer, HRTIMER_MODE_ABS_PINNED_HARD);  	tick_nohz_activate(ts, NOHZ_MODE_HIGHRES);  }  #endif /* HIGH_RES_TIMERS */ diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 343c7ba33b1c..4820823515e9 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -196,6 +196,10 @@ EXPORT_SYMBOL(jiffies_64);  struct timer_base {  	raw_spinlock_t		lock;  	struct timer_list	*running_timer; +#ifdef CONFIG_PREEMPT_RT +	spinlock_t		expiry_lock; +	atomic_t		timer_waiters; +#endif  	unsigned long		clk;  	unsigned long		next_expiry;  	unsigned int		cpu; @@ -1227,7 +1231,78 @@ int try_to_del_timer_sync(struct timer_list *timer)  }  EXPORT_SYMBOL(try_to_del_timer_sync); -#ifdef CONFIG_SMP +#ifdef CONFIG_PREEMPT_RT +static __init void timer_base_init_expiry_lock(struct timer_base *base) +{ +	spin_lock_init(&base->expiry_lock); +} + +static inline void timer_base_lock_expiry(struct timer_base *base) +{ +	spin_lock(&base->expiry_lock); +} + +static inline void timer_base_unlock_expiry(struct timer_base *base) +{ +	spin_unlock(&base->expiry_lock); +} + +/* + * The counterpart to del_timer_wait_running(). + * + * If there is a waiter for base->expiry_lock, then it was waiting for the + * timer callback to finish. Drop expiry_lock and reacquire it. That allows + * the waiter to acquire the lock and make progress. + */ +static void timer_sync_wait_running(struct timer_base *base) +{ +	if (atomic_read(&base->timer_waiters)) { +		spin_unlock(&base->expiry_lock); +		spin_lock(&base->expiry_lock); +	} +} + +/* + * This function is called on PREEMPT_RT kernels when the fast path + * deletion of a timer failed because the timer callback function was + * running. + * + * This prevents priority inversion if the softirq thread on a remote CPU + * got preempted, and it prevents a live lock when the task which tries to + * delete a timer preempted the softirq thread running the timer callback + * function. + */ +static void del_timer_wait_running(struct timer_list *timer) +{ +	u32 tf; + +	tf = READ_ONCE(timer->flags); +	if (!(tf & TIMER_MIGRATING)) { +		struct timer_base *base = get_timer_base(tf); + +		/* +		 * Mark the base as contended and grab the expiry lock, +		 * which is held by the softirq across the timer +		 * callback. Drop the lock immediately so the softirq can +		 * expire the next timer. In theory the timer could already +		 * be running again, but that's more than unlikely and just +		 * causes another wait loop. 
+		 */ +		atomic_inc(&base->timer_waiters); +		spin_lock_bh(&base->expiry_lock); +		atomic_dec(&base->timer_waiters); +		spin_unlock_bh(&base->expiry_lock); +	} +} +#else +static inline void timer_base_init_expiry_lock(struct timer_base *base) { } +static inline void timer_base_lock_expiry(struct timer_base *base) { } +static inline void timer_base_unlock_expiry(struct timer_base *base) { } +static inline void timer_sync_wait_running(struct timer_base *base) { } +static inline void del_timer_wait_running(struct timer_list *timer) { } +#endif + +#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)  /**   * del_timer_sync - deactivate a timer and wait for the handler to finish.   * @timer: the timer to be deactivated @@ -1266,6 +1341,8 @@ EXPORT_SYMBOL(try_to_del_timer_sync);   */  int del_timer_sync(struct timer_list *timer)  { +	int ret; +  #ifdef CONFIG_LOCKDEP  	unsigned long flags; @@ -1283,12 +1360,17 @@ int del_timer_sync(struct timer_list *timer)  	 * could lead to deadlock.  	 */  	WARN_ON(in_irq() && !(timer->flags & TIMER_IRQSAFE)); -	for (;;) { -		int ret = try_to_del_timer_sync(timer); -		if (ret >= 0) -			return ret; -		cpu_relax(); -	} + +	do { +		ret = try_to_del_timer_sync(timer); + +		if (unlikely(ret < 0)) { +			del_timer_wait_running(timer); +			cpu_relax(); +		} +	} while (ret < 0); + +	return ret;  }  EXPORT_SYMBOL(del_timer_sync);  #endif @@ -1360,10 +1442,13 @@ static void expire_timers(struct timer_base *base, struct hlist_head *head)  		if (timer->flags & TIMER_IRQSAFE) {  			raw_spin_unlock(&base->lock);  			call_timer_fn(timer, fn, baseclk); +			base->running_timer = NULL;  			raw_spin_lock(&base->lock);  		} else {  			raw_spin_unlock_irq(&base->lock);  			call_timer_fn(timer, fn, baseclk); +			base->running_timer = NULL; +			timer_sync_wait_running(base);  			raw_spin_lock_irq(&base->lock);  		}  	} @@ -1593,24 +1678,26 @@ void timer_clear_idle(void)  static int collect_expired_timers(struct timer_base *base,  				  struct hlist_head *heads)  { +	unsigned long now = READ_ONCE(jiffies); +  	/*  	 * NOHZ optimization. After a long idle sleep we need to forward the  	 * base to current jiffies. Avoid a loop by searching the bitfield for  	 * the next expiring timer.  	 */ -	if ((long)(jiffies - base->clk) > 2) { +	if ((long)(now - base->clk) > 2) {  		unsigned long next = __next_timer_interrupt(base);  		/*  		 * If the next timer is ahead of time forward to current  		 * jiffies, otherwise forward to the next expiry time:  		 */ -		if (time_after(next, jiffies)) { +		if (time_after(next, now)) {  			/*  			 * The call site will increment base->clk and then  			 * terminate the expiry loop immediately.  			 
*/ -			base->clk = jiffies; +			base->clk = now;  			return 0;  		}  		base->clk = next; @@ -1643,7 +1730,7 @@ void update_process_times(int user_tick)  #endif  	scheduler_tick();  	if (IS_ENABLED(CONFIG_POSIX_TIMERS)) -		run_posix_cpu_timers(p); +		run_posix_cpu_timers();  }  /** @@ -1658,6 +1745,7 @@ static inline void __run_timers(struct timer_base *base)  	if (!time_after_eq(jiffies, base->clk))  		return; +	timer_base_lock_expiry(base);  	raw_spin_lock_irq(&base->lock);  	/* @@ -1684,8 +1772,8 @@ static inline void __run_timers(struct timer_base *base)  		while (levels--)  			expire_timers(base, heads + levels);  	} -	base->running_timer = NULL;  	raw_spin_unlock_irq(&base->lock); +	timer_base_unlock_expiry(base);  }  /* @@ -1930,6 +2018,7 @@ static void __init init_timer_cpu(int cpu)  		base->cpu = cpu;  		raw_spin_lock_init(&base->lock);  		base->clk = jiffies; +		timer_base_init_expiry_lock(base);  	}  } diff --git a/kernel/torture.c b/kernel/torture.c index a8d9bdfba7c3..7c13f5558b71 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -263,7 +263,6 @@ static void torture_onoff_cleanup(void)  	onoff_task = NULL;  #endif /* #ifdef CONFIG_HOTPLUG_CPU */  } -EXPORT_SYMBOL_GPL(torture_onoff_cleanup);  /*   * Print online/offline testing statistics. @@ -449,7 +448,6 @@ static void torture_shuffle_cleanup(void)  	}  	shuffler_task = NULL;  } -EXPORT_SYMBOL_GPL(torture_shuffle_cleanup);  /*   * Variables for auto-shutdown.  This allows "lights out" torture runs diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 98da8998c25c..e08527f50d2a 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -146,7 +146,7 @@ config FUNCTION_TRACER  	select GENERIC_TRACER  	select CONTEXT_SWITCH_TRACER  	select GLOB -	select TASKS_RCU if PREEMPT +	select TASKS_RCU if PREEMPTION  	help  	  Enable the kernel to trace every kernel function. This is done  	  by using a compiler feature to insert a small, 5-byte No-Operation @@ -179,7 +179,7 @@ config TRACE_PREEMPT_TOGGLE  config PREEMPTIRQ_EVENTS  	bool "Enable trace events for preempt and irq disable/enable"  	select TRACE_IRQFLAGS -	select TRACE_PREEMPT_TOGGLE if PREEMPT +	select TRACE_PREEMPT_TOGGLE if PREEMPTION  	select GENERIC_TRACER  	default n  	help @@ -214,7 +214,7 @@ config PREEMPT_TRACER  	bool "Preemption-off Latency Tracer"  	default n  	depends on !ARCH_USES_GETTIMEOFFSET -	depends on PREEMPT +	depends on PREEMPTION  	select GENERIC_TRACER  	select TRACER_MAX_TRACE  	select RING_BUFFER_ALLOW_SWAP @@ -520,7 +520,8 @@ config BPF_EVENTS  	bool  	default y  	help -	  This allows the user to attach BPF programs to kprobe events. +	  This allows the user to attach BPF programs to kprobe, uprobe, and +	  tracepoint events.  
config DYNAMIC_EVENTS  	def_bool n diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index ca1255d14576..44bd08f2443b 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -142,8 +142,13 @@ BPF_CALL_3(bpf_probe_read, void *, dst, u32, size, const void *, unsafe_ptr)  {  	int ret; +	ret = security_locked_down(LOCKDOWN_BPF_READ); +	if (ret < 0) +		goto out; +  	ret = probe_kernel_read(dst, unsafe_ptr, size);  	if (unlikely(ret < 0)) +out:  		memset(dst, 0, size);  	return ret; @@ -500,14 +505,17 @@ static const struct bpf_func_proto bpf_perf_event_output_proto = {  	.arg5_type	= ARG_CONST_SIZE_OR_ZERO,  }; -static DEFINE_PER_CPU(struct pt_regs, bpf_pt_regs); -static DEFINE_PER_CPU(struct perf_sample_data, bpf_misc_sd); +static DEFINE_PER_CPU(int, bpf_event_output_nest_level); +struct bpf_nested_pt_regs { +	struct pt_regs regs[3]; +}; +static DEFINE_PER_CPU(struct bpf_nested_pt_regs, bpf_pt_regs); +static DEFINE_PER_CPU(struct bpf_trace_sample_data, bpf_misc_sds);  u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size,  		     void *ctx, u64 ctx_size, bpf_ctx_copy_t ctx_copy)  { -	struct perf_sample_data *sd = this_cpu_ptr(&bpf_misc_sd); -	struct pt_regs *regs = this_cpu_ptr(&bpf_pt_regs); +	int nest_level = this_cpu_inc_return(bpf_event_output_nest_level);  	struct perf_raw_frag frag = {  		.copy		= ctx_copy,  		.size		= ctx_size, @@ -522,12 +530,25 @@ u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size,  			.data	= meta,  		},  	}; +	struct perf_sample_data *sd; +	struct pt_regs *regs; +	u64 ret; + +	if (WARN_ON_ONCE(nest_level > ARRAY_SIZE(bpf_misc_sds.sds))) { +		ret = -EBUSY; +		goto out; +	} +	sd = this_cpu_ptr(&bpf_misc_sds.sds[nest_level - 1]); +	regs = this_cpu_ptr(&bpf_pt_regs.regs[nest_level - 1]);  	perf_fetch_caller_regs(regs);  	perf_sample_data_init(sd, 0, 0);  	sd->raw = &raw; -	return __bpf_perf_event_output(regs, map, flags, sd); +	ret = __bpf_perf_event_output(regs, map, flags, sd); +out: +	this_cpu_dec(bpf_event_output_nest_level); +	return ret;  }  BPF_CALL_0(bpf_get_current_task) @@ -569,6 +590,10 @@ BPF_CALL_3(bpf_probe_read_str, void *, dst, u32, size,  {  	int ret; +	ret = security_locked_down(LOCKDOWN_BPF_READ); +	if (ret < 0) +		goto out; +  	/*  	 * The strncpy_from_unsafe() call will likely not fill the entire  	 * buffer, but that's okay in this circumstance as we're probing @@ -580,6 +605,7 @@ BPF_CALL_3(bpf_probe_read_str, void *, dst, u32, size,  	 */  	ret = strncpy_from_unsafe(dst, unsafe_ptr, size);  	if (unlikely(ret < 0)) +out:  		memset(dst, 0, size);  	return ret; diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c index 8dfd5021b933..7950a0356042 100644 --- a/kernel/trace/fgraph.c +++ b/kernel/trace/fgraph.c @@ -276,7 +276,7 @@ unsigned long ftrace_graph_ret_addr(struct task_struct *task, int *idx,  	int index = task->curr_ret_stack;  	int i; -	if (ret != (unsigned long)return_to_handler) +	if (ret != (unsigned long)dereference_kernel_function_descriptor(return_to_handler))  		return ret;  	if (index < 0) @@ -294,7 +294,7 @@ unsigned long ftrace_graph_ret_addr(struct task_struct *task, int *idx,  {  	int task_idx; -	if (ret != (unsigned long)return_to_handler) +	if (ret != (unsigned long)dereference_kernel_function_descriptor(return_to_handler))  		return ret;  	task_idx = task->curr_ret_stack; diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index f9821a3374e9..62a50bf399d6 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2814,7 
+2814,7 @@ int ftrace_shutdown(struct ftrace_ops *ops, int command)  		 * synchornize_rcu_tasks() will wait for those tasks to  		 * execute and either schedule voluntarily or enter user space.  		 */ -		if (IS_ENABLED(CONFIG_PREEMPT)) +		if (IS_ENABLED(CONFIG_PREEMPTION))  			synchronize_rcu_tasks();   free_ops: @@ -6036,11 +6036,7 @@ clear_func_from_hash(struct ftrace_init_func *func, struct ftrace_hash *hash)  {  	struct ftrace_func_entry *entry; -	if (ftrace_hash_empty(hash)) -		return; - -	entry = __ftrace_lookup_ip(hash, func->ip); - +	entry = ftrace_lookup_ip(hash, func->ip);  	/*  	 * Do not allow this rec to match again.  	 * Yeah, it may waste some memory, but will be removed diff --git a/kernel/trace/ftrace_internal.h b/kernel/trace/ftrace_internal.h index 0515a2096f90..0456e0a3dab1 100644 --- a/kernel/trace/ftrace_internal.h +++ b/kernel/trace/ftrace_internal.h @@ -6,22 +6,22 @@  /*   * Traverse the ftrace_global_list, invoking all entries.  The reason that we - * can use rcu_dereference_raw_notrace() is that elements removed from this list + * can use rcu_dereference_raw_check() is that elements removed from this list   * are simply leaked, so there is no need to interact with a grace-period - * mechanism.  The rcu_dereference_raw_notrace() calls are needed to handle + * mechanism.  The rcu_dereference_raw_check() calls are needed to handle   * concurrent insertions into the ftrace_global_list.   *   * Silly Alpha and silly pointer-speculation compiler optimizations!   */  #define do_for_each_ftrace_op(op, list)			\ -	op = rcu_dereference_raw_notrace(list);			\ +	op = rcu_dereference_raw_check(list);			\  	do  /*   * Optimized for just a single item in the list (as that is the normal case).   */  #define while_for_each_ftrace_op(op)				\ -	while (likely(op = rcu_dereference_raw_notrace((op)->next)) &&	\ +	while (likely(op = rcu_dereference_raw_check((op)->next)) &&	\  	       unlikely((op) != &ftrace_list_end))  extern struct ftrace_ops __rcu *ftrace_ops_list; diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c index 0564f6db0561..09b0b49f346e 100644 --- a/kernel/trace/ring_buffer_benchmark.c +++ b/kernel/trace/ring_buffer_benchmark.c @@ -267,7 +267,7 @@ static void ring_buffer_producer(void)  		if (consumer && !(cnt % wakeup_interval))  			wake_up_process(consumer); -#ifndef CONFIG_PREEMPT +#ifndef CONFIG_PREEMPTION  		/*  		 * If we are a non preempt kernel, the 10 second run will  		 * stop everything while it runs. 
Instead, we will call diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 563e80f9006a..252f79c435f8 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1854,7 +1854,7 @@ int __init register_tracer(struct tracer *type)  	return ret;  } -void tracing_reset(struct trace_buffer *buf, int cpu) +static void tracing_reset_cpu(struct trace_buffer *buf, int cpu)  {  	struct ring_buffer *buffer = buf->buffer; @@ -2642,10 +2642,10 @@ static void ftrace_exports(struct ring_buffer_event *event)  	preempt_disable_notrace(); -	export = rcu_dereference_raw_notrace(ftrace_exports_list); +	export = rcu_dereference_raw_check(ftrace_exports_list);  	while (export) {  		trace_process_export(export, event); -		export = rcu_dereference_raw_notrace(export->next); +		export = rcu_dereference_raw_check(export->next);  	}  	preempt_enable_notrace(); @@ -4251,7 +4251,7 @@ static int tracing_open(struct inode *inode, struct file *file)  		if (cpu == RING_BUFFER_ALL_CPUS)  			tracing_reset_online_cpus(trace_buf);  		else -			tracing_reset(trace_buf, cpu); +			tracing_reset_cpu(trace_buf, cpu);  	}  	if (file->f_mode & FMODE_READ) { @@ -4815,15 +4815,15 @@ static const char readme_msg[] =  #endif  #endif /* CONFIG_STACK_TRACER */  #ifdef CONFIG_DYNAMIC_EVENTS -	"  dynamic_events\t\t- Add/remove/show the generic dynamic events\n" +	"  dynamic_events\t\t- Create/append/remove/show the generic dynamic events\n"  	"\t\t\t  Write into this file to define/undefine new trace events.\n"  #endif  #ifdef CONFIG_KPROBE_EVENTS -	"  kprobe_events\t\t- Add/remove/show the kernel dynamic events\n" +	"  kprobe_events\t\t- Create/append/remove/show the kernel dynamic events\n"  	"\t\t\t  Write into this file to define/undefine new trace events.\n"  #endif  #ifdef CONFIG_UPROBE_EVENTS -	"  uprobe_events\t\t- Add/remove/show the userspace dynamic events\n" +	"  uprobe_events\t\t- Create/append/remove/show the userspace dynamic events\n"  	"\t\t\t  Write into this file to define/undefine new trace events.\n"  #endif  #if defined(CONFIG_KPROBE_EVENTS) || defined(CONFIG_UPROBE_EVENTS) @@ -4848,7 +4848,7 @@ static const char readme_msg[] =  #else  	"\t           $stack<index>, $stack, $retval, $comm,\n"  #endif -	"\t           +|-[u]<offset>(<fetcharg>)\n" +	"\t           +|-[u]<offset>(<fetcharg>), \\imm-value, \\\"imm-string\"\n"  	"\t     type: s8/16/32/64, u8/16/32/64, x8/16/32/64, string, symbol,\n"  	"\t           b<bit-width>@<bit-offset>/<container-size>, ustring,\n"  	"\t           <type>\\[<array-size>\\]\n" @@ -6742,7 +6742,7 @@ tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt,  			if (iter->cpu_file == RING_BUFFER_ALL_CPUS)  				tracing_reset_online_cpus(&tr->max_buffer);  			else -				tracing_reset(&tr->max_buffer, iter->cpu_file); +				tracing_reset_cpu(&tr->max_buffer, iter->cpu_file);  		}  		break;  	} diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 005f08629b8b..f801d154ff6a 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -365,11 +365,11 @@ static inline struct trace_array *top_trace_array(void)  	__builtin_types_compatible_p(typeof(var), type *)  #undef IF_ASSIGN -#define IF_ASSIGN(var, entry, etype, id)		\ -	if (FTRACE_CMP_TYPE(var, etype)) {		\ -		var = (typeof(var))(entry);		\ -		WARN_ON(id && (entry)->type != id);	\ -		break;					\ +#define IF_ASSIGN(var, entry, etype, id)			\ +	if (FTRACE_CMP_TYPE(var, etype)) {			\ +		var = (typeof(var))(entry);			\ +		WARN_ON(id != 0 && (entry)->type != id);	\ +		break;						\  	}  /* Will cause 
compile errors if type is not found. */ @@ -677,7 +677,6 @@ trace_buffer_iter(struct trace_iterator *iter, int cpu)  int tracer_init(struct tracer *t, struct trace_array *tr);  int tracing_is_enabled(void); -void tracing_reset(struct trace_buffer *buf, int cpu);  void tracing_reset_online_cpus(struct trace_buffer *buf);  void tracing_reset_current(int cpu);  void tracing_reset_all_online_cpus(void); diff --git a/kernel/trace/trace_dynevent.c b/kernel/trace/trace_dynevent.c index fa100ed3b4de..a41fed46c285 100644 --- a/kernel/trace/trace_dynevent.c +++ b/kernel/trace/trace_dynevent.c @@ -47,6 +47,7 @@ int dyn_event_release(int argc, char **argv, struct dyn_event_operations *type)  			return -EINVAL;  		event++;  	} +	argc--; argv++;  	p = strchr(event, '/');  	if (p) { @@ -61,10 +62,13 @@ int dyn_event_release(int argc, char **argv, struct dyn_event_operations *type)  	for_each_dyn_event_safe(pos, n) {  		if (type && type != pos->ops)  			continue; -		if (pos->ops->match(system, event, pos)) { -			ret = pos->ops->free(pos); +		if (!pos->ops->match(system, event, +				argc, (const char **)argv, pos)) +			continue; + +		ret = pos->ops->free(pos); +		if (ret)  			break; -		}  	}  	mutex_unlock(&event_mutex); diff --git a/kernel/trace/trace_dynevent.h b/kernel/trace/trace_dynevent.h index 8c334064e4d6..46898138d2df 100644 --- a/kernel/trace/trace_dynevent.h +++ b/kernel/trace/trace_dynevent.h @@ -31,8 +31,9 @@ struct dyn_event;   * @is_busy: Check whether given event is busy so that it can not be deleted.   *  Return true if it is busy, otherwides false.   * @free: Delete the given event. Return 0 if success, otherwides error. - * @match: Check whether given event and system name match this event. - *  Return true if it matches, otherwides false. + * @match: Check whether given event and system name match this event. The argc + *  and argv is used for exact match. Return true if it matches, otherwides + *  false.   *   * Except for @create, these methods are called under holding event_mutex.   */ @@ -43,7 +44,7 @@ struct dyn_event_operations {  	bool (*is_busy)(struct dyn_event *ev);  	int (*free)(struct dyn_event *ev);  	bool (*match)(const char *system, const char *event, -			struct dyn_event *ev); +		      int argc, const char **argv, struct dyn_event *ev);  };  /* Register new dyn_event type -- must be called at first */ diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 648930823b57..b89cdfe20bc1 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -255,12 +255,12 @@ void *trace_event_buffer_reserve(struct trace_event_buffer *fbuffer,  	local_save_flags(fbuffer->flags);  	fbuffer->pc = preempt_count();  	/* -	 * If CONFIG_PREEMPT is enabled, then the tracepoint itself disables +	 * If CONFIG_PREEMPTION is enabled, then the tracepoint itself disables  	 * preemption (adding one to the preempt_count). Since we are  	 * interested in the preempt_count at the time the tracepoint was  	 * hit, we need to subtract one to offset the increment.  	 
*/ -	if (IS_ENABLED(CONFIG_PREEMPT)) +	if (IS_ENABLED(CONFIG_PREEMPTION))  		fbuffer->pc--;  	fbuffer->trace_file = trace_file; diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index c773b8fb270c..c9a74f82b14a 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -452,8 +452,10 @@ predicate_parse(const char *str, int nr_parens, int nr_preds,  		switch (*next) {  		case '(':					/* #2 */ -			if (top - op_stack > nr_parens) -				return ERR_PTR(-EINVAL); +			if (top - op_stack > nr_parens) { +				ret = -EINVAL; +				goto out_free; +			}  			*(++top) = invert;  			continue;  		case '!':					/* #3 */ diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index ca6b0dff60c5..9468bd8d44a2 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -13,6 +13,10 @@  #include <linux/rculist.h>  #include <linux/tracefs.h> +/* for gfp flag names */ +#include <linux/trace_events.h> +#include <trace/events/mmflags.h> +  #include "tracing_map.h"  #include "trace.h"  #include "trace_dynevent.h" @@ -374,7 +378,7 @@ static int synth_event_show(struct seq_file *m, struct dyn_event *ev);  static int synth_event_release(struct dyn_event *ev);  static bool synth_event_is_busy(struct dyn_event *ev);  static bool synth_event_match(const char *system, const char *event, -			      struct dyn_event *ev); +			int argc, const char **argv, struct dyn_event *ev);  static struct dyn_event_operations synth_event_ops = {  	.create = synth_event_create, @@ -422,7 +426,7 @@ static bool synth_event_is_busy(struct dyn_event *ev)  }  static bool synth_event_match(const char *system, const char *event, -			      struct dyn_event *ev) +			int argc, const char **argv, struct dyn_event *ev)  {  	struct synth_event *sev = to_synth_event(ev); @@ -752,6 +756,8 @@ static int synth_field_size(char *type)  		size = sizeof(unsigned long);  	else if (strcmp(type, "pid_t") == 0)  		size = sizeof(pid_t); +	else if (strcmp(type, "gfp_t") == 0) +		size = sizeof(gfp_t);  	else if (synth_field_is_string(type))  		size = synth_field_string_size(type); @@ -792,6 +798,8 @@ static const char *synth_field_fmt(char *type)  		fmt = "%lu";  	else if (strcmp(type, "pid_t") == 0)  		fmt = "%d"; +	else if (strcmp(type, "gfp_t") == 0) +		fmt = "%x";  	else if (synth_field_is_string(type))  		fmt = "%s"; @@ -834,9 +842,20 @@ static enum print_line_t print_synth_event(struct trace_iterator *iter,  					 i == se->n_fields - 1 ? "" : " ");  			n_u64 += STR_VAR_LEN_MAX / sizeof(u64);  		} else { +			struct trace_print_flags __flags[] = { +			    __def_gfpflag_names, {-1, NULL} }; +  			trace_seq_printf(s, print_fmt, se->fields[i]->name,  					 entry->fields[n_u64],  					 i == se->n_fields - 1 ? 
"" : " "); + +			if (strcmp(se->fields[i]->type, "gfp_t") == 0) { +				trace_seq_puts(s, " ("); +				trace_print_flags_seq(s, "|", +						      entry->fields[n_u64], +						      __flags); +				trace_seq_putc(s, ')'); +			}  			n_u64++;  		}  	} @@ -2785,6 +2804,8 @@ static struct hist_field *create_alias(struct hist_trigger_data *hist_data,  		return NULL;  	} +	alias->var_ref_idx = var_ref->var_ref_idx; +  	return alias;  } diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 9d483ad9bb6c..324ffbea3556 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -11,6 +11,7 @@  #include <linux/uaccess.h>  #include <linux/rculist.h>  #include <linux/error-injection.h> +#include <linux/security.h>  #include <asm/setup.h>  /* for COMMAND_LINE_SIZE */ @@ -39,7 +40,7 @@ static int trace_kprobe_show(struct seq_file *m, struct dyn_event *ev);  static int trace_kprobe_release(struct dyn_event *ev);  static bool trace_kprobe_is_busy(struct dyn_event *ev);  static bool trace_kprobe_match(const char *system, const char *event, -			       struct dyn_event *ev); +			int argc, const char **argv, struct dyn_event *ev);  static struct dyn_event_operations trace_kprobe_ops = {  	.create = trace_kprobe_create, @@ -137,13 +138,36 @@ static bool trace_kprobe_is_busy(struct dyn_event *ev)  	return trace_probe_is_enabled(&tk->tp);  } +static bool trace_kprobe_match_command_head(struct trace_kprobe *tk, +					    int argc, const char **argv) +{ +	char buf[MAX_ARGSTR_LEN + 1]; + +	if (!argc) +		return true; + +	if (!tk->symbol) +		snprintf(buf, sizeof(buf), "0x%p", tk->rp.kp.addr); +	else if (tk->rp.kp.offset) +		snprintf(buf, sizeof(buf), "%s+%u", +			 trace_kprobe_symbol(tk), tk->rp.kp.offset); +	else +		snprintf(buf, sizeof(buf), "%s", trace_kprobe_symbol(tk)); +	if (strcmp(buf, argv[0])) +		return false; +	argc--; argv++; + +	return trace_probe_match_command_args(&tk->tp, argc, argv); +} +  static bool trace_kprobe_match(const char *system, const char *event, -			       struct dyn_event *ev) +			int argc, const char **argv, struct dyn_event *ev)  {  	struct trace_kprobe *tk = to_trace_kprobe(ev);  	return strcmp(trace_probe_name(&tk->tp), event) == 0 && -	    (!system || strcmp(trace_probe_group_name(&tk->tp), system) == 0); +	    (!system || strcmp(trace_probe_group_name(&tk->tp), system) == 0) && +	    trace_kprobe_match_command_head(tk, argc, argv);  }  static nokprobe_inline unsigned long trace_kprobe_nhit(struct trace_kprobe *tk) @@ -180,20 +204,33 @@ unsigned long trace_kprobe_address(struct trace_kprobe *tk)  	return addr;  } +static nokprobe_inline struct trace_kprobe * +trace_kprobe_primary_from_call(struct trace_event_call *call) +{ +	struct trace_probe *tp; + +	tp = trace_probe_primary_from_call(call); +	if (WARN_ON_ONCE(!tp)) +		return NULL; + +	return container_of(tp, struct trace_kprobe, tp); +} +  bool trace_kprobe_on_func_entry(struct trace_event_call *call)  { -	struct trace_kprobe *tk = (struct trace_kprobe *)call->data; +	struct trace_kprobe *tk = trace_kprobe_primary_from_call(call); -	return kprobe_on_func_entry(tk->rp.kp.addr, +	return tk ? kprobe_on_func_entry(tk->rp.kp.addr,  			tk->rp.kp.addr ? NULL : tk->rp.kp.symbol_name, -			tk->rp.kp.addr ? 0 : tk->rp.kp.offset); +			tk->rp.kp.addr ? 
0 : tk->rp.kp.offset) : false;  }  bool trace_kprobe_error_injectable(struct trace_event_call *call)  { -	struct trace_kprobe *tk = (struct trace_kprobe *)call->data; +	struct trace_kprobe *tk = trace_kprobe_primary_from_call(call); -	return within_error_injection_list(trace_kprobe_address(tk)); +	return tk ? within_error_injection_list(trace_kprobe_address(tk)) : +	       false;  }  static int register_kprobe_event(struct trace_kprobe *tk); @@ -291,32 +328,68 @@ static inline int __enable_trace_kprobe(struct trace_kprobe *tk)  	return ret;  } +static void __disable_trace_kprobe(struct trace_probe *tp) +{ +	struct trace_probe *pos; +	struct trace_kprobe *tk; + +	list_for_each_entry(pos, trace_probe_probe_list(tp), list) { +		tk = container_of(pos, struct trace_kprobe, tp); +		if (!trace_kprobe_is_registered(tk)) +			continue; +		if (trace_kprobe_is_return(tk)) +			disable_kretprobe(&tk->rp); +		else +			disable_kprobe(&tk->rp.kp); +	} +} +  /*   * Enable trace_probe   * if the file is NULL, enable "perf" handler, or enable "trace" handler.   */ -static int -enable_trace_kprobe(struct trace_kprobe *tk, struct trace_event_file *file) +static int enable_trace_kprobe(struct trace_event_call *call, +				struct trace_event_file *file)  { -	bool enabled = trace_probe_is_enabled(&tk->tp); +	struct trace_probe *pos, *tp; +	struct trace_kprobe *tk; +	bool enabled;  	int ret = 0; +	tp = trace_probe_primary_from_call(call); +	if (WARN_ON_ONCE(!tp)) +		return -ENODEV; +	enabled = trace_probe_is_enabled(tp); + +	/* This also changes "enabled" state */  	if (file) { -		ret = trace_probe_add_file(&tk->tp, file); +		ret = trace_probe_add_file(tp, file);  		if (ret)  			return ret;  	} else -		trace_probe_set_flag(&tk->tp, TP_FLAG_PROFILE); +		trace_probe_set_flag(tp, TP_FLAG_PROFILE);  	if (enabled)  		return 0; -	ret = __enable_trace_kprobe(tk); +	list_for_each_entry(pos, trace_probe_probe_list(tp), list) { +		tk = container_of(pos, struct trace_kprobe, tp); +		if (trace_kprobe_has_gone(tk)) +			continue; +		ret = __enable_trace_kprobe(tk); +		if (ret) +			break; +		enabled = true; +	} +  	if (ret) { +		/* Failed to enable one of them. Roll back all */ +		if (enabled) +			__disable_trace_kprobe(tp);  		if (file) -			trace_probe_remove_file(&tk->tp, file); +			trace_probe_remove_file(tp, file);  		else -			trace_probe_clear_flag(&tk->tp, TP_FLAG_PROFILE); +			trace_probe_clear_flag(tp, TP_FLAG_PROFILE);  	}  	return ret; @@ -326,11 +399,14 @@ enable_trace_kprobe(struct trace_kprobe *tk, struct trace_event_file *file)   * Disable trace_probe   * if the file is NULL, disable "perf" handler, or disable "trace" handler.   
*/ -static int -disable_trace_kprobe(struct trace_kprobe *tk, struct trace_event_file *file) +static int disable_trace_kprobe(struct trace_event_call *call, +				struct trace_event_file *file)  { -	struct trace_probe *tp = &tk->tp; -	int ret = 0; +	struct trace_probe *tp; + +	tp = trace_probe_primary_from_call(call); +	if (WARN_ON_ONCE(!tp)) +		return -ENODEV;  	if (file) {  		if (!trace_probe_get_file_link(tp, file)) @@ -341,12 +417,8 @@ disable_trace_kprobe(struct trace_kprobe *tk, struct trace_event_file *file)  	} else  		trace_probe_clear_flag(tp, TP_FLAG_PROFILE); -	if (!trace_probe_is_enabled(tp) && trace_kprobe_is_registered(tk)) { -		if (trace_kprobe_is_return(tk)) -			disable_kretprobe(&tk->rp); -		else -			disable_kprobe(&tk->rp.kp); -	} +	if (!trace_probe_is_enabled(tp)) +		__disable_trace_kprobe(tp);   out:  	if (file) @@ -358,7 +430,7 @@ disable_trace_kprobe(struct trace_kprobe *tk, struct trace_event_file *file)  		 */  		trace_probe_remove_file(tp, file); -	return ret; +	return 0;  }  #if defined(CONFIG_KPROBES_ON_FTRACE) && \ @@ -389,6 +461,10 @@ static int __register_trace_kprobe(struct trace_kprobe *tk)  {  	int i, ret; +	ret = security_locked_down(LOCKDOWN_KPROBES); +	if (ret) +		return ret; +  	if (trace_kprobe_is_registered(tk))  		return -EINVAL; @@ -437,6 +513,10 @@ static void __unregister_trace_kprobe(struct trace_kprobe *tk)  /* Unregister a trace_probe and probe_event */  static int unregister_trace_kprobe(struct trace_kprobe *tk)  { +	/* If other probes are on the event, just unregister kprobe */ +	if (trace_probe_has_sibling(&tk->tp)) +		goto unreg; +  	/* Enabled event can not be unregistered */  	if (trace_probe_is_enabled(&tk->tp))  		return -EBUSY; @@ -445,12 +525,82 @@ static int unregister_trace_kprobe(struct trace_kprobe *tk)  	if (unregister_kprobe_event(tk))  		return -EBUSY; +unreg:  	__unregister_trace_kprobe(tk);  	dyn_event_remove(&tk->devent); +	trace_probe_unlink(&tk->tp);  	return 0;  } +static bool trace_kprobe_has_same_kprobe(struct trace_kprobe *orig, +					 struct trace_kprobe *comp) +{ +	struct trace_probe_event *tpe = orig->tp.event; +	struct trace_probe *pos; +	int i; + +	list_for_each_entry(pos, &tpe->probes, list) { +		orig = container_of(pos, struct trace_kprobe, tp); +		if (strcmp(trace_kprobe_symbol(orig), +			   trace_kprobe_symbol(comp)) || +		    trace_kprobe_offset(orig) != trace_kprobe_offset(comp)) +			continue; + +		/* +		 * trace_probe_compare_arg_type() ensured that nr_args and +		 * each argument name and type are same. Let's compare comm. +		 */ +		for (i = 0; i < orig->tp.nr_args; i++) { +			if (strcmp(orig->tp.args[i].comm, +				   comp->tp.args[i].comm)) +				break; +		} + +		if (i == orig->tp.nr_args) +			return true; +	} + +	return false; +} + +static int append_trace_kprobe(struct trace_kprobe *tk, struct trace_kprobe *to) +{ +	int ret; + +	ret = trace_probe_compare_arg_type(&tk->tp, &to->tp); +	if (ret) { +		/* Note that argument starts index = 2 */ +		trace_probe_log_set_index(ret + 1); +		trace_probe_log_err(0, DIFF_ARG_TYPE); +		return -EEXIST; +	} +	if (trace_kprobe_has_same_kprobe(to, tk)) { +		trace_probe_log_set_index(0); +		trace_probe_log_err(0, SAME_PROBE); +		return -EEXIST; +	} + +	/* Append to existing event */ +	ret = trace_probe_append(&tk->tp, &to->tp); +	if (ret) +		return ret; + +	/* Register k*probe */ +	ret = __register_trace_kprobe(tk); +	if (ret == -ENOENT && !trace_kprobe_module_exist(tk)) { +		pr_warn("This probe might be able to register after target module is loaded. 
Continue.\n"); +		ret = 0; +	} + +	if (ret) +		trace_probe_unlink(&tk->tp); +	else +		dyn_event_add(&tk->devent); + +	return ret; +} +  /* Register a trace_probe and probe_event */  static int register_trace_kprobe(struct trace_kprobe *tk)  { @@ -459,14 +609,17 @@ static int register_trace_kprobe(struct trace_kprobe *tk)  	mutex_lock(&event_mutex); -	/* Delete old (same name) event if exist */  	old_tk = find_trace_kprobe(trace_probe_name(&tk->tp),  				   trace_probe_group_name(&tk->tp));  	if (old_tk) { -		ret = unregister_trace_kprobe(old_tk); -		if (ret < 0) -			goto end; -		free_trace_kprobe(old_tk); +		if (trace_kprobe_is_return(tk) != trace_kprobe_is_return(old_tk)) { +			trace_probe_log_set_index(0); +			trace_probe_log_err(0, DIFF_PROBE_TYPE); +			ret = -EEXIST; +		} else { +			ret = append_trace_kprobe(tk, old_tk); +		} +		goto end;  	}  	/* Register new event */ @@ -700,7 +853,7 @@ static int trace_kprobe_create(int argc, const char *argv[])  			trace_probe_log_err(0, BAD_INSN_BNDRY);  		else if (ret == -ENOENT)  			trace_probe_log_err(0, BAD_PROBE_ADDR); -		else if (ret != -ENOMEM) +		else if (ret != -ENOMEM && ret != -EEXIST)  			trace_probe_log_err(0, FAIL_REG_PROBE);  		goto error;  	} @@ -965,6 +1118,9 @@ retry:  	case FETCH_OP_COMM:  		val = (unsigned long)current->comm;  		break; +	case FETCH_OP_DATA: +		val = (unsigned long)code->data; +		break;  #ifdef CONFIG_HAVE_FUNCTION_ARG_ACCESS_API  	case FETCH_OP_ARG:  		val = regs_get_kernel_argument(regs, code->param); @@ -1089,7 +1245,10 @@ print_kprobe_event(struct trace_iterator *iter, int flags,  	struct trace_probe *tp;  	field = (struct kprobe_trace_entry_head *)iter->ent; -	tp = container_of(event, struct trace_probe, call.event); +	tp = trace_probe_primary_from_call( +		container_of(event, struct trace_event_call, event)); +	if (WARN_ON_ONCE(!tp)) +		goto out;  	trace_seq_printf(s, "%s: (", trace_probe_name(tp)); @@ -1116,7 +1275,10 @@ print_kretprobe_event(struct trace_iterator *iter, int flags,  	struct trace_probe *tp;  	field = (struct kretprobe_trace_entry_head *)iter->ent; -	tp = container_of(event, struct trace_probe, call.event); +	tp = trace_probe_primary_from_call( +		container_of(event, struct trace_event_call, event)); +	if (WARN_ON_ONCE(!tp)) +		goto out;  	trace_seq_printf(s, "%s: (", trace_probe_name(tp)); @@ -1145,23 +1307,31 @@ static int kprobe_event_define_fields(struct trace_event_call *event_call)  {  	int ret;  	struct kprobe_trace_entry_head field; -	struct trace_kprobe *tk = (struct trace_kprobe *)event_call->data; +	struct trace_probe *tp; + +	tp = trace_probe_primary_from_call(event_call); +	if (WARN_ON_ONCE(!tp)) +		return -ENOENT;  	DEFINE_FIELD(unsigned long, ip, FIELD_STRING_IP, 0); -	return traceprobe_define_arg_fields(event_call, sizeof(field), &tk->tp); +	return traceprobe_define_arg_fields(event_call, sizeof(field), tp);  }  static int kretprobe_event_define_fields(struct trace_event_call *event_call)  {  	int ret;  	struct kretprobe_trace_entry_head field; -	struct trace_kprobe *tk = (struct trace_kprobe *)event_call->data; +	struct trace_probe *tp; + +	tp = trace_probe_primary_from_call(event_call); +	if (WARN_ON_ONCE(!tp)) +		return -ENOENT;  	DEFINE_FIELD(unsigned long, func, FIELD_STRING_FUNC, 0);  	DEFINE_FIELD(unsigned long, ret_ip, FIELD_STRING_RETIP, 0); -	return traceprobe_define_arg_fields(event_call, sizeof(field), &tk->tp); +	return traceprobe_define_arg_fields(event_call, sizeof(field), tp);  }  #ifdef CONFIG_PERF_EVENTS @@ -1289,20 +1459,19 @@ int 
bpf_get_kprobe_info(const struct perf_event *event, u32 *fd_type,  static int kprobe_register(struct trace_event_call *event,  			   enum trace_reg type, void *data)  { -	struct trace_kprobe *tk = (struct trace_kprobe *)event->data;  	struct trace_event_file *file = data;  	switch (type) {  	case TRACE_REG_REGISTER: -		return enable_trace_kprobe(tk, file); +		return enable_trace_kprobe(event, file);  	case TRACE_REG_UNREGISTER: -		return disable_trace_kprobe(tk, file); +		return disable_trace_kprobe(event, file);  #ifdef CONFIG_PERF_EVENTS  	case TRACE_REG_PERF_REGISTER: -		return enable_trace_kprobe(tk, NULL); +		return enable_trace_kprobe(event, NULL);  	case TRACE_REG_PERF_UNREGISTER: -		return disable_trace_kprobe(tk, NULL); +		return disable_trace_kprobe(event, NULL);  	case TRACE_REG_PERF_OPEN:  	case TRACE_REG_PERF_CLOSE:  	case TRACE_REG_PERF_ADD: @@ -1369,7 +1538,6 @@ static inline void init_trace_event_call(struct trace_kprobe *tk)  	call->flags = TRACE_EVENT_FL_KPROBE;  	call->class->reg = kprobe_register; -	call->data = tk;  }  static int register_kprobe_event(struct trace_kprobe *tk) @@ -1432,7 +1600,9 @@ void destroy_local_trace_kprobe(struct trace_event_call *event_call)  {  	struct trace_kprobe *tk; -	tk = container_of(event_call, struct trace_kprobe, tp.call); +	tk = trace_kprobe_primary_from_call(event_call); +	if (unlikely(!tk)) +		return;  	if (trace_probe_is_enabled(&tk->tp)) {  		WARN_ON(1); @@ -1577,7 +1747,8 @@ static __init int kprobe_trace_self_tests_init(void)  				pr_warn("error on getting probe file.\n");  				warn++;  			} else -				enable_trace_kprobe(tk, file); +				enable_trace_kprobe( +					trace_probe_event_call(&tk->tp), file);  		}  	} @@ -1598,7 +1769,8 @@ static __init int kprobe_trace_self_tests_init(void)  				pr_warn("error on getting probe file.\n");  				warn++;  			} else -				enable_trace_kprobe(tk, file); +				enable_trace_kprobe( +					trace_probe_event_call(&tk->tp), file);  		}  	} @@ -1631,7 +1803,8 @@ static __init int kprobe_trace_self_tests_init(void)  			pr_warn("error on getting probe file.\n");  			warn++;  		} else -			disable_trace_kprobe(tk, file); +			disable_trace_kprobe( +				trace_probe_event_call(&tk->tp), file);  	}  	tk = find_trace_kprobe("testprobe2", KPROBE_EVENT_SYSTEM); @@ -1649,7 +1822,8 @@ static __init int kprobe_trace_self_tests_init(void)  			pr_warn("error on getting probe file.\n");  			warn++;  		} else -			disable_trace_kprobe(tk, file); +			disable_trace_kprobe( +				trace_probe_event_call(&tk->tp), file);  	}  	ret = trace_run_command("-:testprobe", create_or_delete_trace_kprobe); diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index cab4a5398f1d..d54ce252b05a 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -219,10 +219,10 @@ trace_print_hex_seq(struct trace_seq *p, const unsigned char *buf, int buf_len,  {  	int i;  	const char *ret = trace_seq_buffer_ptr(p); +	const char *fmt = concatenate ? "%*phN" : "%*ph"; -	for (i = 0; i < buf_len; i++) -		trace_seq_printf(p, "%s%2.2x", concatenate || i == 0 ? 
"" : " ", -				 buf[i]); +	for (i = 0; i < buf_len; i += 16) +		trace_seq_printf(p, fmt, min(buf_len - i, 16), &buf[i]);  	trace_seq_putc(p, 0);  	return ret; diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index fb6bfbc5bf86..905b10af5d5c 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -178,6 +178,16 @@ void __trace_probe_log_err(int offset, int err_type)  	if (!command)  		return; +	if (trace_probe_log.index >= trace_probe_log.argc) { +		/** +		 * Set the error position is next to the last arg + space. +		 * Note that len includes the terminal null and the cursor +		 * appaers at pos + 1. +		 */ +		pos = len; +		offset = 0; +	} +  	/* And make a command string from argv array */  	p = command;  	for (i = 0; i < trace_probe_log.argc; i++) { @@ -316,6 +326,29 @@ inval_var:  	return -EINVAL;  } +static int str_to_immediate(char *str, unsigned long *imm) +{ +	if (isdigit(str[0])) +		return kstrtoul(str, 0, imm); +	else if (str[0] == '-') +		return kstrtol(str, 0, (long *)imm); +	else if (str[0] == '+') +		return kstrtol(str + 1, 0, (long *)imm); +	return -EINVAL; +} + +static int __parse_imm_string(char *str, char **pbuf, int offs) +{ +	size_t len = strlen(str); + +	if (str[len - 1] != '"') { +		trace_probe_log_err(offs + len, IMMSTR_NO_CLOSE); +		return -EINVAL; +	} +	*pbuf = kstrndup(str, len - 1, GFP_KERNEL); +	return 0; +} +  /* Recursive argument parser */  static int  parse_probe_arg(char *arg, const struct fetch_type *type, @@ -430,7 +463,8 @@ parse_probe_arg(char *arg, const struct fetch_type *type,  			ret = parse_probe_arg(arg, t2, &code, end, flags, offs);  			if (ret)  				break; -			if (code->op == FETCH_OP_COMM) { +			if (code->op == FETCH_OP_COMM || +			    code->op == FETCH_OP_DATA) {  				trace_probe_log_err(offs, COMM_CANT_DEREF);  				return -EINVAL;  			} @@ -444,6 +478,21 @@ parse_probe_arg(char *arg, const struct fetch_type *type,  			code->offset = offset;  		}  		break; +	case '\\':	/* Immediate value */ +		if (arg[1] == '"') {	/* Immediate string */ +			ret = __parse_imm_string(arg + 2, &tmp, offs + 2); +			if (ret) +				break; +			code->op = FETCH_OP_DATA; +			code->data = tmp; +		} else { +			ret = str_to_immediate(arg + 1, &code->immediate); +			if (ret) +				trace_probe_log_err(offs + 1, BAD_IMM); +			else +				code->op = FETCH_OP_IMM; +		} +		break;  	}  	if (!ret && code->op == FETCH_OP_NOP) {  		/* Parsed, but do not find fetch method */ @@ -542,8 +591,11 @@ static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size,  		}  	} -	/* Since $comm can not be dereferred, we can find $comm by strcmp */ -	if (strcmp(arg, "$comm") == 0) { +	/* +	 * Since $comm and immediate string can not be dereferred, +	 * we can find those by strcmp. +	 */ +	if (strcmp(arg, "$comm") == 0 || strncmp(arg, "\\\"", 2) == 0) {  		/* The type of $comm must be "string", and not an array. */  		if (parg->count || (t && strcmp(t, "string")))  			return -EINVAL; @@ -580,7 +632,8 @@ static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size,  	if (!strcmp(parg->type->name, "string") ||  	    !strcmp(parg->type->name, "ustring")) {  		if (code->op != FETCH_OP_DEREF && code->op != FETCH_OP_UDEREF && -		    code->op != FETCH_OP_IMM && code->op != FETCH_OP_COMM) { +		    code->op != FETCH_OP_IMM && code->op != FETCH_OP_COMM && +		    code->op != FETCH_OP_DATA) {  			trace_probe_log_err(offset + (t ? 
(t - arg) : 0),  					    BAD_STRING);  			ret = -EINVAL; @@ -589,9 +642,10 @@ static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size,  		if ((code->op == FETCH_OP_IMM || code->op == FETCH_OP_COMM) ||  		     parg->count) {  			/* -			 * IMM and COMM is pointing actual address, those must -			 * be kept, and if parg->count != 0, this is an array -			 * of string pointers instead of string address itself. +			 * IMM, DATA and COMM is pointing actual address, those +			 * must be kept, and if parg->count != 0, this is an +			 * array of string pointers instead of string address +			 * itself.  			 */  			code++;  			if (code->op != FETCH_OP_NOP) { @@ -665,7 +719,8 @@ static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size,  fail:  	if (ret) {  		for (code = tmp; code < tmp + FETCH_INSN_MAX; code++) -			if (code->op == FETCH_NOP_SYMBOL) +			if (code->op == FETCH_NOP_SYMBOL || +			    code->op == FETCH_OP_DATA)  				kfree(code->data);  	}  	kfree(tmp); @@ -736,7 +791,8 @@ void traceprobe_free_probe_arg(struct probe_arg *arg)  	struct fetch_insn *code = arg->code;  	while (code && code->op != FETCH_OP_END) { -		if (code->op == FETCH_NOP_SYMBOL) +		if (code->op == FETCH_NOP_SYMBOL || +		    code->op == FETCH_OP_DATA)  			kfree(code->data);  		code++;  	} @@ -886,44 +942,85 @@ int traceprobe_define_arg_fields(struct trace_event_call *event_call,  	return 0;  } +static void trace_probe_event_free(struct trace_probe_event *tpe) +{ +	kfree(tpe->class.system); +	kfree(tpe->call.name); +	kfree(tpe->call.print_fmt); +	kfree(tpe); +} + +int trace_probe_append(struct trace_probe *tp, struct trace_probe *to) +{ +	if (trace_probe_has_sibling(tp)) +		return -EBUSY; + +	list_del_init(&tp->list); +	trace_probe_event_free(tp->event); + +	tp->event = to->event; +	list_add_tail(&tp->list, trace_probe_probe_list(to)); + +	return 0; +} + +void trace_probe_unlink(struct trace_probe *tp) +{ +	list_del_init(&tp->list); +	if (list_empty(trace_probe_probe_list(tp))) +		trace_probe_event_free(tp->event); +	tp->event = NULL; +}  void trace_probe_cleanup(struct trace_probe *tp)  { -	struct trace_event_call *call = trace_probe_event_call(tp);  	int i;  	for (i = 0; i < tp->nr_args; i++)  		traceprobe_free_probe_arg(&tp->args[i]); -	if (call->class) -		kfree(call->class->system); -	kfree(call->name); -	kfree(call->print_fmt); +	if (tp->event) +		trace_probe_unlink(tp);  }  int trace_probe_init(struct trace_probe *tp, const char *event,  		     const char *group)  { -	struct trace_event_call *call = trace_probe_event_call(tp); +	struct trace_event_call *call; +	int ret = 0;  	if (!event || !group)  		return -EINVAL; -	call->class = &tp->class; -	call->name = kstrdup(event, GFP_KERNEL); -	if (!call->name) +	tp->event = kzalloc(sizeof(struct trace_probe_event), GFP_KERNEL); +	if (!tp->event)  		return -ENOMEM; -	tp->class.system = kstrdup(group, GFP_KERNEL); -	if (!tp->class.system) { -		kfree(call->name); -		call->name = NULL; -		return -ENOMEM; +	INIT_LIST_HEAD(&tp->event->files); +	INIT_LIST_HEAD(&tp->event->class.fields); +	INIT_LIST_HEAD(&tp->event->probes); +	INIT_LIST_HEAD(&tp->list); +	list_add(&tp->event->probes, &tp->list); + +	call = trace_probe_event_call(tp); +	call->class = &tp->event->class; +	call->name = kstrdup(event, GFP_KERNEL); +	if (!call->name) { +		ret = -ENOMEM; +		goto error; +	} + +	tp->event->class.system = kstrdup(group, GFP_KERNEL); +	if (!tp->event->class.system) { +		ret = -ENOMEM; +		goto error;  	} -	INIT_LIST_HEAD(&tp->files); -	INIT_LIST_HEAD(&tp->class.fields);  	
return 0; + +error: +	trace_probe_cleanup(tp); +	return ret;  }  int trace_probe_register_event_call(struct trace_probe *tp) @@ -952,7 +1049,7 @@ int trace_probe_add_file(struct trace_probe *tp, struct trace_event_file *file)  	link->file = file;  	INIT_LIST_HEAD(&link->list); -	list_add_tail_rcu(&link->list, &tp->files); +	list_add_tail_rcu(&link->list, &tp->event->files);  	trace_probe_set_flag(tp, TP_FLAG_TRACE);  	return 0;  } @@ -983,8 +1080,51 @@ int trace_probe_remove_file(struct trace_probe *tp,  	synchronize_rcu();  	kfree(link); -	if (list_empty(&tp->files)) +	if (list_empty(&tp->event->files))  		trace_probe_clear_flag(tp, TP_FLAG_TRACE);  	return 0;  } + +/* + * Return the smallest index of different type argument (start from 1). + * If all argument types and name are same, return 0. + */ +int trace_probe_compare_arg_type(struct trace_probe *a, struct trace_probe *b) +{ +	int i; + +	/* In case of more arguments */ +	if (a->nr_args < b->nr_args) +		return a->nr_args + 1; +	if (a->nr_args > b->nr_args) +		return b->nr_args + 1; + +	for (i = 0; i < a->nr_args; i++) { +		if ((b->nr_args <= i) || +		    ((a->args[i].type != b->args[i].type) || +		     (a->args[i].count != b->args[i].count) || +		     strcmp(a->args[i].name, b->args[i].name))) +			return i + 1; +	} + +	return 0; +} + +bool trace_probe_match_command_args(struct trace_probe *tp, +				    int argc, const char **argv) +{ +	char buf[MAX_ARGSTR_LEN + 1]; +	int i; + +	if (tp->nr_args < argc) +		return false; + +	for (i = 0; i < argc; i++) { +		snprintf(buf, sizeof(buf), "%s=%s", +			 tp->args[i].name, tp->args[i].comm); +		if (strcmp(buf, argv[i])) +			return false; +	} +	return true; +} diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index d1714820efe1..4ee703728aec 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -89,6 +89,7 @@ enum fetch_op {  	FETCH_OP_COMM,		/* Current comm */  	FETCH_OP_ARG,		/* Function argument : .param */  	FETCH_OP_FOFFS,		/* File offset: .immediate */ +	FETCH_OP_DATA,		/* Allocated data: .data */  	// Stage 2 (dereference) op  	FETCH_OP_DEREF,		/* Dereference: .offset */  	FETCH_OP_UDEREF,	/* User-space Dereference: .offset */ @@ -222,11 +223,18 @@ struct probe_arg {  	const struct fetch_type	*type;	/* Type of this argument */  }; -struct trace_probe { +/* Event call and class holder */ +struct trace_probe_event {  	unsigned int			flags;	/* For TP_FLAG_* */  	struct trace_event_class	class;  	struct trace_event_call		call;  	struct list_head 		files; +	struct list_head		probes; +}; + +struct trace_probe { +	struct list_head		list; +	struct trace_probe_event	*event;  	ssize_t				size;	/* trace entry size */  	unsigned int			nr_args;  	struct probe_arg		args[]; @@ -240,19 +248,19 @@ struct event_file_link {  static inline bool trace_probe_test_flag(struct trace_probe *tp,  					 unsigned int flag)  { -	return !!(tp->flags & flag); +	return !!(tp->event->flags & flag);  }  static inline void trace_probe_set_flag(struct trace_probe *tp,  					unsigned int flag)  { -	tp->flags |= flag; +	tp->event->flags |= flag;  }  static inline void trace_probe_clear_flag(struct trace_probe *tp,  					  unsigned int flag)  { -	tp->flags &= ~flag; +	tp->event->flags &= ~flag;  }  static inline bool trace_probe_is_enabled(struct trace_probe *tp) @@ -262,45 +270,76 @@ static inline bool trace_probe_is_enabled(struct trace_probe *tp)  static inline const char *trace_probe_name(struct trace_probe *tp)  { -	return trace_event_name(&tp->call); +	return 
trace_event_name(&tp->event->call);  }  static inline const char *trace_probe_group_name(struct trace_probe *tp)  { -	return tp->call.class->system; +	return tp->event->call.class->system;  }  static inline struct trace_event_call *  	trace_probe_event_call(struct trace_probe *tp)  { -	return &tp->call; +	return &tp->event->call; +} + +static inline struct trace_probe_event * +trace_probe_event_from_call(struct trace_event_call *event_call) +{ +	return container_of(event_call, struct trace_probe_event, call); +} + +static inline struct trace_probe * +trace_probe_primary_from_call(struct trace_event_call *call) +{ +	struct trace_probe_event *tpe = trace_probe_event_from_call(call); + +	return list_first_entry(&tpe->probes, struct trace_probe, list); +} + +static inline struct list_head *trace_probe_probe_list(struct trace_probe *tp) +{ +	return &tp->event->probes; +} + +static inline bool trace_probe_has_sibling(struct trace_probe *tp) +{ +	struct list_head *list = trace_probe_probe_list(tp); + +	return !list_empty(list) && !list_is_singular(list);  }  static inline int trace_probe_unregister_event_call(struct trace_probe *tp)  {  	/* tp->event is unregistered in trace_remove_event_call() */ -	return trace_remove_event_call(&tp->call); +	return trace_remove_event_call(&tp->event->call);  }  static inline bool trace_probe_has_single_file(struct trace_probe *tp)  { -	return !!list_is_singular(&tp->files); +	return !!list_is_singular(&tp->event->files);  }  int trace_probe_init(struct trace_probe *tp, const char *event,  		     const char *group);  void trace_probe_cleanup(struct trace_probe *tp); +int trace_probe_append(struct trace_probe *tp, struct trace_probe *to); +void trace_probe_unlink(struct trace_probe *tp);  int trace_probe_register_event_call(struct trace_probe *tp);  int trace_probe_add_file(struct trace_probe *tp, struct trace_event_file *file);  int trace_probe_remove_file(struct trace_probe *tp,  			    struct trace_event_file *file);  struct event_file_link *trace_probe_get_file_link(struct trace_probe *tp,  						struct trace_event_file *file); +int trace_probe_compare_arg_type(struct trace_probe *a, struct trace_probe *b); +bool trace_probe_match_command_args(struct trace_probe *tp, +				    int argc, const char **argv);  #define trace_probe_for_each_link(pos, tp)	\ -	list_for_each_entry(pos, &(tp)->files, list) +	list_for_each_entry(pos, &(tp)->event->files, list)  #define trace_probe_for_each_link_rcu(pos, tp)	\ -	list_for_each_entry_rcu(pos, &(tp)->files, list) +	list_for_each_entry_rcu(pos, &(tp)->event->files, list)  /* Check the name is good for event/group/fields */  static inline bool is_good_name(const char *name) @@ -370,6 +409,8 @@ extern int traceprobe_define_arg_fields(struct trace_event_call *event_call,  	C(BAD_VAR,		"Invalid $-valiable specified"),	\  	C(BAD_REG_NAME,		"Invalid register name"),		\  	C(BAD_MEM_ADDR,		"Invalid memory address"),		\ +	C(BAD_IMM,		"Invalid immediate value"),		\ +	C(IMMSTR_NO_CLOSE,	"String is not closed with '\"'"),	\  	C(FILE_ON_KPROBE,	"File offset is not available with kprobe"), \  	C(BAD_FILE_OFFS,	"Invalid file offset value"),		\  	C(SYM_ON_UPROBE,	"Symbol is not available with uprobe"),	\ @@ -393,7 +434,10 @@ extern int traceprobe_define_arg_fields(struct trace_event_call *event_call,  	C(ARG_TOO_LONG,		"Argument expression is too long"),	\  	C(NO_ARG_BODY,		"No argument expression"),		\  	C(BAD_INSN_BNDRY,	"Probe point is not an instruction boundary"),\ -	C(FAIL_REG_PROBE,	"Failed to register probe event"), +	
C(FAIL_REG_PROBE,	"Failed to register probe event"),\ +	C(DIFF_PROBE_TYPE,	"Probe type is different from existing probe"),\ +	C(DIFF_ARG_TYPE,	"Argument type or name is different from existing probe"),\ +	C(SAME_PROBE,		"There is already the exact same probe event"),  #undef C  #define C(a, b)		TP_ERR_##a diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 743b2b520d34..5e43b9664eca 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -579,8 +579,7 @@ probe_wakeup(void *ignore, struct task_struct *p)  	else  		tracing_dl = 0; -	wakeup_task = p; -	get_task_struct(wakeup_task); +	wakeup_task = get_task_struct(p);  	local_save_flags(flags); diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 5d16f73898db..ec9a34a97129 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -53,6 +53,104 @@ static void print_max_stack(void)  	}  } +/* + * The stack tracer looks for a maximum stack at each call from a function. It + * registers a callback from ftrace, and in that callback it examines the stack + * size. It determines the stack size from the variable passed in, which is the + * address of a local variable in the stack_trace_call() callback function. + * The stack size is calculated by the address of the local variable to the top + * of the current stack. If that size is smaller than the currently saved max + * stack size, nothing more is done. + * + * If the size of the stack is greater than the maximum recorded size, then the + * following algorithm takes place. + * + * For architectures (like x86) that store the function's return address before + * saving the function's local variables, the stack will look something like + * this: + * + *   [ top of stack ] + *    0: sys call entry frame + *   10: return addr to entry code + *   11: start of sys_foo frame + *   20: return addr to sys_foo + *   21: start of kernel_func_bar frame + *   30: return addr to kernel_func_bar + *   31: [ do trace stack here ] + * + * The save_stack_trace() is called returning all the functions it finds in the + * current stack. Which would be (from the bottom of the stack to the top): + * + *   return addr to kernel_func_bar + *   return addr to sys_foo + *   return addr to entry code + * + * Now to figure out how much each of these functions' local variable size is, + * a search of the stack is made to find these values. When a match is made, it + * is added to the stack_dump_trace[] array. The offset into the stack is saved + * in the stack_trace_index[] array. The above example would show: + * + *        stack_dump_trace[]        |   stack_trace_index[] + *        ------------------        +   ------------------- + *  return addr to kernel_func_bar  |          30 + *  return addr to sys_foo          |          20 + *  return addr to entry            |          10 + * + * The print_max_stack() function above, uses these values to print the size of + * each function's portion of the stack. + * + *  for (i = 0; i < nr_entries; i++) { + *     size = i == nr_entries - 1 ? 
stack_trace_index[i] : + *                    stack_trace_index[i] - stack_trace_index[i+1] + *     print "%d %d %d %s\n", i, stack_trace_index[i], size, stack_dump_trace[i]); + *  } + * + * The above shows + * + *     depth size  location + *     ----- ----  -------- + *  0    30   10   kernel_func_bar + *  1    20   10   sys_foo + *  2    10   10   entry code + * + * Now for architectures that might save the return address after the functions + * local variables (saving the link register before calling nested functions), + * this will cause the stack to look a little different: + * + * [ top of stack ] + *  0: sys call entry frame + * 10: start of sys_foo_frame + * 19: return addr to entry code << lr saved before calling kernel_func_bar + * 20: start of kernel_func_bar frame + * 29: return addr to sys_foo_frame << lr saved before calling next function + * 30: [ do trace stack here ] + * + * Although the functions returned by save_stack_trace() may be the same, the + * placement in the stack will be different. Using the same algorithm as above + * would yield: + * + *        stack_dump_trace[]        |   stack_trace_index[] + *        ------------------        +   ------------------- + *  return addr to kernel_func_bar  |          30 + *  return addr to sys_foo          |          29 + *  return addr to entry            |          19 + * + * Where the mapping is off by one: + * + *   kernel_func_bar stack frame size is 29 - 19 not 30 - 29! + * + * To fix this, if the architecture sets ARCH_RET_ADDR_AFTER_LOCAL_VARS the + * values in stack_trace_index[] are shifted by one to and the number of + * stack trace entries is decremented by one. + * + *        stack_dump_trace[]        |   stack_trace_index[] + *        ------------------        +   ------------------- + *  return addr to kernel_func_bar  |          29 + *  return addr to sys_foo          |          19 + * + * Although the entry function is not displayed, the first function (sys_foo) + * will still include the stack size of it. + */  static void check_stack(unsigned long ip, unsigned long *stack)  {  	unsigned long this_size, flags; unsigned long *p, *top, *start; @@ -158,6 +256,20 @@ static void check_stack(unsigned long ip, unsigned long *stack)  			i++;  	} +#ifdef ARCH_FTRACE_SHIFT_STACK_TRACER +	/* +	 * Some archs will store the link register before calling +	 * nested functions. This means the saved return address +	 * comes after the local storage, and we need to shift +	 * for that. 
+	 */ +	if (x > 1) { +		memmove(&stack_trace_index[0], &stack_trace_index[1], +			sizeof(stack_trace_index[0]) * (x - 1)); +		x--; +	} +#endif +  	stack_trace_nr_entries = x;  	if (task_stack_end_corrupted(current)) { diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 1ceedb9146b1..dd884341f5c5 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -44,7 +44,7 @@ static int trace_uprobe_show(struct seq_file *m, struct dyn_event *ev);  static int trace_uprobe_release(struct dyn_event *ev);  static bool trace_uprobe_is_busy(struct dyn_event *ev);  static bool trace_uprobe_match(const char *system, const char *event, -			       struct dyn_event *ev); +			int argc, const char **argv, struct dyn_event *ev);  static struct dyn_event_operations trace_uprobe_ops = {  	.create = trace_uprobe_create, @@ -248,6 +248,9 @@ process_fetch_insn(struct fetch_insn *code, struct pt_regs *regs, void *dest,  	case FETCH_OP_COMM:  		val = FETCH_TOKEN_COMM;  		break; +	case FETCH_OP_DATA: +		val = (unsigned long)code->data; +		break;  	case FETCH_OP_FOFFS:  		val = translate_user_vaddr(code->immediate);  		break; @@ -284,13 +287,54 @@ static bool trace_uprobe_is_busy(struct dyn_event *ev)  	return trace_probe_is_enabled(&tu->tp);  } +static bool trace_uprobe_match_command_head(struct trace_uprobe *tu, +					    int argc, const char **argv) +{ +	char buf[MAX_ARGSTR_LEN + 1]; +	int len; + +	if (!argc) +		return true; + +	len = strlen(tu->filename); +	if (strncmp(tu->filename, argv[0], len) || argv[0][len] != ':') +		return false; + +	if (tu->ref_ctr_offset == 0) +		snprintf(buf, sizeof(buf), "0x%0*lx", +				(int)(sizeof(void *) * 2), tu->offset); +	else +		snprintf(buf, sizeof(buf), "0x%0*lx(0x%lx)", +				(int)(sizeof(void *) * 2), tu->offset, +				tu->ref_ctr_offset); +	if (strcmp(buf, &argv[0][len + 1])) +		return false; + +	argc--; argv++; + +	return trace_probe_match_command_args(&tu->tp, argc, argv); +} +  static bool trace_uprobe_match(const char *system, const char *event, -			       struct dyn_event *ev) +			int argc, const char **argv, struct dyn_event *ev)  {  	struct trace_uprobe *tu = to_trace_uprobe(ev);  	return strcmp(trace_probe_name(&tu->tp), event) == 0 && -	    (!system || strcmp(trace_probe_group_name(&tu->tp), system) == 0); +	   (!system || strcmp(trace_probe_group_name(&tu->tp), system) == 0) && +	   trace_uprobe_match_command_head(tu, argc, argv); +} + +static nokprobe_inline struct trace_uprobe * +trace_uprobe_primary_from_call(struct trace_event_call *call) +{ +	struct trace_probe *tp; + +	tp = trace_probe_primary_from_call(call); +	if (WARN_ON_ONCE(!tp)) +		return NULL; + +	return container_of(tp, struct trace_uprobe, tp);  }  /* @@ -352,15 +396,76 @@ static int unregister_trace_uprobe(struct trace_uprobe *tu)  {  	int ret; +	if (trace_probe_has_sibling(&tu->tp)) +		goto unreg; +  	ret = unregister_uprobe_event(tu);  	if (ret)  		return ret; +unreg:  	dyn_event_remove(&tu->devent); +	trace_probe_unlink(&tu->tp);  	free_trace_uprobe(tu);  	return 0;  } +static bool trace_uprobe_has_same_uprobe(struct trace_uprobe *orig, +					 struct trace_uprobe *comp) +{ +	struct trace_probe_event *tpe = orig->tp.event; +	struct trace_probe *pos; +	struct inode *comp_inode = d_real_inode(comp->path.dentry); +	int i; + +	list_for_each_entry(pos, &tpe->probes, list) { +		orig = container_of(pos, struct trace_uprobe, tp); +		if (comp_inode != d_real_inode(orig->path.dentry) || +		    comp->offset != orig->offset) +			continue; + +		/* +		 * 
trace_probe_compare_arg_type() ensured that nr_args and +		 * each argument name and type are same. Let's compare comm. +		 */ +		for (i = 0; i < orig->tp.nr_args; i++) { +			if (strcmp(orig->tp.args[i].comm, +				   comp->tp.args[i].comm)) +				break; +		} + +		if (i == orig->tp.nr_args) +			return true; +	} + +	return false; +} + +static int append_trace_uprobe(struct trace_uprobe *tu, struct trace_uprobe *to) +{ +	int ret; + +	ret = trace_probe_compare_arg_type(&tu->tp, &to->tp); +	if (ret) { +		/* Note that argument starts index = 2 */ +		trace_probe_log_set_index(ret + 1); +		trace_probe_log_err(0, DIFF_ARG_TYPE); +		return -EEXIST; +	} +	if (trace_uprobe_has_same_uprobe(to, tu)) { +		trace_probe_log_set_index(0); +		trace_probe_log_err(0, SAME_PROBE); +		return -EEXIST; +	} + +	/* Append to existing event */ +	ret = trace_probe_append(&tu->tp, &to->tp); +	if (!ret) +		dyn_event_add(&tu->devent); + +	return ret; +} +  /*   * Uprobe with multiple reference counter is not allowed. i.e.   * If inode and offset matches, reference counter offset *must* @@ -370,25 +475,21 @@ static int unregister_trace_uprobe(struct trace_uprobe *tu)   * as the new one does not conflict with any other existing   * ones.   */ -static struct trace_uprobe *find_old_trace_uprobe(struct trace_uprobe *new) +static int validate_ref_ctr_offset(struct trace_uprobe *new)  {  	struct dyn_event *pos; -	struct trace_uprobe *tmp, *old = NULL; +	struct trace_uprobe *tmp;  	struct inode *new_inode = d_real_inode(new->path.dentry); -	old = find_probe_event(trace_probe_name(&new->tp), -				trace_probe_group_name(&new->tp)); -  	for_each_trace_uprobe(tmp, pos) { -		if ((old ? old != tmp : true) && -		    new_inode == d_real_inode(tmp->path.dentry) && +		if (new_inode == d_real_inode(tmp->path.dentry) &&  		    new->offset == tmp->offset &&  		    new->ref_ctr_offset != tmp->ref_ctr_offset) {  			pr_warn("Reference counter offset mismatch."); -			return ERR_PTR(-EINVAL); +			return -EINVAL;  		}  	} -	return old; +	return 0;  }  /* Register a trace_uprobe and probe_event */ @@ -399,18 +500,22 @@ static int register_trace_uprobe(struct trace_uprobe *tu)  	mutex_lock(&event_mutex); -	/* register as an event */ -	old_tu = find_old_trace_uprobe(tu); -	if (IS_ERR(old_tu)) { -		ret = PTR_ERR(old_tu); +	ret = validate_ref_ctr_offset(tu); +	if (ret)  		goto end; -	} +	/* register as an event */ +	old_tu = find_probe_event(trace_probe_name(&tu->tp), +				  trace_probe_group_name(&tu->tp));  	if (old_tu) { -		/* delete old event */ -		ret = unregister_trace_uprobe(old_tu); -		if (ret) -			goto end; +		if (is_ret_probe(tu) != is_ret_probe(old_tu)) { +			trace_probe_log_set_index(0); +			trace_probe_log_err(0, DIFF_PROBE_TYPE); +			ret = -EEXIST; +		} else { +			ret = append_trace_uprobe(tu, old_tu); +		} +		goto end;  	}  	ret = register_uprobe_event(tu); @@ -897,7 +1002,10 @@ print_uprobe_event(struct trace_iterator *iter, int flags, struct trace_event *e  	u8 *data;  	entry = (struct uprobe_trace_entry_head *)iter->ent; -	tu = container_of(event, struct trace_uprobe, tp.call.event); +	tu = trace_uprobe_primary_from_call( +		container_of(event, struct trace_event_call, event)); +	if (unlikely(!tu)) +		goto out;  	if (is_ret_probe(tu)) {  		trace_seq_printf(s, "%s: (0x%lx <- 0x%lx)", @@ -924,27 +1032,71 @@ typedef bool (*filter_func_t)(struct uprobe_consumer *self,  				enum uprobe_filter_ctx ctx,  				struct mm_struct *mm); -static int -probe_event_enable(struct trace_uprobe *tu, struct trace_event_file *file, -		   filter_func_t 
filter) +static int trace_uprobe_enable(struct trace_uprobe *tu, filter_func_t filter)  { -	bool enabled = trace_probe_is_enabled(&tu->tp);  	int ret; +	tu->consumer.filter = filter; +	tu->inode = d_real_inode(tu->path.dentry); + +	if (tu->ref_ctr_offset) +		ret = uprobe_register_refctr(tu->inode, tu->offset, +				tu->ref_ctr_offset, &tu->consumer); +	else +		ret = uprobe_register(tu->inode, tu->offset, &tu->consumer); + +	if (ret) +		tu->inode = NULL; + +	return ret; +} + +static void __probe_event_disable(struct trace_probe *tp) +{ +	struct trace_probe *pos; +	struct trace_uprobe *tu; + +	list_for_each_entry(pos, trace_probe_probe_list(tp), list) { +		tu = container_of(pos, struct trace_uprobe, tp); +		if (!tu->inode) +			continue; + +		WARN_ON(!uprobe_filter_is_empty(&tu->filter)); + +		uprobe_unregister(tu->inode, tu->offset, &tu->consumer); +		tu->inode = NULL; +	} +} + +static int probe_event_enable(struct trace_event_call *call, +			struct trace_event_file *file, filter_func_t filter) +{ +	struct trace_probe *pos, *tp; +	struct trace_uprobe *tu; +	bool enabled; +	int ret; + +	tp = trace_probe_primary_from_call(call); +	if (WARN_ON_ONCE(!tp)) +		return -ENODEV; +	enabled = trace_probe_is_enabled(tp); + +	/* This may also change "enabled" state */  	if (file) { -		if (trace_probe_test_flag(&tu->tp, TP_FLAG_PROFILE)) +		if (trace_probe_test_flag(tp, TP_FLAG_PROFILE))  			return -EINTR; -		ret = trace_probe_add_file(&tu->tp, file); +		ret = trace_probe_add_file(tp, file);  		if (ret < 0)  			return ret;  	} else { -		if (trace_probe_test_flag(&tu->tp, TP_FLAG_TRACE)) +		if (trace_probe_test_flag(tp, TP_FLAG_TRACE))  			return -EINTR; -		trace_probe_set_flag(&tu->tp, TP_FLAG_PROFILE); +		trace_probe_set_flag(tp, TP_FLAG_PROFILE);  	} +	tu = container_of(tp, struct trace_uprobe, tp);  	WARN_ON(!uprobe_filter_is_empty(&tu->filter));  	if (enabled) @@ -954,18 +1106,15 @@ probe_event_enable(struct trace_uprobe *tu, struct trace_event_file *file,  	if (ret)  		goto err_flags; -	tu->consumer.filter = filter; -	tu->inode = d_real_inode(tu->path.dentry); -	if (tu->ref_ctr_offset) { -		ret = uprobe_register_refctr(tu->inode, tu->offset, -				tu->ref_ctr_offset, &tu->consumer); -	} else { -		ret = uprobe_register(tu->inode, tu->offset, &tu->consumer); +	list_for_each_entry(pos, trace_probe_probe_list(tp), list) { +		tu = container_of(pos, struct trace_uprobe, tp); +		ret = trace_uprobe_enable(tu, filter); +		if (ret) { +			__probe_event_disable(tp); +			goto err_buffer; +		}  	} -	if (ret) -		goto err_buffer; -  	return 0;   err_buffer: @@ -973,33 +1122,35 @@ probe_event_enable(struct trace_uprobe *tu, struct trace_event_file *file,   err_flags:  	if (file) -		trace_probe_remove_file(&tu->tp, file); +		trace_probe_remove_file(tp, file);  	else -		trace_probe_clear_flag(&tu->tp, TP_FLAG_PROFILE); +		trace_probe_clear_flag(tp, TP_FLAG_PROFILE);  	return ret;  } -static void -probe_event_disable(struct trace_uprobe *tu, struct trace_event_file *file) +static void probe_event_disable(struct trace_event_call *call, +				struct trace_event_file *file)  { -	if (!trace_probe_is_enabled(&tu->tp)) +	struct trace_probe *tp; + +	tp = trace_probe_primary_from_call(call); +	if (WARN_ON_ONCE(!tp)) +		return; + +	if (!trace_probe_is_enabled(tp))  		return;  	if (file) { -		if (trace_probe_remove_file(&tu->tp, file) < 0) +		if (trace_probe_remove_file(tp, file) < 0)  			return; -		if (trace_probe_is_enabled(&tu->tp)) +		if (trace_probe_is_enabled(tp))  			return;  	} else -		trace_probe_clear_flag(&tu->tp, 
TP_FLAG_PROFILE); - -	WARN_ON(!uprobe_filter_is_empty(&tu->filter)); - -	uprobe_unregister(tu->inode, tu->offset, &tu->consumer); -	tu->inode = NULL; +		trace_probe_clear_flag(tp, TP_FLAG_PROFILE); +	__probe_event_disable(tp);  	uprobe_buffer_disable();  } @@ -1007,7 +1158,11 @@ static int uprobe_event_define_fields(struct trace_event_call *event_call)  {  	int ret, size;  	struct uprobe_trace_entry_head field; -	struct trace_uprobe *tu = event_call->data; +	struct trace_uprobe *tu; + +	tu = trace_uprobe_primary_from_call(event_call); +	if (unlikely(!tu)) +		return -ENODEV;  	if (is_ret_probe(tu)) {  		DEFINE_FIELD(unsigned long, vaddr[0], FIELD_STRING_FUNC, 0); @@ -1100,6 +1255,27 @@ static int uprobe_perf_open(struct trace_uprobe *tu, struct perf_event *event)  	return err;  } +static int uprobe_perf_multi_call(struct trace_event_call *call, +				  struct perf_event *event, +		int (*op)(struct trace_uprobe *tu, struct perf_event *event)) +{ +	struct trace_probe *pos, *tp; +	struct trace_uprobe *tu; +	int ret = 0; + +	tp = trace_probe_primary_from_call(call); +	if (WARN_ON_ONCE(!tp)) +		return -ENODEV; + +	list_for_each_entry(pos, trace_probe_probe_list(tp), list) { +		tu = container_of(pos, struct trace_uprobe, tp); +		ret = op(tu, event); +		if (ret) +			break; +	} + +	return ret; +}  static bool uprobe_perf_filter(struct uprobe_consumer *uc,  				enum uprobe_filter_ctx ctx, struct mm_struct *mm)  { @@ -1213,30 +1389,29 @@ static int  trace_uprobe_register(struct trace_event_call *event, enum trace_reg type,  		      void *data)  { -	struct trace_uprobe *tu = event->data;  	struct trace_event_file *file = data;  	switch (type) {  	case TRACE_REG_REGISTER: -		return probe_event_enable(tu, file, NULL); +		return probe_event_enable(event, file, NULL);  	case TRACE_REG_UNREGISTER: -		probe_event_disable(tu, file); +		probe_event_disable(event, file);  		return 0;  #ifdef CONFIG_PERF_EVENTS  	case TRACE_REG_PERF_REGISTER: -		return probe_event_enable(tu, NULL, uprobe_perf_filter); +		return probe_event_enable(event, NULL, uprobe_perf_filter);  	case TRACE_REG_PERF_UNREGISTER: -		probe_event_disable(tu, NULL); +		probe_event_disable(event, NULL);  		return 0;  	case TRACE_REG_PERF_OPEN: -		return uprobe_perf_open(tu, data); +		return uprobe_perf_multi_call(event, data, uprobe_perf_open);  	case TRACE_REG_PERF_CLOSE: -		return uprobe_perf_close(tu, data); +		return uprobe_perf_multi_call(event, data, uprobe_perf_close);  #endif  	default: @@ -1330,7 +1505,6 @@ static inline void init_trace_event_call(struct trace_uprobe *tu)  	call->flags = TRACE_EVENT_FL_UPROBE | TRACE_EVENT_FL_CAP_ANY;  	call->class->reg = trace_uprobe_register; -	call->data = tu;  }  static int register_uprobe_event(struct trace_uprobe *tu) @@ -1399,7 +1573,7 @@ void destroy_local_trace_uprobe(struct trace_event_call *event_call)  {  	struct trace_uprobe *tu; -	tu = container_of(event_call, struct trace_uprobe, tp.call); +	tu = trace_uprobe_primary_from_call(event_call);  	free_trace_uprobe(tu);  } diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 7f9e7b9306fe..f41334ef0971 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -490,10 +490,10 @@ static void watchdog_enable(unsigned int cpu)  	 * Start the timer first to prevent the NMI watchdog triggering  	 * before the timer has a chance to fire.  	 
*/ -	hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); +	hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);  	hrtimer->function = watchdog_timer_fn;  	hrtimer_start(hrtimer, ns_to_ktime(sample_period), -		      HRTIMER_MODE_REL_PINNED); +		      HRTIMER_MODE_REL_PINNED_HARD);  	/* Initialize timestamp */  	__touch_watchdog(); diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 601d61150b65..bc2e09a8ea61 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -3329,7 +3329,7 @@ EXPORT_SYMBOL_GPL(execute_in_process_context);   *   * Undo alloc_workqueue_attrs().   */ -static void free_workqueue_attrs(struct workqueue_attrs *attrs) +void free_workqueue_attrs(struct workqueue_attrs *attrs)  {  	if (attrs) {  		free_cpumask_var(attrs->cpumask); @@ -3345,7 +3345,7 @@ static void free_workqueue_attrs(struct workqueue_attrs *attrs)   *   * Return: The allocated new workqueue_attr on success. %NULL on failure.   */ -static struct workqueue_attrs *alloc_workqueue_attrs(void) +struct workqueue_attrs *alloc_workqueue_attrs(void)  {  	struct workqueue_attrs *attrs; @@ -4030,16 +4030,20 @@ static int apply_workqueue_attrs_locked(struct workqueue_struct *wq,   *   * Performs GFP_KERNEL allocations.   * + * Assumes caller has CPU hotplug read exclusion, i.e. get_online_cpus(). + *   * Return: 0 on success and -errno on failure.   */ -static int apply_workqueue_attrs(struct workqueue_struct *wq, +int apply_workqueue_attrs(struct workqueue_struct *wq,  			  const struct workqueue_attrs *attrs)  {  	int ret; -	apply_wqattrs_lock(); +	lockdep_assert_cpus_held(); + +	mutex_lock(&wq_pool_mutex);  	ret = apply_workqueue_attrs_locked(wq, attrs); -	apply_wqattrs_unlock(); +	mutex_unlock(&wq_pool_mutex);  	return ret;  } @@ -4152,16 +4156,21 @@ static int alloc_and_link_pwqs(struct workqueue_struct *wq)  			mutex_unlock(&wq->mutex);  		}  		return 0; -	} else if (wq->flags & __WQ_ORDERED) { +	} + +	get_online_cpus(); +	if (wq->flags & __WQ_ORDERED) {  		ret = apply_workqueue_attrs(wq, ordered_wq_attrs[highpri]);  		/* there should only be single pwq for ordering guarantee */  		WARN(!ret && (wq->pwqs.next != &wq->dfl_pwq->pwqs_node ||  			      wq->pwqs.prev != &wq->dfl_pwq->pwqs_node),  		     "ordering guarantee broken for workqueue %s\n", wq->name); -		return ret;  	} else { -		return apply_workqueue_attrs(wq, unbound_std_wq_attrs[highpri]); +		ret = apply_workqueue_attrs(wq, unbound_std_wq_attrs[highpri]);  	} +	put_online_cpus(); + +	return ret;  }  static int wq_clamp_max_active(int max_active, unsigned int flags,  | 
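The trace_kprobe and trace_probe hunks above allow several probes to share a single event (same event name, same probe type, matching argument names and types), add immediate-value (\imm) and immediate-string (\"str") fetchargs, and extend deletion so that trailing arguments select which probe of a multi-probe event to remove. The following userspace sketch exercises that interface; the tracefs mount point and the probed symbols (_do_fork, fork_idle) are illustrative assumptions that depend on the running kernel, not something taken from the patch.

/*
 * Sketch only: exercises the multi-probe, immediate-argument and
 * argument-aware deletion behaviour added above. The tracefs path and
 * the probed symbols are assumptions about the running system.
 */
#include <stdio.h>

int main(void)
{
	/* May be /sys/kernel/debug/tracing/kprobe_events on older setups. */
	FILE *f = fopen("/sys/kernel/tracing/kprobe_events", "a");

	if (!f)
		return 1;

	/* Define an event, then append a second probe to the same event;
	 * append_trace_kprobe() accepts it because the probe type and the
	 * (empty) argument lists match. */
	fprintf(f, "p:mygroup/fork _do_fork\n");
	fprintf(f, "p:mygroup/fork fork_idle\n");

	/* New immediate fetchargs: a constant value and a constant string. */
	fprintf(f, "p:mygroup/tagged _do_fork tag=\\1 label=\\\"demo\"\n");

	/* Deletion now matches trailing arguments, so only the fork_idle
	 * probe is removed from the multi-probe event. */
	fprintf(f, "-:mygroup/fork fork_idle\n");

	fclose(f);
	return 0;
}

Each command line is handled separately by create_or_delete_trace_kprobe(), and a rejected append (mismatched argument types, or an exact duplicate) is reported through the new DIFF_ARG_TYPE and SAME_PROBE entries in the probe error log.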
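The trace_events_hist.c hunks above teach synthetic events about the gfp_t type and decode such fields with the gfp flag names when printing. A small sketch of defining an event with a gfp_t field follows; the event and field names are illustrative, and the hist trigger that would actually generate records is omitted.

/* Sketch only: define a synthetic event that carries a gfp_t field. */
#include <stdio.h>

int main(void)
{
	/* May be /sys/kernel/debug/tracing/synthetic_events instead. */
	FILE *f = fopen("/sys/kernel/tracing/synthetic_events", "a");

	if (!f)
		return 1;

	/* gfp_t is now a recognized field type; when a record is emitted
	 * (typically from a hist-trigger onmatch action), the value is
	 * printed as hex followed by the decoded flag names, such as
	 * "(GFP_KERNEL)". */
	fprintf(f, "alloc_sample u64 bytes; gfp_t flags\n");

	fclose(f);
	return 0;
}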
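The workqueue.c hunk above un-statics alloc_workqueue_attrs(), free_workqueue_attrs() and apply_workqueue_attrs(), and moves CPU-hotplug exclusion to the caller (lockdep_assert_cpus_held() replaces apply_wqattrs_lock()). A sketch of what a built-in caller is now expected to do; the helper name and the cpumask parameter are assumptions for illustration only.

/* Sketch only: hypothetical built-in code applying attrs to an unbound wq. */
#include <linux/workqueue.h>
#include <linux/cpu.h>

static int example_set_wq_cpumask(struct workqueue_struct *wq,
				  const struct cpumask *mask)
{
	struct workqueue_attrs *attrs;
	int ret;

	attrs = alloc_workqueue_attrs();	/* no longer static, per the hunk */
	if (!attrs)
		return -ENOMEM;

	cpumask_copy(attrs->cpumask, mask);

	/* The caller now provides CPU hotplug read exclusion. */
	get_online_cpus();
	ret = apply_workqueue_attrs(wq, attrs);
	put_online_cpus();

	free_workqueue_attrs(attrs);
	return ret;
}

This mirrors the comment added to apply_workqueue_attrs(): the function asserts that the hotplug lock is held instead of taking it itself, so callers that already hold it do not deadlock.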

