Diffstat (limited to 'kernel')
251 files changed, 9838 insertions, 4827 deletions
diff --git a/kernel/.gitignore b/kernel/.gitignore index 6e699100872f..34d1e77ee9df 100644 --- a/kernel/.gitignore +++ b/kernel/.gitignore @@ -1,5 +1,6 @@  #  # Generated files  # +kheaders.md5  timeconst.h  hz.bc diff --git a/kernel/Kconfig.freezer b/kernel/Kconfig.freezer index a3bb4cb52539..68646feefb3d 100644 --- a/kernel/Kconfig.freezer +++ b/kernel/Kconfig.freezer @@ -1,2 +1,3 @@ +# SPDX-License-Identifier: GPL-2.0-only  config FREEZER  	def_bool PM_SLEEP || CGROUP_FREEZER diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz index 2a202a846757..38ef6d06888e 100644 --- a/kernel/Kconfig.hz +++ b/kernel/Kconfig.hz @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0-only  #  # Timer Interrupt Frequency Configuration  # diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks index fbba478ae522..e0852dc333ac 100644 --- a/kernel/Kconfig.locks +++ b/kernel/Kconfig.locks @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0-only  #  # The ARCH_INLINE foo is necessary because select ignores "depends on"  # @@ -229,7 +230,7 @@ config MUTEX_SPIN_ON_OWNER  config RWSEM_SPIN_ON_OWNER         def_bool y -       depends on SMP && RWSEM_XCHGADD_ALGORITHM && ARCH_SUPPORTS_ATOMIC_RMW +       depends on SMP && ARCH_SUPPORTS_ATOMIC_RMW  config LOCK_SPIN_ON_OWNER         def_bool y @@ -251,3 +252,10 @@ config ARCH_USE_QUEUED_RWLOCKS  config QUEUED_RWLOCKS  	def_bool y if ARCH_USE_QUEUED_RWLOCKS  	depends on SMP + +config ARCH_HAS_MMIOWB +	bool + +config MMIOWB +	def_bool y if ARCH_HAS_MMIOWB +	depends on SMP diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt index 0fee5fe6c899..dc0b682ec2d9 100644 --- a/kernel/Kconfig.preempt +++ b/kernel/Kconfig.preempt @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0-only  choice  	prompt "Preemption Model" diff --git a/kernel/Makefile b/kernel/Makefile index 6c57e78817da..33824f0385b3 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -30,6 +30,7 @@ KCOV_INSTRUMENT_extable.o := n  # Don't self-instrument.  
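The Kconfig.locks hunk above introduces ARCH_HAS_MMIOWB and MMIOWB so that, on architectures selecting the former, the ordering previously requested with an explicit mmiowb() can instead be issued by the spinlock release path. A minimal sketch of the driver pattern this affects, assuming a hypothetical device with a doorbell register (only writel(), mmiowb() and the spinlock API are real kernel interfaces here):

#include <linux/io.h>
#include <linux/spinlock.h>

/* Hypothetical device; structure and register offset are illustrative. */
struct demo_dev {
	spinlock_t lock;
	void __iomem *regs;
};

static void demo_ring_doorbell(struct demo_dev *dev, u32 val)
{
	spin_lock(&dev->lock);
	writel(val, dev->regs + 0x10);	/* 0x10: made-up doorbell offset */
	/*
	 * Historically an explicit mmiowb() was needed here so the MMIO
	 * write could not leak past the unlock on weakly ordered SMP
	 * platforms; with CONFIG_MMIOWB (ARCH_HAS_MMIOWB && SMP) the
	 * equivalent barrier is expected to be issued by spin_unlock().
	 */
	spin_unlock(&dev->lock);
}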
KCOV_INSTRUMENT_kcov.o := n  KASAN_SANITIZE_kcov.o := n +CFLAGS_kcov.o := $(call cc-option, -fno-conserve-stack -fno-stack-protector)  # cond_syscall is currently not LTO compatible  CFLAGS_sys_ni.o = $(DISABLE_LTO) @@ -70,6 +71,7 @@ obj-$(CONFIG_UTS_NS) += utsname.o  obj-$(CONFIG_USER_NS) += user_namespace.o  obj-$(CONFIG_PID_NS) += pid_namespace.o  obj-$(CONFIG_IKCONFIG) += configs.o +obj-$(CONFIG_IKHEADERS_PROC) += kheaders.o  obj-$(CONFIG_SMP) += stop_machine.o  obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o  obj-$(CONFIG_AUDIT) += audit.o auditfilter.o @@ -121,3 +123,12 @@ $(obj)/configs.o: $(obj)/config_data.gz  targets += config_data.gz  $(obj)/config_data.gz: $(KCONFIG_CONFIG) FORCE  	$(call if_changed,gzip) + +$(obj)/kheaders.o: $(obj)/kheaders_data.tar.xz + +quiet_cmd_genikh = CHK     $(obj)/kheaders_data.tar.xz +cmd_genikh = $(CONFIG_SHELL) $(srctree)/kernel/gen_ikh_data.sh $@ +$(obj)/kheaders_data.tar.xz: FORCE +	$(call cmd,genikh) + +clean-files := kheaders_data.tar.xz kheaders.md5 diff --git a/kernel/acct.c b/kernel/acct.c index addf7732fb56..81f9831a7859 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -227,7 +227,7 @@ static int acct_on(struct filename *pathname)  		filp_close(file, NULL);  		return PTR_ERR(internal);  	} -	err = mnt_want_write(internal); +	err = __mnt_want_write(internal);  	if (err) {  		mntput(internal);  		kfree(acct); @@ -252,7 +252,7 @@ static int acct_on(struct filename *pathname)  	old = xchg(&ns->bacct, &acct->pin);  	mutex_unlock(&acct->lock);  	pin_kill(old); -	mnt_drop_write(mnt); +	__mnt_drop_write(mnt);  	mntput(mnt);  	return 0;  } diff --git a/kernel/async.c b/kernel/async.c index f6bd0d9885e1..12c332e4e13e 100644 --- a/kernel/async.c +++ b/kernel/async.c @@ -119,7 +119,7 @@ static void async_run_entry_fn(struct work_struct *work)  	/* 1) run (and print duration) */  	if (initcall_debug && system_state < SYSTEM_RUNNING) { -		pr_debug("calling  %lli_%pF @ %i\n", +		pr_debug("calling  %lli_%pS @ %i\n",  			(long long)entry->cookie,  			entry->func, task_pid_nr(current));  		calltime = ktime_get(); @@ -128,7 +128,7 @@ static void async_run_entry_fn(struct work_struct *work)  	if (initcall_debug && system_state < SYSTEM_RUNNING) {  		rettime = ktime_get();  		delta = ktime_sub(rettime, calltime); -		pr_debug("initcall %lli_%pF returned 0 after %lld usecs\n", +		pr_debug("initcall %lli_%pS returned 0 after %lld usecs\n",  			(long long)entry->cookie,  			entry->func,  			(long long)ktime_to_ns(delta) >> 10); diff --git a/kernel/audit.c b/kernel/audit.c index c89ea48c70a6..486c968214d9 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-or-later  /* audit.c -- Auditing support   * Gateway between the kernel (e.g., selinux) and the user-space audit daemon.   * System-call specific features have moved to auditsc.c @@ -5,20 +6,6 @@   * Copyright 2003-2007 Red Hat Inc., Durham, North Carolina.   * All Rights Reserved.   * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA - *   * Written by Rickard E. (Rik) Faith <faith@redhat.com>   *   * Goals: 1) Integrate fully with Security Modules. @@ -2220,7 +2207,7 @@ static void audit_log_set_loginuid(kuid_t koldloginuid, kuid_t kloginuid,  	if (!audit_enabled)  		return; -	ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_LOGIN); +	ab = audit_log_start(audit_context(), GFP_KERNEL, AUDIT_LOGIN);  	if (!ab)  		return; diff --git a/kernel/audit.h b/kernel/audit.h index 958d5b8fc1b3..6c076d4982da 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -1,22 +1,9 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */  /* audit -- definition of audit_context structure and supporting types    *   * Copyright 2003-2004 Red Hat, Inc.   * Copyright 2005 Hewlett-Packard Development Company, L.P.   * Copyright 2005 IBM Corporation - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA   */  #include <linux/fs.h> @@ -231,7 +218,7 @@ extern int audit_comparator(const u32 left, const u32 op, const u32 right);  extern int audit_uid_comparator(kuid_t left, u32 op, kuid_t right);  extern int audit_gid_comparator(kgid_t left, u32 op, kgid_t right);  extern int parent_len(const char *path); -extern int audit_compare_dname_path(const char *dname, const char *path, int plen); +extern int audit_compare_dname_path(const struct qstr *dname, const char *path, int plen);  extern struct sk_buff *audit_make_reply(int seq, int type, int done, int multi,  					const void *payload, int size);  extern void		    audit_panic(const char *message); diff --git a/kernel/audit_fsnotify.c b/kernel/audit_fsnotify.c index 37ae95cfb7f4..f0d243318452 100644 --- a/kernel/audit_fsnotify.c +++ b/kernel/audit_fsnotify.c @@ -1,18 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0-or-later  /* audit_fsnotify.c -- tracking inodes   *   * Copyright 2003-2009,2014-2015 Red Hat, Inc.   * Copyright 2005 Hewlett-Packard Development Company, L.P.   * Copyright 2005 IBM Corporation - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the - * GNU General Public License for more details.   
*/  #include <linux/kernel.h> @@ -164,7 +155,7 @@ static void audit_autoremove_mark_rule(struct audit_fsnotify_mark *audit_mark)  static int audit_mark_handle_event(struct fsnotify_group *group,  				    struct inode *to_tell,  				    u32 mask, const void *data, int data_type, -				    const unsigned char *dname, u32 cookie, +				    const struct qstr *dname, u32 cookie,  				    struct fsnotify_iter_info *iter_info)  {  	struct fsnotify_mark *inode_mark = fsnotify_iter_inode_mark(iter_info); diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index abfb112f26aa..e49c912f862d 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -1040,7 +1040,7 @@ static void evict_chunk(struct audit_chunk *chunk)  static int audit_tree_handle_event(struct fsnotify_group *group,  				   struct inode *to_tell,  				   u32 mask, const void *data, int data_type, -				   const unsigned char *file_name, u32 cookie, +				   const struct qstr *file_name, u32 cookie,  				   struct fsnotify_iter_info *iter_info)  {  	return 0; diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index e8d1adeb2223..1f31c2f1e6fc 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -1,22 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0-or-later  /* audit_watch.c -- watching inodes   *   * Copyright 2003-2009 Red Hat, Inc.   * Copyright 2005 Hewlett-Packard Development Company, L.P.   * Copyright 2005 IBM Corporation - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA   */  #include <linux/file.h> @@ -255,7 +242,7 @@ static void audit_watch_log_rule_change(struct audit_krule *r, struct audit_watc  /* Update inode info in audit rules based on filesystem event. */  static void audit_update_watch(struct audit_parent *parent, -			       const char *dname, dev_t dev, +			       const struct qstr *dname, dev_t dev,  			       unsigned long ino, unsigned invalidating)  {  	struct audit_watch *owatch, *nwatch, *nextw; @@ -482,7 +469,7 @@ void audit_remove_watch_rule(struct audit_krule *krule)  static int audit_watch_handle_event(struct fsnotify_group *group,  				    struct inode *to_tell,  				    u32 mask, const void *data, int data_type, -				    const unsigned char *dname, u32 cookie, +				    const struct qstr *dname, u32 cookie,  				    struct fsnotify_iter_info *iter_info)  {  	struct fsnotify_mark *inode_mark = fsnotify_iter_inode_mark(iter_info); diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 63f8b3f26fab..9f8e190e3bea 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -1,22 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0-or-later  /* auditfilter.c -- filtering of audit events   *   * Copyright 2003-2004 Red Hat, Inc.   * Copyright 2005 Hewlett-Packard Development Company, L.P.   
* Copyright 2005 IBM Corporation - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA   */  #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt @@ -1114,22 +1101,24 @@ int audit_rule_change(int type, int seq, void *data, size_t datasz)  	int err = 0;  	struct audit_entry *entry; -	entry = audit_data_to_entry(data, datasz); -	if (IS_ERR(entry)) -		return PTR_ERR(entry); -  	switch (type) {  	case AUDIT_ADD_RULE: +		entry = audit_data_to_entry(data, datasz); +		if (IS_ERR(entry)) +			return PTR_ERR(entry);  		err = audit_add_rule(entry);  		audit_log_rule_change("add_rule", &entry->rule, !err);  		break;  	case AUDIT_DEL_RULE: +		entry = audit_data_to_entry(data, datasz); +		if (IS_ERR(entry)) +			return PTR_ERR(entry);  		err = audit_del_rule(entry);  		audit_log_rule_change("remove_rule", &entry->rule, !err);  		break;  	default: -		err = -EINVAL;  		WARN_ON(1); +		return -EINVAL;  	}  	if (err || type == AUDIT_DEL_RULE) { @@ -1290,12 +1279,12 @@ int parent_len(const char *path)   * @parentlen:	length of the parent if known. Passing in AUDIT_NAME_FULL   * 		here indicates that we must compute this value.   
*/ -int audit_compare_dname_path(const char *dname, const char *path, int parentlen) +int audit_compare_dname_path(const struct qstr *dname, const char *path, int parentlen)  {  	int dlen, pathlen;  	const char *p; -	dlen = strlen(dname); +	dlen = dname->len;  	pathlen = strlen(path);  	if (pathlen < dlen)  		return 1; @@ -1306,7 +1295,7 @@ int audit_compare_dname_path(const char *dname, const char *path, int parentlen)  	p = path + parentlen; -	return strncmp(p, dname, dlen); +	return strncmp(p, dname->name, dlen);  }  int audit_filter(int msgtype, unsigned int listtype) @@ -1315,8 +1304,6 @@ int audit_filter(int msgtype, unsigned int listtype)  	int ret = 1; /* Audit by default */  	rcu_read_lock(); -	if (list_empty(&audit_filter_list[listtype])) -		goto unlock_and_return;  	list_for_each_entry_rcu(e, &audit_filter_list[listtype], list) {  		int i, result = 0; diff --git a/kernel/auditsc.c b/kernel/auditsc.c index d1eab1d4a930..95ae27edd417 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -771,15 +771,13 @@ static enum audit_state audit_filter_syscall(struct task_struct *tsk,  		return AUDIT_DISABLED;  	rcu_read_lock(); -	if (!list_empty(list)) { -		list_for_each_entry_rcu(e, list, list) { -			if (audit_in_mask(&e->rule, ctx->major) && -			    audit_filter_rules(tsk, &e->rule, ctx, NULL, -					       &state, false)) { -				rcu_read_unlock(); -				ctx->current_state = state; -				return state; -			} +	list_for_each_entry_rcu(e, list, list) { +		if (audit_in_mask(&e->rule, ctx->major) && +		    audit_filter_rules(tsk, &e->rule, ctx, NULL, +				       &state, false)) { +			rcu_read_unlock(); +			ctx->current_state = state; +			return state;  		}  	}  	rcu_read_unlock(); @@ -798,9 +796,6 @@ static int audit_filter_inode_name(struct task_struct *tsk,  	struct audit_entry *e;  	enum audit_state state; -	if (list_empty(list)) -		return 0; -  	list_for_each_entry_rcu(e, list, list) {  		if (audit_in_mask(&e->rule, ctx->major) &&  		    audit_filter_rules(tsk, &e->rule, ctx, n, &state, false)) { @@ -808,7 +803,6 @@ static int audit_filter_inode_name(struct task_struct *tsk,  			return 1;  		}  	} -  	return 0;  } @@ -840,6 +834,13 @@ static inline void audit_proctitle_free(struct audit_context *context)  	context->proctitle.len = 0;  } +static inline void audit_free_module(struct audit_context *context) +{ +	if (context->type == AUDIT_KERN_MODULE) { +		kfree(context->module.name); +		context->module.name = NULL; +	} +}  static inline void audit_free_names(struct audit_context *context)  {  	struct audit_names *n, *next; @@ -923,6 +924,7 @@ int audit_alloc(struct task_struct *tsk)  static inline void audit_free_context(struct audit_context *context)  { +	audit_free_module(context);  	audit_free_names(context);  	unroll_tree_refs(context, NULL, 0);  	free_tree_refs(context); @@ -1139,7 +1141,8 @@ out:  	kfree(buf_head);  } -void audit_log_cap(struct audit_buffer *ab, char *prefix, kernel_cap_t *cap) +static void audit_log_cap(struct audit_buffer *ab, char *prefix, +			  kernel_cap_t *cap)  {  	int i; @@ -1266,7 +1269,6 @@ static void show_special(struct audit_context *context, int *call_panic)  		audit_log_format(ab, "name=");  		if (context->module.name) {  			audit_log_untrustedstring(ab, context->module.name); -			kfree(context->module.name);  		} else  			audit_log_format(ab, "(null)"); @@ -1628,7 +1630,7 @@ void __audit_syscall_entry(int major, unsigned long a1, unsigned long a2,  			return;  	} -	context->arch	    = syscall_get_arch(); +	context->arch	    = syscall_get_arch(current);  	
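The auditfilter.c change above switches audit_compare_dname_path() from a bare string to a struct qstr, so the component length comes from the dcache entry (dname->len) instead of a strlen(). A small sketch of the calling convention, using only names visible in this diff plus a hypothetical wrapper; note that audit_compare_dname_path() and AUDIT_NAME_FULL are kernel/audit.h internals:

#include <linux/dcache.h>	/* struct qstr, dentry->d_name */

/* Hypothetical helper: does the last component of 'path' match the dentry
 * name?  Mirrors the new qstr-based logic shown above. */
static bool demo_dname_matches(const struct dentry *dentry, const char *path)
{
	const struct qstr *dname = &dentry->d_name;

	/* AUDIT_NAME_FULL asks the helper to compute the parent length. */
	return audit_compare_dname_path(dname, path, AUDIT_NAME_FULL) == 0;
}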
context->major      = major;  	context->argv[0]    = a1;  	context->argv[1]    = a2; @@ -1697,6 +1699,7 @@ void __audit_syscall_exit(int success, long return_code)  	context->in_syscall = 0;  	context->prio = context->state == AUDIT_RECORD_CONTEXT ? ~0ULL : 0; +	audit_free_module(context);  	audit_free_names(context);  	unroll_tree_refs(context, NULL, 0);  	audit_free_aux(context); @@ -1897,8 +1900,9 @@ static inline int audit_copy_fcaps(struct audit_names *name,  }  /* Copy inode data into an audit_names. */ -void audit_copy_inode(struct audit_names *name, const struct dentry *dentry, -		      struct inode *inode, unsigned int flags) +static void audit_copy_inode(struct audit_names *name, +			     const struct dentry *dentry, +			     struct inode *inode, unsigned int flags)  {  	name->ino   = inode->i_ino;  	name->dev   = inode->i_sb->s_dev; @@ -1935,18 +1939,16 @@ void __audit_inode(struct filename *name, const struct dentry *dentry,  		return;  	rcu_read_lock(); -	if (!list_empty(list)) { -		list_for_each_entry_rcu(e, list, list) { -			for (i = 0; i < e->rule.field_count; i++) { -				struct audit_field *f = &e->rule.fields[i]; - -				if (f->type == AUDIT_FSTYPE -				    && audit_comparator(inode->i_sb->s_magic, -							f->op, f->val) -				    && e->rule.action == AUDIT_NEVER) { -					rcu_read_unlock(); -					return; -				} +	list_for_each_entry_rcu(e, list, list) { +		for (i = 0; i < e->rule.field_count; i++) { +			struct audit_field *f = &e->rule.fields[i]; + +			if (f->type == AUDIT_FSTYPE +			    && audit_comparator(inode->i_sb->s_magic, +						f->op, f->val) +			    && e->rule.action == AUDIT_NEVER) { +				rcu_read_unlock(); +				return;  			}  		}  	} @@ -2045,7 +2047,7 @@ void __audit_inode_child(struct inode *parent,  {  	struct audit_context *context = audit_context();  	struct inode *inode = d_backing_inode(dentry); -	const char *dname = dentry->d_name.name; +	const struct qstr *dname = &dentry->d_name;  	struct audit_names *n, *found_parent = NULL, *found_child = NULL;  	struct audit_entry *e;  	struct list_head *list = &audit_filter_list[AUDIT_FILTER_FS]; @@ -2055,18 +2057,16 @@ void __audit_inode_child(struct inode *parent,  		return;  	rcu_read_lock(); -	if (!list_empty(list)) { -		list_for_each_entry_rcu(e, list, list) { -			for (i = 0; i < e->rule.field_count; i++) { -				struct audit_field *f = &e->rule.fields[i]; - -				if (f->type == AUDIT_FSTYPE -				    && audit_comparator(parent->i_sb->s_magic, -							f->op, f->val) -				    && e->rule.action == AUDIT_NEVER) { -					rcu_read_unlock(); -					return; -				} +	list_for_each_entry_rcu(e, list, list) { +		for (i = 0; i < e->rule.field_count; i++) { +			struct audit_field *f = &e->rule.fields[i]; + +			if (f->type == AUDIT_FSTYPE +			    && audit_comparator(parent->i_sb->s_magic, +						f->op, f->val) +			    && e->rule.action == AUDIT_NEVER) { +				rcu_read_unlock(); +				return;  			}  		}  	} @@ -2099,7 +2099,7 @@ void __audit_inode_child(struct inode *parent,  		    (n->type != type && n->type != AUDIT_TYPE_UNKNOWN))  			continue; -		if (!strcmp(dname, n->name->name) || +		if (!strcmp(dname->name, n->name->name) ||  		    !audit_compare_dname_path(dname, n->name->name,  						found_parent ?  						
found_parent->name_len : @@ -2512,6 +2512,35 @@ void __audit_fanotify(unsigned int response)  		AUDIT_FANOTIFY,	"resp=%u", response);  } +void __audit_tk_injoffset(struct timespec64 offset) +{ +	audit_log(audit_context(), GFP_KERNEL, AUDIT_TIME_INJOFFSET, +		  "sec=%lli nsec=%li", +		  (long long)offset.tv_sec, offset.tv_nsec); +} + +static void audit_log_ntp_val(const struct audit_ntp_data *ad, +			      const char *op, enum audit_ntp_type type) +{ +	const struct audit_ntp_val *val = &ad->vals[type]; + +	if (val->newval == val->oldval) +		return; + +	audit_log(audit_context(), GFP_KERNEL, AUDIT_TIME_ADJNTPVAL, +		  "op=%s old=%lli new=%lli", op, val->oldval, val->newval); +} + +void __audit_ntp_log(const struct audit_ntp_data *ad) +{ +	audit_log_ntp_val(ad, "offset",	AUDIT_NTP_OFFSET); +	audit_log_ntp_val(ad, "freq",	AUDIT_NTP_FREQ); +	audit_log_ntp_val(ad, "status",	AUDIT_NTP_STATUS); +	audit_log_ntp_val(ad, "tai",	AUDIT_NTP_TAI); +	audit_log_ntp_val(ad, "tick",	AUDIT_NTP_TICK); +	audit_log_ntp_val(ad, "adjust",	AUDIT_NTP_ADJUST); +} +  static void audit_log_task(struct audit_buffer *ab)  {  	kuid_t auid, uid; @@ -2580,7 +2609,7 @@ void audit_seccomp(unsigned long syscall, long signr, int code)  		return;  	audit_log_task(ab);  	audit_log_format(ab, " sig=%ld arch=%x syscall=%ld compat=%d ip=0x%lx code=0x%x", -			 signr, syscall_get_arch(), syscall, +			 signr, syscall_get_arch(current), syscall,  			 in_compat_syscall(), KSTK_EIP(current), code);  	audit_log_end(ab);  } diff --git a/kernel/backtracetest.c b/kernel/backtracetest.c index 1323360d90e3..a563c8fdad0d 100644 --- a/kernel/backtracetest.c +++ b/kernel/backtracetest.c @@ -48,19 +48,14 @@ static void backtrace_test_irq(void)  #ifdef CONFIG_STACKTRACE  static void backtrace_test_saved(void)  { -	struct stack_trace trace;  	unsigned long entries[8]; +	unsigned int nr_entries;  	pr_info("Testing a saved backtrace.\n");  	pr_info("The following trace is a kernel self test and not a bug!\n"); -	trace.nr_entries = 0; -	trace.max_entries = ARRAY_SIZE(entries); -	trace.entries = entries; -	trace.skip = 0; - -	save_stack_trace(&trace); -	print_stack_trace(&trace, 0); +	nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 0); +	stack_trace_print(entries, nr_entries, 0);  }  #else  static void backtrace_test_saved(void) diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index c72e0d8e1e65..584636c9e2eb 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -22,7 +22,7 @@  #include "map_in_map.h"  #define ARRAY_CREATE_FLAG_MASK \ -	(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) +	(BPF_F_NUMA_NODE | BPF_F_ACCESS_MASK)  static void bpf_array_free_percpu(struct bpf_array *array)  { @@ -63,6 +63,7 @@ int array_map_alloc_check(union bpf_attr *attr)  	if (attr->max_entries == 0 || attr->key_size != 4 ||  	    attr->value_size == 0 ||  	    attr->map_flags & ~ARRAY_CREATE_FLAG_MASK || +	    !bpf_map_flags_access_ok(attr->map_flags) ||  	    (percpu && numa_node != NUMA_NO_NODE))  		return -EINVAL; @@ -160,6 +161,36 @@ static void *array_map_lookup_elem(struct bpf_map *map, void *key)  	return array->value + array->elem_size * (index & array->index_mask);  } +static int array_map_direct_value_addr(const struct bpf_map *map, u64 *imm, +				       u32 off) +{ +	struct bpf_array *array = container_of(map, struct bpf_array, map); + +	if (map->max_entries != 1) +		return -ENOTSUPP; +	if (off >= map->value_size) +		return -EINVAL; + +	*imm = (unsigned long)array->value; +	return 0; +} + +static int 
array_map_direct_value_meta(const struct bpf_map *map, u64 imm, +				       u32 *off) +{ +	struct bpf_array *array = container_of(map, struct bpf_array, map); +	u64 base = (unsigned long)array->value; +	u64 range = array->elem_size; + +	if (map->max_entries != 1) +		return -ENOTSUPP; +	if (imm < base || imm >= base + range) +		return -ENOENT; + +	*off = imm - base; +	return 0; +} +  /* emit BPF instructions equivalent to C code of array_map_lookup_elem() */  static u32 array_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)  { @@ -360,7 +391,8 @@ static void array_map_seq_show_elem(struct bpf_map *map, void *key,  		return;  	} -	seq_printf(m, "%u: ", *(u32 *)key); +	if (map->btf_key_type_id) +		seq_printf(m, "%u: ", *(u32 *)key);  	btf_type_seq_show(map->btf, map->btf_value_type_id, value, m);  	seq_puts(m, "\n"); @@ -397,6 +429,18 @@ static int array_map_check_btf(const struct bpf_map *map,  {  	u32 int_data; +	/* One exception for keyless BTF: .bss/.data/.rodata map */ +	if (btf_type_is_void(key_type)) { +		if (map->map_type != BPF_MAP_TYPE_ARRAY || +		    map->max_entries != 1) +			return -EINVAL; + +		if (BTF_INFO_KIND(value_type->info) != BTF_KIND_DATASEC) +			return -EINVAL; + +		return 0; +	} +  	if (BTF_INFO_KIND(key_type->info) != BTF_KIND_INT)  		return -EINVAL; @@ -419,6 +463,8 @@ const struct bpf_map_ops array_map_ops = {  	.map_update_elem = array_map_update_elem,  	.map_delete_elem = array_map_delete_elem,  	.map_gen_lookup = array_map_gen_lookup, +	.map_direct_value_addr = array_map_direct_value_addr, +	.map_direct_value_meta = array_map_direct_value_meta,  	.map_seq_show_elem = array_map_seq_show_elem,  	.map_check_btf = array_map_check_btf,  }; @@ -440,6 +486,9 @@ static int fd_array_map_alloc_check(union bpf_attr *attr)  	/* only file descriptors can be stored in this type of map */  	if (attr->value_size != sizeof(u32))  		return -EINVAL; +	/* Program read-only/write-only not supported for special maps yet. */ +	if (attr->map_flags & (BPF_F_RDONLY_PROG | BPF_F_WRONLY_PROG)) +		return -EINVAL;  	return array_map_alloc_check(attr);  } diff --git a/kernel/bpf/bpf_lru_list.c b/kernel/bpf/bpf_lru_list.c index e6ef4401a138..1b6b9349cb85 100644 --- a/kernel/bpf/bpf_lru_list.c +++ b/kernel/bpf/bpf_lru_list.c @@ -1,8 +1,5 @@ +// SPDX-License-Identifier: GPL-2.0-only  /* Copyright (c) 2016 Facebook - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation.   */  #include <linux/cpumask.h>  #include <linux/spinlock.h> diff --git a/kernel/bpf/bpf_lru_list.h b/kernel/bpf/bpf_lru_list.h index 7d4f89b7cb84..f02504640e18 100644 --- a/kernel/bpf/bpf_lru_list.h +++ b/kernel/bpf/bpf_lru_list.h @@ -1,8 +1,5 @@ +/* SPDX-License-Identifier: GPL-2.0-only */  /* Copyright (c) 2016 Facebook - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation.   
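array_map_direct_value_addr()/..._meta() above give the verifier a way to turn a single-entry array map (the backing store for keyless .bss/.data/.rodata BTF maps, per the array_map_check_btf() exception) into a direct pointer. A hedged sketch of how such a callback could be consumed when rewriting a BPF_PSEUDO_MAP_VALUE ld_imm64; the surrounding verifier plumbing is not part of this hunk, so the function and variable names are illustrative:

#include <linux/bpf.h>

/* insn[0].src_reg == BPF_PSEUDO_MAP_VALUE: insn[0].imm carried the map fd
 * (assumed already resolved to 'map' here), insn[1].imm the offset into
 * the value.  The callback rejects multi-entry arrays (-ENOTSUPP) and
 * out-of-range offsets (-EINVAL), as shown above. */
static int demo_resolve_map_value(struct bpf_map *map, struct bpf_insn *insn)
{
	u32 off = insn[1].imm;
	u64 addr;
	int err;

	if (!map->ops->map_direct_value_addr)
		return -EINVAL;

	err = map->ops->map_direct_value_addr(map, &addr, off);
	if (err)
		return err;

	addr += off;
	insn[0].imm = (u32)addr;	/* low 32 bits of the constant  */
	insn[1].imm = addr >> 32;	/* high 32 bits of the constant */
	return 0;
}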
*/  #ifndef __BPF_LRU_LIST_H_  #define __BPF_LRU_LIST_H_ diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index bd3921b1514b..cad09858a5f2 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -185,6 +185,16 @@  	     i < btf_type_vlen(struct_type);				\  	     i++, member++) +#define for_each_vsi(i, struct_type, member)			\ +	for (i = 0, member = btf_type_var_secinfo(struct_type);	\ +	     i < btf_type_vlen(struct_type);			\ +	     i++, member++) + +#define for_each_vsi_from(i, from, struct_type, member)				\ +	for (i = from, member = btf_type_var_secinfo(struct_type) + from;	\ +	     i < btf_type_vlen(struct_type);					\ +	     i++, member++) +  static DEFINE_IDR(btf_idr);  static DEFINE_SPINLOCK(btf_idr_lock); @@ -262,6 +272,8 @@ static const char * const btf_kind_str[NR_BTF_KINDS] = {  	[BTF_KIND_RESTRICT]	= "RESTRICT",  	[BTF_KIND_FUNC]		= "FUNC",  	[BTF_KIND_FUNC_PROTO]	= "FUNC_PROTO", +	[BTF_KIND_VAR]		= "VAR", +	[BTF_KIND_DATASEC]	= "DATASEC",  };  struct btf_kind_operations { @@ -314,7 +326,7 @@ static bool btf_type_is_modifier(const struct btf_type *t)  	return false;  } -static bool btf_type_is_void(const struct btf_type *t) +bool btf_type_is_void(const struct btf_type *t)  {  	return t == &btf_void;  } @@ -375,13 +387,36 @@ static bool btf_type_is_int(const struct btf_type *t)  	return BTF_INFO_KIND(t->info) == BTF_KIND_INT;  } +static bool btf_type_is_var(const struct btf_type *t) +{ +	return BTF_INFO_KIND(t->info) == BTF_KIND_VAR; +} + +static bool btf_type_is_datasec(const struct btf_type *t) +{ +	return BTF_INFO_KIND(t->info) == BTF_KIND_DATASEC; +} + +/* Types that act only as a source, not sink or intermediate + * type when resolving. + */ +static bool btf_type_is_resolve_source_only(const struct btf_type *t) +{ +	return btf_type_is_var(t) || +	       btf_type_is_datasec(t); +} +  /* What types need to be resolved?   *   * btf_type_is_modifier() is an obvious one.   *   * btf_type_is_struct() because its member refers to   * another type (through member->type). - + * + * btf_type_is_var() because the variable refers to + * another type. btf_type_is_datasec() holds multiple + * btf_type_is_var() types that need resolving. + *   * btf_type_is_array() because its element (array->type)   * refers to another type.  
Array can be thought of a   * special case of struct while array just has the same @@ -390,9 +425,11 @@ static bool btf_type_is_int(const struct btf_type *t)  static bool btf_type_needs_resolve(const struct btf_type *t)  {  	return btf_type_is_modifier(t) || -		btf_type_is_ptr(t) || -		btf_type_is_struct(t) || -		btf_type_is_array(t); +	       btf_type_is_ptr(t) || +	       btf_type_is_struct(t) || +	       btf_type_is_array(t) || +	       btf_type_is_var(t) || +	       btf_type_is_datasec(t);  }  /* t->size can be used */ @@ -403,6 +440,7 @@ static bool btf_type_has_size(const struct btf_type *t)  	case BTF_KIND_STRUCT:  	case BTF_KIND_UNION:  	case BTF_KIND_ENUM: +	case BTF_KIND_DATASEC:  		return true;  	} @@ -467,6 +505,16 @@ static const struct btf_enum *btf_type_enum(const struct btf_type *t)  	return (const struct btf_enum *)(t + 1);  } +static const struct btf_var *btf_type_var(const struct btf_type *t) +{ +	return (const struct btf_var *)(t + 1); +} + +static const struct btf_var_secinfo *btf_type_var_secinfo(const struct btf_type *t) +{ +	return (const struct btf_var_secinfo *)(t + 1); +} +  static const struct btf_kind_operations *btf_type_ops(const struct btf_type *t)  {  	return kind_ops[BTF_INFO_KIND(t->info)]; @@ -478,23 +526,31 @@ static bool btf_name_offset_valid(const struct btf *btf, u32 offset)  		offset < btf->hdr.str_len;  } -/* Only C-style identifier is permitted. This can be relaxed if - * necessary. - */ -static bool btf_name_valid_identifier(const struct btf *btf, u32 offset) +static bool __btf_name_char_ok(char c, bool first, bool dot_ok) +{ +	if ((first ? !isalpha(c) : +		     !isalnum(c)) && +	    c != '_' && +	    ((c == '.' && !dot_ok) || +	      c != '.')) +		return false; +	return true; +} + +static bool __btf_name_valid(const struct btf *btf, u32 offset, bool dot_ok)  {  	/* offset must be valid */  	const char *src = &btf->strings[offset];  	const char *src_limit; -	if (!isalpha(*src) && *src != '_') +	if (!__btf_name_char_ok(*src, true, dot_ok))  		return false;  	/* set a limit on identifier length */  	src_limit = src + KSYM_NAME_LEN;  	src++;  	while (*src && src < src_limit) { -		if (!isalnum(*src) && *src != '_') +		if (!__btf_name_char_ok(*src, false, dot_ok))  			return false;  		src++;  	} @@ -502,6 +558,19 @@ static bool btf_name_valid_identifier(const struct btf *btf, u32 offset)  	return !*src;  } +/* Only C-style identifier is permitted. This can be relaxed if + * necessary. + */ +static bool btf_name_valid_identifier(const struct btf *btf, u32 offset) +{ +	return __btf_name_valid(btf, offset, false); +} + +static bool btf_name_valid_section(const struct btf *btf, u32 offset) +{ +	return __btf_name_valid(btf, offset, true); +} +  static const char *__btf_name_by_offset(const struct btf *btf, u32 offset)  {  	if (!offset) @@ -697,6 +766,32 @@ static void btf_verifier_log_member(struct btf_verifier_env *env,  	__btf_verifier_log(log, "\n");  } +__printf(4, 5) +static void btf_verifier_log_vsi(struct btf_verifier_env *env, +				 const struct btf_type *datasec_type, +				 const struct btf_var_secinfo *vsi, +				 const char *fmt, ...) 
+{ +	struct bpf_verifier_log *log = &env->log; +	va_list args; + +	if (!bpf_verifier_log_needed(log)) +		return; +	if (env->phase != CHECK_META) +		btf_verifier_log_type(env, datasec_type, NULL); + +	__btf_verifier_log(log, "\t type_id=%u offset=%u size=%u", +			   vsi->type, vsi->offset, vsi->size); +	if (fmt && *fmt) { +		__btf_verifier_log(log, " "); +		va_start(args, fmt); +		bpf_verifier_vlog(log, fmt, args); +		va_end(args); +	} + +	__btf_verifier_log(log, "\n"); +} +  static void btf_verifier_log_hdr(struct btf_verifier_env *env,  				 u32 btf_data_size)  { @@ -974,7 +1069,8 @@ const struct btf_type *btf_type_id_size(const struct btf *btf,  	} else if (btf_type_is_ptr(size_type)) {  		size = sizeof(void *);  	} else { -		if (WARN_ON_ONCE(!btf_type_is_modifier(size_type))) +		if (WARN_ON_ONCE(!btf_type_is_modifier(size_type) && +				 !btf_type_is_var(size_type)))  			return NULL;  		size = btf->resolved_sizes[size_type_id]; @@ -1509,7 +1605,7 @@ static int btf_modifier_resolve(struct btf_verifier_env *env,  	u32 next_type_size = 0;  	next_type = btf_type_by_id(btf, next_type_id); -	if (!next_type) { +	if (!next_type || btf_type_is_resolve_source_only(next_type)) {  		btf_verifier_log_type(env, v->t, "Invalid type_id");  		return -EINVAL;  	} @@ -1542,6 +1638,53 @@ static int btf_modifier_resolve(struct btf_verifier_env *env,  	return 0;  } +static int btf_var_resolve(struct btf_verifier_env *env, +			   const struct resolve_vertex *v) +{ +	const struct btf_type *next_type; +	const struct btf_type *t = v->t; +	u32 next_type_id = t->type; +	struct btf *btf = env->btf; +	u32 next_type_size; + +	next_type = btf_type_by_id(btf, next_type_id); +	if (!next_type || btf_type_is_resolve_source_only(next_type)) { +		btf_verifier_log_type(env, v->t, "Invalid type_id"); +		return -EINVAL; +	} + +	if (!env_type_is_resolve_sink(env, next_type) && +	    !env_type_is_resolved(env, next_type_id)) +		return env_stack_push(env, next_type, next_type_id); + +	if (btf_type_is_modifier(next_type)) { +		const struct btf_type *resolved_type; +		u32 resolved_type_id; + +		resolved_type_id = next_type_id; +		resolved_type = btf_type_id_resolve(btf, &resolved_type_id); + +		if (btf_type_is_ptr(resolved_type) && +		    !env_type_is_resolve_sink(env, resolved_type) && +		    !env_type_is_resolved(env, resolved_type_id)) +			return env_stack_push(env, resolved_type, +					      resolved_type_id); +	} + +	/* We must resolve to something concrete at this point, no +	 * forward types or similar that would resolve to size of +	 * zero is allowed. 
+	 */ +	if (!btf_type_id_size(btf, &next_type_id, &next_type_size)) { +		btf_verifier_log_type(env, v->t, "Invalid type_id"); +		return -EINVAL; +	} + +	env_stack_pop_resolved(env, next_type_id, next_type_size); + +	return 0; +} +  static int btf_ptr_resolve(struct btf_verifier_env *env,  			   const struct resolve_vertex *v)  { @@ -1551,7 +1694,7 @@ static int btf_ptr_resolve(struct btf_verifier_env *env,  	struct btf *btf = env->btf;  	next_type = btf_type_by_id(btf, next_type_id); -	if (!next_type) { +	if (!next_type || btf_type_is_resolve_source_only(next_type)) {  		btf_verifier_log_type(env, v->t, "Invalid type_id");  		return -EINVAL;  	} @@ -1609,6 +1752,15 @@ static void btf_modifier_seq_show(const struct btf *btf,  	btf_type_ops(t)->seq_show(btf, t, type_id, data, bits_offset, m);  } +static void btf_var_seq_show(const struct btf *btf, const struct btf_type *t, +			     u32 type_id, void *data, u8 bits_offset, +			     struct seq_file *m) +{ +	t = btf_type_id_resolve(btf, &type_id); + +	btf_type_ops(t)->seq_show(btf, t, type_id, data, bits_offset, m); +} +  static void btf_ptr_seq_show(const struct btf *btf, const struct btf_type *t,  			     u32 type_id, void *data, u8 bits_offset,  			     struct seq_file *m) @@ -1776,7 +1928,8 @@ static int btf_array_resolve(struct btf_verifier_env *env,  	/* Check array->index_type */  	index_type_id = array->index_type;  	index_type = btf_type_by_id(btf, index_type_id); -	if (btf_type_nosize_or_null(index_type)) { +	if (btf_type_is_resolve_source_only(index_type) || +	    btf_type_nosize_or_null(index_type)) {  		btf_verifier_log_type(env, v->t, "Invalid index");  		return -EINVAL;  	} @@ -1795,7 +1948,8 @@ static int btf_array_resolve(struct btf_verifier_env *env,  	/* Check array->type */  	elem_type_id = array->type;  	elem_type = btf_type_by_id(btf, elem_type_id); -	if (btf_type_nosize_or_null(elem_type)) { +	if (btf_type_is_resolve_source_only(elem_type) || +	    btf_type_nosize_or_null(elem_type)) {  		btf_verifier_log_type(env, v->t,  				      "Invalid elem");  		return -EINVAL; @@ -2016,7 +2170,8 @@ static int btf_struct_resolve(struct btf_verifier_env *env,  		const struct btf_type *member_type = btf_type_by_id(env->btf,  								member_type_id); -		if (btf_type_nosize_or_null(member_type)) { +		if (btf_type_is_resolve_source_only(member_type) || +		    btf_type_nosize_or_null(member_type)) {  			btf_verifier_log_member(env, v->t, member,  						"Invalid member");  			return -EINVAL; @@ -2411,6 +2566,222 @@ static struct btf_kind_operations func_ops = {  	.seq_show = btf_df_seq_show,  }; +static s32 btf_var_check_meta(struct btf_verifier_env *env, +			      const struct btf_type *t, +			      u32 meta_left) +{ +	const struct btf_var *var; +	u32 meta_needed = sizeof(*var); + +	if (meta_left < meta_needed) { +		btf_verifier_log_basic(env, t, +				       "meta_left:%u meta_needed:%u", +				       meta_left, meta_needed); +		return -EINVAL; +	} + +	if (btf_type_vlen(t)) { +		btf_verifier_log_type(env, t, "vlen != 0"); +		return -EINVAL; +	} + +	if (btf_type_kflag(t)) { +		btf_verifier_log_type(env, t, "Invalid btf_info kind_flag"); +		return -EINVAL; +	} + +	if (!t->name_off || +	    !__btf_name_valid(env->btf, t->name_off, true)) { +		btf_verifier_log_type(env, t, "Invalid name"); +		return -EINVAL; +	} + +	/* A var cannot be in type void */ +	if (!t->type || !BTF_TYPE_ID_VALID(t->type)) { +		btf_verifier_log_type(env, t, "Invalid type_id"); +		return -EINVAL; +	} + +	var = btf_type_var(t); +	if (var->linkage != BTF_VAR_STATIC && +	   
 var->linkage != BTF_VAR_GLOBAL_ALLOCATED) { +		btf_verifier_log_type(env, t, "Linkage not supported"); +		return -EINVAL; +	} + +	btf_verifier_log_type(env, t, NULL); + +	return meta_needed; +} + +static void btf_var_log(struct btf_verifier_env *env, const struct btf_type *t) +{ +	const struct btf_var *var = btf_type_var(t); + +	btf_verifier_log(env, "type_id=%u linkage=%u", t->type, var->linkage); +} + +static const struct btf_kind_operations var_ops = { +	.check_meta		= btf_var_check_meta, +	.resolve		= btf_var_resolve, +	.check_member		= btf_df_check_member, +	.check_kflag_member	= btf_df_check_kflag_member, +	.log_details		= btf_var_log, +	.seq_show		= btf_var_seq_show, +}; + +static s32 btf_datasec_check_meta(struct btf_verifier_env *env, +				  const struct btf_type *t, +				  u32 meta_left) +{ +	const struct btf_var_secinfo *vsi; +	u64 last_vsi_end_off = 0, sum = 0; +	u32 i, meta_needed; + +	meta_needed = btf_type_vlen(t) * sizeof(*vsi); +	if (meta_left < meta_needed) { +		btf_verifier_log_basic(env, t, +				       "meta_left:%u meta_needed:%u", +				       meta_left, meta_needed); +		return -EINVAL; +	} + +	if (!btf_type_vlen(t)) { +		btf_verifier_log_type(env, t, "vlen == 0"); +		return -EINVAL; +	} + +	if (!t->size) { +		btf_verifier_log_type(env, t, "size == 0"); +		return -EINVAL; +	} + +	if (btf_type_kflag(t)) { +		btf_verifier_log_type(env, t, "Invalid btf_info kind_flag"); +		return -EINVAL; +	} + +	if (!t->name_off || +	    !btf_name_valid_section(env->btf, t->name_off)) { +		btf_verifier_log_type(env, t, "Invalid name"); +		return -EINVAL; +	} + +	btf_verifier_log_type(env, t, NULL); + +	for_each_vsi(i, t, vsi) { +		/* A var cannot be in type void */ +		if (!vsi->type || !BTF_TYPE_ID_VALID(vsi->type)) { +			btf_verifier_log_vsi(env, t, vsi, +					     "Invalid type_id"); +			return -EINVAL; +		} + +		if (vsi->offset < last_vsi_end_off || vsi->offset >= t->size) { +			btf_verifier_log_vsi(env, t, vsi, +					     "Invalid offset"); +			return -EINVAL; +		} + +		if (!vsi->size || vsi->size > t->size) { +			btf_verifier_log_vsi(env, t, vsi, +					     "Invalid size"); +			return -EINVAL; +		} + +		last_vsi_end_off = vsi->offset + vsi->size; +		if (last_vsi_end_off > t->size) { +			btf_verifier_log_vsi(env, t, vsi, +					     "Invalid offset+size"); +			return -EINVAL; +		} + +		btf_verifier_log_vsi(env, t, vsi, NULL); +		sum += vsi->size; +	} + +	if (t->size < sum) { +		btf_verifier_log_type(env, t, "Invalid btf_info size"); +		return -EINVAL; +	} + +	return meta_needed; +} + +static int btf_datasec_resolve(struct btf_verifier_env *env, +			       const struct resolve_vertex *v) +{ +	const struct btf_var_secinfo *vsi; +	struct btf *btf = env->btf; +	u16 i; + +	for_each_vsi_from(i, v->next_member, v->t, vsi) { +		u32 var_type_id = vsi->type, type_id, type_size = 0; +		const struct btf_type *var_type = btf_type_by_id(env->btf, +								 var_type_id); +		if (!var_type || !btf_type_is_var(var_type)) { +			btf_verifier_log_vsi(env, v->t, vsi, +					     "Not a VAR kind member"); +			return -EINVAL; +		} + +		if (!env_type_is_resolve_sink(env, var_type) && +		    !env_type_is_resolved(env, var_type_id)) { +			env_stack_set_next_member(env, i + 1); +			return env_stack_push(env, var_type, var_type_id); +		} + +		type_id = var_type->type; +		if (!btf_type_id_size(btf, &type_id, &type_size)) { +			btf_verifier_log_vsi(env, v->t, vsi, "Invalid type"); +			return -EINVAL; +		} + +		if (vsi->size < type_size) { +			btf_verifier_log_vsi(env, v->t, vsi, "Invalid size"); +			return -EINVAL; 
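For reference against btf_var_check_meta() and btf_datasec_check_meta() above, this is roughly what an accepted encoding looks like: a DATASEC ".bss" holding one global int variable. The INFO() macro and the string-table offsets are local to this sketch; the structs and BTF_* constants come from the BTF UAPI header.

#include <linux/btf.h>	/* struct btf_type, btf_var, btf_var_secinfo */

#define INFO(kind, vlen)	(((kind) << 24) | ((vlen) & 0xffff))
#define NAME_G_CNT		1	/* made-up string-table offsets */
#define NAME_BSS		7

/* [2] VAR 'g_cnt', pointing at type id 1 (assumed to be a 4-byte int);
 *     vlen must be 0 and linkage static or global-allocated. */
static const struct {
	struct btf_type t;
	struct btf_var  v;
} demo_var = {
	.t = { .name_off = NAME_G_CNT, .info = INFO(BTF_KIND_VAR, 0), .type = 1 },
	.v = { .linkage = BTF_VAR_GLOBAL_ALLOCATED },
};

/* [3] DATASEC '.bss', size 4, one member: the VAR above at offset 0.
 *     vlen > 0, size > 0, members ordered by offset, each vsi->size
 *     covering the variable's size -- the conditions checked above. */
static const struct {
	struct btf_type        t;
	struct btf_var_secinfo vsi[1];
} demo_datasec = {
	.t   = { .name_off = NAME_BSS, .info = INFO(BTF_KIND_DATASEC, 1), .size = 4 },
	.vsi = { { .type = 2, .offset = 0, .size = 4 } },
};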
+		} +	} + +	env_stack_pop_resolved(env, 0, 0); +	return 0; +} + +static void btf_datasec_log(struct btf_verifier_env *env, +			    const struct btf_type *t) +{ +	btf_verifier_log(env, "size=%u vlen=%u", t->size, btf_type_vlen(t)); +} + +static void btf_datasec_seq_show(const struct btf *btf, +				 const struct btf_type *t, u32 type_id, +				 void *data, u8 bits_offset, +				 struct seq_file *m) +{ +	const struct btf_var_secinfo *vsi; +	const struct btf_type *var; +	u32 i; + +	seq_printf(m, "section (\"%s\") = {", __btf_name_by_offset(btf, t->name_off)); +	for_each_vsi(i, t, vsi) { +		var = btf_type_by_id(btf, vsi->type); +		if (i) +			seq_puts(m, ","); +		btf_type_ops(var)->seq_show(btf, var, vsi->type, +					    data + vsi->offset, bits_offset, m); +	} +	seq_puts(m, "}"); +} + +static const struct btf_kind_operations datasec_ops = { +	.check_meta		= btf_datasec_check_meta, +	.resolve		= btf_datasec_resolve, +	.check_member		= btf_df_check_member, +	.check_kflag_member	= btf_df_check_kflag_member, +	.log_details		= btf_datasec_log, +	.seq_show		= btf_datasec_seq_show, +}; +  static int btf_func_proto_check(struct btf_verifier_env *env,  				const struct btf_type *t)  { @@ -2542,6 +2913,8 @@ static const struct btf_kind_operations * const kind_ops[NR_BTF_KINDS] = {  	[BTF_KIND_RESTRICT] = &modifier_ops,  	[BTF_KIND_FUNC] = &func_ops,  	[BTF_KIND_FUNC_PROTO] = &func_proto_ops, +	[BTF_KIND_VAR] = &var_ops, +	[BTF_KIND_DATASEC] = &datasec_ops,  };  static s32 btf_check_meta(struct btf_verifier_env *env, @@ -2622,13 +2995,17 @@ static bool btf_resolve_valid(struct btf_verifier_env *env,  	if (!env_type_is_resolved(env, type_id))  		return false; -	if (btf_type_is_struct(t)) +	if (btf_type_is_struct(t) || btf_type_is_datasec(t))  		return !btf->resolved_ids[type_id] && -			!btf->resolved_sizes[type_id]; +		       !btf->resolved_sizes[type_id]; -	if (btf_type_is_modifier(t) || btf_type_is_ptr(t)) { +	if (btf_type_is_modifier(t) || btf_type_is_ptr(t) || +	    btf_type_is_var(t)) {  		t = btf_type_id_resolve(btf, &type_id); -		return t && !btf_type_is_modifier(t); +		return t && +		       !btf_type_is_modifier(t) && +		       !btf_type_is_var(t) && +		       !btf_type_is_datasec(t);  	}  	if (btf_type_is_array(t)) { diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 4e807973aa80..fcde0f7b2585 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -11,7 +11,10 @@  #include <linux/kernel.h>  #include <linux/atomic.h>  #include <linux/cgroup.h> +#include <linux/filter.h>  #include <linux/slab.h> +#include <linux/sysctl.h> +#include <linux/string.h>  #include <linux/bpf.h>  #include <linux/bpf-cgroup.h>  #include <net/sock.h> @@ -701,7 +704,7 @@ int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor,  EXPORT_SYMBOL(__cgroup_bpf_check_dev_permission);  static const struct bpf_func_proto * -cgroup_dev_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +cgroup_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)  {  	switch (func_id) {  	case BPF_FUNC_map_lookup_elem: @@ -710,6 +713,12 @@ cgroup_dev_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)  		return &bpf_map_update_elem_proto;  	case BPF_FUNC_map_delete_elem:  		return &bpf_map_delete_elem_proto; +	case BPF_FUNC_map_push_elem: +		return &bpf_map_push_elem_proto; +	case BPF_FUNC_map_pop_elem: +		return &bpf_map_pop_elem_proto; +	case BPF_FUNC_map_peek_elem: +		return &bpf_map_peek_elem_proto;  	case BPF_FUNC_get_current_uid_gid:  		return 
&bpf_get_current_uid_gid_proto;  	case BPF_FUNC_get_local_storage: @@ -725,6 +734,12 @@ cgroup_dev_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)  	}  } +static const struct bpf_func_proto * +cgroup_dev_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +{ +	return cgroup_base_func_proto(func_id, prog); +} +  static bool cgroup_dev_is_valid_access(int off, int size,  				       enum bpf_access_type type,  				       const struct bpf_prog *prog, @@ -762,3 +777,356 @@ const struct bpf_verifier_ops cg_dev_verifier_ops = {  	.get_func_proto		= cgroup_dev_func_proto,  	.is_valid_access	= cgroup_dev_is_valid_access,  }; + +/** + * __cgroup_bpf_run_filter_sysctl - Run a program on sysctl + * + * @head: sysctl table header + * @table: sysctl table + * @write: sysctl is being read (= 0) or written (= 1) + * @buf: pointer to buffer passed by user space + * @pcount: value-result argument: value is size of buffer pointed to by @buf, + *	result is size of @new_buf if program set new value, initial value + *	otherwise + * @ppos: value-result argument: value is position at which read from or write + *	to sysctl is happening, result is new position if program overrode it, + *	initial value otherwise + * @new_buf: pointer to pointer to new buffer that will be allocated if program + *	overrides new value provided by user space on sysctl write + *	NOTE: it's caller responsibility to free *new_buf if it was set + * @type: type of program to be executed + * + * Program is run when sysctl is being accessed, either read or written, and + * can allow or deny such access. + * + * This function will return %-EPERM if an attached program is found and + * returned value != 1 during execution. In all other cases 0 is returned. + */ +int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head, +				   struct ctl_table *table, int write, +				   void __user *buf, size_t *pcount, +				   loff_t *ppos, void **new_buf, +				   enum bpf_attach_type type) +{ +	struct bpf_sysctl_kern ctx = { +		.head = head, +		.table = table, +		.write = write, +		.ppos = ppos, +		.cur_val = NULL, +		.cur_len = PAGE_SIZE, +		.new_val = NULL, +		.new_len = 0, +		.new_updated = 0, +	}; +	struct cgroup *cgrp; +	int ret; + +	ctx.cur_val = kmalloc_track_caller(ctx.cur_len, GFP_KERNEL); +	if (ctx.cur_val) { +		mm_segment_t old_fs; +		loff_t pos = 0; + +		old_fs = get_fs(); +		set_fs(KERNEL_DS); +		if (table->proc_handler(table, 0, (void __user *)ctx.cur_val, +					&ctx.cur_len, &pos)) { +			/* Let BPF program decide how to proceed. */ +			ctx.cur_len = 0; +		} +		set_fs(old_fs); +	} else { +		/* Let BPF program decide how to proceed. */ +		ctx.cur_len = 0; +	} + +	if (write && buf && *pcount) { +		/* BPF program should be able to override new value with a +		 * buffer bigger than provided by user. +		 */ +		ctx.new_val = kmalloc_track_caller(PAGE_SIZE, GFP_KERNEL); +		ctx.new_len = min_t(size_t, PAGE_SIZE, *pcount); +		if (!ctx.new_val || +		    copy_from_user(ctx.new_val, buf, ctx.new_len)) +			/* Let BPF program decide how to proceed. */ +			ctx.new_len = 0; +	} + +	rcu_read_lock(); +	cgrp = task_dfl_cgroup(current); +	ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], &ctx, BPF_PROG_RUN); +	rcu_read_unlock(); + +	kfree(ctx.cur_val); + +	if (ret == 1 && ctx.new_updated) { +		*new_buf = ctx.new_val; +		*pcount = ctx.new_len; +	} else { +		kfree(ctx.new_val); +	} + +	return ret == 1 ? 
0 : -EPERM; +} +EXPORT_SYMBOL(__cgroup_bpf_run_filter_sysctl); + +static ssize_t sysctl_cpy_dir(const struct ctl_dir *dir, char **bufp, +			      size_t *lenp) +{ +	ssize_t tmp_ret = 0, ret; + +	if (dir->header.parent) { +		tmp_ret = sysctl_cpy_dir(dir->header.parent, bufp, lenp); +		if (tmp_ret < 0) +			return tmp_ret; +	} + +	ret = strscpy(*bufp, dir->header.ctl_table[0].procname, *lenp); +	if (ret < 0) +		return ret; +	*bufp += ret; +	*lenp -= ret; +	ret += tmp_ret; + +	/* Avoid leading slash. */ +	if (!ret) +		return ret; + +	tmp_ret = strscpy(*bufp, "/", *lenp); +	if (tmp_ret < 0) +		return tmp_ret; +	*bufp += tmp_ret; +	*lenp -= tmp_ret; + +	return ret + tmp_ret; +} + +BPF_CALL_4(bpf_sysctl_get_name, struct bpf_sysctl_kern *, ctx, char *, buf, +	   size_t, buf_len, u64, flags) +{ +	ssize_t tmp_ret = 0, ret; + +	if (!buf) +		return -EINVAL; + +	if (!(flags & BPF_F_SYSCTL_BASE_NAME)) { +		if (!ctx->head) +			return -EINVAL; +		tmp_ret = sysctl_cpy_dir(ctx->head->parent, &buf, &buf_len); +		if (tmp_ret < 0) +			return tmp_ret; +	} + +	ret = strscpy(buf, ctx->table->procname, buf_len); + +	return ret < 0 ? ret : tmp_ret + ret; +} + +static const struct bpf_func_proto bpf_sysctl_get_name_proto = { +	.func		= bpf_sysctl_get_name, +	.gpl_only	= false, +	.ret_type	= RET_INTEGER, +	.arg1_type	= ARG_PTR_TO_CTX, +	.arg2_type	= ARG_PTR_TO_MEM, +	.arg3_type	= ARG_CONST_SIZE, +	.arg4_type	= ARG_ANYTHING, +}; + +static int copy_sysctl_value(char *dst, size_t dst_len, char *src, +			     size_t src_len) +{ +	if (!dst) +		return -EINVAL; + +	if (!dst_len) +		return -E2BIG; + +	if (!src || !src_len) { +		memset(dst, 0, dst_len); +		return -EINVAL; +	} + +	memcpy(dst, src, min(dst_len, src_len)); + +	if (dst_len > src_len) { +		memset(dst + src_len, '\0', dst_len - src_len); +		return src_len; +	} + +	dst[dst_len - 1] = '\0'; + +	return -E2BIG; +} + +BPF_CALL_3(bpf_sysctl_get_current_value, struct bpf_sysctl_kern *, ctx, +	   char *, buf, size_t, buf_len) +{ +	return copy_sysctl_value(buf, buf_len, ctx->cur_val, ctx->cur_len); +} + +static const struct bpf_func_proto bpf_sysctl_get_current_value_proto = { +	.func		= bpf_sysctl_get_current_value, +	.gpl_only	= false, +	.ret_type	= RET_INTEGER, +	.arg1_type	= ARG_PTR_TO_CTX, +	.arg2_type	= ARG_PTR_TO_UNINIT_MEM, +	.arg3_type	= ARG_CONST_SIZE, +}; + +BPF_CALL_3(bpf_sysctl_get_new_value, struct bpf_sysctl_kern *, ctx, char *, buf, +	   size_t, buf_len) +{ +	if (!ctx->write) { +		if (buf && buf_len) +			memset(buf, '\0', buf_len); +		return -EINVAL; +	} +	return copy_sysctl_value(buf, buf_len, ctx->new_val, ctx->new_len); +} + +static const struct bpf_func_proto bpf_sysctl_get_new_value_proto = { +	.func		= bpf_sysctl_get_new_value, +	.gpl_only	= false, +	.ret_type	= RET_INTEGER, +	.arg1_type	= ARG_PTR_TO_CTX, +	.arg2_type	= ARG_PTR_TO_UNINIT_MEM, +	.arg3_type	= ARG_CONST_SIZE, +}; + +BPF_CALL_3(bpf_sysctl_set_new_value, struct bpf_sysctl_kern *, ctx, +	   const char *, buf, size_t, buf_len) +{ +	if (!ctx->write || !ctx->new_val || !ctx->new_len || !buf || !buf_len) +		return -EINVAL; + +	if (buf_len > PAGE_SIZE - 1) +		return -E2BIG; + +	memcpy(ctx->new_val, buf, buf_len); +	ctx->new_len = buf_len; +	ctx->new_updated = 1; + +	return 0; +} + +static const struct bpf_func_proto bpf_sysctl_set_new_value_proto = { +	.func		= bpf_sysctl_set_new_value, +	.gpl_only	= false, +	.ret_type	= RET_INTEGER, +	.arg1_type	= ARG_PTR_TO_CTX, +	.arg2_type	= ARG_PTR_TO_MEM, +	.arg3_type	= ARG_CONST_SIZE, +}; + +static const struct bpf_func_proto * +sysctl_func_proto(enum 
bpf_func_id func_id, const struct bpf_prog *prog) +{ +	switch (func_id) { +	case BPF_FUNC_strtol: +		return &bpf_strtol_proto; +	case BPF_FUNC_strtoul: +		return &bpf_strtoul_proto; +	case BPF_FUNC_sysctl_get_name: +		return &bpf_sysctl_get_name_proto; +	case BPF_FUNC_sysctl_get_current_value: +		return &bpf_sysctl_get_current_value_proto; +	case BPF_FUNC_sysctl_get_new_value: +		return &bpf_sysctl_get_new_value_proto; +	case BPF_FUNC_sysctl_set_new_value: +		return &bpf_sysctl_set_new_value_proto; +	default: +		return cgroup_base_func_proto(func_id, prog); +	} +} + +static bool sysctl_is_valid_access(int off, int size, enum bpf_access_type type, +				   const struct bpf_prog *prog, +				   struct bpf_insn_access_aux *info) +{ +	const int size_default = sizeof(__u32); + +	if (off < 0 || off + size > sizeof(struct bpf_sysctl) || off % size) +		return false; + +	switch (off) { +	case offsetof(struct bpf_sysctl, write): +		if (type != BPF_READ) +			return false; +		bpf_ctx_record_field_size(info, size_default); +		return bpf_ctx_narrow_access_ok(off, size, size_default); +	case offsetof(struct bpf_sysctl, file_pos): +		if (type == BPF_READ) { +			bpf_ctx_record_field_size(info, size_default); +			return bpf_ctx_narrow_access_ok(off, size, size_default); +		} else { +			return size == size_default; +		} +	default: +		return false; +	} +} + +static u32 sysctl_convert_ctx_access(enum bpf_access_type type, +				     const struct bpf_insn *si, +				     struct bpf_insn *insn_buf, +				     struct bpf_prog *prog, u32 *target_size) +{ +	struct bpf_insn *insn = insn_buf; + +	switch (si->off) { +	case offsetof(struct bpf_sysctl, write): +		*insn++ = BPF_LDX_MEM( +			BPF_SIZE(si->code), si->dst_reg, si->src_reg, +			bpf_target_off(struct bpf_sysctl_kern, write, +				       FIELD_SIZEOF(struct bpf_sysctl_kern, +						    write), +				       target_size)); +		break; +	case offsetof(struct bpf_sysctl, file_pos): +		/* ppos is a pointer so it should be accessed via indirect +		 * loads and stores. Also for stores additional temporary +		 * register is used since neither src_reg nor dst_reg can be +		 * overridden. 
+		 */ +		if (type == BPF_WRITE) { +			int treg = BPF_REG_9; + +			if (si->src_reg == treg || si->dst_reg == treg) +				--treg; +			if (si->src_reg == treg || si->dst_reg == treg) +				--treg; +			*insn++ = BPF_STX_MEM( +				BPF_DW, si->dst_reg, treg, +				offsetof(struct bpf_sysctl_kern, tmp_reg)); +			*insn++ = BPF_LDX_MEM( +				BPF_FIELD_SIZEOF(struct bpf_sysctl_kern, ppos), +				treg, si->dst_reg, +				offsetof(struct bpf_sysctl_kern, ppos)); +			*insn++ = BPF_STX_MEM( +				BPF_SIZEOF(u32), treg, si->src_reg, 0); +			*insn++ = BPF_LDX_MEM( +				BPF_DW, treg, si->dst_reg, +				offsetof(struct bpf_sysctl_kern, tmp_reg)); +		} else { +			*insn++ = BPF_LDX_MEM( +				BPF_FIELD_SIZEOF(struct bpf_sysctl_kern, ppos), +				si->dst_reg, si->src_reg, +				offsetof(struct bpf_sysctl_kern, ppos)); +			*insn++ = BPF_LDX_MEM( +				BPF_SIZE(si->code), si->dst_reg, si->dst_reg, 0); +		} +		*target_size = sizeof(u32); +		break; +	} + +	return insn - insn_buf; +} + +const struct bpf_verifier_ops cg_sysctl_verifier_ops = { +	.get_func_proto		= sysctl_func_proto, +	.is_valid_access	= sysctl_is_valid_access, +	.convert_ctx_access	= sysctl_convert_ctx_access, +}; + +const struct bpf_prog_ops cg_sysctl_prog_ops = { +}; diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index ff09d32a8a1b..7c473f208a10 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-or-later  /*   * Linux Socket Filter - Kernel level socket filtering   * @@ -12,11 +13,6 @@   *	Alexei Starovoitov <ast@plumgrid.com>   *	Daniel Borkmann <dborkman@redhat.com>   * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - *   * Andi Kleen - Fix a few bad bugs and races.   * Kris Katterjohn - Added many additional checks in bpf_check_classic()   */ @@ -292,7 +288,8 @@ int bpf_prog_calc_tag(struct bpf_prog *fp)  		dst[i] = fp->insnsi[i];  		if (!was_ld_map &&  		    dst[i].code == (BPF_LD | BPF_IMM | BPF_DW) && -		    dst[i].src_reg == BPF_PSEUDO_MAP_FD) { +		    (dst[i].src_reg == BPF_PSEUDO_MAP_FD || +		     dst[i].src_reg == BPF_PSEUDO_MAP_VALUE)) {  			was_ld_map = true;  			dst[i].imm = 0;  		} else if (was_ld_map && @@ -337,7 +334,7 @@ int bpf_prog_calc_tag(struct bpf_prog *fp)  }  static int bpf_adj_delta_to_imm(struct bpf_insn *insn, u32 pos, s32 end_old, -				s32 end_new, u32 curr, const bool probe_pass) +				s32 end_new, s32 curr, const bool probe_pass)  {  	const s64 imm_min = S32_MIN, imm_max = S32_MAX;  	s32 delta = end_new - end_old; @@ -355,7 +352,7 @@ static int bpf_adj_delta_to_imm(struct bpf_insn *insn, u32 pos, s32 end_old,  }  static int bpf_adj_delta_to_off(struct bpf_insn *insn, u32 pos, s32 end_old, -				s32 end_new, u32 curr, const bool probe_pass) +				s32 end_new, s32 curr, const bool probe_pass)  {  	const s32 off_min = S16_MIN, off_max = S16_MAX;  	s32 delta = end_new - end_old; @@ -438,6 +435,7 @@ struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off,  	u32 insn_adj_cnt, insn_rest, insn_delta = len - 1;  	const u32 cnt_max = S16_MAX;  	struct bpf_prog *prog_adj; +	int err;  	/* Since our patchlet doesn't expand the image, we're done. */  	if (insn_delta == 0) { @@ -453,8 +451,8 @@ struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off,  	 * we afterwards may not fail anymore.  	 
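The kernel/bpf/cgroup.c additions earlier in this diff wire sysctl access through cgroup-attached BPF: __cgroup_bpf_run_filter_sysctl() returns -EPERM unless every attached program returns 1, and the new helpers expose the sysctl name and values. A sketch of a program for that hook in restricted C; SEC() and the helper declaration come from libbpf/selftests-style headers, so treat that scaffolding and the include path as assumptions:

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>	/* SEC(), bpf_sysctl_get_name() declaration */

/* Allow all reads; reject writes to anything under "kernel/".  Returning 0
 * makes the kernel fail the access with -EPERM, returning 1 lets it
 * proceed, per the kernel-doc above. */
SEC("cgroup/sysctl")
int demo_sysctl_filter(struct bpf_sysctl *ctx)
{
	char name[32] = {};

	if (!ctx->write)
		return 1;

	/* flags == 0: full name including directories, e.g. "kernel/hostname". */
	if (bpf_sysctl_get_name(ctx, name, sizeof(name), 0) < 0)
		return 0;

	if (name[0] == 'k' && name[1] == 'e' && name[2] == 'r')
		return 0;

	return 1;
}

char _license[] SEC("license") = "GPL";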
*/  	if (insn_adj_cnt > cnt_max && -	    bpf_adj_branches(prog, off, off + 1, off + len, true)) -		return NULL; +	    (err = bpf_adj_branches(prog, off, off + 1, off + len, true))) +		return ERR_PTR(err);  	/* Several new instructions need to be inserted. Make room  	 * for them. Likely, there's no need for a new allocation as @@ -463,7 +461,7 @@ struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off,  	prog_adj = bpf_prog_realloc(prog, bpf_prog_size(insn_adj_cnt),  				    GFP_USER);  	if (!prog_adj) -		return NULL; +		return ERR_PTR(-ENOMEM);  	prog_adj->len = insn_adj_cnt; @@ -848,7 +846,6 @@ void __weak bpf_jit_free(struct bpf_prog *fp)  	if (fp->jited) {  		struct bpf_binary_header *hdr = bpf_jit_binary_hdr(fp); -		bpf_jit_binary_unlock_ro(hdr);  		bpf_jit_binary_free(hdr);  		WARN_ON_ONCE(!bpf_prog_kallsyms_verify_off(fp)); @@ -1096,13 +1093,13 @@ struct bpf_prog *bpf_jit_blind_constants(struct bpf_prog *prog)  			continue;  		tmp = bpf_patch_insn_single(clone, i, insn_buff, rewritten); -		if (!tmp) { +		if (IS_ERR(tmp)) {  			/* Patching may have repointed aux->prog during  			 * realloc from the original one, so we need to  			 * fix it up here on error.  			 */  			bpf_jit_prog_release_other(prog, clone); -			return ERR_PTR(-ENOMEM); +			return tmp;  		}  		clone = tmp; diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index 8974b3755670..cf727d77c6c6 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -160,11 +160,15 @@ static void cpu_map_kthread_stop(struct work_struct *work)  }  static struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu, -					 struct xdp_frame *xdpf) +					 struct xdp_frame *xdpf, +					 struct sk_buff *skb)  { +	unsigned int hard_start_headroom;  	unsigned int frame_size;  	void *pkt_data_start; -	struct sk_buff *skb; + +	/* Part of headroom was reserved to xdpf */ +	hard_start_headroom = sizeof(struct xdp_frame) +  xdpf->headroom;  	/* build_skb need to place skb_shared_info after SKB end, and  	 * also want to know the memory "truesize".  Thus, need to @@ -183,15 +187,15 @@ static struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu,  	 * is not at a fixed memory location, with mixed length  	 * packets, which is bad for cache-line hotness.  	 */ -	frame_size = SKB_DATA_ALIGN(xdpf->len + xdpf->headroom) + +	frame_size = SKB_DATA_ALIGN(xdpf->len + hard_start_headroom) +  		SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); -	pkt_data_start = xdpf->data - xdpf->headroom; -	skb = build_skb(pkt_data_start, frame_size); -	if (!skb) +	pkt_data_start = xdpf->data - hard_start_headroom; +	skb = build_skb_around(skb, pkt_data_start, frame_size); +	if (unlikely(!skb))  		return NULL; -	skb_reserve(skb, xdpf->headroom); +	skb_reserve(skb, hard_start_headroom);  	__skb_put(skb, xdpf->len);  	if (xdpf->metasize)  		skb_metadata_set(skb, xdpf->metasize); @@ -205,6 +209,9 @@ static struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu,  	 * - RX ring dev queue index	(skb_record_rx_queue)  	 */ +	/* Allow SKB to reuse area used by xdp_frame */ +	xdp_scrub_frame(xdpf); +  	return skb;  } @@ -233,6 +240,8 @@ static void put_cpu_map_entry(struct bpf_cpu_map_entry *rcpu)  	}  } +#define CPUMAP_BATCH 8 +  static int cpu_map_kthread_run(void *data)  {  	struct bpf_cpu_map_entry *rcpu = data; @@ -245,8 +254,11 @@ static int cpu_map_kthread_run(void *data)  	 * kthread_stop signal until queue is empty.  	 
*/  	while (!kthread_should_stop() || !__ptr_ring_empty(rcpu->queue)) { -		unsigned int processed = 0, drops = 0, sched = 0; -		struct xdp_frame *xdpf; +		unsigned int drops = 0, sched = 0; +		void *frames[CPUMAP_BATCH]; +		void *skbs[CPUMAP_BATCH]; +		gfp_t gfp = __GFP_ZERO | GFP_ATOMIC; +		int i, n, m;  		/* Release CPU reschedule checks */  		if (__ptr_ring_empty(rcpu->queue)) { @@ -262,18 +274,38 @@ static int cpu_map_kthread_run(void *data)  			sched = cond_resched();  		} -		/* Process packets in rcpu->queue */ -		local_bh_disable();  		/*  		 * The bpf_cpu_map_entry is single consumer, with this  		 * kthread CPU pinned. Lockless access to ptr_ring  		 * consume side valid as no-resize allowed of queue.  		 */ -		while ((xdpf = __ptr_ring_consume(rcpu->queue))) { -			struct sk_buff *skb; +		n = ptr_ring_consume_batched(rcpu->queue, frames, CPUMAP_BATCH); + +		for (i = 0; i < n; i++) { +			void *f = frames[i]; +			struct page *page = virt_to_page(f); + +			/* Bring struct page memory area to curr CPU. Read by +			 * build_skb_around via page_is_pfmemalloc(), and when +			 * freed written by page_frag_free call. +			 */ +			prefetchw(page); +		} + +		m = kmem_cache_alloc_bulk(skbuff_head_cache, gfp, n, skbs); +		if (unlikely(m == 0)) { +			for (i = 0; i < n; i++) +				skbs[i] = NULL; /* effect: xdp_return_frame */ +			drops = n; +		} + +		local_bh_disable(); +		for (i = 0; i < n; i++) { +			struct xdp_frame *xdpf = frames[i]; +			struct sk_buff *skb = skbs[i];  			int ret; -			skb = cpu_map_build_skb(rcpu, xdpf); +			skb = cpu_map_build_skb(rcpu, xdpf, skb);  			if (!skb) {  				xdp_return_frame(xdpf);  				continue; @@ -283,13 +315,9 @@ static int cpu_map_kthread_run(void *data)  			ret = netif_receive_skb_core(skb);  			if (ret == NET_RX_DROP)  				drops++; - -			/* Limit BH-disable period */ -			if (++processed == 8) -				break;  		}  		/* Feedback loop via tracepoint */ -		trace_xdp_cpumap_kthread(rcpu->map_id, processed, drops, sched); +		trace_xdp_cpumap_kthread(rcpu->map_id, n, drops, sched);  		local_bh_enable(); /* resched point, may call do_softirq() */  	} diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 191b79948424..1e525d70f833 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -164,6 +164,9 @@ static void dev_map_free(struct bpf_map *map)  	bpf_clear_redirect_map(map);  	synchronize_rcu(); +	/* Make sure prior __dev_map_entry_free() have completed. */ +	rcu_barrier(); +  	/* To ensure all pending flush operations have completed wait for flush  	 * bitmap to indicate all flush_needed bits to be zero on _all_ cpus.  	 * Because the above synchronize_rcu() ensures the map is disconnected diff --git a/kernel/bpf/disasm.c b/kernel/bpf/disasm.c index de73f55e42fd..d9ce383c0f9c 100644 --- a/kernel/bpf/disasm.c +++ b/kernel/bpf/disasm.c @@ -205,10 +205,11 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs,  			 * part of the ldimm64 insn is accessible.  			 
*/  			u64 imm = ((u64)(insn + 1)->imm << 32) | (u32)insn->imm; -			bool map_ptr = insn->src_reg == BPF_PSEUDO_MAP_FD; +			bool is_ptr = insn->src_reg == BPF_PSEUDO_MAP_FD || +				      insn->src_reg == BPF_PSEUDO_MAP_VALUE;  			char tmp[64]; -			if (map_ptr && !allow_ptr_leaks) +			if (is_ptr && !allow_ptr_leaks)  				imm = 0;  			verbose(cbs->private_data, "(%02x) r%d = %s\n", diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index fed15cf94dca..0f2708fde5f7 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -23,7 +23,7 @@  #define HTAB_CREATE_FLAG_MASK						\  	(BPF_F_NO_PREALLOC | BPF_F_NO_COMMON_LRU | BPF_F_NUMA_NODE |	\ -	 BPF_F_RDONLY | BPF_F_WRONLY | BPF_F_ZERO_SEED) +	 BPF_F_ACCESS_MASK | BPF_F_ZERO_SEED)  struct bucket {  	struct hlist_nulls_head head; @@ -262,8 +262,8 @@ static int htab_map_alloc_check(union bpf_attr *attr)  		/* Guard against local DoS, and discourage production use. */  		return -EPERM; -	if (attr->map_flags & ~HTAB_CREATE_FLAG_MASK) -		/* reserved bits should not be used */ +	if (attr->map_flags & ~HTAB_CREATE_FLAG_MASK || +	    !bpf_map_flags_access_ok(attr->map_flags))  		return -EINVAL;  	if (!lru && percpu_lru) @@ -527,18 +527,30 @@ static u32 htab_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)  	return insn - insn_buf;  } -static void *htab_lru_map_lookup_elem(struct bpf_map *map, void *key) +static __always_inline void *__htab_lru_map_lookup_elem(struct bpf_map *map, +							void *key, const bool mark)  {  	struct htab_elem *l = __htab_map_lookup_elem(map, key);  	if (l) { -		bpf_lru_node_set_ref(&l->lru_node); +		if (mark) +			bpf_lru_node_set_ref(&l->lru_node);  		return l->key + round_up(map->key_size, 8);  	}  	return NULL;  } +static void *htab_lru_map_lookup_elem(struct bpf_map *map, void *key) +{ +	return __htab_lru_map_lookup_elem(map, key, true); +} + +static void *htab_lru_map_lookup_elem_sys(struct bpf_map *map, void *key) +{ +	return __htab_lru_map_lookup_elem(map, key, false); +} +  static u32 htab_lru_map_gen_lookup(struct bpf_map *map,  				   struct bpf_insn *insn_buf)  { @@ -1250,6 +1262,7 @@ const struct bpf_map_ops htab_lru_map_ops = {  	.map_free = htab_map_free,  	.map_get_next_key = htab_map_get_next_key,  	.map_lookup_elem = htab_lru_map_lookup_elem, +	.map_lookup_elem_sys_only = htab_lru_map_lookup_elem_sys,  	.map_update_elem = htab_lru_map_update_elem,  	.map_delete_elem = htab_lru_map_delete_elem,  	.map_gen_lookup = htab_lru_map_gen_lookup, @@ -1281,7 +1294,6 @@ static void *htab_lru_percpu_map_lookup_elem(struct bpf_map *map, void *key)  int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value)  { -	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);  	struct htab_elem *l;  	void __percpu *pptr;  	int ret = -ENOENT; @@ -1297,8 +1309,9 @@ int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value)  	l = __htab_map_lookup_elem(map, key);  	if (!l)  		goto out; -	if (htab_is_lru(htab)) -		bpf_lru_node_set_ref(&l->lru_node); +	/* We do not mark LRU map element here in order to not mess up +	 * eviction heuristics when user space does a map walk. 
+	 */  	pptr = htab_elem_get_ptr(l, map->key_size);  	for_each_possible_cpu(cpu) {  		bpf_long_memcpy(value + off, diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index a411fc17d265..4266ffde07ca 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -18,6 +18,9 @@  #include <linux/sched.h>  #include <linux/uidgid.h>  #include <linux/filter.h> +#include <linux/ctype.h> + +#include "../../lib/kstrtox.h"  /* If kernel subsystem is allowing eBPF programs to call this function,   * inside its own verifier_ops->get_func_proto() callback it should return @@ -363,4 +366,132 @@ const struct bpf_func_proto bpf_get_local_storage_proto = {  	.arg2_type	= ARG_ANYTHING,  };  #endif + +#define BPF_STRTOX_BASE_MASK 0x1F + +static int __bpf_strtoull(const char *buf, size_t buf_len, u64 flags, +			  unsigned long long *res, bool *is_negative) +{ +	unsigned int base = flags & BPF_STRTOX_BASE_MASK; +	const char *cur_buf = buf; +	size_t cur_len = buf_len; +	unsigned int consumed; +	size_t val_len; +	char str[64]; + +	if (!buf || !buf_len || !res || !is_negative) +		return -EINVAL; + +	if (base != 0 && base != 8 && base != 10 && base != 16) +		return -EINVAL; + +	if (flags & ~BPF_STRTOX_BASE_MASK) +		return -EINVAL; + +	while (cur_buf < buf + buf_len && isspace(*cur_buf)) +		++cur_buf; + +	*is_negative = (cur_buf < buf + buf_len && *cur_buf == '-'); +	if (*is_negative) +		++cur_buf; + +	consumed = cur_buf - buf; +	cur_len -= consumed; +	if (!cur_len) +		return -EINVAL; + +	cur_len = min(cur_len, sizeof(str) - 1); +	memcpy(str, cur_buf, cur_len); +	str[cur_len] = '\0'; +	cur_buf = str; + +	cur_buf = _parse_integer_fixup_radix(cur_buf, &base); +	val_len = _parse_integer(cur_buf, base, res); + +	if (val_len & KSTRTOX_OVERFLOW) +		return -ERANGE; + +	if (val_len == 0) +		return -EINVAL; + +	cur_buf += val_len; +	consumed += cur_buf - str; + +	return consumed; +} + +static int __bpf_strtoll(const char *buf, size_t buf_len, u64 flags, +			 long long *res) +{ +	unsigned long long _res; +	bool is_negative; +	int err; + +	err = __bpf_strtoull(buf, buf_len, flags, &_res, &is_negative); +	if (err < 0) +		return err; +	if (is_negative) { +		if ((long long)-_res > 0) +			return -ERANGE; +		*res = -_res; +	} else { +		if ((long long)_res < 0) +			return -ERANGE; +		*res = _res; +	} +	return err; +} + +BPF_CALL_4(bpf_strtol, const char *, buf, size_t, buf_len, u64, flags, +	   long *, res) +{ +	long long _res; +	int err; + +	err = __bpf_strtoll(buf, buf_len, flags, &_res); +	if (err < 0) +		return err; +	if (_res != (long)_res) +		return -ERANGE; +	*res = _res; +	return err; +} + +const struct bpf_func_proto bpf_strtol_proto = { +	.func		= bpf_strtol, +	.gpl_only	= false, +	.ret_type	= RET_INTEGER, +	.arg1_type	= ARG_PTR_TO_MEM, +	.arg2_type	= ARG_CONST_SIZE, +	.arg3_type	= ARG_ANYTHING, +	.arg4_type	= ARG_PTR_TO_LONG, +}; + +BPF_CALL_4(bpf_strtoul, const char *, buf, size_t, buf_len, u64, flags, +	   unsigned long *, res) +{ +	unsigned long long _res; +	bool is_negative; +	int err; + +	err = __bpf_strtoull(buf, buf_len, flags, &_res, &is_negative); +	if (err < 0) +		return err; +	if (is_negative) +		return -EINVAL; +	if (_res != (unsigned long)_res) +		return -ERANGE; +	*res = _res; +	return err; +} + +const struct bpf_func_proto bpf_strtoul_proto = { +	.func		= bpf_strtoul, +	.gpl_only	= false, +	.ret_type	= RET_INTEGER, +	.arg1_type	= ARG_PTR_TO_MEM, +	.arg2_type	= ARG_CONST_SIZE, +	.arg3_type	= ARG_ANYTHING, +	.arg4_type	= ARG_PTR_TO_LONG, +};  #endif diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c 
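
For orientation, here is a minimal sketch of how a BPF_PROG_TYPE_CGROUP_SYSCTL program might combine the bpf_sysctl_get_new_value() and bpf_strtoul() helpers wired up above. It is not part of the patch itself: the program name, the "cgroup/sysctl" section name, the hand-rolled helper stubs (normally provided by libbpf's bpf_helpers.h) and the 4096 limit are all illustrative, and it assumes a 5.2+ uapi linux/bpf.h that already defines struct bpf_sysctl, BPF_FUNC_strtoul and BPF_FUNC_sysctl_get_new_value.

#include <linux/bpf.h>

char _license[] __attribute__((section("license"), used)) = "GPL";

/* Helper stubs resolved by their BPF_FUNC_* ids; libbpf's bpf_helpers.h
 * would normally declare these for us. */
static long (*bpf_strtoul)(const char *buf, unsigned long buf_len, __u64 flags,
			   unsigned long *res) = (void *) BPF_FUNC_strtoul;
static long (*bpf_sysctl_get_new_value)(struct bpf_sysctl *ctx, char *buf,
					unsigned long buf_len) =
		(void *) BPF_FUNC_sysctl_get_new_value;

__attribute__((section("cgroup/sysctl"), used))
int sysctl_guard(struct bpf_sysctl *ctx)
{
	unsigned long val = 0;	/* pre-initialized: ARG_PTR_TO_LONG wants an
				 * initialized stack slot */
	char buf[16] = {};

	if (!ctx->write)	/* reads are always allowed */
		return 1;

	/* Fetch the value being written and parse it; base 0 lets
	 * __bpf_strtoull() auto-detect the radix. */
	if (bpf_sysctl_get_new_value(ctx, buf, sizeof(buf)) < 0)
		return 0;
	if (bpf_strtoul(buf, sizeof(buf), 0, &val) < 0)
		return 0;

	return val <= 4096;	/* illustrative policy: cap writes at 4096 */
}

Returning 0 makes the write fail with -EPERM, returning 1 lets it through, in line with the 0/1 return-range check that this series extends to BPF_PROG_TYPE_CGROUP_SYSCTL further down in check_return_code().
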
index 2ada5e21dfa6..84a80b02db99 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -518,7 +518,7 @@ out:  static struct bpf_prog *__get_prog_inode(struct inode *inode, enum bpf_prog_type type)  {  	struct bpf_prog *prog; -	int ret = inode_permission(inode, MAY_READ | MAY_WRITE); +	int ret = inode_permission(inode, MAY_READ);  	if (ret)  		return ERR_PTR(ret); @@ -554,19 +554,6 @@ struct bpf_prog *bpf_prog_get_type_path(const char *name, enum bpf_prog_type typ  }  EXPORT_SYMBOL(bpf_prog_get_type_path); -static void bpf_evict_inode(struct inode *inode) -{ -	enum bpf_type type; - -	truncate_inode_pages_final(&inode->i_data); -	clear_inode(inode); - -	if (S_ISLNK(inode->i_mode)) -		kfree(inode->i_link); -	if (!bpf_inode_type(inode, &type)) -		bpf_any_put(inode->i_private, type); -} -  /*   * Display the mount options in /proc/mounts.   */ @@ -579,11 +566,22 @@ static int bpf_show_options(struct seq_file *m, struct dentry *root)  	return 0;  } +static void bpf_free_inode(struct inode *inode) +{ +	enum bpf_type type; + +	if (S_ISLNK(inode->i_mode)) +		kfree(inode->i_link); +	if (!bpf_inode_type(inode, &type)) +		bpf_any_put(inode->i_private, type); +	free_inode_nonrcu(inode); +} +  static const struct super_operations bpf_super_ops = {  	.statfs		= simple_statfs,  	.drop_inode	= generic_delete_inode,  	.show_options	= bpf_show_options, -	.evict_inode	= bpf_evict_inode, +	.free_inode	= bpf_free_inode,  };  enum { diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c index 6b572e2de7fb..980e8f1f6cb5 100644 --- a/kernel/bpf/local_storage.c +++ b/kernel/bpf/local_storage.c @@ -14,7 +14,7 @@ DEFINE_PER_CPU(struct bpf_cgroup_storage*, bpf_cgroup_storage[MAX_BPF_CGROUP_STO  #ifdef CONFIG_CGROUP_BPF  #define LOCAL_STORAGE_CREATE_FLAG_MASK					\ -	(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) +	(BPF_F_NUMA_NODE | BPF_F_ACCESS_MASK)  struct bpf_cgroup_storage_map {  	struct bpf_map map; @@ -282,8 +282,8 @@ static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr)  	if (attr->value_size > PAGE_SIZE)  		return ERR_PTR(-E2BIG); -	if (attr->map_flags & ~LOCAL_STORAGE_CREATE_FLAG_MASK) -		/* reserved bits should not be used */ +	if (attr->map_flags & ~LOCAL_STORAGE_CREATE_FLAG_MASK || +	    !bpf_map_flags_access_ok(attr->map_flags))  		return ERR_PTR(-EINVAL);  	if (attr->max_entries) diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index 93a5cbbde421..e61630c2e50b 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -538,7 +538,7 @@ out:  #define LPM_KEY_SIZE_MIN	LPM_KEY_SIZE(LPM_DATA_SIZE_MIN)  #define LPM_CREATE_FLAG_MASK	(BPF_F_NO_PREALLOC | BPF_F_NUMA_NODE |	\ -				 BPF_F_RDONLY | BPF_F_WRONLY) +				 BPF_F_ACCESS_MASK)  static struct bpf_map *trie_alloc(union bpf_attr *attr)  { @@ -553,6 +553,7 @@ static struct bpf_map *trie_alloc(union bpf_attr *attr)  	if (attr->max_entries == 0 ||  	    !(attr->map_flags & BPF_F_NO_PREALLOC) ||  	    attr->map_flags & ~LPM_CREATE_FLAG_MASK || +	    !bpf_map_flags_access_ok(attr->map_flags) ||  	    attr->key_size < LPM_KEY_SIZE_MIN ||  	    attr->key_size > LPM_KEY_SIZE_MAX ||  	    attr->value_size < LPM_VAL_SIZE_MIN || diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c index 3dff41403583..fab4fb134547 100644 --- a/kernel/bpf/map_in_map.c +++ b/kernel/bpf/map_in_map.c @@ -1,8 +1,5 @@ +// SPDX-License-Identifier: GPL-2.0-only  /* Copyright (c) 2017 Facebook - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - 
* License as published by the Free Software Foundation.   */  #include <linux/slab.h>  #include <linux/bpf.h> diff --git a/kernel/bpf/map_in_map.h b/kernel/bpf/map_in_map.h index 6183db9ec08c..a507bf6ef8b9 100644 --- a/kernel/bpf/map_in_map.h +++ b/kernel/bpf/map_in_map.h @@ -1,8 +1,5 @@ +/* SPDX-License-Identifier: GPL-2.0-only */  /* Copyright (c) 2017 Facebook - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation.   */  #ifndef __MAP_IN_MAP_H__  #define __MAP_IN_MAP_H__ diff --git a/kernel/bpf/percpu_freelist.c b/kernel/bpf/percpu_freelist.c index 0c1b4ba9e90e..6e090140b924 100644 --- a/kernel/bpf/percpu_freelist.c +++ b/kernel/bpf/percpu_freelist.c @@ -1,8 +1,5 @@ +// SPDX-License-Identifier: GPL-2.0-only  /* Copyright (c) 2016 Facebook - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation.   */  #include "percpu_freelist.h" diff --git a/kernel/bpf/percpu_freelist.h b/kernel/bpf/percpu_freelist.h index c3960118e617..fbf8a8a28979 100644 --- a/kernel/bpf/percpu_freelist.h +++ b/kernel/bpf/percpu_freelist.h @@ -1,8 +1,5 @@ +/* SPDX-License-Identifier: GPL-2.0-only */  /* Copyright (c) 2016 Facebook - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation.   */  #ifndef __PERCPU_FREELIST_H__  #define __PERCPU_FREELIST_H__ diff --git a/kernel/bpf/queue_stack_maps.c b/kernel/bpf/queue_stack_maps.c index b384ea9f3254..0b140d236889 100644 --- a/kernel/bpf/queue_stack_maps.c +++ b/kernel/bpf/queue_stack_maps.c @@ -11,8 +11,7 @@  #include "percpu_freelist.h"  #define QUEUE_STACK_CREATE_FLAG_MASK \ -	(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) - +	(BPF_F_NUMA_NODE | BPF_F_ACCESS_MASK)  struct bpf_queue_stack {  	struct bpf_map map; @@ -52,7 +51,8 @@ static int queue_stack_map_alloc_check(union bpf_attr *attr)  	/* check sanity of attributes */  	if (attr->max_entries == 0 || attr->key_size != 0 ||  	    attr->value_size == 0 || -	    attr->map_flags & ~QUEUE_STACK_CREATE_FLAG_MASK) +	    attr->map_flags & ~QUEUE_STACK_CREATE_FLAG_MASK || +	    !bpf_map_flags_access_ok(attr->map_flags))  		return -EINVAL;  	if (attr->value_size > KMALLOC_MAX_SIZE) diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 950ab2f28922..d38e49f943a1 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -1,8 +1,5 @@ +// SPDX-License-Identifier: GPL-2.0-only  /* Copyright (c) 2016 Facebook - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation.   */  #include <linux/bpf.h>  #include <linux/jhash.h> diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 62f6bced3a3c..cb5440b02e82 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -136,21 +136,29 @@ static struct bpf_map *find_and_alloc_map(union bpf_attr *attr)  void *bpf_map_area_alloc(size_t size, int numa_node)  { -	/* We definitely need __GFP_NORETRY, so OOM killer doesn't -	 * trigger under memory pressure as we really just want to -	 * fail instead. 
+	/* We really just want to fail instead of triggering OOM killer +	 * under memory pressure, therefore we set __GFP_NORETRY to kmalloc, +	 * which is used for lower order allocation requests. +	 * +	 * It has been observed that higher order allocation requests done by +	 * vmalloc with __GFP_NORETRY being set might fail due to not trying +	 * to reclaim memory from the page cache, thus we set +	 * __GFP_RETRY_MAYFAIL to avoid such situations.  	 */ -	const gfp_t flags = __GFP_NOWARN | __GFP_NORETRY | __GFP_ZERO; + +	const gfp_t flags = __GFP_NOWARN | __GFP_ZERO;  	void *area;  	if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) { -		area = kmalloc_node(size, GFP_USER | flags, numa_node); +		area = kmalloc_node(size, GFP_USER | __GFP_NORETRY | flags, +				    numa_node);  		if (area != NULL)  			return area;  	} -	return __vmalloc_node_flags_caller(size, numa_node, GFP_KERNEL | flags, -					   __builtin_return_address(0)); +	return __vmalloc_node_flags_caller(size, numa_node, +					   GFP_KERNEL | __GFP_RETRY_MAYFAIL | +					   flags, __builtin_return_address(0));  }  void bpf_map_area_free(void *area) @@ -158,13 +166,25 @@ void bpf_map_area_free(void *area)  	kvfree(area);  } +static u32 bpf_map_flags_retain_permanent(u32 flags) +{ +	/* Some map creation flags are not tied to the map object but +	 * rather to the map fd instead, so they have no meaning upon +	 * map object inspection since multiple file descriptors with +	 * different (access) properties can exist here. Thus, given +	 * this has zero meaning for the map itself, lets clear these +	 * from here. +	 */ +	return flags & ~(BPF_F_RDONLY | BPF_F_WRONLY); +} +  void bpf_map_init_from_attr(struct bpf_map *map, union bpf_attr *attr)  {  	map->map_type = attr->map_type;  	map->key_size = attr->key_size;  	map->value_size = attr->value_size;  	map->max_entries = attr->max_entries; -	map->map_flags = attr->map_flags; +	map->map_flags = bpf_map_flags_retain_permanent(attr->map_flags);  	map->numa_node = bpf_map_attr_numa_node(attr);  } @@ -335,6 +355,18 @@ static int bpf_map_release(struct inode *inode, struct file *filp)  	return 0;  } +static fmode_t map_get_sys_perms(struct bpf_map *map, struct fd f) +{ +	fmode_t mode = f.file->f_mode; + +	/* Our file permissions may have been overridden by global +	 * map permissions facing syscall side. +	 */ +	if (READ_ONCE(map->frozen)) +		mode &= ~FMODE_CAN_WRITE; +	return mode; +} +  #ifdef CONFIG_PROC_FS  static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp)  { @@ -356,14 +388,16 @@ static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp)  		   "max_entries:\t%u\n"  		   "map_flags:\t%#x\n"  		   "memlock:\t%llu\n" -		   "map_id:\t%u\n", +		   "map_id:\t%u\n" +		   "frozen:\t%u\n",  		   map->map_type,  		   map->key_size,  		   map->value_size,  		   map->max_entries,  		   map->map_flags,  		   map->pages * 1ULL << PAGE_SHIFT, -		   map->id); +		   map->id, +		   READ_ONCE(map->frozen));  	if (owner_prog_type) {  		seq_printf(m, "owner_prog_type:\t%u\n", @@ -440,10 +474,10 @@ static int bpf_obj_name_cpy(char *dst, const char *src)  	const char *end = src + BPF_OBJ_NAME_LEN;  	memset(dst, 0, BPF_OBJ_NAME_LEN); - -	/* Copy all isalnum() and '_' char */ +	/* Copy all isalnum(), '_' and '.' chars. 
*/  	while (src < end && *src) { -		if (!isalnum(*src) && *src != '_') +		if (!isalnum(*src) && +		    *src != '_' && *src != '.')  			return -EINVAL;  		*dst++ = *src++;  	} @@ -470,9 +504,16 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf,  	u32 key_size, value_size;  	int ret = 0; -	key_type = btf_type_id_size(btf, &btf_key_id, &key_size); -	if (!key_type || key_size != map->key_size) -		return -EINVAL; +	/* Some maps allow key to be unspecified. */ +	if (btf_key_id) { +		key_type = btf_type_id_size(btf, &btf_key_id, &key_size); +		if (!key_type || key_size != map->key_size) +			return -EINVAL; +	} else { +		key_type = btf_type_by_id(btf, 0); +		if (!map->ops->map_check_btf) +			return -EINVAL; +	}  	value_type = btf_type_id_size(btf, &btf_value_id, &value_size);  	if (!value_type || value_size != map->value_size) @@ -481,9 +522,12 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf,  	map->spin_lock_off = btf_find_spin_lock(btf, value_type);  	if (map_value_has_spin_lock(map)) { +		if (map->map_flags & BPF_F_RDONLY_PROG) +			return -EACCES;  		if (map->map_type != BPF_MAP_TYPE_HASH &&  		    map->map_type != BPF_MAP_TYPE_ARRAY && -		    map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE) +		    map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE && +		    map->map_type != BPF_MAP_TYPE_SK_STORAGE)  			return -ENOTSUPP;  		if (map->spin_lock_off + sizeof(struct bpf_spin_lock) >  		    map->value_size) { @@ -537,7 +581,7 @@ static int map_create(union bpf_attr *attr)  	if (attr->btf_key_type_id || attr->btf_value_type_id) {  		struct btf *btf; -		if (!attr->btf_key_type_id || !attr->btf_value_type_id) { +		if (!attr->btf_value_type_id) {  			err = -EINVAL;  			goto free_map_nouncharge;  		} @@ -705,8 +749,7 @@ static int map_lookup_elem(union bpf_attr *attr)  	map = __bpf_map_get(f);  	if (IS_ERR(map))  		return PTR_ERR(map); - -	if (!(f.file->f_mode & FMODE_CAN_READ)) { +	if (!(map_get_sys_perms(map, f) & FMODE_CAN_READ)) {  		err = -EPERM;  		goto err_put;  	} @@ -765,7 +808,10 @@ static int map_lookup_elem(union bpf_attr *attr)  		err = map->ops->map_peek_elem(map, value);  	} else {  		rcu_read_lock(); -		ptr = map->ops->map_lookup_elem(map, key); +		if (map->ops->map_lookup_elem_sys_only) +			ptr = map->ops->map_lookup_elem_sys_only(map, key); +		else +			ptr = map->ops->map_lookup_elem(map, key);  		if (IS_ERR(ptr)) {  			err = PTR_ERR(ptr);  		} else if (!ptr) { @@ -835,8 +881,7 @@ static int map_update_elem(union bpf_attr *attr)  	map = __bpf_map_get(f);  	if (IS_ERR(map))  		return PTR_ERR(map); - -	if (!(f.file->f_mode & FMODE_CAN_WRITE)) { +	if (!(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) {  		err = -EPERM;  		goto err_put;  	} @@ -947,8 +992,7 @@ static int map_delete_elem(union bpf_attr *attr)  	map = __bpf_map_get(f);  	if (IS_ERR(map))  		return PTR_ERR(map); - -	if (!(f.file->f_mode & FMODE_CAN_WRITE)) { +	if (!(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) {  		err = -EPERM;  		goto err_put;  	} @@ -999,8 +1043,7 @@ static int map_get_next_key(union bpf_attr *attr)  	map = __bpf_map_get(f);  	if (IS_ERR(map))  		return PTR_ERR(map); - -	if (!(f.file->f_mode & FMODE_CAN_READ)) { +	if (!(map_get_sys_perms(map, f) & FMODE_CAN_READ)) {  		err = -EPERM;  		goto err_put;  	} @@ -1067,8 +1110,7 @@ static int map_lookup_and_delete_elem(union bpf_attr *attr)  	map = __bpf_map_get(f);  	if (IS_ERR(map))  		return PTR_ERR(map); - -	if (!(f.file->f_mode & FMODE_CAN_WRITE)) { +	if (!(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) {  		err = -EPERM;  		
goto err_put;  	} @@ -1110,6 +1152,36 @@ err_put:  	return err;  } +#define BPF_MAP_FREEZE_LAST_FIELD map_fd + +static int map_freeze(const union bpf_attr *attr) +{ +	int err = 0, ufd = attr->map_fd; +	struct bpf_map *map; +	struct fd f; + +	if (CHECK_ATTR(BPF_MAP_FREEZE)) +		return -EINVAL; + +	f = fdget(ufd); +	map = __bpf_map_get(f); +	if (IS_ERR(map)) +		return PTR_ERR(map); +	if (READ_ONCE(map->frozen)) { +		err = -EBUSY; +		goto err_put; +	} +	if (!capable(CAP_SYS_ADMIN)) { +		err = -EPERM; +		goto err_put; +	} + +	WRITE_ONCE(map->frozen, true); +err_put: +	fdput(f); +	return err; +} +  static const struct bpf_prog_ops * const bpf_prog_types[] = {  #define BPF_PROG_TYPE(_id, _name) \  	[_id] = & _name ## _prog_ops, @@ -1549,7 +1621,8 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr)  	/* eBPF programs must be GPL compatible to use GPL-ed functions */  	is_gpl = license_is_gpl_compatible(license); -	if (attr->insn_cnt == 0 || attr->insn_cnt > BPF_MAXINSNS) +	if (attr->insn_cnt == 0 || +	    attr->insn_cnt > (capable(CAP_SYS_ADMIN) ? BPF_COMPLEXITY_LIMIT_INSNS : BPF_MAXINSNS))  		return -E2BIG;  	if (type != BPF_PROG_TYPE_SOCKET_FILTER &&  	    type != BPF_PROG_TYPE_CGROUP_SKB && @@ -1720,12 +1793,16 @@ static int bpf_raw_tracepoint_open(const union bpf_attr *attr)  	}  	raw_tp->btp = btp; -	prog = bpf_prog_get_type(attr->raw_tracepoint.prog_fd, -				 BPF_PROG_TYPE_RAW_TRACEPOINT); +	prog = bpf_prog_get(attr->raw_tracepoint.prog_fd);  	if (IS_ERR(prog)) {  		err = PTR_ERR(prog);  		goto out_free_tp;  	} +	if (prog->type != BPF_PROG_TYPE_RAW_TRACEPOINT && +	    prog->type != BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE) { +		err = -EINVAL; +		goto out_put_prog; +	}  	err = bpf_probe_register(raw_tp->btp, prog);  	if (err) @@ -1819,6 +1896,9 @@ static int bpf_prog_attach(const union bpf_attr *attr)  	case BPF_FLOW_DISSECTOR:  		ptype = BPF_PROG_TYPE_FLOW_DISSECTOR;  		break; +	case BPF_CGROUP_SYSCTL: +		ptype = BPF_PROG_TYPE_CGROUP_SYSCTL; +		break;  	default:  		return -EINVAL;  	} @@ -1897,6 +1977,9 @@ static int bpf_prog_detach(const union bpf_attr *attr)  		return lirc_prog_detach(attr);  	case BPF_FLOW_DISSECTOR:  		return skb_flow_dissector_bpf_prog_detach(attr); +	case BPF_CGROUP_SYSCTL: +		ptype = BPF_PROG_TYPE_CGROUP_SYSCTL; +		break;  	default:  		return -EINVAL;  	} @@ -1930,9 +2013,12 @@ static int bpf_prog_query(const union bpf_attr *attr,  	case BPF_CGROUP_UDP6_SENDMSG:  	case BPF_CGROUP_SOCK_OPS:  	case BPF_CGROUP_DEVICE: +	case BPF_CGROUP_SYSCTL:  		break;  	case BPF_LIRC_MODE2:  		return lirc_prog_query(attr, uattr); +	case BPF_FLOW_DISSECTOR: +		return skb_flow_dissector_prog_query(attr, uattr);  	default:  		return -EINVAL;  	} @@ -1940,7 +2026,7 @@ static int bpf_prog_query(const union bpf_attr *attr,  	return cgroup_bpf_prog_query(attr, uattr);  } -#define BPF_PROG_TEST_RUN_LAST_FIELD test.duration +#define BPF_PROG_TEST_RUN_LAST_FIELD test.ctx_out  static int bpf_prog_test_run(const union bpf_attr *attr,  			     union bpf_attr __user *uattr) @@ -1953,6 +2039,14 @@ static int bpf_prog_test_run(const union bpf_attr *attr,  	if (CHECK_ATTR(BPF_PROG_TEST_RUN))  		return -EINVAL; +	if ((attr->test.ctx_size_in && !attr->test.ctx_in) || +	    (!attr->test.ctx_size_in && attr->test.ctx_in)) +		return -EINVAL; + +	if ((attr->test.ctx_size_out && !attr->test.ctx_out) || +	    (!attr->test.ctx_size_out && attr->test.ctx_out)) +		return -EINVAL; +  	prog = bpf_prog_get(attr->test.prog_fd);  	if (IS_ERR(prog))  		return PTR_ERR(prog); @@ -2063,13 +2157,26 
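
A user-space sketch of the BPF_MAP_FREEZE command introduced above, shown only for illustration: the program is hypothetical, it assumes a 5.2+ linux/bpf.h that defines BPF_MAP_FREEZE and BPF_F_RDONLY_PROG, and per map_freeze() it needs CAP_SYS_ADMIN and must leave every attribute other than map_fd zeroed.

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/bpf.h>

static int sys_bpf(enum bpf_cmd cmd, union bpf_attr *attr, unsigned int size)
{
	return syscall(__NR_bpf, cmd, attr, size);
}

int main(void)
{
	union bpf_attr attr;
	int map_fd, err;

	/* An array that BPF programs may only read (BPF_F_RDONLY_PROG,
	 * enforced by the record_func_map()/check_map_access_type()
	 * checks in this series). */
	memset(&attr, 0, sizeof(attr));
	attr.map_type = BPF_MAP_TYPE_ARRAY;
	attr.key_size = sizeof(__u32);
	attr.value_size = sizeof(__u64);
	attr.max_entries = 1;
	attr.map_flags = BPF_F_RDONLY_PROG;
	map_fd = sys_bpf(BPF_MAP_CREATE, &attr, sizeof(attr));
	if (map_fd < 0) {
		perror("BPF_MAP_CREATE");
		return 1;
	}

	/* ... populate the map via BPF_MAP_UPDATE_ELEM here ... */

	/* Freeze it: only map_fd may be set for BPF_MAP_FREEZE. */
	memset(&attr, 0, sizeof(attr));
	attr.map_fd = map_fd;
	err = sys_bpf(BPF_MAP_FREEZE, &attr, sizeof(attr));
	if (err)
		perror("BPF_MAP_FREEZE");

	close(map_fd);
	return err ? 1 : 0;
}

Once frozen, map_get_sys_perms() masks out FMODE_CAN_WRITE, so later BPF_MAP_UPDATE_ELEM or BPF_MAP_DELETE_ELEM calls on any fd for this map fail with -EPERM, while BPF_F_RDONLY_PROG keeps the program side read-only as well.
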
@@ static int bpf_map_get_fd_by_id(const union bpf_attr *attr)  }  static const struct bpf_map *bpf_map_from_imm(const struct bpf_prog *prog, -					      unsigned long addr) +					      unsigned long addr, u32 *off, +					      u32 *type)  { +	const struct bpf_map *map;  	int i; -	for (i = 0; i < prog->aux->used_map_cnt; i++) -		if (prog->aux->used_maps[i] == (void *)addr) -			return prog->aux->used_maps[i]; +	for (i = 0, *off = 0; i < prog->aux->used_map_cnt; i++) { +		map = prog->aux->used_maps[i]; +		if (map == (void *)addr) { +			*type = BPF_PSEUDO_MAP_FD; +			return map; +		} +		if (!map->ops->map_direct_value_meta) +			continue; +		if (!map->ops->map_direct_value_meta(map, addr, off)) { +			*type = BPF_PSEUDO_MAP_VALUE; +			return map; +		} +	} +  	return NULL;  } @@ -2077,6 +2184,7 @@ static struct bpf_insn *bpf_insn_prepare_dump(const struct bpf_prog *prog)  {  	const struct bpf_map *map;  	struct bpf_insn *insns; +	u32 off, type;  	u64 imm;  	int i; @@ -2104,11 +2212,11 @@ static struct bpf_insn *bpf_insn_prepare_dump(const struct bpf_prog *prog)  			continue;  		imm = ((u64)insns[i + 1].imm << 32) | (u32)insns[i].imm; -		map = bpf_map_from_imm(prog, imm); +		map = bpf_map_from_imm(prog, imm, &off, &type);  		if (map) { -			insns[i].src_reg = BPF_PSEUDO_MAP_FD; +			insns[i].src_reg = type;  			insns[i].imm = map->id; -			insns[i + 1].imm = 0; +			insns[i + 1].imm = off;  			continue;  		}  	} @@ -2698,6 +2806,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz  	case BPF_MAP_GET_NEXT_KEY:  		err = map_get_next_key(&attr);  		break; +	case BPF_MAP_FREEZE: +		err = map_freeze(&attr); +		break;  	case BPF_PROG_LOAD:  		err = bpf_prog_load(&attr, uattr);  		break; diff --git a/kernel/bpf/tnum.c b/kernel/bpf/tnum.c index 938d41211be7..ca52b9642943 100644 --- a/kernel/bpf/tnum.c +++ b/kernel/bpf/tnum.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only  /* tnum: tracked (or tristate) numbers   *   * A tnum tracks knowledge about the bits of a value.  
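
The BPF_PSEUDO_MAP_VALUE handling added to bpf_map_from_imm()/bpf_insn_prepare_dump() above (and to check_ld_imm() further down) is what lets a loader reference a map value directly from an ld_imm64 instruction. The usual producer is compiler-emitted global data; the fragment below is only a sketch of that usage and assumes a clang/libbpf combination with global-data support, where libbpf creates the internal .bss map and rewrites the load with src_reg == BPF_PSEUDO_MAP_VALUE (none of that loader work is shown here).

#include <linux/bpf.h>

char _license[] __attribute__((section("license"), used)) = "GPL";

/* Ends up in .bss; a libbpf with global-data support turns that section into
 * an internal array map and patches accesses to this symbol into a
 * BPF_PSEUDO_MAP_VALUE ld_imm64 plus plain memory operations. */
static volatile __u64 pkt_count;

__attribute__((section("xdp"), used))
int count_packets(struct xdp_md *ctx)
{
	pkt_count++;		/* no bpf_map_lookup_elem() round trip */
	return XDP_PASS;
}

On such a load the verifier types the destination register as PTR_TO_MAP_VALUE with the recorded offset instead of CONST_PTR_TO_MAP, as done in the check_ld_imm() hunk later in this patch.
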
Each bit can be either diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index ce166a002d16..95f9354495ad 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -176,7 +176,6 @@ struct bpf_verifier_stack_elem {  	struct bpf_verifier_stack_elem *next;  }; -#define BPF_COMPLEXITY_LIMIT_INSNS	131072  #define BPF_COMPLEXITY_LIMIT_STACK	1024  #define BPF_COMPLEXITY_LIMIT_STATES	64 @@ -212,7 +211,7 @@ struct bpf_call_arg_meta {  	int access_size;  	s64 msize_smax_value;  	u64 msize_umax_value; -	int ptr_id; +	int ref_obj_id;  	int func_id;  }; @@ -346,35 +345,23 @@ static bool reg_type_may_be_null(enum bpf_reg_type type)  	       type == PTR_TO_TCP_SOCK_OR_NULL;  } -static bool type_is_refcounted(enum bpf_reg_type type) -{ -	return type == PTR_TO_SOCKET; -} - -static bool type_is_refcounted_or_null(enum bpf_reg_type type) -{ -	return type == PTR_TO_SOCKET || type == PTR_TO_SOCKET_OR_NULL; -} - -static bool reg_is_refcounted(const struct bpf_reg_state *reg) -{ -	return type_is_refcounted(reg->type); -} -  static bool reg_may_point_to_spin_lock(const struct bpf_reg_state *reg)  {  	return reg->type == PTR_TO_MAP_VALUE &&  		map_value_has_spin_lock(reg->map_ptr);  } -static bool reg_is_refcounted_or_null(const struct bpf_reg_state *reg) +static bool reg_type_may_be_refcounted_or_null(enum bpf_reg_type type)  { -	return type_is_refcounted_or_null(reg->type); +	return type == PTR_TO_SOCKET || +		type == PTR_TO_SOCKET_OR_NULL || +		type == PTR_TO_TCP_SOCK || +		type == PTR_TO_TCP_SOCK_OR_NULL;  } -static bool arg_type_is_refcounted(enum bpf_arg_type type) +static bool arg_type_may_be_refcounted(enum bpf_arg_type type)  { -	return type == ARG_PTR_TO_SOCKET; +	return type == ARG_PTR_TO_SOCK_COMMON;  }  /* Determine whether the function releases some resources allocated by another @@ -389,7 +376,14 @@ static bool is_release_function(enum bpf_func_id func_id)  static bool is_acquire_function(enum bpf_func_id func_id)  {  	return func_id == BPF_FUNC_sk_lookup_tcp || -		func_id == BPF_FUNC_sk_lookup_udp; +		func_id == BPF_FUNC_sk_lookup_udp || +		func_id == BPF_FUNC_skc_lookup_tcp; +} + +static bool is_ptr_cast_function(enum bpf_func_id func_id) +{ +	return func_id == BPF_FUNC_tcp_sock || +		func_id == BPF_FUNC_sk_fullsock;  }  /* string representation of 'enum bpf_reg_type' */ @@ -411,6 +405,7 @@ static const char * const reg_type_str[] = {  	[PTR_TO_SOCK_COMMON_OR_NULL] = "sock_common_or_null",  	[PTR_TO_TCP_SOCK]	= "tcp_sock",  	[PTR_TO_TCP_SOCK_OR_NULL] = "tcp_sock_or_null", +	[PTR_TO_TP_BUFFER]	= "tp_buffer",  };  static char slot_type_char[] = { @@ -466,6 +461,8 @@ static void print_verifier_state(struct bpf_verifier_env *env,  				verbose(env, ",call_%d", func(env, reg)->callsite);  		} else {  			verbose(env, "(id=%d", reg->id); +			if (reg_type_may_be_refcounted_or_null(t)) +				verbose(env, ",ref_obj_id=%d", reg->ref_obj_id);  			if (t != SCALAR_VALUE)  				verbose(env, ",off=%d", reg->off);  			if (type_is_pkt_pointer(t)) @@ -1095,7 +1092,7 @@ static int check_subprogs(struct bpf_verifier_env *env)  	 */  	subprog[env->subprog_cnt].start = insn_cnt; -	if (env->log.level > 1) +	if (env->log.level & BPF_LOG_LEVEL2)  		for (i = 0; i < env->subprog_cnt; i++)  			verbose(env, "func#%d @%d\n", i, subprog[i].start); @@ -1142,6 +1139,7 @@ static int mark_reg_read(struct bpf_verifier_env *env,  			 struct bpf_reg_state *parent)  {  	bool writes = parent == state->parent; /* Observe write marks */ +	int cnt = 0;  	while (parent) {  		/* if read wasn't screened by an earlier write ... 
*/ @@ -1153,12 +1151,25 @@ static int mark_reg_read(struct bpf_verifier_env *env,  				parent->var_off.value, parent->off);  			return -EFAULT;  		} +		if (parent->live & REG_LIVE_READ) +			/* The parentage chain never changes and +			 * this parent was already marked as LIVE_READ. +			 * There is no need to keep walking the chain again and +			 * keep re-marking all parents as LIVE_READ. +			 * This case happens when the same register is read +			 * multiple times without writes into it in-between. +			 */ +			break;  		/* ... then we depend on parent's value */  		parent->live |= REG_LIVE_READ;  		state = parent;  		parent = state->parent;  		writes = true; +		cnt++;  	} + +	if (env->longest_mark_read_walk < cnt) +		env->longest_mark_read_walk = cnt;  	return 0;  } @@ -1167,30 +1178,32 @@ static int check_reg_arg(struct bpf_verifier_env *env, u32 regno,  {  	struct bpf_verifier_state *vstate = env->cur_state;  	struct bpf_func_state *state = vstate->frame[vstate->curframe]; -	struct bpf_reg_state *regs = state->regs; +	struct bpf_reg_state *reg, *regs = state->regs;  	if (regno >= MAX_BPF_REG) {  		verbose(env, "R%d is invalid\n", regno);  		return -EINVAL;  	} +	reg = ®s[regno];  	if (t == SRC_OP) {  		/* check whether register used as source operand can be read */ -		if (regs[regno].type == NOT_INIT) { +		if (reg->type == NOT_INIT) {  			verbose(env, "R%d !read_ok\n", regno);  			return -EACCES;  		}  		/* We don't need to worry about FP liveness because it's read-only */ -		if (regno != BPF_REG_FP) -			return mark_reg_read(env, ®s[regno], -					     regs[regno].parent); +		if (regno == BPF_REG_FP) +			return 0; + +		return mark_reg_read(env, reg, reg->parent);  	} else {  		/* check whether register used as dest operand can be written to */  		if (regno == BPF_REG_FP) {  			verbose(env, "frame pointer is read only\n");  			return -EACCES;  		} -		regs[regno].live |= REG_LIVE_WRITTEN; +		reg->live |= REG_LIVE_WRITTEN;  		if (t == DST_OP)  			mark_reg_unknown(env, regs, regno);  	} @@ -1416,7 +1429,7 @@ static int check_stack_access(struct bpf_verifier_env *env,  		char tn_buf[48];  		tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); -		verbose(env, "variable stack access var_off=%s off=%d size=%d", +		verbose(env, "variable stack access var_off=%s off=%d size=%d\n",  			tn_buf, off, size);  		return -EACCES;  	} @@ -1429,6 +1442,28 @@ static int check_stack_access(struct bpf_verifier_env *env,  	return 0;  } +static int check_map_access_type(struct bpf_verifier_env *env, u32 regno, +				 int off, int size, enum bpf_access_type type) +{ +	struct bpf_reg_state *regs = cur_regs(env); +	struct bpf_map *map = regs[regno].map_ptr; +	u32 cap = bpf_map_flags_to_cap(map); + +	if (type == BPF_WRITE && !(cap & BPF_MAP_CAN_WRITE)) { +		verbose(env, "write into map forbidden, value_size=%d off=%d size=%d\n", +			map->value_size, off, size); +		return -EACCES; +	} + +	if (type == BPF_READ && !(cap & BPF_MAP_CAN_READ)) { +		verbose(env, "read from map forbidden, value_size=%d off=%d size=%d\n", +			map->value_size, off, size); +		return -EACCES; +	} + +	return 0; +} +  /* check read/write into map element returned by bpf_map_lookup_elem() */  static int __check_map_access(struct bpf_verifier_env *env, u32 regno, int off,  			      int size, bool zero_size_allowed) @@ -1458,7 +1493,7 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno,  	 * need to try adding each of min_value and max_value to off  	 * to make sure our theoretical access will be safe.  	 
*/ -	if (env->log.level) +	if (env->log.level & BPF_LOG_LEVEL)  		print_verifier_state(env, state);  	/* The minimum value is only important with signed @@ -1901,8 +1936,9 @@ continue_func:  		}  		frame++;  		if (frame >= MAX_CALL_FRAMES) { -			WARN_ONCE(1, "verifier bug. Call stack is too deep\n"); -			return -EFAULT; +			verbose(env, "the call stack of %d frames is too deep !\n", +				frame); +			return -E2BIG;  		}  		goto process_func;  	} @@ -1958,6 +1994,32 @@ static int check_ctx_reg(struct bpf_verifier_env *env,  	return 0;  } +static int check_tp_buffer_access(struct bpf_verifier_env *env, +				  const struct bpf_reg_state *reg, +				  int regno, int off, int size) +{ +	if (off < 0) { +		verbose(env, +			"R%d invalid tracepoint buffer access: off=%d, size=%d", +			regno, off, size); +		return -EACCES; +	} +	if (!tnum_is_const(reg->var_off) || reg->var_off.value) { +		char tn_buf[48]; + +		tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); +		verbose(env, +			"R%d invalid variable buffer offset: off=%d, var_off=%s", +			regno, off, tn_buf); +		return -EACCES; +	} +	if (off + size > env->prog->aux->max_tp_access) +		env->prog->aux->max_tp_access = off + size; + +	return 0; +} + +  /* truncate register to smaller size (in bytes)   * must be called with size < BPF_REG_SIZE   */ @@ -2014,7 +2076,9 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn  			verbose(env, "R%d leaks addr into map\n", value_regno);  			return -EACCES;  		} - +		err = check_map_access_type(env, regno, off, size, t); +		if (err) +			return err;  		err = check_map_access(env, regno, off, size, false);  		if (!err && t == BPF_READ && value_regno >= 0)  			mark_reg_unknown(env, regs, value_regno); @@ -2100,6 +2164,10 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn  		err = check_sock_access(env, insn_idx, regno, off, size, t);  		if (!err && value_regno >= 0)  			mark_reg_unknown(env, regs, value_regno); +	} else if (reg->type == PTR_TO_TP_BUFFER) { +		err = check_tp_buffer_access(env, reg, regno, off, size); +		if (!err && t == BPF_READ && value_regno >= 0) +			mark_reg_unknown(env, regs, value_regno);  	} else {  		verbose(env, "R%d invalid mem access '%s'\n", regno,  			reg_type_str[reg->type]); @@ -2160,6 +2228,29 @@ static int check_xadd(struct bpf_verifier_env *env, int insn_idx, struct bpf_ins  				BPF_SIZE(insn->code), BPF_WRITE, -1, true);  } +static int __check_stack_boundary(struct bpf_verifier_env *env, u32 regno, +				  int off, int access_size, +				  bool zero_size_allowed) +{ +	struct bpf_reg_state *reg = reg_state(env, regno); + +	if (off >= 0 || off < -MAX_BPF_STACK || off + access_size > 0 || +	    access_size < 0 || (access_size == 0 && !zero_size_allowed)) { +		if (tnum_is_const(reg->var_off)) { +			verbose(env, "invalid stack type R%d off=%d access_size=%d\n", +				regno, off, access_size); +		} else { +			char tn_buf[48]; + +			tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); +			verbose(env, "invalid stack type R%d var_off=%s access_size=%d\n", +				regno, tn_buf, access_size); +		} +		return -EACCES; +	} +	return 0; +} +  /* when register 'regno' is passed into function that will read 'access_size'   * bytes from that pointer, make sure that it's within stack boundary   * and all elements of stack are initialized. 
@@ -2172,7 +2263,7 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno,  {  	struct bpf_reg_state *reg = reg_state(env, regno);  	struct bpf_func_state *state = func(env, reg); -	int off, i, slot, spi; +	int err, min_off, max_off, i, slot, spi;  	if (reg->type != PTR_TO_STACK) {  		/* Allow zero-byte read from NULL, regardless of pointer type */ @@ -2186,21 +2277,57 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno,  		return -EACCES;  	} -	/* Only allow fixed-offset stack reads */ -	if (!tnum_is_const(reg->var_off)) { -		char tn_buf[48]; +	if (tnum_is_const(reg->var_off)) { +		min_off = max_off = reg->var_off.value + reg->off; +		err = __check_stack_boundary(env, regno, min_off, access_size, +					     zero_size_allowed); +		if (err) +			return err; +	} else { +		/* Variable offset is prohibited for unprivileged mode for +		 * simplicity since it requires corresponding support in +		 * Spectre masking for stack ALU. +		 * See also retrieve_ptr_limit(). +		 */ +		if (!env->allow_ptr_leaks) { +			char tn_buf[48]; -		tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); -		verbose(env, "invalid variable stack read R%d var_off=%s\n", -			regno, tn_buf); -		return -EACCES; -	} -	off = reg->off + reg->var_off.value; -	if (off >= 0 || off < -MAX_BPF_STACK || off + access_size > 0 || -	    access_size < 0 || (access_size == 0 && !zero_size_allowed)) { -		verbose(env, "invalid stack type R%d off=%d access_size=%d\n", -			regno, off, access_size); -		return -EACCES; +			tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); +			verbose(env, "R%d indirect variable offset stack access prohibited for !root, var_off=%s\n", +				regno, tn_buf); +			return -EACCES; +		} +		/* Only initialized buffer on stack is allowed to be accessed +		 * with variable offset. With uninitialized buffer it's hard to +		 * guarantee that whole memory is marked as initialized on +		 * helper return since specific bounds are unknown what may +		 * cause uninitialized stack leaking. 
+		 */ +		if (meta && meta->raw_mode) +			meta = NULL; + +		if (reg->smax_value >= BPF_MAX_VAR_OFF || +		    reg->smax_value <= -BPF_MAX_VAR_OFF) { +			verbose(env, "R%d unbounded indirect variable offset stack access\n", +				regno); +			return -EACCES; +		} +		min_off = reg->smin_value + reg->off; +		max_off = reg->smax_value + reg->off; +		err = __check_stack_boundary(env, regno, min_off, access_size, +					     zero_size_allowed); +		if (err) { +			verbose(env, "R%d min value is outside of stack bound\n", +				regno); +			return err; +		} +		err = __check_stack_boundary(env, regno, max_off, access_size, +					     zero_size_allowed); +		if (err) { +			verbose(env, "R%d max value is outside of stack bound\n", +				regno); +			return err; +		}  	}  	if (meta && meta->raw_mode) { @@ -2209,10 +2336,10 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno,  		return 0;  	} -	for (i = 0; i < access_size; i++) { +	for (i = min_off; i < max_off + access_size; i++) {  		u8 *stype; -		slot = -(off + i) - 1; +		slot = -i - 1;  		spi = slot / BPF_REG_SIZE;  		if (state->allocated_stack <= slot)  			goto err; @@ -2225,8 +2352,16 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno,  			goto mark;  		}  err: -		verbose(env, "invalid indirect read from stack off %d+%d size %d\n", -			off, i, access_size); +		if (tnum_is_const(reg->var_off)) { +			verbose(env, "invalid indirect read from stack off %d+%d size %d\n", +				min_off, i - min_off, access_size); +		} else { +			char tn_buf[48]; + +			tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); +			verbose(env, "invalid indirect read from stack var_off %s+%d size %d\n", +				tn_buf, i - min_off, access_size); +		}  		return -EACCES;  mark:  		/* reading any byte out of 8-byte 'spill_slot' will cause @@ -2235,7 +2370,7 @@ mark:  		mark_reg_read(env, &state->stack[spi].spilled_ptr,  			      state->stack[spi].spilled_ptr.parent);  	} -	return update_stack_depth(env, state, off); +	return update_stack_depth(env, state, min_off);  }  static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, @@ -2250,6 +2385,10 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,  		return check_packet_access(env, regno, reg->off, access_size,  					   zero_size_allowed);  	case PTR_TO_MAP_VALUE: +		if (check_map_access_type(env, regno, reg->off, access_size, +					  meta && meta->raw_mode ? 
BPF_WRITE : +					  BPF_READ)) +			return -EACCES;  		return check_map_access(env, regno, reg->off, access_size,  					zero_size_allowed);  	default: /* scalar_value|ptr_to_stack or invalid ptr */ @@ -2356,6 +2495,22 @@ static bool arg_type_is_mem_size(enum bpf_arg_type type)  	       type == ARG_CONST_SIZE_OR_ZERO;  } +static bool arg_type_is_int_ptr(enum bpf_arg_type type) +{ +	return type == ARG_PTR_TO_INT || +	       type == ARG_PTR_TO_LONG; +} + +static int int_ptr_type_to_size(enum bpf_arg_type type) +{ +	if (type == ARG_PTR_TO_INT) +		return sizeof(u32); +	else if (type == ARG_PTR_TO_LONG) +		return sizeof(u64); + +	return -EINVAL; +} +  static int check_func_arg(struct bpf_verifier_env *env, u32 regno,  			  enum bpf_arg_type arg_type,  			  struct bpf_call_arg_meta *meta) @@ -2388,10 +2543,15 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,  	if (arg_type == ARG_PTR_TO_MAP_KEY ||  	    arg_type == ARG_PTR_TO_MAP_VALUE || -	    arg_type == ARG_PTR_TO_UNINIT_MAP_VALUE) { +	    arg_type == ARG_PTR_TO_UNINIT_MAP_VALUE || +	    arg_type == ARG_PTR_TO_MAP_VALUE_OR_NULL) {  		expected_type = PTR_TO_STACK; -		if (!type_is_pkt_pointer(type) && type != PTR_TO_MAP_VALUE && -		    type != expected_type) +		if (register_is_null(reg) && +		    arg_type == ARG_PTR_TO_MAP_VALUE_OR_NULL) +			/* final test in check_stack_boundary() */; +		else if (!type_is_pkt_pointer(type) && +			 type != PTR_TO_MAP_VALUE && +			 type != expected_type)  			goto err_type;  	} else if (arg_type == ARG_CONST_SIZE ||  		   arg_type == ARG_CONST_SIZE_OR_ZERO) { @@ -2414,16 +2574,19 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,  		/* Any sk pointer can be ARG_PTR_TO_SOCK_COMMON */  		if (!type_is_sk_pointer(type))  			goto err_type; +		if (reg->ref_obj_id) { +			if (meta->ref_obj_id) { +				verbose(env, "verifier internal error: more than one arg with ref_obj_id R%d %u %u\n", +					regno, reg->ref_obj_id, +					meta->ref_obj_id); +				return -EFAULT; +			} +			meta->ref_obj_id = reg->ref_obj_id; +		}  	} else if (arg_type == ARG_PTR_TO_SOCKET) {  		expected_type = PTR_TO_SOCKET;  		if (type != expected_type)  			goto err_type; -		if (meta->ptr_id || !reg->id) { -			verbose(env, "verifier internal error: mismatched references meta=%d, reg=%d\n", -				meta->ptr_id, reg->id); -			return -EFAULT; -		} -		meta->ptr_id = reg->id;  	} else if (arg_type == ARG_PTR_TO_SPIN_LOCK) {  		if (meta->func_id == BPF_FUNC_spin_lock) {  			if (process_spin_lock(env, regno, true)) @@ -2449,6 +2612,12 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,  			 type != expected_type)  			goto err_type;  		meta->raw_mode = arg_type == ARG_PTR_TO_UNINIT_MEM; +	} else if (arg_type_is_int_ptr(arg_type)) { +		expected_type = PTR_TO_STACK; +		if (!type_is_pkt_pointer(type) && +		    type != PTR_TO_MAP_VALUE && +		    type != expected_type) +			goto err_type;  	} else {  		verbose(env, "unsupported arg_type %d\n", arg_type);  		return -EFAULT; @@ -2475,6 +2644,8 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,  					      meta->map_ptr->key_size, false,  					      NULL);  	} else if (arg_type == ARG_PTR_TO_MAP_VALUE || +		   (arg_type == ARG_PTR_TO_MAP_VALUE_OR_NULL && +		    !register_is_null(reg)) ||  		   arg_type == ARG_PTR_TO_UNINIT_MAP_VALUE) {  		/* bpf_map_xxx(..., map_ptr, ..., value) call:  		 * check [value, value + map->value_size) validity @@ -2530,6 +2701,13 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,  		err = 
check_helper_mem_access(env, regno - 1,  					      reg->umax_value,  					      zero_size_allowed, meta); +	} else if (arg_type_is_int_ptr(arg_type)) { +		int size = int_ptr_type_to_size(arg_type); + +		err = check_helper_mem_access(env, regno, size, false, meta); +		if (err) +			return err; +		err = check_ptr_alignment(env, reg, 0, size, true);  	}  	return err; @@ -2617,6 +2795,11 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,  		    func_id != BPF_FUNC_map_push_elem)  			goto error;  		break; +	case BPF_MAP_TYPE_SK_STORAGE: +		if (func_id != BPF_FUNC_sk_storage_get && +		    func_id != BPF_FUNC_sk_storage_delete) +			goto error; +		break;  	default:  		break;  	} @@ -2680,6 +2863,11 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,  		    map->map_type != BPF_MAP_TYPE_STACK)  			goto error;  		break; +	case BPF_FUNC_sk_storage_get: +	case BPF_FUNC_sk_storage_delete: +		if (map->map_type != BPF_MAP_TYPE_SK_STORAGE) +			goto error; +		break;  	default:  		break;  	} @@ -2740,32 +2928,38 @@ static bool check_arg_pair_ok(const struct bpf_func_proto *fn)  	return true;  } -static bool check_refcount_ok(const struct bpf_func_proto *fn) +static bool check_refcount_ok(const struct bpf_func_proto *fn, int func_id)  {  	int count = 0; -	if (arg_type_is_refcounted(fn->arg1_type)) +	if (arg_type_may_be_refcounted(fn->arg1_type))  		count++; -	if (arg_type_is_refcounted(fn->arg2_type)) +	if (arg_type_may_be_refcounted(fn->arg2_type))  		count++; -	if (arg_type_is_refcounted(fn->arg3_type)) +	if (arg_type_may_be_refcounted(fn->arg3_type))  		count++; -	if (arg_type_is_refcounted(fn->arg4_type)) +	if (arg_type_may_be_refcounted(fn->arg4_type))  		count++; -	if (arg_type_is_refcounted(fn->arg5_type)) +	if (arg_type_may_be_refcounted(fn->arg5_type))  		count++; +	/* A reference acquiring function cannot acquire +	 * another refcounted ptr. +	 */ +	if (is_acquire_function(func_id) && count) +		return false; +  	/* We only support one arg being unreferenced at the moment,  	 * which is sufficient for the helper functions we have right now.  	 */  	return count <= 1;  } -static int check_func_proto(const struct bpf_func_proto *fn) +static int check_func_proto(const struct bpf_func_proto *fn, int func_id)  {  	return check_raw_mode_ok(fn) &&  	       check_arg_pair_ok(fn) && -	       check_refcount_ok(fn) ? 0 : -EINVAL; +	       check_refcount_ok(fn, func_id) ? 0 : -EINVAL;  }  /* Packet data might have moved, any old PTR_TO_PACKET[_META,_END] @@ -2799,19 +2993,20 @@ static void clear_all_pkt_pointers(struct bpf_verifier_env *env)  }  static void release_reg_references(struct bpf_verifier_env *env, -				   struct bpf_func_state *state, int id) +				   struct bpf_func_state *state, +				   int ref_obj_id)  {  	struct bpf_reg_state *regs = state->regs, *reg;  	int i;  	for (i = 0; i < MAX_BPF_REG; i++) -		if (regs[i].id == id) +		if (regs[i].ref_obj_id == ref_obj_id)  			mark_reg_unknown(env, regs, i);  	bpf_for_each_spilled_reg(i, state, reg) {  		if (!reg)  			continue; -		if (reg_is_refcounted(reg) && reg->id == id) +		if (reg->ref_obj_id == ref_obj_id)  			__mark_reg_unknown(reg);  	}  } @@ -2820,15 +3015,20 @@ static void release_reg_references(struct bpf_verifier_env *env,   * resources. Identify all copies of the same pointer and clear the reference.   
*/  static int release_reference(struct bpf_verifier_env *env, -			     struct bpf_call_arg_meta *meta) +			     int ref_obj_id)  {  	struct bpf_verifier_state *vstate = env->cur_state; +	int err;  	int i; +	err = release_reference_state(cur_func(env), ref_obj_id); +	if (err) +		return err; +  	for (i = 0; i <= vstate->curframe; i++) -		release_reg_references(env, vstate->frame[i], meta->ptr_id); +		release_reg_references(env, vstate->frame[i], ref_obj_id); -	return release_reference_state(cur_func(env), meta->ptr_id); +	return 0;  }  static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, @@ -2897,7 +3097,7 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,  	/* and go analyze first insn of the callee */  	*insn_idx = target_insn; -	if (env->log.level) { +	if (env->log.level & BPF_LOG_LEVEL) {  		verbose(env, "caller:\n");  		print_verifier_state(env, caller);  		verbose(env, "callee:\n"); @@ -2937,7 +3137,7 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx)  		return err;  	*insn_idx = callee->callsite + 1; -	if (env->log.level) { +	if (env->log.level & BPF_LOG_LEVEL) {  		verbose(env, "returning from callee:\n");  		print_verifier_state(env, callee);  		verbose(env, "to caller at %d:\n", *insn_idx); @@ -2971,6 +3171,7 @@ record_func_map(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta,  		int func_id, int insn_idx)  {  	struct bpf_insn_aux_data *aux = &env->insn_aux_data[insn_idx]; +	struct bpf_map *map = meta->map_ptr;  	if (func_id != BPF_FUNC_tail_call &&  	    func_id != BPF_FUNC_map_lookup_elem && @@ -2981,11 +3182,24 @@ record_func_map(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta,  	    func_id != BPF_FUNC_map_peek_elem)  		return 0; -	if (meta->map_ptr == NULL) { +	if (map == NULL) {  		verbose(env, "kernel subsystem misconfigured verifier\n");  		return -EINVAL;  	} +	/* In case of read-only, some additional restrictions +	 * need to be applied in order to prevent altering the +	 * state of the map from program side. 
+	 */ +	if ((map->map_flags & BPF_F_RDONLY_PROG) && +	    (func_id == BPF_FUNC_map_delete_elem || +	     func_id == BPF_FUNC_map_update_elem || +	     func_id == BPF_FUNC_map_push_elem || +	     func_id == BPF_FUNC_map_pop_elem)) { +		verbose(env, "write into map forbidden\n"); +		return -EACCES; +	} +  	if (!BPF_MAP_PTR(aux->map_state))  		bpf_map_ptr_store(aux, meta->map_ptr,  				  meta->map_ptr->unpriv_array); @@ -3047,7 +3261,7 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn  	memset(&meta, 0, sizeof(meta));  	meta.pkt_access = fn->pkt_access; -	err = check_func_proto(fn); +	err = check_func_proto(fn, func_id);  	if (err) {  		verbose(env, "kernel subsystem misconfigured func %s#%d\n",  			func_id_name(func_id), func_id); @@ -3093,7 +3307,7 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn  			return err;  		}  	} else if (is_release_function(func_id)) { -		err = release_reference(env, &meta); +		err = release_reference(env, meta.ref_obj_id);  		if (err) {  			verbose(env, "func %s#%d reference has not been acquired before\n",  				func_id_name(func_id), func_id); @@ -3149,17 +3363,11 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn  	} else if (fn->ret_type == RET_PTR_TO_SOCKET_OR_NULL) {  		mark_reg_known_zero(env, regs, BPF_REG_0);  		regs[BPF_REG_0].type = PTR_TO_SOCKET_OR_NULL; -		if (is_acquire_function(func_id)) { -			int id = acquire_reference_state(env, insn_idx); - -			if (id < 0) -				return id; -			/* For release_reference() */ -			regs[BPF_REG_0].id = id; -		} else { -			/* For mark_ptr_or_null_reg() */ -			regs[BPF_REG_0].id = ++env->id_gen; -		} +		regs[BPF_REG_0].id = ++env->id_gen; +	} else if (fn->ret_type == RET_PTR_TO_SOCK_COMMON_OR_NULL) { +		mark_reg_known_zero(env, regs, BPF_REG_0); +		regs[BPF_REG_0].type = PTR_TO_SOCK_COMMON_OR_NULL; +		regs[BPF_REG_0].id = ++env->id_gen;  	} else if (fn->ret_type == RET_PTR_TO_TCP_SOCK_OR_NULL) {  		mark_reg_known_zero(env, regs, BPF_REG_0);  		regs[BPF_REG_0].type = PTR_TO_TCP_SOCK_OR_NULL; @@ -3170,6 +3378,20 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn  		return -EINVAL;  	} +	if (is_ptr_cast_function(func_id)) { +		/* For release_reference() */ +		regs[BPF_REG_0].ref_obj_id = meta.ref_obj_id; +	} else if (is_acquire_function(func_id)) { +		int id = acquire_reference_state(env, insn_idx); + +		if (id < 0) +			return id; +		/* For mark_ptr_or_null_reg() */ +		regs[BPF_REG_0].id = id; +		/* For release_reference() */ +		regs[BPF_REG_0].ref_obj_id = id; +	} +  	do_refine_retval_range(regs, fn->ret_type, func_id, &meta);  	err = check_map_func_compatibility(env, meta.map_ptr, func_id); @@ -3268,6 +3490,9 @@ static int retrieve_ptr_limit(const struct bpf_reg_state *ptr_reg,  	switch (ptr_reg->type) {  	case PTR_TO_STACK: +		/* Indirect variable offset stack access is prohibited in +		 * unprivileged mode so it's not handled here. +		 */  		off = ptr_reg->off + ptr_reg->var_off.value;  		if (mask_to_left)  			*ptr_limit = MAX_BPF_STACK + off; @@ -3368,7 +3593,7 @@ do_sim:  		*dst_reg = *ptr_reg;  	}  	ret = push_stack(env, env->insn_idx + 1, env->insn_idx, true); -	if (!ptr_is_dst_reg) +	if (!ptr_is_dst_reg && ret)  		*dst_reg = tmp;  	return !ret ? 
-EFAULT : 0;  } @@ -4124,15 +4349,35 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)  	return 0;  } +static void __find_good_pkt_pointers(struct bpf_func_state *state, +				     struct bpf_reg_state *dst_reg, +				     enum bpf_reg_type type, u16 new_range) +{ +	struct bpf_reg_state *reg; +	int i; + +	for (i = 0; i < MAX_BPF_REG; i++) { +		reg = &state->regs[i]; +		if (reg->type == type && reg->id == dst_reg->id) +			/* keep the maximum range already checked */ +			reg->range = max(reg->range, new_range); +	} + +	bpf_for_each_spilled_reg(i, state, reg) { +		if (!reg) +			continue; +		if (reg->type == type && reg->id == dst_reg->id) +			reg->range = max(reg->range, new_range); +	} +} +  static void find_good_pkt_pointers(struct bpf_verifier_state *vstate,  				   struct bpf_reg_state *dst_reg,  				   enum bpf_reg_type type,  				   bool range_right_open)  { -	struct bpf_func_state *state = vstate->frame[vstate->curframe]; -	struct bpf_reg_state *regs = state->regs, *reg;  	u16 new_range; -	int i, j; +	int i;  	if (dst_reg->off < 0 ||  	    (dst_reg->off == 0 && range_right_open)) @@ -4197,20 +4442,9 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *vstate,  	 * the range won't allow anything.  	 * dst_reg->off is known < MAX_PACKET_OFF, therefore it fits in a u16.  	 */ -	for (i = 0; i < MAX_BPF_REG; i++) -		if (regs[i].type == type && regs[i].id == dst_reg->id) -			/* keep the maximum range already checked */ -			regs[i].range = max(regs[i].range, new_range); - -	for (j = 0; j <= vstate->curframe; j++) { -		state = vstate->frame[j]; -		bpf_for_each_spilled_reg(i, state, reg) { -			if (!reg) -				continue; -			if (reg->type == type && reg->id == dst_reg->id) -				reg->range = max(reg->range, new_range); -		} -	} +	for (i = 0; i <= vstate->curframe; i++) +		__find_good_pkt_pointers(vstate->frame[i], dst_reg, type, +					 new_range);  }  /* compute branch direction of the expression "if (reg opcode val) goto target;" @@ -4665,17 +4899,41 @@ static void mark_ptr_or_null_reg(struct bpf_func_state *state,  		} else if (reg->type == PTR_TO_TCP_SOCK_OR_NULL) {  			reg->type = PTR_TO_TCP_SOCK;  		} -		if (is_null || !(reg_is_refcounted(reg) || -				 reg_may_point_to_spin_lock(reg))) { -			/* We don't need id from this point onwards anymore, -			 * thus we should better reset it, so that state -			 * pruning has chances to take effect. +		if (is_null) { +			/* We don't need id and ref_obj_id from this point +			 * onwards anymore, thus we should better reset it, +			 * so that state pruning has chances to take effect. +			 */ +			reg->id = 0; +			reg->ref_obj_id = 0; +		} else if (!reg_may_point_to_spin_lock(reg)) { +			/* For not-NULL ptr, reg->ref_obj_id will be reset +			 * in release_reg_references(). +			 * +			 * reg->id is still used by spin_lock ptr. Other +			 * than spin_lock ptr type, reg->id can be reset.  			 */  			reg->id = 0;  		}  	}  } +static void __mark_ptr_or_null_regs(struct bpf_func_state *state, u32 id, +				    bool is_null) +{ +	struct bpf_reg_state *reg; +	int i; + +	for (i = 0; i < MAX_BPF_REG; i++) +		mark_ptr_or_null_reg(state, &state->regs[i], id, is_null); + +	bpf_for_each_spilled_reg(i, state, reg) { +		if (!reg) +			continue; +		mark_ptr_or_null_reg(state, reg, id, is_null); +	} +} +  /* The logic is similar to find_good_pkt_pointers(), both could eventually   * be folded together at some point.   
 */
@@ -4683,24 +4941,20 @@ static void mark_ptr_or_null_regs(struct bpf_verifier_state *vstate, u32 regno,
 				  bool is_null)
 {
 	struct bpf_func_state *state = vstate->frame[vstate->curframe];
-	struct bpf_reg_state *reg, *regs = state->regs;
+	struct bpf_reg_state *regs = state->regs;
+	u32 ref_obj_id = regs[regno].ref_obj_id;
 	u32 id = regs[regno].id;
-	int i, j;
-
-	if (reg_is_refcounted_or_null(&regs[regno]) && is_null)
-		release_reference_state(state, id);
+	int i;
-	for (i = 0; i < MAX_BPF_REG; i++)
-		mark_ptr_or_null_reg(state, &regs[i], id, is_null);
+	if (ref_obj_id && ref_obj_id == id && is_null)
+		/* regs[regno] is in the " == NULL" branch.
+		 * No one could have freed the reference state before
+		 * doing the NULL check.
+		 */
+		WARN_ON_ONCE(release_reference_state(state, id));
-	for (j = 0; j <= vstate->curframe; j++) {
-		state = vstate->frame[j];
-		bpf_for_each_spilled_reg(i, state, reg) {
-			if (!reg)
-				continue;
-			mark_ptr_or_null_reg(state, reg, id, is_null);
-		}
-	}
+	for (i = 0; i <= vstate->curframe; i++)
+		__mark_ptr_or_null_regs(vstate->frame[i], id, is_null);
 }
 static bool try_match_pkt_pointers(const struct bpf_insn *insn,
@@ -4939,23 +5193,17 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
 			insn->dst_reg);
 		return -EACCES;
 	}
-	if (env->log.level)
+	if (env->log.level & BPF_LOG_LEVEL)
 		print_verifier_state(env, this_branch->frame[this_branch->curframe]);
 	return 0;
 }
-/* return the map pointer stored inside BPF_LD_IMM64 instruction */
-static struct bpf_map *ld_imm64_to_map_ptr(struct bpf_insn *insn)
-{
-	u64 imm64 = ((u64) (u32) insn[0].imm) | ((u64) (u32) insn[1].imm) << 32;
-
-	return (struct bpf_map *) (unsigned long) imm64;
-}
-
 /* verify BPF_LD_IMM64 instruction */
 static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn)
 {
+	struct bpf_insn_aux_data *aux = cur_aux(env);
 	struct bpf_reg_state *regs = cur_regs(env);
+	struct bpf_map *map;
 	int err;
 	if (BPF_SIZE(insn->code) != BPF_DW) {
@@ -4979,11 +5227,22 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn)
 		return 0;
 	}
-	/* replace_map_fd_with_map_ptr() should have caught bad ld_imm64 */
-	BUG_ON(insn->src_reg != BPF_PSEUDO_MAP_FD);
+	map = env->used_maps[aux->map_index];
+	mark_reg_known_zero(env, regs, insn->dst_reg);
+	regs[insn->dst_reg].map_ptr = map;
+
+	if (insn->src_reg == BPF_PSEUDO_MAP_VALUE) {
+		regs[insn->dst_reg].type = PTR_TO_MAP_VALUE;
+		regs[insn->dst_reg].off = aux->map_off;
+		if (map_value_has_spin_lock(map))
+			regs[insn->dst_reg].id = ++env->id_gen;
+	} else if (insn->src_reg == BPF_PSEUDO_MAP_FD) {
+		regs[insn->dst_reg].type = CONST_PTR_TO_MAP;
+	} else {
+		verbose(env, "bpf verifier is misconfigured\n");
+		return -EINVAL;
+	}
-	regs[insn->dst_reg].type = CONST_PTR_TO_MAP;
-	regs[insn->dst_reg].map_ptr = ld_imm64_to_map_ptr(insn);
 	return 0;
 }
@@ -5107,6 +5366,7 @@ static int check_return_code(struct bpf_verifier_env *env)
 	case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
 	case BPF_PROG_TYPE_SOCK_OPS:
 	case BPF_PROG_TYPE_CGROUP_DEVICE:
+	case BPF_PROG_TYPE_CGROUP_SYSCTL:
 		break;
 	default:
 		return 0;
@@ -5177,10 +5437,6 @@ enum {
 #define STATE_LIST_MARK ((struct bpf_verifier_state_list *) -1L)
-static int *insn_stack;	/* stack of insns to process */
-static int cur_stack;	/* current stack index */
-static int *insn_state;
-
 /* t, w, e - match pseudo-code above:
  * t - index of current instruction
  * w - next instruction
@@ -5188,6 +5444,9 @@ static int *insn_state;
  */
 static int
push_insn(int t, int w, int e, struct bpf_verifier_env *env)  { +	int *insn_stack = env->cfg.insn_stack; +	int *insn_state = env->cfg.insn_state; +  	if (e == FALLTHROUGH && insn_state[t] >= (DISCOVERED | FALLTHROUGH))  		return 0; @@ -5208,9 +5467,9 @@ static int push_insn(int t, int w, int e, struct bpf_verifier_env *env)  		/* tree-edge */  		insn_state[t] = DISCOVERED | e;  		insn_state[w] = DISCOVERED; -		if (cur_stack >= env->prog->len) +		if (env->cfg.cur_stack >= env->prog->len)  			return -E2BIG; -		insn_stack[cur_stack++] = w; +		insn_stack[env->cfg.cur_stack++] = w;  		return 1;  	} else if ((insn_state[w] & 0xF0) == DISCOVERED) {  		verbose_linfo(env, t, "%d: ", t); @@ -5234,27 +5493,28 @@ static int check_cfg(struct bpf_verifier_env *env)  {  	struct bpf_insn *insns = env->prog->insnsi;  	int insn_cnt = env->prog->len; +	int *insn_stack, *insn_state;  	int ret = 0;  	int i, t; -	insn_state = kcalloc(insn_cnt, sizeof(int), GFP_KERNEL); +	insn_state = env->cfg.insn_state = kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL);  	if (!insn_state)  		return -ENOMEM; -	insn_stack = kcalloc(insn_cnt, sizeof(int), GFP_KERNEL); +	insn_stack = env->cfg.insn_stack = kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL);  	if (!insn_stack) { -		kfree(insn_state); +		kvfree(insn_state);  		return -ENOMEM;  	}  	insn_state[0] = DISCOVERED; /* mark 1st insn as discovered */  	insn_stack[0] = 0; /* 0 is the first instruction */ -	cur_stack = 1; +	env->cfg.cur_stack = 1;  peek_stack: -	if (cur_stack == 0) +	if (env->cfg.cur_stack == 0)  		goto check_state; -	t = insn_stack[cur_stack - 1]; +	t = insn_stack[env->cfg.cur_stack - 1];  	if (BPF_CLASS(insns[t].code) == BPF_JMP ||  	    BPF_CLASS(insns[t].code) == BPF_JMP32) { @@ -5323,7 +5583,7 @@ peek_stack:  mark_explored:  	insn_state[t] = EXPLORED; -	if (cur_stack-- <= 0) { +	if (env->cfg.cur_stack-- <= 0) {  		verbose(env, "pop stack internal bug\n");  		ret = -EFAULT;  		goto err_free; @@ -5341,8 +5601,9 @@ check_state:  	ret = 0; /* cfg looks good */  err_free: -	kfree(insn_state); -	kfree(insn_stack); +	kvfree(insn_state); +	kvfree(insn_stack); +	env->cfg.insn_state = env->cfg.insn_stack = NULL;  	return ret;  } @@ -6031,6 +6292,22 @@ static bool states_equal(struct bpf_verifier_env *env,  	return true;  } +static int propagate_liveness_reg(struct bpf_verifier_env *env, +				  struct bpf_reg_state *reg, +				  struct bpf_reg_state *parent_reg) +{ +	int err; + +	if (parent_reg->live & REG_LIVE_READ || !(reg->live & REG_LIVE_READ)) +		return 0; + +	err = mark_reg_read(env, reg, parent_reg); +	if (err) +		return err; + +	return 0; +} +  /* A write screens off any subsequent reads; but write marks come from the   * straight-line code between a state and its parent.  When we arrive at an   * equivalent state (jump target or such) we didn't arrive by the straight-line @@ -6042,8 +6319,9 @@ static int propagate_liveness(struct bpf_verifier_env *env,  			      const struct bpf_verifier_state *vstate,  			      struct bpf_verifier_state *vparent)  { -	int i, frame, err = 0; +	struct bpf_reg_state *state_reg, *parent_reg;  	struct bpf_func_state *state, *parent; +	int i, frame, err = 0;  	if (vparent->curframe != vstate->curframe) {  		WARN(1, "propagate_live: parent frame %d current frame %d\n", @@ -6052,29 +6330,28 @@ static int propagate_liveness(struct bpf_verifier_env *env,  	}  	/* Propagate read liveness of registers... 
*/  	BUILD_BUG_ON(BPF_REG_FP + 1 != MAX_BPF_REG); -	/* We don't need to worry about FP liveness because it's read-only */ -	for (i = 0; i < BPF_REG_FP; i++) { -		if (vparent->frame[vparent->curframe]->regs[i].live & REG_LIVE_READ) -			continue; -		if (vstate->frame[vstate->curframe]->regs[i].live & REG_LIVE_READ) { -			err = mark_reg_read(env, &vstate->frame[vstate->curframe]->regs[i], -					    &vparent->frame[vstate->curframe]->regs[i]); +	for (frame = 0; frame <= vstate->curframe; frame++) { +		parent = vparent->frame[frame]; +		state = vstate->frame[frame]; +		parent_reg = parent->regs; +		state_reg = state->regs; +		/* We don't need to worry about FP liveness, it's read-only */ +		for (i = frame < vstate->curframe ? BPF_REG_6 : 0; i < BPF_REG_FP; i++) { +			err = propagate_liveness_reg(env, &state_reg[i], +						     &parent_reg[i]);  			if (err)  				return err;  		} -	} -	/* ... and stack slots */ -	for (frame = 0; frame <= vstate->curframe; frame++) { -		state = vstate->frame[frame]; -		parent = vparent->frame[frame]; +		/* Propagate stack slots. */  		for (i = 0; i < state->allocated_stack / BPF_REG_SIZE &&  			    i < parent->allocated_stack / BPF_REG_SIZE; i++) { -			if (parent->stack[i].spilled_ptr.live & REG_LIVE_READ) -				continue; -			if (state->stack[i].spilled_ptr.live & REG_LIVE_READ) -				mark_reg_read(env, &state->stack[i].spilled_ptr, -					      &parent->stack[i].spilled_ptr); +			parent_reg = &parent->stack[i].spilled_ptr; +			state_reg = &state->stack[i].spilled_ptr; +			err = propagate_liveness_reg(env, state_reg, +						     parent_reg); +			if (err) +				return err;  		}  	}  	return err; @@ -6083,11 +6360,13 @@ static int propagate_liveness(struct bpf_verifier_env *env,  static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)  {  	struct bpf_verifier_state_list *new_sl; -	struct bpf_verifier_state_list *sl; +	struct bpf_verifier_state_list *sl, **pprev;  	struct bpf_verifier_state *cur = env->cur_state, *new;  	int i, j, err, states_cnt = 0; -	sl = env->explored_states[insn_idx]; +	pprev = &env->explored_states[insn_idx]; +	sl = *pprev; +  	if (!sl)  		/* this 'insn_idx' instruction wasn't marked, so we will not  		 * be doing state search here @@ -6098,6 +6377,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)  	while (sl != STATE_LIST_MARK) {  		if (states_equal(env, &sl->state, cur)) { +			sl->hit_cnt++;  			/* reached equivalent register/stack state,  			 * prune the search.  			 * Registers read by the continuation are read by us. @@ -6113,10 +6393,40 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)  				return err;  			return 1;  		} -		sl = sl->next;  		states_cnt++; +		sl->miss_cnt++; +		/* heuristic to determine whether this state is beneficial +		 * to keep checking from state equivalence point of view. +		 * Higher numbers increase max_states_per_insn and verification time, +		 * but do not meaningfully decrease insn_processed. +		 */ +		if (sl->miss_cnt > sl->hit_cnt * 3 + 3) { +			/* the state is unlikely to be useful. Remove it to +			 * speed up verification +			 */ +			*pprev = sl->next; +			if (sl->state.frame[0]->regs[0].live & REG_LIVE_DONE) { +				free_verifier_state(&sl->state, false); +				kfree(sl); +				env->peak_states--; +			} else { +				/* cannot free this state, since parentage chain may +				 * walk it later. 
Add it for free_list instead to +				 * be freed at the end of verification +				 */ +				sl->next = env->free_list; +				env->free_list = sl; +			} +			sl = *pprev; +			continue; +		} +		pprev = &sl->next; +		sl = *pprev;  	} +	if (env->max_states_per_insn < states_cnt) +		env->max_states_per_insn = states_cnt; +  	if (!env->allow_ptr_leaks && states_cnt > BPF_COMPLEXITY_LIMIT_STATES)  		return 0; @@ -6130,6 +6440,8 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)  	new_sl = kzalloc(sizeof(struct bpf_verifier_state_list), GFP_KERNEL);  	if (!new_sl)  		return -ENOMEM; +	env->total_states++; +	env->peak_states++;  	/* add new state to the head of linked list */  	new = &new_sl->state; @@ -6214,8 +6526,7 @@ static int do_check(struct bpf_verifier_env *env)  	struct bpf_verifier_state *state;  	struct bpf_insn *insns = env->prog->insnsi;  	struct bpf_reg_state *regs; -	int insn_cnt = env->prog->len, i; -	int insn_processed = 0; +	int insn_cnt = env->prog->len;  	bool do_print_state = false;  	env->prev_linfo = NULL; @@ -6250,10 +6561,10 @@ static int do_check(struct bpf_verifier_env *env)  		insn = &insns[env->insn_idx];  		class = BPF_CLASS(insn->code); -		if (++insn_processed > BPF_COMPLEXITY_LIMIT_INSNS) { +		if (++env->insn_processed > BPF_COMPLEXITY_LIMIT_INSNS) {  			verbose(env,  				"BPF program is too large. Processed %d insn\n", -				insn_processed); +				env->insn_processed);  			return -E2BIG;  		} @@ -6262,7 +6573,7 @@ static int do_check(struct bpf_verifier_env *env)  			return err;  		if (err == 1) {  			/* found equivalent state, can prune the search */ -			if (env->log.level) { +			if (env->log.level & BPF_LOG_LEVEL) {  				if (do_print_state)  					verbose(env, "\nfrom %d to %d%s: safe\n",  						env->prev_insn_idx, env->insn_idx, @@ -6280,8 +6591,9 @@ static int do_check(struct bpf_verifier_env *env)  		if (need_resched())  			cond_resched(); -		if (env->log.level > 1 || (env->log.level && do_print_state)) { -			if (env->log.level > 1) +		if (env->log.level & BPF_LOG_LEVEL2 || +		    (env->log.level & BPF_LOG_LEVEL && do_print_state)) { +			if (env->log.level & BPF_LOG_LEVEL2)  				verbose(env, "%d:", env->insn_idx);  			else  				verbose(env, "\nfrom %d to %d%s:", @@ -6292,7 +6604,7 @@ static int do_check(struct bpf_verifier_env *env)  			do_print_state = false;  		} -		if (env->log.level) { +		if (env->log.level & BPF_LOG_LEVEL) {  			const struct bpf_insn_cbs cbs = {  				.cb_print	= verbose,  				.private_data	= env, @@ -6557,16 +6869,6 @@ process_bpf_exit:  		env->insn_idx++;  	} -	verbose(env, "processed %d insns (limit %d), stack depth ", -		insn_processed, BPF_COMPLEXITY_LIMIT_INSNS); -	for (i = 0; i < env->subprog_cnt; i++) { -		u32 depth = env->subprog_info[i].stack_depth; - -		verbose(env, "%d", depth); -		if (i + 1 < env->subprog_cnt) -			verbose(env, "+"); -	} -	verbose(env, "\n");  	env->prog->aux->stack_depth = env->subprog_info[0].stack_depth;  	return 0;  } @@ -6664,8 +6966,10 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env)  		}  		if (insn[0].code == (BPF_LD | BPF_IMM | BPF_DW)) { +			struct bpf_insn_aux_data *aux;  			struct bpf_map *map;  			struct fd f; +			u64 addr;  			if (i == insn_cnt - 1 || insn[1].code != 0 ||  			    insn[1].dst_reg != 0 || insn[1].src_reg != 0 || @@ -6674,13 +6978,19 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env)  				return -EINVAL;  			} -			if (insn->src_reg == 0) +			if (insn[0].src_reg == 0)  				/* valid generic load 64-bit imm */  				goto 
next_insn; -			if (insn[0].src_reg != BPF_PSEUDO_MAP_FD || -			    insn[1].imm != 0) { -				verbose(env, "unrecognized bpf_ld_imm64 insn\n"); +			/* In final convert_pseudo_ld_imm64() step, this is +			 * converted into regular 64-bit imm load insn. +			 */ +			if ((insn[0].src_reg != BPF_PSEUDO_MAP_FD && +			     insn[0].src_reg != BPF_PSEUDO_MAP_VALUE) || +			    (insn[0].src_reg == BPF_PSEUDO_MAP_FD && +			     insn[1].imm != 0)) { +				verbose(env, +					"unrecognized bpf_ld_imm64 insn\n");  				return -EINVAL;  			} @@ -6698,16 +7008,47 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env)  				return err;  			} -			/* store map pointer inside BPF_LD_IMM64 instruction */ -			insn[0].imm = (u32) (unsigned long) map; -			insn[1].imm = ((u64) (unsigned long) map) >> 32; +			aux = &env->insn_aux_data[i]; +			if (insn->src_reg == BPF_PSEUDO_MAP_FD) { +				addr = (unsigned long)map; +			} else { +				u32 off = insn[1].imm; + +				if (off >= BPF_MAX_VAR_OFF) { +					verbose(env, "direct value offset of %u is not allowed\n", off); +					fdput(f); +					return -EINVAL; +				} + +				if (!map->ops->map_direct_value_addr) { +					verbose(env, "no direct value access support for this map type\n"); +					fdput(f); +					return -EINVAL; +				} + +				err = map->ops->map_direct_value_addr(map, &addr, off); +				if (err) { +					verbose(env, "invalid access to map value pointer, value_size=%u off=%u\n", +						map->value_size, off); +					fdput(f); +					return err; +				} + +				aux->map_off = off; +				addr += off; +			} + +			insn[0].imm = (u32)addr; +			insn[1].imm = addr >> 32;  			/* check whether we recorded this map already */ -			for (j = 0; j < env->used_map_cnt; j++) +			for (j = 0; j < env->used_map_cnt; j++) {  				if (env->used_maps[j] == map) { +					aux->map_index = j;  					fdput(f);  					goto next_insn;  				} +			}  			if (env->used_map_cnt >= MAX_USED_MAPS) {  				fdput(f); @@ -6724,6 +7065,8 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env)  				fdput(f);  				return PTR_ERR(map);  			} + +			aux->map_index = env->used_map_cnt;  			env->used_maps[env->used_map_cnt++] = map;  			if (bpf_map_is_cgroup_storage(map) && @@ -6829,8 +7172,13 @@ static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 of  	struct bpf_prog *new_prog;  	new_prog = bpf_patch_insn_single(env->prog, off, patch, len); -	if (!new_prog) +	if (IS_ERR(new_prog)) { +		if (PTR_ERR(new_prog) == -ERANGE) +			verbose(env, +				"insn %d cannot be patched due to 16-bit range\n", +				env->insn_aux_data[off].orig_idx);  		return NULL; +	}  	if (adjust_insn_aux_data(env, new_prog->len, off, len))  		return NULL;  	adjust_subprog_starts(env, off, len); @@ -7251,7 +7599,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)  									insn->dst_reg,  									shift);  				insn_buf[cnt++] = BPF_ALU64_IMM(BPF_AND, insn->dst_reg, -								(1 << size * 8) - 1); +								(1ULL << size * 8) - 1);  			}  		} @@ -7368,9 +7716,8 @@ static int jit_subprogs(struct bpf_verifier_env *env)  			    insn->src_reg != BPF_PSEUDO_CALL)  				continue;  			subprog = insn->off; -			insn->imm = (u64 (*)(u64, u64, u64, u64, u64)) -				func[subprog]->bpf_func - -				__bpf_call_base; +			insn->imm = BPF_CAST_CALL(func[subprog]->bpf_func) - +				    __bpf_call_base;  		}  		/* we use the aux data to keep a list of the start addresses @@ -7772,6 +8119,14 @@ static void free_states(struct bpf_verifier_env *env)  	struct bpf_verifier_state_list *sl, *sln;  	int i; +	sl = 
env->free_list; +	while (sl) { +		sln = sl->next; +		free_verifier_state(&sl->state, false); +		kfree(sl); +		sl = sln; +	} +  	if (!env->explored_states)  		return; @@ -7787,12 +8142,37 @@ static void free_states(struct bpf_verifier_env *env)  			}  	} -	kfree(env->explored_states); +	kvfree(env->explored_states); +} + +static void print_verification_stats(struct bpf_verifier_env *env) +{ +	int i; + +	if (env->log.level & BPF_LOG_STATS) { +		verbose(env, "verification time %lld usec\n", +			div_u64(env->verification_time, 1000)); +		verbose(env, "stack depth "); +		for (i = 0; i < env->subprog_cnt; i++) { +			u32 depth = env->subprog_info[i].stack_depth; + +			verbose(env, "%d", depth); +			if (i + 1 < env->subprog_cnt) +				verbose(env, "+"); +		} +		verbose(env, "\n"); +	} +	verbose(env, "processed %d insns (limit %d) max_states_per_insn %d " +		"total_states %d peak_states %d mark_read %d\n", +		env->insn_processed, BPF_COMPLEXITY_LIMIT_INSNS, +		env->max_states_per_insn, env->total_states, +		env->peak_states, env->longest_mark_read_walk);  }  int bpf_check(struct bpf_prog **prog, union bpf_attr *attr,  	      union bpf_attr __user *uattr)  { +	u64 start_time = ktime_get_ns();  	struct bpf_verifier_env *env;  	struct bpf_verifier_log *log;  	int i, len, ret = -EINVAL; @@ -7820,9 +8200,11 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr,  		env->insn_aux_data[i].orig_idx = i;  	env->prog = *prog;  	env->ops = bpf_verifier_ops[env->prog->type]; +	is_priv = capable(CAP_SYS_ADMIN);  	/* grab the mutex to protect few globals used by verifier */ -	mutex_lock(&bpf_verifier_lock); +	if (!is_priv) +		mutex_lock(&bpf_verifier_lock);  	if (attr->log_level || attr->log_buf || attr->log_size) {  		/* user requested verbose verifier output @@ -7834,8 +8216,8 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr,  		ret = -EINVAL;  		/* log attributes have to be sane */ -		if (log->len_total < 128 || log->len_total > UINT_MAX >> 8 || -		    !log->level || !log->ubuf) +		if (log->len_total < 128 || log->len_total > UINT_MAX >> 2 || +		    !log->level || !log->ubuf || log->level & ~BPF_LOG_MASK)  			goto err_unlock;  	} @@ -7845,7 +8227,6 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr,  	if (attr->prog_flags & BPF_F_ANY_ALIGNMENT)  		env->strict_alignment = false; -	is_priv = capable(CAP_SYS_ADMIN);  	env->allow_ptr_leaks = is_priv;  	ret = replace_map_fd_with_map_ptr(env); @@ -7858,7 +8239,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr,  			goto skip_full_check;  	} -	env->explored_states = kcalloc(env->prog->len, +	env->explored_states = kvcalloc(env->prog->len,  				       sizeof(struct bpf_verifier_state_list *),  				       GFP_USER);  	ret = -ENOMEM; @@ -7916,6 +8297,9 @@ skip_full_check:  	if (ret == 0)  		ret = fixup_call_args(env); +	env->verification_time = ktime_get_ns() - start_time; +	print_verification_stats(env); +  	if (log->level && bpf_verifier_log_full(log))  		ret = -ENOSPC;  	if (log->level && !log->ubuf) { @@ -7955,7 +8339,8 @@ err_release_maps:  		release_maps(env);  	*prog = env->prog;  err_unlock: -	mutex_unlock(&bpf_verifier_lock); +	if (!is_priv) +		mutex_unlock(&bpf_verifier_lock);  	vfree(env->insn_aux_data);  err_free_env:  	kfree(env); diff --git a/kernel/cgroup/Makefile b/kernel/cgroup/Makefile index bfcdae896122..5d7a76bfbbb7 100644 --- a/kernel/cgroup/Makefile +++ b/kernel/cgroup/Makefile @@ -1,7 +1,7 @@  # SPDX-License-Identifier: GPL-2.0 -obj-y := cgroup.o rstat.o namespace.o cgroup-v1.o +obj-y := cgroup.o rstat.o 
namespace.o cgroup-v1.o freezer.o -obj-$(CONFIG_CGROUP_FREEZER) += freezer.o +obj-$(CONFIG_CGROUP_FREEZER) += legacy_freezer.o  obj-$(CONFIG_CGROUP_PIDS) += pids.o  obj-$(CONFIG_CGROUP_RDMA) += rdma.o  obj-$(CONFIG_CPUSETS) += cpuset.o diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h index 30e39f3932ad..809e34a3c017 100644 --- a/kernel/cgroup/cgroup-internal.h +++ b/kernel/cgroup/cgroup-internal.h @@ -28,12 +28,15 @@ extern void __init enable_debug_cgroup(void);  #define TRACE_CGROUP_PATH(type, cgrp, ...)				\  	do {								\  		if (trace_cgroup_##type##_enabled()) {			\ -			spin_lock(&trace_cgroup_path_lock);		\ +			unsigned long flags;				\ +			spin_lock_irqsave(&trace_cgroup_path_lock,	\ +					  flags);			\  			cgroup_path(cgrp, trace_cgroup_path,		\  				    TRACE_CGROUP_PATH_LEN);		\  			trace_cgroup_##type(cgrp, trace_cgroup_path,	\  					    ##__VA_ARGS__);		\ -			spin_unlock(&trace_cgroup_path_lock);		\ +			spin_unlock_irqrestore(&trace_cgroup_path_lock, \ +					       flags);			\  		}							\  	} while (0) @@ -240,6 +243,7 @@ int cgroup_rmdir(struct kernfs_node *kn);  int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node,  		     struct kernfs_root *kf_root); +int __cgroup_task_count(const struct cgroup *cgrp);  int cgroup_task_count(const struct cgroup *cgrp);  /* diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index c126b34fd4ff..88006be40ea3 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only  #include "cgroup-internal.h"  #include <linux/ctype.h> @@ -342,22 +343,6 @@ static struct cgroup_pidlist *cgroup_pidlist_find_create(struct cgroup *cgrp,  	return l;  } -/** - * cgroup_task_count - count the number of tasks in a cgroup. - * @cgrp: the cgroup in question - */ -int cgroup_task_count(const struct cgroup *cgrp) -{ -	int count = 0; -	struct cgrp_cset_link *link; - -	spin_lock_irq(&css_set_lock); -	list_for_each_entry(link, &cgrp->cset_links, cset_link) -		count += link->cset->nr_tasks; -	spin_unlock_irq(&css_set_lock); -	return count; -} -  /*   * Load a cgroup's pidarray with either procs' tgids or tasks' pids   */ diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 3f2b4bde0f9c..426a0026225c 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -593,6 +593,39 @@ static void cgroup_get_live(struct cgroup *cgrp)  	css_get(&cgrp->self);  } +/** + * __cgroup_task_count - count the number of tasks in a cgroup. The caller + * is responsible for taking the css_set_lock. + * @cgrp: the cgroup in question + */ +int __cgroup_task_count(const struct cgroup *cgrp) +{ +	int count = 0; +	struct cgrp_cset_link *link; + +	lockdep_assert_held(&css_set_lock); + +	list_for_each_entry(link, &cgrp->cset_links, cset_link) +		count += link->cset->nr_tasks; + +	return count; +} + +/** + * cgroup_task_count - count the number of tasks in a cgroup. 
+ * @cgrp: the cgroup in question + */ +int cgroup_task_count(const struct cgroup *cgrp) +{ +	int count; + +	spin_lock_irq(&css_set_lock); +	count = __cgroup_task_count(cgrp); +	spin_unlock_irq(&css_set_lock); + +	return count; +} +  struct cgroup_subsys_state *of_css(struct kernfs_open_file *of)  {  	struct cgroup *cgrp = of->kn->parent->priv; @@ -783,6 +816,8 @@ static void cgroup_update_populated(struct cgroup *cgrp, bool populated)  			break;  		cgroup1_check_for_release(cgrp); +		TRACE_CGROUP_PATH(notify_populated, cgrp, +				  cgroup_is_populated(cgrp));  		cgroup_file_notify(&cgrp->events_file);  		child = cgrp; @@ -1775,11 +1810,13 @@ int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node,  enum cgroup2_param {  	Opt_nsdelegate, +	Opt_memory_localevents,  	nr__cgroup2_params  };  static const struct fs_parameter_spec cgroup2_param_specs[] = { -	fsparam_flag  ("nsdelegate",		Opt_nsdelegate), +	fsparam_flag("nsdelegate",		Opt_nsdelegate), +	fsparam_flag("memory_localevents",	Opt_memory_localevents),  	{}  }; @@ -1802,6 +1839,9 @@ static int cgroup2_parse_param(struct fs_context *fc, struct fs_parameter *param  	case Opt_nsdelegate:  		ctx->flags |= CGRP_ROOT_NS_DELEGATE;  		return 0; +	case Opt_memory_localevents: +		ctx->flags |= CGRP_ROOT_MEMORY_LOCAL_EVENTS; +		return 0;  	}  	return -EINVAL;  } @@ -1813,6 +1853,11 @@ static void apply_cgroup_root_flags(unsigned int root_flags)  			cgrp_dfl_root.flags |= CGRP_ROOT_NS_DELEGATE;  		else  			cgrp_dfl_root.flags &= ~CGRP_ROOT_NS_DELEGATE; + +		if (root_flags & CGRP_ROOT_MEMORY_LOCAL_EVENTS) +			cgrp_dfl_root.flags |= CGRP_ROOT_MEMORY_LOCAL_EVENTS; +		else +			cgrp_dfl_root.flags &= ~CGRP_ROOT_MEMORY_LOCAL_EVENTS;  	}  } @@ -1820,6 +1865,8 @@ static int cgroup_show_options(struct seq_file *seq, struct kernfs_root *kf_root  {  	if (cgrp_dfl_root.flags & CGRP_ROOT_NS_DELEGATE)  		seq_puts(seq, ",nsdelegate"); +	if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_LOCAL_EVENTS) +		seq_puts(seq, ",memory_localevents");  	return 0;  } @@ -2402,8 +2449,15 @@ static int cgroup_migrate_execute(struct cgroup_mgctx *mgctx)  			get_css_set(to_cset);  			to_cset->nr_tasks++;  			css_set_move_task(task, from_cset, to_cset, true); -			put_css_set_locked(from_cset);  			from_cset->nr_tasks--; +			/* +			 * If the source or destination cgroup is frozen, +			 * the task might require to change its state. 
+			 */ +			cgroup_freezer_migrate_task(task, from_cset->dfl_cgrp, +						    to_cset->dfl_cgrp); +			put_css_set_locked(from_cset); +  		}  	}  	spin_unlock_irq(&css_set_lock); @@ -2602,7 +2656,7 @@ int cgroup_migrate_prepare_dst(struct cgroup_mgctx *mgctx)  		dst_cset = find_css_set(src_cset, src_cset->mg_dst_cgrp);  		if (!dst_cset) -			goto err; +			return -ENOMEM;  		WARN_ON_ONCE(src_cset->mg_dst_cset || dst_cset->mg_dst_cset); @@ -2634,9 +2688,6 @@ int cgroup_migrate_prepare_dst(struct cgroup_mgctx *mgctx)  	}  	return 0; -err: -	cgroup_migrate_finish(mgctx); -	return -ENOMEM;  }  /** @@ -3447,8 +3498,11 @@ static ssize_t cgroup_max_depth_write(struct kernfs_open_file *of,  static int cgroup_events_show(struct seq_file *seq, void *v)  { -	seq_printf(seq, "populated %d\n", -		   cgroup_is_populated(seq_css(seq)->cgroup)); +	struct cgroup *cgrp = seq_css(seq)->cgroup; + +	seq_printf(seq, "populated %d\n", cgroup_is_populated(cgrp)); +	seq_printf(seq, "frozen %d\n", test_bit(CGRP_FROZEN, &cgrp->flags)); +  	return 0;  } @@ -3498,17 +3552,118 @@ static int cpu_stat_show(struct seq_file *seq, void *v)  #ifdef CONFIG_PSI  static int cgroup_io_pressure_show(struct seq_file *seq, void *v)  { -	return psi_show(seq, &seq_css(seq)->cgroup->psi, PSI_IO); +	struct cgroup *cgroup = seq_css(seq)->cgroup; +	struct psi_group *psi = cgroup->id == 1 ? &psi_system : &cgroup->psi; + +	return psi_show(seq, psi, PSI_IO);  }  static int cgroup_memory_pressure_show(struct seq_file *seq, void *v)  { -	return psi_show(seq, &seq_css(seq)->cgroup->psi, PSI_MEM); +	struct cgroup *cgroup = seq_css(seq)->cgroup; +	struct psi_group *psi = cgroup->id == 1 ? &psi_system : &cgroup->psi; + +	return psi_show(seq, psi, PSI_MEM);  }  static int cgroup_cpu_pressure_show(struct seq_file *seq, void *v)  { -	return psi_show(seq, &seq_css(seq)->cgroup->psi, PSI_CPU); +	struct cgroup *cgroup = seq_css(seq)->cgroup; +	struct psi_group *psi = cgroup->id == 1 ? 
&psi_system : &cgroup->psi; + +	return psi_show(seq, psi, PSI_CPU); +} + +static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf, +					  size_t nbytes, enum psi_res res) +{ +	struct psi_trigger *new; +	struct cgroup *cgrp; + +	cgrp = cgroup_kn_lock_live(of->kn, false); +	if (!cgrp) +		return -ENODEV; + +	cgroup_get(cgrp); +	cgroup_kn_unlock(of->kn); + +	new = psi_trigger_create(&cgrp->psi, buf, nbytes, res); +	if (IS_ERR(new)) { +		cgroup_put(cgrp); +		return PTR_ERR(new); +	} + +	psi_trigger_replace(&of->priv, new); + +	cgroup_put(cgrp); + +	return nbytes; +} + +static ssize_t cgroup_io_pressure_write(struct kernfs_open_file *of, +					  char *buf, size_t nbytes, +					  loff_t off) +{ +	return cgroup_pressure_write(of, buf, nbytes, PSI_IO); +} + +static ssize_t cgroup_memory_pressure_write(struct kernfs_open_file *of, +					  char *buf, size_t nbytes, +					  loff_t off) +{ +	return cgroup_pressure_write(of, buf, nbytes, PSI_MEM); +} + +static ssize_t cgroup_cpu_pressure_write(struct kernfs_open_file *of, +					  char *buf, size_t nbytes, +					  loff_t off) +{ +	return cgroup_pressure_write(of, buf, nbytes, PSI_CPU); +} + +static __poll_t cgroup_pressure_poll(struct kernfs_open_file *of, +					  poll_table *pt) +{ +	return psi_trigger_poll(&of->priv, of->file, pt); +} + +static void cgroup_pressure_release(struct kernfs_open_file *of) +{ +	psi_trigger_replace(&of->priv, NULL); +} +#endif /* CONFIG_PSI */ + +static int cgroup_freeze_show(struct seq_file *seq, void *v) +{ +	struct cgroup *cgrp = seq_css(seq)->cgroup; + +	seq_printf(seq, "%d\n", cgrp->freezer.freeze); + +	return 0; +} + +static ssize_t cgroup_freeze_write(struct kernfs_open_file *of, +				   char *buf, size_t nbytes, loff_t off) +{ +	struct cgroup *cgrp; +	ssize_t ret; +	int freeze; + +	ret = kstrtoint(strstrip(buf), 0, &freeze); +	if (ret) +		return ret; + +	if (freeze < 0 || freeze > 1) +		return -ERANGE; + +	cgrp = cgroup_kn_lock_live(of->kn, false); +	if (!cgrp) +		return -ENOENT; + +	cgroup_freeze(cgrp, freeze); + +	cgroup_kn_unlock(of->kn); + +	return nbytes;  } -#endif  static int cgroup_file_open(struct kernfs_open_file *of)  { @@ -4654,6 +4809,12 @@ static struct cftype cgroup_base_files[] = {  		.seq_show = cgroup_stat_show,  	},  	{ +		.name = "cgroup.freeze", +		.flags = CFTYPE_NOT_ON_ROOT, +		.seq_show = cgroup_freeze_show, +		.write = cgroup_freeze_write, +	}, +	{  		.name = "cpu.stat",  		.flags = CFTYPE_NOT_ON_ROOT,  		.seq_show = cpu_stat_show, @@ -4661,20 +4822,26 @@ static struct cftype cgroup_base_files[] = {  #ifdef CONFIG_PSI  	{  		.name = "io.pressure", -		.flags = CFTYPE_NOT_ON_ROOT,  		.seq_show = cgroup_io_pressure_show, +		.write = cgroup_io_pressure_write, +		.poll = cgroup_pressure_poll, +		.release = cgroup_pressure_release,  	},  	{  		.name = "memory.pressure", -		.flags = CFTYPE_NOT_ON_ROOT,  		.seq_show = cgroup_memory_pressure_show, +		.write = cgroup_memory_pressure_write, +		.poll = cgroup_pressure_poll, +		.release = cgroup_pressure_release,  	},  	{  		.name = "cpu.pressure", -		.flags = CFTYPE_NOT_ON_ROOT,  		.seq_show = cgroup_cpu_pressure_show, +		.write = cgroup_cpu_pressure_write, +		.poll = cgroup_pressure_poll, +		.release = cgroup_pressure_release,  	}, -#endif +#endif /* CONFIG_PSI */  	{ }	/* terminate */  }; @@ -4781,9 +4948,11 @@ static void css_release_work_fn(struct work_struct *work)  		if (cgroup_on_dfl(cgrp))  			cgroup_rstat_flush(cgrp); +		spin_lock_irq(&css_set_lock);  		for (tcgrp = cgroup_parent(cgrp); tcgrp;  		     tcgrp = 
cgroup_parent(tcgrp))  			tcgrp->nr_dying_descendants--; +		spin_unlock_irq(&css_set_lock);  		cgroup_idr_remove(&cgrp->root->cgroup_idr, cgrp->id);  		cgrp->id = -1; @@ -5001,12 +5170,31 @@ static struct cgroup *cgroup_create(struct cgroup *parent)  	if (ret)  		goto out_psi_free; +	/* +	 * New cgroup inherits effective freeze counter, and +	 * if the parent has to be frozen, the child has too. +	 */ +	cgrp->freezer.e_freeze = parent->freezer.e_freeze; +	if (cgrp->freezer.e_freeze) +		set_bit(CGRP_FROZEN, &cgrp->flags); + +	spin_lock_irq(&css_set_lock);  	for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp)) {  		cgrp->ancestor_ids[tcgrp->level] = tcgrp->id; -		if (tcgrp != cgrp) +		if (tcgrp != cgrp) {  			tcgrp->nr_descendants++; + +			/* +			 * If the new cgroup is frozen, all ancestor cgroups +			 * get a new frozen descendant, but their state can't +			 * change because of this. +			 */ +			if (cgrp->freezer.e_freeze) +				tcgrp->freezer.nr_frozen_descendants++; +		}  	} +	spin_unlock_irq(&css_set_lock);  	if (notify_on_release(parent))  		set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); @@ -5291,10 +5479,18 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)  	if (parent && cgroup_is_threaded(cgrp))  		parent->nr_threaded_children--; +	spin_lock_irq(&css_set_lock);  	for (tcgrp = cgroup_parent(cgrp); tcgrp; tcgrp = cgroup_parent(tcgrp)) {  		tcgrp->nr_descendants--;  		tcgrp->nr_dying_descendants++; +		/* +		 * If the dying cgroup is frozen, decrease frozen descendants +		 * counters of ancestor cgroups. +		 */ +		if (test_bit(CGRP_FROZEN, &cgrp->flags)) +			tcgrp->freezer.nr_frozen_descendants--;  	} +	spin_unlock_irq(&css_set_lock);  	cgroup1_check_for_release(parent); @@ -5746,6 +5942,26 @@ void cgroup_post_fork(struct task_struct *child)  			cset->nr_tasks++;  			css_set_move_task(child, NULL, cset, false);  		} + +		/* +		 * If the cgroup has to be frozen, the new task has too. +		 * Let's set the JOBCTL_TRAP_FREEZE jobctl bit to get +		 * the task into the frozen state. +		 */ +		if (unlikely(cgroup_task_freeze(child))) { +			spin_lock(&child->sighand->siglock); +			WARN_ON_ONCE(child->frozen); +			child->jobctl |= JOBCTL_TRAP_FREEZE; +			spin_unlock(&child->sighand->siglock); + +			/* +			 * Calling cgroup_update_frozen() isn't required here, +			 * because it will be called anyway a bit later +			 * from do_freezer_trap(). So we avoid cgroup's +			 * transient switch from the frozen state and back. +			 */ +		} +  		spin_unlock_irq(&css_set_lock);  	} @@ -5794,6 +6010,11 @@ void cgroup_exit(struct task_struct *tsk)  		spin_lock_irq(&css_set_lock);  		css_set_move_task(tsk, cset, NULL, false);  		cset->nr_tasks--; + +		WARN_ON_ONCE(cgroup_task_frozen(tsk)); +		if (unlikely(cgroup_task_freeze(tsk))) +			cgroup_update_frozen(task_dfl_cgroup(tsk)); +  		spin_unlock_irq(&css_set_lock);  	} else {  		get_css_set(cset); @@ -6116,7 +6337,7 @@ static struct kobj_attribute cgroup_delegate_attr = __ATTR_RO(delegate);  static ssize_t features_show(struct kobject *kobj, struct kobj_attribute *attr,  			     char *buf)  { -	return snprintf(buf, PAGE_SIZE, "nsdelegate\n"); +	return snprintf(buf, PAGE_SIZE, "nsdelegate\nmemory_localevents\n");  }  static struct kobj_attribute cgroup_features_attr = __ATTR_RO(features); diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 4834c4214e9c..6a1942ed781c 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -740,11 +740,10 @@ static inline int nr_cpusets(void)   * Must be called with cpuset_mutex held.   
*   * The three key local variables below are: - *    q  - a linked-list queue of cpuset pointers, used to implement a - *	   top-down scan of all cpusets.  This scan loads a pointer - *	   to each cpuset marked is_sched_load_balance into the - *	   array 'csa'.  For our purposes, rebuilding the schedulers - *	   sched domains, we can ignore !is_sched_load_balance cpusets. + *    cp - cpuset pointer, used (together with pos_css) to perform a + *	   top-down scan of all cpusets. For our purposes, rebuilding + *	   the schedulers sched domains, we can ignore !is_sched_load_ + *	   balance cpusets.   *  csa  - (for CpuSet Array) Array of pointers to all the cpusets   *	   that need to be load balanced, for convenient iterative   *	   access by the subsequent code that finds the best partition, @@ -775,7 +774,7 @@ static inline int nr_cpusets(void)  static int generate_sched_domains(cpumask_var_t **domains,  			struct sched_domain_attr **attributes)  { -	struct cpuset *cp;	/* scans q */ +	struct cpuset *cp;	/* top-down scan of cpusets */  	struct cpuset **csa;	/* array of all cpuset ptrs */  	int csn;		/* how many cpuset ptrs in csa so far */  	int i, j, k;		/* indices for partition finding loops */ diff --git a/kernel/cgroup/debug.c b/kernel/cgroup/debug.c index 5f1b87330bee..80aa3f027ac3 100644 --- a/kernel/cgroup/debug.c +++ b/kernel/cgroup/debug.c @@ -64,8 +64,8 @@ static int current_css_set_read(struct seq_file *seq, void *v)  		css = cset->subsys[ss->id];  		if (!css)  			continue; -		seq_printf(seq, "%2d: %-4s\t- %lx[%d]\n", ss->id, ss->name, -			  (unsigned long)css, css->id); +		seq_printf(seq, "%2d: %-4s\t- %p[%d]\n", ss->id, ss->name, +			  css, css->id);  	}  	rcu_read_unlock();  	spin_unlock_irq(&css_set_lock); @@ -224,8 +224,8 @@ static int cgroup_subsys_states_read(struct seq_file *seq, void *v)  		if (css->parent)  			snprintf(pbuf, sizeof(pbuf) - 1, " P=%d",  				 css->parent->id); -		seq_printf(seq, "%2d: %-4s\t- %lx[%d] %d%s\n", ss->id, ss->name, -			  (unsigned long)css, css->id, +		seq_printf(seq, "%2d: %-4s\t- %p[%d] %d%s\n", ss->id, ss->name, +			  css, css->id,  			  atomic_read(&css->online_cnt), pbuf);  	} diff --git a/kernel/cgroup/freezer.c b/kernel/cgroup/freezer.c index 08236798d173..8cf010680678 100644 --- a/kernel/cgroup/freezer.c +++ b/kernel/cgroup/freezer.c @@ -1,481 +1,314 @@ -/* - * cgroup_freezer.c -  control group freezer subsystem - * - * Copyright IBM Corporation, 2007 - * - * Author : Cedric Le Goater <clg@fr.ibm.com> - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of version 2.1 of the GNU Lesser General Public License - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. - */ - -#include <linux/export.h> -#include <linux/slab.h> +//SPDX-License-Identifier: GPL-2.0  #include <linux/cgroup.h> -#include <linux/fs.h> -#include <linux/uaccess.h> -#include <linux/freezer.h> -#include <linux/seq_file.h> -#include <linux/mutex.h> - -/* - * A cgroup is freezing if any FREEZING flags are set.  FREEZING_SELF is - * set if "FROZEN" is written to freezer.state cgroupfs file, and cleared - * for "THAWED".  FREEZING_PARENT is set if the parent freezer is FREEZING - * for whatever reason.  IOW, a cgroup has FREEZING_PARENT set if one of - * its ancestors has FREEZING_SELF set. 
- */ -enum freezer_state_flags { -	CGROUP_FREEZER_ONLINE	= (1 << 0), /* freezer is fully online */ -	CGROUP_FREEZING_SELF	= (1 << 1), /* this freezer is freezing */ -	CGROUP_FREEZING_PARENT	= (1 << 2), /* the parent freezer is freezing */ -	CGROUP_FROZEN		= (1 << 3), /* this and its descendants frozen */ +#include <linux/sched.h> +#include <linux/sched/task.h> +#include <linux/sched/signal.h> -	/* mask for all FREEZING flags */ -	CGROUP_FREEZING		= CGROUP_FREEZING_SELF | CGROUP_FREEZING_PARENT, -}; +#include "cgroup-internal.h" -struct freezer { -	struct cgroup_subsys_state	css; -	unsigned int			state; -}; +#include <trace/events/cgroup.h> -static DEFINE_MUTEX(freezer_mutex); - -static inline struct freezer *css_freezer(struct cgroup_subsys_state *css) +/* + * Propagate the cgroup frozen state upwards by the cgroup tree. + */ +static void cgroup_propagate_frozen(struct cgroup *cgrp, bool frozen)  { -	return css ? container_of(css, struct freezer, css) : NULL; -} +	int desc = 1; -static inline struct freezer *task_freezer(struct task_struct *task) -{ -	return css_freezer(task_css(task, freezer_cgrp_id)); +	/* +	 * If the new state is frozen, some freezing ancestor cgroups may change +	 * their state too, depending on if all their descendants are frozen. +	 * +	 * Otherwise, all ancestor cgroups are forced into the non-frozen state. +	 */ +	while ((cgrp = cgroup_parent(cgrp))) { +		if (frozen) { +			cgrp->freezer.nr_frozen_descendants += desc; +			if (!test_bit(CGRP_FROZEN, &cgrp->flags) && +			    test_bit(CGRP_FREEZE, &cgrp->flags) && +			    cgrp->freezer.nr_frozen_descendants == +			    cgrp->nr_descendants) { +				set_bit(CGRP_FROZEN, &cgrp->flags); +				cgroup_file_notify(&cgrp->events_file); +				TRACE_CGROUP_PATH(notify_frozen, cgrp, 1); +				desc++; +			} +		} else { +			cgrp->freezer.nr_frozen_descendants -= desc; +			if (test_bit(CGRP_FROZEN, &cgrp->flags)) { +				clear_bit(CGRP_FROZEN, &cgrp->flags); +				cgroup_file_notify(&cgrp->events_file); +				TRACE_CGROUP_PATH(notify_frozen, cgrp, 0); +				desc++; +			} +		} +	}  } -static struct freezer *parent_freezer(struct freezer *freezer) +/* + * Revisit the cgroup frozen state. + * Checks if the cgroup is really frozen and perform all state transitions. + */ +void cgroup_update_frozen(struct cgroup *cgrp)  { -	return css_freezer(freezer->css.parent); -} +	bool frozen; -bool cgroup_freezing(struct task_struct *task) -{ -	bool ret; +	lockdep_assert_held(&css_set_lock); -	rcu_read_lock(); -	ret = task_freezer(task)->state & CGROUP_FREEZING; -	rcu_read_unlock(); +	/* +	 * If the cgroup has to be frozen (CGRP_FREEZE bit set), +	 * and all tasks are frozen and/or stopped, let's consider +	 * the cgroup frozen. Otherwise it's not frozen. +	 */ +	frozen = test_bit(CGRP_FREEZE, &cgrp->flags) && +		cgrp->freezer.nr_frozen_tasks == __cgroup_task_count(cgrp); -	return ret; -} +	if (frozen) { +		/* Already there? */ +		if (test_bit(CGRP_FROZEN, &cgrp->flags)) +			return; -static const char *freezer_state_strs(unsigned int state) -{ -	if (state & CGROUP_FROZEN) -		return "FROZEN"; -	if (state & CGROUP_FREEZING) -		return "FREEZING"; -	return "THAWED"; -}; - -static struct cgroup_subsys_state * -freezer_css_alloc(struct cgroup_subsys_state *parent_css) -{ -	struct freezer *freezer; +		set_bit(CGRP_FROZEN, &cgrp->flags); +	} else { +		/* Already there? 
*/ +		if (!test_bit(CGRP_FROZEN, &cgrp->flags)) +			return; -	freezer = kzalloc(sizeof(struct freezer), GFP_KERNEL); -	if (!freezer) -		return ERR_PTR(-ENOMEM); +		clear_bit(CGRP_FROZEN, &cgrp->flags); +	} +	cgroup_file_notify(&cgrp->events_file); +	TRACE_CGROUP_PATH(notify_frozen, cgrp, frozen); -	return &freezer->css; +	/* Update the state of ancestor cgroups. */ +	cgroup_propagate_frozen(cgrp, frozen);  } -/** - * freezer_css_online - commit creation of a freezer css - * @css: css being created - * - * We're committing to creation of @css.  Mark it online and inherit - * parent's freezing state while holding both parent's and our - * freezer->lock. +/* + * Increment cgroup's nr_frozen_tasks.   */ -static int freezer_css_online(struct cgroup_subsys_state *css) +static void cgroup_inc_frozen_cnt(struct cgroup *cgrp)  { -	struct freezer *freezer = css_freezer(css); -	struct freezer *parent = parent_freezer(freezer); - -	mutex_lock(&freezer_mutex); - -	freezer->state |= CGROUP_FREEZER_ONLINE; - -	if (parent && (parent->state & CGROUP_FREEZING)) { -		freezer->state |= CGROUP_FREEZING_PARENT | CGROUP_FROZEN; -		atomic_inc(&system_freezing_cnt); -	} - -	mutex_unlock(&freezer_mutex); -	return 0; +	cgrp->freezer.nr_frozen_tasks++;  } -/** - * freezer_css_offline - initiate destruction of a freezer css - * @css: css being destroyed - * - * @css is going away.  Mark it dead and decrement system_freezing_count if - * it was holding one. +/* + * Decrement cgroup's nr_frozen_tasks.   */ -static void freezer_css_offline(struct cgroup_subsys_state *css) +static void cgroup_dec_frozen_cnt(struct cgroup *cgrp)  { -	struct freezer *freezer = css_freezer(css); - -	mutex_lock(&freezer_mutex); - -	if (freezer->state & CGROUP_FREEZING) -		atomic_dec(&system_freezing_cnt); - -	freezer->state = 0; - -	mutex_unlock(&freezer_mutex); +	cgrp->freezer.nr_frozen_tasks--; +	WARN_ON_ONCE(cgrp->freezer.nr_frozen_tasks < 0);  } -static void freezer_css_free(struct cgroup_subsys_state *css) +/* + * Enter frozen/stopped state, if not yet there. Update cgroup's counters, + * and revisit the state of the cgroup, if necessary. + */ +void cgroup_enter_frozen(void)  { -	kfree(css_freezer(css)); +	struct cgroup *cgrp; + +	if (current->frozen) +		return; + +	spin_lock_irq(&css_set_lock); +	current->frozen = true; +	cgrp = task_dfl_cgroup(current); +	cgroup_inc_frozen_cnt(cgrp); +	cgroup_update_frozen(cgrp); +	spin_unlock_irq(&css_set_lock);  }  /* - * Tasks can be migrated into a different freezer anytime regardless of its - * current state.  freezer_attach() is responsible for making new tasks - * conform to the current state. + * Conditionally leave frozen/stopped state. Update cgroup's counters, + * and revisit the state of the cgroup, if necessary.   * - * Freezer state changes and task migration are synchronized via - * @freezer->lock.  freezer_attach() makes the new tasks conform to the - * current state and all following state changes can see the new tasks. + * If always_leave is not set, and the cgroup is freezing, + * we're racing with the cgroup freezing. In this case, we don't + * drop the frozen counter to avoid a transient switch to + * the unfrozen state.   */ -static void freezer_attach(struct cgroup_taskset *tset) +void cgroup_leave_frozen(bool always_leave)  { -	struct task_struct *task; -	struct cgroup_subsys_state *new_css; - -	mutex_lock(&freezer_mutex); - -	/* -	 * Make the new tasks conform to the current state of @new_css. 
-	 * For simplicity, when migrating any task to a FROZEN cgroup, we
-	 * revert it to FREEZING and let update_if_frozen() determine the
-	 * correct state later.
-	 *
-	 * Tasks in @tset are on @new_css but may not conform to its
-	 * current state before executing the following - !frozen tasks may
-	 * be visible in a FROZEN cgroup and frozen tasks in a THAWED one.
-	 */
-	cgroup_taskset_for_each(task, new_css, tset) {
-		struct freezer *freezer = css_freezer(new_css);
-
-		if (!(freezer->state & CGROUP_FREEZING)) {
-			__thaw_task(task);
-		} else {
-			freeze_task(task);
-			/* clear FROZEN and propagate upwards */
-			while (freezer && (freezer->state & CGROUP_FROZEN)) {
-				freezer->state &= ~CGROUP_FROZEN;
-				freezer = parent_freezer(freezer);
-			}
-		}
+	struct cgroup *cgrp;
+
+	spin_lock_irq(&css_set_lock);
+	cgrp = task_dfl_cgroup(current);
+	if (always_leave || !test_bit(CGRP_FREEZE, &cgrp->flags)) {
+		cgroup_dec_frozen_cnt(cgrp);
+		cgroup_update_frozen(cgrp);
+		WARN_ON_ONCE(!current->frozen);
+		current->frozen = false;
+	} else if (!(current->jobctl & JOBCTL_TRAP_FREEZE)) {
+		spin_lock(&current->sighand->siglock);
+		current->jobctl |= JOBCTL_TRAP_FREEZE;
+		set_thread_flag(TIF_SIGPENDING);
+		spin_unlock(&current->sighand->siglock);
 	}
-
-	mutex_unlock(&freezer_mutex);
+	spin_unlock_irq(&css_set_lock);
 }
-/**
- * freezer_fork - cgroup post fork callback
- * @task: a task which has just been forked
- *
- * @task has just been created and should conform to the current state of
- * the cgroup_freezer it belongs to.  This function may race against
- * freezer_attach().  Losing to freezer_attach() means that we don't have
- * to do anything as freezer_attach() will put @task into the appropriate
- * state.
+/*
+ * Freeze or unfreeze the task by setting or clearing the JOBCTL_TRAP_FREEZE
+ * jobctl bit.
  */
-static void freezer_fork(struct task_struct *task)
+static void cgroup_freeze_task(struct task_struct *task, bool freeze)
 {
-	struct freezer *freezer;
+	unsigned long flags;
-	/*
-	 * The root cgroup is non-freezable, so we can skip locking the
-	 * freezer.  This is safe regardless of race with task migration.
-	 * If we didn't race or won, skipping is obviously the right thing
-	 * to do.  If we lost and root is the new cgroup, noop is still the
-	 * right thing to do.
-	 */
-	if (task_css_is_root(task, freezer_cgrp_id))
+	/* If the task is about to die, don't bother with freezing it. */
+	if (!lock_task_sighand(task, &flags))
 		return;
-	mutex_lock(&freezer_mutex);
-	rcu_read_lock();
-
-	freezer = task_freezer(task);
-	if (freezer->state & CGROUP_FREEZING)
-		freeze_task(task);
+	if (freeze) {
+		task->jobctl |= JOBCTL_TRAP_FREEZE;
+		signal_wake_up(task, false);
+	} else {
+		task->jobctl &= ~JOBCTL_TRAP_FREEZE;
+		wake_up_process(task);
+	}
-	rcu_read_unlock();
-	mutex_unlock(&freezer_mutex);
+	unlock_task_sighand(task, &flags);
 }
-/**
- * update_if_frozen - update whether a cgroup finished freezing
- * @css: css of interest
- *
- * Once FREEZING is initiated, transition to FROZEN is lazily updated by
- * calling this function.  If the current state is FREEZING but not FROZEN,
- * this function checks whether all tasks of this cgroup and the descendant
- * cgroups finished freezing and, if so, sets FROZEN.
- *
- * The caller is responsible for grabbing RCU read lock and calling
- * update_if_frozen() on all descendants prior to invoking this function.
- * - * Task states and freezer state might disagree while tasks are being - * migrated into or out of @css, so we can't verify task states against - * @freezer state here.  See freezer_attach() for details. +/* + * Freeze or unfreeze all tasks in the given cgroup.   */ -static void update_if_frozen(struct cgroup_subsys_state *css) +static void cgroup_do_freeze(struct cgroup *cgrp, bool freeze)  { -	struct freezer *freezer = css_freezer(css); -	struct cgroup_subsys_state *pos;  	struct css_task_iter it;  	struct task_struct *task; -	lockdep_assert_held(&freezer_mutex); - -	if (!(freezer->state & CGROUP_FREEZING) || -	    (freezer->state & CGROUP_FROZEN)) -		return; +	lockdep_assert_held(&cgroup_mutex); -	/* are all (live) children frozen? */ -	rcu_read_lock(); -	css_for_each_child(pos, css) { -		struct freezer *child = css_freezer(pos); - -		if ((child->state & CGROUP_FREEZER_ONLINE) && -		    !(child->state & CGROUP_FROZEN)) { -			rcu_read_unlock(); -			return; -		} -	} -	rcu_read_unlock(); +	spin_lock_irq(&css_set_lock); +	if (freeze) +		set_bit(CGRP_FREEZE, &cgrp->flags); +	else +		clear_bit(CGRP_FREEZE, &cgrp->flags); +	spin_unlock_irq(&css_set_lock); -	/* are all tasks frozen? */ -	css_task_iter_start(css, 0, &it); +	if (freeze) +		TRACE_CGROUP_PATH(freeze, cgrp); +	else +		TRACE_CGROUP_PATH(unfreeze, cgrp); +	css_task_iter_start(&cgrp->self, 0, &it);  	while ((task = css_task_iter_next(&it))) { -		if (freezing(task)) { -			/* -			 * freezer_should_skip() indicates that the task -			 * should be skipped when determining freezing -			 * completion.  Consider it frozen in addition to -			 * the usual frozen condition. -			 */ -			if (!frozen(task) && !freezer_should_skip(task)) -				goto out_iter_end; -		} -	} - -	freezer->state |= CGROUP_FROZEN; -out_iter_end: -	css_task_iter_end(&it); -} - -static int freezer_read(struct seq_file *m, void *v) -{ -	struct cgroup_subsys_state *css = seq_css(m), *pos; - -	mutex_lock(&freezer_mutex); -	rcu_read_lock(); - -	/* update states bottom-up */ -	css_for_each_descendant_post(pos, css) { -		if (!css_tryget_online(pos)) +		/* +		 * Ignore kernel threads here. Freezing cgroups containing +		 * kthreads isn't supported. +		 */ +		if (task->flags & PF_KTHREAD)  			continue; -		rcu_read_unlock(); - -		update_if_frozen(pos); - -		rcu_read_lock(); -		css_put(pos); +		cgroup_freeze_task(task, freeze);  	} - -	rcu_read_unlock(); -	mutex_unlock(&freezer_mutex); - -	seq_puts(m, freezer_state_strs(css_freezer(css)->state)); -	seq_putc(m, '\n'); -	return 0; -} - -static void freeze_cgroup(struct freezer *freezer) -{ -	struct css_task_iter it; -	struct task_struct *task; - -	css_task_iter_start(&freezer->css, 0, &it); -	while ((task = css_task_iter_next(&it))) -		freeze_task(task);  	css_task_iter_end(&it); -} -static void unfreeze_cgroup(struct freezer *freezer) -{ -	struct css_task_iter it; -	struct task_struct *task; - -	css_task_iter_start(&freezer->css, 0, &it); -	while ((task = css_task_iter_next(&it))) -		__thaw_task(task); -	css_task_iter_end(&it); +	/* +	 * Cgroup state should be revisited here to cover empty leaf cgroups +	 * and cgroups which descendants are already in the desired state. 
+	 */ +	spin_lock_irq(&css_set_lock); +	if (cgrp->nr_descendants == cgrp->freezer.nr_frozen_descendants) +		cgroup_update_frozen(cgrp); +	spin_unlock_irq(&css_set_lock);  } -/** - * freezer_apply_state - apply state change to a single cgroup_freezer - * @freezer: freezer to apply state change to - * @freeze: whether to freeze or unfreeze - * @state: CGROUP_FREEZING_* flag to set or clear - * - * Set or clear @state on @cgroup according to @freeze, and perform - * freezing or thawing as necessary. +/* + * Adjust the task state (freeze or unfreeze) and revisit the state of + * source and destination cgroups.   */ -static void freezer_apply_state(struct freezer *freezer, bool freeze, -				unsigned int state) +void cgroup_freezer_migrate_task(struct task_struct *task, +				 struct cgroup *src, struct cgroup *dst)  { -	/* also synchronizes against task migration, see freezer_attach() */ -	lockdep_assert_held(&freezer_mutex); +	lockdep_assert_held(&css_set_lock); -	if (!(freezer->state & CGROUP_FREEZER_ONLINE)) +	/* +	 * Kernel threads are not supposed to be frozen at all. +	 */ +	if (task->flags & PF_KTHREAD)  		return; -	if (freeze) { -		if (!(freezer->state & CGROUP_FREEZING)) -			atomic_inc(&system_freezing_cnt); -		freezer->state |= state; -		freeze_cgroup(freezer); -	} else { -		bool was_freezing = freezer->state & CGROUP_FREEZING; - -		freezer->state &= ~state; - -		if (!(freezer->state & CGROUP_FREEZING)) { -			if (was_freezing) -				atomic_dec(&system_freezing_cnt); -			freezer->state &= ~CGROUP_FROZEN; -			unfreeze_cgroup(freezer); -		} +	/* +	 * Adjust counters of freezing and frozen tasks. +	 * Note, that if the task is frozen, but the destination cgroup is not +	 * frozen, we bump both counters to keep them balanced. +	 */ +	if (task->frozen) { +		cgroup_inc_frozen_cnt(dst); +		cgroup_dec_frozen_cnt(src);  	} -} - -/** - * freezer_change_state - change the freezing state of a cgroup_freezer - * @freezer: freezer of interest - * @freeze: whether to freeze or thaw - * - * Freeze or thaw @freezer according to @freeze.  The operations are - * recursive - all descendants of @freezer will be affected. - */ -static void freezer_change_state(struct freezer *freezer, bool freeze) -{ -	struct cgroup_subsys_state *pos; +	cgroup_update_frozen(dst); +	cgroup_update_frozen(src);  	/* -	 * Update all its descendants in pre-order traversal.  Each -	 * descendant will try to inherit its parent's FREEZING state as -	 * CGROUP_FREEZING_PARENT. +	 * Force the task to the desired state.  	 
*/ -	mutex_lock(&freezer_mutex); -	rcu_read_lock(); -	css_for_each_descendant_pre(pos, &freezer->css) { -		struct freezer *pos_f = css_freezer(pos); -		struct freezer *parent = parent_freezer(pos_f); - -		if (!css_tryget_online(pos)) -			continue; -		rcu_read_unlock(); - -		if (pos_f == freezer) -			freezer_apply_state(pos_f, freeze, -					    CGROUP_FREEZING_SELF); -		else -			freezer_apply_state(pos_f, -					    parent->state & CGROUP_FREEZING, -					    CGROUP_FREEZING_PARENT); - -		rcu_read_lock(); -		css_put(pos); -	} -	rcu_read_unlock(); -	mutex_unlock(&freezer_mutex); +	cgroup_freeze_task(task, test_bit(CGRP_FREEZE, &dst->flags));  } -static ssize_t freezer_write(struct kernfs_open_file *of, -			     char *buf, size_t nbytes, loff_t off) +void cgroup_freeze(struct cgroup *cgrp, bool freeze)  { -	bool freeze; +	struct cgroup_subsys_state *css; +	struct cgroup *dsct; +	bool applied = false; -	buf = strstrip(buf); +	lockdep_assert_held(&cgroup_mutex); -	if (strcmp(buf, freezer_state_strs(0)) == 0) -		freeze = false; -	else if (strcmp(buf, freezer_state_strs(CGROUP_FROZEN)) == 0) -		freeze = true; -	else -		return -EINVAL; +	/* +	 * Nothing changed? Just exit. +	 */ +	if (cgrp->freezer.freeze == freeze) +		return; -	freezer_change_state(css_freezer(of_css(of)), freeze); -	return nbytes; -} +	cgrp->freezer.freeze = freeze; -static u64 freezer_self_freezing_read(struct cgroup_subsys_state *css, -				      struct cftype *cft) -{ -	struct freezer *freezer = css_freezer(css); +	/* +	 * Propagate changes downwards the cgroup tree. +	 */ +	css_for_each_descendant_pre(css, &cgrp->self) { +		dsct = css->cgroup; -	return (bool)(freezer->state & CGROUP_FREEZING_SELF); -} +		if (cgroup_is_dead(dsct)) +			continue; -static u64 freezer_parent_freezing_read(struct cgroup_subsys_state *css, -					struct cftype *cft) -{ -	struct freezer *freezer = css_freezer(css); +		if (freeze) { +			dsct->freezer.e_freeze++; +			/* +			 * Already frozen because of ancestor's settings? +			 */ +			if (dsct->freezer.e_freeze > 1) +				continue; +		} else { +			dsct->freezer.e_freeze--; +			/* +			 * Still frozen because of ancestor's settings? +			 */ +			if (dsct->freezer.e_freeze > 0) +				continue; -	return (bool)(freezer->state & CGROUP_FREEZING_PARENT); -} +			WARN_ON_ONCE(dsct->freezer.e_freeze < 0); +		} + +		/* +		 * Do change actual state: freeze or unfreeze. +		 */ +		cgroup_do_freeze(dsct, freeze); +		applied = true; +	} -static struct cftype files[] = { -	{ -		.name = "state", -		.flags = CFTYPE_NOT_ON_ROOT, -		.seq_show = freezer_read, -		.write = freezer_write, -	}, -	{ -		.name = "self_freezing", -		.flags = CFTYPE_NOT_ON_ROOT, -		.read_u64 = freezer_self_freezing_read, -	}, -	{ -		.name = "parent_freezing", -		.flags = CFTYPE_NOT_ON_ROOT, -		.read_u64 = freezer_parent_freezing_read, -	}, -	{ }	/* terminate */ -}; - -struct cgroup_subsys freezer_cgrp_subsys = { -	.css_alloc	= freezer_css_alloc, -	.css_online	= freezer_css_online, -	.css_offline	= freezer_css_offline, -	.css_free	= freezer_css_free, -	.attach		= freezer_attach, -	.fork		= freezer_fork, -	.legacy_cftypes	= files, -}; +	/* +	 * Even if the actual state hasn't changed, let's notify a user. +	 * The state can be enforced by an ancestor cgroup: the cgroup +	 * can already be in the desired state or it can be locked in the +	 * opposite state, so that the transition will never happen. +	 * In both cases it's better to notify a user, that there is +	 * nothing to wait for. 
+	 */ +	if (!applied) { +		TRACE_CGROUP_PATH(notify_frozen, cgrp, +				  test_bit(CGRP_FROZEN, &cgrp->flags)); +		cgroup_file_notify(&cgrp->events_file); +	} +} diff --git a/kernel/cgroup/legacy_freezer.c b/kernel/cgroup/legacy_freezer.c new file mode 100644 index 000000000000..08236798d173 --- /dev/null +++ b/kernel/cgroup/legacy_freezer.c @@ -0,0 +1,481 @@ +/* + * cgroup_freezer.c -  control group freezer subsystem + * + * Copyright IBM Corporation, 2007 + * + * Author : Cedric Le Goater <clg@fr.ibm.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2.1 of the GNU Lesser General Public License + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + */ + +#include <linux/export.h> +#include <linux/slab.h> +#include <linux/cgroup.h> +#include <linux/fs.h> +#include <linux/uaccess.h> +#include <linux/freezer.h> +#include <linux/seq_file.h> +#include <linux/mutex.h> + +/* + * A cgroup is freezing if any FREEZING flags are set.  FREEZING_SELF is + * set if "FROZEN" is written to freezer.state cgroupfs file, and cleared + * for "THAWED".  FREEZING_PARENT is set if the parent freezer is FREEZING + * for whatever reason.  IOW, a cgroup has FREEZING_PARENT set if one of + * its ancestors has FREEZING_SELF set. + */ +enum freezer_state_flags { +	CGROUP_FREEZER_ONLINE	= (1 << 0), /* freezer is fully online */ +	CGROUP_FREEZING_SELF	= (1 << 1), /* this freezer is freezing */ +	CGROUP_FREEZING_PARENT	= (1 << 2), /* the parent freezer is freezing */ +	CGROUP_FROZEN		= (1 << 3), /* this and its descendants frozen */ + +	/* mask for all FREEZING flags */ +	CGROUP_FREEZING		= CGROUP_FREEZING_SELF | CGROUP_FREEZING_PARENT, +}; + +struct freezer { +	struct cgroup_subsys_state	css; +	unsigned int			state; +}; + +static DEFINE_MUTEX(freezer_mutex); + +static inline struct freezer *css_freezer(struct cgroup_subsys_state *css) +{ +	return css ? container_of(css, struct freezer, css) : NULL; +} + +static inline struct freezer *task_freezer(struct task_struct *task) +{ +	return css_freezer(task_css(task, freezer_cgrp_id)); +} + +static struct freezer *parent_freezer(struct freezer *freezer) +{ +	return css_freezer(freezer->css.parent); +} + +bool cgroup_freezing(struct task_struct *task) +{ +	bool ret; + +	rcu_read_lock(); +	ret = task_freezer(task)->state & CGROUP_FREEZING; +	rcu_read_unlock(); + +	return ret; +} + +static const char *freezer_state_strs(unsigned int state) +{ +	if (state & CGROUP_FROZEN) +		return "FROZEN"; +	if (state & CGROUP_FREEZING) +		return "FREEZING"; +	return "THAWED"; +}; + +static struct cgroup_subsys_state * +freezer_css_alloc(struct cgroup_subsys_state *parent_css) +{ +	struct freezer *freezer; + +	freezer = kzalloc(sizeof(struct freezer), GFP_KERNEL); +	if (!freezer) +		return ERR_PTR(-ENOMEM); + +	return &freezer->css; +} + +/** + * freezer_css_online - commit creation of a freezer css + * @css: css being created + * + * We're committing to creation of @css.  Mark it online and inherit + * parent's freezing state while holding both parent's and our + * freezer->lock. 
+ */ +static int freezer_css_online(struct cgroup_subsys_state *css) +{ +	struct freezer *freezer = css_freezer(css); +	struct freezer *parent = parent_freezer(freezer); + +	mutex_lock(&freezer_mutex); + +	freezer->state |= CGROUP_FREEZER_ONLINE; + +	if (parent && (parent->state & CGROUP_FREEZING)) { +		freezer->state |= CGROUP_FREEZING_PARENT | CGROUP_FROZEN; +		atomic_inc(&system_freezing_cnt); +	} + +	mutex_unlock(&freezer_mutex); +	return 0; +} + +/** + * freezer_css_offline - initiate destruction of a freezer css + * @css: css being destroyed + * + * @css is going away.  Mark it dead and decrement system_freezing_count if + * it was holding one. + */ +static void freezer_css_offline(struct cgroup_subsys_state *css) +{ +	struct freezer *freezer = css_freezer(css); + +	mutex_lock(&freezer_mutex); + +	if (freezer->state & CGROUP_FREEZING) +		atomic_dec(&system_freezing_cnt); + +	freezer->state = 0; + +	mutex_unlock(&freezer_mutex); +} + +static void freezer_css_free(struct cgroup_subsys_state *css) +{ +	kfree(css_freezer(css)); +} + +/* + * Tasks can be migrated into a different freezer anytime regardless of its + * current state.  freezer_attach() is responsible for making new tasks + * conform to the current state. + * + * Freezer state changes and task migration are synchronized via + * @freezer->lock.  freezer_attach() makes the new tasks conform to the + * current state and all following state changes can see the new tasks. + */ +static void freezer_attach(struct cgroup_taskset *tset) +{ +	struct task_struct *task; +	struct cgroup_subsys_state *new_css; + +	mutex_lock(&freezer_mutex); + +	/* +	 * Make the new tasks conform to the current state of @new_css. +	 * For simplicity, when migrating any task to a FROZEN cgroup, we +	 * revert it to FREEZING and let update_if_frozen() determine the +	 * correct state later. +	 * +	 * Tasks in @tset are on @new_css but may not conform to its +	 * current state before executing the following - !frozen tasks may +	 * be visible in a FROZEN cgroup and frozen tasks in a THAWED one. +	 */ +	cgroup_taskset_for_each(task, new_css, tset) { +		struct freezer *freezer = css_freezer(new_css); + +		if (!(freezer->state & CGROUP_FREEZING)) { +			__thaw_task(task); +		} else { +			freeze_task(task); +			/* clear FROZEN and propagate upwards */ +			while (freezer && (freezer->state & CGROUP_FROZEN)) { +				freezer->state &= ~CGROUP_FROZEN; +				freezer = parent_freezer(freezer); +			} +		} +	} + +	mutex_unlock(&freezer_mutex); +} + +/** + * freezer_fork - cgroup post fork callback + * @task: a task which has just been forked + * + * @task has just been created and should conform to the current state of + * the cgroup_freezer it belongs to.  This function may race against + * freezer_attach().  Losing to freezer_attach() means that we don't have + * to do anything as freezer_attach() will put @task into the appropriate + * state. + */ +static void freezer_fork(struct task_struct *task) +{ +	struct freezer *freezer; + +	/* +	 * The root cgroup is non-freezable, so we can skip locking the +	 * freezer.  This is safe regardless of race with task migration. +	 * If we didn't race or won, skipping is obviously the right thing +	 * to do.  If we lost and root is the new cgroup, noop is still the +	 * right thing to do. 
+	 */ +	if (task_css_is_root(task, freezer_cgrp_id)) +		return; + +	mutex_lock(&freezer_mutex); +	rcu_read_lock(); + +	freezer = task_freezer(task); +	if (freezer->state & CGROUP_FREEZING) +		freeze_task(task); + +	rcu_read_unlock(); +	mutex_unlock(&freezer_mutex); +} + +/** + * update_if_frozen - update whether a cgroup finished freezing + * @css: css of interest + * + * Once FREEZING is initiated, transition to FROZEN is lazily updated by + * calling this function.  If the current state is FREEZING but not FROZEN, + * this function checks whether all tasks of this cgroup and the descendant + * cgroups finished freezing and, if so, sets FROZEN. + * + * The caller is responsible for grabbing RCU read lock and calling + * update_if_frozen() on all descendants prior to invoking this function. + * + * Task states and freezer state might disagree while tasks are being + * migrated into or out of @css, so we can't verify task states against + * @freezer state here.  See freezer_attach() for details. + */ +static void update_if_frozen(struct cgroup_subsys_state *css) +{ +	struct freezer *freezer = css_freezer(css); +	struct cgroup_subsys_state *pos; +	struct css_task_iter it; +	struct task_struct *task; + +	lockdep_assert_held(&freezer_mutex); + +	if (!(freezer->state & CGROUP_FREEZING) || +	    (freezer->state & CGROUP_FROZEN)) +		return; + +	/* are all (live) children frozen? */ +	rcu_read_lock(); +	css_for_each_child(pos, css) { +		struct freezer *child = css_freezer(pos); + +		if ((child->state & CGROUP_FREEZER_ONLINE) && +		    !(child->state & CGROUP_FROZEN)) { +			rcu_read_unlock(); +			return; +		} +	} +	rcu_read_unlock(); + +	/* are all tasks frozen? */ +	css_task_iter_start(css, 0, &it); + +	while ((task = css_task_iter_next(&it))) { +		if (freezing(task)) { +			/* +			 * freezer_should_skip() indicates that the task +			 * should be skipped when determining freezing +			 * completion.  Consider it frozen in addition to +			 * the usual frozen condition. +			 */ +			if (!frozen(task) && !freezer_should_skip(task)) +				goto out_iter_end; +		} +	} + +	freezer->state |= CGROUP_FROZEN; +out_iter_end: +	css_task_iter_end(&it); +} + +static int freezer_read(struct seq_file *m, void *v) +{ +	struct cgroup_subsys_state *css = seq_css(m), *pos; + +	mutex_lock(&freezer_mutex); +	rcu_read_lock(); + +	/* update states bottom-up */ +	css_for_each_descendant_post(pos, css) { +		if (!css_tryget_online(pos)) +			continue; +		rcu_read_unlock(); + +		update_if_frozen(pos); + +		rcu_read_lock(); +		css_put(pos); +	} + +	rcu_read_unlock(); +	mutex_unlock(&freezer_mutex); + +	seq_puts(m, freezer_state_strs(css_freezer(css)->state)); +	seq_putc(m, '\n'); +	return 0; +} + +static void freeze_cgroup(struct freezer *freezer) +{ +	struct css_task_iter it; +	struct task_struct *task; + +	css_task_iter_start(&freezer->css, 0, &it); +	while ((task = css_task_iter_next(&it))) +		freeze_task(task); +	css_task_iter_end(&it); +} + +static void unfreeze_cgroup(struct freezer *freezer) +{ +	struct css_task_iter it; +	struct task_struct *task; + +	css_task_iter_start(&freezer->css, 0, &it); +	while ((task = css_task_iter_next(&it))) +		__thaw_task(task); +	css_task_iter_end(&it); +} + +/** + * freezer_apply_state - apply state change to a single cgroup_freezer + * @freezer: freezer to apply state change to + * @freeze: whether to freeze or unfreeze + * @state: CGROUP_FREEZING_* flag to set or clear + * + * Set or clear @state on @cgroup according to @freeze, and perform + * freezing or thawing as necessary. 
+ */ +static void freezer_apply_state(struct freezer *freezer, bool freeze, +				unsigned int state) +{ +	/* also synchronizes against task migration, see freezer_attach() */ +	lockdep_assert_held(&freezer_mutex); + +	if (!(freezer->state & CGROUP_FREEZER_ONLINE)) +		return; + +	if (freeze) { +		if (!(freezer->state & CGROUP_FREEZING)) +			atomic_inc(&system_freezing_cnt); +		freezer->state |= state; +		freeze_cgroup(freezer); +	} else { +		bool was_freezing = freezer->state & CGROUP_FREEZING; + +		freezer->state &= ~state; + +		if (!(freezer->state & CGROUP_FREEZING)) { +			if (was_freezing) +				atomic_dec(&system_freezing_cnt); +			freezer->state &= ~CGROUP_FROZEN; +			unfreeze_cgroup(freezer); +		} +	} +} + +/** + * freezer_change_state - change the freezing state of a cgroup_freezer + * @freezer: freezer of interest + * @freeze: whether to freeze or thaw + * + * Freeze or thaw @freezer according to @freeze.  The operations are + * recursive - all descendants of @freezer will be affected. + */ +static void freezer_change_state(struct freezer *freezer, bool freeze) +{ +	struct cgroup_subsys_state *pos; + +	/* +	 * Update all its descendants in pre-order traversal.  Each +	 * descendant will try to inherit its parent's FREEZING state as +	 * CGROUP_FREEZING_PARENT. +	 */ +	mutex_lock(&freezer_mutex); +	rcu_read_lock(); +	css_for_each_descendant_pre(pos, &freezer->css) { +		struct freezer *pos_f = css_freezer(pos); +		struct freezer *parent = parent_freezer(pos_f); + +		if (!css_tryget_online(pos)) +			continue; +		rcu_read_unlock(); + +		if (pos_f == freezer) +			freezer_apply_state(pos_f, freeze, +					    CGROUP_FREEZING_SELF); +		else +			freezer_apply_state(pos_f, +					    parent->state & CGROUP_FREEZING, +					    CGROUP_FREEZING_PARENT); + +		rcu_read_lock(); +		css_put(pos); +	} +	rcu_read_unlock(); +	mutex_unlock(&freezer_mutex); +} + +static ssize_t freezer_write(struct kernfs_open_file *of, +			     char *buf, size_t nbytes, loff_t off) +{ +	bool freeze; + +	buf = strstrip(buf); + +	if (strcmp(buf, freezer_state_strs(0)) == 0) +		freeze = false; +	else if (strcmp(buf, freezer_state_strs(CGROUP_FROZEN)) == 0) +		freeze = true; +	else +		return -EINVAL; + +	freezer_change_state(css_freezer(of_css(of)), freeze); +	return nbytes; +} + +static u64 freezer_self_freezing_read(struct cgroup_subsys_state *css, +				      struct cftype *cft) +{ +	struct freezer *freezer = css_freezer(css); + +	return (bool)(freezer->state & CGROUP_FREEZING_SELF); +} + +static u64 freezer_parent_freezing_read(struct cgroup_subsys_state *css, +					struct cftype *cft) +{ +	struct freezer *freezer = css_freezer(css); + +	return (bool)(freezer->state & CGROUP_FREEZING_PARENT); +} + +static struct cftype files[] = { +	{ +		.name = "state", +		.flags = CFTYPE_NOT_ON_ROOT, +		.seq_show = freezer_read, +		.write = freezer_write, +	}, +	{ +		.name = "self_freezing", +		.flags = CFTYPE_NOT_ON_ROOT, +		.read_u64 = freezer_self_freezing_read, +	}, +	{ +		.name = "parent_freezing", +		.flags = CFTYPE_NOT_ON_ROOT, +		.read_u64 = freezer_parent_freezing_read, +	}, +	{ }	/* terminate */ +}; + +struct cgroup_subsys freezer_cgrp_subsys = { +	.css_alloc	= freezer_css_alloc, +	.css_online	= freezer_css_online, +	.css_offline	= freezer_css_offline, +	.css_free	= freezer_css_free, +	.attach		= freezer_attach, +	.fork		= freezer_fork, +	.legacy_cftypes	= files, +}; diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c index bb95a35e8c2d..ca19b4c8acf5 100644 --- a/kernel/cgroup/rstat.c +++ b/kernel/cgroup/rstat.c @@ -1,3 
+1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only  #include "cgroup-internal.h"  #include <linux/sched/cputime.h> diff --git a/kernel/compat.c b/kernel/compat.c index d8a36c6ad7c9..b5f7063c0db6 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -346,8 +346,11 @@ get_compat_sigset(sigset_t *set, const compat_sigset_t __user *compat)  		return -EFAULT;  	switch (_NSIG_WORDS) {  	case 4: set->sig[3] = v.sig[6] | (((long)v.sig[7]) << 32 ); +		/* fall through */  	case 3: set->sig[2] = v.sig[4] | (((long)v.sig[5]) << 32 ); +		/* fall through */  	case 2: set->sig[1] = v.sig[2] | (((long)v.sig[3]) << 32 ); +		/* fall through */  	case 1: set->sig[0] = v.sig[0] | (((long)v.sig[1]) << 32 );  	}  #else diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c index 9ad37b9e44a7..be01a4d627c9 100644 --- a/kernel/context_tracking.c +++ b/kernel/context_tracking.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only  /*   * Context tracking: Probe on high level context boundaries such as kernel   * and userspace. This includes syscalls and exceptions entry/exit. diff --git a/kernel/cpu.c b/kernel/cpu.c index 025f419d16f6..f2ef10460698 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -9,6 +9,7 @@  #include <linux/notifier.h>  #include <linux/sched/signal.h>  #include <linux/sched/hotplug.h> +#include <linux/sched/isolation.h>  #include <linux/sched/task.h>  #include <linux/sched/smt.h>  #include <linux/unistd.h> @@ -564,6 +565,20 @@ static void undo_cpu_up(unsigned int cpu, struct cpuhp_cpu_state *st)  		cpuhp_invoke_callback(cpu, st->state, false, NULL, NULL);  } +static inline bool can_rollback_cpu(struct cpuhp_cpu_state *st) +{ +	if (IS_ENABLED(CONFIG_HOTPLUG_CPU)) +		return true; +	/* +	 * When CPU hotplug is disabled, then taking the CPU down is not +	 * possible because takedown_cpu() and the architecture and +	 * subsystem specific mechanisms are not available. So the CPU +	 * which would be completely unplugged again needs to stay around +	 * in the current state. 
+	 */ +	return st->state <= CPUHP_BRINGUP_CPU; +} +  static int cpuhp_up_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st,  			      enum cpuhp_state target)  { @@ -574,8 +589,10 @@ static int cpuhp_up_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st,  		st->state++;  		ret = cpuhp_invoke_callback(cpu, st->state, true, NULL, NULL);  		if (ret) { -			st->target = prev_state; -			undo_cpu_up(cpu, st); +			if (can_rollback_cpu(st)) { +				st->target = prev_state; +				undo_cpu_up(cpu, st); +			}  			break;  		}  	} @@ -844,6 +861,8 @@ static int take_cpu_down(void *_param)  	/* Give up timekeeping duties */  	tick_handover_do_timer(); +	/* Remove CPU from timer broadcasting */ +	tick_offline_cpu(cpu);  	/* Park the stopper thread */  	stop_machine_park(cpu);  	return 0; @@ -1183,8 +1202,15 @@ int freeze_secondary_cpus(int primary)  	int cpu, error = 0;  	cpu_maps_update_begin(); -	if (!cpu_online(primary)) +	if (primary == -1) {  		primary = cpumask_first(cpu_online_mask); +		if (!housekeeping_cpu(primary, HK_FLAG_TIMER)) +			primary = housekeeping_any_cpu(HK_FLAG_TIMER); +	} else { +		if (!cpu_online(primary)) +			primary = cpumask_first(cpu_online_mask); +	} +  	/*  	 * We take down all of the non-boot CPUs in one shot to avoid races  	 * with the userspace trying to use the CPU hotplug at the same time @@ -2017,19 +2043,6 @@ static const struct attribute_group cpuhp_cpu_root_attr_group = {  #ifdef CONFIG_HOTPLUG_SMT -static const char *smt_states[] = { -	[CPU_SMT_ENABLED]		= "on", -	[CPU_SMT_DISABLED]		= "off", -	[CPU_SMT_FORCE_DISABLED]	= "forceoff", -	[CPU_SMT_NOT_SUPPORTED]		= "notsupported", -}; - -static ssize_t -show_smt_control(struct device *dev, struct device_attribute *attr, char *buf) -{ -	return snprintf(buf, PAGE_SIZE - 2, "%s\n", smt_states[cpu_smt_control]); -} -  static void cpuhp_offline_cpu_device(unsigned int cpu)  {  	struct device *dev = get_cpu_device(cpu); @@ -2100,9 +2113,10 @@ static int cpuhp_smt_enable(void)  	return ret;  } +  static ssize_t -store_smt_control(struct device *dev, struct device_attribute *attr, -		  const char *buf, size_t count) +__store_smt_control(struct device *dev, struct device_attribute *attr, +		    const char *buf, size_t count)  {  	int ctrlval, ret; @@ -2140,14 +2154,44 @@ store_smt_control(struct device *dev, struct device_attribute *attr,  	unlock_device_hotplug();  	return ret ? 
ret : count;  } + +#else /* !CONFIG_HOTPLUG_SMT */ +static ssize_t +__store_smt_control(struct device *dev, struct device_attribute *attr, +		    const char *buf, size_t count) +{ +	return -ENODEV; +} +#endif /* CONFIG_HOTPLUG_SMT */ + +static const char *smt_states[] = { +	[CPU_SMT_ENABLED]		= "on", +	[CPU_SMT_DISABLED]		= "off", +	[CPU_SMT_FORCE_DISABLED]	= "forceoff", +	[CPU_SMT_NOT_SUPPORTED]		= "notsupported", +	[CPU_SMT_NOT_IMPLEMENTED]	= "notimplemented", +}; + +static ssize_t +show_smt_control(struct device *dev, struct device_attribute *attr, char *buf) +{ +	const char *state = smt_states[cpu_smt_control]; + +	return snprintf(buf, PAGE_SIZE - 2, "%s\n", state); +} + +static ssize_t +store_smt_control(struct device *dev, struct device_attribute *attr, +		  const char *buf, size_t count) +{ +	return __store_smt_control(dev, attr, buf, count); +}  static DEVICE_ATTR(control, 0644, show_smt_control, store_smt_control);  static ssize_t  show_smt_active(struct device *dev, struct device_attribute *attr, char *buf)  { -	bool active = topology_max_smt_threads() > 1; - -	return snprintf(buf, PAGE_SIZE - 2, "%d\n", active); +	return snprintf(buf, PAGE_SIZE - 2, "%d\n", sched_smt_active());  }  static DEVICE_ATTR(active, 0444, show_smt_active, NULL); @@ -2163,21 +2207,17 @@ static const struct attribute_group cpuhp_smt_attr_group = {  	NULL  }; -static int __init cpu_smt_state_init(void) +static int __init cpu_smt_sysfs_init(void)  {  	return sysfs_create_group(&cpu_subsys.dev_root->kobj,  				  &cpuhp_smt_attr_group);  } -#else -static inline int cpu_smt_state_init(void) { return 0; } -#endif -  static int __init cpuhp_sysfs_init(void)  {  	int cpu, ret; -	ret = cpu_smt_state_init(); +	ret = cpu_smt_sysfs_init();  	if (ret)  		return ret; @@ -2198,7 +2238,7 @@ static int __init cpuhp_sysfs_init(void)  	return 0;  }  device_initcall(cpuhp_sysfs_init); -#endif +#endif /* CONFIG_SYSFS && CONFIG_HOTPLUG_CPU */  /*   * cpu_bit_bitmap[] is a special, "compressed" data structure that @@ -2288,3 +2328,18 @@ void __init boot_cpu_hotplug_init(void)  #endif  	this_cpu_write(cpuhp_state.state, CPUHP_ONLINE);  } + +enum cpu_mitigations cpu_mitigations __ro_after_init = CPU_MITIGATIONS_AUTO; + +static int __init mitigations_parse_cmdline(char *arg) +{ +	if (!strcmp(arg, "off")) +		cpu_mitigations = CPU_MITIGATIONS_OFF; +	else if (!strcmp(arg, "auto")) +		cpu_mitigations = CPU_MITIGATIONS_AUTO; +	else if (!strcmp(arg, "auto,nosmt")) +		cpu_mitigations = CPU_MITIGATIONS_AUTO_NOSMT; + +	return 0; +} +early_param("mitigations", mitigations_parse_cmdline); diff --git a/kernel/crash_dump.c b/kernel/crash_dump.c index b64e238b553b..9c23ae074b40 100644 --- a/kernel/crash_dump.c +++ b/kernel/crash_dump.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only  #include <linux/kernel.h>  #include <linux/crash_dump.h>  #include <linux/init.h> diff --git a/kernel/cred.c b/kernel/cred.c index 45d77284aed0..e74ffdc98a92 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -1,12 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0-or-later  /* Task credentials management - see Documentation/security/credentials.rst   *   * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.   * Written by David Howells (dhowells@redhat.com) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public Licence - * as published by the Free Software Foundation; either version - * 2 of the Licence, or (at your option) any later version.   
*/  #include <linux/export.h>  #include <linux/cred.h> diff --git a/kernel/debug/Makefile b/kernel/debug/Makefile index a85edc339985..332ee6c6ec2c 100644 --- a/kernel/debug/Makefile +++ b/kernel/debug/Makefile @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0-only  #  # Makefile for the linux kernel debugger  # diff --git a/kernel/debug/gdbstub.c b/kernel/debug/gdbstub.c index 7510dc687c0d..4b280fc7dd67 100644 --- a/kernel/debug/gdbstub.c +++ b/kernel/debug/gdbstub.c @@ -1033,13 +1033,14 @@ int gdb_serial_stub(struct kgdb_state *ks)  				return DBG_PASS_EVENT;  			}  #endif +			/* Fall through */  		case 'C': /* Exception passing */  			tmp = gdb_cmd_exception_pass(ks);  			if (tmp > 0)  				goto default_handle;  			if (tmp == 0)  				break; -			/* Fall through on tmp < 0 */ +			/* Fall through - on tmp < 0 */  		case 'c': /* Continue packet */  		case 's': /* Single step packet */  			if (kgdb_contthread && kgdb_contthread != current) { @@ -1048,7 +1049,7 @@ int gdb_serial_stub(struct kgdb_state *ks)  				break;  			}  			dbg_activate_sw_breakpoints(); -			/* Fall through to default processing */ +			/* Fall through - to default processing */  		default:  default_handle:  			error = kgdb_arch_handle_exception(ks->ex_vector, @@ -1094,10 +1095,10 @@ int gdbstub_state(struct kgdb_state *ks, char *cmd)  		return error;  	case 's':  	case 'c': -		strcpy(remcom_in_buffer, cmd); +		strscpy(remcom_in_buffer, cmd, sizeof(remcom_in_buffer));  		return 0;  	case '$': -		strcpy(remcom_in_buffer, cmd); +		strscpy(remcom_in_buffer, cmd, sizeof(remcom_in_buffer));  		gdbstub_use_prev_in_buf = strlen(remcom_in_buffer);  		gdbstub_prev_in_buf_pos = 0;  		return 0; diff --git a/kernel/debug/kdb/Makefile b/kernel/debug/kdb/Makefile index d4fc58f4b88d..efac857c5511 100644 --- a/kernel/debug/kdb/Makefile +++ b/kernel/debug/kdb/Makefile @@ -6,7 +6,6 @@  # Copyright (c) 2009 Wind River Systems, Inc. All Rights Reserved.  
# -CCVERSION	:= $(shell $(CC) -v 2>&1 | sed -ne '$$p')  obj-y := kdb_io.o kdb_main.o kdb_support.o kdb_bt.o gen-kdb_cmds.o kdb_bp.o kdb_debugger.o  obj-$(CONFIG_KDB_KEYBOARD)    += kdb_keyboard.o diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c index 6a4b41484afe..3a5184eb6977 100644 --- a/kernel/debug/kdb/kdb_io.c +++ b/kernel/debug/kdb/kdb_io.c @@ -446,7 +446,7 @@ poll_again:  char *kdb_getstr(char *buffer, size_t bufsize, const char *prompt)  {  	if (prompt && kdb_prompt_str != prompt) -		strncpy(kdb_prompt_str, prompt, CMD_BUFLEN); +		strscpy(kdb_prompt_str, prompt, CMD_BUFLEN);  	kdb_printf(kdb_prompt_str);  	kdb_nextline = 1;	/* Prompt and input resets line number */  	return kdb_read(buffer, bufsize); diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index 82a3b32a7cfc..9ecfa37c7fbf 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -2522,7 +2522,6 @@ static int kdb_summary(int argc, const char **argv)  	kdb_printf("machine    %s\n", init_uts_ns.name.machine);  	kdb_printf("nodename   %s\n", init_uts_ns.name.nodename);  	kdb_printf("domainname %s\n", init_uts_ns.name.domainname); -	kdb_printf("ccversion  %s\n", __stringify(CCVERSION));  	now = __ktime_get_real_seconds();  	time64_to_tm(now, 0, &tm); @@ -2584,7 +2583,7 @@ static int kdb_per_cpu(int argc, const char **argv)  		diag = kdbgetularg(argv[3], &whichcpu);  		if (diag)  			return diag; -		if (!cpu_online(whichcpu)) { +		if (whichcpu >= nr_cpu_ids || !cpu_online(whichcpu)) {  			kdb_printf("cpu %ld is not online\n", whichcpu);  			return KDB_BADCPUNUM;  		} diff --git a/kernel/debug/kdb/kdb_support.c b/kernel/debug/kdb/kdb_support.c index 50bf9b119bad..b8e6306e7e13 100644 --- a/kernel/debug/kdb/kdb_support.c +++ b/kernel/debug/kdb/kdb_support.c @@ -192,7 +192,7 @@ int kallsyms_symbol_complete(char *prefix_name, int max_len)  	while ((name = kdb_walk_kallsyms(&pos))) {  		if (strncmp(name, prefix_name, prefix_len) == 0) { -			strcpy(ks_namebuf, name); +			strscpy(ks_namebuf, name, sizeof(ks_namebuf));  			/* Work out the longest name that matches the prefix */  			if (++number == 1) {  				prev_len = min_t(int, max_len-1, diff --git a/kernel/delayacct.c b/kernel/delayacct.c index 2a12b988c717..27725754ac99 100644 --- a/kernel/delayacct.c +++ b/kernel/delayacct.c @@ -1,16 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0-or-later  /* delayacct.c - per-task delay accounting   *   * Copyright (C) Shailabh Nagar, IBM Corp. 2006 - * - * This program is free software;  you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it would be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See - * the GNU General Public License for more details.   
*/  #include <linux/sched.h> diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig index a06ba3013b3b..70f8f8d9200e 100644 --- a/kernel/dma/Kconfig +++ b/kernel/dma/Kconfig @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0-only  config HAS_DMA  	bool @@ -38,6 +39,9 @@ config ARCH_HAS_SYNC_DMA_FOR_CPU  config ARCH_HAS_SYNC_DMA_FOR_CPU_ALL  	bool +config ARCH_HAS_DMA_PREP_COHERENT +	bool +  config ARCH_HAS_DMA_COHERENT_TO_PFN  	bool @@ -57,6 +61,7 @@ config SWIOTLB  config DMA_REMAP  	depends on MMU +	select GENERIC_ALLOCATOR  	bool  config DMA_DIRECT_REMAP diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c index 45d51e8e26f6..badd77670d00 100644 --- a/kernel/dma/debug.c +++ b/kernel/dma/debug.c @@ -89,8 +89,8 @@ struct dma_debug_entry {  	int		 sg_mapped_ents;  	enum map_err_types  map_err_type;  #ifdef CONFIG_STACKTRACE -	struct		 stack_trace stacktrace; -	unsigned long	 st_entries[DMA_DEBUG_STACKTRACE_ENTRIES]; +	unsigned int	stack_len; +	unsigned long	stack_entries[DMA_DEBUG_STACKTRACE_ENTRIES];  #endif  }; @@ -174,7 +174,7 @@ static inline void dump_entry_trace(struct dma_debug_entry *entry)  #ifdef CONFIG_STACKTRACE  	if (entry) {  		pr_warning("Mapped at:\n"); -		print_stack_trace(&entry->stacktrace, 0); +		stack_trace_print(entry->stack_entries, entry->stack_len, 0);  	}  #endif  } @@ -704,12 +704,10 @@ static struct dma_debug_entry *dma_entry_alloc(void)  	spin_unlock_irqrestore(&free_entries_lock, flags);  #ifdef CONFIG_STACKTRACE -	entry->stacktrace.max_entries = DMA_DEBUG_STACKTRACE_ENTRIES; -	entry->stacktrace.entries = entry->st_entries; -	entry->stacktrace.skip = 2; -	save_stack_trace(&entry->stacktrace); +	entry->stack_len = stack_trace_save(entry->stack_entries, +					    ARRAY_SIZE(entry->stack_entries), +					    1);  #endif -  	return entry;  } diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index fcdb23e8d2fc..2c2772e9702a 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -311,7 +311,7 @@ static inline bool dma_direct_possible(struct device *dev, dma_addr_t dma_addr,  		size_t size)  {  	return swiotlb_force != SWIOTLB_FORCE && -		(!dev || dma_capable(dev, dma_addr, size)); +		dma_capable(dev, dma_addr, size);  }  dma_addr_t dma_direct_map_page(struct device *dev, struct page *page, diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index c000906348c9..f7afdadb6770 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -238,17 +238,13 @@ u64 dma_get_required_mask(struct device *dev)  }  EXPORT_SYMBOL_GPL(dma_get_required_mask); -#ifndef arch_dma_alloc_attrs -#define arch_dma_alloc_attrs(dev)	(true) -#endif -  void *dma_alloc_attrs(struct device *dev, size_t size, dma_addr_t *dma_handle,  		gfp_t flag, unsigned long attrs)  {  	const struct dma_map_ops *ops = get_dma_ops(dev);  	void *cpu_addr; -	WARN_ON_ONCE(dev && !dev->coherent_dma_mask); +	WARN_ON_ONCE(!dev->coherent_dma_mask);  	if (dma_alloc_from_dev_coherent(dev, size, dma_handle, &cpu_addr))  		return cpu_addr; @@ -256,9 +252,6 @@ void *dma_alloc_attrs(struct device *dev, size_t size, dma_addr_t *dma_handle,  	/* let the implementation decide on the zone to allocate from: */  	flag &= ~(__GFP_DMA | __GFP_DMA32 | __GFP_HIGHMEM); -	if (!arch_dma_alloc_attrs(&dev)) -		return NULL; -  	if (dma_is_direct(ops))  		cpu_addr = dma_direct_alloc(dev, size, dma_handle, flag, attrs);  	else if (ops->alloc) diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 53012db1e53c..13f0cb080a4d 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -1,3 +1,4 @@ +// 
SPDX-License-Identifier: GPL-2.0-only  /*   * Dynamic DMA mapping support.   * @@ -452,6 +453,7 @@ phys_addr_t swiotlb_tbl_map_single(struct device *hwdev,  	unsigned long mask;  	unsigned long offset_slots;  	unsigned long max_slots; +	unsigned long tmp_io_tlb_used;  	if (no_iotlb_memory)  		panic("Can not allocate SWIOTLB buffer earlier and can't now provide you with the DMA bounce buffer"); @@ -538,9 +540,12 @@ phys_addr_t swiotlb_tbl_map_single(struct device *hwdev,  	} while (index != wrap);  not_found: +	tmp_io_tlb_used = io_tlb_used; +  	spin_unlock_irqrestore(&io_tlb_lock, flags);  	if (!(attrs & DMA_ATTR_NO_WARN) && printk_ratelimit()) -		dev_warn(hwdev, "swiotlb buffer is full (sz: %zd bytes)\n", size); +		dev_warn(hwdev, "swiotlb buffer is full (sz: %zd bytes), total %lu (slots), used %lu (slots)\n", +			 size, io_tlb_nslabs, tmp_io_tlb_used);  	return DMA_MAPPING_ERROR;  found:  	io_tlb_used += nslots; diff --git a/kernel/events/core.c b/kernel/events/core.c index 1032a16bd186..abbd4b3b96c2 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -2009,8 +2009,8 @@ event_sched_out(struct perf_event *event,  	event->pmu->del(event, 0);  	event->oncpu = -1; -	if (event->pending_disable) { -		event->pending_disable = 0; +	if (READ_ONCE(event->pending_disable) >= 0) { +		WRITE_ONCE(event->pending_disable, -1);  		state = PERF_EVENT_STATE_OFF;  	}  	perf_event_set_state(event, state); @@ -2198,7 +2198,8 @@ EXPORT_SYMBOL_GPL(perf_event_disable);  void perf_event_disable_inatomic(struct perf_event *event)  { -	event->pending_disable = 1; +	WRITE_ONCE(event->pending_disable, smp_processor_id()); +	/* can fail, see perf_pending_event_disable() */  	irq_work_queue(&event->pending);  } @@ -2477,6 +2478,16 @@ static void ctx_resched(struct perf_cpu_context *cpuctx,  	perf_pmu_enable(cpuctx->ctx.pmu);  } +void perf_pmu_resched(struct pmu *pmu) +{ +	struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); +	struct perf_event_context *task_ctx = cpuctx->task_ctx; + +	perf_ctx_lock(cpuctx, task_ctx); +	ctx_resched(cpuctx, task_ctx, EVENT_ALL|EVENT_CPU); +	perf_ctx_unlock(cpuctx, task_ctx); +} +  /*   * Cross CPU call to install and enable a performance event   * @@ -5810,10 +5821,45 @@ void perf_event_wakeup(struct perf_event *event)  	}  } +static void perf_pending_event_disable(struct perf_event *event) +{ +	int cpu = READ_ONCE(event->pending_disable); + +	if (cpu < 0) +		return; + +	if (cpu == smp_processor_id()) { +		WRITE_ONCE(event->pending_disable, -1); +		perf_event_disable_local(event); +		return; +	} + +	/* +	 *  CPU-A			CPU-B +	 * +	 *  perf_event_disable_inatomic() +	 *    @pending_disable = CPU-A; +	 *    irq_work_queue(); +	 * +	 *  sched-out +	 *    @pending_disable = -1; +	 * +	 *				sched-in +	 *				perf_event_disable_inatomic() +	 *				  @pending_disable = CPU-B; +	 *				  irq_work_queue(); // FAILS +	 * +	 *  irq_work_run() +	 *    perf_pending_event() +	 * +	 * But the event runs on CPU-B and wants disabling there. +	 */ +	irq_work_queue_on(&event->pending, cpu); +} +  static void perf_pending_event(struct irq_work *entry)  { -	struct perf_event *event = container_of(entry, -			struct perf_event, pending); +	struct perf_event *event = container_of(entry, struct perf_event, pending);  	int rctx;  	rctx = perf_swevent_get_recursion_context(); @@ -5822,10 +5868,7 @@ static void perf_pending_event(struct irq_work *entry)  	 * and we won't recurse 'further'.  	 
*/ -	if (event->pending_disable) { -		event->pending_disable = 0; -		perf_event_disable_local(event); -	} +	perf_pending_event_disable(event);  	if (event->pending_wakeup) {  		event->pending_wakeup = 0; @@ -7189,6 +7232,7 @@ static void perf_event_mmap_output(struct perf_event *event,  	struct perf_output_handle handle;  	struct perf_sample_data sample;  	int size = mmap_event->event_id.header.size; +	u32 type = mmap_event->event_id.header.type;  	int ret;  	if (!perf_event_mmap_match(event, data)) @@ -7232,6 +7276,7 @@ static void perf_event_mmap_output(struct perf_event *event,  	perf_output_end(&handle);  out:  	mmap_event->event_id.header.size = size; +	mmap_event->event_id.header.type = type;  }  static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) @@ -9042,26 +9087,29 @@ static void perf_event_addr_filters_apply(struct perf_event *event)  	if (task == TASK_TOMBSTONE)  		return; -	if (!ifh->nr_file_filters) -		return; - -	mm = get_task_mm(event->ctx->task); -	if (!mm) -		goto restart; +	if (ifh->nr_file_filters) { +		mm = get_task_mm(event->ctx->task); +		if (!mm) +			goto restart; -	down_read(&mm->mmap_sem); +		down_read(&mm->mmap_sem); +	}  	raw_spin_lock_irqsave(&ifh->lock, flags);  	list_for_each_entry(filter, &ifh->list, entry) { -		event->addr_filter_ranges[count].start = 0; -		event->addr_filter_ranges[count].size = 0; +		if (filter->path.dentry) { +			/* +			 * Adjust base offset if the filter is associated to a +			 * binary that needs to be mapped: +			 */ +			event->addr_filter_ranges[count].start = 0; +			event->addr_filter_ranges[count].size = 0; -		/* -		 * Adjust base offset if the filter is associated to a binary -		 * that needs to be mapped: -		 */ -		if (filter->path.dentry)  			perf_addr_filter_apply(filter, mm, &event->addr_filter_ranges[count]); +		} else { +			event->addr_filter_ranges[count].start = filter->offset; +			event->addr_filter_ranges[count].size  = filter->size; +		}  		count++;  	} @@ -9069,9 +9117,11 @@ static void perf_event_addr_filters_apply(struct perf_event *event)  	event->addr_filters_gen++;  	raw_spin_unlock_irqrestore(&ifh->lock, flags); -	up_read(&mm->mmap_sem); +	if (ifh->nr_file_filters) { +		up_read(&mm->mmap_sem); -	mmput(mm); +		mmput(mm); +	}  restart:  	perf_event_stop(event, 1); @@ -10234,6 +10284,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,  	init_waitqueue_head(&event->waitq); +	event->pending_disable = -1;  	init_irq_work(&event->pending, perf_pending_event);  	mutex_init(&event->mmap_mutex); @@ -11876,7 +11927,7 @@ static void __init perf_event_init_all_cpus(void)  	}  } -void perf_swevent_init_cpu(unsigned int cpu) +static void perf_swevent_init_cpu(unsigned int cpu)  {  	struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); diff --git a/kernel/events/internal.h b/kernel/events/internal.h index 79c47076700a..3aef4191798c 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h @@ -24,7 +24,7 @@ struct ring_buffer {  	atomic_t			poll;		/* POLL_ for wakeups */  	local_t				head;		/* write position    */ -	local_t				nest;		/* nested writers    */ +	unsigned int			nest;		/* nested writers    */  	local_t				events;		/* event limit       */  	local_t				wakeup;		/* wakeup stamp      */  	local_t				lost;		/* nr records lost   */ @@ -41,7 +41,7 @@ struct ring_buffer {  	/* AUX area */  	long				aux_head; -	local_t				aux_nest; +	unsigned int			aux_nest;  	long				aux_wakeup;	/* last aux_watermark boundary crossed by aux_head */  	unsigned long			aux_pgoff;  	int				
aux_nr_pages; diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index a4047321d7d8..ffb59a4ef4ff 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -38,7 +38,12 @@ static void perf_output_get_handle(struct perf_output_handle *handle)  	struct ring_buffer *rb = handle->rb;  	preempt_disable(); -	local_inc(&rb->nest); + +	/* +	 * Avoid an explicit LOAD/STORE such that architectures with memops +	 * can use them. +	 */ +	(*(volatile unsigned int *)&rb->nest)++;  	handle->wakeup = local_read(&rb->wakeup);  } @@ -46,17 +51,35 @@ static void perf_output_put_handle(struct perf_output_handle *handle)  {  	struct ring_buffer *rb = handle->rb;  	unsigned long head; +	unsigned int nest; + +	/* +	 * If this isn't the outermost nesting, we don't have to update +	 * @rb->user_page->data_head. +	 */ +	nest = READ_ONCE(rb->nest); +	if (nest > 1) { +		WRITE_ONCE(rb->nest, nest - 1); +		goto out; +	}  again: +	/* +	 * In order to avoid publishing a head value that goes backwards, +	 * we must ensure the load of @rb->head happens after we've +	 * incremented @rb->nest. +	 * +	 * Otherwise we can observe a @rb->head value before one published +	 * by an IRQ/NMI happening between the load and the increment. +	 */ +	barrier();  	head = local_read(&rb->head);  	/* -	 * IRQ/NMI can happen here, which means we can miss a head update. +	 * IRQ/NMI can happen here and advance @rb->head, causing our +	 * load above to be stale.  	 */ -	if (!local_dec_and_test(&rb->nest)) -		goto out; -  	/*  	 * Since the mmap() consumer (userspace) can run on a different CPU:  	 * @@ -84,14 +107,23 @@ again:  	 * See perf_output_begin().  	 */  	smp_wmb(); /* B, matches C */ -	rb->user_page->data_head = head; +	WRITE_ONCE(rb->user_page->data_head, head);  	/* -	 * Now check if we missed an update -- rely on previous implied -	 * compiler barriers to force a re-read. +	 * We must publish the head before decrementing the nest count, +	 * otherwise an IRQ/NMI can publish a more recent head value and our +	 * write will (temporarily) publish a stale value.  	 */ +	barrier(); +	WRITE_ONCE(rb->nest, 0); + +	/* +	 * Ensure we decrement @rb->nest before we validate the @rb->head. +	 * Otherwise we cannot be sure we caught the 'last' nested update. 
+	 */ +	barrier();  	if (unlikely(head != local_read(&rb->head))) { -		local_inc(&rb->nest); +		WRITE_ONCE(rb->nest, 1);  		goto again;  	} @@ -330,6 +362,7 @@ void *perf_aux_output_begin(struct perf_output_handle *handle,  	struct perf_event *output_event = event;  	unsigned long aux_head, aux_tail;  	struct ring_buffer *rb; +	unsigned int nest;  	if (output_event->parent)  		output_event = output_event->parent; @@ -360,13 +393,16 @@ void *perf_aux_output_begin(struct perf_output_handle *handle,  	if (!refcount_inc_not_zero(&rb->aux_refcount))  		goto err; +	nest = READ_ONCE(rb->aux_nest);  	/*  	 * Nesting is not supported for AUX area, make sure nested  	 * writers are caught early  	 */ -	if (WARN_ON_ONCE(local_xchg(&rb->aux_nest, 1))) +	if (WARN_ON_ONCE(nest))  		goto err_put; +	WRITE_ONCE(rb->aux_nest, nest + 1); +  	aux_head = rb->aux_head;  	handle->rb = rb; @@ -392,9 +428,9 @@ void *perf_aux_output_begin(struct perf_output_handle *handle,  		 * store that will be enabled on successful return  		 */  		if (!handle->size) { /* A, matches D */ -			event->pending_disable = 1; +			event->pending_disable = smp_processor_id();  			perf_output_wakeup(handle); -			local_set(&rb->aux_nest, 0); +			WRITE_ONCE(rb->aux_nest, 0);  			goto err_put;  		}  	} @@ -455,38 +491,35 @@ void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size)  		rb->aux_head += size;  	} -	if (size || handle->aux_flags) { -		/* -		 * Only send RECORD_AUX if we have something useful to communicate -		 * -		 * Note: the OVERWRITE records by themselves are not considered -		 * useful, as they don't communicate any *new* information, -		 * aside from the short-lived offset, that becomes history at -		 * the next event sched-in and therefore isn't useful. -		 * The userspace that needs to copy out AUX data in overwrite -		 * mode should know to use user_page::aux_head for the actual -		 * offset. So, from now on we don't output AUX records that -		 * have *only* OVERWRITE flag set. -		 */ - -		if (handle->aux_flags & ~(u64)PERF_AUX_FLAG_OVERWRITE) -			perf_event_aux_event(handle->event, aux_head, size, -			                     handle->aux_flags); -	} +	/* +	 * Only send RECORD_AUX if we have something useful to communicate +	 * +	 * Note: the OVERWRITE records by themselves are not considered +	 * useful, as they don't communicate any *new* information, +	 * aside from the short-lived offset, that becomes history at +	 * the next event sched-in and therefore isn't useful. +	 * The userspace that needs to copy out AUX data in overwrite +	 * mode should know to use user_page::aux_head for the actual +	 * offset. So, from now on we don't output AUX records that +	 * have *only* OVERWRITE flag set. 
+	 */ +	if (size || (handle->aux_flags & ~(u64)PERF_AUX_FLAG_OVERWRITE)) +		perf_event_aux_event(handle->event, aux_head, size, +				     handle->aux_flags); -	rb->user_page->aux_head = rb->aux_head; +	WRITE_ONCE(rb->user_page->aux_head, rb->aux_head);  	if (rb_need_aux_wakeup(rb))  		wakeup = true;  	if (wakeup) {  		if (handle->aux_flags & PERF_AUX_FLAG_TRUNCATED) -			handle->event->pending_disable = 1; +			handle->event->pending_disable = smp_processor_id();  		perf_output_wakeup(handle);  	}  	handle->event = NULL; -	local_set(&rb->aux_nest, 0); +	WRITE_ONCE(rb->aux_nest, 0);  	/* can't be last */  	rb_free_aux(rb);  	ring_buffer_put(rb); @@ -506,7 +539,7 @@ int perf_aux_output_skip(struct perf_output_handle *handle, unsigned long size)  	rb->aux_head += size; -	rb->user_page->aux_head = rb->aux_head; +	WRITE_ONCE(rb->user_page->aux_head, rb->aux_head);  	if (rb_need_aux_wakeup(rb)) {  		perf_output_wakeup(handle);  		handle->wakeup = rb->aux_wakeup + rb->aux_watermark; @@ -613,8 +646,7 @@ int rb_alloc_aux(struct ring_buffer *rb, struct perf_event *event,  	 * PMU requests more than one contiguous chunks of memory  	 * for SW double buffering  	 */ -	if ((event->pmu->capabilities & PERF_PMU_CAP_AUX_SW_DOUBLEBUF) && -	    !overwrite) { +	if (!overwrite) {  		if (!max_order)  			return -EINVAL; diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index c5cde87329c7..78f61bfc6b79 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -161,7 +161,8 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,  	struct mmu_notifier_range range;  	struct mem_cgroup *memcg; -	mmu_notifier_range_init(&range, mm, addr, addr + PAGE_SIZE); +	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, addr, +				addr + PAGE_SIZE);  	VM_BUG_ON_PAGE(PageTransHuge(old_page), old_page); @@ -2028,7 +2029,7 @@ static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs)  		if (uc->handler) {  			rc = uc->handler(uc, regs);  			WARN(rc & ~UPROBE_HANDLER_MASK, -				"bad rc=0x%x from %pf()\n", rc, uc->handler); +				"bad rc=0x%x from %ps()\n", rc, uc->handler);  		}  		if (uc->ret_handler) @@ -2294,16 +2295,14 @@ static struct notifier_block uprobe_exception_nb = {  	.priority		= INT_MAX-1,	/* notified after kprobes, kgdb */  }; -static int __init init_uprobes(void) +void __init uprobes_init(void)  {  	int i;  	for (i = 0; i < UPROBES_HASH_SZ; i++)  		mutex_init(&uprobes_mmap_mutex[i]); -	if (percpu_init_rwsem(&dup_mmap_sem)) -		return -ENOMEM; +	BUG_ON(percpu_init_rwsem(&dup_mmap_sem)); -	return register_die_notifier(&uprobe_exception_nb); +	BUG_ON(register_die_notifier(&uprobe_exception_nb));  } -__initcall(init_uprobes); diff --git a/kernel/exit.c b/kernel/exit.c index 2166c2d92ddc..1803efb2922f 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only  /*   *  linux/kernel/exit.c   * @@ -422,7 +423,7 @@ retry:  	 * freed task structure.  	 */  	if (atomic_read(&mm->mm_users) <= 1) { -		mm->owner = NULL; +		WRITE_ONCE(mm->owner, NULL);  		return;  	} @@ -462,7 +463,7 @@ retry:  	 * most likely racing with swapoff (try_to_unuse()) or /proc or  	 * ptrace or page migration (get_task_mm()).  Mark owner as NULL.  	 
*/ -	mm->owner = NULL; +	WRITE_ONCE(mm->owner, NULL);  	return;  assign_new_owner: @@ -483,7 +484,7 @@ assign_new_owner:  		put_task_struct(c);  		goto retry;  	} -	mm->owner = c; +	WRITE_ONCE(mm->owner, c);  	task_unlock(c);  	put_task_struct(c);  } diff --git a/kernel/extable.c b/kernel/extable.c index 6a5b61ebc66c..e23cce6e6092 100644 --- a/kernel/extable.c +++ b/kernel/extable.c @@ -1,19 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0-or-later  /* Rewritten by Rusty Russell, on the backs of many others...     Copyright (C) 2001 Rusty Russell, 2002 Rusty Russell IBM. -    This program is free software; you can redistribute it and/or modify -    it under the terms of the GNU General Public License as published by -    the Free Software Foundation; either version 2 of the License, or -    (at your option) any later version. - -    This program is distributed in the hope that it will be useful, -    but WITHOUT ANY WARRANTY; without even the implied warranty of -    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the -    GNU General Public License for more details. - -    You should have received a copy of the GNU General Public License -    along with this program; if not, write to the Free Software -    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA  */  #include <linux/ftrace.h>  #include <linux/memory.h> diff --git a/kernel/fail_function.c b/kernel/fail_function.c index 17f75b545f66..feb80712b913 100644 --- a/kernel/fail_function.c +++ b/kernel/fail_function.c @@ -210,7 +210,7 @@ static int fei_seq_show(struct seq_file *m, void *v)  {  	struct fei_attr *attr = list_entry(v, struct fei_attr, list); -	seq_printf(m, "%pf\n", attr->kp.addr); +	seq_printf(m, "%ps\n", attr->kp.addr);  	return 0;  } diff --git a/kernel/fork.c b/kernel/fork.c index 9dcd18aa210b..75675b9bf6df 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only  /*   *  linux/kernel/fork.c   * @@ -11,6 +12,7 @@   * management can be a bitch. See 'mm/memory.c': 'copy_page_range()'   */ +#include <linux/anon_inodes.h>  #include <linux/slab.h>  #include <linux/sched/autogroup.h>  #include <linux/sched/mm.h> @@ -21,6 +23,7 @@  #include <linux/sched/task.h>  #include <linux/sched/task_stack.h>  #include <linux/sched/cputime.h> +#include <linux/seq_file.h>  #include <linux/rtmutex.h>  #include <linux/init.h>  #include <linux/unistd.h> @@ -120,7 +123,7 @@  unsigned long total_forks;	/* Handle normal Linux uptimes. */  int nr_threads;			/* The idle threads do not count.. 
*/ -int max_threads;		/* tunable limit on nr_threads */ +static int max_threads;		/* tunable limit on nr_threads */  DEFINE_PER_CPU(unsigned long, process_counts) = 0; @@ -815,6 +818,7 @@ void __init fork_init(void)  #endif  	lockdep_init_task(&init_task); +	uprobes_init();  }  int __weak arch_dup_task_struct(struct task_struct *dst, @@ -952,6 +956,15 @@ static void mm_init_aio(struct mm_struct *mm)  #endif  } +static __always_inline void mm_clear_owner(struct mm_struct *mm, +					   struct task_struct *p) +{ +#ifdef CONFIG_MEMCG +	if (mm->owner == p) +		WRITE_ONCE(mm->owner, NULL); +#endif +} +  static void mm_init_owner(struct mm_struct *mm, struct task_struct *p)  {  #ifdef CONFIG_MEMCG @@ -1222,7 +1235,9 @@ static int wait_for_vfork_done(struct task_struct *child,  	int killed;  	freezer_do_not_count(); +	cgroup_enter_frozen();  	killed = wait_for_completion_killable(vfork); +	cgroup_leave_frozen(false);  	freezer_count();  	if (killed) { @@ -1298,13 +1313,20 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm)  		complete_vfork_done(tsk);  } -/* - * Allocate a new mm structure and copy contents from the - * mm structure of the passed in task structure. +/** + * dup_mm() - duplicates an existing mm structure + * @tsk: the task_struct with which the new mm will be associated. + * @oldmm: the mm to duplicate. + * + * Allocates a new mm structure and duplicates the provided @oldmm structure + * content into it. + * + * Return: the duplicated mm or NULL on failure.   */ -static struct mm_struct *dup_mm(struct task_struct *tsk) +static struct mm_struct *dup_mm(struct task_struct *tsk, +				struct mm_struct *oldmm)  { -	struct mm_struct *mm, *oldmm = current->mm; +	struct mm_struct *mm;  	int err;  	mm = allocate_mm(); @@ -1331,6 +1353,7 @@ static struct mm_struct *dup_mm(struct task_struct *tsk)  free_pt:  	/* don't put binfmt in mmput, we haven't got module yet */  	mm->binfmt = NULL; +	mm_init_owner(mm, NULL);  	mmput(mm);  fail_nomem: @@ -1371,7 +1394,7 @@ static int copy_mm(unsigned long clone_flags, struct task_struct *tsk)  	}  	retval = -ENOMEM; -	mm = dup_mm(tsk); +	mm = dup_mm(tsk, current->mm);  	if (!mm)  		goto fail_nomem; @@ -1662,6 +1685,73 @@ static inline void rcu_copy_process(struct task_struct *p)  #endif /* #ifdef CONFIG_TASKS_RCU */  } +static int pidfd_release(struct inode *inode, struct file *file) +{ +	struct pid *pid = file->private_data; + +	file->private_data = NULL; +	put_pid(pid); +	return 0; +} + +#ifdef CONFIG_PROC_FS +static void pidfd_show_fdinfo(struct seq_file *m, struct file *f) +{ +	struct pid_namespace *ns = proc_pid_ns(file_inode(m->file)); +	struct pid *pid = f->private_data; + +	seq_put_decimal_ull(m, "Pid:\t", pid_nr_ns(pid, ns)); +	seq_putc(m, '\n'); +} +#endif + +const struct file_operations pidfd_fops = { +	.release = pidfd_release, +#ifdef CONFIG_PROC_FS +	.show_fdinfo = pidfd_show_fdinfo, +#endif +}; + +/** + * pidfd_create() - Create a new pid file descriptor. + * + * @pid:  struct pid that the pidfd will reference + * + * This creates a new pid file descriptor with the O_CLOEXEC flag set. + * + * Note, that this function can only be called after the fd table has + * been unshared to avoid leaking the pidfd to the new process. + * + * Return: On success, a cloexec pidfd is returned. + *         On error, a negative errno number will be returned. 
+ */ +static int pidfd_create(struct pid *pid) +{ +	int fd; + +	fd = anon_inode_getfd("[pidfd]", &pidfd_fops, get_pid(pid), +			      O_RDWR | O_CLOEXEC); +	if (fd < 0) +		put_pid(pid); + +	return fd; +} + +static void __delayed_free_task(struct rcu_head *rhp) +{ +	struct task_struct *tsk = container_of(rhp, struct task_struct, rcu); + +	free_task(tsk); +} + +static __always_inline void delayed_free_task(struct task_struct *tsk) +{ +	if (IS_ENABLED(CONFIG_MEMCG)) +		call_rcu(&tsk->rcu, __delayed_free_task); +	else +		free_task(tsk); +} +  /*   * This creates a new process as a copy of the old one,   * but does not actually start it yet. @@ -1674,13 +1764,14 @@ static __latent_entropy struct task_struct *copy_process(  					unsigned long clone_flags,  					unsigned long stack_start,  					unsigned long stack_size, +					int __user *parent_tidptr,  					int __user *child_tidptr,  					struct pid *pid,  					int trace,  					unsigned long tls,  					int node)  { -	int retval; +	int pidfd = -1, retval;  	struct task_struct *p;  	struct multiprocess_signals delayed; @@ -1730,6 +1821,31 @@ static __latent_entropy struct task_struct *copy_process(  			return ERR_PTR(-EINVAL);  	} +	if (clone_flags & CLONE_PIDFD) { +		int reserved; + +		/* +		 * - CLONE_PARENT_SETTID is useless for pidfds and also +		 *   parent_tidptr is used to return pidfds. +		 * - CLONE_DETACHED is blocked so that we can potentially +		 *   reuse it later for CLONE_PIDFD. +		 * - CLONE_THREAD is blocked until someone really needs it. +		 */ +		if (clone_flags & +		    (CLONE_DETACHED | CLONE_PARENT_SETTID | CLONE_THREAD)) +			return ERR_PTR(-EINVAL); + +		/* +		 * Verify that parent_tidptr is sane so we can potentially +		 * reuse it later. +		 */ +		if (get_user(reserved, parent_tidptr)) +			return ERR_PTR(-EFAULT); + +		if (reserved != 0) +			return ERR_PTR(-EINVAL); +	} +  	/*  	 * Force any signals received before this point to be delivered  	 * before the fork happens.  Collect up signals sent to multiple @@ -1936,6 +2052,22 @@ static __latent_entropy struct task_struct *copy_process(  		}  	} +	/* +	 * This has to happen after we've potentially unshared the file +	 * descriptor table (so that the pidfd doesn't leak into the child +	 * if the fd table isn't shared). +	 */ +	if (clone_flags & CLONE_PIDFD) { +		retval = pidfd_create(pid); +		if (retval < 0) +			goto bad_fork_free_pid; + +		pidfd = retval; +		retval = put_user(pidfd, parent_tidptr); +		if (retval) +			goto bad_fork_put_pidfd; +	} +  #ifdef CONFIG_BLOCK  	p->plug = NULL;  #endif @@ -1962,7 +2094,7 @@ static __latent_entropy struct task_struct *copy_process(  #ifdef TIF_SYSCALL_EMU  	clear_tsk_thread_flag(p, TIF_SYSCALL_EMU);  #endif -	clear_all_latency_tracing(p); +	clear_tsk_latency_tracing(p);  	/* ok, now we should be set up.. 
*/  	p->pid = pid_nr(pid); @@ -1996,7 +2128,7 @@ static __latent_entropy struct task_struct *copy_process(  	 */  	retval = cgroup_can_fork(p);  	if (retval) -		goto bad_fork_free_pid; +		goto bad_fork_cgroup_threadgroup_change_end;  	/*  	 * From this point on we must avoid any synchronous user-space @@ -2111,8 +2243,12 @@ bad_fork_cancel_cgroup:  	spin_unlock(¤t->sighand->siglock);  	write_unlock_irq(&tasklist_lock);  	cgroup_cancel_fork(p); -bad_fork_free_pid: +bad_fork_cgroup_threadgroup_change_end:  	cgroup_threadgroup_change_end(current); +bad_fork_put_pidfd: +	if (clone_flags & CLONE_PIDFD) +		ksys_close(pidfd); +bad_fork_free_pid:  	if (pid != &init_struct_pid)  		free_pid(pid);  bad_fork_cleanup_thread: @@ -2123,8 +2259,10 @@ bad_fork_cleanup_io:  bad_fork_cleanup_namespaces:  	exit_task_namespaces(p);  bad_fork_cleanup_mm: -	if (p->mm) +	if (p->mm) { +		mm_clear_owner(p->mm, p);  		mmput(p->mm); +	}  bad_fork_cleanup_signal:  	if (!(clone_flags & CLONE_THREAD))  		free_signal_struct(p->signal); @@ -2155,7 +2293,7 @@ bad_fork_cleanup_count:  bad_fork_free:  	p->state = TASK_DEAD;  	put_task_stack(p); -	free_task(p); +	delayed_free_task(p);  fork_out:  	spin_lock_irq(¤t->sighand->siglock);  	hlist_del_init(&delayed.node); @@ -2176,7 +2314,7 @@ static inline void init_idle_pids(struct task_struct *idle)  struct task_struct *fork_idle(int cpu)  {  	struct task_struct *task; -	task = copy_process(CLONE_VM, 0, 0, NULL, &init_struct_pid, 0, 0, +	task = copy_process(CLONE_VM, 0, 0, NULL, NULL, &init_struct_pid, 0, 0,  			    cpu_to_node(cpu));  	if (!IS_ERR(task)) {  		init_idle_pids(task); @@ -2186,6 +2324,11 @@ struct task_struct *fork_idle(int cpu)  	return task;  } +struct mm_struct *copy_init_mm(void) +{ +	return dup_mm(NULL, &init_mm); +} +  /*   *  Ok, this is the main fork-routine.   * @@ -2223,7 +2366,7 @@ long _do_fork(unsigned long clone_flags,  			trace = 0;  	} -	p = copy_process(clone_flags, stack_start, stack_size, +	p = copy_process(clone_flags, stack_start, stack_size, parent_tidptr,  			 child_tidptr, NULL, trace, tls, NUMA_NO_NODE);  	add_latent_entropy(); diff --git a/kernel/freezer.c b/kernel/freezer.c index b162b74611e4..c0738424bb43 100644 --- a/kernel/freezer.c +++ b/kernel/freezer.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only  /*   * kernel/freezer.c - Function to freeze a process   * diff --git a/kernel/futex.c b/kernel/futex.c index c3b73b0311bc..4b5b468c58b6 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-or-later  /*   *  Fast Userspace Mutexes (which I call "Futexes!").   *  (C) Rusty Russell, IBM 2002 @@ -29,20 +30,6 @@   *   *  "The futexes are also cursed."   *  "But they come in a choice of three flavours!" - * - *  This program is free software; you can redistribute it and/or modify - *  it under the terms of the GNU General Public License as published by - *  the Free Software Foundation; either version 2 of the License, or - *  (at your option) any later version. - * - *  This program is distributed in the hope that it will be useful, - *  but WITHOUT ANY WARRANTY; without even the implied warranty of - *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the - *  GNU General Public License for more details. 
- * - *  You should have received a copy of the GNU General Public License - *  along with this program; if not, write to the Free Software - *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA   */  #include <linux/compat.h>  #include <linux/slab.h> @@ -543,7 +530,7 @@ again:  	if (unlikely(should_fail_futex(fshared)))  		return -EFAULT; -	err = get_user_pages_fast(address, 1, 1, &page); +	err = get_user_pages_fast(address, 1, FOLL_WRITE, &page);  	/*  	 * If write access is not required (eg. FUTEX_WAIT), try  	 * and get read-only access. @@ -1311,13 +1298,15 @@ static int lookup_pi_state(u32 __user *uaddr, u32 uval,  static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval)  { +	int err;  	u32 uninitialized_var(curval);  	if (unlikely(should_fail_futex(true)))  		return -EFAULT; -	if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))) -		return -EFAULT; +	err = cmpxchg_futex_value_locked(&curval, uaddr, uval, newval); +	if (unlikely(err)) +		return err;  	/* If user space value changed, let the caller retry */  	return curval != uval ? -EAGAIN : 0; @@ -1502,10 +1491,8 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_pi_state *pi_  	if (unlikely(should_fail_futex(true)))  		ret = -EFAULT; -	if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)) { -		ret = -EFAULT; - -	} else if (curval != uval) { +	ret = cmpxchg_futex_value_locked(&curval, uaddr, uval, newval); +	if (!ret && (curval != uval)) {  		/*  		 * If a unconditional UNLOCK_PI operation (user space did not  		 * try the TID->0 transition) raced with a waiter setting the @@ -1700,32 +1687,32 @@ retry_private:  	double_lock_hb(hb1, hb2);  	op_ret = futex_atomic_op_inuser(op, uaddr2);  	if (unlikely(op_ret < 0)) { -  		double_unlock_hb(hb1, hb2); -#ifndef CONFIG_MMU -		/* -		 * we don't get EFAULT from MMU faults if we don't have an MMU, -		 * but we might get them from range checking -		 */ -		ret = op_ret; -		goto out_put_keys; -#endif - -		if (unlikely(op_ret != -EFAULT)) { +		if (!IS_ENABLED(CONFIG_MMU) || +		    unlikely(op_ret != -EFAULT && op_ret != -EAGAIN)) { +			/* +			 * we don't get EFAULT from MMU faults if we don't have +			 * an MMU, but we might get them from range checking +			 */  			ret = op_ret;  			goto out_put_keys;  		} -		ret = fault_in_user_writeable(uaddr2); -		if (ret) -			goto out_put_keys; +		if (op_ret == -EFAULT) { +			ret = fault_in_user_writeable(uaddr2); +			if (ret) +				goto out_put_keys; +		} -		if (!(flags & FLAGS_SHARED)) +		if (!(flags & FLAGS_SHARED)) { +			cond_resched();  			goto retry_private; +		}  		put_futex_key(&key2);  		put_futex_key(&key1); +		cond_resched();  		goto retry;  	} @@ -2350,7 +2337,7 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,  	u32 uval, uninitialized_var(curval), newval;  	struct task_struct *oldowner, *newowner;  	u32 newtid; -	int ret; +	int ret, err = 0;  	lockdep_assert_held(q->lock_ptr); @@ -2421,14 +2408,17 @@ retry:  	if (!pi_state->owner)  		newtid |= FUTEX_OWNER_DIED; -	if (get_futex_value_locked(&uval, uaddr)) -		goto handle_fault; +	err = get_futex_value_locked(&uval, uaddr); +	if (err) +		goto handle_err;  	for (;;) {  		newval = (uval & FUTEX_OWNER_DIED) | newtid; -		if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)) -			goto handle_fault; +		err = cmpxchg_futex_value_locked(&curval, uaddr, uval, newval); +		if (err) +			goto handle_err; +  		if (curval == uval)  			break;  		uval = curval; @@ -2456,23 +2446,37 @@ retry:  	return 
0;  	/* -	 * To handle the page fault we need to drop the locks here. That gives -	 * the other task (either the highest priority waiter itself or the -	 * task which stole the rtmutex) the chance to try the fixup of the -	 * pi_state. So once we are back from handling the fault we need to -	 * check the pi_state after reacquiring the locks and before trying to -	 * do another fixup. When the fixup has been done already we simply -	 * return. +	 * In order to reschedule or handle a page fault, we need to drop the +	 * locks here. In the case of a fault, this gives the other task +	 * (either the highest priority waiter itself or the task which stole +	 * the rtmutex) the chance to try the fixup of the pi_state. So once we +	 * are back from handling the fault we need to check the pi_state after +	 * reacquiring the locks and before trying to do another fixup. When +	 * the fixup has been done already we simply return.  	 *  	 * Note: we hold both hb->lock and pi_mutex->wait_lock. We can safely  	 * drop hb->lock since the caller owns the hb -> futex_q relation.  	 * Dropping the pi_mutex->wait_lock requires the state revalidate.  	 */ -handle_fault: +handle_err:  	raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);  	spin_unlock(q->lock_ptr); -	ret = fault_in_user_writeable(uaddr); +	switch (err) { +	case -EFAULT: +		ret = fault_in_user_writeable(uaddr); +		break; + +	case -EAGAIN: +		cond_resched(); +		ret = 0; +		break; + +	default: +		WARN_ON_ONCE(1); +		ret = err; +		break; +	}  	spin_lock(q->lock_ptr);  	raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); @@ -3041,10 +3045,8 @@ retry:  		 * A unconditional UNLOCK_PI op raced against a waiter  		 * setting the FUTEX_WAITERS bit. Try again.  		 */ -		if (ret == -EAGAIN) { -			put_futex_key(&key); -			goto retry; -		} +		if (ret == -EAGAIN) +			goto pi_retry;  		/*  		 * wake_futex_pi has detected invalid state. Tell user  		 * space. @@ -3059,9 +3061,19 @@ retry:  	 * preserve the WAITERS bit not the OWNER_DIED one. We are the  	 * owner.  	 */ -	if (cmpxchg_futex_value_locked(&curval, uaddr, uval, 0)) { +	if ((ret = cmpxchg_futex_value_locked(&curval, uaddr, uval, 0))) {  		spin_unlock(&hb->lock); -		goto pi_faulted; +		switch (ret) { +		case -EFAULT: +			goto pi_faulted; + +		case -EAGAIN: +			goto pi_retry; + +		default: +			WARN_ON_ONCE(1); +			goto out_putkey; +		}  	}  	/* @@ -3075,6 +3087,11 @@ out_putkey:  	put_futex_key(&key);  	return ret; +pi_retry: +	put_futex_key(&key); +	cond_resched(); +	goto retry; +  pi_faulted:  	put_futex_key(&key); @@ -3435,47 +3452,67 @@ err_unlock:  static int handle_futex_death(u32 __user *uaddr, struct task_struct *curr, int pi)  {  	u32 uval, uninitialized_var(nval), mval; +	int err; + +	/* Futex address must be 32bit aligned */ +	if ((((unsigned long)uaddr) % sizeof(*uaddr)) != 0) +		return -1;  retry:  	if (get_user(uval, uaddr))  		return -1; -	if ((uval & FUTEX_TID_MASK) == task_pid_vnr(curr)) { -		/* -		 * Ok, this dying thread is truly holding a futex -		 * of interest. Set the OWNER_DIED bit atomically -		 * via cmpxchg, and if the value had FUTEX_WAITERS -		 * set, wake up a waiter (if any). (We have to do a -		 * futex_wake() even if OWNER_DIED is already set - -		 * to handle the rare but possible case of recursive -		 * thread-death.) The rest of the cleanup is done in -		 * userspace. 
-		 */ -		mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED; -		/* -		 * We are not holding a lock here, but we want to have -		 * the pagefault_disable/enable() protection because -		 * we want to handle the fault gracefully. If the -		 * access fails we try to fault in the futex with R/W -		 * verification via get_user_pages. get_user() above -		 * does not guarantee R/W access. If that fails we -		 * give up and leave the futex locked. -		 */ -		if (cmpxchg_futex_value_locked(&nval, uaddr, uval, mval)) { +	if ((uval & FUTEX_TID_MASK) != task_pid_vnr(curr)) +		return 0; + +	/* +	 * Ok, this dying thread is truly holding a futex +	 * of interest. Set the OWNER_DIED bit atomically +	 * via cmpxchg, and if the value had FUTEX_WAITERS +	 * set, wake up a waiter (if any). (We have to do a +	 * futex_wake() even if OWNER_DIED is already set - +	 * to handle the rare but possible case of recursive +	 * thread-death.) The rest of the cleanup is done in +	 * userspace. +	 */ +	mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED; + +	/* +	 * We are not holding a lock here, but we want to have +	 * the pagefault_disable/enable() protection because +	 * we want to handle the fault gracefully. If the +	 * access fails we try to fault in the futex with R/W +	 * verification via get_user_pages. get_user() above +	 * does not guarantee R/W access. If that fails we +	 * give up and leave the futex locked. +	 */ +	if ((err = cmpxchg_futex_value_locked(&nval, uaddr, uval, mval))) { +		switch (err) { +		case -EFAULT:  			if (fault_in_user_writeable(uaddr))  				return -1;  			goto retry; -		} -		if (nval != uval) + +		case -EAGAIN: +			cond_resched();  			goto retry; -		/* -		 * Wake robust non-PI futexes here. The wakeup of -		 * PI futexes happens in exit_pi_state(): -		 */ -		if (!pi && (uval & FUTEX_WAITERS)) -			futex_wake(uaddr, 1, 1, FUTEX_BITSET_MATCH_ANY); +		default: +			WARN_ON_ONCE(1); +			return err; +		}  	} + +	if (nval != uval) +		goto retry; + +	/* +	 * Wake robust non-PI futexes here. The wakeup of +	 * PI futexes happens in exit_pi_state(): +	 */ +	if (!pi && (uval & FUTEX_WAITERS)) +		futex_wake(uaddr, 1, 1, FUTEX_BITSET_MATCH_ANY); +  	return 0;  } diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig index 1e3823fa799b..3941a9c48f83 100644 --- a/kernel/gcov/Kconfig +++ b/kernel/gcov/Kconfig @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0-only  menu "GCOV-based kernel profiling"  config GCOV_KERNEL @@ -53,6 +54,7 @@ config GCOV_PROFILE_ALL  choice  	prompt "Specify GCOV format"  	depends on GCOV_KERNEL +	depends on CC_IS_GCC  	---help---  	The gcov format is usually determined by the GCC version, and the  	default is chosen according to your GCC version. However, there are @@ -62,7 +64,7 @@ choice  config GCOV_FORMAT_3_4  	bool "GCC 3.4 format" -	depends on CC_IS_GCC && GCC_VERSION < 40700 +	depends on GCC_VERSION < 40700  	---help---  	Select this option to use the format defined by GCC 3.4. 
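The futex hunks above repeat one pattern at every cmpxchg_futex_value_locked() call site: the raw error code is now propagated and handled per case instead of being collapsed into -EFAULT. The sketch below condenses that call-site logic into one hypothetical helper purely for illustration; futex_cmpxchg_err_retry() is not a function added by this patch.

/*
 * Hypothetical condensation of the call-site pattern used above.
 * Returns 1 if the caller should retry the cmpxchg, 0 if the fault
 * could not be resolved, and the raw error for unexpected codes.
 */
static int futex_cmpxchg_err_retry(int err, u32 __user *uaddr)
{
	switch (err) {
	case -EFAULT:
		/* Fault the futex word in with write access, then retry. */
		return fault_in_user_writeable(uaddr) ? 0 : 1;
	case -EAGAIN:
		/* Arch code asked us to back off; reschedule before retrying. */
		cond_resched();
		return 1;
	default:
		WARN_ON_ONCE(1);
		return err;
	}
}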
diff --git a/kernel/gcov/Makefile b/kernel/gcov/Makefile index ff06d64df397..d66a74b0f100 100644 --- a/kernel/gcov/Makefile +++ b/kernel/gcov/Makefile @@ -2,5 +2,6 @@  ccflags-y := -DSRCTREE='"$(srctree)"' -DOBJTREE='"$(objtree)"'  obj-y := base.o fs.o -obj-$(CONFIG_GCOV_FORMAT_3_4) += gcc_3_4.o -obj-$(CONFIG_GCOV_FORMAT_4_7) += gcc_4_7.o +obj-$(CONFIG_GCOV_FORMAT_3_4) += gcc_base.o gcc_3_4.o +obj-$(CONFIG_GCOV_FORMAT_4_7) += gcc_base.o gcc_4_7.o +obj-$(CONFIG_CC_IS_CLANG) += clang.o diff --git a/kernel/gcov/base.c b/kernel/gcov/base.c index 9c7c8d5c18f2..0ffe9f194080 100644 --- a/kernel/gcov/base.c +++ b/kernel/gcov/base.c @@ -22,88 +22,8 @@  #include <linux/sched.h>  #include "gcov.h" -static int gcov_events_enabled; -static DEFINE_MUTEX(gcov_lock); - -/* - * __gcov_init is called by gcc-generated constructor code for each object - * file compiled with -fprofile-arcs. - */ -void __gcov_init(struct gcov_info *info) -{ -	static unsigned int gcov_version; - -	mutex_lock(&gcov_lock); -	if (gcov_version == 0) { -		gcov_version = gcov_info_version(info); -		/* -		 * Printing gcc's version magic may prove useful for debugging -		 * incompatibility reports. -		 */ -		pr_info("version magic: 0x%x\n", gcov_version); -	} -	/* -	 * Add new profiling data structure to list and inform event -	 * listener. -	 */ -	gcov_info_link(info); -	if (gcov_events_enabled) -		gcov_event(GCOV_ADD, info); -	mutex_unlock(&gcov_lock); -} -EXPORT_SYMBOL(__gcov_init); - -/* - * These functions may be referenced by gcc-generated profiling code but serve - * no function for kernel profiling. - */ -void __gcov_flush(void) -{ -	/* Unused. */ -} -EXPORT_SYMBOL(__gcov_flush); - -void __gcov_merge_add(gcov_type *counters, unsigned int n_counters) -{ -	/* Unused. */ -} -EXPORT_SYMBOL(__gcov_merge_add); - -void __gcov_merge_single(gcov_type *counters, unsigned int n_counters) -{ -	/* Unused. */ -} -EXPORT_SYMBOL(__gcov_merge_single); - -void __gcov_merge_delta(gcov_type *counters, unsigned int n_counters) -{ -	/* Unused. */ -} -EXPORT_SYMBOL(__gcov_merge_delta); - -void __gcov_merge_ior(gcov_type *counters, unsigned int n_counters) -{ -	/* Unused. */ -} -EXPORT_SYMBOL(__gcov_merge_ior); - -void __gcov_merge_time_profile(gcov_type *counters, unsigned int n_counters) -{ -	/* Unused. */ -} -EXPORT_SYMBOL(__gcov_merge_time_profile); - -void __gcov_merge_icall_topn(gcov_type *counters, unsigned int n_counters) -{ -	/* Unused. */ -} -EXPORT_SYMBOL(__gcov_merge_icall_topn); - -void __gcov_exit(void) -{ -	/* Unused. */ -} -EXPORT_SYMBOL(__gcov_exit); +int gcov_events_enabled; +DEFINE_MUTEX(gcov_lock);  /**   * gcov_enable_events - enable event reporting through gcov_event() @@ -144,7 +64,7 @@ static int gcov_module_notifier(struct notifier_block *nb, unsigned long event,  	/* Remove entries located in module from linked list. */  	while ((info = gcov_info_next(info))) { -		if (within_module((unsigned long)info, mod)) { +		if (gcov_info_within_module(info, mod)) {  			gcov_info_unlink(prev, info);  			if (gcov_events_enabled)  				gcov_event(GCOV_REMOVE, info); diff --git a/kernel/gcov/clang.c b/kernel/gcov/clang.c new file mode 100644 index 000000000000..c94b820a1b62 --- /dev/null +++ b/kernel/gcov/clang.c @@ -0,0 +1,581 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2019 Google, Inc. 
+ * modified from kernel/gcov/gcc_4_7.c + * + * This software is licensed under the terms of the GNU General Public + * License version 2, as published by the Free Software Foundation, and + * may be copied, distributed, and modified under those terms. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the + * GNU General Public License for more details. + * + * + * LLVM uses profiling data that's deliberately similar to GCC, but has a + * very different way of exporting that data.  LLVM calls llvm_gcov_init() once + * per module, and provides a couple of callbacks that we can use to ask for + * more data. + * + * We care about the "writeout" callback, which in turn calls back into + * compiler-rt/this module to dump all the gathered coverage data to disk: + * + *    llvm_gcda_start_file() + *      llvm_gcda_emit_function() + *      llvm_gcda_emit_arcs() + *      llvm_gcda_emit_function() + *      llvm_gcda_emit_arcs() + *      [... repeats for each function ...] + *    llvm_gcda_summary_info() + *    llvm_gcda_end_file() + * + * This design is much more stateless and unstructured than gcc's, and is + * intended to run at process exit.  This forces us to keep some local state + * about which module we're dealing with at the moment.  On the other hand, it + * also means we don't depend as much on how LLVM represents profiling data + * internally. + * + * See LLVM's lib/Transforms/Instrumentation/GCOVProfiling.cpp for more + * details on how this works, particularly GCOVProfiler::emitProfileArcs(), + * GCOVProfiler::insertCounterWriteout(), and + * GCOVProfiler::insertFlush(). + */ + +#define pr_fmt(fmt)	"gcov: " fmt + +#include <linux/kernel.h> +#include <linux/list.h> +#include <linux/printk.h> +#include <linux/ratelimit.h> +#include <linux/seq_file.h> +#include <linux/slab.h> +#include <linux/vmalloc.h> +#include "gcov.h" + +typedef void (*llvm_gcov_callback)(void); + +struct gcov_info { +	struct list_head head; + +	const char *filename; +	unsigned int version; +	u32 checksum; + +	struct list_head functions; +}; + +struct gcov_fn_info { +	struct list_head head; + +	u32 ident; +	u32 checksum; +	u8 use_extra_checksum; +	u32 cfg_checksum; + +	u32 num_counters; +	u64 *counters; +	const char *function_name; +}; + +static struct gcov_info *current_info; + +static LIST_HEAD(clang_gcov_list); + +void llvm_gcov_init(llvm_gcov_callback writeout, llvm_gcov_callback flush) +{ +	struct gcov_info *info = kzalloc(sizeof(*info), GFP_KERNEL); + +	if (!info) +		return; + +	INIT_LIST_HEAD(&info->head); +	INIT_LIST_HEAD(&info->functions); + +	mutex_lock(&gcov_lock); + +	list_add_tail(&info->head, &clang_gcov_list); +	current_info = info; +	writeout(); +	current_info = NULL; +	if (gcov_events_enabled) +		gcov_event(GCOV_ADD, info); + +	mutex_unlock(&gcov_lock); +} +EXPORT_SYMBOL(llvm_gcov_init); + +void llvm_gcda_start_file(const char *orig_filename, const char version[4], +		u32 checksum) +{ +	current_info->filename = orig_filename; +	memcpy(¤t_info->version, version, sizeof(current_info->version)); +	current_info->checksum = checksum; +} +EXPORT_SYMBOL(llvm_gcda_start_file); + +void llvm_gcda_emit_function(u32 ident, const char *function_name, +		u32 func_checksum, u8 use_extra_checksum, u32 cfg_checksum) +{ +	struct gcov_fn_info *info = kzalloc(sizeof(*info), GFP_KERNEL); + +	if (!info) +		return; + +	INIT_LIST_HEAD(&info->head); +	info->ident = ident; +	
info->checksum = func_checksum; +	info->use_extra_checksum = use_extra_checksum; +	info->cfg_checksum = cfg_checksum; +	if (function_name) +		info->function_name = kstrdup(function_name, GFP_KERNEL); + +	list_add_tail(&info->head, ¤t_info->functions); +} +EXPORT_SYMBOL(llvm_gcda_emit_function); + +void llvm_gcda_emit_arcs(u32 num_counters, u64 *counters) +{ +	struct gcov_fn_info *info = list_last_entry(¤t_info->functions, +			struct gcov_fn_info, head); + +	info->num_counters = num_counters; +	info->counters = counters; +} +EXPORT_SYMBOL(llvm_gcda_emit_arcs); + +void llvm_gcda_summary_info(void) +{ +} +EXPORT_SYMBOL(llvm_gcda_summary_info); + +void llvm_gcda_end_file(void) +{ +} +EXPORT_SYMBOL(llvm_gcda_end_file); + +/** + * gcov_info_filename - return info filename + * @info: profiling data set + */ +const char *gcov_info_filename(struct gcov_info *info) +{ +	return info->filename; +} + +/** + * gcov_info_version - return info version + * @info: profiling data set + */ +unsigned int gcov_info_version(struct gcov_info *info) +{ +	return info->version; +} + +/** + * gcov_info_next - return next profiling data set + * @info: profiling data set + * + * Returns next gcov_info following @info or first gcov_info in the chain if + * @info is %NULL. + */ +struct gcov_info *gcov_info_next(struct gcov_info *info) +{ +	if (!info) +		return list_first_entry_or_null(&clang_gcov_list, +				struct gcov_info, head); +	if (list_is_last(&info->head, &clang_gcov_list)) +		return NULL; +	return list_next_entry(info, head); +} + +/** + * gcov_info_link - link/add profiling data set to the list + * @info: profiling data set + */ +void gcov_info_link(struct gcov_info *info) +{ +	list_add_tail(&info->head, &clang_gcov_list); +} + +/** + * gcov_info_unlink - unlink/remove profiling data set from the list + * @prev: previous profiling data set + * @info: profiling data set + */ +void gcov_info_unlink(struct gcov_info *prev, struct gcov_info *info) +{ +	/* Generic code unlinks while iterating. */ +	__list_del_entry(&info->head); +} + +/** + * gcov_info_within_module - check if a profiling data set belongs to a module + * @info: profiling data set + * @mod: module + * + * Returns true if profiling data belongs module, false otherwise. + */ +bool gcov_info_within_module(struct gcov_info *info, struct module *mod) +{ +	return within_module((unsigned long)info->filename, mod); +} + +/* Symbolic links to be created for each profiling data file. */ +const struct gcov_link gcov_link[] = { +	{ OBJ_TREE, "gcno" },	/* Link to .gcno file in $(objtree). */ +	{ 0, NULL}, +}; + +/** + * gcov_info_reset - reset profiling data to zero + * @info: profiling data set + */ +void gcov_info_reset(struct gcov_info *info) +{ +	struct gcov_fn_info *fn; + +	list_for_each_entry(fn, &info->functions, head) +		memset(fn->counters, 0, +				sizeof(fn->counters[0]) * fn->num_counters); +} + +/** + * gcov_info_is_compatible - check if profiling data can be added + * @info1: first profiling data set + * @info2: second profiling data set + * + * Returns non-zero if profiling data can be added, zero otherwise. 
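To make the writeout flow described above concrete, this is roughly the call sequence a clang-instrumented object's generated constructor performs against the hooks exported here. Everything below is illustrative pseudocode of compiler-generated code: the function names, checksums and counter array are made up, only the llvm_gcov_init()/llvm_gcda_*() entry points come from this file.

static u64 counters_foo[4];	/* arc counters for one function (made up) */

static void example_writeout(void)
{
	llvm_gcda_start_file("kernel/foo.gcda", "407*", 0x12345678);
	llvm_gcda_emit_function(0, "foo", 0xdeadbeef, 1, 0xcafef00d);
	llvm_gcda_emit_arcs(4, counters_foo);
	llvm_gcda_summary_info();
	llvm_gcda_end_file();
}

static void example_flush(void)
{
}

/* Conceptually, the per-module constructor emitted by the compiler: */
static void example_ctor(void)
{
	/* Registers the module and immediately triggers a writeout. */
	llvm_gcov_init(example_writeout, example_flush);
}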
+ */ +int gcov_info_is_compatible(struct gcov_info *info1, struct gcov_info *info2) +{ +	struct gcov_fn_info *fn_ptr1 = list_first_entry_or_null( +			&info1->functions, struct gcov_fn_info, head); +	struct gcov_fn_info *fn_ptr2 = list_first_entry_or_null( +			&info2->functions, struct gcov_fn_info, head); + +	if (info1->checksum != info2->checksum) +		return false; +	if (!fn_ptr1) +		return fn_ptr1 == fn_ptr2; +	while (!list_is_last(&fn_ptr1->head, &info1->functions) && +		!list_is_last(&fn_ptr2->head, &info2->functions)) { +		if (fn_ptr1->checksum != fn_ptr2->checksum) +			return false; +		if (fn_ptr1->use_extra_checksum != fn_ptr2->use_extra_checksum) +			return false; +		if (fn_ptr1->use_extra_checksum && +			fn_ptr1->cfg_checksum != fn_ptr2->cfg_checksum) +			return false; +		fn_ptr1 = list_next_entry(fn_ptr1, head); +		fn_ptr2 = list_next_entry(fn_ptr2, head); +	} +	return list_is_last(&fn_ptr1->head, &info1->functions) && +		list_is_last(&fn_ptr2->head, &info2->functions); +} + +/** + * gcov_info_add - add up profiling data + * @dest: profiling data set to which data is added + * @source: profiling data set which is added + * + * Adds profiling counts of @source to @dest. + */ +void gcov_info_add(struct gcov_info *dst, struct gcov_info *src) +{ +	struct gcov_fn_info *dfn_ptr; +	struct gcov_fn_info *sfn_ptr = list_first_entry_or_null(&src->functions, +			struct gcov_fn_info, head); + +	list_for_each_entry(dfn_ptr, &dst->functions, head) { +		u32 i; + +		for (i = 0; i < sfn_ptr->num_counters; i++) +			dfn_ptr->counters[i] += sfn_ptr->counters[i]; +	} +} + +static struct gcov_fn_info *gcov_fn_info_dup(struct gcov_fn_info *fn) +{ +	size_t cv_size; /* counter values size */ +	struct gcov_fn_info *fn_dup = kmemdup(fn, sizeof(*fn), +			GFP_KERNEL); +	if (!fn_dup) +		return NULL; +	INIT_LIST_HEAD(&fn_dup->head); + +	fn_dup->function_name = kstrdup(fn->function_name, GFP_KERNEL); +	if (!fn_dup->function_name) +		goto err_name; + +	cv_size = fn->num_counters * sizeof(fn->counters[0]); +	fn_dup->counters = vmalloc(cv_size); +	if (!fn_dup->counters) +		goto err_counters; +	memcpy(fn_dup->counters, fn->counters, cv_size); + +	return fn_dup; + +err_counters: +	kfree(fn_dup->function_name); +err_name: +	kfree(fn_dup); +	return NULL; +} + +/** + * gcov_info_dup - duplicate profiling data set + * @info: profiling data set to duplicate + * + * Return newly allocated duplicate on success, %NULL on error. 
+ */ +struct gcov_info *gcov_info_dup(struct gcov_info *info) +{ +	struct gcov_info *dup; +	struct gcov_fn_info *fn; + +	dup = kmemdup(info, sizeof(*dup), GFP_KERNEL); +	if (!dup) +		return NULL; +	INIT_LIST_HEAD(&dup->head); +	INIT_LIST_HEAD(&dup->functions); +	dup->filename = kstrdup(info->filename, GFP_KERNEL); +	if (!dup->filename) +		goto err; + +	list_for_each_entry(fn, &info->functions, head) { +		struct gcov_fn_info *fn_dup = gcov_fn_info_dup(fn); + +		if (!fn_dup) +			goto err; +		list_add_tail(&fn_dup->head, &dup->functions); +	} + +	return dup; + +err: +	gcov_info_free(dup); +	return NULL; +} + +/** + * gcov_info_free - release memory for profiling data set duplicate + * @info: profiling data set duplicate to free + */ +void gcov_info_free(struct gcov_info *info) +{ +	struct gcov_fn_info *fn, *tmp; + +	list_for_each_entry_safe(fn, tmp, &info->functions, head) { +		kfree(fn->function_name); +		vfree(fn->counters); +		list_del(&fn->head); +		kfree(fn); +	} +	kfree(info->filename); +	kfree(info); +} + +#define ITER_STRIDE	PAGE_SIZE + +/** + * struct gcov_iterator - specifies current file position in logical records + * @info: associated profiling data + * @buffer: buffer containing file data + * @size: size of buffer + * @pos: current position in file + */ +struct gcov_iterator { +	struct gcov_info *info; +	void *buffer; +	size_t size; +	loff_t pos; +}; + +/** + * store_gcov_u32 - store 32 bit number in gcov format to buffer + * @buffer: target buffer or NULL + * @off: offset into the buffer + * @v: value to be stored + * + * Number format defined by gcc: numbers are recorded in the 32 bit + * unsigned binary form of the endianness of the machine generating the + * file. Returns the number of bytes stored. If @buffer is %NULL, doesn't + * store anything. + */ +static size_t store_gcov_u32(void *buffer, size_t off, u32 v) +{ +	u32 *data; + +	if (buffer) { +		data = buffer + off; +		*data = v; +	} + +	return sizeof(*data); +} + +/** + * store_gcov_u64 - store 64 bit number in gcov format to buffer + * @buffer: target buffer or NULL + * @off: offset into the buffer + * @v: value to be stored + * + * Number format defined by gcc: numbers are recorded in the 32 bit + * unsigned binary form of the endianness of the machine generating the + * file. 64 bit numbers are stored as two 32 bit numbers, the low part + * first. Returns the number of bytes stored. If @buffer is %NULL, doesn't store + * anything. + */ +static size_t store_gcov_u64(void *buffer, size_t off, u64 v) +{ +	u32 *data; + +	if (buffer) { +		data = buffer + off; + +		data[0] = (v & 0xffffffffUL); +		data[1] = (v >> 32); +	} + +	return sizeof(*data) * 2; +} + +/** + * convert_to_gcda - convert profiling data set to gcda file format + * @buffer: the buffer to store file data or %NULL if no data should be stored + * @info: profiling data set to be converted + * + * Returns the number of bytes that were/would have been stored into the buffer. + */ +static size_t convert_to_gcda(char *buffer, struct gcov_info *info) +{ +	struct gcov_fn_info *fi_ptr; +	size_t pos = 0; + +	/* File header. 
*/ +	pos += store_gcov_u32(buffer, pos, GCOV_DATA_MAGIC); +	pos += store_gcov_u32(buffer, pos, info->version); +	pos += store_gcov_u32(buffer, pos, info->checksum); + +	list_for_each_entry(fi_ptr, &info->functions, head) { +		u32 i; +		u32 len = 2; + +		if (fi_ptr->use_extra_checksum) +			len++; + +		pos += store_gcov_u32(buffer, pos, GCOV_TAG_FUNCTION); +		pos += store_gcov_u32(buffer, pos, len); +		pos += store_gcov_u32(buffer, pos, fi_ptr->ident); +		pos += store_gcov_u32(buffer, pos, fi_ptr->checksum); +		if (fi_ptr->use_extra_checksum) +			pos += store_gcov_u32(buffer, pos, fi_ptr->cfg_checksum); + +		pos += store_gcov_u32(buffer, pos, GCOV_TAG_COUNTER_BASE); +		pos += store_gcov_u32(buffer, pos, fi_ptr->num_counters * 2); +		for (i = 0; i < fi_ptr->num_counters; i++) +			pos += store_gcov_u64(buffer, pos, fi_ptr->counters[i]); +	} + +	return pos; +} + +/** + * gcov_iter_new - allocate and initialize profiling data iterator + * @info: profiling data set to be iterated + * + * Return file iterator on success, %NULL otherwise. + */ +struct gcov_iterator *gcov_iter_new(struct gcov_info *info) +{ +	struct gcov_iterator *iter; + +	iter = kzalloc(sizeof(struct gcov_iterator), GFP_KERNEL); +	if (!iter) +		goto err_free; + +	iter->info = info; +	/* Dry-run to get the actual buffer size. */ +	iter->size = convert_to_gcda(NULL, info); +	iter->buffer = vmalloc(iter->size); +	if (!iter->buffer) +		goto err_free; + +	convert_to_gcda(iter->buffer, info); + +	return iter; + +err_free: +	kfree(iter); +	return NULL; +} + + +/** + * gcov_iter_get_info - return profiling data set for given file iterator + * @iter: file iterator + */ +void gcov_iter_free(struct gcov_iterator *iter) +{ +	vfree(iter->buffer); +	kfree(iter); +} + +/** + * gcov_iter_get_info - return profiling data set for given file iterator + * @iter: file iterator + */ +struct gcov_info *gcov_iter_get_info(struct gcov_iterator *iter) +{ +	return iter->info; +} + +/** + * gcov_iter_start - reset file iterator to starting position + * @iter: file iterator + */ +void gcov_iter_start(struct gcov_iterator *iter) +{ +	iter->pos = 0; +} + +/** + * gcov_iter_next - advance file iterator to next logical record + * @iter: file iterator + * + * Return zero if new position is valid, non-zero if iterator has reached end. + */ +int gcov_iter_next(struct gcov_iterator *iter) +{ +	if (iter->pos < iter->size) +		iter->pos += ITER_STRIDE; + +	if (iter->pos >= iter->size) +		return -EINVAL; + +	return 0; +} + +/** + * gcov_iter_write - write data for current pos to seq_file + * @iter: file iterator + * @seq: seq_file handle + * + * Return zero on success, non-zero otherwise. + */ +int gcov_iter_write(struct gcov_iterator *iter, struct seq_file *seq) +{ +	size_t len; + +	if (iter->pos >= iter->size) +		return -EINVAL; + +	len = ITER_STRIDE; +	if (iter->pos + len > iter->size) +		len = iter->size - iter->pos; + +	seq_write(seq, iter->buffer + iter->pos, len); + +	return 0; +} diff --git a/kernel/gcov/gcc_3_4.c b/kernel/gcov/gcc_3_4.c index 2dddecbdbe6e..801ee4b0b969 100644 --- a/kernel/gcov/gcc_3_4.c +++ b/kernel/gcov/gcc_3_4.c @@ -137,6 +137,18 @@ void gcov_info_unlink(struct gcov_info *prev, struct gcov_info *info)  		gcov_info_head = info->next;  } +/** + * gcov_info_within_module - check if a profiling data set belongs to a module + * @info: profiling data set + * @mod: module + * + * Returns true if profiling data belongs module, false otherwise. 
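The two-pass scheme used by convert_to_gcda() and gcov_iter_new() above (call the store helpers with a NULL buffer to measure the record, allocate, then call again to fill it) is small enough to demonstrate standalone. The userspace sketch below mirrors that scheme; the magic and tag constants are placeholders rather than the real gcov.h values.

/* Standalone illustration of the NULL-buffer sizing pass used above. */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static size_t store_u32(void *buffer, size_t off, uint32_t v)
{
	if (buffer)
		memcpy((char *)buffer + off, &v, sizeof(v));
	return sizeof(v);
}

static size_t store_u64(void *buffer, size_t off, uint64_t v)
{
	uint32_t lo = v & 0xffffffffUL, hi = v >> 32;

	if (buffer) {
		memcpy((char *)buffer + off, &lo, sizeof(lo));
		memcpy((char *)buffer + off + sizeof(lo), &hi, sizeof(hi));
	}
	return 2 * sizeof(uint32_t);
}

/* Emit a header plus one counter record; NULL buffer = sizing pass. */
static size_t convert(char *buffer, const uint64_t *counters, uint32_t n)
{
	size_t pos = 0;
	uint32_t i;

	pos += store_u32(buffer, pos, 0x67636461);	/* placeholder magic */
	pos += store_u32(buffer, pos, 0x01a10000);	/* placeholder tag   */
	pos += store_u32(buffer, pos, n * 2);		/* length in words   */
	for (i = 0; i < n; i++)
		pos += store_u64(buffer, pos, counters[i]);
	return pos;
}

int main(void)
{
	uint64_t counters[] = { 3, 141, 59 };
	size_t size = convert(NULL, counters, 3);	/* dry run to size */
	char *buf = malloc(size);

	convert(buf, counters, 3);			/* second pass fills */
	printf("record size: %zu bytes\n", size);	/* prints 36 */
	free(buf);
	return 0;
}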
+ */ +bool gcov_info_within_module(struct gcov_info *info, struct module *mod) +{ +	return within_module((unsigned long)info, mod); +} +  /* Symbolic links to be created for each profiling data file. */  const struct gcov_link gcov_link[] = {  	{ OBJ_TREE, "gcno" },	/* Link to .gcno file in $(objtree). */ diff --git a/kernel/gcov/gcc_4_7.c b/kernel/gcov/gcc_4_7.c index ca5e5c0ef853..ec37563674d6 100644 --- a/kernel/gcov/gcc_4_7.c +++ b/kernel/gcov/gcc_4_7.c @@ -150,6 +150,18 @@ void gcov_info_unlink(struct gcov_info *prev, struct gcov_info *info)  		gcov_info_head = info->next;  } +/** + * gcov_info_within_module - check if a profiling data set belongs to a module + * @info: profiling data set + * @mod: module + * + * Returns true if profiling data belongs module, false otherwise. + */ +bool gcov_info_within_module(struct gcov_info *info, struct module *mod) +{ +	return within_module((unsigned long)info, mod); +} +  /* Symbolic links to be created for each profiling data file. */  const struct gcov_link gcov_link[] = {  	{ OBJ_TREE, "gcno" },	/* Link to .gcno file in $(objtree). */ diff --git a/kernel/gcov/gcc_base.c b/kernel/gcov/gcc_base.c new file mode 100644 index 000000000000..3cf736b9f880 --- /dev/null +++ b/kernel/gcov/gcc_base.c @@ -0,0 +1,86 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/export.h> +#include <linux/kernel.h> +#include <linux/mutex.h> +#include "gcov.h" + +/* + * __gcov_init is called by gcc-generated constructor code for each object + * file compiled with -fprofile-arcs. + */ +void __gcov_init(struct gcov_info *info) +{ +	static unsigned int gcov_version; + +	mutex_lock(&gcov_lock); +	if (gcov_version == 0) { +		gcov_version = gcov_info_version(info); +		/* +		 * Printing gcc's version magic may prove useful for debugging +		 * incompatibility reports. +		 */ +		pr_info("version magic: 0x%x\n", gcov_version); +	} +	/* +	 * Add new profiling data structure to list and inform event +	 * listener. +	 */ +	gcov_info_link(info); +	if (gcov_events_enabled) +		gcov_event(GCOV_ADD, info); +	mutex_unlock(&gcov_lock); +} +EXPORT_SYMBOL(__gcov_init); + +/* + * These functions may be referenced by gcc-generated profiling code but serve + * no function for kernel profiling. + */ +void __gcov_flush(void) +{ +	/* Unused. */ +} +EXPORT_SYMBOL(__gcov_flush); + +void __gcov_merge_add(gcov_type *counters, unsigned int n_counters) +{ +	/* Unused. */ +} +EXPORT_SYMBOL(__gcov_merge_add); + +void __gcov_merge_single(gcov_type *counters, unsigned int n_counters) +{ +	/* Unused. */ +} +EXPORT_SYMBOL(__gcov_merge_single); + +void __gcov_merge_delta(gcov_type *counters, unsigned int n_counters) +{ +	/* Unused. */ +} +EXPORT_SYMBOL(__gcov_merge_delta); + +void __gcov_merge_ior(gcov_type *counters, unsigned int n_counters) +{ +	/* Unused. */ +} +EXPORT_SYMBOL(__gcov_merge_ior); + +void __gcov_merge_time_profile(gcov_type *counters, unsigned int n_counters) +{ +	/* Unused. */ +} +EXPORT_SYMBOL(__gcov_merge_time_profile); + +void __gcov_merge_icall_topn(gcov_type *counters, unsigned int n_counters) +{ +	/* Unused. */ +} +EXPORT_SYMBOL(__gcov_merge_icall_topn); + +void __gcov_exit(void) +{ +	/* Unused. 
*/ +} +EXPORT_SYMBOL(__gcov_exit); diff --git a/kernel/gcov/gcov.h b/kernel/gcov/gcov.h index de118ad4a024..6ab2c1808c9d 100644 --- a/kernel/gcov/gcov.h +++ b/kernel/gcov/gcov.h @@ -15,6 +15,7 @@  #ifndef GCOV_H  #define GCOV_H GCOV_H +#include <linux/module.h>  #include <linux/types.h>  /* @@ -46,6 +47,7 @@ unsigned int gcov_info_version(struct gcov_info *info);  struct gcov_info *gcov_info_next(struct gcov_info *info);  void gcov_info_link(struct gcov_info *info);  void gcov_info_unlink(struct gcov_info *prev, struct gcov_info *info); +bool gcov_info_within_module(struct gcov_info *info, struct module *mod);  /* Base interface. */  enum gcov_action { @@ -83,4 +85,7 @@ struct gcov_link {  };  extern const struct gcov_link gcov_link[]; +extern int gcov_events_enabled; +extern struct mutex gcov_lock; +  #endif /* GCOV_H */ diff --git a/kernel/gen_ikh_data.sh b/kernel/gen_ikh_data.sh new file mode 100755 index 000000000000..591a94f7b387 --- /dev/null +++ b/kernel/gen_ikh_data.sh @@ -0,0 +1,89 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +# This script generates an archive consisting of kernel headers +# for CONFIG_IKHEADERS_PROC. +set -e +spath="$(dirname "$(readlink -f "$0")")" +kroot="$spath/.." +outdir="$(pwd)" +tarfile=$1 +cpio_dir=$outdir/$tarfile.tmp + +# Script filename relative to the kernel source root +# We add it to the archive because it is small and any changes +# to this script will also cause a rebuild of the archive. +sfile="$(realpath --relative-to $kroot "$(readlink -f "$0")")" + +src_file_list=" +include/ +arch/$SRCARCH/include/ +$sfile +" + +obj_file_list=" +include/ +arch/$SRCARCH/include/ +" + +# Support incremental builds by skipping archive generation +# if timestamps of files being archived are not changed. + +# This block is useful for debugging the incremental builds. +# Uncomment it for debugging. +# iter=1 +# if [ ! -f /tmp/iter ]; then echo 1 > /tmp/iter; +# else; 	iter=$(($(cat /tmp/iter) + 1)); fi +# find $src_file_list -type f | xargs ls -lR > /tmp/src-ls-$iter +# find $obj_file_list -type f | xargs ls -lR > /tmp/obj-ls-$iter + +# include/generated/compile.h is ignored because it is touched even when none +# of the source files changed. This causes pointless regeneration, so let us +# ignore them for md5 calculation. +pushd $kroot > /dev/null +src_files_md5="$(find $src_file_list -type f                       | +		grep -v "include/generated/compile.h"		   | +		xargs ls -lR | md5sum | cut -d ' ' -f1)" +popd > /dev/null +obj_files_md5="$(find $obj_file_list -type f                       | +		grep -v "include/generated/compile.h"		   | +		xargs ls -lR | md5sum | cut -d ' ' -f1)" + +if [ -f $tarfile ]; then tarfile_md5="$(md5sum $tarfile | cut -d ' ' -f1)"; fi +if [ -f kernel/kheaders.md5 ] && +	[ "$(cat kernel/kheaders.md5|head -1)" == "$src_files_md5" ] && +	[ "$(cat kernel/kheaders.md5|head -2|tail -1)" == "$obj_files_md5" ] && +	[ "$(cat kernel/kheaders.md5|tail -1)" == "$tarfile_md5" ]; then +		exit +fi + +if [ "${quiet}" != "silent_" ]; then +       echo "  GEN     $tarfile" +fi + +rm -rf $cpio_dir +mkdir $cpio_dir + +pushd $kroot > /dev/null +for f in $src_file_list; +	do find "$f" ! -name "*.cmd" ! -name ".*"; +done | cpio --quiet -pd $cpio_dir +popd > /dev/null + +# The second CPIO can complain if files already exist which can +# happen with out of tree builds. Just silence CPIO for now. +for f in $obj_file_list; +	do find "$f" ! -name "*.cmd" ! 
-name ".*"; +done | cpio --quiet -pd $cpio_dir >/dev/null 2>&1 + +# Remove comments except SDPX lines +find $cpio_dir -type f -print0 | +	xargs -0 -P8 -n1 perl -pi -e 'BEGIN {undef $/;}; s/\/\*((?!SPDX).)*?\*\///smg;' + +tar -Jcf $tarfile -C $cpio_dir/ . > /dev/null + +echo "$src_files_md5" > kernel/kheaders.md5 +echo "$obj_files_md5" >> kernel/kheaders.md5 +echo "$(md5sum $tarfile | cut -d ' ' -f1)" >> kernel/kheaders.md5 + +rm -rf $cpio_dir diff --git a/kernel/hung_task.c b/kernel/hung_task.c index f108a95882c6..14a625c16cb3 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only  /*   * Detect Hung Task   * diff --git a/kernel/iomem.c b/kernel/iomem.c index f7525e14ebc6..93c264444510 100644 --- a/kernel/iomem.c +++ b/kernel/iomem.c @@ -55,7 +55,7 @@ static void *try_ram_remap(resource_size_t offset, size_t size,   *   * MEMREMAP_WB - matches the default mapping for System RAM on   * the architecture.  This is usually a read-allocate write-back cache. - * Morever, if MEMREMAP_WB is specified and the requested remap region is RAM + * Moreover, if MEMREMAP_WB is specified and the requested remap region is RAM   * memremap() will bypass establishing a new mapping and instead return   * a pointer into the direct map.   * @@ -86,7 +86,7 @@ void *memremap(resource_size_t offset, size_t size, unsigned long flags)  	/* Try all mapping types requested until one returns non-NULL */  	if (flags & MEMREMAP_WB) {  		/* -		 * MEMREMAP_WB is special in that it can be satisifed +		 * MEMREMAP_WB is special in that it can be satisfied  		 * from the direct map.  Some archs depend on the  		 * capability of memremap() to autodetect cases where  		 * the requested range is potentially in System RAM. diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index 5f3e2baefca9..f92d9a687372 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0-only  menu "IRQ subsystem"  # Options selectable by the architecture code @@ -91,6 +92,9 @@ config GENERIC_MSI_IRQ_DOMAIN  	select IRQ_DOMAIN_HIERARCHY  	select GENERIC_MSI_IRQ +config IRQ_MSI_IOMMU +	bool +  config HANDLE_DOMAIN_IRQ  	bool diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 3faef4a77f71..29d6c7d070b4 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -1449,12 +1449,43 @@ int irq_chip_set_vcpu_affinity_parent(struct irq_data *data, void *vcpu_info)  int irq_chip_set_wake_parent(struct irq_data *data, unsigned int on)  {  	data = data->parent_data; + +	if (data->chip->flags & IRQCHIP_SKIP_SET_WAKE) +		return 0; +  	if (data->chip->irq_set_wake)  		return data->chip->irq_set_wake(data, on);  	return -ENOSYS;  }  EXPORT_SYMBOL_GPL(irq_chip_set_wake_parent); + +/** + * irq_chip_request_resources_parent - Request resources on the parent interrupt + * @data:	Pointer to interrupt specific data + */ +int irq_chip_request_resources_parent(struct irq_data *data) +{ +	data = data->parent_data; + +	if (data->chip->irq_request_resources) +		return data->chip->irq_request_resources(data); + +	return -ENOSYS; +} +EXPORT_SYMBOL_GPL(irq_chip_request_resources_parent); + +/** + * irq_chip_release_resources_parent - Release resources on the parent interrupt + * @data:	Pointer to interrupt specific data + */ +void irq_chip_release_resources_parent(struct irq_data *data) +{ +	data = data->parent_data; +	if (data->chip->irq_release_resources) +		data->chip->irq_release_resources(data); +} +EXPORT_SYMBOL_GPL(irq_chip_release_resources_parent);  #endif  /** 
diff --git a/kernel/irq/debugfs.c b/kernel/irq/debugfs.c index 516c00a5e867..c1eccd4f6520 100644 --- a/kernel/irq/debugfs.c +++ b/kernel/irq/debugfs.c @@ -152,7 +152,7 @@ static int irq_debug_show(struct seq_file *m, void *p)  	raw_spin_lock_irq(&desc->lock);  	data = irq_desc_get_irq_data(desc); -	seq_printf(m, "handler:  %pf\n", desc->handle_irq); +	seq_printf(m, "handler:  %ps\n", desc->handle_irq);  	seq_printf(m, "device:   %s\n", desc->dev_name);  	seq_printf(m, "status:   0x%08x\n", desc->status_use_accessors);  	irq_debug_show_bits(m, 0, desc->status_use_accessors, irqdesc_states, diff --git a/kernel/irq/devres.c b/kernel/irq/devres.c index 5d5378ea0afe..f6e5515ee077 100644 --- a/kernel/irq/devres.c +++ b/kernel/irq/devres.c @@ -84,8 +84,6 @@ EXPORT_SYMBOL(devm_request_threaded_irq);   *	@dev: device to request interrupt for   *	@irq: Interrupt line to allocate   *	@handler: Function to be called when the IRQ occurs - *	@thread_fn: function to be called in a threaded interrupt context. NULL - *		    for devices which handle everything in @handler   *	@irqflags: Interrupt type flags   *	@devname: An ascii name for the claiming device, dev_name(dev) if NULL   *	@dev_id: A cookie passed back to the handler function @@ -222,9 +220,8 @@ devm_irq_alloc_generic_chip(struct device *dev, const char *name, int num_ct,  			    irq_flow_handler_t handler)  {  	struct irq_chip_generic *gc; -	unsigned long sz = sizeof(*gc) + num_ct * sizeof(struct irq_chip_type); -	gc = devm_kzalloc(dev, sz, GFP_KERNEL); +	gc = devm_kzalloc(dev, struct_size(gc, chip_types, num_ct), GFP_KERNEL);  	if (gc)  		irq_init_generic_chip(gc, name, num_ct,  				      irq_base, reg_base, handler); diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 6df5ddfdb0f8..a4ace611f47f 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -149,7 +149,7 @@ irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc, unsigned int *flags  		res = action->handler(irq, action->dev_id);  		trace_irq_handler_exit(irq, action, res); -		if (WARN_ONCE(!irqs_disabled(),"irq %u handler %pF enabled interrupts\n", +		if (WARN_ONCE(!irqs_disabled(),"irq %u handler %pS enabled interrupts\n",  			      irq, action->handler))  			local_irq_disable(); diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 13539e12cd80..c52b737ab8e3 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -275,11 +275,12 @@ static struct attribute *irq_attrs[] = {  	&actions_attr.attr,  	NULL  }; +ATTRIBUTE_GROUPS(irq);  static struct kobj_type irq_kobj_type = {  	.release	= irq_kobj_release,  	.sysfs_ops	= &kobj_sysfs_ops, -	.default_attrs	= irq_attrs, +	.default_groups = irq_groups,  };  static void irq_sysfs_add(int irq, struct irq_desc *desc) @@ -558,6 +559,7 @@ int __init early_irq_init(void)  		alloc_masks(&desc[i], node);  		raw_spin_lock_init(&desc[i].lock);  		lockdep_set_class(&desc[i].lock, &irq_desc_lock_class); +		mutex_init(&desc[i].request_mutex);  		desc_set_defaults(i, &desc[i], node, NULL, NULL);  	}  	return arch_early_irq_init(); diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 9ed29e4a7dbf..a453e229f99c 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -1297,7 +1297,7 @@ int irq_domain_alloc_irqs_hierarchy(struct irq_domain *domain,  /**   * __irq_domain_alloc_irqs - Allocate IRQs from domain   * @domain:	domain to allocate from - * @irq_base:	allocate specified IRQ nubmer if irq_base >= 0 + * @irq_base:	allocate specified IRQ number if irq_base >= 0   * @nr_irqs:	number of IRQs to 
allocate   * @node:	NUMA node id for memory allocation   * @arg:	domain specific argument diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 9ec34a2a6638..78f3ddeb7fe4 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -196,6 +196,7 @@ int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask,  	case IRQ_SET_MASK_OK:  	case IRQ_SET_MASK_OK_DONE:  		cpumask_copy(desc->irq_common_data.affinity, mask); +		/* fall through */  	case IRQ_SET_MASK_OK_NOCOPY:  		irq_validate_effective_affinity(data);  		irq_set_thread_affinity(desc); @@ -356,8 +357,10 @@ irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify)  	desc->affinity_notify = notify;  	raw_spin_unlock_irqrestore(&desc->lock, flags); -	if (old_notify) +	if (old_notify) { +		cancel_work_sync(&old_notify->work);  		kref_put(&old_notify->kref, old_notify->release); +	}  	return 0;  } @@ -778,7 +781,7 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned long flags)  		ret = 0;  		break;  	default: -		pr_err("Setting trigger mode %lu for irq %u failed (%pF)\n", +		pr_err("Setting trigger mode %lu for irq %u failed (%pS)\n",  		       flags, irq_desc_get_irq(desc), chip->irq_set_type);  	}  	if (unmask) diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index 6d2fa6914b30..2ed97a7c9b2a 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -212,9 +212,9 @@ static void __report_bad_irq(struct irq_desc *desc, irqreturn_t action_ret)  	 */  	raw_spin_lock_irqsave(&desc->lock, flags);  	for_each_action_of_desc(desc, action) { -		printk(KERN_ERR "[<%p>] %pf", action->handler, action->handler); +		printk(KERN_ERR "[<%p>] %ps", action->handler, action->handler);  		if (action->thread_fn) -			printk(KERN_CONT " threaded [<%p>] %pf", +			printk(KERN_CONT " threaded [<%p>] %ps",  					action->thread_fn, action->thread_fn);  		printk(KERN_CONT "\n");  	} diff --git a/kernel/irq/timings.c b/kernel/irq/timings.c index 1e4cb63a5c82..90c735da15d0 100644 --- a/kernel/irq/timings.c +++ b/kernel/irq/timings.c @@ -9,6 +9,7 @@  #include <linux/idr.h>  #include <linux/irq.h>  #include <linux/math64.h> +#include <linux/log2.h>  #include <trace/events/irq.h> @@ -18,16 +19,6 @@ DEFINE_STATIC_KEY_FALSE(irq_timing_enabled);  DEFINE_PER_CPU(struct irq_timings, irq_timings); -struct irqt_stat { -	u64	next_evt; -	u64	last_ts; -	u64	variance; -	u32	avg; -	u32	nr_samples; -	int	anomalies; -	int	valid; -}; -  static DEFINE_IDR(irqt_stats);  void irq_timings_enable(void) @@ -40,75 +31,360 @@ void irq_timings_disable(void)  	static_branch_disable(&irq_timing_enabled);  } -/** - * irqs_update - update the irq timing statistics with a new timestamp +/* + * The main goal of this algorithm is to predict the next interrupt + * occurrence on the current CPU. + * + * Currently, the interrupt timings are stored in a circular array + * buffer every time there is an interrupt, as a tuple: the interrupt + * number and the associated timestamp when the event occurred <irq, + * timestamp>. + * + * For every interrupt occurring in a short period of time, we can + * measure the elapsed time between the occurrences for the same + * interrupt and we end up with a suite of intervals. The experience + * showed the interrupts are often coming following a periodic + * pattern. + * + * The objective of the algorithm is to find out this periodic pattern + * in a fastest way and use its period to predict the next irq event. 
+ * + * When the next interrupt event is requested, we are in the situation + * where the interrupts are disabled and the circular buffer + * containing the timings is filled with the events which happened + * after the previous next-interrupt-event request. + * + * At this point, we read the circular buffer and we fill the irq + * related statistics structure. After this step, the circular array + * containing the timings is empty because all the values are + * dispatched in their corresponding buffers. + * + * Now for each interrupt, we can predict the next event by using the + * suffix array, log interval and exponential moving average + * + * 1. Suffix array + * + * Suffix array is an array of all the suffixes of a string. It is + * widely used as a data structure for compression, text search, ... + * For instance for the word 'banana', the suffixes will be: 'banana' + * 'anana' 'nana' 'ana' 'na' 'a' + * + * Usually, the suffix array is sorted but for our purpose it is + * not necessary and won't provide any improvement in the context of + * the solved problem where we clearly define the boundaries of the + * search by a max period and min period. + * + * The suffix array will build a suite of intervals of different + * length and will look for the repetition of each suite. If the suite + * is repeating then we have the period because it is the length of + * the suite whatever its position in the buffer. + * + * 2. Log interval + * + * We saw the irq timings allow to compute the interval of the + * occurrences for a specific interrupt. We can reasonibly assume the + * longer is the interval, the higher is the error for the next event + * and we can consider storing those interval values into an array + * where each slot in the array correspond to an interval at the power + * of 2 of the index. For example, index 12 will contain values + * between 2^11 and 2^12. + * + * At the end we have an array of values where at each index defines a + * [2^index - 1, 2 ^ index] interval values allowing to store a large + * number of values inside a small array. + * + * For example, if we have the value 1123, then we store it at + * ilog2(1123) = 10 index value. + * + * Storing those value at the specific index is done by computing an + * exponential moving average for this specific slot. For instance, + * for values 1800, 1123, 1453, ... fall under the same slot (10) and + * the exponential moving average is computed every time a new value + * is stored at this slot. + * + * 3. Exponential Moving Average + * + * The EMA is largely used to track a signal for stocks or as a low + * pass filter. The magic of the formula, is it is very simple and the + * reactivity of the average can be tuned with the factors called + * alpha. + * + * The higher the alphas are, the faster the average respond to the + * signal change. In our case, if a slot in the array is a big + * interval, we can have numbers with a big difference between + * them. The impact of those differences in the average computation + * can be tuned by changing the alpha value. + * + * + *  -- The algorithm -- + * + * We saw the different processing above, now let's see how they are + * used together. 
+ * + * For each interrupt: + *	For each interval: + *		Compute the index = ilog2(interval) + *		Compute a new_ema(buffer[index], interval) + *		Store the index in a circular buffer + * + *	Compute the suffix array of the indexes + * + *	For each suffix: + *		If the suffix is reverse-found 3 times + *			Return suffix + * + *	Return Not found + * + * However we can not have endless suffix array to be build, it won't + * make sense and it will add an extra overhead, so we can restrict + * this to a maximum suffix length of 5 and a minimum suffix length of + * 2. The experience showed 5 is the majority of the maximum pattern + * period found for different devices. + * + * The result is a pattern finding less than 1us for an interrupt.   * - * @irqs: an irqt_stat struct pointer - * @ts: the new timestamp + * Example based on real values:   * - * The statistics are computed online, in other words, the code is - * designed to compute the statistics on a stream of values rather - * than doing multiple passes on the values to compute the average, - * then the variance. The integer division introduces a loss of - * precision but with an acceptable error margin regarding the results - * we would have with the double floating precision: we are dealing - * with nanosec, so big numbers, consequently the mantisse is - * negligeable, especially when converting the time in usec - * afterwards. + * Example 1 : MMC write/read interrupt interval:   * - * The computation happens at idle time. When the CPU is not idle, the - * interrupts' timestamps are stored in the circular buffer, when the - * CPU goes idle and this routine is called, all the buffer's values - * are injected in the statistical model continuying to extend the - * statistics from the previous busy-idle cycle. + *	223947, 1240, 1384, 1386, 1386, + *	217416, 1236, 1384, 1386, 1387, + *	214719, 1241, 1386, 1387, 1384, + *	213696, 1234, 1384, 1386, 1388, + *	219904, 1240, 1385, 1389, 1385, + *	212240, 1240, 1386, 1386, 1386, + *	214415, 1236, 1384, 1386, 1387, + *	214276, 1234, 1384, 1388, ?   * - * The observations showed a device will trigger a burst of periodic - * interrupts followed by one or two peaks of longer time, for - * instance when a SD card device flushes its cache, then the periodic - * intervals occur again. A one second inactivity period resets the - * stats, that gives us the certitude the statistical values won't - * exceed 1x10^9, thus the computation won't overflow. + * For each element, apply ilog2(value)   * - * Basically, the purpose of the algorithm is to watch the periodic - * interrupts and eliminate the peaks. + *	15, 8, 8, 8, 8, + *	15, 8, 8, 8, 8, + *	15, 8, 8, 8, 8, + *	15, 8, 8, 8, 8, + *	15, 8, 8, 8, 8, + *	15, 8, 8, 8, 8, + *	15, 8, 8, 8, 8, + *	15, 8, 8, 8, ?   * - * An interrupt is considered periodically stable if the interval of - * its occurences follow the normal distribution, thus the values - * comply with: + * Max period of 5, we take the last (max_period * 3) 15 elements as + * we can be confident if the pattern repeats itself three times it is + * a repeating pattern.   * - *      avg - 3 x stddev < value < avg + 3 x stddev + *	             8, + *	15, 8, 8, 8, 8, + *	15, 8, 8, 8, 8, + *	15, 8, 8, 8, ?   
* - * Which can be simplified to: + * Suffixes are:   * - *      -3 x stddev < value - avg < 3 x stddev + *  1) 8, 15, 8, 8, 8  <- max period + *  2) 8, 15, 8, 8 + *  3) 8, 15, 8 + *  4) 8, 15           <- min period   * - *      abs(value - avg) < 3 x stddev + * From there we search the repeating pattern for each suffix.   * - * In order to save a costly square root computation, we use the - * variance. For the record, stddev = sqrt(variance). The equation - * above becomes: + * buffer: 8, 15, 8, 8, 8, 8, 15, 8, 8, 8, 8, 15, 8, 8, 8 + *         |   |  |  |  |  |   |  |  |  |  |   |  |  |  | + *         8, 15, 8, 8, 8  |   |  |  |  |  |   |  |  |  | + *                         8, 15, 8, 8, 8  |   |  |  |  | + *                                         8, 15, 8, 8, 8   * - *      abs(value - avg) < 3 x sqrt(variance) + * When moving the suffix, we found exactly 3 matches.   * - * And finally we square it: + * The first suffix with period 5 is repeating.   * - *      (value - avg) ^ 2 < (3 x sqrt(variance)) ^ 2 + * The next event is (3 * max_period) % suffix_period   * - *      (value - avg) x (value - avg) < 9 x variance + * In this example, the result 0, so the next event is suffix[0] => 8   * - * Statistically speaking, any values out of this interval is - * considered as an anomaly and is discarded. However, a normal - * distribution appears when the number of samples is 30 (it is the - * rule of thumb in statistics, cf. "30 samples" on Internet). When - * there are three consecutive anomalies, the statistics are resetted. + * However, 8 is the index in the array of exponential moving average + * which was calculated on the fly when storing the values, so the + * interval is ema[8] = 1366   * + * + * Example 2: + * + *	4, 3, 5, 100, + *	3, 3, 5, 117, + *	4, 4, 5, 112, + *	4, 3, 4, 110, + *	3, 5, 3, 117, + *	4, 4, 5, 112, + *	4, 3, 4, 110, + *	3, 4, 5, 112, + *	4, 3, 4, 110 + * + * ilog2 + * + *	0, 0, 0, 4, + *	0, 0, 0, 4, + *	0, 0, 0, 4, + *	0, 0, 0, 4, + *	0, 0, 0, 4, + *	0, 0, 0, 4, + *	0, 0, 0, 4, + *	0, 0, 0, 4, + *	0, 0, 0, 4 + * + * Max period 5: + *	   0, 0, 4, + *	0, 0, 0, 4, + *	0, 0, 0, 4, + *	0, 0, 0, 4 + * + * Suffixes: + * + *  1) 0, 0, 4, 0, 0 + *  2) 0, 0, 4, 0 + *  3) 0, 0, 4 + *  4) 0, 0 + * + * buffer: 0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 4 + *         |  |  |  |  |  |  X + *         0, 0, 4, 0, 0, |  X + *                        0, 0 + * + * buffer: 0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 4 + *         |  |  |  |  |  |  |  |  |  |  |  |  |  |  | + *         0, 0, 4, 0, |  |  |  |  |  |  |  |  |  |  | + *                     0, 0, 4, 0, |  |  |  |  |  |  | + *                                 0, 0, 4, 0, |  |  | + *                                             0  0  4 + * + * Pattern is found 3 times, the remaining is 1 which results from + * (max_period * 3) % suffix_period. This value is the index in the + * suffix arrays. The suffix array for a period 4 has the value 4 + * at index 1. 
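The repeating-suffix search described above is small enough to exercise on its own. The sketch below re-implements the same scan in plain userspace C (it mirrors the kernel logic but is not the kernel function) and runs it on the fifteen ilog2 values from Example 1; period 5 is detected and index 8 is predicted, as the worked example states.

/* Userspace re-implementation of the suffix scan described above. */
#include <stdio.h>
#include <string.h>

#define PERIOD_MIN 2
#define PERIOD_MAX 5

static int next_event_index(const int *buffer, size_t len, int period_max)
{
	int i;

	for (i = period_max; i >= PERIOD_MIN; i--) {
		const int *begin = &buffer[len - (i * 3)];
		const int *ptr = begin;

		/* Slide forward by one period while the pattern matches. */
		while (!memcmp(ptr, begin, i * sizeof(*ptr))) {
			ptr += i;
			if (ptr >= &buffer[len])
				return begin[(i * 3) % i];
		}
	}
	return -1;
}

int main(void)
{
	/* Last 3 * PERIOD_MAX ilog2 values from Example 1. */
	int buffer[] = { 8, 15, 8, 8, 8, 8, 15, 8, 8, 8, 8, 15, 8, 8, 8 };

	printf("predicted index: %d\n",
	       next_event_index(buffer, 15, PERIOD_MAX));	/* prints 8 */
	return 0;
}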
+ */ +#define EMA_ALPHA_VAL		64 +#define EMA_ALPHA_SHIFT		7 + +#define PREDICTION_PERIOD_MIN	2 +#define PREDICTION_PERIOD_MAX	5 +#define PREDICTION_FACTOR	4 +#define PREDICTION_MAX		10 /* 2 ^ PREDICTION_MAX useconds */ +#define PREDICTION_BUFFER_SIZE	16 /* slots for EMAs, hardly more than 16 */ + +struct irqt_stat { +	u64	last_ts; +	u64	ema_time[PREDICTION_BUFFER_SIZE]; +	int	timings[IRQ_TIMINGS_SIZE]; +	int	circ_timings[IRQ_TIMINGS_SIZE]; +	int	count; +}; + +/* + * Exponential moving average computation   */ -static void irqs_update(struct irqt_stat *irqs, u64 ts) +static u64 irq_timings_ema_new(u64 value, u64 ema_old) +{ +	s64 diff; + +	if (unlikely(!ema_old)) +		return value; + +	diff = (value - ema_old) * EMA_ALPHA_VAL; +	/* +	 * We can use a s64 type variable to be added with the u64 +	 * ema_old variable as this one will never have its topmost +	 * bit set, it will be always smaller than 2^63 nanosec +	 * interrupt interval (292 years). +	 */ +	return ema_old + (diff >> EMA_ALPHA_SHIFT); +} + +static int irq_timings_next_event_index(int *buffer, size_t len, int period_max) +{ +	int i; + +	/* +	 * The buffer contains the suite of intervals, in a ilog2 +	 * basis, we are looking for a repetition. We point the +	 * beginning of the search three times the length of the +	 * period beginning at the end of the buffer. We do that for +	 * each suffix. +	 */ +	for (i = period_max; i >= PREDICTION_PERIOD_MIN ; i--) { + +		int *begin = &buffer[len - (i * 3)]; +		int *ptr = begin; + +		/* +		 * We look if the suite with period 'i' repeat +		 * itself. If it is truncated at the end, as it +		 * repeats we can use the period to find out the next +		 * element. +		 */ +		while (!memcmp(ptr, begin, i * sizeof(*ptr))) { +			ptr += i; +			if (ptr >= &buffer[len]) +				return begin[((i * 3) % i)]; +		} +	} + +	return -1; +} + +static u64 __irq_timings_next_event(struct irqt_stat *irqs, int irq, u64 now) +{ +	int index, i, period_max, count, start, min = INT_MAX; + +	if ((now - irqs->last_ts) >= NSEC_PER_SEC) { +		irqs->count = irqs->last_ts = 0; +		return U64_MAX; +	} + +	/* +	 * As we want to find three times the repetition, we need a +	 * number of intervals greater or equal to three times the +	 * maximum period, otherwise we truncate the max period. +	 */ +	period_max = irqs->count > (3 * PREDICTION_PERIOD_MAX) ? +		PREDICTION_PERIOD_MAX : irqs->count / 3; + +	/* +	 * If we don't have enough irq timings for this prediction, +	 * just bail out. +	 */ +	if (period_max <= PREDICTION_PERIOD_MIN) +		return U64_MAX; + +	/* +	 * 'count' will depends if the circular buffer wrapped or not +	 */ +	count = irqs->count < IRQ_TIMINGS_SIZE ? +		irqs->count : IRQ_TIMINGS_SIZE; + +	start = irqs->count < IRQ_TIMINGS_SIZE ? +		0 : (irqs->count & IRQ_TIMINGS_MASK); + +	/* +	 * Copy the content of the circular buffer into another buffer +	 * in order to linearize the buffer instead of dealing with +	 * wrapping indexes and shifted array which will be prone to +	 * error and extremelly difficult to debug. 
+	 */ +	for (i = 0; i < count; i++) { +		int index = (start + i) & IRQ_TIMINGS_MASK; + +		irqs->timings[i] = irqs->circ_timings[index]; +		min = min_t(int, irqs->timings[i], min); +	} + +	index = irq_timings_next_event_index(irqs->timings, count, period_max); +	if (index < 0) +		return irqs->last_ts + irqs->ema_time[min]; + +	return irqs->last_ts + irqs->ema_time[index]; +} + +static inline void irq_timings_store(int irq, struct irqt_stat *irqs, u64 ts)  {  	u64 old_ts = irqs->last_ts; -	u64 variance = 0;  	u64 interval; -	s64 diff; +	int index;  	/*  	 * The timestamps are absolute time values, we need to compute @@ -135,87 +411,28 @@ static void irqs_update(struct irqt_stat *irqs, u64 ts)  	 * want as we need another timestamp to compute an interval.  	 */  	if (interval >= NSEC_PER_SEC) { -		memset(irqs, 0, sizeof(*irqs)); -		irqs->last_ts = ts; +		irqs->count = 0;  		return;  	}  	/* -	 * Pre-compute the delta with the average as the result is -	 * used several times in this function. -	 */ -	diff = interval - irqs->avg; - -	/* -	 * Increment the number of samples. -	 */ -	irqs->nr_samples++; - -	/* -	 * Online variance divided by the number of elements if there -	 * is more than one sample.  Normally the formula is division -	 * by nr_samples - 1 but we assume the number of element will be -	 * more than 32 and dividing by 32 instead of 31 is enough -	 * precise. -	 */ -	if (likely(irqs->nr_samples > 1)) -		variance = irqs->variance >> IRQ_TIMINGS_SHIFT; - -	/* -	 * The rule of thumb in statistics for the normal distribution -	 * is having at least 30 samples in order to have the model to -	 * apply. Values outside the interval are considered as an -	 * anomaly. -	 */ -	if ((irqs->nr_samples >= 30) && ((diff * diff) > (9 * variance))) { -		/* -		 * After three consecutive anomalies, we reset the -		 * stats as it is no longer stable enough. -		 */ -		if (irqs->anomalies++ >= 3) { -			memset(irqs, 0, sizeof(*irqs)); -			irqs->last_ts = ts; -			return; -		} -	} else { -		/* -		 * The anomalies must be consecutives, so at this -		 * point, we reset the anomalies counter. -		 */ -		irqs->anomalies = 0; -	} - -	/* -	 * The interrupt is considered stable enough to try to predict -	 * the next event on it. +	 * Get the index in the ema table for this interrupt. The +	 * PREDICTION_FACTOR increase the interval size for the array +	 * of exponential average.  	 */ -	irqs->valid = 1; +	index = likely(interval) ? +		ilog2((interval >> 10) / PREDICTION_FACTOR) : 0;  	/* -	 * Online average algorithm: -	 * -	 *  new_average = average + ((value - average) / count) -	 * -	 * The variance computation depends on the new average -	 * to be computed here first. -	 * +	 * Store the index as an element of the pattern in another +	 * circular array.  	 
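To make the fixed-point arithmetic concrete: with EMA_ALPHA_VAL = 64 and EMA_ALPHA_SHIFT = 7 the update above amounts to new = old + (value - old) / 2, and the table index comes from ilog2((interval >> 10) / PREDICTION_FACTOR). A small userspace sketch, not part of the patch, with two intervals taken from example 1 and assumed to be expressed in nanoseconds:

#include <stdint.h>
#include <stdio.h>

#define ALPHA_VAL	64
#define ALPHA_SHIFT	7	/* alpha = 64 / 128 = 0.5 */
#define FACTOR		4	/* stand-in for PREDICTION_FACTOR */

/* ilog2 for nonzero values (the kernel uses the ilog2() helper). */
static int ilog2_u64(uint64_t v)
{
	int l = -1;

	while (v) {
		v >>= 1;
		l++;
	}
	return l;
}

static uint64_t ema_new(uint64_t value, uint64_t ema_old)
{
	int64_t diff;

	if (!ema_old)
		return value;

	diff = (int64_t)(value - ema_old) * ALPHA_VAL;
	return ema_old + (diff >> ALPHA_SHIFT);
}

int main(void)
{
	/* Two consecutive MMC intervals from example 1, in nanoseconds. */
	uint64_t ns1 = 1384000, ns2 = 1386000;
	uint64_t ema = 0;
	int idx;

	idx = ilog2_u64((ns1 >> 10) / FACTOR);	/* (1384000 >> 10) / 4 = 337 -> bucket 8 */
	ema = ema_new(ns1, ema);		/* first sample: ema = 1384000 */
	ema = ema_new(ns2, ema);		/* 1384000 + (2000 * 64 >> 7) = 1385000 */

	printf("bucket %d, ema %llu ns\n", idx, (unsigned long long)ema);
	return 0;
}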
*/ -	irqs->avg = irqs->avg + (diff >> IRQ_TIMINGS_SHIFT); +	irqs->circ_timings[irqs->count & IRQ_TIMINGS_MASK] = index; -	/* -	 * Online variance algorithm: -	 * -	 *  new_variance = variance + (value - average) x (value - new_average) -	 * -	 * Warning: irqs->avg is updated with the line above, hence -	 * 'interval - irqs->avg' is no longer equal to 'diff' -	 */ -	irqs->variance = irqs->variance + (diff * (interval - irqs->avg)); +	irqs->ema_time[index] = irq_timings_ema_new(interval, +						    irqs->ema_time[index]); -	/* -	 * Update the next event -	 */ -	irqs->next_evt = ts + irqs->avg; +	irqs->count++;  }  /** @@ -259,6 +476,9 @@ u64 irq_timings_next_event(u64 now)  	 */  	lockdep_assert_irqs_disabled(); +	if (!irqts->count) +		return next_evt; +  	/*  	 * Number of elements in the circular buffer: If it happens it  	 * was flushed before, then the number of elements could be @@ -269,21 +489,19 @@ u64 irq_timings_next_event(u64 now)  	 * type but with the cost of extra computation in the  	 * interrupt handler hot path. We choose efficiency.  	 * -	 * Inject measured irq/timestamp to the statistical model -	 * while decrementing the counter because we consume the data -	 * from our circular buffer. +	 * Inject measured irq/timestamp to the pattern prediction +	 * model while decrementing the counter because we consume the +	 * data from our circular buffer.  	 */ -	for (i = irqts->count & IRQ_TIMINGS_MASK, -		     irqts->count = min(IRQ_TIMINGS_SIZE, irqts->count); -	     irqts->count > 0; irqts->count--, i = (i + 1) & IRQ_TIMINGS_MASK) { -		irq = irq_timing_decode(irqts->values[i], &ts); +	i = (irqts->count & IRQ_TIMINGS_MASK) - 1; +	irqts->count = min(IRQ_TIMINGS_SIZE, irqts->count); +	for (; irqts->count > 0; irqts->count--, i = (i + 1) & IRQ_TIMINGS_MASK) { +		irq = irq_timing_decode(irqts->values[i], &ts);  		s = idr_find(&irqt_stats, irq); -		if (s) { -			irqs = this_cpu_ptr(s); -			irqs_update(irqs, ts); -		} +		if (s) +			irq_timings_store(irq, this_cpu_ptr(s), ts);  	}  	/* @@ -294,26 +512,12 @@ u64 irq_timings_next_event(u64 now)  		irqs = this_cpu_ptr(s); -		if (!irqs->valid) -			continue; +		ts = __irq_timings_next_event(irqs, i, now); +		if (ts <= now) +			return now; -		if (irqs->next_evt <= now) { -			irq = i; -			next_evt = now; - -			/* -			 * This interrupt mustn't use in the future -			 * until new events occur and update the -			 * statistics. -			 */ -			irqs->valid = 0; -			break; -		} - -		if (irqs->next_evt < next_evt) { -			irq = i; -			next_evt = irqs->next_evt; -		} +		if (ts < next_evt) +			next_evt = ts;  	}  	return next_evt; diff --git a/kernel/irq_work.c b/kernel/irq_work.c index 6b7cdf17ccf8..d42acaf81886 100644 --- a/kernel/irq_work.c +++ b/kernel/irq_work.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only  /*   * Copyright (C) 2010 Red Hat, Inc., Peter Zijlstra   * @@ -56,61 +57,70 @@ void __weak arch_irq_work_raise(void)  	 */  } -/* - * Enqueue the irq_work @work on @cpu unless it's already pending - * somewhere. - * - * Can be re-enqueued while the callback is still in progress. 
- */ -bool irq_work_queue_on(struct irq_work *work, int cpu) +/* Enqueue on current CPU, work must already be claimed and preempt disabled */ +static void __irq_work_queue_local(struct irq_work *work)  { -	/* All work should have been flushed before going offline */ -	WARN_ON_ONCE(cpu_is_offline(cpu)); - -#ifdef CONFIG_SMP - -	/* Arch remote IPI send/receive backend aren't NMI safe */ -	WARN_ON_ONCE(in_nmi()); +	/* If the work is "lazy", handle it from next tick if any */ +	if (work->flags & IRQ_WORK_LAZY) { +		if (llist_add(&work->llnode, this_cpu_ptr(&lazy_list)) && +		    tick_nohz_tick_stopped()) +			arch_irq_work_raise(); +	} else { +		if (llist_add(&work->llnode, this_cpu_ptr(&raised_list))) +			arch_irq_work_raise(); +	} +} +/* Enqueue the irq work @work on the current CPU */ +bool irq_work_queue(struct irq_work *work) +{  	/* Only queue if not already pending */  	if (!irq_work_claim(work))  		return false; -	if (llist_add(&work->llnode, &per_cpu(raised_list, cpu))) -		arch_send_call_function_single_ipi(cpu); - -#else /* #ifdef CONFIG_SMP */ -	irq_work_queue(work); -#endif /* #else #ifdef CONFIG_SMP */ +	/* Queue the entry and raise the IPI if needed. */ +	preempt_disable(); +	__irq_work_queue_local(work); +	preempt_enable();  	return true;  } +EXPORT_SYMBOL_GPL(irq_work_queue); -/* Enqueue the irq work @work on the current CPU */ -bool irq_work_queue(struct irq_work *work) +/* + * Enqueue the irq_work @work on @cpu unless it's already pending + * somewhere. + * + * Can be re-enqueued while the callback is still in progress. + */ +bool irq_work_queue_on(struct irq_work *work, int cpu)  { +#ifndef CONFIG_SMP +	return irq_work_queue(work); + +#else /* CONFIG_SMP: */ +	/* All work should have been flushed before going offline */ +	WARN_ON_ONCE(cpu_is_offline(cpu)); +  	/* Only queue if not already pending */  	if (!irq_work_claim(work))  		return false; -	/* Queue the entry and raise the IPI if needed. 
*/  	preempt_disable(); - -	/* If the work is "lazy", handle it from next tick if any */ -	if (work->flags & IRQ_WORK_LAZY) { -		if (llist_add(&work->llnode, this_cpu_ptr(&lazy_list)) && -		    tick_nohz_tick_stopped()) -			arch_irq_work_raise(); +	if (cpu != smp_processor_id()) { +		/* Arch remote IPI send/receive backend aren't NMI safe */ +		WARN_ON_ONCE(in_nmi()); +		if (llist_add(&work->llnode, &per_cpu(raised_list, cpu))) +			arch_send_call_function_single_ipi(cpu);  	} else { -		if (llist_add(&work->llnode, this_cpu_ptr(&raised_list))) -			arch_irq_work_raise(); +		__irq_work_queue_local(work);  	} -  	preempt_enable();  	return true; +#endif /* CONFIG_SMP */  } -EXPORT_SYMBOL_GPL(irq_work_queue); +  bool irq_work_needs_cpu(void)  { diff --git a/kernel/jump_label.c b/kernel/jump_label.c index bad96b476eb6..0bfa10f4410c 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only  /*   * jump label support   * @@ -202,11 +203,13 @@ void static_key_disable(struct static_key *key)  }  EXPORT_SYMBOL_GPL(static_key_disable); -static void __static_key_slow_dec_cpuslocked(struct static_key *key, -					   unsigned long rate_limit, -					   struct delayed_work *work) +static bool static_key_slow_try_dec(struct static_key *key)  { -	lockdep_assert_cpus_held(); +	int val; + +	val = atomic_fetch_add_unless(&key->enabled, -1, 1); +	if (val == 1) +		return false;  	/*  	 * The negative count check is valid even when a negative @@ -215,63 +218,70 @@ static void __static_key_slow_dec_cpuslocked(struct static_key *key,  	 * returns is unbalanced, because all other static_key_slow_inc()  	 * instances block while the update is in progress.  	 */ -	if (!atomic_dec_and_mutex_lock(&key->enabled, &jump_label_mutex)) { -		WARN(atomic_read(&key->enabled) < 0, -		     "jump label: negative count!\n"); +	WARN(val < 0, "jump label: negative count!\n"); +	return true; +} + +static void __static_key_slow_dec_cpuslocked(struct static_key *key) +{ +	lockdep_assert_cpus_held(); + +	if (static_key_slow_try_dec(key))  		return; -	} -	if (rate_limit) { -		atomic_inc(&key->enabled); -		schedule_delayed_work(work, rate_limit); -	} else { +	jump_label_lock(); +	if (atomic_dec_and_test(&key->enabled))  		jump_label_update(key); -	}  	jump_label_unlock();  } -static void __static_key_slow_dec(struct static_key *key, -				  unsigned long rate_limit, -				  struct delayed_work *work) +static void __static_key_slow_dec(struct static_key *key)  {  	cpus_read_lock(); -	__static_key_slow_dec_cpuslocked(key, rate_limit, work); +	__static_key_slow_dec_cpuslocked(key);  	cpus_read_unlock();  } -static void jump_label_update_timeout(struct work_struct *work) +void jump_label_update_timeout(struct work_struct *work)  {  	struct static_key_deferred *key =  		container_of(work, struct static_key_deferred, work.work); -	__static_key_slow_dec(&key->key, 0, NULL); +	__static_key_slow_dec(&key->key);  } +EXPORT_SYMBOL_GPL(jump_label_update_timeout);  void static_key_slow_dec(struct static_key *key)  {  	STATIC_KEY_CHECK_USE(key); -	__static_key_slow_dec(key, 0, NULL); +	__static_key_slow_dec(key);  }  EXPORT_SYMBOL_GPL(static_key_slow_dec);  void static_key_slow_dec_cpuslocked(struct static_key *key)  {  	STATIC_KEY_CHECK_USE(key); -	__static_key_slow_dec_cpuslocked(key, 0, NULL); +	__static_key_slow_dec_cpuslocked(key);  } -void static_key_slow_dec_deferred(struct static_key_deferred *key) +void __static_key_slow_dec_deferred(struct static_key *key, +				    struct delayed_work 
*work, +				    unsigned long timeout)  {  	STATIC_KEY_CHECK_USE(key); -	__static_key_slow_dec(&key->key, key->timeout, &key->work); + +	if (static_key_slow_try_dec(key)) +		return; + +	schedule_delayed_work(work, timeout);  } -EXPORT_SYMBOL_GPL(static_key_slow_dec_deferred); +EXPORT_SYMBOL_GPL(__static_key_slow_dec_deferred); -void static_key_deferred_flush(struct static_key_deferred *key) +void __static_key_deferred_flush(void *key, struct delayed_work *work)  {  	STATIC_KEY_CHECK_USE(key); -	flush_delayed_work(&key->work); +	flush_delayed_work(work);  } -EXPORT_SYMBOL_GPL(static_key_deferred_flush); +EXPORT_SYMBOL_GPL(__static_key_deferred_flush);  void jump_label_rate_limit(struct static_key_deferred *key,  		unsigned long rl) diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 14934afa9e68..95a260f9214b 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only  /*   * kallsyms.c: in-kernel printing of symbolic oopses and stack traces.   * diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index d7140447be75..fd5c95ff9251 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -1150,7 +1150,7 @@ int kernel_kexec(void)  		error = dpm_suspend_end(PMSG_FREEZE);  		if (error)  			goto Resume_devices; -		error = disable_nonboot_cpus(); +		error = suspend_disable_secondary_cpus();  		if (error)  			goto Enable_cpus;  		local_irq_disable(); @@ -1183,7 +1183,7 @@ int kernel_kexec(void)   Enable_irqs:  		local_irq_enable();   Enable_cpus: -		enable_nonboot_cpus(); +		suspend_enable_secondary_cpus();  		dpm_resume_start(PMSG_RESTORE);   Resume_devices:  		dpm_resume_end(PMSG_RESTORE); diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index f7fb8f6a688f..072b6ee55e3f 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -500,13 +500,7 @@ static int locate_mem_hole_callback(struct resource *res, void *arg)  	return locate_mem_hole_bottom_up(start, end, kbuf);  } -#ifdef CONFIG_ARCH_DISCARD_MEMBLOCK -static int kexec_walk_memblock(struct kexec_buf *kbuf, -			       int (*func)(struct resource *, void *)) -{ -	return 0; -} -#else +#ifdef CONFIG_ARCH_KEEP_MEMBLOCK  static int kexec_walk_memblock(struct kexec_buf *kbuf,  			       int (*func)(struct resource *, void *))  { @@ -550,6 +544,12 @@ static int kexec_walk_memblock(struct kexec_buf *kbuf,  	return ret;  } +#else +static int kexec_walk_memblock(struct kexec_buf *kbuf, +			       int (*func)(struct resource *, void *)) +{ +	return 0; +}  #endif  /** @@ -589,7 +589,7 @@ int kexec_locate_mem_hole(struct kexec_buf *kbuf)  	if (kbuf->mem != KEXEC_BUF_MEM_UNKNOWN)  		return 0; -	if (IS_ENABLED(CONFIG_ARCH_DISCARD_MEMBLOCK)) +	if (!IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK))  		ret = kexec_walk_resources(kbuf, locate_mem_hole_callback);  	else  		ret = kexec_walk_memblock(kbuf, locate_mem_hole_callback); diff --git a/kernel/kheaders.c b/kernel/kheaders.c new file mode 100644 index 000000000000..70ae6052920d --- /dev/null +++ b/kernel/kheaders.c @@ -0,0 +1,74 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Provide kernel headers useful to build tracing programs + * such as for running eBPF tracing tools. + * + * (Borrowed code from kernel/configs.c) + */ + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/proc_fs.h> +#include <linux/init.h> +#include <linux/uaccess.h> + +/* + * Define kernel_headers_data and kernel_headers_data_end, within which the + * compressed kernel headers are stored. The file is first compressed with xz. 
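For context on how this archive is meant to be consumed: ikheaders_init() below publishes it as /proc/kheaders.tar.xz, so a userspace consumer only needs to copy the file out and unpack it with tar. A minimal, illustrative copier (not part of the patch) might be:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	char buf[4096];
	ssize_t n;
	int in = open("/proc/kheaders.tar.xz", O_RDONLY);
	int out = open("kheaders.tar.xz", O_WRONLY | O_CREAT | O_TRUNC, 0644);

	if (in < 0 || out < 0) {
		perror("open");
		return 1;
	}

	/* simple_read_from_buffer() on the kernel side serves the blob. */
	while ((n = read(in, buf, sizeof(buf))) > 0) {
		if (write(out, buf, n) != n) {
			perror("write");
			return 1;
		}
	}

	close(in);
	close(out);
	return 0;
}

Extracting the resulting kheaders.tar.xz with tar should yield the header tree used to build the running kernel, which is what eBPF tracing tools need.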
+ */ + +asm ( +"	.pushsection .rodata, \"a\"		\n" +"	.global kernel_headers_data		\n" +"kernel_headers_data:				\n" +"	.incbin \"kernel/kheaders_data.tar.xz\"	\n" +"	.global kernel_headers_data_end		\n" +"kernel_headers_data_end:			\n" +"	.popsection				\n" +); + +extern char kernel_headers_data; +extern char kernel_headers_data_end; + +static ssize_t +ikheaders_read_current(struct file *file, char __user *buf, +		      size_t len, loff_t *offset) +{ +	return simple_read_from_buffer(buf, len, offset, +				       &kernel_headers_data, +				       &kernel_headers_data_end - +				       &kernel_headers_data); +} + +static const struct file_operations ikheaders_file_ops = { +	.read = ikheaders_read_current, +	.llseek = default_llseek, +}; + +static int __init ikheaders_init(void) +{ +	struct proc_dir_entry *entry; + +	/* create the current headers file */ +	entry = proc_create("kheaders.tar.xz", S_IRUGO, NULL, +			    &ikheaders_file_ops); +	if (!entry) +		return -ENOMEM; + +	proc_set_size(entry, +		      &kernel_headers_data_end - +		      &kernel_headers_data); +	return 0; +} + +static void __exit ikheaders_cleanup(void) +{ +	remove_proc_entry("kheaders.tar.xz", NULL); +} + +module_init(ikheaders_init); +module_exit(ikheaders_cleanup); + +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR("Joel Fernandes"); +MODULE_DESCRIPTION("Echo the kernel header artifacts used to build the kernel"); diff --git a/kernel/kprobes.c b/kernel/kprobes.c index c83e54727131..445337c107e0 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -1,21 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0-or-later  /*   *  Kernel Probes (KProbes)   *  kernel/kprobes.c   * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
- *   * Copyright (C) IBM Corporation, 2002, 2004   *   * 2002-Oct	Created by Vamsi Krishna S <vamsi_krishna@in.ibm.com> Kernel @@ -709,7 +696,6 @@ static void unoptimize_kprobe(struct kprobe *p, bool force)  static int reuse_unused_kprobe(struct kprobe *ap)  {  	struct optimized_kprobe *op; -	int ret;  	/*  	 * Unused kprobe MUST be on the way of delayed unoptimizing (means @@ -720,9 +706,8 @@ static int reuse_unused_kprobe(struct kprobe *ap)  	/* Enable the probe again */  	ap->flags &= ~KPROBE_FLAG_DISABLED;  	/* Optimize it again (remove from op->list) */ -	ret = kprobe_optready(ap); -	if (ret) -		return ret; +	if (!kprobe_optready(ap)) +		return -EINVAL;  	optimize_kprobe(ap);  	return 0; diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index 46ba853656f6..35859da8bd4f 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c @@ -1,11 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0-only  /*   * kernel/ksysfs.c - sysfs attributes in /sys/kernel, which   * 		     are not related to any other subsystem   *   * Copyright (C) 2004 Kay Sievers <kay.sievers@vrfy.org> - *  - * This file is release under the GPLv2 - *   */  #include <linux/kobject.h> diff --git a/kernel/kthread.c b/kernel/kthread.c index 5942eeafb9ac..621467c33fef 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only  /* Kernel thread helper functions.   *   Copyright (C) 2004 IBM Corporation, Rusty Russell.   * @@ -11,6 +12,7 @@  #include <linux/kthread.h>  #include <linux/completion.h>  #include <linux/err.h> +#include <linux/cgroup.h>  #include <linux/cpuset.h>  #include <linux/unistd.h>  #include <linux/file.h> diff --git a/kernel/latencytop.c b/kernel/latencytop.c index 96b4179cee6a..871734ea2f04 100644 --- a/kernel/latencytop.c +++ b/kernel/latencytop.c @@ -67,13 +67,10 @@ static struct latency_record latency_record[MAXLR];  int latencytop_enabled; -void clear_all_latency_tracing(struct task_struct *p) +void clear_tsk_latency_tracing(struct task_struct *p)  {  	unsigned long flags; -	if (!latencytop_enabled) -		return; -  	raw_spin_lock_irqsave(&latency_lock, flags);  	memset(&p->latency_record, 0, sizeof(p->latency_record));  	p->latency_record_count = 0; @@ -96,9 +93,6 @@ account_global_scheduler_latency(struct task_struct *tsk,  	int firstnonnull = MAXLR + 1;  	int i; -	if (!latencytop_enabled) -		return; -  	/* skip kernel threads for now */  	if (!tsk->mm)  		return; @@ -120,8 +114,8 @@ account_global_scheduler_latency(struct task_struct *tsk,  				break;  			} -			/* 0 and ULONG_MAX entries mean end of backtrace: */ -			if (record == 0 || record == ULONG_MAX) +			/* 0 entry marks end of backtrace: */ +			if (!record)  				break;  		}  		if (same) { @@ -141,20 +135,6 @@ account_global_scheduler_latency(struct task_struct *tsk,  	memcpy(&latency_record[i], lat, sizeof(struct latency_record));  } -/* - * Iterator to store a backtrace into a latency record entry - */ -static inline void store_stacktrace(struct task_struct *tsk, -					struct latency_record *lat) -{ -	struct stack_trace trace; - -	memset(&trace, 0, sizeof(trace)); -	trace.max_entries = LT_BACKTRACEDEPTH; -	trace.entries = &lat->backtrace[0]; -	save_stack_trace_tsk(tsk, &trace); -} -  /**   * __account_scheduler_latency - record an occurred latency   * @tsk - the task struct of the task hitting the latency @@ -191,7 +171,8 @@ __account_scheduler_latency(struct task_struct *tsk, int usecs, int inter)  	lat.count = 1;  	lat.time = usecs;  	lat.max = usecs; -	store_stacktrace(tsk, &lat); + +	
stack_trace_save_tsk(tsk, lat.backtrace, LT_BACKTRACEDEPTH, 0);  	raw_spin_lock_irqsave(&latency_lock, flags); @@ -210,8 +191,8 @@ __account_scheduler_latency(struct task_struct *tsk, int usecs, int inter)  				break;  			} -			/* 0 and ULONG_MAX entries mean end of backtrace: */ -			if (record == 0 || record == ULONG_MAX) +			/* 0 entry is end of backtrace */ +			if (!record)  				break;  		}  		if (same) { @@ -252,10 +233,10 @@ static int lstats_show(struct seq_file *m, void *v)  				   lr->count, lr->time, lr->max);  			for (q = 0; q < LT_BACKTRACEDEPTH; q++) {  				unsigned long bt = lr->backtrace[q]; +  				if (!bt)  					break; -				if (bt == ULONG_MAX) -					break; +  				seq_printf(m, " %ps", (void *)bt);  			}  			seq_puts(m, "\n"); diff --git a/kernel/livepatch/Kconfig b/kernel/livepatch/Kconfig index ec4565122e65..54102deb50ba 100644 --- a/kernel/livepatch/Kconfig +++ b/kernel/livepatch/Kconfig @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0-only  config HAVE_LIVEPATCH  	bool  	help diff --git a/kernel/livepatch/Makefile b/kernel/livepatch/Makefile index b36ceda6488e..cf9b5bcdb952 100644 --- a/kernel/livepatch/Makefile +++ b/kernel/livepatch/Makefile @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0-only  obj-$(CONFIG_LIVEPATCH) += livepatch.o  livepatch-objs := core.o patch.o shadow.o transition.o diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index eb0ee10a1981..2398832947c6 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -1,21 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0-or-later  /*   * core.c - Kernel Live Patching Core   *   * Copyright (C) 2014 Seth Jennings <sjenning@redhat.com>   * Copyright (C) 2014 SUSE - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, see <http://www.gnu.org/licenses/>.   */  #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt @@ -419,6 +407,7 @@ static struct attribute *klp_patch_attrs[] = {  	&force_kobj_attr.attr,  	NULL  }; +ATTRIBUTE_GROUPS(klp_patch);  static void klp_free_object_dynamic(struct klp_object *obj)  { @@ -426,7 +415,13 @@ static void klp_free_object_dynamic(struct klp_object *obj)  	kfree(obj);  } -static struct klp_object *klp_alloc_object_dynamic(const char *name) +static void klp_init_func_early(struct klp_object *obj, +				struct klp_func *func); +static void klp_init_object_early(struct klp_patch *patch, +				  struct klp_object *obj); + +static struct klp_object *klp_alloc_object_dynamic(const char *name, +						   struct klp_patch *patch)  {  	struct klp_object *obj; @@ -442,7 +437,7 @@ static struct klp_object *klp_alloc_object_dynamic(const char *name)  		}  	} -	INIT_LIST_HEAD(&obj->func_list); +	klp_init_object_early(patch, obj);  	obj->dynamic = true;  	return obj; @@ -471,6 +466,7 @@ static struct klp_func *klp_alloc_func_nop(struct klp_func *old_func,  		}  	} +	klp_init_func_early(obj, func);  	/*  	 * func->new_func is same as func->old_func. 
These addresses are  	 * set when the object is loaded, see klp_init_object_loaded(). @@ -490,11 +486,9 @@ static int klp_add_object_nops(struct klp_patch *patch,  	obj = klp_find_object(patch, old_obj);  	if (!obj) { -		obj = klp_alloc_object_dynamic(old_obj->name); +		obj = klp_alloc_object_dynamic(old_obj->name, patch);  		if (!obj)  			return -ENOMEM; - -		list_add_tail(&obj->node, &patch->obj_list);  	}  	klp_for_each_func(old_obj, old_func) { @@ -505,8 +499,6 @@ static int klp_add_object_nops(struct klp_patch *patch,  		func = klp_alloc_func_nop(old_func, obj);  		if (!func)  			return -ENOMEM; - -		list_add_tail(&func->node, &obj->func_list);  	}  	return 0; @@ -546,7 +538,7 @@ static void klp_kobj_release_patch(struct kobject *kobj)  static struct kobj_type klp_ktype_patch = {  	.release = klp_kobj_release_patch,  	.sysfs_ops = &kobj_sysfs_ops, -	.default_attrs = klp_patch_attrs, +	.default_groups = klp_patch_groups,  };  static void klp_kobj_release_object(struct kobject *kobj) @@ -588,13 +580,7 @@ static void __klp_free_funcs(struct klp_object *obj, bool nops_only)  			continue;  		list_del(&func->node); - -		/* Might be called from klp_init_patch() error path. */ -		if (func->kobj_added) { -			kobject_put(&func->kobj); -		} else if (func->nop) { -			klp_free_func_nop(func); -		} +		kobject_put(&func->kobj);  	}  } @@ -624,13 +610,7 @@ static void __klp_free_objects(struct klp_patch *patch, bool nops_only)  			continue;  		list_del(&obj->node); - -		/* Might be called from klp_init_patch() error path. */ -		if (obj->kobj_added) { -			kobject_put(&obj->kobj); -		} else if (obj->dynamic) { -			klp_free_object_dynamic(obj); -		} +		kobject_put(&obj->kobj);  	}  } @@ -675,10 +655,8 @@ static void klp_free_patch_finish(struct klp_patch *patch)  	 * this is called when the patch gets disabled and it  	 * cannot get enabled again.  	 */ -	if (patch->kobj_added) { -		kobject_put(&patch->kobj); -		wait_for_completion(&patch->finish); -	} +	kobject_put(&patch->kobj); +	wait_for_completion(&patch->finish);  	/* Put the module after the last access to struct klp_patch. */  	if (!patch->forced) @@ -700,8 +678,6 @@ static void klp_free_patch_work_fn(struct work_struct *work)  static int klp_init_func(struct klp_object *obj, struct klp_func *func)  { -	int ret; -  	if (!func->old_name)  		return -EINVAL; @@ -724,13 +700,9 @@ static int klp_init_func(struct klp_object *obj, struct klp_func *func)  	 * object. If the user selects 0 for old_sympos, then 1 will be used  	 * since a unique symbol will be the first occurrence.  	 */ -	ret = kobject_init_and_add(&func->kobj, &klp_ktype_func, -				   &obj->kobj, "%s,%lu", func->old_name, -				   func->old_sympos ? func->old_sympos : 1); -	if (!ret) -		func->kobj_added = true; - -	return ret; +	return kobject_add(&func->kobj, &obj->kobj, "%s,%lu", +			   func->old_name, +			   func->old_sympos ? func->old_sympos : 1);  }  /* Arches may override this to finish any remaining arch-specific tasks */ @@ -801,11 +773,9 @@ static int klp_init_object(struct klp_patch *patch, struct klp_object *obj)  	klp_find_object_module(obj);  	name = klp_is_module(obj) ? 
obj->name : "vmlinux"; -	ret = kobject_init_and_add(&obj->kobj, &klp_ktype_object, -				   &patch->kobj, "%s", name); +	ret = kobject_add(&obj->kobj, &patch->kobj, "%s", name);  	if (ret)  		return ret; -	obj->kobj_added = true;  	klp_for_each_func(obj, func) {  		ret = klp_init_func(obj, func); @@ -819,6 +789,21 @@ static int klp_init_object(struct klp_patch *patch, struct klp_object *obj)  	return ret;  } +static void klp_init_func_early(struct klp_object *obj, +				struct klp_func *func) +{ +	kobject_init(&func->kobj, &klp_ktype_func); +	list_add_tail(&func->node, &obj->func_list); +} + +static void klp_init_object_early(struct klp_patch *patch, +				  struct klp_object *obj) +{ +	INIT_LIST_HEAD(&obj->func_list); +	kobject_init(&obj->kobj, &klp_ktype_object); +	list_add_tail(&obj->node, &patch->obj_list); +} +  static int klp_init_patch_early(struct klp_patch *patch)  {  	struct klp_object *obj; @@ -829,7 +814,7 @@ static int klp_init_patch_early(struct klp_patch *patch)  	INIT_LIST_HEAD(&patch->list);  	INIT_LIST_HEAD(&patch->obj_list); -	patch->kobj_added = false; +	kobject_init(&patch->kobj, &klp_ktype_patch);  	patch->enabled = false;  	patch->forced = false;  	INIT_WORK(&patch->free_work, klp_free_patch_work_fn); @@ -839,13 +824,10 @@ static int klp_init_patch_early(struct klp_patch *patch)  		if (!obj->funcs)  			return -EINVAL; -		INIT_LIST_HEAD(&obj->func_list); -		obj->kobj_added = false; -		list_add_tail(&obj->node, &patch->obj_list); +		klp_init_object_early(patch, obj);  		klp_for_each_func_static(obj, func) { -			func->kobj_added = false; -			list_add_tail(&func->node, &obj->func_list); +			klp_init_func_early(obj, func);  		}  	} @@ -860,11 +842,9 @@ static int klp_init_patch(struct klp_patch *patch)  	struct klp_object *obj;  	int ret; -	ret = kobject_init_and_add(&patch->kobj, &klp_ktype_patch, -				   klp_root_kobj, "%s", patch->mod->name); +	ret = kobject_add(&patch->kobj, klp_root_kobj, "%s", patch->mod->name);  	if (ret)  		return ret; -	patch->kobj_added = true;  	if (patch->replace) {  		ret = klp_add_nops(patch); @@ -926,9 +906,6 @@ static int __klp_enable_patch(struct klp_patch *patch)  	if (WARN_ON(patch->enabled))  		return -EINVAL; -	if (!patch->kobj_added) -		return -EINVAL; -  	pr_notice("enabling patch '%s'\n", patch->mod->name);  	klp_init_transition(patch, KLP_PATCHED); @@ -1003,11 +980,10 @@ int klp_enable_patch(struct klp_patch *patch)  		return -ENODEV;  	if (!klp_have_reliable_stack()) { -		pr_err("This architecture doesn't have support for the livepatch consistency model.\n"); -		return -EOPNOTSUPP; +		pr_warn("This architecture doesn't have support for the livepatch consistency model.\n"); +		pr_warn("The livepatch transition may never complete.\n");  	} -  	mutex_lock(&klp_mutex);  	ret = klp_init_patch_early(patch); @@ -1220,14 +1196,6 @@ void klp_module_going(struct module *mod)  static int __init klp_init(void)  { -	int ret; - -	ret = klp_check_compiler_support(); -	if (ret) { -		pr_info("Your compiler is too old; turning off.\n"); -		return -EINVAL; -	} -  	klp_root_kobj = kobject_create_and_add("livepatch", kernel_kobj);  	if (!klp_root_kobj)  		return -ENOMEM; diff --git a/kernel/livepatch/patch.c b/kernel/livepatch/patch.c index 99cb3ad05eb4..bd43537702bd 100644 --- a/kernel/livepatch/patch.c +++ b/kernel/livepatch/patch.c @@ -1,22 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0-or-later  /*   * patch.c - livepatch patching functions   *   * Copyright (C) 2014 Seth Jennings <sjenning@redhat.com>   * Copyright (C) 2014 SUSE   * Copyright (C) 
2015 Josh Poimboeuf <jpoimboe@redhat.com> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, see <http://www.gnu.org/licenses/>.   */  #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt diff --git a/kernel/livepatch/shadow.c b/kernel/livepatch/shadow.c index 83958c814439..e5c9fb295ba9 100644 --- a/kernel/livepatch/shadow.c +++ b/kernel/livepatch/shadow.c @@ -1,22 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0-or-later  /*   * shadow.c - Shadow Variables   *   * Copyright (C) 2014 Josh Poimboeuf <jpoimboe@redhat.com>   * Copyright (C) 2014 Seth Jennings <sjenning@redhat.com>   * Copyright (C) 2017 Joe Lawrence <joe.lawrence@redhat.com> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, see <http://www.gnu.org/licenses/>.   */  /** diff --git a/kernel/livepatch/transition.c b/kernel/livepatch/transition.c index 9c89ae8b337a..abb2a4a2cbb2 100644 --- a/kernel/livepatch/transition.c +++ b/kernel/livepatch/transition.c @@ -1,20 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0-or-later  /*   * transition.c - Kernel Live Patching transition functions   *   * Copyright (C) 2015-2016 Josh Poimboeuf <jpoimboe@redhat.com> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, see <http://www.gnu.org/licenses/>.   */  #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt @@ -202,15 +190,15 @@ void klp_update_patch_state(struct task_struct *task)   * Determine whether the given stack trace includes any references to a   * to-be-patched or to-be-unpatched function.   
*/ -static int klp_check_stack_func(struct klp_func *func, -				struct stack_trace *trace) +static int klp_check_stack_func(struct klp_func *func, unsigned long *entries, +				unsigned int nr_entries)  {  	unsigned long func_addr, func_size, address;  	struct klp_ops *ops;  	int i; -	for (i = 0; i < trace->nr_entries; i++) { -		address = trace->entries[i]; +	for (i = 0; i < nr_entries; i++) { +		address = entries[i];  		if (klp_target_state == KLP_UNPATCHED) {  			 /* @@ -254,29 +242,25 @@ static int klp_check_stack_func(struct klp_func *func,  static int klp_check_stack(struct task_struct *task, char *err_buf)  {  	static unsigned long entries[MAX_STACK_ENTRIES]; -	struct stack_trace trace;  	struct klp_object *obj;  	struct klp_func *func; -	int ret; +	int ret, nr_entries; -	trace.skip = 0; -	trace.nr_entries = 0; -	trace.max_entries = MAX_STACK_ENTRIES; -	trace.entries = entries; -	ret = save_stack_trace_tsk_reliable(task, &trace); +	ret = stack_trace_save_tsk_reliable(task, entries, ARRAY_SIZE(entries));  	WARN_ON_ONCE(ret == -ENOSYS); -	if (ret) { +	if (ret < 0) {  		snprintf(err_buf, STACK_ERR_BUF_SIZE,  			 "%s: %s:%d has an unreliable stack\n",  			 __func__, task->comm, task->pid);  		return ret;  	} +	nr_entries = ret;  	klp_for_each_object(klp_transition_patch, obj) {  		if (!obj->patched)  			continue;  		klp_for_each_func(obj, func) { -			ret = klp_check_stack_func(func, &trace); +			ret = klp_check_stack_func(func, entries, nr_entries);  			if (ret) {  				snprintf(err_buf, STACK_ERR_BUF_SIZE,  					 "%s: %s:%d is sleeping on function %s\n", diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile index 392c7f23af76..6fe2f333aecb 100644 --- a/kernel/locking/Makefile +++ b/kernel/locking/Makefile @@ -3,7 +3,7 @@  # and is generally not a function of system call inputs.  KCOV_INSTRUMENT		:= n -obj-y += mutex.o semaphore.o rwsem.o percpu-rwsem.o +obj-y += mutex.o semaphore.o rwsem.o percpu-rwsem.o rwsem-xadd.o  ifdef CONFIG_FUNCTION_TRACER  CFLAGS_REMOVE_lockdep.o = $(CC_FLAGS_FTRACE) @@ -25,8 +25,7 @@ obj-$(CONFIG_RT_MUTEXES) += rtmutex.o  obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o  obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o  obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock_debug.o -obj-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o -obj-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem-xadd.o  obj-$(CONFIG_QUEUED_RWLOCKS) += qrwlock.o  obj-$(CONFIG_LOCK_TORTURE_TEST) += locktorture.o  obj-$(CONFIG_WW_MUTEX_SELFTEST) += test-ww_mutex.o +obj-$(CONFIG_LOCK_EVENT_COUNTS) += lock_events.o diff --git a/kernel/locking/lock_events.c b/kernel/locking/lock_events.c new file mode 100644 index 000000000000..fa2c2f951c6b --- /dev/null +++ b/kernel/locking/lock_events.c @@ -0,0 +1,179 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the + * GNU General Public License for more details. 
+ * + * Authors: Waiman Long <waiman.long@hpe.com> + */ + +/* + * Collect locking event counts + */ +#include <linux/debugfs.h> +#include <linux/sched.h> +#include <linux/sched/clock.h> +#include <linux/fs.h> + +#include "lock_events.h" + +#undef  LOCK_EVENT +#define LOCK_EVENT(name)	[LOCKEVENT_ ## name] = #name, + +#define LOCK_EVENTS_DIR		"lock_event_counts" + +/* + * When CONFIG_LOCK_EVENT_COUNTS is enabled, event counts of different + * types of locks will be reported under the <debugfs>/lock_event_counts/ + * directory. See lock_events_list.h for the list of available locking + * events. + * + * Writing to the special ".reset_counts" file will reset all the above + * locking event counts. This is a very slow operation and so should not + * be done frequently. + * + * These event counts are implemented as per-cpu variables which are + * summed and computed whenever the corresponding debugfs files are read. This + * minimizes added overhead making the counts usable even in a production + * environment. + */ +static const char * const lockevent_names[lockevent_num + 1] = { + +#include "lock_events_list.h" + +	[LOCKEVENT_reset_cnts] = ".reset_counts", +}; + +/* + * Per-cpu counts + */ +DEFINE_PER_CPU(unsigned long, lockevents[lockevent_num]); + +/* + * The lockevent_read() function can be overridden. + */ +ssize_t __weak lockevent_read(struct file *file, char __user *user_buf, +			      size_t count, loff_t *ppos) +{ +	char buf[64]; +	int cpu, id, len; +	u64 sum = 0; + +	/* +	 * Get the counter ID stored in file->f_inode->i_private +	 */ +	id = (long)file_inode(file)->i_private; + +	if (id >= lockevent_num) +		return -EBADF; + +	for_each_possible_cpu(cpu) +		sum += per_cpu(lockevents[id], cpu); +	len = snprintf(buf, sizeof(buf) - 1, "%llu\n", sum); + +	return simple_read_from_buffer(user_buf, count, ppos, buf, len); +} + +/* + * Function to handle write request + * + * When idx = reset_cnts, reset all the counts. + */ +static ssize_t lockevent_write(struct file *file, const char __user *user_buf, +			   size_t count, loff_t *ppos) +{ +	int cpu; + +	/* +	 * Get the counter ID stored in file->f_inode->i_private +	 */ +	if ((long)file_inode(file)->i_private != LOCKEVENT_reset_cnts) +		return count; + +	for_each_possible_cpu(cpu) { +		int i; +		unsigned long *ptr = per_cpu_ptr(lockevents, cpu); + +		for (i = 0 ; i < lockevent_num; i++) +			WRITE_ONCE(ptr[i], 0); +	} +	return count; +} + +/* + * Debugfs data structures + */ +static const struct file_operations fops_lockevent = { +	.read = lockevent_read, +	.write = lockevent_write, +	.llseek = default_llseek, +}; + +#ifdef CONFIG_PARAVIRT_SPINLOCKS +#include <asm/paravirt.h> + +static bool __init skip_lockevent(const char *name) +{ +	static int pv_on __initdata = -1; + +	if (pv_on < 0) +		pv_on = !pv_is_native_spin_unlock(); +	/* +	 * Skip PV qspinlock events on bare metal. +	 */ +	if (!pv_on && !memcmp(name, "pv_", 3)) +		return true; +	return false; +} +#else +static inline bool skip_lockevent(const char *name) +{ +	return false; +} +#endif + +/* + * Initialize debugfs for the locking event counts. + */ +static int __init init_lockevent_counts(void) +{ +	struct dentry *d_counts = debugfs_create_dir(LOCK_EVENTS_DIR, NULL); +	int i; + +	if (!d_counts) +		goto out; + +	/* +	 * Create the debugfs files +	 * +	 * As reading from and writing to the stat files can be slow, only +	 * root is allowed to do the read/write to limit impact to system +	 * performance. 
+	 */ +	for (i = 0; i < lockevent_num; i++) { +		if (skip_lockevent(lockevent_names[i])) +			continue; +		if (!debugfs_create_file(lockevent_names[i], 0400, d_counts, +					 (void *)(long)i, &fops_lockevent)) +			goto fail_undo; +	} + +	if (!debugfs_create_file(lockevent_names[LOCKEVENT_reset_cnts], 0200, +				 d_counts, (void *)(long)LOCKEVENT_reset_cnts, +				 &fops_lockevent)) +		goto fail_undo; + +	return 0; +fail_undo: +	debugfs_remove_recursive(d_counts); +out: +	pr_warn("Could not create '%s' debugfs entries\n", LOCK_EVENTS_DIR); +	return -ENOMEM; +} +fs_initcall(init_lockevent_counts); diff --git a/kernel/locking/lock_events.h b/kernel/locking/lock_events.h new file mode 100644 index 000000000000..46b71af8eef2 --- /dev/null +++ b/kernel/locking/lock_events.h @@ -0,0 +1,97 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the + * GNU General Public License for more details. + * + * Authors: Waiman Long <longman@redhat.com> + */ + +#ifndef __LOCKING_LOCK_EVENTS_H +#define __LOCKING_LOCK_EVENTS_H + +enum lock_events { + +#include "lock_events_list.h" + +	lockevent_num,	/* Total number of lock event counts */ +	LOCKEVENT_reset_cnts = lockevent_num, +}; + +#ifdef CONFIG_LOCK_EVENT_COUNTS +/* + * Per-cpu counters + */ +DECLARE_PER_CPU(unsigned long, lockevents[lockevent_num]); + +/* + * The purpose of the lock event counting subsystem is to provide a low + * overhead way to record the number of specific locking events by using + * percpu counters. It is the percpu sum that matters, not specifically + * how many of them happens in each cpu. + * + * It is possible that the same percpu counter may be modified in both + * the process and interrupt contexts. For architectures that perform + * percpu operation with multiple instructions, it is possible to lose + * count if a process context percpu update is interrupted in the middle + * and the same counter is updated in the interrupt context. Therefore, + * the generated percpu sum may not be precise. The error, if any, should + * be small and insignificant. + * + * For those architectures that do multi-instruction percpu operation, + * preemption in the middle and moving the task to another cpu may cause + * a larger error in the count. Again, this will be few and far between. + * Given the imprecise nature of the count and the possibility of resetting + * the count and doing the measurement again, this is not really a big + * problem. + * + * To get a better picture of what is happening under the hood, it is + * suggested that a few measurements should be taken with the counts + * reset in between to stamp out outliner because of these possible + * error conditions. + * + * To minimize overhead, we use __this_cpu_*() in all cases except when + * CONFIG_DEBUG_PREEMPT is defined. In this particular case, this_cpu_*() + * will be used to avoid the appearance of unwanted BUG messages. 
+ */ +#ifdef CONFIG_DEBUG_PREEMPT +#define lockevent_percpu_inc(x)		this_cpu_inc(x) +#define lockevent_percpu_add(x, v)	this_cpu_add(x, v) +#else +#define lockevent_percpu_inc(x)		__this_cpu_inc(x) +#define lockevent_percpu_add(x, v)	__this_cpu_add(x, v) +#endif + +/* + * Increment the PV qspinlock statistical counters + */ +static inline void __lockevent_inc(enum lock_events event, bool cond) +{ +	if (cond) +		lockevent_percpu_inc(lockevents[event]); +} + +#define lockevent_inc(ev)	  __lockevent_inc(LOCKEVENT_ ##ev, true) +#define lockevent_cond_inc(ev, c) __lockevent_inc(LOCKEVENT_ ##ev, c) + +static inline void __lockevent_add(enum lock_events event, int inc) +{ +	lockevent_percpu_add(lockevents[event], inc); +} + +#define lockevent_add(ev, c)	__lockevent_add(LOCKEVENT_ ##ev, c) + +#else  /* CONFIG_LOCK_EVENT_COUNTS */ + +#define lockevent_inc(ev) +#define lockevent_add(ev, c) +#define lockevent_cond_inc(ev, c) + +#endif /* CONFIG_LOCK_EVENT_COUNTS */ +#endif /* __LOCKING_LOCK_EVENTS_H */ diff --git a/kernel/locking/lock_events_list.h b/kernel/locking/lock_events_list.h new file mode 100644 index 000000000000..ad7668cfc9da --- /dev/null +++ b/kernel/locking/lock_events_list.h @@ -0,0 +1,67 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the + * GNU General Public License for more details. + * + * Authors: Waiman Long <longman@redhat.com> + */ + +#ifndef LOCK_EVENT +#define LOCK_EVENT(name)	LOCKEVENT_ ## name, +#endif + +#ifdef CONFIG_QUEUED_SPINLOCKS +#ifdef CONFIG_PARAVIRT_SPINLOCKS +/* + * Locking events for PV qspinlock. + */ +LOCK_EVENT(pv_hash_hops)	/* Average # of hops per hashing operation */ +LOCK_EVENT(pv_kick_unlock)	/* # of vCPU kicks issued at unlock time   */ +LOCK_EVENT(pv_kick_wake)	/* # of vCPU kicks for pv_latency_wake	   */ +LOCK_EVENT(pv_latency_kick)	/* Average latency (ns) of vCPU kick	   */ +LOCK_EVENT(pv_latency_wake)	/* Average latency (ns) of kick-to-wakeup  */ +LOCK_EVENT(pv_lock_stealing)	/* # of lock stealing operations	   */ +LOCK_EVENT(pv_spurious_wakeup)	/* # of spurious wakeups in non-head vCPUs */ +LOCK_EVENT(pv_wait_again)	/* # of wait's after queue head vCPU kick  */ +LOCK_EVENT(pv_wait_early)	/* # of early vCPU wait's		   */ +LOCK_EVENT(pv_wait_head)	/* # of vCPU wait's at the queue head	   */ +LOCK_EVENT(pv_wait_node)	/* # of vCPU wait's at non-head queue node */ +#endif /* CONFIG_PARAVIRT_SPINLOCKS */ + +/* + * Locking events for qspinlock + * + * Subtracting lock_use_node[234] from lock_slowpath will give you + * lock_use_node1. 
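The lockevent_inc()/lockevent_add() helpers above deliberately trade a little precision for near-zero cost: each CPU bumps its own slot without any locking, and the totals are only folded together when the debugfs file is read. A small userspace model of that scheme (illustrative only, not the patch's code; NR_CPUS, NR_EVENTS and the event names are made up for the example):

#include <stdio.h>

#define NR_CPUS		4
#define NR_EVENTS	2
enum { EV_SLOWPATH, EV_WAIT };	/* stand-ins for the LOCK_EVENT() list */

static unsigned long events[NR_CPUS][NR_EVENTS];

static void event_inc(int cpu, int ev)
{
	events[cpu][ev]++;		/* per-CPU slot, no shared cacheline */
}

static unsigned long event_sum(int ev)
{
	unsigned long sum = 0;
	int cpu;

	for (cpu = 0; cpu < NR_CPUS; cpu++)
		sum += events[cpu][ev];	/* summed only at read time */
	return sum;
}

int main(void)
{
	event_inc(0, EV_SLOWPATH);
	event_inc(1, EV_SLOWPATH);
	event_inc(1, EV_WAIT);

	/* Prints slowpath=2 wait=1, mirroring how lockevent_read() sums. */
	printf("slowpath=%lu wait=%lu\n",
	       event_sum(EV_SLOWPATH), event_sum(EV_WAIT));
	return 0;
}

In the kernel the call sites are just as terse, e.g. lockevent_inc(rwsem_sleep_reader) in a slowpath, and compile away entirely when CONFIG_LOCK_EVENT_COUNTS is off.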
+ */ +LOCK_EVENT(lock_pending)	/* # of locking ops via pending code	     */ +LOCK_EVENT(lock_slowpath)	/* # of locking ops via MCS lock queue	     */ +LOCK_EVENT(lock_use_node2)	/* # of locking ops that use 2nd percpu node */ +LOCK_EVENT(lock_use_node3)	/* # of locking ops that use 3rd percpu node */ +LOCK_EVENT(lock_use_node4)	/* # of locking ops that use 4th percpu node */ +LOCK_EVENT(lock_no_node)	/* # of locking ops w/o using percpu node    */ +#endif /* CONFIG_QUEUED_SPINLOCKS */ + +/* + * Locking events for rwsem + */ +LOCK_EVENT(rwsem_sleep_reader)	/* # of reader sleeps			*/ +LOCK_EVENT(rwsem_sleep_writer)	/* # of writer sleeps			*/ +LOCK_EVENT(rwsem_wake_reader)	/* # of reader wakeups			*/ +LOCK_EVENT(rwsem_wake_writer)	/* # of writer wakeups			*/ +LOCK_EVENT(rwsem_opt_wlock)	/* # of write locks opt-spin acquired	*/ +LOCK_EVENT(rwsem_opt_fail)	/* # of failed opt-spinnings		*/ +LOCK_EVENT(rwsem_rlock)		/* # of read locks acquired		*/ +LOCK_EVENT(rwsem_rlock_fast)	/* # of fast read locks acquired	*/ +LOCK_EVENT(rwsem_rlock_fail)	/* # of failed read lock acquisitions	*/ +LOCK_EVENT(rwsem_rtrylock)	/* # of read trylock calls		*/ +LOCK_EVENT(rwsem_wlock)		/* # of write locks acquired		*/ +LOCK_EVENT(rwsem_wlock_fail)	/* # of failed write lock acquisitions	*/ +LOCK_EVENT(rwsem_wtrylock)	/* # of write trylock calls		*/ diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 34cdcbedda49..c47788fa85f9 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only  /*   * kernel/lockdep.c   * @@ -434,29 +435,14 @@ static void print_lockdep_off(const char *bug_msg)  #endif  } -static int save_trace(struct stack_trace *trace) +static int save_trace(struct lock_trace *trace)  { -	trace->nr_entries = 0; -	trace->max_entries = MAX_STACK_TRACE_ENTRIES - nr_stack_trace_entries; -	trace->entries = stack_trace + nr_stack_trace_entries; - -	trace->skip = 3; - -	save_stack_trace(trace); - -	/* -	 * Some daft arches put -1 at the end to indicate its a full trace. -	 * -	 * <rant> this is buggy anyway, since it takes a whole extra entry so a -	 * complete trace that maxes out the entries provided will be reported -	 * as incomplete, friggin useless </rant> -	 */ -	if (trace->nr_entries != 0 && -	    trace->entries[trace->nr_entries-1] == ULONG_MAX) -		trace->nr_entries--; - -	trace->max_entries = trace->nr_entries; +	unsigned long *entries = stack_trace + nr_stack_trace_entries; +	unsigned int max_entries; +	trace->offset = nr_stack_trace_entries; +	max_entries = MAX_STACK_TRACE_ENTRIES - nr_stack_trace_entries; +	trace->nr_entries = stack_trace_save(entries, max_entries, 3);  	nr_stack_trace_entries += trace->nr_entries;  	if (nr_stack_trace_entries >= MAX_STACK_TRACE_ENTRIES-1) { @@ -516,11 +502,11 @@ static char get_usage_char(struct lock_class *class, enum lock_usage_bit bit)  {  	char c = '.'; -	if (class->usage_mask & lock_flag(bit + 2)) +	if (class->usage_mask & lock_flag(bit + LOCK_USAGE_DIR_MASK))  		c = '+';  	if (class->usage_mask & lock_flag(bit)) {  		c = '-'; -		if (class->usage_mask & lock_flag(bit + 2)) +		if (class->usage_mask & lock_flag(bit + LOCK_USAGE_DIR_MASK))  			c = '?';  	} @@ -649,6 +635,9 @@ static int static_obj(const void *obj)  		      end   = (unsigned long) &_end,  		      addr  = (unsigned long) obj; +	if (arch_is_kernel_initmem_freed(addr)) +		return 0; +  	/*  	 * static variable?  	 
*/ @@ -1207,7 +1196,7 @@ static struct lock_list *alloc_list_entry(void)  static int add_lock_to_list(struct lock_class *this,  			    struct lock_class *links_to, struct list_head *head,  			    unsigned long ip, int distance, -			    struct stack_trace *trace) +			    struct lock_trace *trace)  {  	struct lock_list *entry;  	/* @@ -1426,6 +1415,13 @@ static inline int __bfs_backwards(struct lock_list *src_entry,   * checking.   */ +static void print_lock_trace(struct lock_trace *trace, unsigned int spaces) +{ +	unsigned long *entries = stack_trace + trace->offset; + +	stack_trace_print(entries, trace->nr_entries, spaces); +} +  /*   * Print a dependency chain entry (this is only done when a deadlock   * has been detected): @@ -1438,8 +1434,7 @@ print_circular_bug_entry(struct lock_list *target, int depth)  	printk("\n-> #%u", depth);  	print_lock_name(target->class);  	printk(KERN_CONT ":\n"); -	print_stack_trace(&target->trace, 6); - +	print_lock_trace(&target->trace, 6);  	return 0;  } @@ -1533,10 +1528,9 @@ static inline int class_equal(struct lock_list *entry, void *data)  }  static noinline int print_circular_bug(struct lock_list *this, -				struct lock_list *target, -				struct held_lock *check_src, -				struct held_lock *check_tgt, -				struct stack_trace *trace) +				       struct lock_list *target, +				       struct held_lock *check_src, +				       struct held_lock *check_tgt)  {  	struct task_struct *curr = current;  	struct lock_list *parent; @@ -1676,19 +1670,25 @@ check_redundant(struct lock_list *root, struct lock_class *target,  }  #if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) + +static inline int usage_accumulate(struct lock_list *entry, void *mask) +{ +	*(unsigned long *)mask |= entry->class->usage_mask; + +	return 0; +} +  /*   * Forwards and backwards subgraph searching, for the purposes of   * proving that two subgraphs can be connected by a new dependency   * without creating any illegal irq-safe -> irq-unsafe lock dependency.   */ -static inline int usage_match(struct lock_list *entry, void *bit) +static inline int usage_match(struct lock_list *entry, void *mask)  { -	return entry->class->usage_mask & (1 << (enum lock_usage_bit)bit); +	return entry->class->usage_mask & *(unsigned long *)mask;  } - -  /*   * Find a node in the forwards-direction dependency sub-graph starting   * at @root->class that matches @bit. @@ -1700,14 +1700,14 @@ static inline int usage_match(struct lock_list *entry, void *bit)   * Return <0 on error.   */  static int -find_usage_forwards(struct lock_list *root, enum lock_usage_bit bit, +find_usage_forwards(struct lock_list *root, unsigned long usage_mask,  			struct lock_list **target_entry)  {  	int result;  	debug_atomic_inc(nr_find_usage_forwards_checks); -	result = __bfs_forwards(root, (void *)bit, usage_match, target_entry); +	result = __bfs_forwards(root, &usage_mask, usage_match, target_entry);  	return result;  } @@ -1723,14 +1723,14 @@ find_usage_forwards(struct lock_list *root, enum lock_usage_bit bit,   * Return <0 on error.   
*/  static int -find_usage_backwards(struct lock_list *root, enum lock_usage_bit bit, +find_usage_backwards(struct lock_list *root, unsigned long usage_mask,  			struct lock_list **target_entry)  {  	int result;  	debug_atomic_inc(nr_find_usage_backwards_checks); -	result = __bfs_backwards(root, (void *)bit, usage_match, target_entry); +	result = __bfs_backwards(root, &usage_mask, usage_match, target_entry);  	return result;  } @@ -1752,7 +1752,7 @@ static void print_lock_class_header(struct lock_class *class, int depth)  			len += printk("%*s   %s", depth, "", usage_str[bit]);  			len += printk(KERN_CONT " at:\n"); -			print_stack_trace(class->usage_traces + bit, len); +			print_lock_trace(class->usage_traces + bit, len);  		}  	}  	printk("%*s }\n", depth, ""); @@ -1777,7 +1777,7 @@ print_shortest_lock_dependencies(struct lock_list *leaf,  	do {  		print_lock_class_header(entry->class, depth);  		printk("%*s ... acquired at:\n", depth, ""); -		print_stack_trace(&entry->trace, 2); +		print_lock_trace(&entry->trace, 2);  		printk("\n");  		if (depth == 0 && (entry != root)) { @@ -1890,14 +1890,14 @@ print_bad_irq_dependency(struct task_struct *curr,  	print_lock_name(backwards_entry->class);  	pr_warn("\n... which became %s-irq-safe at:\n", irqclass); -	print_stack_trace(backwards_entry->class->usage_traces + bit1, 1); +	print_lock_trace(backwards_entry->class->usage_traces + bit1, 1);  	pr_warn("\nto a %s-irq-unsafe lock:\n", irqclass);  	print_lock_name(forwards_entry->class);  	pr_warn("\n... which became %s-irq-unsafe at:\n", irqclass);  	pr_warn("..."); -	print_stack_trace(forwards_entry->class->usage_traces + bit2, 1); +	print_lock_trace(forwards_entry->class->usage_traces + bit2, 1);  	pr_warn("\nother info that might help us debug this:\n\n");  	print_irq_lock_scenario(backwards_entry, forwards_entry, @@ -1922,39 +1922,6 @@ print_bad_irq_dependency(struct task_struct *curr,  	return 0;  } -static int -check_usage(struct task_struct *curr, struct held_lock *prev, -	    struct held_lock *next, enum lock_usage_bit bit_backwards, -	    enum lock_usage_bit bit_forwards, const char *irqclass) -{ -	int ret; -	struct lock_list this, that; -	struct lock_list *uninitialized_var(target_entry); -	struct lock_list *uninitialized_var(target_entry1); - -	this.parent = NULL; - -	this.class = hlock_class(prev); -	ret = find_usage_backwards(&this, bit_backwards, &target_entry); -	if (ret < 0) -		return print_bfs_bug(ret); -	if (ret == 1) -		return ret; - -	that.parent = NULL; -	that.class = hlock_class(next); -	ret = find_usage_forwards(&that, bit_forwards, &target_entry1); -	if (ret < 0) -		return print_bfs_bug(ret); -	if (ret == 1) -		return ret; - -	return print_bad_irq_dependency(curr, &this, &that, -			target_entry, target_entry1, -			prev, next, -			bit_backwards, bit_forwards, irqclass); -} -  static const char *state_names[] = {  #define LOCKDEP_STATE(__STATE) \  	__stringify(__STATE), @@ -1971,9 +1938,19 @@ static const char *state_rnames[] = {  static inline const char *state_name(enum lock_usage_bit bit)  { -	return (bit & LOCK_USAGE_READ_MASK) ? 
state_rnames[bit >> 2] : state_names[bit >> 2]; +	if (bit & LOCK_USAGE_READ_MASK) +		return state_rnames[bit >> LOCK_USAGE_DIR_MASK]; +	else +		return state_names[bit >> LOCK_USAGE_DIR_MASK];  } +/* + * The bit number is encoded like: + * + *  bit0: 0 exclusive, 1 read lock + *  bit1: 0 used in irq, 1 irq enabled + *  bit2-n: state + */  static int exclusive_bit(int new_bit)  {  	int state = new_bit & LOCK_USAGE_STATE_MASK; @@ -1985,45 +1962,160 @@ static int exclusive_bit(int new_bit)  	return state | (dir ^ LOCK_USAGE_DIR_MASK);  } +/* + * Observe that when given a bitmask where each bitnr is encoded as above, a + * right shift of the mask transforms the individual bitnrs as -1 and + * conversely, a left shift transforms into +1 for the individual bitnrs. + * + * So for all bits whose number have LOCK_ENABLED_* set (bitnr1 == 1), we can + * create the mask with those bit numbers using LOCK_USED_IN_* (bitnr1 == 0) + * instead by subtracting the bit number by 2, or shifting the mask right by 2. + * + * Similarly, bitnr1 == 0 becomes bitnr1 == 1 by adding 2, or shifting left 2. + * + * So split the mask (note that LOCKF_ENABLED_IRQ_ALL|LOCKF_USED_IN_IRQ_ALL is + * all bits set) and recompose with bitnr1 flipped. + */ +static unsigned long invert_dir_mask(unsigned long mask) +{ +	unsigned long excl = 0; + +	/* Invert dir */ +	excl |= (mask & LOCKF_ENABLED_IRQ_ALL) >> LOCK_USAGE_DIR_MASK; +	excl |= (mask & LOCKF_USED_IN_IRQ_ALL) << LOCK_USAGE_DIR_MASK; + +	return excl; +} + +/* + * As above, we clear bitnr0 (LOCK_*_READ off) with bitmask ops. First, for all + * bits with bitnr0 set (LOCK_*_READ), add those with bitnr0 cleared (LOCK_*). + * And then mask out all bitnr0. + */ +static unsigned long exclusive_mask(unsigned long mask) +{ +	unsigned long excl = invert_dir_mask(mask); + +	/* Strip read */ +	excl |= (excl & LOCKF_IRQ_READ) >> LOCK_USAGE_READ_MASK; +	excl &= ~LOCKF_IRQ_READ; + +	return excl; +} + +/* + * Retrieve the _possible_ original mask to which @mask is + * exclusive. Ie: this is the opposite of exclusive_mask(). + * Note that 2 possible original bits can match an exclusive + * bit: one has LOCK_USAGE_READ_MASK set, the other has it + * cleared. So both are returned for each exclusive bit. + */ +static unsigned long original_mask(unsigned long mask) +{ +	unsigned long excl = invert_dir_mask(mask); + +	/* Include read in existing usages */ +	excl |= (excl & LOCKF_IRQ) << LOCK_USAGE_READ_MASK; + +	return excl; +} + +/* + * Find the first pair of bit match between an original + * usage mask and an exclusive usage mask. 
+ */ +static int find_exclusive_match(unsigned long mask, +				unsigned long excl_mask, +				enum lock_usage_bit *bitp, +				enum lock_usage_bit *excl_bitp) +{ +	int bit, excl; + +	for_each_set_bit(bit, &mask, LOCK_USED) { +		excl = exclusive_bit(bit); +		if (excl_mask & lock_flag(excl)) { +			*bitp = bit; +			*excl_bitp = excl; +			return 0; +		} +	} +	return -1; +} + +/* + * Prove that the new dependency does not connect a hardirq-safe(-read) + * lock with a hardirq-unsafe lock - to achieve this we search + * the backwards-subgraph starting at <prev>, and the + * forwards-subgraph starting at <next>: + */  static int check_irq_usage(struct task_struct *curr, struct held_lock *prev, -			   struct held_lock *next, enum lock_usage_bit bit) +			   struct held_lock *next)  { +	unsigned long usage_mask = 0, forward_mask, backward_mask; +	enum lock_usage_bit forward_bit = 0, backward_bit = 0; +	struct lock_list *uninitialized_var(target_entry1); +	struct lock_list *uninitialized_var(target_entry); +	struct lock_list this, that; +	int ret; +  	/* -	 * Prove that the new dependency does not connect a hardirq-safe -	 * lock with a hardirq-unsafe lock - to achieve this we search -	 * the backwards-subgraph starting at <prev>, and the -	 * forwards-subgraph starting at <next>: +	 * Step 1: gather all hard/soft IRQs usages backward in an +	 * accumulated usage mask.  	 */ -	if (!check_usage(curr, prev, next, bit, -			   exclusive_bit(bit), state_name(bit))) -		return 0; +	this.parent = NULL; +	this.class = hlock_class(prev); + +	ret = __bfs_backwards(&this, &usage_mask, usage_accumulate, NULL); +	if (ret < 0) +		return print_bfs_bug(ret); -	bit++; /* _READ */ +	usage_mask &= LOCKF_USED_IN_IRQ_ALL; +	if (!usage_mask) +		return 1;  	/* -	 * Prove that the new dependency does not connect a hardirq-safe-read -	 * lock with a hardirq-unsafe lock - to achieve this we search -	 * the backwards-subgraph starting at <prev>, and the -	 * forwards-subgraph starting at <next>: +	 * Step 2: find exclusive uses forward that match the previous +	 * backward accumulated mask.  	 */ -	if (!check_usage(curr, prev, next, bit, -			   exclusive_bit(bit), state_name(bit))) -		return 0; +	forward_mask = exclusive_mask(usage_mask); -	return 1; -} +	that.parent = NULL; +	that.class = hlock_class(next); -static int -check_prev_add_irq(struct task_struct *curr, struct held_lock *prev, -		struct held_lock *next) -{ -#define LOCKDEP_STATE(__STATE)						\ -	if (!check_irq_usage(curr, prev, next, LOCK_USED_IN_##__STATE))	\ -		return 0; -#include "lockdep_states.h" -#undef LOCKDEP_STATE +	ret = find_usage_forwards(&that, forward_mask, &target_entry1); +	if (ret < 0) +		return print_bfs_bug(ret); +	if (ret == 1) +		return ret; -	return 1; +	/* +	 * Step 3: we found a bad match! Now retrieve a lock from the backward +	 * list whose usage mask matches the exclusive usage mask from the +	 * lock found on the forward list. +	 */ +	backward_mask = original_mask(target_entry1->class->usage_mask); + +	ret = find_usage_backwards(&this, backward_mask, &target_entry); +	if (ret < 0) +		return print_bfs_bug(ret); +	if (DEBUG_LOCKS_WARN_ON(ret == 1)) +		return 1; + +	/* +	 * Step 4: narrow down to a pair of incompatible usage bits +	 * and report it. 
+	 */ +	ret = find_exclusive_match(target_entry->class->usage_mask, +				   target_entry1->class->usage_mask, +				   &backward_bit, &forward_bit); +	if (DEBUG_LOCKS_WARN_ON(ret == -1)) +		return 1; + +	return print_bad_irq_dependency(curr, &this, &that, +			target_entry, target_entry1, +			prev, next, +			backward_bit, forward_bit, +			state_name(backward_bit));  }  static void inc_chains(void) @@ -2040,9 +2132,8 @@ static void inc_chains(void)  #else -static inline int -check_prev_add_irq(struct task_struct *curr, struct held_lock *prev, -		struct held_lock *next) +static inline int check_irq_usage(struct task_struct *curr, +				  struct held_lock *prev, struct held_lock *next)  {  	return 1;  } @@ -2170,8 +2261,7 @@ check_deadlock(struct task_struct *curr, struct held_lock *next,   */  static int  check_prev_add(struct task_struct *curr, struct held_lock *prev, -	       struct held_lock *next, int distance, struct stack_trace *trace, -	       int (*save)(struct stack_trace *trace)) +	       struct held_lock *next, int distance, struct lock_trace *trace)  {  	struct lock_list *uninitialized_var(target_entry);  	struct lock_list *entry; @@ -2209,20 +2299,20 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,  	this.parent = NULL;  	ret = check_noncircular(&this, hlock_class(prev), &target_entry);  	if (unlikely(!ret)) { -		if (!trace->entries) { +		if (!trace->nr_entries) {  			/* -			 * If @save fails here, the printing might trigger -			 * a WARN but because of the !nr_entries it should -			 * not do bad things. +			 * If save_trace fails here, the printing might +			 * trigger a WARN but because of the !nr_entries it +			 * should not do bad things.  			 */ -			save(trace); +			save_trace(trace);  		} -		return print_circular_bug(&this, target_entry, next, prev, trace); +		return print_circular_bug(&this, target_entry, next, prev);  	}  	else if (unlikely(ret < 0))  		return print_bfs_bug(ret); -	if (!check_prev_add_irq(curr, prev, next)) +	if (!check_irq_usage(curr, prev, next))  		return 0;  	/* @@ -2265,7 +2355,7 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,  		return print_bfs_bug(ret); -	if (!trace->entries && !save(trace)) +	if (!trace->nr_entries && !save_trace(trace))  		return 0;  	/* @@ -2297,14 +2387,9 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,  static int  check_prevs_add(struct task_struct *curr, struct held_lock *next)  { +	struct lock_trace trace = { .nr_entries = 0 };  	int depth = curr->lockdep_depth;  	struct held_lock *hlock; -	struct stack_trace trace = { -		.nr_entries = 0, -		.max_entries = 0, -		.entries = NULL, -		.skip = 0, -	};  	/*  	 * Debugging checks. 
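
[Editorial aside — illustration only, not part of the patch above.] The invert_dir_mask()/exclusive_mask() trick in the preceding hunks relies on the usage-bit encoding documented there: bit0 selects the _READ variant, bit1 selects USED_IN vs ENABLED, and the remaining bits select the IRQ state, so shifting an entire usage mask by LOCK_USAGE_DIR_MASK flips direction for every bit at once, and shifting by LOCK_USAGE_READ_MASK folds the _READ variants away. The standalone C sketch below demonstrates that arithmetic with hand-picked toy bit numbers and hard-coded shifts of 2 and 1; the real LOCKF_* masks are generated from lockdep_states.h, and none of their actual values are reproduced here.

#include <stdio.h>

/* Toy usage-bit numbers following the encoding in the patch comment:
 * bit0 = _READ, bit1 = USED_IN (0) vs ENABLED (1), bit2+ = irq state.
 * These are NOT the kernel's real enum lock_usage_bit values.
 */
enum {
	USED_IN_HARDIRQ		= 0,
	USED_IN_HARDIRQ_READ	= 1,
	ENABLED_HARDIRQ		= 2,
	ENABLED_HARDIRQ_READ	= 3,
	USED_IN_SOFTIRQ		= 4,
	USED_IN_SOFTIRQ_READ	= 5,
	ENABLED_SOFTIRQ		= 6,
	ENABLED_SOFTIRQ_READ	= 7,
};

#define F(bit)		(1UL << (bit))
#define DIR_SHIFT	2	/* +/-2 on a bit number flips USED_IN <-> ENABLED */
#define READ_SHIFT	1	/* -1 on a bit number drops the _READ variant */

#define USED_IN_IRQ_ALL	(F(USED_IN_HARDIRQ) | F(USED_IN_HARDIRQ_READ) | \
			 F(USED_IN_SOFTIRQ) | F(USED_IN_SOFTIRQ_READ))
#define ENABLED_IRQ_ALL	(F(ENABLED_HARDIRQ) | F(ENABLED_HARDIRQ_READ) | \
			 F(ENABLED_SOFTIRQ) | F(ENABLED_SOFTIRQ_READ))
#define IRQ_READ	(F(USED_IN_HARDIRQ_READ) | F(ENABLED_HARDIRQ_READ) | \
			 F(USED_IN_SOFTIRQ_READ) | F(ENABLED_SOFTIRQ_READ))

/* Flip USED_IN <-> ENABLED for every usage bit present in @mask. */
static unsigned long toy_invert_dir_mask(unsigned long mask)
{
	unsigned long excl = 0;

	excl |= (mask & ENABLED_IRQ_ALL) >> DIR_SHIFT;
	excl |= (mask & USED_IN_IRQ_ALL) << DIR_SHIFT;

	return excl;
}

/* Direction-flipped mask with the _READ variants folded into the plain bits. */
static unsigned long toy_exclusive_mask(unsigned long mask)
{
	unsigned long excl = toy_invert_dir_mask(mask);

	excl |= (excl & IRQ_READ) >> READ_SHIFT;
	excl &= ~IRQ_READ;

	return excl;
}

int main(void)
{
	/* A lock seen taken in hardirq context and read-acquired in softirq. */
	unsigned long usage = F(USED_IN_HARDIRQ) | F(USED_IN_SOFTIRQ_READ);
	unsigned long excl  = toy_exclusive_mask(usage);

	printf("usage mask     = 0x%02lx\n", usage);	/* 0x21 */
	printf("exclusive mask = 0x%02lx\n", excl);	/* 0x44: ENABLED_HARDIRQ | ENABLED_SOFTIRQ */

	return 0;
}

With this toy layout, a backward-accumulated mask of hardirq-safe usages maps onto exactly the ENABLED_* bits that a forward search must not find, which is the shape of step 2 in check_irq_usage() above; only the bit values are invented for the example.
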
@@ -2330,7 +2415,8 @@ check_prevs_add(struct task_struct *curr, struct held_lock *next)  		 * added:  		 */  		if (hlock->read != 2 && hlock->check) { -			int ret = check_prev_add(curr, hlock, next, distance, &trace, save_trace); +			int ret = check_prev_add(curr, hlock, next, distance, +						 &trace);  			if (!ret)  				return 0; @@ -2731,6 +2817,10 @@ static inline int validate_chain(struct task_struct *curr,  {  	return 1;  } + +static void print_lock_trace(struct lock_trace *trace, unsigned int spaces) +{ +}  #endif  /* @@ -2784,6 +2874,12 @@ static void check_chain_key(struct task_struct *curr)  #endif  } +static int mark_lock(struct task_struct *curr, struct held_lock *this, +		     enum lock_usage_bit new_bit); + +#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) + +  static void  print_usage_bug_scenario(struct held_lock *lock)  { @@ -2827,7 +2923,7 @@ print_usage_bug(struct task_struct *curr, struct held_lock *this,  	print_lock(this);  	pr_warn("{%s} state was registered at:\n", usage_str[prev_bit]); -	print_stack_trace(hlock_class(this)->usage_traces + prev_bit, 1); +	print_lock_trace(hlock_class(this)->usage_traces + prev_bit, 1);  	print_irqtrace_events(curr);  	pr_warn("\nother info that might help us debug this:\n"); @@ -2853,10 +2949,6 @@ valid_state(struct task_struct *curr, struct held_lock *this,  	return 1;  } -static int mark_lock(struct task_struct *curr, struct held_lock *this, -		     enum lock_usage_bit new_bit); - -#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING)  /*   * print irq inversion bug: @@ -2936,7 +3028,7 @@ check_usage_forwards(struct task_struct *curr, struct held_lock *this,  	root.parent = NULL;  	root.class = hlock_class(this); -	ret = find_usage_forwards(&root, bit, &target_entry); +	ret = find_usage_forwards(&root, lock_flag(bit), &target_entry);  	if (ret < 0)  		return print_bfs_bug(ret);  	if (ret == 1) @@ -2960,7 +3052,7 @@ check_usage_backwards(struct task_struct *curr, struct held_lock *this,  	root.parent = NULL;  	root.class = hlock_class(this); -	ret = find_usage_backwards(&root, bit, &target_entry); +	ret = find_usage_backwards(&root, lock_flag(bit), &target_entry);  	if (ret < 0)  		return print_bfs_bug(ret);  	if (ret == 1) @@ -3015,7 +3107,7 @@ static int (*state_verbose_f[])(struct lock_class *class) = {  static inline int state_verbose(enum lock_usage_bit bit,  				struct lock_class *class)  { -	return state_verbose_f[bit >> 2](class); +	return state_verbose_f[bit >> LOCK_USAGE_DIR_MASK](class);  }  typedef int (*check_usage_f)(struct task_struct *, struct held_lock *, @@ -3157,7 +3249,7 @@ void lockdep_hardirqs_on(unsigned long ip)  	/*  	 * See the fine text that goes along with this variable definition.  	 
*/ -	if (DEBUG_LOCKS_WARN_ON(unlikely(early_boot_irqs_disabled))) +	if (DEBUG_LOCKS_WARN_ON(early_boot_irqs_disabled))  		return;  	/* @@ -4689,8 +4781,8 @@ static void free_zapped_rcu(struct rcu_head *ch)  		return;  	raw_local_irq_save(flags); -	if (!graph_lock()) -		goto out_irq; +	arch_spin_lock(&lockdep_lock); +	current->lockdep_recursion = 1;  	/* closed head */  	pf = delayed_free.pf + (delayed_free.index ^ 1); @@ -4702,8 +4794,8 @@ static void free_zapped_rcu(struct rcu_head *ch)  	 */  	call_rcu_zapped(delayed_free.pf + delayed_free.index); -	graph_unlock(); -out_irq: +	current->lockdep_recursion = 0; +	arch_spin_unlock(&lockdep_lock);  	raw_local_irq_restore(flags);  } @@ -4744,21 +4836,17 @@ static void lockdep_free_key_range_reg(void *start, unsigned long size)  {  	struct pending_free *pf;  	unsigned long flags; -	int locked;  	init_data_structures_once();  	raw_local_irq_save(flags); -	locked = graph_lock(); -	if (!locked) -		goto out_irq; - +	arch_spin_lock(&lockdep_lock); +	current->lockdep_recursion = 1;  	pf = get_pending_free();  	__lockdep_free_key_range(pf, start, size);  	call_rcu_zapped(pf); - -	graph_unlock(); -out_irq: +	current->lockdep_recursion = 0; +	arch_spin_unlock(&lockdep_lock);  	raw_local_irq_restore(flags);  	/* diff --git a/kernel/locking/lockdep_internals.h b/kernel/locking/lockdep_internals.h index d4c197425f68..150ec3f0c5b5 100644 --- a/kernel/locking/lockdep_internals.h +++ b/kernel/locking/lockdep_internals.h @@ -42,13 +42,35 @@ enum {  	__LOCKF(USED)  }; -#define LOCKF_ENABLED_IRQ (LOCKF_ENABLED_HARDIRQ | LOCKF_ENABLED_SOFTIRQ) -#define LOCKF_USED_IN_IRQ (LOCKF_USED_IN_HARDIRQ | LOCKF_USED_IN_SOFTIRQ) +#define LOCKDEP_STATE(__STATE)	LOCKF_ENABLED_##__STATE | +static const unsigned long LOCKF_ENABLED_IRQ = +#include "lockdep_states.h" +	0; +#undef LOCKDEP_STATE + +#define LOCKDEP_STATE(__STATE)	LOCKF_USED_IN_##__STATE | +static const unsigned long LOCKF_USED_IN_IRQ = +#include "lockdep_states.h" +	0; +#undef LOCKDEP_STATE + +#define LOCKDEP_STATE(__STATE)	LOCKF_ENABLED_##__STATE##_READ | +static const unsigned long LOCKF_ENABLED_IRQ_READ = +#include "lockdep_states.h" +	0; +#undef LOCKDEP_STATE + +#define LOCKDEP_STATE(__STATE)	LOCKF_USED_IN_##__STATE##_READ | +static const unsigned long LOCKF_USED_IN_IRQ_READ = +#include "lockdep_states.h" +	0; +#undef LOCKDEP_STATE + +#define LOCKF_ENABLED_IRQ_ALL (LOCKF_ENABLED_IRQ | LOCKF_ENABLED_IRQ_READ) +#define LOCKF_USED_IN_IRQ_ALL (LOCKF_USED_IN_IRQ | LOCKF_USED_IN_IRQ_READ) -#define LOCKF_ENABLED_IRQ_READ \ -		(LOCKF_ENABLED_HARDIRQ_READ | LOCKF_ENABLED_SOFTIRQ_READ) -#define LOCKF_USED_IN_IRQ_READ \ -		(LOCKF_USED_IN_HARDIRQ_READ | LOCKF_USED_IN_SOFTIRQ_READ) +#define LOCKF_IRQ (LOCKF_ENABLED_IRQ | LOCKF_USED_IN_IRQ) +#define LOCKF_IRQ_READ (LOCKF_ENABLED_IRQ_READ | LOCKF_USED_IN_IRQ_READ)  /*   * CONFIG_LOCKDEP_SMALL is defined for sparc. 
Sparc requires .text, diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index ad40a2617063..80a463d31a8d 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ -829,7 +829,9 @@ static void lock_torture_cleanup(void)  						"End of test: SUCCESS");  	kfree(cxt.lwsa); +	cxt.lwsa = NULL;  	kfree(cxt.lrsa); +	cxt.lrsa = NULL;  end:  	torture_cleanup_end(); diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index db578783dd36..0c601ae072b3 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only  /*   * kernel/locking/mutex.c   * diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c index 883cf1b92d90..b6a9cc62099a 100644 --- a/kernel/locking/percpu-rwsem.c +++ b/kernel/locking/percpu-rwsem.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only  #include <linux/atomic.h>  #include <linux/rwsem.h>  #include <linux/percpu.h> @@ -7,6 +8,8 @@  #include <linux/sched.h>  #include <linux/errno.h> +#include "rwsem.h" +  int __percpu_init_rwsem(struct percpu_rw_semaphore *sem,  			const char *name, struct lock_class_key *rwsem_key)  { diff --git a/kernel/locking/qrwlock.c b/kernel/locking/qrwlock.c index c7471c3fb798..fe9ca92faa2a 100644 --- a/kernel/locking/qrwlock.c +++ b/kernel/locking/qrwlock.c @@ -1,16 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0-or-later  /*   * Queued read/write locks   * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the - * GNU General Public License for more details. - *   * (C) Copyright 2013-2014 Hewlett-Packard Development Company, L.P.   *   * Authors: Waiman Long <waiman.long@hp.com> diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c index 5e9247dc2515..2473f10c6956 100644 --- a/kernel/locking/qspinlock.c +++ b/kernel/locking/qspinlock.c @@ -1,16 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0-or-later  /*   * Queued spinlock   * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the - * GNU General Public License for more details. - *   * (C) Copyright 2013-2015 Hewlett-Packard Development Company, L.P.   * (C) Copyright 2013-2014,2018 Red Hat, Inc.   * (C) Copyright 2015 Intel Corp. @@ -395,7 +386,7 @@ void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)  	 * 0,1,0 -> 0,0,1  	 */  	clear_pending_set_locked(lock); -	qstat_inc(qstat_lock_pending, true); +	lockevent_inc(lock_pending);  	return;  	/* @@ -403,7 +394,7 @@ void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)  	 * queuing.  	 */  queue: -	qstat_inc(qstat_lock_slowpath, true); +	lockevent_inc(lock_slowpath);  pv_queue:  	node = this_cpu_ptr(&qnodes[0].mcs);  	idx = node->count++; @@ -419,7 +410,7 @@ pv_queue:  	 * simple enough. 
 	 */  	if (unlikely(idx >= MAX_NODES)) { -		qstat_inc(qstat_lock_no_node, true); +		lockevent_inc(lock_no_node);  		while (!queued_spin_trylock(lock))  			cpu_relax();  		goto release; @@ -430,7 +421,7 @@ pv_queue:  	/*  	 * Keep counts of non-zero index values:  	 */ -	qstat_inc(qstat_lock_use_node2 + idx - 1, idx); +	lockevent_cond_inc(lock_use_node2 + idx - 1, idx);  	/*  	 * Ensure that we increment the head node->count before initialising diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h index 8f36c27c1794..89bab079e7a4 100644 --- a/kernel/locking/qspinlock_paravirt.h +++ b/kernel/locking/qspinlock_paravirt.h @@ -89,7 +89,7 @@ static inline bool pv_hybrid_queued_unfair_trylock(struct qspinlock *lock)  		if (!(val & _Q_LOCKED_PENDING_MASK) &&  		   (cmpxchg_acquire(&lock->locked, 0, _Q_LOCKED_VAL) == 0)) { -			qstat_inc(qstat_pv_lock_stealing, true); +			lockevent_inc(pv_lock_stealing);  			return true;  		}  		if (!(val & _Q_TAIL_MASK) || (val & _Q_PENDING_MASK)) @@ -219,7 +219,7 @@ static struct qspinlock **pv_hash(struct qspinlock *lock, struct pv_node *node)  		hopcnt++;  		if (!cmpxchg(&he->lock, NULL, lock)) {  			WRITE_ONCE(he->node, node); -			qstat_hop(hopcnt); +			lockevent_pv_hop(hopcnt);  			return &he->lock;  		}  	} @@ -320,8 +320,8 @@ static void pv_wait_node(struct mcs_spinlock *node, struct mcs_spinlock *prev)  		smp_store_mb(pn->state, vcpu_halted);  		if (!READ_ONCE(node->locked)) { -			qstat_inc(qstat_pv_wait_node, true); -			qstat_inc(qstat_pv_wait_early, wait_early); +			lockevent_inc(pv_wait_node); +			lockevent_cond_inc(pv_wait_early, wait_early);  			pv_wait(&pn->state, vcpu_halted);  		} @@ -339,7 +339,8 @@ static void pv_wait_node(struct mcs_spinlock *node, struct mcs_spinlock *prev)  		 * So it is better to spin for a while in the hope that the  		 * MCS lock will be released soon.  		 */ -		qstat_inc(qstat_pv_spurious_wakeup, !READ_ONCE(node->locked)); +		lockevent_cond_inc(pv_spurious_wakeup, +				  !READ_ONCE(node->locked));  	}  	/* @@ -416,7 +417,7 @@ pv_wait_head_or_lock(struct qspinlock *lock, struct mcs_spinlock *node)  	/*  	 * Tracking # of slowpath locking operations  	 */ -	qstat_inc(qstat_lock_slowpath, true); +	lockevent_inc(lock_slowpath);  	for (;; waitcnt++) {  		/* @@ -464,8 +465,8 @@ pv_wait_head_or_lock(struct qspinlock *lock, struct mcs_spinlock *node)  			}  		}  		WRITE_ONCE(pn->state, vcpu_hashed); -		qstat_inc(qstat_pv_wait_head, true); -		qstat_inc(qstat_pv_wait_again, waitcnt); +		lockevent_inc(pv_wait_head); +		lockevent_cond_inc(pv_wait_again, waitcnt);  		pv_wait(&lock->locked, _Q_SLOW_VAL);  		/* @@ -528,7 +529,7 @@ __pv_queued_spin_unlock_slowpath(struct qspinlock *lock, u8 locked)  	 * vCPU is harmless other than the additional latency in completing  	 * the unlock.  	 */ -	qstat_inc(qstat_pv_kick_unlock, true); +	lockevent_inc(pv_kick_unlock);  	pv_kick(node->cpu);  } diff --git a/kernel/locking/qspinlock_stat.h b/kernel/locking/qspinlock_stat.h index d73f85388d5c..e625bb410aa2 100644 --- a/kernel/locking/qspinlock_stat.h +++ b/kernel/locking/qspinlock_stat.h @@ -1,270 +1,105 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */  /* - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version.   
* - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the - * GNU General Public License for more details. - * - * Authors: Waiman Long <waiman.long@hpe.com> + * Authors: Waiman Long <longman@redhat.com>   */ -/* - * When queued spinlock statistical counters are enabled, the following - * debugfs files will be created for reporting the counter values: - * - * <debugfs>/qlockstat/ - *   pv_hash_hops	- average # of hops per hashing operation - *   pv_kick_unlock	- # of vCPU kicks issued at unlock time - *   pv_kick_wake	- # of vCPU kicks used for computing pv_latency_wake - *   pv_latency_kick	- average latency (ns) of vCPU kick operation - *   pv_latency_wake	- average latency (ns) from vCPU kick to wakeup - *   pv_lock_stealing	- # of lock stealing operations - *   pv_spurious_wakeup	- # of spurious wakeups in non-head vCPUs - *   pv_wait_again	- # of wait's after a queue head vCPU kick - *   pv_wait_early	- # of early vCPU wait's - *   pv_wait_head	- # of vCPU wait's at the queue head - *   pv_wait_node	- # of vCPU wait's at a non-head queue node - *   lock_pending	- # of locking operations via pending code - *   lock_slowpath	- # of locking operations via MCS lock queue - *   lock_use_node2	- # of locking operations that use 2nd per-CPU node - *   lock_use_node3	- # of locking operations that use 3rd per-CPU node - *   lock_use_node4	- # of locking operations that use 4th per-CPU node - *   lock_no_node	- # of locking operations without using per-CPU node - * - * Subtracting lock_use_node[234] from lock_slowpath will give you - * lock_use_node1. - * - * Writing to the "reset_counters" file will reset all the above counter - * values. - * - * These statistical counters are implemented as per-cpu variables which are - * summed and computed whenever the corresponding debugfs files are read. This - * minimizes added overhead making the counters usable even in a production - * environment. - * - * There may be slight difference between pv_kick_wake and pv_kick_unlock. 
- */ -enum qlock_stats { -	qstat_pv_hash_hops, -	qstat_pv_kick_unlock, -	qstat_pv_kick_wake, -	qstat_pv_latency_kick, -	qstat_pv_latency_wake, -	qstat_pv_lock_stealing, -	qstat_pv_spurious_wakeup, -	qstat_pv_wait_again, -	qstat_pv_wait_early, -	qstat_pv_wait_head, -	qstat_pv_wait_node, -	qstat_lock_pending, -	qstat_lock_slowpath, -	qstat_lock_use_node2, -	qstat_lock_use_node3, -	qstat_lock_use_node4, -	qstat_lock_no_node, -	qstat_num,	/* Total number of statistical counters */ -	qstat_reset_cnts = qstat_num, -}; +#include "lock_events.h" -#ifdef CONFIG_QUEUED_LOCK_STAT +#ifdef CONFIG_LOCK_EVENT_COUNTS +#ifdef CONFIG_PARAVIRT_SPINLOCKS  /* - * Collect pvqspinlock statistics + * Collect pvqspinlock locking event counts   */ -#include <linux/debugfs.h>  #include <linux/sched.h>  #include <linux/sched/clock.h>  #include <linux/fs.h> -static const char * const qstat_names[qstat_num + 1] = { -	[qstat_pv_hash_hops]	   = "pv_hash_hops", -	[qstat_pv_kick_unlock]     = "pv_kick_unlock", -	[qstat_pv_kick_wake]       = "pv_kick_wake", -	[qstat_pv_spurious_wakeup] = "pv_spurious_wakeup", -	[qstat_pv_latency_kick]	   = "pv_latency_kick", -	[qstat_pv_latency_wake]    = "pv_latency_wake", -	[qstat_pv_lock_stealing]   = "pv_lock_stealing", -	[qstat_pv_wait_again]      = "pv_wait_again", -	[qstat_pv_wait_early]      = "pv_wait_early", -	[qstat_pv_wait_head]       = "pv_wait_head", -	[qstat_pv_wait_node]       = "pv_wait_node", -	[qstat_lock_pending]       = "lock_pending", -	[qstat_lock_slowpath]      = "lock_slowpath", -	[qstat_lock_use_node2]	   = "lock_use_node2", -	[qstat_lock_use_node3]	   = "lock_use_node3", -	[qstat_lock_use_node4]	   = "lock_use_node4", -	[qstat_lock_no_node]	   = "lock_no_node", -	[qstat_reset_cnts]         = "reset_counters", -}; +#define EVENT_COUNT(ev)	lockevents[LOCKEVENT_ ## ev]  /* - * Per-cpu counters + * PV specific per-cpu counter   */ -static DEFINE_PER_CPU(unsigned long, qstats[qstat_num]);  static DEFINE_PER_CPU(u64, pv_kick_time);  /* - * Function to read and return the qlock statistical counter values + * Function to read and return the PV qspinlock counts.   *   * The following counters are handled specially: - * 1. qstat_pv_latency_kick + * 1. pv_latency_kick   *    Average kick latency (ns) = pv_latency_kick/pv_kick_unlock - * 2. qstat_pv_latency_wake + * 2. pv_latency_wake   *    Average wake latency (ns) = pv_latency_wake/pv_kick_wake - * 3. qstat_pv_hash_hops + * 3. 
pv_hash_hops   *    Average hops/hash = pv_hash_hops/pv_kick_unlock   */ -static ssize_t qstat_read(struct file *file, char __user *user_buf, -			  size_t count, loff_t *ppos) +ssize_t lockevent_read(struct file *file, char __user *user_buf, +		       size_t count, loff_t *ppos)  {  	char buf[64]; -	int cpu, counter, len; -	u64 stat = 0, kicks = 0; +	int cpu, id, len; +	u64 sum = 0, kicks = 0;  	/*  	 * Get the counter ID stored in file->f_inode->i_private  	 */ -	counter = (long)file_inode(file)->i_private; +	id = (long)file_inode(file)->i_private; -	if (counter >= qstat_num) +	if (id >= lockevent_num)  		return -EBADF;  	for_each_possible_cpu(cpu) { -		stat += per_cpu(qstats[counter], cpu); +		sum += per_cpu(lockevents[id], cpu);  		/* -		 * Need to sum additional counter for some of them +		 * Need to sum additional counters for some of them  		 */ -		switch (counter) { +		switch (id) { -		case qstat_pv_latency_kick: -		case qstat_pv_hash_hops: -			kicks += per_cpu(qstats[qstat_pv_kick_unlock], cpu); +		case LOCKEVENT_pv_latency_kick: +		case LOCKEVENT_pv_hash_hops: +			kicks += per_cpu(EVENT_COUNT(pv_kick_unlock), cpu);  			break; -		case qstat_pv_latency_wake: -			kicks += per_cpu(qstats[qstat_pv_kick_wake], cpu); +		case LOCKEVENT_pv_latency_wake: +			kicks += per_cpu(EVENT_COUNT(pv_kick_wake), cpu);  			break;  		}  	} -	if (counter == qstat_pv_hash_hops) { +	if (id == LOCKEVENT_pv_hash_hops) {  		u64 frac = 0;  		if (kicks) { -			frac = 100ULL * do_div(stat, kicks); +			frac = 100ULL * do_div(sum, kicks);  			frac = DIV_ROUND_CLOSEST_ULL(frac, kicks);  		}  		/*  		 * Return a X.XX decimal number  		 */ -		len = snprintf(buf, sizeof(buf) - 1, "%llu.%02llu\n", stat, frac); +		len = snprintf(buf, sizeof(buf) - 1, "%llu.%02llu\n", +			       sum, frac);  	} else {  		/*  		 * Round to the nearest ns  		 */ -		if ((counter == qstat_pv_latency_kick) || -		    (counter == qstat_pv_latency_wake)) { +		if ((id == LOCKEVENT_pv_latency_kick) || +		    (id == LOCKEVENT_pv_latency_wake)) {  			if (kicks) -				stat = DIV_ROUND_CLOSEST_ULL(stat, kicks); +				sum = DIV_ROUND_CLOSEST_ULL(sum, kicks);  		} -		len = snprintf(buf, sizeof(buf) - 1, "%llu\n", stat); +		len = snprintf(buf, sizeof(buf) - 1, "%llu\n", sum);  	}  	return simple_read_from_buffer(user_buf, count, ppos, buf, len);  }  /* - * Function to handle write request - * - * When counter = reset_cnts, reset all the counter values. - * Since the counter updates aren't atomic, the resetting is done twice - * to make sure that the counters are very likely to be all cleared. 
- */ -static ssize_t qstat_write(struct file *file, const char __user *user_buf, -			   size_t count, loff_t *ppos) -{ -	int cpu; - -	/* -	 * Get the counter ID stored in file->f_inode->i_private -	 */ -	if ((long)file_inode(file)->i_private != qstat_reset_cnts) -		return count; - -	for_each_possible_cpu(cpu) { -		int i; -		unsigned long *ptr = per_cpu_ptr(qstats, cpu); - -		for (i = 0 ; i < qstat_num; i++) -			WRITE_ONCE(ptr[i], 0); -	} -	return count; -} - -/* - * Debugfs data structures - */ -static const struct file_operations fops_qstat = { -	.read = qstat_read, -	.write = qstat_write, -	.llseek = default_llseek, -}; - -/* - * Initialize debugfs for the qspinlock statistical counters - */ -static int __init init_qspinlock_stat(void) -{ -	struct dentry *d_qstat = debugfs_create_dir("qlockstat", NULL); -	int i; - -	if (!d_qstat) -		goto out; - -	/* -	 * Create the debugfs files -	 * -	 * As reading from and writing to the stat files can be slow, only -	 * root is allowed to do the read/write to limit impact to system -	 * performance. -	 */ -	for (i = 0; i < qstat_num; i++) -		if (!debugfs_create_file(qstat_names[i], 0400, d_qstat, -					 (void *)(long)i, &fops_qstat)) -			goto fail_undo; - -	if (!debugfs_create_file(qstat_names[qstat_reset_cnts], 0200, d_qstat, -				 (void *)(long)qstat_reset_cnts, &fops_qstat)) -		goto fail_undo; - -	return 0; -fail_undo: -	debugfs_remove_recursive(d_qstat); -out: -	pr_warn("Could not create 'qlockstat' debugfs entries\n"); -	return -ENOMEM; -} -fs_initcall(init_qspinlock_stat); - -/* - * Increment the PV qspinlock statistical counters - */ -static inline void qstat_inc(enum qlock_stats stat, bool cond) -{ -	if (cond) -		this_cpu_inc(qstats[stat]); -} - -/*   * PV hash hop count   */ -static inline void qstat_hop(int hopcnt) +static inline void lockevent_pv_hop(int hopcnt)  { -	this_cpu_add(qstats[qstat_pv_hash_hops], hopcnt); +	this_cpu_add(EVENT_COUNT(pv_hash_hops), hopcnt);  }  /* @@ -276,7 +111,7 @@ static inline void __pv_kick(int cpu)  	per_cpu(pv_kick_time, cpu) = start;  	pv_kick(cpu); -	this_cpu_add(qstats[qstat_pv_latency_kick], sched_clock() - start); +	this_cpu_add(EVENT_COUNT(pv_latency_kick), sched_clock() - start);  }  /* @@ -289,18 +124,19 @@ static inline void __pv_wait(u8 *ptr, u8 val)  	*pkick_time = 0;  	pv_wait(ptr, val);  	if (*pkick_time) { -		this_cpu_add(qstats[qstat_pv_latency_wake], +		this_cpu_add(EVENT_COUNT(pv_latency_wake),  			     sched_clock() - *pkick_time); -		qstat_inc(qstat_pv_kick_wake, true); +		lockevent_inc(pv_kick_wake);  	}  }  #define pv_kick(c)	__pv_kick(c)  #define pv_wait(p, v)	__pv_wait(p, v) -#else /* CONFIG_QUEUED_LOCK_STAT */ +#endif /* CONFIG_PARAVIRT_SPINLOCKS */ + +#else /* CONFIG_LOCK_EVENT_COUNTS */ -static inline void qstat_inc(enum qlock_stats stat, bool cond)	{ } -static inline void qstat_hop(int hopcnt)			{ } +static inline void lockevent_pv_hop(int hopcnt)	{ } -#endif /* CONFIG_QUEUED_LOCK_STAT */ +#endif /* CONFIG_LOCK_EVENT_COUNTS */ diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 978d63a8261c..38fbf9fa7f1b 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only  /*   * RT-Mutexes: simple blocking mutual exclusion locks with PI support   * diff --git a/kernel/locking/rwsem-spinlock.c b/kernel/locking/rwsem-spinlock.c deleted file mode 100644 index a7ffb2a96ede..000000000000 --- a/kernel/locking/rwsem-spinlock.c +++ /dev/null @@ -1,339 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* 
rwsem-spinlock.c: R/W semaphores: contention handling functions for - * generic spinlock implementation - * - * Copyright (c) 2001   David Howells (dhowells@redhat.com). - * - Derived partially from idea by Andrea Arcangeli <andrea@suse.de> - * - Derived also from comments by Linus - */ -#include <linux/rwsem.h> -#include <linux/sched/signal.h> -#include <linux/sched/debug.h> -#include <linux/export.h> - -enum rwsem_waiter_type { -	RWSEM_WAITING_FOR_WRITE, -	RWSEM_WAITING_FOR_READ -}; - -struct rwsem_waiter { -	struct list_head list; -	struct task_struct *task; -	enum rwsem_waiter_type type; -}; - -int rwsem_is_locked(struct rw_semaphore *sem) -{ -	int ret = 1; -	unsigned long flags; - -	if (raw_spin_trylock_irqsave(&sem->wait_lock, flags)) { -		ret = (sem->count != 0); -		raw_spin_unlock_irqrestore(&sem->wait_lock, flags); -	} -	return ret; -} -EXPORT_SYMBOL(rwsem_is_locked); - -/* - * initialise the semaphore - */ -void __init_rwsem(struct rw_semaphore *sem, const char *name, -		  struct lock_class_key *key) -{ -#ifdef CONFIG_DEBUG_LOCK_ALLOC -	/* -	 * Make sure we are not reinitializing a held semaphore: -	 */ -	debug_check_no_locks_freed((void *)sem, sizeof(*sem)); -	lockdep_init_map(&sem->dep_map, name, key, 0); -#endif -	sem->count = 0; -	raw_spin_lock_init(&sem->wait_lock); -	INIT_LIST_HEAD(&sem->wait_list); -} -EXPORT_SYMBOL(__init_rwsem); - -/* - * handle the lock release when processes blocked on it that can now run - * - if we come here, then: - *   - the 'active count' _reached_ zero - *   - the 'waiting count' is non-zero - * - the spinlock must be held by the caller - * - woken process blocks are discarded from the list after having task zeroed - * - writers are only woken if wakewrite is non-zero - */ -static inline struct rw_semaphore * -__rwsem_do_wake(struct rw_semaphore *sem, int wakewrite) -{ -	struct rwsem_waiter *waiter; -	struct task_struct *tsk; -	int woken; - -	waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list); - -	if (waiter->type == RWSEM_WAITING_FOR_WRITE) { -		if (wakewrite) -			/* Wake up a writer. Note that we do not grant it the -			 * lock - it will have to acquire it when it runs. */ -			wake_up_process(waiter->task); -		goto out; -	} - -	/* grant an infinite number of read locks to the front of the queue */ -	woken = 0; -	do { -		struct list_head *next = waiter->list.next; - -		list_del(&waiter->list); -		tsk = waiter->task; -		/* -		 * Make sure we do not wakeup the next reader before -		 * setting the nil condition to grant the next reader; -		 * otherwise we could miss the wakeup on the other -		 * side and end up sleeping again. See the pairing -		 * in rwsem_down_read_failed(). 
-		 */ -		smp_mb(); -		waiter->task = NULL; -		wake_up_process(tsk); -		put_task_struct(tsk); -		woken++; -		if (next == &sem->wait_list) -			break; -		waiter = list_entry(next, struct rwsem_waiter, list); -	} while (waiter->type != RWSEM_WAITING_FOR_WRITE); - -	sem->count += woken; - - out: -	return sem; -} - -/* - * wake a single writer - */ -static inline struct rw_semaphore * -__rwsem_wake_one_writer(struct rw_semaphore *sem) -{ -	struct rwsem_waiter *waiter; - -	waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list); -	wake_up_process(waiter->task); - -	return sem; -} - -/* - * get a read lock on the semaphore - */ -int __sched __down_read_common(struct rw_semaphore *sem, int state) -{ -	struct rwsem_waiter waiter; -	unsigned long flags; - -	raw_spin_lock_irqsave(&sem->wait_lock, flags); - -	if (sem->count >= 0 && list_empty(&sem->wait_list)) { -		/* granted */ -		sem->count++; -		raw_spin_unlock_irqrestore(&sem->wait_lock, flags); -		goto out; -	} - -	/* set up my own style of waitqueue */ -	waiter.task = current; -	waiter.type = RWSEM_WAITING_FOR_READ; -	get_task_struct(current); - -	list_add_tail(&waiter.list, &sem->wait_list); - -	/* wait to be given the lock */ -	for (;;) { -		if (!waiter.task) -			break; -		if (signal_pending_state(state, current)) -			goto out_nolock; -		set_current_state(state); -		raw_spin_unlock_irqrestore(&sem->wait_lock, flags); -		schedule(); -		raw_spin_lock_irqsave(&sem->wait_lock, flags); -	} - -	raw_spin_unlock_irqrestore(&sem->wait_lock, flags); - out: -	return 0; - -out_nolock: -	/* -	 * We didn't take the lock, so that there is a writer, which -	 * is owner or the first waiter of the sem. If it's a waiter, -	 * it will be woken by current owner. Not need to wake anybody. -	 */ -	list_del(&waiter.list); -	raw_spin_unlock_irqrestore(&sem->wait_lock, flags); -	return -EINTR; -} - -void __sched __down_read(struct rw_semaphore *sem) -{ -	__down_read_common(sem, TASK_UNINTERRUPTIBLE); -} - -int __sched __down_read_killable(struct rw_semaphore *sem) -{ -	return __down_read_common(sem, TASK_KILLABLE); -} - -/* - * trylock for reading -- returns 1 if successful, 0 if contention - */ -int __down_read_trylock(struct rw_semaphore *sem) -{ -	unsigned long flags; -	int ret = 0; - - -	raw_spin_lock_irqsave(&sem->wait_lock, flags); - -	if (sem->count >= 0 && list_empty(&sem->wait_list)) { -		/* granted */ -		sem->count++; -		ret = 1; -	} - -	raw_spin_unlock_irqrestore(&sem->wait_lock, flags); - -	return ret; -} - -/* - * get a write lock on the semaphore - */ -int __sched __down_write_common(struct rw_semaphore *sem, int state) -{ -	struct rwsem_waiter waiter; -	unsigned long flags; -	int ret = 0; - -	raw_spin_lock_irqsave(&sem->wait_lock, flags); - -	/* set up my own style of waitqueue */ -	waiter.task = current; -	waiter.type = RWSEM_WAITING_FOR_WRITE; -	list_add_tail(&waiter.list, &sem->wait_list); - -	/* wait for someone to release the lock */ -	for (;;) { -		/* -		 * That is the key to support write lock stealing: allows the -		 * task already on CPU to get the lock soon rather than put -		 * itself into sleep and waiting for system woke it or someone -		 * else in the head of the wait list up. 
-		 */ -		if (sem->count == 0) -			break; -		if (signal_pending_state(state, current)) -			goto out_nolock; - -		set_current_state(state); -		raw_spin_unlock_irqrestore(&sem->wait_lock, flags); -		schedule(); -		raw_spin_lock_irqsave(&sem->wait_lock, flags); -	} -	/* got the lock */ -	sem->count = -1; -	list_del(&waiter.list); - -	raw_spin_unlock_irqrestore(&sem->wait_lock, flags); - -	return ret; - -out_nolock: -	list_del(&waiter.list); -	if (!list_empty(&sem->wait_list) && sem->count >= 0) -		__rwsem_do_wake(sem, 0); -	raw_spin_unlock_irqrestore(&sem->wait_lock, flags); - -	return -EINTR; -} - -void __sched __down_write(struct rw_semaphore *sem) -{ -	__down_write_common(sem, TASK_UNINTERRUPTIBLE); -} - -int __sched __down_write_killable(struct rw_semaphore *sem) -{ -	return __down_write_common(sem, TASK_KILLABLE); -} - -/* - * trylock for writing -- returns 1 if successful, 0 if contention - */ -int __down_write_trylock(struct rw_semaphore *sem) -{ -	unsigned long flags; -	int ret = 0; - -	raw_spin_lock_irqsave(&sem->wait_lock, flags); - -	if (sem->count == 0) { -		/* got the lock */ -		sem->count = -1; -		ret = 1; -	} - -	raw_spin_unlock_irqrestore(&sem->wait_lock, flags); - -	return ret; -} - -/* - * release a read lock on the semaphore - */ -void __up_read(struct rw_semaphore *sem) -{ -	unsigned long flags; - -	raw_spin_lock_irqsave(&sem->wait_lock, flags); - -	if (--sem->count == 0 && !list_empty(&sem->wait_list)) -		sem = __rwsem_wake_one_writer(sem); - -	raw_spin_unlock_irqrestore(&sem->wait_lock, flags); -} - -/* - * release a write lock on the semaphore - */ -void __up_write(struct rw_semaphore *sem) -{ -	unsigned long flags; - -	raw_spin_lock_irqsave(&sem->wait_lock, flags); - -	sem->count = 0; -	if (!list_empty(&sem->wait_list)) -		sem = __rwsem_do_wake(sem, 1); - -	raw_spin_unlock_irqrestore(&sem->wait_lock, flags); -} - -/* - * downgrade a write lock into a read lock - * - just wake up any readers at the front of the queue - */ -void __downgrade_write(struct rw_semaphore *sem) -{ -	unsigned long flags; - -	raw_spin_lock_irqsave(&sem->wait_lock, flags); - -	sem->count = 1; -	if (!list_empty(&sem->wait_list)) -		sem = __rwsem_do_wake(sem, 0); - -	raw_spin_unlock_irqrestore(&sem->wait_lock, flags); -} - diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c index fbe96341beee..0b1f77957240 100644 --- a/kernel/locking/rwsem-xadd.c +++ b/kernel/locking/rwsem-xadd.c @@ -130,6 +130,7 @@ static void __rwsem_mark_wake(struct rw_semaphore *sem,  {  	struct rwsem_waiter *waiter, *tmp;  	long oldcount, woken = 0, adjustment = 0; +	struct list_head wlist;  	/*  	 * Take a peek at the queue head waiter such that we can determine @@ -147,6 +148,7 @@ static void __rwsem_mark_wake(struct rw_semaphore *sem,  			 * will notice the queued writer.  			 */  			wake_q_add(wake_q, waiter->task); +			lockevent_inc(rwsem_wake_writer);  		}  		return; @@ -176,9 +178,8 @@ static void __rwsem_mark_wake(struct rw_semaphore *sem,  			goto try_reader_grant;  		}  		/* -		 * It is not really necessary to set it to reader-owned here, -		 * but it gives the spinners an early indication that the -		 * readers now have the lock. +		 * Set it to reader-owned to give spinners an early +		 * indication that readers now have the lock.  		 */  		__rwsem_set_reader_owned(sem, waiter->task);  	} @@ -188,33 +189,28 @@ static void __rwsem_mark_wake(struct rw_semaphore *sem,  	 * of the queue. We know that woken will be at least 1 as we accounted  	 * for above. 
Note we increment the 'active part' of the count by the  	 * number of readers before waking any processes up. +	 * +	 * We have to do wakeup in 2 passes to prevent the possibility that +	 * the reader count may be decremented before it is incremented. It +	 * is because the to-be-woken waiter may not have slept yet. So it +	 * may see waiter->task got cleared, finish its critical section and +	 * do an unlock before the reader count increment. +	 * +	 * 1) Collect the read-waiters in a separate list, count them and +	 *    fully increment the reader count in rwsem. +	 * 2) For each waiters in the new list, clear waiter->task and +	 *    put them into wake_q to be woken up later.  	 */ -	list_for_each_entry_safe(waiter, tmp, &sem->wait_list, list) { -		struct task_struct *tsk; - +	list_for_each_entry(waiter, &sem->wait_list, list) {  		if (waiter->type == RWSEM_WAITING_FOR_WRITE)  			break;  		woken++; -		tsk = waiter->task; - -		get_task_struct(tsk); -		list_del(&waiter->list); -		/* -		 * Ensure calling get_task_struct() before setting the reader -		 * waiter to nil such that rwsem_down_read_failed() cannot -		 * race with do_exit() by always holding a reference count -		 * to the task to wakeup. -		 */ -		smp_store_release(&waiter->task, NULL); -		/* -		 * Ensure issuing the wakeup (either by us or someone else) -		 * after setting the reader waiter to nil. -		 */ -		wake_q_add_safe(wake_q, tsk);  	} +	list_cut_before(&wlist, &sem->wait_list, &waiter->list);  	adjustment = woken * RWSEM_ACTIVE_READ_BIAS - adjustment; +	lockevent_cond_inc(rwsem_wake_reader, woken);  	if (list_empty(&sem->wait_list)) {  		/* hit end of list above */  		adjustment -= RWSEM_WAITING_BIAS; @@ -222,94 +218,29 @@ static void __rwsem_mark_wake(struct rw_semaphore *sem,  	if (adjustment)  		atomic_long_add(adjustment, &sem->count); -} -/* - * Wait for the read lock to be granted - */ -static inline struct rw_semaphore __sched * -__rwsem_down_read_failed_common(struct rw_semaphore *sem, int state) -{ -	long count, adjustment = -RWSEM_ACTIVE_READ_BIAS; -	struct rwsem_waiter waiter; -	DEFINE_WAKE_Q(wake_q); +	/* 2nd pass */ +	list_for_each_entry_safe(waiter, tmp, &wlist, list) { +		struct task_struct *tsk; -	waiter.task = current; -	waiter.type = RWSEM_WAITING_FOR_READ; +		tsk = waiter->task; +		get_task_struct(tsk); -	raw_spin_lock_irq(&sem->wait_lock); -	if (list_empty(&sem->wait_list)) {  		/* -		 * In case the wait queue is empty and the lock isn't owned -		 * by a writer, this reader can exit the slowpath and return -		 * immediately as its RWSEM_ACTIVE_READ_BIAS has already -		 * been set in the count. +		 * Ensure calling get_task_struct() before setting the reader +		 * waiter to nil such that rwsem_down_read_failed() cannot +		 * race with do_exit() by always holding a reference count +		 * to the task to wakeup.  		 */ -		if (atomic_long_read(&sem->count) >= 0) { -			raw_spin_unlock_irq(&sem->wait_lock); -			return sem; -		} -		adjustment += RWSEM_WAITING_BIAS; -	} -	list_add_tail(&waiter.list, &sem->wait_list); - -	/* we're now waiting on the lock, but no longer actively locking */ -	count = atomic_long_add_return(adjustment, &sem->count); - -	/* -	 * If there are no active locks, wake the front queued process(es). -	 * -	 * If there are no writers and we are first in the queue, -	 * wake our own waiter to join the existing active readers ! 
-	 */ -	if (count == RWSEM_WAITING_BIAS || -	    (count > RWSEM_WAITING_BIAS && -	     adjustment != -RWSEM_ACTIVE_READ_BIAS)) -		__rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q); - -	raw_spin_unlock_irq(&sem->wait_lock); -	wake_up_q(&wake_q); - -	/* wait to be given the lock */ -	while (true) { -		set_current_state(state); -		if (!waiter.task) -			break; -		if (signal_pending_state(state, current)) { -			raw_spin_lock_irq(&sem->wait_lock); -			if (waiter.task) -				goto out_nolock; -			raw_spin_unlock_irq(&sem->wait_lock); -			break; -		} -		schedule(); +		smp_store_release(&waiter->task, NULL); +		/* +		 * Ensure issuing the wakeup (either by us or someone else) +		 * after setting the reader waiter to nil. +		 */ +		wake_q_add_safe(wake_q, tsk);  	} - -	__set_current_state(TASK_RUNNING); -	return sem; -out_nolock: -	list_del(&waiter.list); -	if (list_empty(&sem->wait_list)) -		atomic_long_add(-RWSEM_WAITING_BIAS, &sem->count); -	raw_spin_unlock_irq(&sem->wait_lock); -	__set_current_state(TASK_RUNNING); -	return ERR_PTR(-EINTR);  } -__visible struct rw_semaphore * __sched -rwsem_down_read_failed(struct rw_semaphore *sem) -{ -	return __rwsem_down_read_failed_common(sem, TASK_UNINTERRUPTIBLE); -} -EXPORT_SYMBOL(rwsem_down_read_failed); - -__visible struct rw_semaphore * __sched -rwsem_down_read_failed_killable(struct rw_semaphore *sem) -{ -	return __rwsem_down_read_failed_common(sem, TASK_KILLABLE); -} -EXPORT_SYMBOL(rwsem_down_read_failed_killable); -  /*   * This function must be called with the sem->wait_lock held to prevent   * race conditions between checking the rwsem wait list and setting the @@ -346,21 +277,17 @@ static inline bool rwsem_try_write_lock(long count, struct rw_semaphore *sem)   */  static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem)  { -	long old, count = atomic_long_read(&sem->count); - -	while (true) { -		if (!(count == 0 || count == RWSEM_WAITING_BIAS)) -			return false; +	long count = atomic_long_read(&sem->count); -		old = atomic_long_cmpxchg_acquire(&sem->count, count, -				      count + RWSEM_ACTIVE_WRITE_BIAS); -		if (old == count) { +	while (!count || count == RWSEM_WAITING_BIAS) { +		if (atomic_long_try_cmpxchg_acquire(&sem->count, &count, +					count + RWSEM_ACTIVE_WRITE_BIAS)) {  			rwsem_set_owner(sem); +			lockevent_inc(rwsem_opt_wlock);  			return true;  		} - -		count = old;  	} +	return false;  }  static inline bool owner_on_cpu(struct task_struct *owner) @@ -481,6 +408,7 @@ static bool rwsem_optimistic_spin(struct rw_semaphore *sem)  	osq_unlock(&sem->osq);  done:  	preempt_enable(); +	lockevent_cond_inc(rwsem_opt_fail, !taken);  	return taken;  } @@ -505,6 +433,97 @@ static inline bool rwsem_has_spinner(struct rw_semaphore *sem)  #endif  /* + * Wait for the read lock to be granted + */ +static inline struct rw_semaphore __sched * +__rwsem_down_read_failed_common(struct rw_semaphore *sem, int state) +{ +	long count, adjustment = -RWSEM_ACTIVE_READ_BIAS; +	struct rwsem_waiter waiter; +	DEFINE_WAKE_Q(wake_q); + +	waiter.task = current; +	waiter.type = RWSEM_WAITING_FOR_READ; + +	raw_spin_lock_irq(&sem->wait_lock); +	if (list_empty(&sem->wait_list)) { +		/* +		 * In case the wait queue is empty and the lock isn't owned +		 * by a writer, this reader can exit the slowpath and return +		 * immediately as its RWSEM_ACTIVE_READ_BIAS has already +		 * been set in the count. 
+		 */ +		if (atomic_long_read(&sem->count) >= 0) { +			raw_spin_unlock_irq(&sem->wait_lock); +			rwsem_set_reader_owned(sem); +			lockevent_inc(rwsem_rlock_fast); +			return sem; +		} +		adjustment += RWSEM_WAITING_BIAS; +	} +	list_add_tail(&waiter.list, &sem->wait_list); + +	/* we're now waiting on the lock, but no longer actively locking */ +	count = atomic_long_add_return(adjustment, &sem->count); + +	/* +	 * If there are no active locks, wake the front queued process(es). +	 * +	 * If there are no writers and we are first in the queue, +	 * wake our own waiter to join the existing active readers ! +	 */ +	if (count == RWSEM_WAITING_BIAS || +	    (count > RWSEM_WAITING_BIAS && +	     adjustment != -RWSEM_ACTIVE_READ_BIAS)) +		__rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q); + +	raw_spin_unlock_irq(&sem->wait_lock); +	wake_up_q(&wake_q); + +	/* wait to be given the lock */ +	while (true) { +		set_current_state(state); +		if (!waiter.task) +			break; +		if (signal_pending_state(state, current)) { +			raw_spin_lock_irq(&sem->wait_lock); +			if (waiter.task) +				goto out_nolock; +			raw_spin_unlock_irq(&sem->wait_lock); +			break; +		} +		schedule(); +		lockevent_inc(rwsem_sleep_reader); +	} + +	__set_current_state(TASK_RUNNING); +	lockevent_inc(rwsem_rlock); +	return sem; +out_nolock: +	list_del(&waiter.list); +	if (list_empty(&sem->wait_list)) +		atomic_long_add(-RWSEM_WAITING_BIAS, &sem->count); +	raw_spin_unlock_irq(&sem->wait_lock); +	__set_current_state(TASK_RUNNING); +	lockevent_inc(rwsem_rlock_fail); +	return ERR_PTR(-EINTR); +} + +__visible struct rw_semaphore * __sched +rwsem_down_read_failed(struct rw_semaphore *sem) +{ +	return __rwsem_down_read_failed_common(sem, TASK_UNINTERRUPTIBLE); +} +EXPORT_SYMBOL(rwsem_down_read_failed); + +__visible struct rw_semaphore * __sched +rwsem_down_read_failed_killable(struct rw_semaphore *sem) +{ +	return __rwsem_down_read_failed_common(sem, TASK_KILLABLE); +} +EXPORT_SYMBOL(rwsem_down_read_failed_killable); + +/*   * Wait until we successfully acquire the write lock   */  static inline struct rw_semaphore * @@ -580,6 +599,7 @@ __rwsem_down_write_failed_common(struct rw_semaphore *sem, int state)  				goto out_nolock;  			schedule(); +			lockevent_inc(rwsem_sleep_writer);  			set_current_state(state);  		} while ((count = atomic_long_read(&sem->count)) & RWSEM_ACTIVE_MASK); @@ -588,6 +608,7 @@ __rwsem_down_write_failed_common(struct rw_semaphore *sem, int state)  	__set_current_state(TASK_RUNNING);  	list_del(&waiter.list);  	raw_spin_unlock_irq(&sem->wait_lock); +	lockevent_inc(rwsem_wlock);  	return ret; @@ -601,6 +622,7 @@ out_nolock:  		__rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q);  	raw_spin_unlock_irq(&sem->wait_lock);  	wake_up_q(&wake_q); +	lockevent_inc(rwsem_wlock_fail);  	return ERR_PTR(-EINTR);  } diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index e586f0d03ad3..ccbf18f560ff 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -24,7 +24,6 @@ void __sched down_read(struct rw_semaphore *sem)  	rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_);  	LOCK_CONTENDED(sem, __down_read_trylock, __down_read); -	rwsem_set_reader_owned(sem);  }  EXPORT_SYMBOL(down_read); @@ -39,7 +38,6 @@ int __sched down_read_killable(struct rw_semaphore *sem)  		return -EINTR;  	} -	rwsem_set_reader_owned(sem);  	return 0;  } @@ -52,10 +50,8 @@ int down_read_trylock(struct rw_semaphore *sem)  {  	int ret = __down_read_trylock(sem); -	if (ret == 1) { +	if (ret == 1)  		rwsem_acquire_read(&sem->dep_map, 0, 1, _RET_IP_); -		
rwsem_set_reader_owned(sem); -	}  	return ret;  } @@ -70,7 +66,6 @@ void __sched down_write(struct rw_semaphore *sem)  	rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_);  	LOCK_CONTENDED(sem, __down_write_trylock, __down_write); -	rwsem_set_owner(sem);  }  EXPORT_SYMBOL(down_write); @@ -88,7 +83,6 @@ int __sched down_write_killable(struct rw_semaphore *sem)  		return -EINTR;  	} -	rwsem_set_owner(sem);  	return 0;  } @@ -101,10 +95,8 @@ int down_write_trylock(struct rw_semaphore *sem)  {  	int ret = __down_write_trylock(sem); -	if (ret == 1) { +	if (ret == 1)  		rwsem_acquire(&sem->dep_map, 0, 1, _RET_IP_); -		rwsem_set_owner(sem); -	}  	return ret;  } @@ -117,9 +109,7 @@ EXPORT_SYMBOL(down_write_trylock);  void up_read(struct rw_semaphore *sem)  {  	rwsem_release(&sem->dep_map, 1, _RET_IP_); -	DEBUG_RWSEMS_WARN_ON(!((unsigned long)sem->owner & RWSEM_READER_OWNED)); -	rwsem_clear_reader_owned(sem);  	__up_read(sem);  } @@ -131,9 +121,7 @@ EXPORT_SYMBOL(up_read);  void up_write(struct rw_semaphore *sem)  {  	rwsem_release(&sem->dep_map, 1, _RET_IP_); -	DEBUG_RWSEMS_WARN_ON(sem->owner != current); -	rwsem_clear_owner(sem);  	__up_write(sem);  } @@ -145,9 +133,7 @@ EXPORT_SYMBOL(up_write);  void downgrade_write(struct rw_semaphore *sem)  {  	lock_downgrade(&sem->dep_map, _RET_IP_); -	DEBUG_RWSEMS_WARN_ON(sem->owner != current); -	rwsem_set_reader_owned(sem);  	__downgrade_write(sem);  } @@ -161,7 +147,6 @@ void down_read_nested(struct rw_semaphore *sem, int subclass)  	rwsem_acquire_read(&sem->dep_map, subclass, 0, _RET_IP_);  	LOCK_CONTENDED(sem, __down_read_trylock, __down_read); -	rwsem_set_reader_owned(sem);  }  EXPORT_SYMBOL(down_read_nested); @@ -172,7 +157,6 @@ void _down_write_nest_lock(struct rw_semaphore *sem, struct lockdep_map *nest)  	rwsem_acquire_nest(&sem->dep_map, 0, 0, nest, _RET_IP_);  	LOCK_CONTENDED(sem, __down_write_trylock, __down_write); -	rwsem_set_owner(sem);  }  EXPORT_SYMBOL(_down_write_nest_lock); @@ -193,7 +177,6 @@ void down_write_nested(struct rw_semaphore *sem, int subclass)  	rwsem_acquire(&sem->dep_map, subclass, 0, _RET_IP_);  	LOCK_CONTENDED(sem, __down_write_trylock, __down_write); -	rwsem_set_owner(sem);  }  EXPORT_SYMBOL(down_write_nested); @@ -208,7 +191,6 @@ int __sched down_write_killable_nested(struct rw_semaphore *sem, int subclass)  		return -EINTR;  	} -	rwsem_set_owner(sem);  	return 0;  } @@ -216,7 +198,8 @@ EXPORT_SYMBOL(down_write_killable_nested);  void up_read_non_owner(struct rw_semaphore *sem)  { -	DEBUG_RWSEMS_WARN_ON(!((unsigned long)sem->owner & RWSEM_READER_OWNED)); +	DEBUG_RWSEMS_WARN_ON(!((unsigned long)sem->owner & RWSEM_READER_OWNED), +				sem);  	__up_read(sem);  } diff --git a/kernel/locking/rwsem.h b/kernel/locking/rwsem.h index bad2bca0268b..64877f5294e3 100644 --- a/kernel/locking/rwsem.h +++ b/kernel/locking/rwsem.h @@ -23,15 +23,44 @@   * is involved. Ideally we would like to track all the readers that own   * a rwsem, but the overhead is simply too big.   */ +#include "lock_events.h" +  #define RWSEM_READER_OWNED	(1UL << 0)  #define RWSEM_ANONYMOUSLY_OWNED	(1UL << 1)  #ifdef CONFIG_DEBUG_RWSEMS -# define DEBUG_RWSEMS_WARN_ON(c)	DEBUG_LOCKS_WARN_ON(c) +# define DEBUG_RWSEMS_WARN_ON(c, sem)	do {			\ +	if (!debug_locks_silent &&				\ +	    WARN_ONCE(c, "DEBUG_RWSEMS_WARN_ON(%s): count = 0x%lx, owner = 0x%lx, curr 0x%lx, list %sempty\n",\ +		#c, atomic_long_read(&(sem)->count),		\ +		(long)((sem)->owner), (long)current,		\ +		list_empty(&(sem)->wait_list) ? 
"" : "not "))	\ +			debug_locks_off();			\ +	} while (0) +#else +# define DEBUG_RWSEMS_WARN_ON(c, sem) +#endif + +/* + * R/W semaphores originally for PPC using the stuff in lib/rwsem.c. + * Adapted largely from include/asm-i386/rwsem.h + * by Paul Mackerras <paulus@samba.org>. + */ + +/* + * the semaphore definition + */ +#ifdef CONFIG_64BIT +# define RWSEM_ACTIVE_MASK		0xffffffffL  #else -# define DEBUG_RWSEMS_WARN_ON(c) +# define RWSEM_ACTIVE_MASK		0x0000ffffL  #endif +#define RWSEM_ACTIVE_BIAS		0x00000001L +#define RWSEM_WAITING_BIAS		(-RWSEM_ACTIVE_MASK-1) +#define RWSEM_ACTIVE_READ_BIAS		RWSEM_ACTIVE_BIAS +#define RWSEM_ACTIVE_WRITE_BIAS		(RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS) +  #ifdef CONFIG_RWSEM_SPIN_ON_OWNER  /*   * All writes to owner are protected by WRITE_ONCE() to make sure that @@ -132,3 +161,144 @@ static inline void rwsem_clear_reader_owned(struct rw_semaphore *sem)  {  }  #endif + +extern struct rw_semaphore *rwsem_down_read_failed(struct rw_semaphore *sem); +extern struct rw_semaphore *rwsem_down_read_failed_killable(struct rw_semaphore *sem); +extern struct rw_semaphore *rwsem_down_write_failed(struct rw_semaphore *sem); +extern struct rw_semaphore *rwsem_down_write_failed_killable(struct rw_semaphore *sem); +extern struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem); +extern struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem); + +/* + * lock for reading + */ +static inline void __down_read(struct rw_semaphore *sem) +{ +	if (unlikely(atomic_long_inc_return_acquire(&sem->count) <= 0)) { +		rwsem_down_read_failed(sem); +		DEBUG_RWSEMS_WARN_ON(!((unsigned long)sem->owner & +					RWSEM_READER_OWNED), sem); +	} else { +		rwsem_set_reader_owned(sem); +	} +} + +static inline int __down_read_killable(struct rw_semaphore *sem) +{ +	if (unlikely(atomic_long_inc_return_acquire(&sem->count) <= 0)) { +		if (IS_ERR(rwsem_down_read_failed_killable(sem))) +			return -EINTR; +		DEBUG_RWSEMS_WARN_ON(!((unsigned long)sem->owner & +					RWSEM_READER_OWNED), sem); +	} else { +		rwsem_set_reader_owned(sem); +	} +	return 0; +} + +static inline int __down_read_trylock(struct rw_semaphore *sem) +{ +	/* +	 * Optimize for the case when the rwsem is not locked at all. 
+	 */ +	long tmp = RWSEM_UNLOCKED_VALUE; + +	lockevent_inc(rwsem_rtrylock); +	do { +		if (atomic_long_try_cmpxchg_acquire(&sem->count, &tmp, +					tmp + RWSEM_ACTIVE_READ_BIAS)) { +			rwsem_set_reader_owned(sem); +			return 1; +		} +	} while (tmp >= 0); +	return 0; +} + +/* + * lock for writing + */ +static inline void __down_write(struct rw_semaphore *sem) +{ +	long tmp; + +	tmp = atomic_long_add_return_acquire(RWSEM_ACTIVE_WRITE_BIAS, +					     &sem->count); +	if (unlikely(tmp != RWSEM_ACTIVE_WRITE_BIAS)) +		rwsem_down_write_failed(sem); +	rwsem_set_owner(sem); +} + +static inline int __down_write_killable(struct rw_semaphore *sem) +{ +	long tmp; + +	tmp = atomic_long_add_return_acquire(RWSEM_ACTIVE_WRITE_BIAS, +					     &sem->count); +	if (unlikely(tmp != RWSEM_ACTIVE_WRITE_BIAS)) +		if (IS_ERR(rwsem_down_write_failed_killable(sem))) +			return -EINTR; +	rwsem_set_owner(sem); +	return 0; +} + +static inline int __down_write_trylock(struct rw_semaphore *sem) +{ +	long tmp; + +	lockevent_inc(rwsem_wtrylock); +	tmp = atomic_long_cmpxchg_acquire(&sem->count, RWSEM_UNLOCKED_VALUE, +		      RWSEM_ACTIVE_WRITE_BIAS); +	if (tmp == RWSEM_UNLOCKED_VALUE) { +		rwsem_set_owner(sem); +		return true; +	} +	return false; +} + +/* + * unlock after reading + */ +static inline void __up_read(struct rw_semaphore *sem) +{ +	long tmp; + +	DEBUG_RWSEMS_WARN_ON(!((unsigned long)sem->owner & RWSEM_READER_OWNED), +				sem); +	rwsem_clear_reader_owned(sem); +	tmp = atomic_long_dec_return_release(&sem->count); +	if (unlikely(tmp < -1 && (tmp & RWSEM_ACTIVE_MASK) == 0)) +		rwsem_wake(sem); +} + +/* + * unlock after writing + */ +static inline void __up_write(struct rw_semaphore *sem) +{ +	DEBUG_RWSEMS_WARN_ON(sem->owner != current, sem); +	rwsem_clear_owner(sem); +	if (unlikely(atomic_long_sub_return_release(RWSEM_ACTIVE_WRITE_BIAS, +						    &sem->count) < 0)) +		rwsem_wake(sem); +} + +/* + * downgrade write lock to read lock + */ +static inline void __downgrade_write(struct rw_semaphore *sem) +{ +	long tmp; + +	/* +	 * When downgrading from exclusive to shared ownership, +	 * anything inside the write-locked region cannot leak +	 * into the read side. In contrast, anything in the +	 * read-locked region is ok to be re-ordered into the +	 * write side. As such, rely on RELEASE semantics. 
+	 */ +	DEBUG_RWSEMS_WARN_ON(sem->owner != current, sem); +	tmp = atomic_long_add_return_release(-RWSEM_WAITING_BIAS, &sem->count); +	rwsem_set_reader_owned(sem); +	if (tmp < 0) +		rwsem_downgrade_wake(sem); +} diff --git a/kernel/locking/spinlock.c b/kernel/locking/spinlock.c index 936f3d14dd6b..0ff08380f531 100644 --- a/kernel/locking/spinlock.c +++ b/kernel/locking/spinlock.c @@ -22,6 +22,13 @@  #include <linux/debug_locks.h>  #include <linux/export.h> +#ifdef CONFIG_MMIOWB +#ifndef arch_mmiowb_state +DEFINE_PER_CPU(struct mmiowb_state, __mmiowb_state); +EXPORT_PER_CPU_SYMBOL(__mmiowb_state); +#endif +#endif +  /*   * If lockdep is enabled then we use the non-preemption spin-ops   * even on CONFIG_PREEMPT, because lockdep assumes that interrupts are diff --git a/kernel/locking/spinlock_debug.c b/kernel/locking/spinlock_debug.c index 9aa0fccd5d43..399669f7eba8 100644 --- a/kernel/locking/spinlock_debug.c +++ b/kernel/locking/spinlock_debug.c @@ -111,6 +111,7 @@ void do_raw_spin_lock(raw_spinlock_t *lock)  {  	debug_spin_lock_before(lock);  	arch_spin_lock(&lock->raw_lock); +	mmiowb_spin_lock();  	debug_spin_lock_after(lock);  } @@ -118,8 +119,10 @@ int do_raw_spin_trylock(raw_spinlock_t *lock)  {  	int ret = arch_spin_trylock(&lock->raw_lock); -	if (ret) +	if (ret) { +		mmiowb_spin_lock();  		debug_spin_lock_after(lock); +	}  #ifndef CONFIG_SMP  	/*  	 * Must not happen on UP: @@ -131,6 +134,7 @@ int do_raw_spin_trylock(raw_spinlock_t *lock)  void do_raw_spin_unlock(raw_spinlock_t *lock)  { +	mmiowb_spin_unlock();  	debug_spin_unlock(lock);  	arch_spin_unlock(&lock->raw_lock);  } diff --git a/kernel/locking/test-ww_mutex.c b/kernel/locking/test-ww_mutex.c index 65a3b7e55b9f..3e82f449b4ff 100644 --- a/kernel/locking/test-ww_mutex.c +++ b/kernel/locking/test-ww_mutex.c @@ -1,19 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0-or-later  /*   * Module-based API test facility for ww_mutexes - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, you can access it online at - * http://www.gnu.org/licenses/gpl-2.0.html.   
*/  #include <linux/kernel.h> diff --git a/kernel/memremap.c b/kernel/memremap.c index a856cb5ff192..1490e63f69a9 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -45,7 +45,6 @@ vm_fault_t device_private_entry_fault(struct vm_area_struct *vma,  	 */  	return devmem->page_fault(vma, addr, page, flags, pmdp);  } -EXPORT_SYMBOL(device_private_entry_fault);  #endif /* CONFIG_DEVICE_PRIVATE */  static void pgmap_array_delete(struct resource *res) @@ -148,6 +147,12 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)  			&pgmap->altmap : NULL;  	struct resource *res = &pgmap->res;  	struct dev_pagemap *conflict_pgmap; +	struct mhp_restrictions restrictions = { +		/* +		 * We do not want any optional features only our own memmap +		*/ +		.altmap = altmap, +	};  	pgprot_t pgprot = PAGE_KERNEL;  	int error, nid, is_ram; @@ -214,7 +219,7 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)  	 */  	if (pgmap->type == MEMORY_DEVICE_PRIVATE) {  		error = add_pages(nid, align_start >> PAGE_SHIFT, -				align_size >> PAGE_SHIFT, NULL, false); +				align_size >> PAGE_SHIFT, &restrictions);  	} else {  		error = kasan_add_zero_shadow(__va(align_start), align_size);  		if (error) { @@ -222,8 +227,8 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)  			goto err_kasan;  		} -		error = arch_add_memory(nid, align_start, align_size, altmap, -				false); +		error = arch_add_memory(nid, align_start, align_size, +					&restrictions);  	}  	if (!error) { diff --git a/kernel/module-internal.h b/kernel/module-internal.h index 79c9be2dbbe9..33783abc377b 100644 --- a/kernel/module-internal.h +++ b/kernel/module-internal.h @@ -1,12 +1,8 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */  /* Module internals   *   * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved.   * Written by David Howells (dhowells@redhat.com) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public Licence - * as published by the Free Software Foundation; either version - * 2 of the Licence, or (at your option) any later version.   */  #include <linux/elf.h> @@ -20,7 +16,7 @@ struct load_info {  	unsigned long len;  	Elf_Shdr *sechdrs;  	char *secstrings, *strtab; -	unsigned long symoffs, stroffs; +	unsigned long symoffs, stroffs, init_typeoffs, core_typeoffs;  	struct _ddebug *debug;  	unsigned int num_debug;  	bool sig_ok; diff --git a/kernel/module.c b/kernel/module.c index 0b9aa8ab89f0..80c7c09584cf 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1,20 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0-or-later  /*     Copyright (C) 2002 Richard Henderson     Copyright (C) 2001 Rusty Russell, 2002, 2010 Rusty Russell IBM. -    This program is free software; you can redistribute it and/or modify -    it under the terms of the GNU General Public License as published by -    the Free Software Foundation; either version 2 of the License, or -    (at your option) any later version. - -    This program is distributed in the hope that it will be useful, -    but WITHOUT ANY WARRANTY; without even the implied warranty of -    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the -    GNU General Public License for more details. 
- -    You should have received a copy of the GNU General Public License -    along with this program; if not, write to the Free Software -    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA  */  #include <linux/export.h>  #include <linux/extable.h> @@ -98,6 +86,10 @@ DEFINE_MUTEX(module_mutex);  EXPORT_SYMBOL_GPL(module_mutex);  static LIST_HEAD(modules); +/* Work queue for freeing init sections in success case */ +static struct work_struct init_free_wq; +static struct llist_head init_free_list; +  #ifdef CONFIG_MODULES_TREE_LOOKUP  /* @@ -286,6 +278,11 @@ bool is_module_sig_enforced(void)  }  EXPORT_SYMBOL(is_module_sig_enforced); +void set_module_sig_enforced(void) +{ +	sig_enforce = true; +} +  /* Block module loading/unloading? */  int modules_disabled = 0;  core_param(nomodule, modules_disabled, bint, 0); @@ -1949,9 +1946,16 @@ void module_enable_ro(const struct module *mod, bool after_init)  	if (!rodata_enabled)  		return; +	set_vm_flush_reset_perms(mod->core_layout.base); +	set_vm_flush_reset_perms(mod->init_layout.base);  	frob_text(&mod->core_layout, set_memory_ro); +	frob_text(&mod->core_layout, set_memory_x); +  	frob_rodata(&mod->core_layout, set_memory_ro); +  	frob_text(&mod->init_layout, set_memory_ro); +	frob_text(&mod->init_layout, set_memory_x); +  	frob_rodata(&mod->init_layout, set_memory_ro);  	if (after_init) @@ -1967,15 +1971,6 @@ static void module_enable_nx(const struct module *mod)  	frob_writable_data(&mod->init_layout, set_memory_nx);  } -static void module_disable_nx(const struct module *mod) -{ -	frob_rodata(&mod->core_layout, set_memory_x); -	frob_ro_after_init(&mod->core_layout, set_memory_x); -	frob_writable_data(&mod->core_layout, set_memory_x); -	frob_rodata(&mod->init_layout, set_memory_x); -	frob_writable_data(&mod->init_layout, set_memory_x); -} -  /* Iterate through all modules and set each module's text as RW */  void set_all_modules_text_rw(void)  { @@ -2019,23 +2014,8 @@ void set_all_modules_text_ro(void)  	}  	mutex_unlock(&module_mutex);  } - -static void disable_ro_nx(const struct module_layout *layout) -{ -	if (rodata_enabled) { -		frob_text(layout, set_memory_rw); -		frob_rodata(layout, set_memory_rw); -		frob_ro_after_init(layout, set_memory_rw); -	} -	frob_rodata(layout, set_memory_x); -	frob_ro_after_init(layout, set_memory_x); -	frob_writable_data(layout, set_memory_x); -} -  #else -static void disable_ro_nx(const struct module_layout *layout) { }  static void module_enable_nx(const struct module *mod) { } -static void module_disable_nx(const struct module *mod) { }  #endif  #ifdef CONFIG_LIVEPATCH @@ -2115,6 +2095,11 @@ static void free_module_elf(struct module *mod)  void __weak module_memfree(void *module_region)  { +	/* +	 * This memory may be RO, and freeing RO memory in an interrupt is not +	 * supported by vmalloc. 
+	 */ +	WARN_ON(in_interrupt());  	vfree(module_region);  } @@ -2166,7 +2151,6 @@ static void free_module(struct module *mod)  	mutex_unlock(&module_mutex);  	/* This may be empty, but that's OK */ -	disable_ro_nx(&mod->init_layout);  	module_arch_freeing_init(mod);  	module_memfree(mod->init_layout.base);  	kfree(mod->args); @@ -2176,7 +2160,6 @@ static void free_module(struct module *mod)  	lockdep_free_key_range(mod->core_layout.base, mod->core_layout.size);  	/* Finally, free the core (containing the module structure) */ -	disable_ro_nx(&mod->core_layout);  	module_memfree(mod->core_layout.base);  } @@ -2647,6 +2630,8 @@ static void layout_symtab(struct module *mod, struct load_info *info)  	info->symoffs = ALIGN(mod->core_layout.size, symsect->sh_addralign ?: 1);  	info->stroffs = mod->core_layout.size = info->symoffs + ndst * sizeof(Elf_Sym);  	mod->core_layout.size += strtab_size; +	info->core_typeoffs = mod->core_layout.size; +	mod->core_layout.size += ndst * sizeof(char);  	mod->core_layout.size = debug_align(mod->core_layout.size);  	/* Put string table section at end of init part of module. */ @@ -2660,6 +2645,8 @@ static void layout_symtab(struct module *mod, struct load_info *info)  				      __alignof__(struct mod_kallsyms));  	info->mod_kallsyms_init_off = mod->init_layout.size;  	mod->init_layout.size += sizeof(struct mod_kallsyms); +	info->init_typeoffs = mod->init_layout.size; +	mod->init_layout.size += nsrc * sizeof(char);  	mod->init_layout.size = debug_align(mod->init_layout.size);  } @@ -2683,20 +2670,23 @@ static void add_kallsyms(struct module *mod, const struct load_info *info)  	mod->kallsyms->num_symtab = symsec->sh_size / sizeof(Elf_Sym);  	/* Make sure we get permanent strtab: don't use info->strtab. */  	mod->kallsyms->strtab = (void *)info->sechdrs[info->index.str].sh_addr; +	mod->kallsyms->typetab = mod->init_layout.base + info->init_typeoffs; -	/* Set types up while we still have access to sections. */ -	for (i = 0; i < mod->kallsyms->num_symtab; i++) -		mod->kallsyms->symtab[i].st_size -			= elf_type(&mod->kallsyms->symtab[i], info); - -	/* Now populate the cut down core kallsyms for after init. */ +	/* +	 * Now populate the cut down core kallsyms for after init +	 * and set types up while we still have access to sections. 
+	 */  	mod->core_kallsyms.symtab = dst = mod->core_layout.base + info->symoffs;  	mod->core_kallsyms.strtab = s = mod->core_layout.base + info->stroffs; +	mod->core_kallsyms.typetab = mod->core_layout.base + info->core_typeoffs;  	src = mod->kallsyms->symtab;  	for (ndst = i = 0; i < mod->kallsyms->num_symtab; i++) { +		mod->kallsyms->typetab[i] = elf_type(src + i, info);  		if (i == 0 || is_livepatch_module(mod) ||  		    is_core_symbol(src+i, info->sechdrs, info->hdr->e_shnum,  				   info->index.pcpu)) { +			mod->core_kallsyms.typetab[ndst] = +			    mod->kallsyms->typetab[i];  			dst[ndst] = src[i];  			dst[ndst++].st_name = s - mod->core_kallsyms.strtab;  			s += strlcpy(s, &mod->kallsyms->strtab[src[i].st_name], @@ -3415,16 +3405,33 @@ static void do_mod_ctors(struct module *mod)  /* For freeing module_init on success, in case kallsyms traversing */  struct mod_initfree { -	struct rcu_head rcu; +	struct llist_node node;  	void *module_init;  }; -static void do_free_init(struct rcu_head *head) +static void do_free_init(struct work_struct *w)  { -	struct mod_initfree *m = container_of(head, struct mod_initfree, rcu); -	module_memfree(m->module_init); -	kfree(m); +	struct llist_node *pos, *n, *list; +	struct mod_initfree *initfree; + +	list = llist_del_all(&init_free_list); + +	synchronize_rcu(); + +	llist_for_each_safe(pos, n, list) { +		initfree = container_of(pos, struct mod_initfree, node); +		module_memfree(initfree->module_init); +		kfree(initfree); +	} +} + +static int __init modules_wq_init(void) +{ +	INIT_WORK(&init_free_wq, do_free_init); +	init_llist_head(&init_free_list); +	return 0;  } +module_init(modules_wq_init);  /*   * This is where the real work happens. @@ -3502,7 +3509,6 @@ static noinline int do_init_module(struct module *mod)  #endif  	module_enable_ro(mod, true);  	mod_tree_remove_init(mod); -	disable_ro_nx(&mod->init_layout);  	module_arch_freeing_init(mod);  	mod->init_layout.base = NULL;  	mod->init_layout.size = 0; @@ -3513,14 +3519,18 @@ static noinline int do_init_module(struct module *mod)  	 * We want to free module_init, but be aware that kallsyms may be  	 * walking this with preempt disabled.  In all the failure paths, we  	 * call synchronize_rcu(), but we don't want to slow down the success -	 * path, so use actual RCU here. +	 * path. module_memfree() cannot be called in an interrupt, so do the +	 * work and call synchronize_rcu() in a work queue. +	 *  	 * Note that module_alloc() on most architectures creates W+X page  	 * mappings which won't be cleaned up until do_free_init() runs.  
Any  	 * code such as mark_rodata_ro() which depends on those mappings to  	 * be cleaned up needs to sync with the queued work - ie  	 * rcu_barrier()  	 */ -	call_rcu(&freeinit->rcu, do_free_init); +	if (llist_add(&freeinit->node, &init_free_list)) +		schedule_work(&init_free_wq); +  	mutex_unlock(&module_mutex);  	wake_up_all(&module_wq); @@ -3817,10 +3827,6 @@ static int load_module(struct load_info *info, const char __user *uargs,  	module_bug_cleanup(mod);  	mutex_unlock(&module_mutex); -	/* we can't deallocate the module until we clear memory protection */ -	module_disable_ro(mod); -	module_disable_nx(mod); -   ddebug_cleanup:  	ftrace_release_mod(mod);  	dynamic_debug_remove(mod, info->debug); @@ -4080,7 +4086,7 @@ int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type,  			const Elf_Sym *sym = &kallsyms->symtab[symnum];  			*value = kallsyms_symbol_value(sym); -			*type = sym->st_size; +			*type = kallsyms->typetab[symnum];  			strlcpy(name, kallsyms_symbol_name(kallsyms, symnum), KSYM_NAME_LEN);  			strlcpy(module_name, mod->name, MODULE_NAME_LEN);  			*exported = is_exported(name, *value, mod); diff --git a/kernel/module_signing.c b/kernel/module_signing.c index 6b9a926fd86b..b10fb1986ca9 100644 --- a/kernel/module_signing.c +++ b/kernel/module_signing.c @@ -1,12 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0-or-later  /* Module signature checker   *   * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved.   * Written by David Howells (dhowells@redhat.com) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public Licence - * as published by the Free Software Foundation; either version - * 2 of the Licence, or (at your option) any later version.   */  #include <linux/kernel.h> diff --git a/kernel/notifier.c b/kernel/notifier.c index 6196af8a8223..d9f5081d578d 100644 --- a/kernel/notifier.c +++ b/kernel/notifier.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only  #include <linux/kdebug.h>  #include <linux/kprobes.h>  #include <linux/export.h> @@ -22,6 +23,7 @@ static int notifier_chain_register(struct notifier_block **nl,  		struct notifier_block *n)  {  	while ((*nl) != NULL) { +		WARN_ONCE(((*nl) == n), "double register detected");  		if (n->priority > (*nl)->priority)  			break;  		nl = &((*nl)->next); diff --git a/kernel/padata.c b/kernel/padata.c index 3e2633ae3bca..2d2fddbb7a4c 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -957,6 +957,7 @@ static struct attribute *padata_default_attrs[] = {  	&parallel_cpumask_attr.attr,  	NULL,  }; +ATTRIBUTE_GROUPS(padata_default);  static ssize_t padata_sysfs_show(struct kobject *kobj,  				 struct attribute *attr, char *buf) @@ -995,7 +996,7 @@ static const struct sysfs_ops padata_sysfs_ops = {  static struct kobj_type padata_attr_type = {  	.sysfs_ops = &padata_sysfs_ops, -	.default_attrs = padata_default_attrs, +	.default_groups = padata_default_groups,  	.release = padata_sysfs_release,  }; diff --git a/kernel/panic.c b/kernel/panic.c index 0ae0d7332f12..4d9f55bf7d38 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only  /*   *  linux/kernel/panic.c   * @@ -51,6 +52,7 @@ EXPORT_SYMBOL_GPL(panic_timeout);  #define PANIC_PRINT_TIMER_INFO		0x00000004  #define PANIC_PRINT_LOCK_INFO		0x00000008  #define PANIC_PRINT_FTRACE_INFO		0x00000010 +#define PANIC_PRINT_ALL_PRINTK_MSG	0x00000020  unsigned long panic_print;  ATOMIC_NOTIFIER_HEAD(panic_notifier_list); @@ -134,6 +136,9 @@
EXPORT_SYMBOL(nmi_panic);  static void panic_print_sys_info(void)  { +	if (panic_print & PANIC_PRINT_ALL_PRINTK_MSG) +		console_flush_on_panic(CONSOLE_REPLAY_ALL); +  	if (panic_print & PANIC_PRINT_TASK_INFO)  		show_state(); @@ -277,7 +282,7 @@ void panic(const char *fmt, ...)  	 * panic() is not being callled from OOPS.  	 */  	debug_locks_off(); -	console_flush_on_panic(); +	console_flush_on_panic(CONSOLE_FLUSH_PENDING);  	panic_print_sys_info(); @@ -306,6 +311,8 @@ void panic(const char *fmt, ...)  		 * shutting down.  But if there is a chance of  		 * rebooting the system it will be rebooted.  		 */ +		if (panic_reboot_mode != REBOOT_UNDEFINED) +			reboot_mode = panic_reboot_mode;  		emergency_restart();  	}  #ifdef __sparc__ @@ -318,14 +325,12 @@ void panic(const char *fmt, ...)  	}  #endif  #if defined(CONFIG_S390) -	{ -		unsigned long caller; - -		caller = (unsigned long)__builtin_return_address(0); -		disabled_wait(caller); -	} +	disabled_wait();  #endif  	pr_emerg("---[ end Kernel panic - not syncing: %s ]---\n", buf); + +	/* Do not scroll important messages printed above */ +	suppress_printk = 1;  	local_irq_enable();  	for (i = 0; ; i += PANIC_TIMER_STEP) {  		touch_softlockup_watchdog(); diff --git a/kernel/params.c b/kernel/params.c index ce89f757e6da..cf448785d058 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -1,19 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0-or-later  /* Helpers for initial module or kernel cmdline parsing     Copyright (C) 2001 Rusty Russell. -    This program is free software; you can redistribute it and/or modify -    it under the terms of the GNU General Public License as published by -    the Free Software Foundation; either version 2 of the License, or -    (at your option) any later version. - -    This program is distributed in the hope that it will be useful, -    but WITHOUT ANY WARRANTY; without even the implied warranty of -    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the -    GNU General Public License for more details. 
- -    You should have received a copy of the GNU General Public License -    along with this program; if not, write to the Free Software -    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA  */  #include <linux/kernel.h>  #include <linux/string.h> diff --git a/kernel/pid.c b/kernel/pid.c index 20881598bdfa..e5cad0c7d5dd 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only  /*   * Generic pidhash and scalable, time-bounded PID allocator   * @@ -32,7 +33,6 @@  #include <linux/init.h>  #include <linux/rculist.h>  #include <linux/memblock.h> -#include <linux/hash.h>  #include <linux/pid_namespace.h>  #include <linux/init_task.h>  #include <linux/syscalls.h> diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index aa6e72fb7c08..f54bc7cb6c2d 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only  /*   * Pid namespaces   * diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index f8fe57d1022e..ff8592ddedee 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0-only  config SUSPEND  	bool "Suspend to RAM and standby"  	depends on ARCH_SUSPEND_POSSIBLE @@ -114,6 +115,15 @@ config PM_SLEEP_SMP  	depends on PM_SLEEP  	select HOTPLUG_CPU +config PM_SLEEP_SMP_NONZERO_CPU +	def_bool y +	depends on PM_SLEEP_SMP +	depends on ARCH_SUSPEND_NONZERO_CPU +	---help--- +	If an arch can suspend (for suspend, hibernate, kexec, etc) on a +	non-zero numbered CPU, it may define ARCH_SUSPEND_NONZERO_CPU. This +	will allow nohz_full mask to include CPU0. +  config PM_AUTOSLEEP  	bool "Opportunistic sleep"  	depends on PM_SLEEP diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index abef759de7c8..97522630b1b6 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -14,7 +14,6 @@  #include <linux/export.h>  #include <linux/suspend.h> -#include <linux/syscalls.h>  #include <linux/reboot.h>  #include <linux/string.h>  #include <linux/device.h> @@ -130,7 +129,7 @@ static int hibernation_test(int level) { return 0; }  static int platform_begin(int platform_mode)  {  	return (platform_mode && hibernation_ops) ? 
-		hibernation_ops->begin() : 0; +		hibernation_ops->begin(PMSG_FREEZE) : 0;  }  /** @@ -281,7 +280,7 @@ static int create_image(int platform_mode)  	if (error || hibernation_test(TEST_PLATFORM))  		goto Platform_finish; -	error = disable_nonboot_cpus(); +	error = suspend_disable_secondary_cpus();  	if (error || hibernation_test(TEST_CPUS))  		goto Enable_cpus; @@ -323,7 +322,7 @@ static int create_image(int platform_mode)  	local_irq_enable();   Enable_cpus: -	enable_nonboot_cpus(); +	suspend_enable_secondary_cpus();   Platform_finish:  	platform_finish(platform_mode); @@ -417,7 +416,7 @@ int hibernation_snapshot(int platform_mode)  int __weak hibernate_resume_nonboot_cpu_disable(void)  { -	return disable_nonboot_cpus(); +	return suspend_disable_secondary_cpus();  }  /** @@ -486,7 +485,7 @@ static int resume_target_kernel(bool platform_mode)  	local_irq_enable();   Enable_cpus: -	enable_nonboot_cpus(); +	suspend_enable_secondary_cpus();   Cleanup:  	platform_restore_cleanup(platform_mode); @@ -543,7 +542,7 @@ int hibernation_platform_enter(void)  	 * hibernation_ops->finish() before saving the image, so we should let  	 * the firmware know that we're going to enter the sleep state after all  	 */ -	error = hibernation_ops->begin(); +	error = hibernation_ops->begin(PMSG_HIBERNATE);  	if (error)  		goto Close; @@ -564,7 +563,7 @@ int hibernation_platform_enter(void)  	if (error)  		goto Platform_finish; -	error = disable_nonboot_cpus(); +	error = suspend_disable_secondary_cpus();  	if (error)  		goto Enable_cpus; @@ -586,7 +585,7 @@ int hibernation_platform_enter(void)  	local_irq_enable();   Enable_cpus: -	enable_nonboot_cpus(); +	suspend_enable_secondary_cpus();   Platform_finish:  	hibernation_ops->finish(); @@ -709,9 +708,7 @@ int hibernate(void)  		goto Exit;  	} -	pr_info("Syncing filesystems ... \n"); -	ksys_sync(); -	pr_info("done.\n"); +	ksys_sync_helper();  	error = freeze_processes();  	if (error) diff --git a/kernel/power/main.c b/kernel/power/main.c index 98e76cad128b..4f43e724f6eb 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -16,6 +16,7 @@  #include <linux/debugfs.h>  #include <linux/seq_file.h>  #include <linux/suspend.h> +#include <linux/syscalls.h>  #include "power.h" @@ -51,6 +52,19 @@ void unlock_system_sleep(void)  }  EXPORT_SYMBOL_GPL(unlock_system_sleep); +void ksys_sync_helper(void) +{ +	ktime_t start; +	long elapsed_msecs; + +	start = ktime_get(); +	ksys_sync(); +	elapsed_msecs = ktime_to_ms(ktime_sub(ktime_get(), start)); +	pr_info("Filesystems sync: %ld.%03ld seconds\n", +		elapsed_msecs / MSEC_PER_SEC, elapsed_msecs % MSEC_PER_SEC); +} +EXPORT_SYMBOL_GPL(ksys_sync_helper); +  /* Routines for PM-transition notifications */  static BLOCKING_NOTIFIER_HEAD(pm_chain_head); diff --git a/kernel/power/qos.c b/kernel/power/qos.c index 9d22131afc1e..33e3febaba53 100644 --- a/kernel/power/qos.c +++ b/kernel/power/qos.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only  /*   * This module exposes the interface to kernel space for specifying   * QoS dependencies.  It provides infrastructure for registration of: diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index f08a1e4ee1d4..bc9558ab1e5b 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -1342,8 +1342,9 @@ static inline void do_copy_page(long *dst, long *src)   * safe_copy_page - Copy a page in a safe way.   
*   * Check if the page we are going to copy is marked as present in the kernel - * page tables (this always is the case if CONFIG_DEBUG_PAGEALLOC is not set - * and in that case kernel_page_present() always returns 'true'). + * page tables. This always is the case if CONFIG_DEBUG_PAGEALLOC or + * CONFIG_ARCH_HAS_SET_DIRECT_MAP is not set. In that case kernel_page_present() + * always returns 'true'.   */  static void safe_copy_page(void *dst, struct page *s_page)  { diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 0bd595a0b610..ef908c134b34 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -17,7 +17,6 @@  #include <linux/console.h>  #include <linux/cpu.h>  #include <linux/cpuidle.h> -#include <linux/syscalls.h>  #include <linux/gfp.h>  #include <linux/io.h>  #include <linux/kernel.h> @@ -428,7 +427,7 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)  	if (suspend_test(TEST_PLATFORM))  		goto Platform_wake; -	error = disable_nonboot_cpus(); +	error = suspend_disable_secondary_cpus();  	if (error || suspend_test(TEST_CPUS))  		goto Enable_cpus; @@ -458,7 +457,7 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)  	BUG_ON(irqs_disabled());   Enable_cpus: -	enable_nonboot_cpus(); +	suspend_enable_secondary_cpus();   Platform_wake:  	platform_resume_noirq(state); @@ -568,13 +567,11 @@ static int enter_state(suspend_state_t state)  	if (state == PM_SUSPEND_TO_IDLE)  		s2idle_begin(); -#ifndef CONFIG_SUSPEND_SKIP_SYNC -	trace_suspend_resume(TPS("sync_filesystems"), 0, true); -	pr_info("Syncing filesystems ... "); -	ksys_sync(); -	pr_cont("done.\n"); -	trace_suspend_resume(TPS("sync_filesystems"), 0, false); -#endif +	if (!IS_ENABLED(CONFIG_SUSPEND_SKIP_SYNC)) { +		trace_suspend_resume(TPS("sync_filesystems"), 0, true); +		ksys_sync_helper(); +		trace_suspend_resume(TPS("sync_filesystems"), 0, false); +	}  	pm_pr_dbg("Preparing system for sleep (%s)\n", mem_sleep_labels[state]);  	pm_suspend_clear_flags(); diff --git a/kernel/power/user.c b/kernel/power/user.c index 2d8b60a3c86b..cb24e840a3e6 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -10,7 +10,6 @@   */  #include <linux/suspend.h> -#include <linux/syscalls.h>  #include <linux/reboot.h>  #include <linux/string.h>  #include <linux/device.h> @@ -228,9 +227,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,  		if (data->frozen)  			break; -		printk("Syncing filesystems ... "); -		ksys_sync(); -		printk("done.\n"); +		ksys_sync_helper();  		error = freeze_processes();  		if (error) diff --git a/kernel/printk/Makefile b/kernel/printk/Makefile index 4a2ffc39eb95..4d052fc6bcde 100644 --- a/kernel/printk/Makefile +++ b/kernel/printk/Makefile @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0-only  obj-y	= printk.o  obj-$(CONFIG_PRINTK)	+= printk_safe.o  obj-$(CONFIG_A11Y_BRAILLE_CONSOLE)	+= braille.o diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h index 0f1898820cba..c8e6ab689d42 100644 --- a/kernel/printk/internal.h +++ b/kernel/printk/internal.h @@ -1,18 +1,6 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */  /*   * internal.h - printk internal definitions - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. 
- * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, see <http://www.gnu.org/licenses/>.   */  #include <linux/percpu.h> diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 02ca827b8fac..1888f6a3b694 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only  /*   *  linux/kernel/printk.c   * @@ -86,6 +87,12 @@ static DEFINE_SEMAPHORE(console_sem);  struct console *console_drivers;  EXPORT_SYMBOL_GPL(console_drivers); +/* + * System may need to suppress printk message under certain + * circumstances, like after kernel panic happens. + */ +int __read_mostly suppress_printk; +  #ifdef CONFIG_LOCKDEP  static struct lockdep_map console_lock_dep_map = {  	.name = "console_lock" @@ -1943,6 +1950,10 @@ asmlinkage int vprintk_emit(int facility, int level,  	unsigned long flags;  	u64 curr_log_seq; +	/* Suppress unimportant messages after panic happens */ +	if (unlikely(suppress_printk)) +		return 0; +  	if (level == LOGLEVEL_SCHED) {  		level = LOGLEVEL_DEFAULT;  		in_sched = true; @@ -2525,10 +2536,11 @@ void console_unblank(void)  /**   * console_flush_on_panic - flush console content on panic + * @mode: flush all messages in buffer or just the pending ones   *   * Immediately output all pending messages no matter what.   */ -void console_flush_on_panic(void) +void console_flush_on_panic(enum con_flush_mode mode)  {  	/*  	 * If someone else is holding the console lock, trylock will fail @@ -2539,6 +2551,15 @@ void console_flush_on_panic(void)  	 */  	console_trylock();  	console_may_schedule = 0; + +	if (mode == CONSOLE_REPLAY_ALL) { +		unsigned long flags; + +		logbuf_lock_irqsave(flags); +		console_seq = log_first_seq; +		console_idx = log_first_idx; +		logbuf_unlock_irqrestore(flags); +	}  	console_unlock();  } diff --git a/kernel/printk/printk_safe.c b/kernel/printk/printk_safe.c index 0913b4d385de..b4045e782743 100644 --- a/kernel/printk/printk_safe.c +++ b/kernel/printk/printk_safe.c @@ -1,18 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0-or-later  /*   * printk_safe.c - Safe printk for printk-deadlock-prone contexts - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, see <http://www.gnu.org/licenses/>.   */  #include <linux/preempt.h> diff --git a/kernel/profile.c b/kernel/profile.c index 9c08a2c7cb1d..af7c94bf5fa1 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only  /*   *  linux/kernel/profile.c   *  Simple profiling. 
Manages a direct-mapped profile hit count buffer, diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 771e93f9c43f..5710d07e67cf 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only  /*   * linux/kernel/ptrace.c   * @@ -29,6 +30,7 @@  #include <linux/hw_breakpoint.h>  #include <linux/cn_proc.h>  #include <linux/compat.h> +#include <linux/sched/signal.h>  /*   * Access another process' address space via ptrace. @@ -924,18 +926,26 @@ int ptrace_request(struct task_struct *child, long request,  			ret = ptrace_setsiginfo(child, &siginfo);  		break; -	case PTRACE_GETSIGMASK: +	case PTRACE_GETSIGMASK: { +		sigset_t *mask; +  		if (addr != sizeof(sigset_t)) {  			ret = -EINVAL;  			break;  		} -		if (copy_to_user(datavp, &child->blocked, sizeof(sigset_t))) +		if (test_tsk_restore_sigmask(child)) +			mask = &child->saved_sigmask; +		else +			mask = &child->blocked; + +		if (copy_to_user(datavp, mask, sizeof(sigset_t)))  			ret = -EFAULT;  		else  			ret = 0;  		break; +	}  	case PTRACE_SETSIGMASK: {  		sigset_t new_set; @@ -961,6 +971,8 @@ int ptrace_request(struct task_struct *child, long request,  		child->blocked = new_set;  		spin_unlock_irq(&child->sighand->siglock); +		clear_tsk_restore_sigmask(child); +  		ret = 0;  		break;  	} diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig index 37301430970e..480edf328b51 100644 --- a/kernel/rcu/Kconfig +++ b/kernel/rcu/Kconfig @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0-only  #  # RCU-related configuration options  # diff --git a/kernel/rcu/Kconfig.debug b/kernel/rcu/Kconfig.debug index 0ec7d1d33a14..5ec3ea4028e2 100644 --- a/kernel/rcu/Kconfig.debug +++ b/kernel/rcu/Kconfig.debug @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0-only  #  # RCU-related debugging configuration options  # diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index acee72c0b24b..390aab20115e 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -11,11 +11,6 @@  #define __LINUX_RCU_H  #include <trace/events/rcu.h> -#ifdef CONFIG_RCU_TRACE -#define RCU_TRACE(stmt) stmt -#else /* #ifdef CONFIG_RCU_TRACE */ -#define RCU_TRACE(stmt) -#endif /* #else #ifdef CONFIG_RCU_TRACE */  /* Offset to allow distinguishing irq vs. task-based idle entry/exit. 
*/  #define DYNTICK_IRQ_NONIDLE	((LONG_MAX / 2) + 1) @@ -216,12 +211,12 @@ static inline bool __rcu_reclaim(const char *rn, struct rcu_head *head)  	rcu_lock_acquire(&rcu_callback_map);  	if (__is_kfree_rcu_offset(offset)) { -		RCU_TRACE(trace_rcu_invoke_kfree_callback(rn, head, offset);) +		trace_rcu_invoke_kfree_callback(rn, head, offset);  		kfree((void *)head - offset);  		rcu_lock_release(&rcu_callback_map);  		return true;  	} else { -		RCU_TRACE(trace_rcu_invoke_callback(rn, head);) +		trace_rcu_invoke_callback(rn, head);  		f = head->func;  		WRITE_ONCE(head->func, (rcu_callback_t)0L);  		f(head); @@ -233,6 +228,7 @@ static inline bool __rcu_reclaim(const char *rn, struct rcu_head *head)  #ifdef CONFIG_RCU_STALL_COMMON  extern int rcu_cpu_stall_suppress; +extern int rcu_cpu_stall_timeout;  int rcu_jiffies_till_stall_check(void);  #define rcu_ftrace_dump_stall_suppress() \ diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c index c29761152874..7a6890b23c5f 100644 --- a/kernel/rcu/rcuperf.c +++ b/kernel/rcu/rcuperf.c @@ -494,6 +494,10 @@ rcu_perf_cleanup(void)  	if (torture_cleanup_begin())  		return; +	if (!cur_ops) { +		torture_cleanup_end(); +		return; +	}  	if (reader_tasks) {  		for (i = 0; i < nrealreaders; i++) @@ -614,6 +618,7 @@ rcu_perf_init(void)  		pr_cont("\n");  		WARN_ON(!IS_MODULE(CONFIG_RCU_PERF_TEST));  		firsterr = -EINVAL; +		cur_ops = NULL;  		goto unwind;  	}  	if (cur_ops->init) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index f14d1b18a74f..efaa5b3f4d3f 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -299,7 +299,6 @@ struct rcu_torture_ops {  	int irq_capable;  	int can_boost;  	int extendables; -	int ext_irq_conflict;  	const char *name;  }; @@ -592,12 +591,7 @@ static void srcu_torture_init(void)  static void srcu_torture_cleanup(void)  { -	static DEFINE_TORTURE_RANDOM(rand); - -	if (torture_random(&rand) & 0x800) -		cleanup_srcu_struct(&srcu_ctld); -	else -		cleanup_srcu_struct_quiesced(&srcu_ctld); +	cleanup_srcu_struct(&srcu_ctld);  	srcu_ctlp = &srcu_ctl; /* In case of a later rcutorture run. */  } @@ -1160,7 +1154,7 @@ rcutorture_extend_mask(int oldmask, struct torture_random_state *trsp)  	unsigned long randmask2 = randmask1 >> 3;  	WARN_ON_ONCE(mask >> RCUTORTURE_RDR_SHIFT); -	/* Most of the time lots of bits, half the time only one bit. */ +	/* Mostly only one bit (need preemption!), sometimes lots of bits. */  	if (!(randmask1 & 0x7))  		mask = mask & randmask2;  	else @@ -1170,10 +1164,6 @@ rcutorture_extend_mask(int oldmask, struct torture_random_state *trsp)  	    ((!(mask & RCUTORTURE_RDR_BH) && (oldmask & RCUTORTURE_RDR_BH)) ||  	     (!(mask & RCUTORTURE_RDR_RBH) && (oldmask & RCUTORTURE_RDR_RBH))))  		mask |= RCUTORTURE_RDR_BH | RCUTORTURE_RDR_RBH; -	if ((mask & RCUTORTURE_RDR_IRQ) && -	    !(mask & cur_ops->ext_irq_conflict) && -	    (oldmask & cur_ops->ext_irq_conflict)) -		mask |= cur_ops->ext_irq_conflict; /* Or if readers object. */  	return mask ?: RCUTORTURE_RDR_RCU;  } @@ -1848,7 +1838,7 @@ static int rcutorture_oom_notify(struct notifier_block *self,  	WARN(1, "%s invoked upon OOM during forward-progress testing.\n",  	     __func__);  	rcu_torture_fwd_cb_hist(); -	rcu_fwd_progress_check(1 + (jiffies - READ_ONCE(rcu_fwd_startat) / 2)); +	rcu_fwd_progress_check(1 + (jiffies - READ_ONCE(rcu_fwd_startat)) / 2);  	WRITE_ONCE(rcu_fwd_emergency_stop, true);  	smp_mb(); /* Emergency stop before free and wait to avoid hangs. 
*/  	pr_info("%s: Freed %lu RCU callbacks.\n", @@ -2094,6 +2084,10 @@ rcu_torture_cleanup(void)  			cur_ops->cb_barrier();  		return;  	} +	if (!cur_ops) { +		torture_cleanup_end(); +		return; +	}  	rcu_torture_barrier_cleanup();  	torture_stop_kthread(rcu_torture_fwd_prog, fwd_prog_task); @@ -2267,6 +2261,7 @@ rcu_torture_init(void)  		pr_cont("\n");  		WARN_ON(!IS_MODULE(CONFIG_RCU_TORTURE_TEST));  		firsterr = -EINVAL; +		cur_ops = NULL;  		goto unwind;  	}  	if (cur_ops->fqs == NULL && fqs_duration != 0) { diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c index 5d4a39a6505a..44d6606b8325 100644 --- a/kernel/rcu/srcutiny.c +++ b/kernel/rcu/srcutiny.c @@ -76,19 +76,16 @@ EXPORT_SYMBOL_GPL(init_srcu_struct);   * Must invoke this after you are finished using a given srcu_struct that   * was initialized via init_srcu_struct(), else you leak memory.   */ -void _cleanup_srcu_struct(struct srcu_struct *ssp, bool quiesced) +void cleanup_srcu_struct(struct srcu_struct *ssp)  {  	WARN_ON(ssp->srcu_lock_nesting[0] || ssp->srcu_lock_nesting[1]); -	if (quiesced) -		WARN_ON(work_pending(&ssp->srcu_work)); -	else -		flush_work(&ssp->srcu_work); +	flush_work(&ssp->srcu_work);  	WARN_ON(ssp->srcu_gp_running);  	WARN_ON(ssp->srcu_gp_waiting);  	WARN_ON(ssp->srcu_cb_head);  	WARN_ON(&ssp->srcu_cb_head != ssp->srcu_cb_tail);  } -EXPORT_SYMBOL_GPL(_cleanup_srcu_struct); +EXPORT_SYMBOL_GPL(cleanup_srcu_struct);  /*   * Removes the count for the old reader from the appropriate element of diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index a60b8ba9e1ac..9b761e546de8 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -360,8 +360,14 @@ static unsigned long srcu_get_delay(struct srcu_struct *ssp)  	return SRCU_INTERVAL;  } -/* Helper for cleanup_srcu_struct() and cleanup_srcu_struct_quiesced(). */ -void _cleanup_srcu_struct(struct srcu_struct *ssp, bool quiesced) +/** + * cleanup_srcu_struct - deconstruct a sleep-RCU structure + * @ssp: structure to clean up. + * + * Must invoke this after you are finished using a given srcu_struct that + * was initialized via init_srcu_struct(), else you leak memory. + */ +void cleanup_srcu_struct(struct srcu_struct *ssp)  {  	int cpu; @@ -369,24 +375,14 @@ void _cleanup_srcu_struct(struct srcu_struct *ssp, bool quiesced)  		return; /* Just leak it! */  	if (WARN_ON(srcu_readers_active(ssp)))  		return; /* Just leak it! */ -	if (quiesced) { -		if (WARN_ON(delayed_work_pending(&ssp->work))) -			return; /* Just leak it! */ -	} else { -		flush_delayed_work(&ssp->work); -	} +	flush_delayed_work(&ssp->work);  	for_each_possible_cpu(cpu) {  		struct srcu_data *sdp = per_cpu_ptr(ssp->sda, cpu); -		if (quiesced) { -			if (WARN_ON(timer_pending(&sdp->delay_work))) -				return; /* Just leak it! */ -			if (WARN_ON(work_pending(&sdp->work))) -				return; /* Just leak it! */ -		} else { -			del_timer_sync(&sdp->delay_work); -			flush_work(&sdp->work); -		} +		del_timer_sync(&sdp->delay_work); +		flush_work(&sdp->work); +		if (WARN_ON(rcu_segcblist_n_cbs(&sdp->srcu_cblist))) +			return; /* Forgot srcu_barrier(), so just leak it! 
*/  	}  	if (WARN_ON(rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq)) != SRCU_STATE_IDLE) ||  	    WARN_ON(srcu_readers_active(ssp))) { @@ -397,7 +393,7 @@ void _cleanup_srcu_struct(struct srcu_struct *ssp, bool quiesced)  	free_percpu(ssp->sda);  	ssp->sda = NULL;  } -EXPORT_SYMBOL_GPL(_cleanup_srcu_struct); +EXPORT_SYMBOL_GPL(cleanup_srcu_struct);  /*   * Counts the new reader in the appropriate per-CPU element of the diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index 911bd9076d43..477b4eb44af5 100644 --- a/kernel/rcu/tiny.c +++ b/kernel/rcu/tiny.c @@ -52,7 +52,7 @@ void rcu_qs(void)  	local_irq_save(flags);  	if (rcu_ctrlblk.donetail != rcu_ctrlblk.curtail) {  		rcu_ctrlblk.donetail = rcu_ctrlblk.curtail; -		raise_softirq(RCU_SOFTIRQ); +		raise_softirq_irqoff(RCU_SOFTIRQ);  	}  	local_irq_restore(flags);  } diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index acd6ccf56faf..980ca3ca643f 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -102,11 +102,6 @@ int rcu_num_lvls __read_mostly = RCU_NUM_LVLS;  /* Number of rcu_nodes at specified level. */  int num_rcu_lvl[] = NUM_RCU_LVL_INIT;  int rcu_num_nodes __read_mostly = NUM_RCU_NODES; /* Total # rcu_nodes in use. */ -/* panic() on RCU Stall sysctl. */ -int sysctl_panic_on_rcu_stall __read_mostly; -/* Commandeer a sysrq key to dump RCU's tree. */ -static bool sysrq_rcu; -module_param(sysrq_rcu, bool, 0444);  /*   * The rcu_scheduler_active variable is initialized to the value @@ -149,7 +144,7 @@ static void sync_sched_exp_online_cleanup(int cpu);  /* rcuc/rcub kthread realtime priority */  static int kthread_prio = IS_ENABLED(CONFIG_RCU_BOOST) ? 1 : 0; -module_param(kthread_prio, int, 0644); +module_param(kthread_prio, int, 0444);  /* Delay in jiffies for grace-period initialization delays, debug only. */ @@ -406,7 +401,7 @@ static bool rcu_kick_kthreads;   */  static ulong jiffies_till_sched_qs = ULONG_MAX;  module_param(jiffies_till_sched_qs, ulong, 0444); -static ulong jiffies_to_sched_qs; /* Adjusted version of above if not default */ +static ulong jiffies_to_sched_qs; /* See adjust_jiffies_till_sched_qs(). */  module_param(jiffies_to_sched_qs, ulong, 0444); /* Display only! */  /* @@ -424,6 +419,7 @@ static void adjust_jiffies_till_sched_qs(void)  		WRITE_ONCE(jiffies_to_sched_qs, jiffies_till_sched_qs);  		return;  	} +	/* Otherwise, set to third fqs scan, but bound below on large system. */  	j = READ_ONCE(jiffies_till_first_fqs) +  		      2 * READ_ONCE(jiffies_till_next_fqs);  	if (j < HZ / 10 + nr_cpu_ids / RCU_JIFFIES_FQS_DIV) @@ -513,74 +509,6 @@ static const char *gp_state_getname(short gs)  }  /* - * Show the state of the grace-period kthreads. - */ -void show_rcu_gp_kthreads(void) -{ -	int cpu; -	unsigned long j; -	unsigned long ja; -	unsigned long jr; -	unsigned long jw; -	struct rcu_data *rdp; -	struct rcu_node *rnp; - -	j = jiffies; -	ja = j - READ_ONCE(rcu_state.gp_activity); -	jr = j - READ_ONCE(rcu_state.gp_req_activity); -	jw = j - READ_ONCE(rcu_state.gp_wake_time); -	pr_info("%s: wait state: %s(%d) ->state: %#lx delta ->gp_activity %lu ->gp_req_activity %lu ->gp_wake_time %lu ->gp_wake_seq %ld ->gp_seq %ld ->gp_seq_needed %ld ->gp_flags %#x\n", -		rcu_state.name, gp_state_getname(rcu_state.gp_state), -		rcu_state.gp_state, -		rcu_state.gp_kthread ? 
rcu_state.gp_kthread->state : 0x1ffffL, -		ja, jr, jw, (long)READ_ONCE(rcu_state.gp_wake_seq), -		(long)READ_ONCE(rcu_state.gp_seq), -		(long)READ_ONCE(rcu_get_root()->gp_seq_needed), -		READ_ONCE(rcu_state.gp_flags)); -	rcu_for_each_node_breadth_first(rnp) { -		if (ULONG_CMP_GE(rcu_state.gp_seq, rnp->gp_seq_needed)) -			continue; -		pr_info("\trcu_node %d:%d ->gp_seq %ld ->gp_seq_needed %ld\n", -			rnp->grplo, rnp->grphi, (long)rnp->gp_seq, -			(long)rnp->gp_seq_needed); -		if (!rcu_is_leaf_node(rnp)) -			continue; -		for_each_leaf_node_possible_cpu(rnp, cpu) { -			rdp = per_cpu_ptr(&rcu_data, cpu); -			if (rdp->gpwrap || -			    ULONG_CMP_GE(rcu_state.gp_seq, -					 rdp->gp_seq_needed)) -				continue; -			pr_info("\tcpu %d ->gp_seq_needed %ld\n", -				cpu, (long)rdp->gp_seq_needed); -		} -	} -	/* sched_show_task(rcu_state.gp_kthread); */ -} -EXPORT_SYMBOL_GPL(show_rcu_gp_kthreads); - -/* Dump grace-period-request information due to commandeered sysrq. */ -static void sysrq_show_rcu(int key) -{ -	show_rcu_gp_kthreads(); -} - -static struct sysrq_key_op sysrq_rcudump_op = { -	.handler = sysrq_show_rcu, -	.help_msg = "show-rcu(y)", -	.action_msg = "Show RCU tree", -	.enable_mask = SYSRQ_ENABLE_DUMP, -}; - -static int __init rcu_sysrq_init(void) -{ -	if (sysrq_rcu) -		return register_sysrq_key('y', &sysrq_rcudump_op); -	return 0; -} -early_initcall(rcu_sysrq_init); - -/*   * Send along grace-period-related data for rcutorture diagnostics.   */  void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags, @@ -1034,27 +962,6 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp)  }  /* - * Handler for the irq_work request posted when a grace period has - * gone on for too long, but not yet long enough for an RCU CPU - * stall warning.  Set state appropriately, but just complain if - * there is unexpected state on entry. - */ -static void rcu_iw_handler(struct irq_work *iwp) -{ -	struct rcu_data *rdp; -	struct rcu_node *rnp; - -	rdp = container_of(iwp, struct rcu_data, rcu_iw); -	rnp = rdp->mynode; -	raw_spin_lock_rcu_node(rnp); -	if (!WARN_ON_ONCE(!rdp->rcu_iw_pending)) { -		rdp->rcu_iw_gp_seq = rnp->gp_seq; -		rdp->rcu_iw_pending = false; -	} -	raw_spin_unlock_rcu_node(rnp); -} - -/*   * Return true if the specified CPU has passed through a quiescent   * state by virtue of being in or having passed through an dynticks   * idle state since the last call to dyntick_save_progress_counter() @@ -1167,295 +1074,6 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)  	return 0;  } -static void record_gp_stall_check_time(void) -{ -	unsigned long j = jiffies; -	unsigned long j1; - -	rcu_state.gp_start = j; -	j1 = rcu_jiffies_till_stall_check(); -	/* Record ->gp_start before ->jiffies_stall. */ -	smp_store_release(&rcu_state.jiffies_stall, j + j1); /* ^^^ */ -	rcu_state.jiffies_resched = j + j1 / 2; -	rcu_state.n_force_qs_gpstart = READ_ONCE(rcu_state.n_force_qs); -} - -/* - * Complain about starvation of grace-period kthread. - */ -static void rcu_check_gp_kthread_starvation(void) -{ -	struct task_struct *gpk = rcu_state.gp_kthread; -	unsigned long j; - -	j = jiffies - READ_ONCE(rcu_state.gp_activity); -	if (j > 2 * HZ) { -		pr_err("%s kthread starved for %ld jiffies! g%ld f%#x %s(%d) ->state=%#lx ->cpu=%d\n", -		       rcu_state.name, j, -		       (long)rcu_seq_current(&rcu_state.gp_seq), -		       READ_ONCE(rcu_state.gp_flags), -		       gp_state_getname(rcu_state.gp_state), rcu_state.gp_state, -		       gpk ? gpk->state : ~0, gpk ? 
task_cpu(gpk) : -1); -		if (gpk) { -			pr_err("RCU grace-period kthread stack dump:\n"); -			sched_show_task(gpk); -			wake_up_process(gpk); -		} -	} -} - -/* - * Dump stacks of all tasks running on stalled CPUs.  First try using - * NMIs, but fall back to manual remote stack tracing on architectures - * that don't support NMI-based stack dumps.  The NMI-triggered stack - * traces are more accurate because they are printed by the target CPU. - */ -static void rcu_dump_cpu_stacks(void) -{ -	int cpu; -	unsigned long flags; -	struct rcu_node *rnp; - -	rcu_for_each_leaf_node(rnp) { -		raw_spin_lock_irqsave_rcu_node(rnp, flags); -		for_each_leaf_node_possible_cpu(rnp, cpu) -			if (rnp->qsmask & leaf_node_cpu_bit(rnp, cpu)) -				if (!trigger_single_cpu_backtrace(cpu)) -					dump_cpu_task(cpu); -		raw_spin_unlock_irqrestore_rcu_node(rnp, flags); -	} -} - -/* - * If too much time has passed in the current grace period, and if - * so configured, go kick the relevant kthreads. - */ -static void rcu_stall_kick_kthreads(void) -{ -	unsigned long j; - -	if (!rcu_kick_kthreads) -		return; -	j = READ_ONCE(rcu_state.jiffies_kick_kthreads); -	if (time_after(jiffies, j) && rcu_state.gp_kthread && -	    (rcu_gp_in_progress() || READ_ONCE(rcu_state.gp_flags))) { -		WARN_ONCE(1, "Kicking %s grace-period kthread\n", -			  rcu_state.name); -		rcu_ftrace_dump(DUMP_ALL); -		wake_up_process(rcu_state.gp_kthread); -		WRITE_ONCE(rcu_state.jiffies_kick_kthreads, j + HZ); -	} -} - -static void panic_on_rcu_stall(void) -{ -	if (sysctl_panic_on_rcu_stall) -		panic("RCU Stall\n"); -} - -static void print_other_cpu_stall(unsigned long gp_seq) -{ -	int cpu; -	unsigned long flags; -	unsigned long gpa; -	unsigned long j; -	int ndetected = 0; -	struct rcu_node *rnp = rcu_get_root(); -	long totqlen = 0; - -	/* Kick and suppress, if so configured. */ -	rcu_stall_kick_kthreads(); -	if (rcu_cpu_stall_suppress) -		return; - -	/* -	 * OK, time to rat on our buddy... -	 * See Documentation/RCU/stallwarn.txt for info on how to debug -	 * RCU CPU stall warnings. -	 */ -	pr_err("INFO: %s detected stalls on CPUs/tasks:", rcu_state.name); -	print_cpu_stall_info_begin(); -	rcu_for_each_leaf_node(rnp) { -		raw_spin_lock_irqsave_rcu_node(rnp, flags); -		ndetected += rcu_print_task_stall(rnp); -		if (rnp->qsmask != 0) { -			for_each_leaf_node_possible_cpu(rnp, cpu) -				if (rnp->qsmask & leaf_node_cpu_bit(rnp, cpu)) { -					print_cpu_stall_info(cpu); -					ndetected++; -				} -		} -		raw_spin_unlock_irqrestore_rcu_node(rnp, flags); -	} - -	print_cpu_stall_info_end(); -	for_each_possible_cpu(cpu) -		totqlen += rcu_get_n_cbs_cpu(cpu); -	pr_cont("(detected by %d, t=%ld jiffies, g=%ld, q=%lu)\n", -	       smp_processor_id(), (long)(jiffies - rcu_state.gp_start), -	       (long)rcu_seq_current(&rcu_state.gp_seq), totqlen); -	if (ndetected) { -		rcu_dump_cpu_stacks(); - -		/* Complain about tasks blocking the grace period. */ -		rcu_print_detail_task_stall(); -	} else { -		if (rcu_seq_current(&rcu_state.gp_seq) != gp_seq) { -			pr_err("INFO: Stall ended before state dump start\n"); -		} else { -			j = jiffies; -			gpa = READ_ONCE(rcu_state.gp_activity); -			pr_err("All QSes seen, last %s kthread activity %ld (%ld-%ld), jiffies_till_next_fqs=%ld, root ->qsmask %#lx\n", -			       rcu_state.name, j - gpa, j, gpa, -			       READ_ONCE(jiffies_till_next_fqs), -			       rcu_get_root()->qsmask); -			/* In this case, the current CPU might be at fault. */ -			sched_show_task(current); -		} -	} -	/* Rewrite if needed in case of slow consoles. 
*/ -	if (ULONG_CMP_GE(jiffies, READ_ONCE(rcu_state.jiffies_stall))) -		WRITE_ONCE(rcu_state.jiffies_stall, -			   jiffies + 3 * rcu_jiffies_till_stall_check() + 3); - -	rcu_check_gp_kthread_starvation(); - -	panic_on_rcu_stall(); - -	rcu_force_quiescent_state();  /* Kick them all. */ -} - -static void print_cpu_stall(void) -{ -	int cpu; -	unsigned long flags; -	struct rcu_data *rdp = this_cpu_ptr(&rcu_data); -	struct rcu_node *rnp = rcu_get_root(); -	long totqlen = 0; - -	/* Kick and suppress, if so configured. */ -	rcu_stall_kick_kthreads(); -	if (rcu_cpu_stall_suppress) -		return; - -	/* -	 * OK, time to rat on ourselves... -	 * See Documentation/RCU/stallwarn.txt for info on how to debug -	 * RCU CPU stall warnings. -	 */ -	pr_err("INFO: %s self-detected stall on CPU", rcu_state.name); -	print_cpu_stall_info_begin(); -	raw_spin_lock_irqsave_rcu_node(rdp->mynode, flags); -	print_cpu_stall_info(smp_processor_id()); -	raw_spin_unlock_irqrestore_rcu_node(rdp->mynode, flags); -	print_cpu_stall_info_end(); -	for_each_possible_cpu(cpu) -		totqlen += rcu_get_n_cbs_cpu(cpu); -	pr_cont(" (t=%lu jiffies g=%ld q=%lu)\n", -		jiffies - rcu_state.gp_start, -		(long)rcu_seq_current(&rcu_state.gp_seq), totqlen); - -	rcu_check_gp_kthread_starvation(); - -	rcu_dump_cpu_stacks(); - -	raw_spin_lock_irqsave_rcu_node(rnp, flags); -	/* Rewrite if needed in case of slow consoles. */ -	if (ULONG_CMP_GE(jiffies, READ_ONCE(rcu_state.jiffies_stall))) -		WRITE_ONCE(rcu_state.jiffies_stall, -			   jiffies + 3 * rcu_jiffies_till_stall_check() + 3); -	raw_spin_unlock_irqrestore_rcu_node(rnp, flags); - -	panic_on_rcu_stall(); - -	/* -	 * Attempt to revive the RCU machinery by forcing a context switch. -	 * -	 * A context switch would normally allow the RCU state machine to make -	 * progress and it could be we're stuck in kernel space without context -	 * switches for an entirely unreasonable amount of time. -	 */ -	set_tsk_need_resched(current); -	set_preempt_need_resched(); -} - -static void check_cpu_stall(struct rcu_data *rdp) -{ -	unsigned long gs1; -	unsigned long gs2; -	unsigned long gps; -	unsigned long j; -	unsigned long jn; -	unsigned long js; -	struct rcu_node *rnp; - -	if ((rcu_cpu_stall_suppress && !rcu_kick_kthreads) || -	    !rcu_gp_in_progress()) -		return; -	rcu_stall_kick_kthreads(); -	j = jiffies; - -	/* -	 * Lots of memory barriers to reject false positives. -	 * -	 * The idea is to pick up rcu_state.gp_seq, then -	 * rcu_state.jiffies_stall, then rcu_state.gp_start, and finally -	 * another copy of rcu_state.gp_seq.  These values are updated in -	 * the opposite order with memory barriers (or equivalent) during -	 * grace-period initialization and cleanup.  Now, a false positive -	 * can occur if we get an new value of rcu_state.gp_start and a old -	 * value of rcu_state.jiffies_stall.  But given the memory barriers, -	 * the only way that this can happen is if one grace period ends -	 * and another starts between these two fetches.  This is detected -	 * by comparing the second fetch of rcu_state.gp_seq with the -	 * previous fetch from rcu_state.gp_seq. -	 * -	 * Given this check, comparisons of jiffies, rcu_state.jiffies_stall, -	 * and rcu_state.gp_start suffice to forestall false positives. -	 */ -	gs1 = READ_ONCE(rcu_state.gp_seq); -	smp_rmb(); /* Pick up ->gp_seq first... */ -	js = READ_ONCE(rcu_state.jiffies_stall); -	smp_rmb(); /* ...then ->jiffies_stall before the rest... */ -	gps = READ_ONCE(rcu_state.gp_start); -	smp_rmb(); /* ...and finally ->gp_start before ->gp_seq again. 
*/ -	gs2 = READ_ONCE(rcu_state.gp_seq); -	if (gs1 != gs2 || -	    ULONG_CMP_LT(j, js) || -	    ULONG_CMP_GE(gps, js)) -		return; /* No stall or GP completed since entering function. */ -	rnp = rdp->mynode; -	jn = jiffies + 3 * rcu_jiffies_till_stall_check() + 3; -	if (rcu_gp_in_progress() && -	    (READ_ONCE(rnp->qsmask) & rdp->grpmask) && -	    cmpxchg(&rcu_state.jiffies_stall, js, jn) == js) { - -		/* We haven't checked in, so go dump stack. */ -		print_cpu_stall(); - -	} else if (rcu_gp_in_progress() && -		   ULONG_CMP_GE(j, js + RCU_STALL_RAT_DELAY) && -		   cmpxchg(&rcu_state.jiffies_stall, js, jn) == js) { - -		/* They had a few time units to dump stack, so complain. */ -		print_other_cpu_stall(gs2); -	} -} - -/** - * rcu_cpu_stall_reset - prevent further stall warnings in current grace period - * - * Set the stall-warning timeout way off into the future, thus preventing - * any RCU CPU stall-warning messages from appearing in the current set of - * RCU grace periods. - * - * The caller must disable hard irqs. - */ -void rcu_cpu_stall_reset(void) -{ -	WRITE_ONCE(rcu_state.jiffies_stall, jiffies + ULONG_MAX / 2); -} -  /* Trace-event wrapper function for trace_rcu_future_grace_period.  */  static void trace_rcu_this_gp(struct rcu_node *rnp, struct rcu_data *rdp,  			      unsigned long gp_seq_req, const char *s) @@ -1585,7 +1203,7 @@ static bool rcu_future_gp_cleanup(struct rcu_node *rnp)  static void rcu_gp_kthread_wake(void)  {  	if ((current == rcu_state.gp_kthread && -	     !in_interrupt() && !in_serving_softirq()) || +	     !in_irq() && !in_serving_softirq()) ||  	    !READ_ONCE(rcu_state.gp_flags) ||  	    !rcu_state.gp_kthread)  		return; @@ -2295,11 +1913,10 @@ rcu_report_qs_rdp(int cpu, struct rcu_data *rdp)  		return;  	}  	mask = rdp->grpmask; +	rdp->core_needs_qs = false;  	if ((rnp->qsmask & mask) == 0) {  		raw_spin_unlock_irqrestore_rcu_node(rnp, flags);  	} else { -		rdp->core_needs_qs = false; -  		/*  		 * This GP can't end until cpu checks in, so all of our  		 * callbacks can be processed during the next GP. @@ -2352,14 +1969,14 @@ rcu_check_quiescent_state(struct rcu_data *rdp)   */  int rcutree_dying_cpu(unsigned int cpu)  { -	RCU_TRACE(bool blkd;) -	RCU_TRACE(struct rcu_data *rdp = this_cpu_ptr(&rcu_data);) -	RCU_TRACE(struct rcu_node *rnp = rdp->mynode;) +	bool blkd; +	struct rcu_data *rdp = this_cpu_ptr(&rcu_data); +	struct rcu_node *rnp = rdp->mynode;  	if (!IS_ENABLED(CONFIG_HOTPLUG_CPU))  		return 0; -	RCU_TRACE(blkd = !!(rnp->qsmask & rdp->grpmask);) +	blkd = !!(rnp->qsmask & rdp->grpmask);  	trace_rcu_grace_period(rcu_state.name, rnp->gp_seq,  			       blkd ? TPS("cpuofl") : TPS("cpuofl-bgp"));  	return 0; @@ -2548,11 +2165,11 @@ void rcu_sched_clock_irq(int user)  }  /* - * Scan the leaf rcu_node structures, processing dyntick state for any that - * have not yet encountered a quiescent state, using the function specified. - * Also initiate boosting for any threads blocked on the root rcu_node. - * - * The caller must have suppressed start of new grace periods. + * Scan the leaf rcu_node structures.  For each structure on which all + * CPUs have reported a quiescent state and on which there are tasks + * blocking the current grace period, initiate RCU priority boosting. + * Otherwise, invoke the specified function to check dyntick state for + * each CPU that has not yet reported a quiescent state.   
*/  static void force_qs_rnp(int (*f)(struct rcu_data *rdp))  { @@ -2635,101 +2252,6 @@ void rcu_force_quiescent_state(void)  }  EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); -/* - * This function checks for grace-period requests that fail to motivate - * RCU to come out of its idle mode. - */ -void -rcu_check_gp_start_stall(struct rcu_node *rnp, struct rcu_data *rdp, -			 const unsigned long gpssdelay) -{ -	unsigned long flags; -	unsigned long j; -	struct rcu_node *rnp_root = rcu_get_root(); -	static atomic_t warned = ATOMIC_INIT(0); - -	if (!IS_ENABLED(CONFIG_PROVE_RCU) || rcu_gp_in_progress() || -	    ULONG_CMP_GE(rnp_root->gp_seq, rnp_root->gp_seq_needed)) -		return; -	j = jiffies; /* Expensive access, and in common case don't get here. */ -	if (time_before(j, READ_ONCE(rcu_state.gp_req_activity) + gpssdelay) || -	    time_before(j, READ_ONCE(rcu_state.gp_activity) + gpssdelay) || -	    atomic_read(&warned)) -		return; - -	raw_spin_lock_irqsave_rcu_node(rnp, flags); -	j = jiffies; -	if (rcu_gp_in_progress() || -	    ULONG_CMP_GE(rnp_root->gp_seq, rnp_root->gp_seq_needed) || -	    time_before(j, READ_ONCE(rcu_state.gp_req_activity) + gpssdelay) || -	    time_before(j, READ_ONCE(rcu_state.gp_activity) + gpssdelay) || -	    atomic_read(&warned)) { -		raw_spin_unlock_irqrestore_rcu_node(rnp, flags); -		return; -	} -	/* Hold onto the leaf lock to make others see warned==1. */ - -	if (rnp_root != rnp) -		raw_spin_lock_rcu_node(rnp_root); /* irqs already disabled. */ -	j = jiffies; -	if (rcu_gp_in_progress() || -	    ULONG_CMP_GE(rnp_root->gp_seq, rnp_root->gp_seq_needed) || -	    time_before(j, rcu_state.gp_req_activity + gpssdelay) || -	    time_before(j, rcu_state.gp_activity + gpssdelay) || -	    atomic_xchg(&warned, 1)) { -		raw_spin_unlock_rcu_node(rnp_root); /* irqs remain disabled. */ -		raw_spin_unlock_irqrestore_rcu_node(rnp, flags); -		return; -	} -	WARN_ON(1); -	if (rnp_root != rnp) -		raw_spin_unlock_rcu_node(rnp_root); -	raw_spin_unlock_irqrestore_rcu_node(rnp, flags); -	show_rcu_gp_kthreads(); -} - -/* - * Do a forward-progress check for rcutorture.  This is normally invoked - * due to an OOM event.  The argument "j" gives the time period during - * which rcutorture would like progress to have been made. - */ -void rcu_fwd_progress_check(unsigned long j) -{ -	unsigned long cbs; -	int cpu; -	unsigned long max_cbs = 0; -	int max_cpu = -1; -	struct rcu_data *rdp; - -	if (rcu_gp_in_progress()) { -		pr_info("%s: GP age %lu jiffies\n", -			__func__, jiffies - rcu_state.gp_start); -		show_rcu_gp_kthreads(); -	} else { -		pr_info("%s: Last GP end %lu jiffies ago\n", -			__func__, jiffies - rcu_state.gp_end); -		preempt_disable(); -		rdp = this_cpu_ptr(&rcu_data); -		rcu_check_gp_start_stall(rdp->mynode, rdp, j); -		preempt_enable(); -	} -	for_each_possible_cpu(cpu) { -		cbs = rcu_get_n_cbs_cpu(cpu); -		if (!cbs) -			continue; -		if (max_cpu < 0) -			pr_info("%s: callbacks", __func__); -		pr_cont(" %d: %lu", cpu, cbs); -		if (cbs <= max_cbs) -			continue; -		max_cbs = cbs; -		max_cpu = cpu; -	} -	if (max_cpu >= 0) -		pr_cont("\n"); -} -EXPORT_SYMBOL_GPL(rcu_fwd_progress_check); -  /* Perform RCU core processing work for the current CPU.  */  static __latent_entropy void rcu_core(struct softirq_action *unused)  { @@ -2870,7 +2392,7 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func, int cpu, bool lazy)  		 * Use rcu:rcu_callback trace event to find the previous  		 * time callback was passed to __call_rcu().  		 
*/ -		WARN_ONCE(1, "__call_rcu(): Double-freed CB %p->%pF()!!!\n", +		WARN_ONCE(1, "__call_rcu(): Double-freed CB %p->%pS()!!!\n",  			  head, head->func);  		WRITE_ONCE(head->func, rcu_leak_callback);  		return; @@ -3559,13 +3081,11 @@ static int rcu_pm_notify(struct notifier_block *self,  	switch (action) {  	case PM_HIBERNATION_PREPARE:  	case PM_SUSPEND_PREPARE: -		if (nr_cpu_ids <= 256) /* Expediting bad for large systems. */ -			rcu_expedite_gp(); +		rcu_expedite_gp();  		break;  	case PM_POST_HIBERNATION:  	case PM_POST_SUSPEND: -		if (nr_cpu_ids <= 256) /* Expediting bad for large systems. */ -			rcu_unexpedite_gp(); +		rcu_unexpedite_gp();  		break;  	default:  		break; @@ -3742,8 +3262,7 @@ static void __init rcu_init_geometry(void)  		jiffies_till_first_fqs = d;  	if (jiffies_till_next_fqs == ULONG_MAX)  		jiffies_till_next_fqs = d; -	if (jiffies_till_sched_qs == ULONG_MAX) -		adjust_jiffies_till_sched_qs(); +	adjust_jiffies_till_sched_qs();  	/* If the compile-time values are accurate, just leave. */  	if (rcu_fanout_leaf == RCU_FANOUT_LEAF && @@ -3858,5 +3377,6 @@ void __init rcu_init(void)  	srcu_init();  } +#include "tree_stall.h"  #include "tree_exp.h"  #include "tree_plugin.h" diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index bb4f995f2d3f..e253d11af3c4 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -393,15 +393,13 @@ static const char *tp_rcu_varname __used __tracepoint_string = rcu_name;  int rcu_dynticks_snap(struct rcu_data *rdp); -/* Forward declarations for rcutree_plugin.h */ +/* Forward declarations for tree_plugin.h */  static void rcu_bootup_announce(void);  static void rcu_qs(void);  static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp);  #ifdef CONFIG_HOTPLUG_CPU  static bool rcu_preempt_has_tasks(struct rcu_node *rnp);  #endif /* #ifdef CONFIG_HOTPLUG_CPU */ -static void rcu_print_detail_task_stall(void); -static int rcu_print_task_stall(struct rcu_node *rnp);  static int rcu_print_task_exp_stall(struct rcu_node *rnp);  static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp);  static void rcu_flavor_sched_clock_irq(int user); @@ -418,9 +416,6 @@ static void rcu_prepare_for_idle(void);  static bool rcu_preempt_has_tasks(struct rcu_node *rnp);  static bool rcu_preempt_need_deferred_qs(struct task_struct *t);  static void rcu_preempt_deferred_qs(struct task_struct *t); -static void print_cpu_stall_info_begin(void); -static void print_cpu_stall_info(int cpu); -static void print_cpu_stall_info_end(void);  static void zero_cpu_stall_ticks(struct rcu_data *rdp);  static bool rcu_nocb_cpu_needs_barrier(int cpu);  static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp); @@ -445,3 +440,10 @@ static void rcu_bind_gp_kthread(void);  static bool rcu_nohz_full_cpu(void);  static void rcu_dynticks_task_enter(void);  static void rcu_dynticks_task_exit(void); + +/* Forward declarations for tree_stall.h */ +static void record_gp_stall_check_time(void); +static void rcu_iw_handler(struct irq_work *iwp); +static void check_cpu_stall(struct rcu_data *rdp); +static void rcu_check_gp_start_stall(struct rcu_node *rnp, struct rcu_data *rdp, +				     const unsigned long gpssdelay); diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 4c2a0189e748..9c990df880d1 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -10,6 +10,7 @@  #include <linux/lockdep.h>  static void rcu_exp_handler(void *unused); +static int rcu_print_task_exp_stall(struct rcu_node *rnp);  /*   * Record the start of an expedited grace 
period. @@ -633,7 +634,7 @@ static void rcu_exp_handler(void *unused)  		raw_spin_lock_irqsave_rcu_node(rnp, flags);  		if (rnp->expmask & rdp->grpmask) {  			rdp->deferred_qs = true; -			WRITE_ONCE(t->rcu_read_unlock_special.b.exp_hint, true); +			t->rcu_read_unlock_special.b.exp_hint = true;  		}  		raw_spin_unlock_irqrestore_rcu_node(rnp, flags);  		return; @@ -648,7 +649,7 @@ static void rcu_exp_handler(void *unused)  	 *  	 * If the CPU is fully enabled (or if some buggy RCU-preempt  	 * read-side critical section is being used from idle), just -	 * invoke rcu_preempt_defer_qs() to immediately report the +	 * invoke rcu_preempt_deferred_qs() to immediately report the  	 * quiescent state.  We cannot use rcu_read_unlock_special()  	 * because we are in an interrupt handler, which will cause that  	 * function to take an early exit without doing anything. @@ -670,6 +671,27 @@ static void sync_sched_exp_online_cleanup(int cpu)  {  } +/* + * Scan the current list of tasks blocked within RCU read-side critical + * sections, printing out the tid of each that is blocking the current + * expedited grace period. + */ +static int rcu_print_task_exp_stall(struct rcu_node *rnp) +{ +	struct task_struct *t; +	int ndetected = 0; + +	if (!rnp->exp_tasks) +		return 0; +	t = list_entry(rnp->exp_tasks->prev, +		       struct task_struct, rcu_node_entry); +	list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) { +		pr_cont(" P%d", t->pid); +		ndetected++; +	} +	return ndetected; +} +  #else /* #ifdef CONFIG_PREEMPT_RCU */  /* Invoked on each online non-idle CPU for expedited quiescent state. */ @@ -709,6 +731,16 @@ static void sync_sched_exp_online_cleanup(int cpu)  	WARN_ON_ONCE(ret);  } +/* + * Because preemptible RCU does not exist, we never have to check for + * tasks blocked within RCU read-side critical sections that are + * blocking the current expedited grace period. + */ +static int rcu_print_task_exp_stall(struct rcu_node *rnp) +{ +	return 0; +} +  #endif /* #else #ifdef CONFIG_PREEMPT_RCU */  /** diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 97dba50f6fb2..1102765f91fd 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -285,7 +285,7 @@ static void rcu_qs(void)  				       TPS("cpuqs"));  		__this_cpu_write(rcu_data.cpu_no_qs.b.norm, false);  		barrier(); /* Coordinate with rcu_flavor_sched_clock_irq(). */ -		current->rcu_read_unlock_special.b.need_qs = false; +		WRITE_ONCE(current->rcu_read_unlock_special.b.need_qs, false);  	}  } @@ -643,100 +643,6 @@ static void rcu_read_unlock_special(struct task_struct *t)  }  /* - * Dump detailed information for all tasks blocking the current RCU - * grace period on the specified rcu_node structure. - */ -static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp) -{ -	unsigned long flags; -	struct task_struct *t; - -	raw_spin_lock_irqsave_rcu_node(rnp, flags); -	if (!rcu_preempt_blocked_readers_cgp(rnp)) { -		raw_spin_unlock_irqrestore_rcu_node(rnp, flags); -		return; -	} -	t = list_entry(rnp->gp_tasks->prev, -		       struct task_struct, rcu_node_entry); -	list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) { -		/* -		 * We could be printing a lot while holding a spinlock. -		 * Avoid triggering hard lockup. -		 */ -		touch_nmi_watchdog(); -		sched_show_task(t); -	} -	raw_spin_unlock_irqrestore_rcu_node(rnp, flags); -} - -/* - * Dump detailed information for all tasks blocking the current RCU - * grace period. 
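
The new rcu_print_task_exp_stall() above seeds its cursor with rnp->exp_tasks->prev because list_for_each_entry_continue() steps forward before it visits an element. A userspace toy with a hand-rolled intrusive list (no <linux/list.h>; all names here are invented for the sketch) shows the same walk:

#include <stddef.h>
#include <stdio.h>

/* Minimal stand-ins for the kernel's list_head / list_entry machinery. */
struct list_head { struct list_head *next, *prev; };
struct task { int pid; struct list_head node; };

#define node_to_task(ptr) ((struct task *)((char *)(ptr) - offsetof(struct task, node)))

/*
 * Continue-style walk: the cursor starts one element before the first
 * victim, advances, then visits, so the first task printed is "start"
 * itself and the walk runs to the end of the list (back to the head).
 */
static void print_blocked(struct list_head *head, struct list_head *start)
{
	struct list_head *cursor = start->prev;

	for (cursor = cursor->next; cursor != head; cursor = cursor->next)
		printf(" P%d", node_to_task(cursor)->pid);
	printf("\n");
}

int main(void)
{
	struct list_head head = { &head, &head };
	struct task tasks[3] = { { .pid = 101 }, { .pid = 102 }, { .pid = 103 } };
	int i;

	for (i = 0; i < 3; i++) {		/* tail-insert: head->101->102->103 */
		struct list_head *n = &tasks[i].node;

		n->prev = head.prev;
		n->next = &head;
		head.prev->next = n;
		head.prev = n;
	}

	/* Pretend tasks[1] is the first task blocking the expedited GP. */
	print_blocked(&head, &tasks[1].node);	/* prints " P102 P103" */
	return 0;
}
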
- */ -static void rcu_print_detail_task_stall(void) -{ -	struct rcu_node *rnp = rcu_get_root(); - -	rcu_print_detail_task_stall_rnp(rnp); -	rcu_for_each_leaf_node(rnp) -		rcu_print_detail_task_stall_rnp(rnp); -} - -static void rcu_print_task_stall_begin(struct rcu_node *rnp) -{ -	pr_err("\tTasks blocked on level-%d rcu_node (CPUs %d-%d):", -	       rnp->level, rnp->grplo, rnp->grphi); -} - -static void rcu_print_task_stall_end(void) -{ -	pr_cont("\n"); -} - -/* - * Scan the current list of tasks blocked within RCU read-side critical - * sections, printing out the tid of each. - */ -static int rcu_print_task_stall(struct rcu_node *rnp) -{ -	struct task_struct *t; -	int ndetected = 0; - -	if (!rcu_preempt_blocked_readers_cgp(rnp)) -		return 0; -	rcu_print_task_stall_begin(rnp); -	t = list_entry(rnp->gp_tasks->prev, -		       struct task_struct, rcu_node_entry); -	list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) { -		pr_cont(" P%d", t->pid); -		ndetected++; -	} -	rcu_print_task_stall_end(); -	return ndetected; -} - -/* - * Scan the current list of tasks blocked within RCU read-side critical - * sections, printing out the tid of each that is blocking the current - * expedited grace period. - */ -static int rcu_print_task_exp_stall(struct rcu_node *rnp) -{ -	struct task_struct *t; -	int ndetected = 0; - -	if (!rnp->exp_tasks) -		return 0; -	t = list_entry(rnp->exp_tasks->prev, -		       struct task_struct, rcu_node_entry); -	list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) { -		pr_cont(" P%d", t->pid); -		ndetected++; -	} -	return ndetected; -} - -/*   * Check that the list of blocked tasks for the newly completed grace   * period is in fact empty.  It is a serious bug to complete a grace   * period that still has RCU readers blocked!  This function must be @@ -804,19 +710,25 @@ static void rcu_flavor_sched_clock_irq(int user)  /*   * Check for a task exiting while in a preemptible-RCU read-side - * critical section, clean up if so.  No need to issue warnings, - * as debug_check_no_locks_held() already does this if lockdep - * is enabled. + * critical section, clean up if so.  No need to issue warnings, as + * debug_check_no_locks_held() already does this if lockdep is enabled. + * Besides, if this function does anything other than just immediately + * return, there was a bug of some sort.  Spewing warnings from this + * function is like as not to simply obscure important prior warnings.   */  void exit_rcu(void)  {  	struct task_struct *t = current; -	if (likely(list_empty(&current->rcu_node_entry))) +	if (unlikely(!list_empty(&current->rcu_node_entry))) { +		t->rcu_read_lock_nesting = 1; +		barrier(); +		WRITE_ONCE(t->rcu_read_unlock_special.b.blocked, true); +	} else if (unlikely(t->rcu_read_lock_nesting)) { +		t->rcu_read_lock_nesting = 1; +	} else {  		return; -	t->rcu_read_lock_nesting = 1; -	barrier(); -	t->rcu_read_unlock_special.b.blocked = true; +	}  	__rcu_read_unlock();  	rcu_preempt_deferred_qs(current);  } @@ -980,33 +892,6 @@ static bool rcu_preempt_need_deferred_qs(struct task_struct *t)  static void rcu_preempt_deferred_qs(struct task_struct *t) { }  /* - * Because preemptible RCU does not exist, we never have to check for - * tasks blocked within RCU read-side critical sections. - */ -static void rcu_print_detail_task_stall(void) -{ -} - -/* - * Because preemptible RCU does not exist, we never have to check for - * tasks blocked within RCU read-side critical sections. 
- */ -static int rcu_print_task_stall(struct rcu_node *rnp) -{ -	return 0; -} - -/* - * Because preemptible RCU does not exist, we never have to check for - * tasks blocked within RCU read-side critical sections that are - * blocking the current expedited grace period. - */ -static int rcu_print_task_exp_stall(struct rcu_node *rnp) -{ -	return 0; -} - -/*   * Because there is no preemptible RCU, there can be no readers blocked,   * so there is no need to check for blocked tasks.  So check only for   * bogus qsmask values. @@ -1185,8 +1070,6 @@ static int rcu_boost_kthread(void *arg)  static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)  	__releases(rnp->lock)  { -	struct task_struct *t; -  	raw_lockdep_assert_held_rcu_node(rnp);  	if (!rcu_preempt_blocked_readers_cgp(rnp) && rnp->exp_tasks == NULL) {  		raw_spin_unlock_irqrestore_rcu_node(rnp, flags); @@ -1200,9 +1083,8 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)  		if (rnp->exp_tasks == NULL)  			rnp->boost_tasks = rnp->gp_tasks;  		raw_spin_unlock_irqrestore_rcu_node(rnp, flags); -		t = rnp->boost_kthread_task; -		if (t) -			rcu_wake_cond(t, rnp->boost_kthread_status); +		rcu_wake_cond(rnp->boost_kthread_task, +			      rnp->boost_kthread_status);  	} else {  		raw_spin_unlock_irqrestore_rcu_node(rnp, flags);  	} @@ -1649,98 +1531,6 @@ static void rcu_cleanup_after_idle(void)  #endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */ -#ifdef CONFIG_RCU_FAST_NO_HZ - -static void print_cpu_stall_fast_no_hz(char *cp, int cpu) -{ -	struct rcu_data *rdp = &per_cpu(rcu_data, cpu); - -	sprintf(cp, "last_accelerate: %04lx/%04lx, Nonlazy posted: %c%c%c", -		rdp->last_accelerate & 0xffff, jiffies & 0xffff, -		".l"[rdp->all_lazy], -		".L"[!rcu_segcblist_n_nonlazy_cbs(&rdp->cblist)], -		".D"[!rdp->tick_nohz_enabled_snap]); -} - -#else /* #ifdef CONFIG_RCU_FAST_NO_HZ */ - -static void print_cpu_stall_fast_no_hz(char *cp, int cpu) -{ -	*cp = '\0'; -} - -#endif /* #else #ifdef CONFIG_RCU_FAST_NO_HZ */ - -/* Initiate the stall-info list. */ -static void print_cpu_stall_info_begin(void) -{ -	pr_cont("\n"); -} - -/* - * Print out diagnostic information for the specified stalled CPU. - * - * If the specified CPU is aware of the current RCU grace period, then - * print the number of scheduling clock interrupts the CPU has taken - * during the time that it has been aware.  Otherwise, print the number - * of RCU grace periods that this CPU is ignorant of, for example, "1" - * if the CPU was aware of the previous grace period. - * - * Also print out idle and (if CONFIG_RCU_FAST_NO_HZ) idle-entry info. - */ -static void print_cpu_stall_info(int cpu) -{ -	unsigned long delta; -	char fast_no_hz[72]; -	struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); -	char *ticks_title; -	unsigned long ticks_value; - -	/* -	 * We could be printing a lot while holding a spinlock.  Avoid -	 * triggering hard lockup. 
-	 */ -	touch_nmi_watchdog(); - -	ticks_value = rcu_seq_ctr(rcu_state.gp_seq - rdp->gp_seq); -	if (ticks_value) { -		ticks_title = "GPs behind"; -	} else { -		ticks_title = "ticks this GP"; -		ticks_value = rdp->ticks_this_gp; -	} -	print_cpu_stall_fast_no_hz(fast_no_hz, cpu); -	delta = rcu_seq_ctr(rdp->mynode->gp_seq - rdp->rcu_iw_gp_seq); -	pr_err("\t%d-%c%c%c%c: (%lu %s) idle=%03x/%ld/%#lx softirq=%u/%u fqs=%ld %s\n", -	       cpu, -	       "O."[!!cpu_online(cpu)], -	       "o."[!!(rdp->grpmask & rdp->mynode->qsmaskinit)], -	       "N."[!!(rdp->grpmask & rdp->mynode->qsmaskinitnext)], -	       !IS_ENABLED(CONFIG_IRQ_WORK) ? '?' : -			rdp->rcu_iw_pending ? (int)min(delta, 9UL) + '0' : -				"!."[!delta], -	       ticks_value, ticks_title, -	       rcu_dynticks_snap(rdp) & 0xfff, -	       rdp->dynticks_nesting, rdp->dynticks_nmi_nesting, -	       rdp->softirq_snap, kstat_softirqs_cpu(RCU_SOFTIRQ, cpu), -	       READ_ONCE(rcu_state.n_force_qs) - rcu_state.n_force_qs_gpstart, -	       fast_no_hz); -} - -/* Terminate the stall-info list. */ -static void print_cpu_stall_info_end(void) -{ -	pr_err("\t"); -} - -/* Zero ->ticks_this_gp and snapshot the number of RCU softirq handlers. */ -static void zero_cpu_stall_ticks(struct rcu_data *rdp) -{ -	rdp->ticks_this_gp = 0; -	rdp->softirq_snap = kstat_softirqs_cpu(RCU_SOFTIRQ, smp_processor_id()); -	WRITE_ONCE(rdp->last_fqs_resched, jiffies); -} -  #ifdef CONFIG_RCU_NOCB_CPU  /* @@ -1766,11 +1556,22 @@ static void zero_cpu_stall_ticks(struct rcu_data *rdp)   */ -/* Parse the boot-time rcu_nocb_mask CPU list from the kernel parameters. */ +/* + * Parse the boot-time rcu_nocb_mask CPU list from the kernel parameters. + * The string after the "rcu_nocbs=" is either "all" for all CPUs, or a + * comma-separated list of CPUs and/or CPU ranges.  If an invalid list is + * given, a warning is emitted and all CPUs are offloaded. + */  static int __init rcu_nocb_setup(char *str)  {  	alloc_bootmem_cpumask_var(&rcu_nocb_mask); -	cpulist_parse(str, rcu_nocb_mask); +	if (!strcasecmp(str, "all")) +		cpumask_setall(rcu_nocb_mask); +	else +		if (cpulist_parse(str, rcu_nocb_mask)) { +			pr_warn("rcu_nocbs= bad CPU range, all CPUs set\n"); +			cpumask_setall(rcu_nocb_mask); +		}  	return 1;  }  __setup("rcu_nocbs=", rcu_nocb_setup); diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h new file mode 100644 index 000000000000..f65a73a97323 --- /dev/null +++ b/kernel/rcu/tree_stall.h @@ -0,0 +1,709 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * RCU CPU stall warnings for normal RCU grace periods + * + * Copyright IBM Corporation, 2019 + * + * Author: Paul E. McKenney <paulmck@linux.ibm.com> + */ + +////////////////////////////////////////////////////////////////////////////// +// +// Controlling CPU stall warnings, including delay calculation. + +/* panic() on RCU Stall sysctl. */ +int sysctl_panic_on_rcu_stall __read_mostly; + +#ifdef CONFIG_PROVE_RCU +#define RCU_STALL_DELAY_DELTA	       (5 * HZ) +#else +#define RCU_STALL_DELAY_DELTA	       0 +#endif + +/* Limit-check stall timeouts specified at boottime and runtime. */ +int rcu_jiffies_till_stall_check(void) +{ +	int till_stall_check = READ_ONCE(rcu_cpu_stall_timeout); + +	/* +	 * Limit check must be consistent with the Kconfig limits +	 * for CONFIG_RCU_CPU_STALL_TIMEOUT. 
+	 */ +	if (till_stall_check < 3) { +		WRITE_ONCE(rcu_cpu_stall_timeout, 3); +		till_stall_check = 3; +	} else if (till_stall_check > 300) { +		WRITE_ONCE(rcu_cpu_stall_timeout, 300); +		till_stall_check = 300; +	} +	return till_stall_check * HZ + RCU_STALL_DELAY_DELTA; +} +EXPORT_SYMBOL_GPL(rcu_jiffies_till_stall_check); + +/* Don't do RCU CPU stall warnings during long sysrq printouts. */ +void rcu_sysrq_start(void) +{ +	if (!rcu_cpu_stall_suppress) +		rcu_cpu_stall_suppress = 2; +} + +void rcu_sysrq_end(void) +{ +	if (rcu_cpu_stall_suppress == 2) +		rcu_cpu_stall_suppress = 0; +} + +/* Don't print RCU CPU stall warnings during a kernel panic. */ +static int rcu_panic(struct notifier_block *this, unsigned long ev, void *ptr) +{ +	rcu_cpu_stall_suppress = 1; +	return NOTIFY_DONE; +} + +static struct notifier_block rcu_panic_block = { +	.notifier_call = rcu_panic, +}; + +static int __init check_cpu_stall_init(void) +{ +	atomic_notifier_chain_register(&panic_notifier_list, &rcu_panic_block); +	return 0; +} +early_initcall(check_cpu_stall_init); + +/* If so specified via sysctl, panic, yielding cleaner stall-warning output. */ +static void panic_on_rcu_stall(void) +{ +	if (sysctl_panic_on_rcu_stall) +		panic("RCU Stall\n"); +} + +/** + * rcu_cpu_stall_reset - prevent further stall warnings in current grace period + * + * Set the stall-warning timeout way off into the future, thus preventing + * any RCU CPU stall-warning messages from appearing in the current set of + * RCU grace periods. + * + * The caller must disable hard irqs. + */ +void rcu_cpu_stall_reset(void) +{ +	WRITE_ONCE(rcu_state.jiffies_stall, jiffies + ULONG_MAX / 2); +} + +////////////////////////////////////////////////////////////////////////////// +// +// Interaction with RCU grace periods + +/* Start of new grace period, so record stall time (and forcing times). */ +static void record_gp_stall_check_time(void) +{ +	unsigned long j = jiffies; +	unsigned long j1; + +	rcu_state.gp_start = j; +	j1 = rcu_jiffies_till_stall_check(); +	/* Record ->gp_start before ->jiffies_stall. */ +	smp_store_release(&rcu_state.jiffies_stall, j + j1); /* ^^^ */ +	rcu_state.jiffies_resched = j + j1 / 2; +	rcu_state.n_force_qs_gpstart = READ_ONCE(rcu_state.n_force_qs); +} + +/* Zero ->ticks_this_gp and snapshot the number of RCU softirq handlers. */ +static void zero_cpu_stall_ticks(struct rcu_data *rdp) +{ +	rdp->ticks_this_gp = 0; +	rdp->softirq_snap = kstat_softirqs_cpu(RCU_SOFTIRQ, smp_processor_id()); +	WRITE_ONCE(rdp->last_fqs_resched, jiffies); +} + +/* + * If too much time has passed in the current grace period, and if + * so configured, go kick the relevant kthreads. + */ +static void rcu_stall_kick_kthreads(void) +{ +	unsigned long j; + +	if (!rcu_kick_kthreads) +		return; +	j = READ_ONCE(rcu_state.jiffies_kick_kthreads); +	if (time_after(jiffies, j) && rcu_state.gp_kthread && +	    (rcu_gp_in_progress() || READ_ONCE(rcu_state.gp_flags))) { +		WARN_ONCE(1, "Kicking %s grace-period kthread\n", +			  rcu_state.name); +		rcu_ftrace_dump(DUMP_ALL); +		wake_up_process(rcu_state.gp_kthread); +		WRITE_ONCE(rcu_state.jiffies_kick_kthreads, j + HZ); +	} +} + +/* + * Handler for the irq_work request posted about halfway into the RCU CPU + * stall timeout, and used to detect excessive irq disabling.  Set state + * appropriately, but just complain if there is unexpected state on entry. 
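
The rcu_jiffies_till_stall_check() clamp above can be exercised on its own. The HZ value and the CONFIG_PROVE_RCU slack below are assumed for illustration; the [3, 300] second window and the scaling mirror the new code:

#include <stdio.h>

#define HZ			250		/* assumed tick rate for the example */
#define STALL_DELAY_DELTA	(5 * HZ)	/* extra slack when PROVE_RCU is set */

/* Mirror of the clamp: seconds are forced into [3, 300] before scaling. */
static int stall_check_jiffies(int timeout_sec, int prove_rcu)
{
	if (timeout_sec < 3)
		timeout_sec = 3;
	else if (timeout_sec > 300)
		timeout_sec = 300;
	return timeout_sec * HZ + (prove_rcu ? STALL_DELAY_DELTA : 0);
}

int main(void)
{
	printf("%d\n", stall_check_jiffies(0, 0));	/* 750: clamped up to 3s    */
	printf("%d\n", stall_check_jiffies(21, 0));	/* 5250: e.g. a 21s setting */
	printf("%d\n", stall_check_jiffies(1000, 1));	/* 76250: clamped to 300s   */
	return 0;
}
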
+ */ +static void rcu_iw_handler(struct irq_work *iwp) +{ +	struct rcu_data *rdp; +	struct rcu_node *rnp; + +	rdp = container_of(iwp, struct rcu_data, rcu_iw); +	rnp = rdp->mynode; +	raw_spin_lock_rcu_node(rnp); +	if (!WARN_ON_ONCE(!rdp->rcu_iw_pending)) { +		rdp->rcu_iw_gp_seq = rnp->gp_seq; +		rdp->rcu_iw_pending = false; +	} +	raw_spin_unlock_rcu_node(rnp); +} + +////////////////////////////////////////////////////////////////////////////// +// +// Printing RCU CPU stall warnings + +#ifdef CONFIG_PREEMPT + +/* + * Dump detailed information for all tasks blocking the current RCU + * grace period on the specified rcu_node structure. + */ +static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp) +{ +	unsigned long flags; +	struct task_struct *t; + +	raw_spin_lock_irqsave_rcu_node(rnp, flags); +	if (!rcu_preempt_blocked_readers_cgp(rnp)) { +		raw_spin_unlock_irqrestore_rcu_node(rnp, flags); +		return; +	} +	t = list_entry(rnp->gp_tasks->prev, +		       struct task_struct, rcu_node_entry); +	list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) { +		/* +		 * We could be printing a lot while holding a spinlock. +		 * Avoid triggering hard lockup. +		 */ +		touch_nmi_watchdog(); +		sched_show_task(t); +	} +	raw_spin_unlock_irqrestore_rcu_node(rnp, flags); +} + +/* + * Scan the current list of tasks blocked within RCU read-side critical + * sections, printing out the tid of each. + */ +static int rcu_print_task_stall(struct rcu_node *rnp) +{ +	struct task_struct *t; +	int ndetected = 0; + +	if (!rcu_preempt_blocked_readers_cgp(rnp)) +		return 0; +	pr_err("\tTasks blocked on level-%d rcu_node (CPUs %d-%d):", +	       rnp->level, rnp->grplo, rnp->grphi); +	t = list_entry(rnp->gp_tasks->prev, +		       struct task_struct, rcu_node_entry); +	list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) { +		pr_cont(" P%d", t->pid); +		ndetected++; +	} +	pr_cont("\n"); +	return ndetected; +} + +#else /* #ifdef CONFIG_PREEMPT */ + +/* + * Because preemptible RCU does not exist, we never have to check for + * tasks blocked within RCU read-side critical sections. + */ +static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp) +{ +} + +/* + * Because preemptible RCU does not exist, we never have to check for + * tasks blocked within RCU read-side critical sections. + */ +static int rcu_print_task_stall(struct rcu_node *rnp) +{ +	return 0; +} +#endif /* #else #ifdef CONFIG_PREEMPT */ + +/* + * Dump stacks of all tasks running on stalled CPUs.  First try using + * NMIs, but fall back to manual remote stack tracing on architectures + * that don't support NMI-based stack dumps.  The NMI-triggered stack + * traces are more accurate because they are printed by the target CPU. 
+ */ +static void rcu_dump_cpu_stacks(void) +{ +	int cpu; +	unsigned long flags; +	struct rcu_node *rnp; + +	rcu_for_each_leaf_node(rnp) { +		raw_spin_lock_irqsave_rcu_node(rnp, flags); +		for_each_leaf_node_possible_cpu(rnp, cpu) +			if (rnp->qsmask & leaf_node_cpu_bit(rnp, cpu)) +				if (!trigger_single_cpu_backtrace(cpu)) +					dump_cpu_task(cpu); +		raw_spin_unlock_irqrestore_rcu_node(rnp, flags); +	} +} + +#ifdef CONFIG_RCU_FAST_NO_HZ + +static void print_cpu_stall_fast_no_hz(char *cp, int cpu) +{ +	struct rcu_data *rdp = &per_cpu(rcu_data, cpu); + +	sprintf(cp, "last_accelerate: %04lx/%04lx, Nonlazy posted: %c%c%c", +		rdp->last_accelerate & 0xffff, jiffies & 0xffff, +		".l"[rdp->all_lazy], +		".L"[!rcu_segcblist_n_nonlazy_cbs(&rdp->cblist)], +		".D"[!!rdp->tick_nohz_enabled_snap]); +} + +#else /* #ifdef CONFIG_RCU_FAST_NO_HZ */ + +static void print_cpu_stall_fast_no_hz(char *cp, int cpu) +{ +	*cp = '\0'; +} + +#endif /* #else #ifdef CONFIG_RCU_FAST_NO_HZ */ + +/* + * Print out diagnostic information for the specified stalled CPU. + * + * If the specified CPU is aware of the current RCU grace period, then + * print the number of scheduling clock interrupts the CPU has taken + * during the time that it has been aware.  Otherwise, print the number + * of RCU grace periods that this CPU is ignorant of, for example, "1" + * if the CPU was aware of the previous grace period. + * + * Also print out idle and (if CONFIG_RCU_FAST_NO_HZ) idle-entry info. + */ +static void print_cpu_stall_info(int cpu) +{ +	unsigned long delta; +	char fast_no_hz[72]; +	struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); +	char *ticks_title; +	unsigned long ticks_value; + +	/* +	 * We could be printing a lot while holding a spinlock.  Avoid +	 * triggering hard lockup. +	 */ +	touch_nmi_watchdog(); + +	ticks_value = rcu_seq_ctr(rcu_state.gp_seq - rdp->gp_seq); +	if (ticks_value) { +		ticks_title = "GPs behind"; +	} else { +		ticks_title = "ticks this GP"; +		ticks_value = rdp->ticks_this_gp; +	} +	print_cpu_stall_fast_no_hz(fast_no_hz, cpu); +	delta = rcu_seq_ctr(rdp->mynode->gp_seq - rdp->rcu_iw_gp_seq); +	pr_err("\t%d-%c%c%c%c: (%lu %s) idle=%03x/%ld/%#lx softirq=%u/%u fqs=%ld %s\n", +	       cpu, +	       "O."[!!cpu_online(cpu)], +	       "o."[!!(rdp->grpmask & rdp->mynode->qsmaskinit)], +	       "N."[!!(rdp->grpmask & rdp->mynode->qsmaskinitnext)], +	       !IS_ENABLED(CONFIG_IRQ_WORK) ? '?' : +			rdp->rcu_iw_pending ? (int)min(delta, 9UL) + '0' : +				"!."[!delta], +	       ticks_value, ticks_title, +	       rcu_dynticks_snap(rdp) & 0xfff, +	       rdp->dynticks_nesting, rdp->dynticks_nmi_nesting, +	       rdp->softirq_snap, kstat_softirqs_cpu(RCU_SOFTIRQ, cpu), +	       READ_ONCE(rcu_state.n_force_qs) - rcu_state.n_force_qs_gpstart, +	       fast_no_hz); +} + +/* Complain about starvation of grace-period kthread.  */ +static void rcu_check_gp_kthread_starvation(void) +{ +	struct task_struct *gpk = rcu_state.gp_kthread; +	unsigned long j; + +	j = jiffies - READ_ONCE(rcu_state.gp_activity); +	if (j > 2 * HZ) { +		pr_err("%s kthread starved for %ld jiffies! g%ld f%#x %s(%d) ->state=%#lx ->cpu=%d\n", +		       rcu_state.name, j, +		       (long)rcu_seq_current(&rcu_state.gp_seq), +		       READ_ONCE(rcu_state.gp_flags), +		       gp_state_getname(rcu_state.gp_state), rcu_state.gp_state, +		       gpk ? gpk->state : ~0, gpk ? 
task_cpu(gpk) : -1); +		if (gpk) { +			pr_err("RCU grace-period kthread stack dump:\n"); +			sched_show_task(gpk); +			wake_up_process(gpk); +		} +	} +} + +static void print_other_cpu_stall(unsigned long gp_seq) +{ +	int cpu; +	unsigned long flags; +	unsigned long gpa; +	unsigned long j; +	int ndetected = 0; +	struct rcu_node *rnp; +	long totqlen = 0; + +	/* Kick and suppress, if so configured. */ +	rcu_stall_kick_kthreads(); +	if (rcu_cpu_stall_suppress) +		return; + +	/* +	 * OK, time to rat on our buddy... +	 * See Documentation/RCU/stallwarn.txt for info on how to debug +	 * RCU CPU stall warnings. +	 */ +	pr_err("INFO: %s detected stalls on CPUs/tasks:\n", rcu_state.name); +	rcu_for_each_leaf_node(rnp) { +		raw_spin_lock_irqsave_rcu_node(rnp, flags); +		ndetected += rcu_print_task_stall(rnp); +		if (rnp->qsmask != 0) { +			for_each_leaf_node_possible_cpu(rnp, cpu) +				if (rnp->qsmask & leaf_node_cpu_bit(rnp, cpu)) { +					print_cpu_stall_info(cpu); +					ndetected++; +				} +		} +		raw_spin_unlock_irqrestore_rcu_node(rnp, flags); +	} + +	for_each_possible_cpu(cpu) +		totqlen += rcu_get_n_cbs_cpu(cpu); +	pr_cont("\t(detected by %d, t=%ld jiffies, g=%ld, q=%lu)\n", +	       smp_processor_id(), (long)(jiffies - rcu_state.gp_start), +	       (long)rcu_seq_current(&rcu_state.gp_seq), totqlen); +	if (ndetected) { +		rcu_dump_cpu_stacks(); + +		/* Complain about tasks blocking the grace period. */ +		rcu_for_each_leaf_node(rnp) +			rcu_print_detail_task_stall_rnp(rnp); +	} else { +		if (rcu_seq_current(&rcu_state.gp_seq) != gp_seq) { +			pr_err("INFO: Stall ended before state dump start\n"); +		} else { +			j = jiffies; +			gpa = READ_ONCE(rcu_state.gp_activity); +			pr_err("All QSes seen, last %s kthread activity %ld (%ld-%ld), jiffies_till_next_fqs=%ld, root ->qsmask %#lx\n", +			       rcu_state.name, j - gpa, j, gpa, +			       READ_ONCE(jiffies_till_next_fqs), +			       rcu_get_root()->qsmask); +			/* In this case, the current CPU might be at fault. */ +			sched_show_task(current); +		} +	} +	/* Rewrite if needed in case of slow consoles. */ +	if (ULONG_CMP_GE(jiffies, READ_ONCE(rcu_state.jiffies_stall))) +		WRITE_ONCE(rcu_state.jiffies_stall, +			   jiffies + 3 * rcu_jiffies_till_stall_check() + 3); + +	rcu_check_gp_kthread_starvation(); + +	panic_on_rcu_stall(); + +	rcu_force_quiescent_state();  /* Kick them all. */ +} + +static void print_cpu_stall(void) +{ +	int cpu; +	unsigned long flags; +	struct rcu_data *rdp = this_cpu_ptr(&rcu_data); +	struct rcu_node *rnp = rcu_get_root(); +	long totqlen = 0; + +	/* Kick and suppress, if so configured. */ +	rcu_stall_kick_kthreads(); +	if (rcu_cpu_stall_suppress) +		return; + +	/* +	 * OK, time to rat on ourselves... +	 * See Documentation/RCU/stallwarn.txt for info on how to debug +	 * RCU CPU stall warnings. +	 */ +	pr_err("INFO: %s self-detected stall on CPU\n", rcu_state.name); +	raw_spin_lock_irqsave_rcu_node(rdp->mynode, flags); +	print_cpu_stall_info(smp_processor_id()); +	raw_spin_unlock_irqrestore_rcu_node(rdp->mynode, flags); +	for_each_possible_cpu(cpu) +		totqlen += rcu_get_n_cbs_cpu(cpu); +	pr_cont("\t(t=%lu jiffies g=%ld q=%lu)\n", +		jiffies - rcu_state.gp_start, +		(long)rcu_seq_current(&rcu_state.gp_seq), totqlen); + +	rcu_check_gp_kthread_starvation(); + +	rcu_dump_cpu_stacks(); + +	raw_spin_lock_irqsave_rcu_node(rnp, flags); +	/* Rewrite if needed in case of slow consoles. 
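
The deadline rewrites here lean on ULONG_CMP_GE(), which tolerates jiffies wrap by comparing through unsigned subtraction rather than a plain >=. A standalone restatement (the macro body is paraphrased from the RCU headers, so treat it as illustrative):

#include <limits.h>
#include <stdio.h>

/* Wrap-tolerant "a >= b": the difference stays in the lower half of the
 * unsigned range as long as the two counters are less than half a wrap
 * apart, even if the raw values have wrapped past ULONG_MAX. */
#define ULONG_CMP_GE(a, b)	(ULONG_MAX / 2 >= (a) - (b))

int main(void)
{
	unsigned long deadline = ULONG_MAX - 5;	/* stall deadline set just before a wrap */
	unsigned long now = 10;			/* jiffies has since wrapped around      */

	printf("naive >=:     %d\n", now >= deadline);			/* 0: wrong   */
	printf("ULONG_CMP_GE: %d\n", ULONG_CMP_GE(now, deadline));	/* 1: correct */
	return 0;
}

The same trick is what lets the kernel's time_before()/time_after() helpers keep working across a jiffies wrap.
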
*/ +	if (ULONG_CMP_GE(jiffies, READ_ONCE(rcu_state.jiffies_stall))) +		WRITE_ONCE(rcu_state.jiffies_stall, +			   jiffies + 3 * rcu_jiffies_till_stall_check() + 3); +	raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + +	panic_on_rcu_stall(); + +	/* +	 * Attempt to revive the RCU machinery by forcing a context switch. +	 * +	 * A context switch would normally allow the RCU state machine to make +	 * progress and it could be we're stuck in kernel space without context +	 * switches for an entirely unreasonable amount of time. +	 */ +	set_tsk_need_resched(current); +	set_preempt_need_resched(); +} + +static void check_cpu_stall(struct rcu_data *rdp) +{ +	unsigned long gs1; +	unsigned long gs2; +	unsigned long gps; +	unsigned long j; +	unsigned long jn; +	unsigned long js; +	struct rcu_node *rnp; + +	if ((rcu_cpu_stall_suppress && !rcu_kick_kthreads) || +	    !rcu_gp_in_progress()) +		return; +	rcu_stall_kick_kthreads(); +	j = jiffies; + +	/* +	 * Lots of memory barriers to reject false positives. +	 * +	 * The idea is to pick up rcu_state.gp_seq, then +	 * rcu_state.jiffies_stall, then rcu_state.gp_start, and finally +	 * another copy of rcu_state.gp_seq.  These values are updated in +	 * the opposite order with memory barriers (or equivalent) during +	 * grace-period initialization and cleanup.  Now, a false positive +	 * can occur if we get an new value of rcu_state.gp_start and a old +	 * value of rcu_state.jiffies_stall.  But given the memory barriers, +	 * the only way that this can happen is if one grace period ends +	 * and another starts between these two fetches.  This is detected +	 * by comparing the second fetch of rcu_state.gp_seq with the +	 * previous fetch from rcu_state.gp_seq. +	 * +	 * Given this check, comparisons of jiffies, rcu_state.jiffies_stall, +	 * and rcu_state.gp_start suffice to forestall false positives. +	 */ +	gs1 = READ_ONCE(rcu_state.gp_seq); +	smp_rmb(); /* Pick up ->gp_seq first... */ +	js = READ_ONCE(rcu_state.jiffies_stall); +	smp_rmb(); /* ...then ->jiffies_stall before the rest... */ +	gps = READ_ONCE(rcu_state.gp_start); +	smp_rmb(); /* ...and finally ->gp_start before ->gp_seq again. */ +	gs2 = READ_ONCE(rcu_state.gp_seq); +	if (gs1 != gs2 || +	    ULONG_CMP_LT(j, js) || +	    ULONG_CMP_GE(gps, js)) +		return; /* No stall or GP completed since entering function. */ +	rnp = rdp->mynode; +	jn = jiffies + 3 * rcu_jiffies_till_stall_check() + 3; +	if (rcu_gp_in_progress() && +	    (READ_ONCE(rnp->qsmask) & rdp->grpmask) && +	    cmpxchg(&rcu_state.jiffies_stall, js, jn) == js) { + +		/* We haven't checked in, so go dump stack. */ +		print_cpu_stall(); + +	} else if (rcu_gp_in_progress() && +		   ULONG_CMP_GE(j, js + RCU_STALL_RAT_DELAY) && +		   cmpxchg(&rcu_state.jiffies_stall, js, jn) == js) { + +		/* They had a few time units to dump stack, so complain. */ +		print_other_cpu_stall(gs2); +	} +} + +////////////////////////////////////////////////////////////////////////////// +// +// RCU forward-progress mechanisms, including of callback invocation. + + +/* + * Show the state of the grace-period kthreads. 
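
check_cpu_stall() above samples ->gp_seq, then the stall timestamps, then ->gp_seq again, and discards the snapshot if the two sequence reads differ. A userspace C11 sketch of just that reader-side pattern, with relaxed atomics plus acquire fences standing in for READ_ONCE()/smp_rmb() and invented variables standing in for the rcu_state fields:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/* Shared state that a hypothetical grace-period "writer" updates in the
 * opposite order, with release ordering between the stores. */
static _Atomic unsigned long gp_seq;
static _Atomic unsigned long jiffies_stall;
static _Atomic unsigned long gp_start;

/*
 * If the two gp_seq samples differ, a grace-period boundary raced with
 * us and the timestamps may be mismatched, so report the snapshot as
 * unusable ("no stall") instead of risking a false positive.
 */
static bool consistent_snapshot(unsigned long *js, unsigned long *gps)
{
	unsigned long gs1, gs2;

	gs1 = atomic_load_explicit(&gp_seq, memory_order_relaxed);
	atomic_thread_fence(memory_order_acquire);	/* ->gp_seq first...      */
	*js = atomic_load_explicit(&jiffies_stall, memory_order_relaxed);
	atomic_thread_fence(memory_order_acquire);	/* ...then the stamps...  */
	*gps = atomic_load_explicit(&gp_start, memory_order_relaxed);
	atomic_thread_fence(memory_order_acquire);	/* ...then ->gp_seq again */
	gs2 = atomic_load_explicit(&gp_seq, memory_order_relaxed);
	return gs1 == gs2;
}

int main(void)
{
	unsigned long js, gps;

	atomic_store(&gp_seq, 8);
	atomic_store(&gp_start, 1000);
	atomic_store(&jiffies_stall, 1000 + 21 * 250);

	printf("snapshot usable: %d (js=%lu gps=%lu)\n",
	       consistent_snapshot(&js, &gps), js, gps);
	return 0;
}
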
+ */ +void show_rcu_gp_kthreads(void) +{ +	int cpu; +	unsigned long j; +	unsigned long ja; +	unsigned long jr; +	unsigned long jw; +	struct rcu_data *rdp; +	struct rcu_node *rnp; + +	j = jiffies; +	ja = j - READ_ONCE(rcu_state.gp_activity); +	jr = j - READ_ONCE(rcu_state.gp_req_activity); +	jw = j - READ_ONCE(rcu_state.gp_wake_time); +	pr_info("%s: wait state: %s(%d) ->state: %#lx delta ->gp_activity %lu ->gp_req_activity %lu ->gp_wake_time %lu ->gp_wake_seq %ld ->gp_seq %ld ->gp_seq_needed %ld ->gp_flags %#x\n", +		rcu_state.name, gp_state_getname(rcu_state.gp_state), +		rcu_state.gp_state, +		rcu_state.gp_kthread ? rcu_state.gp_kthread->state : 0x1ffffL, +		ja, jr, jw, (long)READ_ONCE(rcu_state.gp_wake_seq), +		(long)READ_ONCE(rcu_state.gp_seq), +		(long)READ_ONCE(rcu_get_root()->gp_seq_needed), +		READ_ONCE(rcu_state.gp_flags)); +	rcu_for_each_node_breadth_first(rnp) { +		if (ULONG_CMP_GE(rcu_state.gp_seq, rnp->gp_seq_needed)) +			continue; +		pr_info("\trcu_node %d:%d ->gp_seq %ld ->gp_seq_needed %ld\n", +			rnp->grplo, rnp->grphi, (long)rnp->gp_seq, +			(long)rnp->gp_seq_needed); +		if (!rcu_is_leaf_node(rnp)) +			continue; +		for_each_leaf_node_possible_cpu(rnp, cpu) { +			rdp = per_cpu_ptr(&rcu_data, cpu); +			if (rdp->gpwrap || +			    ULONG_CMP_GE(rcu_state.gp_seq, +					 rdp->gp_seq_needed)) +				continue; +			pr_info("\tcpu %d ->gp_seq_needed %ld\n", +				cpu, (long)rdp->gp_seq_needed); +		} +	} +	/* sched_show_task(rcu_state.gp_kthread); */ +} +EXPORT_SYMBOL_GPL(show_rcu_gp_kthreads); + +/* + * This function checks for grace-period requests that fail to motivate + * RCU to come out of its idle mode. + */ +static void rcu_check_gp_start_stall(struct rcu_node *rnp, struct rcu_data *rdp, +				     const unsigned long gpssdelay) +{ +	unsigned long flags; +	unsigned long j; +	struct rcu_node *rnp_root = rcu_get_root(); +	static atomic_t warned = ATOMIC_INIT(0); + +	if (!IS_ENABLED(CONFIG_PROVE_RCU) || rcu_gp_in_progress() || +	    ULONG_CMP_GE(rnp_root->gp_seq, rnp_root->gp_seq_needed)) +		return; +	j = jiffies; /* Expensive access, and in common case don't get here. */ +	if (time_before(j, READ_ONCE(rcu_state.gp_req_activity) + gpssdelay) || +	    time_before(j, READ_ONCE(rcu_state.gp_activity) + gpssdelay) || +	    atomic_read(&warned)) +		return; + +	raw_spin_lock_irqsave_rcu_node(rnp, flags); +	j = jiffies; +	if (rcu_gp_in_progress() || +	    ULONG_CMP_GE(rnp_root->gp_seq, rnp_root->gp_seq_needed) || +	    time_before(j, READ_ONCE(rcu_state.gp_req_activity) + gpssdelay) || +	    time_before(j, READ_ONCE(rcu_state.gp_activity) + gpssdelay) || +	    atomic_read(&warned)) { +		raw_spin_unlock_irqrestore_rcu_node(rnp, flags); +		return; +	} +	/* Hold onto the leaf lock to make others see warned==1. */ + +	if (rnp_root != rnp) +		raw_spin_lock_rcu_node(rnp_root); /* irqs already disabled. */ +	j = jiffies; +	if (rcu_gp_in_progress() || +	    ULONG_CMP_GE(rnp_root->gp_seq, rnp_root->gp_seq_needed) || +	    time_before(j, rcu_state.gp_req_activity + gpssdelay) || +	    time_before(j, rcu_state.gp_activity + gpssdelay) || +	    atomic_xchg(&warned, 1)) { +		raw_spin_unlock_rcu_node(rnp_root); /* irqs remain disabled. */ +		raw_spin_unlock_irqrestore_rcu_node(rnp, flags); +		return; +	} +	WARN_ON(1); +	if (rnp_root != rnp) +		raw_spin_unlock_rcu_node(rnp_root); +	raw_spin_unlock_irqrestore_rcu_node(rnp, flags); +	show_rcu_gp_kthreads(); +} + +/* + * Do a forward-progress check for rcutorture.  This is normally invoked + * due to an OOM event.  
The argument "j" gives the time period during + * which rcutorture would like progress to have been made. + */ +void rcu_fwd_progress_check(unsigned long j) +{ +	unsigned long cbs; +	int cpu; +	unsigned long max_cbs = 0; +	int max_cpu = -1; +	struct rcu_data *rdp; + +	if (rcu_gp_in_progress()) { +		pr_info("%s: GP age %lu jiffies\n", +			__func__, jiffies - rcu_state.gp_start); +		show_rcu_gp_kthreads(); +	} else { +		pr_info("%s: Last GP end %lu jiffies ago\n", +			__func__, jiffies - rcu_state.gp_end); +		preempt_disable(); +		rdp = this_cpu_ptr(&rcu_data); +		rcu_check_gp_start_stall(rdp->mynode, rdp, j); +		preempt_enable(); +	} +	for_each_possible_cpu(cpu) { +		cbs = rcu_get_n_cbs_cpu(cpu); +		if (!cbs) +			continue; +		if (max_cpu < 0) +			pr_info("%s: callbacks", __func__); +		pr_cont(" %d: %lu", cpu, cbs); +		if (cbs <= max_cbs) +			continue; +		max_cbs = cbs; +		max_cpu = cpu; +	} +	if (max_cpu >= 0) +		pr_cont("\n"); +} +EXPORT_SYMBOL_GPL(rcu_fwd_progress_check); + +/* Commandeer a sysrq key to dump RCU's tree. */ +static bool sysrq_rcu; +module_param(sysrq_rcu, bool, 0444); + +/* Dump grace-period-request information due to commandeered sysrq. */ +static void sysrq_show_rcu(int key) +{ +	show_rcu_gp_kthreads(); +} + +static struct sysrq_key_op sysrq_rcudump_op = { +	.handler = sysrq_show_rcu, +	.help_msg = "show-rcu(y)", +	.action_msg = "Show RCU tree", +	.enable_mask = SYSRQ_ENABLE_DUMP, +}; + +static int __init rcu_sysrq_init(void) +{ +	if (sysrq_rcu) +		return register_sysrq_key('y', &sysrq_rcudump_op); +	return 0; +} +early_initcall(rcu_sysrq_init); diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index cbaa976c5945..c3bf44ba42e5 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -424,68 +424,11 @@ EXPORT_SYMBOL_GPL(do_trace_rcu_torture_read);  #endif  #ifdef CONFIG_RCU_STALL_COMMON - -#ifdef CONFIG_PROVE_RCU -#define RCU_STALL_DELAY_DELTA	       (5 * HZ) -#else -#define RCU_STALL_DELAY_DELTA	       0 -#endif -  int rcu_cpu_stall_suppress __read_mostly; /* 1 = suppress stall warnings. */  EXPORT_SYMBOL_GPL(rcu_cpu_stall_suppress); -static int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT; -  module_param(rcu_cpu_stall_suppress, int, 0644); +int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT;  module_param(rcu_cpu_stall_timeout, int, 0644); - -int rcu_jiffies_till_stall_check(void) -{ -	int till_stall_check = READ_ONCE(rcu_cpu_stall_timeout); - -	/* -	 * Limit check must be consistent with the Kconfig limits -	 * for CONFIG_RCU_CPU_STALL_TIMEOUT. 
-	 */ -	if (till_stall_check < 3) { -		WRITE_ONCE(rcu_cpu_stall_timeout, 3); -		till_stall_check = 3; -	} else if (till_stall_check > 300) { -		WRITE_ONCE(rcu_cpu_stall_timeout, 300); -		till_stall_check = 300; -	} -	return till_stall_check * HZ + RCU_STALL_DELAY_DELTA; -} -EXPORT_SYMBOL_GPL(rcu_jiffies_till_stall_check); - -void rcu_sysrq_start(void) -{ -	if (!rcu_cpu_stall_suppress) -		rcu_cpu_stall_suppress = 2; -} - -void rcu_sysrq_end(void) -{ -	if (rcu_cpu_stall_suppress == 2) -		rcu_cpu_stall_suppress = 0; -} - -static int rcu_panic(struct notifier_block *this, unsigned long ev, void *ptr) -{ -	rcu_cpu_stall_suppress = 1; -	return NOTIFY_DONE; -} - -static struct notifier_block rcu_panic_block = { -	.notifier_call = rcu_panic, -}; - -static int __init check_cpu_stall_init(void) -{ -	atomic_notifier_chain_register(&panic_notifier_list, &rcu_panic_block); -	return 0; -} -early_initcall(check_cpu_stall_init); -  #endif /* #ifdef CONFIG_RCU_STALL_COMMON */  #ifdef CONFIG_TASKS_RCU diff --git a/kernel/reboot.c b/kernel/reboot.c index e1b79b6a2735..c4d472b7f1b4 100644 --- a/kernel/reboot.c +++ b/kernel/reboot.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only  /*   *  linux/kernel/reboot.c   * @@ -31,6 +32,7 @@ EXPORT_SYMBOL(cad_pid);  #define DEFAULT_REBOOT_MODE  #endif  enum reboot_mode reboot_mode DEFAULT_REBOOT_MODE; +enum reboot_mode panic_reboot_mode = REBOOT_UNDEFINED;  /*   * This variable is used privately to keep track of whether or not @@ -519,6 +521,8 @@ EXPORT_SYMBOL_GPL(orderly_reboot);  static int __init reboot_setup(char *str)  {  	for (;;) { +		enum reboot_mode *mode; +  		/*  		 * Having anything passed on the command line via  		 * reboot= will cause us to disable DMI checking @@ -526,17 +530,24 @@ static int __init reboot_setup(char *str)  		 */  		reboot_default = 0; +		if (!strncmp(str, "panic_", 6)) { +			mode = &panic_reboot_mode; +			str += 6; +		} else { +			mode = &reboot_mode; +		} +  		switch (*str) {  		case 'w': -			reboot_mode = REBOOT_WARM; +			*mode = REBOOT_WARM;  			break;  		case 'c': -			reboot_mode = REBOOT_COLD; +			*mode = REBOOT_COLD;  			break;  		case 'h': -			reboot_mode = REBOOT_HARD; +			*mode = REBOOT_HARD;  			break;  		case 's': @@ -553,11 +564,11 @@ static int __init reboot_setup(char *str)  				if (rc)  					return rc;  			} else -				reboot_mode = REBOOT_SOFT; +				*mode = REBOOT_SOFT;  			break;  		}  		case 'g': -			reboot_mode = REBOOT_GPIO; +			*mode = REBOOT_GPIO;  			break;  		case 'b': diff --git a/kernel/resource.c b/kernel/resource.c index 92190f62ebc5..158f04ec1d4f 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only  /*   *	linux/kernel/resource.c   * @@ -520,21 +521,20 @@ EXPORT_SYMBOL_GPL(page_is_ram);  int region_intersects(resource_size_t start, size_t size, unsigned long flags,  		      unsigned long desc)  { -	resource_size_t end = start + size - 1; +	struct resource res;  	int type = 0; int other = 0;  	struct resource *p; +	res.start = start; +	res.end = start + size - 1; +  	read_lock(&resource_lock);  	for (p = iomem_resource.child; p ; p = p->sibling) {  		bool is_type = (((p->flags & flags) == flags) &&  				((desc == IORES_DESC_NONE) ||  				 (desc == p->desc))); -		if (start >= p->start && start <= p->end) -			is_type ? type++ : other++; -		if (end >= p->start && end <= p->end) -			is_type ? type++ : other++; -		if (p->start >= start && p->end <= end) +		if (resource_overlaps(p, &res))  			is_type ? 
type++ : other++;  	}  	read_unlock(&resource_lock); diff --git a/kernel/rseq.c b/kernel/rseq.c index 25e9a7b60eba..9424ee90589e 100644 --- a/kernel/rseq.c +++ b/kernel/rseq.c @@ -254,8 +254,7 @@ static int rseq_ip_fixup(struct pt_regs *regs)   * - signal delivery,   * and return to user-space.   * - * This is how we can ensure that the entire rseq critical section, - * consisting of both the C part and the assembly instruction sequence, + * This is how we can ensure that the entire rseq critical section   * will issue the commit instruction only if executed atomically with   * respect to other threads scheduled on the same CPU, and with respect   * to signal handlers. @@ -314,7 +313,7 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len,  		/* Unregister rseq for current thread. */  		if (current->rseq != rseq || !current->rseq)  			return -EINVAL; -		if (current->rseq_len != rseq_len) +		if (rseq_len != sizeof(*rseq))  			return -EINVAL;  		if (current->rseq_sig != sig)  			return -EPERM; @@ -322,7 +321,6 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len,  		if (ret)  			return ret;  		current->rseq = NULL; -		current->rseq_len = 0;  		current->rseq_sig = 0;  		return 0;  	} @@ -336,7 +334,7 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len,  		 * the provided address differs from the prior  		 * one.  		 */ -		if (current->rseq != rseq || current->rseq_len != rseq_len) +		if (current->rseq != rseq || rseq_len != sizeof(*rseq))  			return -EINVAL;  		if (current->rseq_sig != sig)  			return -EPERM; @@ -354,7 +352,6 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len,  	if (!access_ok(rseq, rseq_len))  		return -EFAULT;  	current->rseq = rseq; -	current->rseq_len = rseq_len;  	current->rseq_sig = sig;  	/*  	 * If rseq was previously inactive, and has just been diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c index e3e3b979f9bd..1152259a4ca0 100644 --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only  /*   * sched_clock() for unstable CPU clocks   * diff --git a/kernel/sched/core.c b/kernel/sched/core.c index ead464a0f2e5..874c427742a9 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only  /*   *  kernel/sched/core.c   * @@ -792,10 +793,14 @@ void activate_task(struct rq *rq, struct task_struct *p, int flags)  		rq->nr_uninterruptible--;  	enqueue_task(rq, p, flags); + +	p->on_rq = TASK_ON_RQ_QUEUED;  }  void deactivate_task(struct rq *rq, struct task_struct *p, int flags)  { +	p->on_rq = (flags & DEQUEUE_SLEEP) ? 0 : TASK_ON_RQ_MIGRATING; +  	if (task_contributes_to_load(p))  		rq->nr_uninterruptible++; @@ -920,7 +925,7 @@ static inline bool is_per_cpu_kthread(struct task_struct *p)  }  /* - * Per-CPU kthreads are allowed to run on !actie && online CPUs, see + * Per-CPU kthreads are allowed to run on !active && online CPUs, see   * __set_cpus_allowed_ptr() and select_fallback_rq().   */  static inline bool is_cpu_allowed(struct task_struct *p, int cpu) @@ -1151,7 +1156,6 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,  		/* Need help from migration thread: drop lock and wait. 
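
region_intersects() above now reduces its three positional tests to a single resource_overlaps() call. The predicate amounts to the usual closed-interval overlap test; a minimal restatement with invented names:

#include <stdbool.h>
#include <stdio.h>

struct range { unsigned long long start, end; };	/* end is inclusive */

/* Closed intervals [a.start, a.end] and [b.start, b.end] overlap iff
 * each one starts no later than the other one ends. */
static bool overlaps(const struct range *a, const struct range *b)
{
	return a->start <= b->end && b->start <= a->end;
}

int main(void)
{
	struct range res  = { 0x1000, 0x1fff };			/* a registered region */
	struct range req  = { 0x1800, 0x1800 + 0x1000 - 1 };	/* start + size - 1    */
	struct range away = { 0x4000, 0x4fff };

	printf("%d %d\n", overlaps(&res, &req), overlaps(&res, &away));	/* 1 0 */
	return 0;
}
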
*/  		task_rq_unlock(rq, p, &rf);  		stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); -		tlb_migrate_finish(p->mm);  		return 0;  	} else if (task_on_rq_queued(p)) {  		/* @@ -1237,11 +1241,9 @@ static void __migrate_swap_task(struct task_struct *p, int cpu)  		rq_pin_lock(src_rq, &srf);  		rq_pin_lock(dst_rq, &drf); -		p->on_rq = TASK_ON_RQ_MIGRATING;  		deactivate_task(src_rq, p, 0);  		set_task_cpu(p, cpu);  		activate_task(dst_rq, p, 0); -		p->on_rq = TASK_ON_RQ_QUEUED;  		check_preempt_curr(dst_rq, p, 0);  		rq_unpin_lock(dst_rq, &drf); @@ -1681,16 +1683,6 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags)  		__schedstat_inc(p->se.statistics.nr_wakeups_sync);  } -static inline void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags) -{ -	activate_task(rq, p, en_flags); -	p->on_rq = TASK_ON_RQ_QUEUED; - -	/* If a worker is waking up, notify the workqueue: */ -	if (p->flags & PF_WQ_WORKER) -		wq_worker_waking_up(p, cpu_of(rq)); -} -  /*   * Mark the task runnable and perform wakeup-preemption.   */ @@ -1742,7 +1734,7 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags,  		en_flags |= ENQUEUE_MIGRATED;  #endif -	ttwu_activate(rq, p, en_flags); +	activate_task(rq, p, en_flags);  	ttwu_do_wakeup(rq, p, wake_flags, rf);  } @@ -2107,56 +2099,6 @@ out:  }  /** - * try_to_wake_up_local - try to wake up a local task with rq lock held - * @p: the thread to be awakened - * @rf: request-queue flags for pinning - * - * Put @p on the run-queue if it's not already there. The caller must - * ensure that this_rq() is locked, @p is bound to this_rq() and not - * the current task. - */ -static void try_to_wake_up_local(struct task_struct *p, struct rq_flags *rf) -{ -	struct rq *rq = task_rq(p); - -	if (WARN_ON_ONCE(rq != this_rq()) || -	    WARN_ON_ONCE(p == current)) -		return; - -	lockdep_assert_held(&rq->lock); - -	if (!raw_spin_trylock(&p->pi_lock)) { -		/* -		 * This is OK, because current is on_cpu, which avoids it being -		 * picked for load-balance and preemption/IRQs are still -		 * disabled avoiding further scheduler activity on it and we've -		 * not yet picked a replacement task. -		 */ -		rq_unlock(rq, rf); -		raw_spin_lock(&p->pi_lock); -		rq_relock(rq, rf); -	} - -	if (!(p->state & TASK_NORMAL)) -		goto out; - -	trace_sched_waking(p); - -	if (!task_on_rq_queued(p)) { -		if (p->in_iowait) { -			delayacct_blkio_end(p); -			atomic_dec(&rq->nr_iowait); -		} -		ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_NOCLOCK); -	} - -	ttwu_do_wakeup(rq, p, 0, rf); -	ttwu_stat(p, smp_processor_id(), 0); -out: -	raw_spin_unlock(&p->pi_lock); -} - -/**   * wake_up_process - Wake up a specific process   * @p: The process to be woken up.   * @@ -2467,7 +2409,6 @@ void wake_up_new_task(struct task_struct *p)  	post_init_entity_util_avg(p);  	activate_task(rq, p, ENQUEUE_NOCLOCK); -	p->on_rq = TASK_ON_RQ_QUEUED;  	trace_sched_wakeup_new(p);  	check_preempt_curr(rq, p, WF_FORK);  #ifdef CONFIG_SMP @@ -3466,25 +3407,11 @@ static void __sched notrace __schedule(bool preempt)  			prev->state = TASK_RUNNING;  		} else {  			deactivate_task(rq, prev, DEQUEUE_SLEEP | DEQUEUE_NOCLOCK); -			prev->on_rq = 0;  			if (prev->in_iowait) {  				atomic_inc(&rq->nr_iowait);  				delayacct_blkio_start();  			} - -			/* -			 * If a worker went to sleep, notify and ask workqueue -			 * whether it wants to wake up a task to maintain -			 * concurrency. 
-			 */ -			if (prev->flags & PF_WQ_WORKER) { -				struct task_struct *to_wakeup; - -				to_wakeup = wq_worker_sleeping(prev); -				if (to_wakeup) -					try_to_wake_up_local(to_wakeup, &rf); -			}  		}  		switch_count = &prev->nvcsw;  	} @@ -3544,6 +3471,20 @@ static inline void sched_submit_work(struct task_struct *tsk)  {  	if (!tsk->state || tsk_is_pi_blocked(tsk))  		return; + +	/* +	 * If a worker went to sleep, notify and ask workqueue whether +	 * it wants to wake up a task to maintain concurrency. +	 * As this function is called inside the schedule() context, +	 * we disable preemption to avoid it calling schedule() again +	 * in the possible wakeup of a kworker. +	 */ +	if (tsk->flags & PF_WQ_WORKER) { +		preempt_disable(); +		wq_worker_sleeping(tsk); +		preempt_enable_no_resched(); +	} +  	/*  	 * If we are going to sleep and we have plugged IO queued,  	 * make sure to submit it to avoid deadlocks. @@ -3552,6 +3493,12 @@ static inline void sched_submit_work(struct task_struct *tsk)  		blk_schedule_flush_plug(tsk);  } +static void sched_update_worker(struct task_struct *tsk) +{ +	if (tsk->flags & PF_WQ_WORKER) +		wq_worker_running(tsk); +} +  asmlinkage __visible void __sched schedule(void)  {  	struct task_struct *tsk = current; @@ -3562,6 +3509,7 @@ asmlinkage __visible void __sched schedule(void)  		__schedule(false);  		sched_preempt_enable_no_resched();  	} while (need_resched()); +	sched_update_worker(tsk);  }  EXPORT_SYMBOL(schedule); @@ -5918,7 +5866,7 @@ void __init sched_init_smp(void)  static int __init migration_init(void)  { -	sched_rq_cpu_starting(smp_processor_id()); +	sched_cpu_starting(smp_processor_id());  	return 0;  }  early_initcall(migration_init); @@ -6559,6 +6507,8 @@ static void cpu_cgroup_attach(struct cgroup_taskset *tset)  static int cpu_shares_write_u64(struct cgroup_subsys_state *css,  				struct cftype *cftype, u64 shareval)  { +	if (shareval > scale_load_down(ULONG_MAX)) +		shareval = MAX_SHARES;  	return sched_group_set_shares(css_tg(css), scale_load(shareval));  } @@ -6574,7 +6524,7 @@ static u64 cpu_shares_read_u64(struct cgroup_subsys_state *css,  static DEFINE_MUTEX(cfs_constraints_mutex);  const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC; /* 1s */ -const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */ +static const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */  static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime); @@ -6654,20 +6604,22 @@ out_unlock:  	return ret;  } -int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us) +static int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us)  {  	u64 quota, period;  	period = ktime_to_ns(tg->cfs_bandwidth.period);  	if (cfs_quota_us < 0)  		quota = RUNTIME_INF; -	else +	else if ((u64)cfs_quota_us <= U64_MAX / NSEC_PER_USEC)  		quota = (u64)cfs_quota_us * NSEC_PER_USEC; +	else +		return -EINVAL;  	return tg_set_cfs_bandwidth(tg, period, quota);  } -long tg_get_cfs_quota(struct task_group *tg) +static long tg_get_cfs_quota(struct task_group *tg)  {  	u64 quota_us; @@ -6680,17 +6632,20 @@ long tg_get_cfs_quota(struct task_group *tg)  	return quota_us;  } -int tg_set_cfs_period(struct task_group *tg, long cfs_period_us) +static int tg_set_cfs_period(struct task_group *tg, long cfs_period_us)  {  	u64 quota, period; +	if ((u64)cfs_period_us > U64_MAX / NSEC_PER_USEC) +		return -EINVAL; +  	period = (u64)cfs_period_us * NSEC_PER_USEC;  	quota = tg->cfs_bandwidth.quota;  	return tg_set_cfs_bandwidth(tg, period, quota);  } -long 
tg_get_cfs_period(struct task_group *tg) +static long tg_get_cfs_period(struct task_group *tg)  {  	u64 cfs_period_us; @@ -6998,7 +6953,7 @@ static int __maybe_unused cpu_period_quota_parse(char *buf,  {  	char tok[21];	/* U64_MAX */ -	if (!sscanf(buf, "%s %llu", tok, periodp)) +	if (sscanf(buf, "%20s %llu", tok, periodp) < 1)  		return -EINVAL;  	*periodp *= NSEC_PER_USEC; diff --git a/kernel/sched/cpufreq.c b/kernel/sched/cpufreq.c index 835671f0f917..b5dcd1d83c7f 100644 --- a/kernel/sched/cpufreq.c +++ b/kernel/sched/cpufreq.c @@ -7,7 +7,7 @@   */  #include "sched.h" -DEFINE_PER_CPU(struct update_util_data *, cpufreq_update_util_data); +DEFINE_PER_CPU(struct update_util_data __rcu *, cpufreq_update_util_data);  /**   * cpufreq_add_update_util_hook - Populate the CPU's update_util_data pointer. diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 2efe629425be..962cf343f798 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -13,6 +13,8 @@  #include <linux/sched/cpufreq.h>  #include <trace/events/power.h> +#define IOWAIT_BOOST_MIN	(SCHED_CAPACITY_SCALE / 8) +  struct sugov_tunables {  	struct gov_attr_set	attr_set;  	unsigned int		rate_limit_us; @@ -48,7 +50,6 @@ struct sugov_cpu {  	bool			iowait_boost_pending;  	unsigned int		iowait_boost; -	unsigned int		iowait_boost_max;  	u64			last_update;  	unsigned long		bw_dl; @@ -291,8 +292,8 @@ static unsigned long sugov_get_util(struct sugov_cpu *sg_cpu)   *   * The IO wait boost of a task is disabled after a tick since the last update   * of a CPU. If a new IO wait boost is requested after more then a tick, then - * we enable the boost starting from the minimum frequency, which improves - * energy efficiency by ignoring sporadic wakeups from IO. + * we enable the boost starting from IOWAIT_BOOST_MIN, which improves energy + * efficiency by ignoring sporadic wakeups from IO.   */  static bool sugov_iowait_reset(struct sugov_cpu *sg_cpu, u64 time,  			       bool set_iowait_boost) @@ -303,8 +304,7 @@ static bool sugov_iowait_reset(struct sugov_cpu *sg_cpu, u64 time,  	if (delta_ns <= TICK_NSEC)  		return false; -	sg_cpu->iowait_boost = set_iowait_boost -		? sg_cpu->sg_policy->policy->min : 0; +	sg_cpu->iowait_boost = set_iowait_boost ? IOWAIT_BOOST_MIN : 0;  	sg_cpu->iowait_boost_pending = set_iowait_boost;  	return true; @@ -318,8 +318,9 @@ static bool sugov_iowait_reset(struct sugov_cpu *sg_cpu, u64 time,   *   * Each time a task wakes up after an IO operation, the CPU utilization can be   * boosted to a certain utilization which doubles at each "frequent and - * successive" wakeup from IO, ranging from the utilization of the minimum - * OPP to the utilization of the maximum OPP. + * successive" wakeup from IO, ranging from IOWAIT_BOOST_MIN to the utilization + * of the maximum OPP. + *   * To keep doubling, an IO boost has to be requested at least once per tick,   * otherwise we restart from the utilization of the minimum OPP.   
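
The cpu.cfs_quota_us/cpu.cfs_period_us checks added above reject any microsecond value whose conversion to nanoseconds would overflow a u64 by comparing against U64_MAX / NSEC_PER_USEC before multiplying. A small standalone version of that guard (the helper name is made up for the sketch):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define NSEC_PER_USEC	1000ULL

/* Refuse the conversion when us * 1000 would not fit in 64 bits: the same
 * pre-multiplication guard tg_set_cfs_quota()/tg_set_cfs_period() now use. */
static bool us_to_ns(uint64_t us, uint64_t *ns)
{
	if (us > UINT64_MAX / NSEC_PER_USEC)
		return false;
	*ns = us * NSEC_PER_USEC;
	return true;
}

int main(void)
{
	uint64_t ns;

	printf("%d\n", us_to_ns(100000, &ns));		/* 1: 100ms fits       */
	printf("%d\n", us_to_ns(UINT64_MAX / 2, &ns));	/* 0: would overflow   */
	return 0;
}
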
*/ @@ -344,14 +345,13 @@ static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, u64 time,  	/* Double the boost at each request */  	if (sg_cpu->iowait_boost) { -		sg_cpu->iowait_boost <<= 1; -		if (sg_cpu->iowait_boost > sg_cpu->iowait_boost_max) -			sg_cpu->iowait_boost = sg_cpu->iowait_boost_max; +		sg_cpu->iowait_boost = +			min_t(unsigned int, sg_cpu->iowait_boost << 1, SCHED_CAPACITY_SCALE);  		return;  	}  	/* First wakeup after IO: start with minimum boost */ -	sg_cpu->iowait_boost = sg_cpu->sg_policy->policy->min; +	sg_cpu->iowait_boost = IOWAIT_BOOST_MIN;  }  /** @@ -373,47 +373,38 @@ static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, u64 time,   * This mechanism is designed to boost high frequently IO waiting tasks, while   * being more conservative on tasks which does sporadic IO operations.   */ -static void sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time, -			       unsigned long *util, unsigned long *max) +static unsigned long sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time, +					unsigned long util, unsigned long max)  { -	unsigned int boost_util, boost_max; +	unsigned long boost;  	/* No boost currently required */  	if (!sg_cpu->iowait_boost) -		return; +		return util;  	/* Reset boost if the CPU appears to have been idle enough */  	if (sugov_iowait_reset(sg_cpu, time, false)) -		return; +		return util; -	/* -	 * An IO waiting task has just woken up: -	 * allow to further double the boost value -	 */ -	if (sg_cpu->iowait_boost_pending) { -		sg_cpu->iowait_boost_pending = false; -	} else { +	if (!sg_cpu->iowait_boost_pending) {  		/* -		 * Otherwise: reduce the boost value and disable it when we -		 * reach the minimum. +		 * No boost pending; reduce the boost value.  		 */  		sg_cpu->iowait_boost >>= 1; -		if (sg_cpu->iowait_boost < sg_cpu->sg_policy->policy->min) { +		if (sg_cpu->iowait_boost < IOWAIT_BOOST_MIN) {  			sg_cpu->iowait_boost = 0; -			return; +			return util;  		}  	} +	sg_cpu->iowait_boost_pending = false; +  	/* -	 * Apply the current boost value: a CPU is boosted only if its current -	 * utilization is smaller then the current IO boost level. +	 * @util is already in capacity scale; convert iowait_boost +	 * into the same scale so we can compare.  	 
*/ -	boost_util = sg_cpu->iowait_boost; -	boost_max = sg_cpu->iowait_boost_max; -	if (*util * boost_max < *max * boost_util) { -		*util = boost_util; -		*max = boost_max; -	} +	boost = (sg_cpu->iowait_boost * max) >> SCHED_CAPACITY_SHIFT; +	return max(boost, util);  }  #ifdef CONFIG_NO_HZ_COMMON @@ -460,7 +451,7 @@ static void sugov_update_single(struct update_util_data *hook, u64 time,  	util = sugov_get_util(sg_cpu);  	max = sg_cpu->max; -	sugov_iowait_apply(sg_cpu, time, &util, &max); +	util = sugov_iowait_apply(sg_cpu, time, util, max);  	next_f = get_next_freq(sg_policy, util, max);  	/*  	 * Do not reduce the frequency if the CPU has not been idle @@ -500,7 +491,7 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time)  		j_util = sugov_get_util(j_sg_cpu);  		j_max = j_sg_cpu->max; -		sugov_iowait_apply(j_sg_cpu, time, &j_util, &j_max); +		j_util = sugov_iowait_apply(j_sg_cpu, time, j_util, j_max);  		if (j_util * max > j_max * util) {  			util = j_util; @@ -609,13 +600,14 @@ rate_limit_us_store(struct gov_attr_set *attr_set, const char *buf, size_t count  static struct governor_attr rate_limit_us = __ATTR_RW(rate_limit_us); -static struct attribute *sugov_attributes[] = { +static struct attribute *sugov_attrs[] = {  	&rate_limit_us.attr,  	NULL  }; +ATTRIBUTE_GROUPS(sugov);  static struct kobj_type sugov_tunables_ktype = { -	.default_attrs = sugov_attributes, +	.default_groups = sugov_groups,  	.sysfs_ops = &governor_sysfs_ops,  }; @@ -782,6 +774,7 @@ out:  	return 0;  fail: +	kobject_put(&tunables->attr_set.kobj);  	policy->governor_data = NULL;  	sugov_tunables_free(tunables); @@ -837,7 +830,6 @@ static int sugov_start(struct cpufreq_policy *policy)  		memset(sg_cpu, 0, sizeof(*sg_cpu));  		sg_cpu->cpu			= cpu;  		sg_cpu->sg_policy		= sg_policy; -		sg_cpu->iowait_boost_max	= policy->cpuinfo.max_freq;  	}  	for_each_cpu(cpu, policy->cpus) { diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index ba4a143bdcf3..2305ce89a26c 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only  /*   * Simple CPU accounting cgroup controller   */ diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 6a73e41a2016..43901fa3f269 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -252,7 +252,6 @@ static void task_non_contending(struct task_struct *p)  	if (dl_entity_is_special(dl_se))  		return; -	WARN_ON(hrtimer_active(&dl_se->inactive_timer));  	WARN_ON(dl_se->dl_non_contending);  	zerolag_time = dl_se->deadline - @@ -269,7 +268,7 @@ static void task_non_contending(struct task_struct *p)  	 * If the "0-lag time" already passed, decrease the active  	 * utilization now, instead of starting a timer  	 */ -	if (zerolag_time < 0) { +	if ((zerolag_time < 0) || hrtimer_active(&dl_se->inactive_timer)) {  		if (dl_task(p))  			sub_running_bw(dl_se, dl_rq);  		if (!dl_task(p) || p->state == TASK_DEAD) { diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 8039d62ae36e..678bfb9bd87f 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -702,7 +702,7 @@ do {									\  static const char *sched_tunable_scaling_names[] = {  	"none", -	"logaritmic", +	"logarithmic",  	"linear"  }; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index ea74d43924b2..f35930f5e528 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2007,6 +2007,10 @@ static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period)  	if (p->last_task_numa_placement) {  		delta 
= runtime - p->last_sum_exec_runtime;  		*period = now - p->last_task_numa_placement; + +		/* Avoid time going backwards, prevent potential divide error: */ +		if (unlikely((s64)*period < 0)) +			*period = 0;  	} else {  		delta = p->se.avg.load_sum;  		*period = LOAD_AVG_MAX; @@ -2593,7 +2597,7 @@ out:  /*   * Drive the periodic memory faults..   */ -void task_tick_numa(struct rq *rq, struct task_struct *curr) +static void task_tick_numa(struct rq *rq, struct task_struct *curr)  {  	struct callback_head *work = &curr->numa_work;  	u64 period, now; @@ -3567,7 +3571,7 @@ static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)   * Synchronize entity load avg of dequeued entity without locking   * the previous rq.   */ -void sync_entity_load_avg(struct sched_entity *se) +static void sync_entity_load_avg(struct sched_entity *se)  {  	struct cfs_rq *cfs_rq = cfs_rq_of(se);  	u64 last_update_time; @@ -3580,7 +3584,7 @@ void sync_entity_load_avg(struct sched_entity *se)   * Task first catches up with cfs_rq, and then subtract   * itself from the cfs_rq (task must be off the queue now).   */ -void remove_entity_load_avg(struct sched_entity *se) +static void remove_entity_load_avg(struct sched_entity *se)  {  	struct cfs_rq *cfs_rq = cfs_rq_of(se);  	unsigned long flags; @@ -4885,6 +4889,8 @@ static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)  	return HRTIMER_NORESTART;  } +extern const u64 max_cfs_quota_period; +  static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)  {  	struct cfs_bandwidth *cfs_b = @@ -4892,6 +4898,7 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)  	unsigned long flags;  	int overrun;  	int idle = 0; +	int count = 0;  	raw_spin_lock_irqsave(&cfs_b->lock, flags);  	for (;;) { @@ -4899,6 +4906,28 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)  		if (!overrun)  			break; +		if (++count > 3) { +			u64 new, old = ktime_to_ns(cfs_b->period); + +			new = (old * 147) / 128; /* ~115% */ +			new = min(new, max_cfs_quota_period); + +			cfs_b->period = ns_to_ktime(new); + +			/* since max is 1s, this is limited to 1e9^2, which fits in u64 */ +			cfs_b->quota *= new; +			cfs_b->quota = div64_u64(cfs_b->quota, old); + +			pr_warn_ratelimited( +	"cfs_period_timer[cpu%d]: period too short, scaling up (new cfs_period_us %lld, cfs_quota_us = %lld)\n", +				smp_processor_id(), +				div_u64(new, NSEC_PER_USEC), +				div_u64(cfs_b->quota, NSEC_PER_USEC)); + +			/* reset count so we don't come right back in here */ +			count = 0; +		} +  		idle = do_sched_cfs_period_timer(cfs_b, overrun, flags);  	}  	if (idle) @@ -5116,7 +5145,6 @@ static inline void hrtick_update(struct rq *rq)  #ifdef CONFIG_SMP  static inline unsigned long cpu_util(int cpu); -static unsigned long capacity_of(int cpu);  static inline bool cpu_overutilized(int cpu)  { @@ -7492,7 +7520,6 @@ static void detach_task(struct task_struct *p, struct lb_env *env)  {  	lockdep_assert_held(&env->src_rq->lock); -	p->on_rq = TASK_ON_RQ_MIGRATING;  	deactivate_task(env->src_rq, p, DEQUEUE_NOCLOCK);  	set_task_cpu(p, env->dst_cpu);  } @@ -7628,7 +7655,6 @@ static void attach_task(struct rq *rq, struct task_struct *p)  	BUG_ON(task_rq(p) != rq);  	activate_task(rq, p, ENQUEUE_NOCLOCK); -	p->on_rq = TASK_ON_RQ_QUEUED;  	check_preempt_curr(rq, p, 0);  } @@ -7784,10 +7810,10 @@ static void update_cfs_rq_h_load(struct cfs_rq *cfs_rq)  	if (cfs_rq->last_h_load_update == now)  		return; -	cfs_rq->h_load_next = NULL; +	
WRITE_ONCE(cfs_rq->h_load_next, NULL);  	for_each_sched_entity(se) {  		cfs_rq = cfs_rq_of(se); -		cfs_rq->h_load_next = se; +		WRITE_ONCE(cfs_rq->h_load_next, se);  		if (cfs_rq->last_h_load_update == now)  			break;  	} @@ -7797,7 +7823,7 @@ static void update_cfs_rq_h_load(struct cfs_rq *cfs_rq)  		cfs_rq->last_h_load_update = now;  	} -	while ((se = cfs_rq->h_load_next) != NULL) { +	while ((se = READ_ONCE(cfs_rq->h_load_next)) != NULL) {  		load = cfs_rq->h_load;  		load = div64_ul(load * se->avg.load_avg,  			cfs_rq_load_avg(cfs_rq) + 1); @@ -8060,6 +8086,18 @@ check_cpu_capacity(struct rq *rq, struct sched_domain *sd)  }  /* + * Check whether a rq has a misfit task and if it looks like we can actually + * help that task: we can migrate the task to a CPU of higher capacity, or + * the task's current CPU is heavily pressured. + */ +static inline int check_misfit_status(struct rq *rq, struct sched_domain *sd) +{ +	return rq->misfit_task_load && +		(rq->cpu_capacity_orig < rq->rd->max_cpu_capacity || +		 check_cpu_capacity(rq, sd)); +} + +/*   * Group imbalance indicates (and tries to solve) the problem where balancing   * groups is inadequate due to ->cpus_allowed constraints.   * @@ -9510,22 +9548,26 @@ static inline int on_null_domain(struct rq *rq)   * - When one of the busy CPUs notice that there may be an idle rebalancing   *   needed, they will kick the idle load balancer, which then does idle   *   load balancing for all the idle CPUs. + * - HK_FLAG_MISC CPUs are used for this task, because HK_FLAG_SCHED not set + *   anywhere yet.   */  static inline int find_new_ilb(void)  { -	int ilb = cpumask_first(nohz.idle_cpus_mask); +	int ilb; -	if (ilb < nr_cpu_ids && idle_cpu(ilb)) -		return ilb; +	for_each_cpu_and(ilb, nohz.idle_cpus_mask, +			      housekeeping_cpumask(HK_FLAG_MISC)) { +		if (idle_cpu(ilb)) +			return ilb; +	}  	return nr_cpu_ids;  }  /* - * Kick a CPU to do the nohz balancing, if it is time for it. We pick the - * nohz_load_balancer CPU (if there is one) otherwise fallback to any idle - * CPU (if there is one). + * Kick a CPU to do the nohz balancing, if it is time for it. We pick any + * idle CPU in the HK_FLAG_MISC housekeeping set (if there is one).   */  static void kick_ilb(unsigned int flags)  { @@ -9586,35 +9628,21 @@ static void nohz_balancer_kick(struct rq *rq)  	if (time_before(now, nohz.next_balance))  		goto out; -	if (rq->nr_running >= 2 || rq->misfit_task_load) { +	if (rq->nr_running >= 2) {  		flags = NOHZ_KICK_MASK;  		goto out;  	}  	rcu_read_lock(); -	sds = rcu_dereference(per_cpu(sd_llc_shared, cpu)); -	if (sds) { -		/* -		 * If there is an imbalance between LLC domains (IOW we could -		 * increase the overall cache use), we need some less-loaded LLC -		 * domain to pull some load. Likewise, we may need to spread -		 * load within the current LLC domain (e.g. packed SMT cores but -		 * other CPUs are idle). We can't really know from here how busy -		 * the others are - so just get a nohz balance going if it looks -		 * like this LLC domain has tasks we could move. -		 */ -		nr_busy = atomic_read(&sds->nr_busy_cpus); -		if (nr_busy > 1) { -			flags = NOHZ_KICK_MASK; -			goto unlock; -		} - -	}  	sd = rcu_dereference(rq->sd);  	if (sd) { -		if ((rq->cfs.h_nr_running >= 1) && -		    check_cpu_capacity(rq, sd)) { +		/* +		 * If there's a CFS task and the current CPU has reduced +		 * capacity; kick the ILB to see if there's a better CPU to run +		 * on. 
+		 */ +		if (rq->cfs.h_nr_running >= 1 && check_cpu_capacity(rq, sd)) {  			flags = NOHZ_KICK_MASK;  			goto unlock;  		} @@ -9622,6 +9650,11 @@ static void nohz_balancer_kick(struct rq *rq)  	sd = rcu_dereference(per_cpu(sd_asym_packing, cpu));  	if (sd) { +		/* +		 * When ASYM_PACKING; see if there's a more preferred CPU +		 * currently idle; in which case, kick the ILB to move tasks +		 * around. +		 */  		for_each_cpu_and(i, sched_domain_span(sd), nohz.idle_cpus_mask) {  			if (sched_asym_prefer(i, cpu)) {  				flags = NOHZ_KICK_MASK; @@ -9629,6 +9662,45 @@ static void nohz_balancer_kick(struct rq *rq)  			}  		}  	} + +	sd = rcu_dereference(per_cpu(sd_asym_cpucapacity, cpu)); +	if (sd) { +		/* +		 * When ASYM_CPUCAPACITY; see if there's a higher capacity CPU +		 * to run the misfit task on. +		 */ +		if (check_misfit_status(rq, sd)) { +			flags = NOHZ_KICK_MASK; +			goto unlock; +		} + +		/* +		 * For asymmetric systems, we do not want to nicely balance +		 * cache use, instead we want to embrace asymmetry and only +		 * ensure tasks have enough CPU capacity. +		 * +		 * Skip the LLC logic because it's not relevant in that case. +		 */ +		goto unlock; +	} + +	sds = rcu_dereference(per_cpu(sd_llc_shared, cpu)); +	if (sds) { +		/* +		 * If there is an imbalance between LLC domains (IOW we could +		 * increase the overall cache use), we need some less-loaded LLC +		 * domain to pull some load. Likewise, we may need to spread +		 * load within the current LLC domain (e.g. packed SMT cores but +		 * other CPUs are idle). We can't really know from here how busy +		 * the others are - so just get a nohz balance going if it looks +		 * like this LLC domain has tasks we could move. +		 */ +		nr_busy = atomic_read(&sds->nr_busy_cpus); +		if (nr_busy > 1) { +			flags = NOHZ_KICK_MASK; +			goto unlock; +		} +	}  unlock:  	rcu_read_unlock();  out: diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index f5516bae0c1b..80940939b733 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only  /*   * Generic entry points for the idle threads and   * implementation of the idle task scheduling class. diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c index b02d148e7672..123ea07a3f3b 100644 --- a/kernel/sched/isolation.c +++ b/kernel/sched/isolation.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only  /*   *  Housekeeping management. Manage the targets for routine code that can run on   *  any CPU: unbound workqueues, timers, kthreads and any offloadable work. 
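The fair.c hunks above restrict idle-load-balancer (ILB) selection to CPUs in the HK_FLAG_MISC housekeeping set, so nohz_full CPUs are never kicked to run balancing work on behalf of others. Below is a minimal, self-contained userspace sketch of that selection policy only -- it is not kernel code; the boolean-array masks, pick_ilb() and cpu_is_idle() are illustrative stand-ins for the kernel's cpumask API, for_each_cpu_and() and idle_cpu().

#include <stdbool.h>
#include <stdio.h>

#define NR_CPUS 8

/* Stand-in for idle_cpu(): pretend CPU 2 just picked up work. */
static bool cpu_is_idle(int cpu)
{
	return cpu != 2;
}

/*
 * Return the first CPU that is marked idle, belongs to the housekeeping
 * mask and is still idle when re-checked; NR_CPUS means "no ILB found",
 * mirroring the nr_cpu_ids return of find_new_ilb().
 */
static int pick_ilb(const bool *idle_mask, const bool *housekeeping_mask,
		    int nr_cpus)
{
	int cpu;

	for (cpu = 0; cpu < nr_cpus; cpu++) {
		if (idle_mask[cpu] && housekeeping_mask[cpu] &&
		    cpu_is_idle(cpu))
			return cpu;
	}
	return nr_cpus;
}

int main(void)
{
	bool idle[NR_CPUS]         = { 0, 0, 1, 1, 0, 0, 1, 0 };
	bool housekeeping[NR_CPUS] = { 1, 1, 0, 1, 1, 1, 1, 1 };

	/* CPU 2 is idle but not housekeeping, so CPU 3 is chosen. */
	printf("ilb = %d\n", pick_ilb(idle, housekeeping, NR_CPUS));
	return 0;
}

As in the reworked find_new_ilb(), each candidate is re-checked for idleness because a CPU can leave the nohz idle mask between being recorded there and being selected.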
@@ -65,6 +66,7 @@ void __init housekeeping_init(void)  static int __init housekeeping_setup(char *str, enum hk_flags flags)  {  	cpumask_var_t non_housekeeping_mask; +	cpumask_var_t tmp;  	int err;  	alloc_bootmem_cpumask_var(&non_housekeeping_mask); @@ -75,16 +77,23 @@ static int __init housekeeping_setup(char *str, enum hk_flags flags)  		return 0;  	} +	alloc_bootmem_cpumask_var(&tmp);  	if (!housekeeping_flags) {  		alloc_bootmem_cpumask_var(&housekeeping_mask);  		cpumask_andnot(housekeeping_mask,  			       cpu_possible_mask, non_housekeeping_mask); -		if (cpumask_empty(housekeeping_mask)) + +		cpumask_andnot(tmp, cpu_present_mask, non_housekeeping_mask); +		if (cpumask_empty(tmp)) { +			pr_warn("Housekeeping: must include one present CPU, " +				"using boot CPU:%d\n", smp_processor_id());  			__cpumask_set_cpu(smp_processor_id(), housekeeping_mask); +			__cpumask_clear_cpu(smp_processor_id(), non_housekeeping_mask); +		}  	} else { -		cpumask_var_t tmp; - -		alloc_bootmem_cpumask_var(&tmp); +		cpumask_andnot(tmp, cpu_present_mask, non_housekeeping_mask); +		if (cpumask_empty(tmp)) +			__cpumask_clear_cpu(smp_processor_id(), non_housekeeping_mask);  		cpumask_andnot(tmp, cpu_possible_mask, non_housekeeping_mask);  		if (!cpumask_equal(tmp, housekeeping_mask)) {  			pr_warn("Housekeeping: nohz_full= must match isolcpus=\n"); @@ -92,8 +101,8 @@ static int __init housekeeping_setup(char *str, enum hk_flags flags)  			free_bootmem_cpumask_var(non_housekeeping_mask);  			return 0;  		} -		free_bootmem_cpumask_var(tmp);  	} +	free_bootmem_cpumask_var(tmp);  	if ((flags & HK_FLAG_TICK) && !(housekeeping_flags & HK_FLAG_TICK)) {  		if (IS_ENABLED(CONFIG_NO_HZ_FULL)) { diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c index 3cd8a3a795d2..aa8d75804108 100644 --- a/kernel/sched/membarrier.c +++ b/kernel/sched/membarrier.c @@ -1,17 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0-or-later  /*   * Copyright (C) 2010-2017 Mathieu Desnoyers <mathieu.desnoyers@efficios.com>   *   * membarrier system call - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the - * GNU General Public License for more details.   */  #include "sched.h" diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 0e97ca9306ef..7acc632c3b82 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -4,6 +4,9 @@   * Copyright (c) 2018 Facebook, Inc.   * Author: Johannes Weiner <hannes@cmpxchg.org>   * + * Polling support by Suren Baghdasaryan <surenb@google.com> + * Copyright (c) 2018 Google, Inc. + *   * When CPU, memory and IO are contended, tasks experience delays that   * reduce throughput and introduce latencies into the workload. 
Memory   * and IO contention, in addition, can cause a full loss of forward @@ -129,9 +132,13 @@  #include <linux/seq_file.h>  #include <linux/proc_fs.h>  #include <linux/seqlock.h> +#include <linux/uaccess.h>  #include <linux/cgroup.h>  #include <linux/module.h>  #include <linux/sched.h> +#include <linux/ctype.h> +#include <linux/file.h> +#include <linux/poll.h>  #include <linux/psi.h>  #include "sched.h" @@ -140,9 +147,9 @@ static int psi_bug __read_mostly;  DEFINE_STATIC_KEY_FALSE(psi_disabled);  #ifdef CONFIG_PSI_DEFAULT_DISABLED -bool psi_enable; +static bool psi_enable;  #else -bool psi_enable = true; +static bool psi_enable = true;  #endif  static int __init setup_psi(char *str)  { @@ -156,16 +163,21 @@ __setup("psi=", setup_psi);  #define EXP_60s		1981		/* 1/exp(2s/60s) */  #define EXP_300s	2034		/* 1/exp(2s/300s) */ +/* PSI trigger definitions */ +#define WINDOW_MIN_US 500000	/* Min window size is 500ms */ +#define WINDOW_MAX_US 10000000	/* Max window size is 10s */ +#define UPDATES_PER_WINDOW 10	/* 10 updates per window */ +  /* Sampling frequency in nanoseconds */  static u64 psi_period __read_mostly;  /* System-level pressure and stall tracking */  static DEFINE_PER_CPU(struct psi_group_cpu, system_group_pcpu); -static struct psi_group psi_system = { +struct psi_group psi_system = {  	.pcpu = &system_group_pcpu,  }; -static void psi_update_work(struct work_struct *work); +static void psi_avgs_work(struct work_struct *work);  static void group_init(struct psi_group *group)  { @@ -173,9 +185,20 @@ static void group_init(struct psi_group *group)  	for_each_possible_cpu(cpu)  		seqcount_init(&per_cpu_ptr(group->pcpu, cpu)->seq); -	group->next_update = sched_clock() + psi_period; -	INIT_DELAYED_WORK(&group->clock_work, psi_update_work); -	mutex_init(&group->stat_lock); +	group->avg_next_update = sched_clock() + psi_period; +	INIT_DELAYED_WORK(&group->avgs_work, psi_avgs_work); +	mutex_init(&group->avgs_lock); +	/* Init trigger-related members */ +	atomic_set(&group->poll_scheduled, 0); +	mutex_init(&group->trigger_lock); +	INIT_LIST_HEAD(&group->triggers); +	memset(group->nr_triggers, 0, sizeof(group->nr_triggers)); +	group->poll_states = 0; +	group->poll_min_period = U32_MAX; +	memset(group->polling_total, 0, sizeof(group->polling_total)); +	group->polling_next_update = ULLONG_MAX; +	group->polling_until = 0; +	rcu_assign_pointer(group->poll_kworker, NULL);  }  void __init psi_init(void) @@ -210,20 +233,24 @@ static bool test_state(unsigned int *tasks, enum psi_states state)  	}  } -static void get_recent_times(struct psi_group *group, int cpu, u32 *times) +static void get_recent_times(struct psi_group *group, int cpu, +			     enum psi_aggregators aggregator, u32 *times, +			     u32 *pchanged_states)  {  	struct psi_group_cpu *groupc = per_cpu_ptr(group->pcpu, cpu); -	unsigned int tasks[NR_PSI_TASK_COUNTS];  	u64 now, state_start; +	enum psi_states s;  	unsigned int seq; -	int s; +	u32 state_mask; + +	*pchanged_states = 0;  	/* Snapshot a coherent view of the CPU state */  	do {  		seq = read_seqcount_begin(&groupc->seq);  		now = cpu_clock(cpu);  		memcpy(times, groupc->times, sizeof(groupc->times)); -		memcpy(tasks, groupc->tasks, sizeof(groupc->tasks)); +		state_mask = groupc->state_mask;  		state_start = groupc->state_start;  	} while (read_seqcount_retry(&groupc->seq, seq)); @@ -239,13 +266,15 @@ static void get_recent_times(struct psi_group *group, int cpu, u32 *times)  		 * (u32) and our reported pressure close to what's  		 * actually happening.  		 
*/ -		if (test_state(tasks, s)) +		if (state_mask & (1 << s))  			times[s] += now - state_start; -		delta = times[s] - groupc->times_prev[s]; -		groupc->times_prev[s] = times[s]; +		delta = times[s] - groupc->times_prev[aggregator][s]; +		groupc->times_prev[aggregator][s] = times[s];  		times[s] = delta; +		if (delta) +			*pchanged_states |= (1 << s);  	}  } @@ -269,17 +298,16 @@ static void calc_avgs(unsigned long avg[3], int missed_periods,  	avg[2] = calc_load(avg[2], EXP_300s, pct);  } -static bool update_stats(struct psi_group *group) +static void collect_percpu_times(struct psi_group *group, +				 enum psi_aggregators aggregator, +				 u32 *pchanged_states)  {  	u64 deltas[NR_PSI_STATES - 1] = { 0, }; -	unsigned long missed_periods = 0;  	unsigned long nonidle_total = 0; -	u64 now, expires, period; +	u32 changed_states = 0;  	int cpu;  	int s; -	mutex_lock(&group->stat_lock); -  	/*  	 * Collect the per-cpu time buckets and average them into a  	 * single time sample that is normalized to wallclock time. @@ -291,8 +319,11 @@ static bool update_stats(struct psi_group *group)  	for_each_possible_cpu(cpu) {  		u32 times[NR_PSI_STATES];  		u32 nonidle; +		u32 cpu_changed_states; -		get_recent_times(group, cpu, times); +		get_recent_times(group, cpu, aggregator, times, +				&cpu_changed_states); +		changed_states |= cpu_changed_states;  		nonidle = nsecs_to_jiffies(times[PSI_NONIDLE]);  		nonidle_total += nonidle; @@ -315,13 +346,22 @@ static bool update_stats(struct psi_group *group)  	/* total= */  	for (s = 0; s < NR_PSI_STATES - 1; s++) -		group->total[s] += div_u64(deltas[s], max(nonidle_total, 1UL)); +		group->total[aggregator][s] += +				div_u64(deltas[s], max(nonidle_total, 1UL)); + +	if (pchanged_states) +		*pchanged_states = changed_states; +} + +static u64 update_averages(struct psi_group *group, u64 now) +{ +	unsigned long missed_periods = 0; +	u64 expires, period; +	u64 avg_next_update; +	int s;  	/* avgX= */ -	now = sched_clock(); -	expires = group->next_update; -	if (now < expires) -		goto out; +	expires = group->avg_next_update;  	if (now - expires >= psi_period)  		missed_periods = div_u64(now - expires, psi_period); @@ -332,14 +372,14 @@ static bool update_stats(struct psi_group *group)  	 * But the deltas we sample out of the per-cpu buckets above  	 * are based on the actual time elapsing between clock ticks.  	 
*/ -	group->next_update = expires + ((1 + missed_periods) * psi_period); -	period = now - (group->last_update + (missed_periods * psi_period)); -	group->last_update = now; +	avg_next_update = expires + ((1 + missed_periods) * psi_period); +	period = now - (group->avg_last_update + (missed_periods * psi_period)); +	group->avg_last_update = now;  	for (s = 0; s < NR_PSI_STATES - 1; s++) {  		u32 sample; -		sample = group->total[s] - group->total_prev[s]; +		sample = group->total[PSI_AVGS][s] - group->avg_total[s];  		/*  		 * Due to the lockless sampling of the time buckets,  		 * recorded time deltas can slip into the next period, @@ -359,23 +399,30 @@ static bool update_stats(struct psi_group *group)  		 */  		if (sample > period)  			sample = period; -		group->total_prev[s] += sample; +		group->avg_total[s] += sample;  		calc_avgs(group->avg[s], missed_periods, sample, period);  	} -out: -	mutex_unlock(&group->stat_lock); -	return nonidle_total; + +	return avg_next_update;  } -static void psi_update_work(struct work_struct *work) +static void psi_avgs_work(struct work_struct *work)  {  	struct delayed_work *dwork;  	struct psi_group *group; +	u32 changed_states;  	bool nonidle; +	u64 now;  	dwork = to_delayed_work(work); -	group = container_of(dwork, struct psi_group, clock_work); +	group = container_of(dwork, struct psi_group, avgs_work); + +	mutex_lock(&group->avgs_lock); +	now = sched_clock(); + +	collect_percpu_times(group, PSI_AVGS, &changed_states); +	nonidle = changed_states & (1 << PSI_NONIDLE);  	/*  	 * If there is task activity, periodically fold the per-cpu  	 * times and feed samples into the running averages. If things @@ -383,18 +430,196 @@ static void psi_update_work(struct work_struct *work)  	 * Once restarted, we'll catch up the running averages in one  	 * go - see calc_avgs() and missed_periods.  	 */ - -	nonidle = update_stats(group); +	if (now >= group->avg_next_update) +		group->avg_next_update = update_averages(group, now);  	if (nonidle) { -		unsigned long delay = 0; -		u64 now; +		schedule_delayed_work(dwork, nsecs_to_jiffies( +				group->avg_next_update - now) + 1); +	} + +	mutex_unlock(&group->avgs_lock); +} + +/* Trigger tracking window manupulations */ +static void window_reset(struct psi_window *win, u64 now, u64 value, +			 u64 prev_growth) +{ +	win->start_time = now; +	win->start_value = value; +	win->prev_growth = prev_growth; +} + +/* + * PSI growth tracking window update and growth calculation routine. + * + * This approximates a sliding tracking window by interpolating + * partially elapsed windows using historical growth data from the + * previous intervals. This minimizes memory requirements (by not storing + * all the intermediate values in the previous window) and simplifies + * the calculations. It works well because PSI signal changes only in + * positive direction and over relatively small window sizes the growth + * is close to linear. + */ +static u64 window_update(struct psi_window *win, u64 now, u64 value) +{ +	u64 elapsed; +	u64 growth; + +	elapsed = now - win->start_time; +	growth = value - win->start_value; +	/* +	 * After each tracking window passes win->start_value and +	 * win->start_time get reset and win->prev_growth stores +	 * the average per-window growth of the previous window. +	 * win->prev_growth is then used to interpolate additional +	 * growth from the previous window assuming it was linear. 
+	 */ +	if (elapsed > win->size) +		window_reset(win, now, value, growth); +	else { +		u32 remaining; + +		remaining = win->size - elapsed; +		growth += div_u64(win->prev_growth * remaining, win->size); +	} + +	return growth; +} + +static void init_triggers(struct psi_group *group, u64 now) +{ +	struct psi_trigger *t; + +	list_for_each_entry(t, &group->triggers, node) +		window_reset(&t->win, now, +				group->total[PSI_POLL][t->state], 0); +	memcpy(group->polling_total, group->total[PSI_POLL], +		   sizeof(group->polling_total)); +	group->polling_next_update = now + group->poll_min_period; +} + +static u64 update_triggers(struct psi_group *group, u64 now) +{ +	struct psi_trigger *t; +	bool new_stall = false; +	u64 *total = group->total[PSI_POLL]; + +	/* +	 * On subsequent updates, calculate growth deltas and let +	 * watchers know when their specified thresholds are exceeded. +	 */ +	list_for_each_entry(t, &group->triggers, node) { +		u64 growth; + +		/* Check for stall activity */ +		if (group->polling_total[t->state] == total[t->state]) +			continue; + +		/* +		 * Multiple triggers might be looking at the same state, +		 * remember to update group->polling_total[] once we've +		 * been through all of them. Also remember to extend the +		 * polling time if we see new stall activity. +		 */ +		new_stall = true; + +		/* Calculate growth since last update */ +		growth = window_update(&t->win, now, total[t->state]); +		if (growth < t->threshold) +			continue; + +		/* Limit event signaling to once per window */ +		if (now < t->last_event_time + t->win.size) +			continue; + +		/* Generate an event */ +		if (cmpxchg(&t->event, 0, 1) == 0) +			wake_up_interruptible(&t->event_wait); +		t->last_event_time = now; +	} + +	if (new_stall) +		memcpy(group->polling_total, total, +				sizeof(group->polling_total)); + +	return now + group->poll_min_period; +} + +/* + * Schedule polling if it's not already scheduled. It's safe to call even from + * hotpath because even though kthread_queue_delayed_work takes worker->lock + * spinlock that spinlock is never contended due to poll_scheduled atomic + * preventing such competition. 
+ */ +static void psi_schedule_poll_work(struct psi_group *group, unsigned long delay) +{ +	struct kthread_worker *kworker; + +	/* Do not reschedule if already scheduled */ +	if (atomic_cmpxchg(&group->poll_scheduled, 0, 1) != 0) +		return; + +	rcu_read_lock(); -		now = sched_clock(); -		if (group->next_update > now) -			delay = nsecs_to_jiffies(group->next_update - now) + 1; -		schedule_delayed_work(dwork, delay); +	kworker = rcu_dereference(group->poll_kworker); +	/* +	 * kworker might be NULL in case psi_trigger_destroy races with +	 * psi_task_change (hotpath) which can't use locks +	 */ +	if (likely(kworker)) +		kthread_queue_delayed_work(kworker, &group->poll_work, delay); +	else +		atomic_set(&group->poll_scheduled, 0); + +	rcu_read_unlock(); +} + +static void psi_poll_work(struct kthread_work *work) +{ +	struct kthread_delayed_work *dwork; +	struct psi_group *group; +	u32 changed_states; +	u64 now; + +	dwork = container_of(work, struct kthread_delayed_work, work); +	group = container_of(dwork, struct psi_group, poll_work); + +	atomic_set(&group->poll_scheduled, 0); + +	mutex_lock(&group->trigger_lock); + +	now = sched_clock(); + +	collect_percpu_times(group, PSI_POLL, &changed_states); + +	if (changed_states & group->poll_states) { +		/* Initialize trigger windows when entering polling mode */ +		if (now > group->polling_until) +			init_triggers(group, now); + +		/* +		 * Keep the monitor active for at least the duration of the +		 * minimum tracking window as long as monitor states are +		 * changing. +		 */ +		group->polling_until = now + +			group->poll_min_period * UPDATES_PER_WINDOW; +	} + +	if (now > group->polling_until) { +		group->polling_next_update = ULLONG_MAX; +		goto out;  	} + +	if (now >= group->polling_next_update) +		group->polling_next_update = update_triggers(group, now); + +	psi_schedule_poll_work(group, +		nsecs_to_jiffies(group->polling_next_update - now) + 1); + +out: +	mutex_unlock(&group->trigger_lock);  }  static void record_times(struct psi_group_cpu *groupc, int cpu, @@ -407,15 +632,15 @@ static void record_times(struct psi_group_cpu *groupc, int cpu,  	delta = now - groupc->state_start;  	groupc->state_start = now; -	if (test_state(groupc->tasks, PSI_IO_SOME)) { +	if (groupc->state_mask & (1 << PSI_IO_SOME)) {  		groupc->times[PSI_IO_SOME] += delta; -		if (test_state(groupc->tasks, PSI_IO_FULL)) +		if (groupc->state_mask & (1 << PSI_IO_FULL))  			groupc->times[PSI_IO_FULL] += delta;  	} -	if (test_state(groupc->tasks, PSI_MEM_SOME)) { +	if (groupc->state_mask & (1 << PSI_MEM_SOME)) {  		groupc->times[PSI_MEM_SOME] += delta; -		if (test_state(groupc->tasks, PSI_MEM_FULL)) +		if (groupc->state_mask & (1 << PSI_MEM_FULL))  			groupc->times[PSI_MEM_FULL] += delta;  		else if (memstall_tick) {  			u32 sample; @@ -436,18 +661,20 @@ static void record_times(struct psi_group_cpu *groupc, int cpu,  		}  	} -	if (test_state(groupc->tasks, PSI_CPU_SOME)) +	if (groupc->state_mask & (1 << PSI_CPU_SOME))  		groupc->times[PSI_CPU_SOME] += delta; -	if (test_state(groupc->tasks, PSI_NONIDLE)) +	if (groupc->state_mask & (1 << PSI_NONIDLE))  		groupc->times[PSI_NONIDLE] += delta;  } -static void psi_group_change(struct psi_group *group, int cpu, -			     unsigned int clear, unsigned int set) +static u32 psi_group_change(struct psi_group *group, int cpu, +			    unsigned int clear, unsigned int set)  {  	struct psi_group_cpu *groupc;  	unsigned int t, m; +	enum psi_states s; +	u32 state_mask = 0;  	groupc = per_cpu_ptr(group->pcpu, cpu); @@ -480,7 +707,16 @@ static void 
psi_group_change(struct psi_group *group, int cpu,  		if (set & (1 << t))  			groupc->tasks[t]++; +	/* Calculate state mask representing active states */ +	for (s = 0; s < NR_PSI_STATES; s++) { +		if (test_state(groupc->tasks, s)) +			state_mask |= (1 << s); +	} +	groupc->state_mask = state_mask; +  	write_seqcount_end(&groupc->seq); + +	return state_mask;  }  static struct psi_group *iterate_groups(struct task_struct *task, void **iter) @@ -537,13 +773,17 @@ void psi_task_change(struct task_struct *task, int clear, int set)  	 */  	if (unlikely((clear & TSK_RUNNING) &&  		     (task->flags & PF_WQ_WORKER) && -		     wq_worker_last_func(task) == psi_update_work)) +		     wq_worker_last_func(task) == psi_avgs_work))  		wake_clock = false;  	while ((group = iterate_groups(task, &iter))) { -		psi_group_change(group, cpu, clear, set); -		if (wake_clock && !delayed_work_pending(&group->clock_work)) -			schedule_delayed_work(&group->clock_work, PSI_FREQ); +		u32 state_mask = psi_group_change(group, cpu, clear, set); + +		if (state_mask & group->poll_states) +			psi_schedule_poll_work(group, 1); + +		if (wake_clock && !delayed_work_pending(&group->avgs_work)) +			schedule_delayed_work(&group->avgs_work, PSI_FREQ);  	}  } @@ -640,8 +880,10 @@ void psi_cgroup_free(struct cgroup *cgroup)  	if (static_branch_likely(&psi_disabled))  		return; -	cancel_delayed_work_sync(&cgroup->psi.clock_work); +	cancel_delayed_work_sync(&cgroup->psi.avgs_work);  	free_percpu(cgroup->psi.pcpu); +	/* All triggers must be removed by now */ +	WARN_ONCE(cgroup->psi.poll_states, "psi: trigger leak\n");  }  /** @@ -697,11 +939,18 @@ void cgroup_move_task(struct task_struct *task, struct css_set *to)  int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res)  {  	int full; +	u64 now;  	if (static_branch_likely(&psi_disabled))  		return -EOPNOTSUPP; -	update_stats(group); +	/* Update averages before reporting them */ +	mutex_lock(&group->avgs_lock); +	now = sched_clock(); +	collect_percpu_times(group, PSI_AVGS, NULL); +	if (now >= group->avg_next_update) +		group->avg_next_update = update_averages(group, now); +	mutex_unlock(&group->avgs_lock);  	for (full = 0; full < 2 - (res == PSI_CPU); full++) {  		unsigned long avg[3]; @@ -710,7 +959,8 @@ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res)  		for (w = 0; w < 3; w++)  			avg[w] = group->avg[res * 2 + full][w]; -		total = div_u64(group->total[res * 2 + full], NSEC_PER_USEC); +		total = div_u64(group->total[PSI_AVGS][res * 2 + full], +				NSEC_PER_USEC);  		seq_printf(m, "%s avg10=%lu.%02lu avg60=%lu.%02lu avg300=%lu.%02lu total=%llu\n",  			   full ? 
"full" : "some", @@ -753,25 +1003,270 @@ static int psi_cpu_open(struct inode *inode, struct file *file)  	return single_open(file, psi_cpu_show, NULL);  } +struct psi_trigger *psi_trigger_create(struct psi_group *group, +			char *buf, size_t nbytes, enum psi_res res) +{ +	struct psi_trigger *t; +	enum psi_states state; +	u32 threshold_us; +	u32 window_us; + +	if (static_branch_likely(&psi_disabled)) +		return ERR_PTR(-EOPNOTSUPP); + +	if (sscanf(buf, "some %u %u", &threshold_us, &window_us) == 2) +		state = PSI_IO_SOME + res * 2; +	else if (sscanf(buf, "full %u %u", &threshold_us, &window_us) == 2) +		state = PSI_IO_FULL + res * 2; +	else +		return ERR_PTR(-EINVAL); + +	if (state >= PSI_NONIDLE) +		return ERR_PTR(-EINVAL); + +	if (window_us < WINDOW_MIN_US || +		window_us > WINDOW_MAX_US) +		return ERR_PTR(-EINVAL); + +	/* Check threshold */ +	if (threshold_us == 0 || threshold_us > window_us) +		return ERR_PTR(-EINVAL); + +	t = kmalloc(sizeof(*t), GFP_KERNEL); +	if (!t) +		return ERR_PTR(-ENOMEM); + +	t->group = group; +	t->state = state; +	t->threshold = threshold_us * NSEC_PER_USEC; +	t->win.size = window_us * NSEC_PER_USEC; +	window_reset(&t->win, 0, 0, 0); + +	t->event = 0; +	t->last_event_time = 0; +	init_waitqueue_head(&t->event_wait); +	kref_init(&t->refcount); + +	mutex_lock(&group->trigger_lock); + +	if (!rcu_access_pointer(group->poll_kworker)) { +		struct sched_param param = { +			.sched_priority = MAX_RT_PRIO - 1, +		}; +		struct kthread_worker *kworker; + +		kworker = kthread_create_worker(0, "psimon"); +		if (IS_ERR(kworker)) { +			kfree(t); +			mutex_unlock(&group->trigger_lock); +			return ERR_CAST(kworker); +		} +		sched_setscheduler(kworker->task, SCHED_FIFO, ¶m); +		kthread_init_delayed_work(&group->poll_work, +				psi_poll_work); +		rcu_assign_pointer(group->poll_kworker, kworker); +	} + +	list_add(&t->node, &group->triggers); +	group->poll_min_period = min(group->poll_min_period, +		div_u64(t->win.size, UPDATES_PER_WINDOW)); +	group->nr_triggers[t->state]++; +	group->poll_states |= (1 << t->state); + +	mutex_unlock(&group->trigger_lock); + +	return t; +} + +static void psi_trigger_destroy(struct kref *ref) +{ +	struct psi_trigger *t = container_of(ref, struct psi_trigger, refcount); +	struct psi_group *group = t->group; +	struct kthread_worker *kworker_to_destroy = NULL; + +	if (static_branch_likely(&psi_disabled)) +		return; + +	/* +	 * Wakeup waiters to stop polling. Can happen if cgroup is deleted +	 * from under a polling process. 
+	 */ +	wake_up_interruptible(&t->event_wait); + +	mutex_lock(&group->trigger_lock); + +	if (!list_empty(&t->node)) { +		struct psi_trigger *tmp; +		u64 period = ULLONG_MAX; + +		list_del(&t->node); +		group->nr_triggers[t->state]--; +		if (!group->nr_triggers[t->state]) +			group->poll_states &= ~(1 << t->state); +		/* reset min update period for the remaining triggers */ +		list_for_each_entry(tmp, &group->triggers, node) +			period = min(period, div_u64(tmp->win.size, +					UPDATES_PER_WINDOW)); +		group->poll_min_period = period; +		/* Destroy poll_kworker when the last trigger is destroyed */ +		if (group->poll_states == 0) { +			group->polling_until = 0; +			kworker_to_destroy = rcu_dereference_protected( +					group->poll_kworker, +					lockdep_is_held(&group->trigger_lock)); +			rcu_assign_pointer(group->poll_kworker, NULL); +		} +	} + +	mutex_unlock(&group->trigger_lock); + +	/* +	 * Wait for both *trigger_ptr from psi_trigger_replace and +	 * poll_kworker RCUs to complete their read-side critical sections +	 * before destroying the trigger and optionally the poll_kworker +	 */ +	synchronize_rcu(); +	/* +	 * Destroy the kworker after releasing trigger_lock to prevent a +	 * deadlock while waiting for psi_poll_work to acquire trigger_lock +	 */ +	if (kworker_to_destroy) { +		kthread_cancel_delayed_work_sync(&group->poll_work); +		kthread_destroy_worker(kworker_to_destroy); +	} +	kfree(t); +} + +void psi_trigger_replace(void **trigger_ptr, struct psi_trigger *new) +{ +	struct psi_trigger *old = *trigger_ptr; + +	if (static_branch_likely(&psi_disabled)) +		return; + +	rcu_assign_pointer(*trigger_ptr, new); +	if (old) +		kref_put(&old->refcount, psi_trigger_destroy); +} + +__poll_t psi_trigger_poll(void **trigger_ptr, +				struct file *file, poll_table *wait) +{ +	__poll_t ret = DEFAULT_POLLMASK; +	struct psi_trigger *t; + +	if (static_branch_likely(&psi_disabled)) +		return DEFAULT_POLLMASK | EPOLLERR | EPOLLPRI; + +	rcu_read_lock(); + +	t = rcu_dereference(*(void __rcu __force **)trigger_ptr); +	if (!t) { +		rcu_read_unlock(); +		return DEFAULT_POLLMASK | EPOLLERR | EPOLLPRI; +	} +	kref_get(&t->refcount); + +	rcu_read_unlock(); + +	poll_wait(file, &t->event_wait, wait); + +	if (cmpxchg(&t->event, 1, 0) == 1) +		ret |= EPOLLPRI; + +	kref_put(&t->refcount, psi_trigger_destroy); + +	return ret; +} + +static ssize_t psi_write(struct file *file, const char __user *user_buf, +			 size_t nbytes, enum psi_res res) +{ +	char buf[32]; +	size_t buf_size; +	struct seq_file *seq; +	struct psi_trigger *new; + +	if (static_branch_likely(&psi_disabled)) +		return -EOPNOTSUPP; + +	buf_size = min(nbytes, (sizeof(buf) - 1)); +	if (copy_from_user(buf, user_buf, buf_size)) +		return -EFAULT; + +	buf[buf_size - 1] = '\0'; + +	new = psi_trigger_create(&psi_system, buf, nbytes, res); +	if (IS_ERR(new)) +		return PTR_ERR(new); + +	seq = file->private_data; +	/* Take seq->lock to protect seq->private from concurrent writes */ +	mutex_lock(&seq->lock); +	psi_trigger_replace(&seq->private, new); +	mutex_unlock(&seq->lock); + +	return nbytes; +} + +static ssize_t psi_io_write(struct file *file, const char __user *user_buf, +			    size_t nbytes, loff_t *ppos) +{ +	return psi_write(file, user_buf, nbytes, PSI_IO); +} + +static ssize_t psi_memory_write(struct file *file, const char __user *user_buf, +				size_t nbytes, loff_t *ppos) +{ +	return psi_write(file, user_buf, nbytes, PSI_MEM); +} + +static ssize_t psi_cpu_write(struct file *file, const char __user *user_buf, +			     size_t nbytes, loff_t *ppos) +{ +	
return psi_write(file, user_buf, nbytes, PSI_CPU); +} + +static __poll_t psi_fop_poll(struct file *file, poll_table *wait) +{ +	struct seq_file *seq = file->private_data; + +	return psi_trigger_poll(&seq->private, file, wait); +} + +static int psi_fop_release(struct inode *inode, struct file *file) +{ +	struct seq_file *seq = file->private_data; + +	psi_trigger_replace(&seq->private, NULL); +	return single_release(inode, file); +} +  static const struct file_operations psi_io_fops = {  	.open           = psi_io_open,  	.read           = seq_read,  	.llseek         = seq_lseek, -	.release        = single_release, +	.write          = psi_io_write, +	.poll           = psi_fop_poll, +	.release        = psi_fop_release,  };  static const struct file_operations psi_memory_fops = {  	.open           = psi_memory_open,  	.read           = seq_read,  	.llseek         = seq_lseek, -	.release        = single_release, +	.write          = psi_memory_write, +	.poll           = psi_fop_poll, +	.release        = psi_fop_release,  };  static const struct file_operations psi_cpu_fops = {  	.open           = psi_cpu_open,  	.read           = seq_read,  	.llseek         = seq_lseek, -	.release        = single_release, +	.write          = psi_cpu_write, +	.poll           = psi_fop_poll, +	.release        = psi_fop_release,  };  static int __init psi_proc_init(void) diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 90fa23d36565..1e6b909dca36 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -2555,6 +2555,8 @@ int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)  	rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC;  	if (rt_runtime_us < 0)  		rt_runtime = RUNTIME_INF; +	else if ((u64)rt_runtime_us > U64_MAX / NSEC_PER_USEC) +		return -EINVAL;  	return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);  } @@ -2575,6 +2577,9 @@ int sched_group_set_rt_period(struct task_group *tg, u64 rt_period_us)  {  	u64 rt_runtime, rt_period; +	if (rt_period_us > U64_MAX / NSEC_PER_USEC) +		return -EINVAL; +  	rt_period = rt_period_us * NSEC_PER_USEC;  	rt_runtime = tg->rt_bandwidth.rt_runtime; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index efa686eeff26..b52ed1ada0be 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -780,7 +780,7 @@ struct root_domain {  	 * NULL-terminated list of performance domains intersecting with the  	 * CPUs of the rd. Protected by RCU.  	 
*/ -	struct perf_domain	*pd; +	struct perf_domain __rcu *pd;  };  extern struct root_domain def_root_domain; @@ -869,8 +869,8 @@ struct rq {  	atomic_t		nr_iowait;  #ifdef CONFIG_SMP -	struct root_domain	*rd; -	struct sched_domain	*sd; +	struct root_domain		*rd; +	struct sched_domain __rcu	*sd;  	unsigned long		cpu_capacity;  	unsigned long		cpu_capacity_orig; @@ -1324,13 +1324,13 @@ static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)  	return sd;  } -DECLARE_PER_CPU(struct sched_domain *, sd_llc); +DECLARE_PER_CPU(struct sched_domain __rcu *, sd_llc);  DECLARE_PER_CPU(int, sd_llc_size);  DECLARE_PER_CPU(int, sd_llc_id); -DECLARE_PER_CPU(struct sched_domain_shared *, sd_llc_shared); -DECLARE_PER_CPU(struct sched_domain *, sd_numa); -DECLARE_PER_CPU(struct sched_domain *, sd_asym_packing); -DECLARE_PER_CPU(struct sched_domain *, sd_asym_cpucapacity); +DECLARE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared); +DECLARE_PER_CPU(struct sched_domain __rcu *, sd_numa); +DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing); +DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity);  extern struct static_key_false sched_asym_cpucapacity;  struct sched_group_capacity { @@ -2185,7 +2185,7 @@ static inline u64 irq_time_read(int cpu)  #endif /* CONFIG_IRQ_TIME_ACCOUNTING */  #ifdef CONFIG_CPU_FREQ -DECLARE_PER_CPU(struct update_util_data *, cpufreq_update_util_data); +DECLARE_PER_CPU(struct update_util_data __rcu *, cpufreq_update_util_data);  /**   * cpufreq_update_util - Take a note about CPU utilization changes. diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index ab7f371a3a17..f53f89df837d 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -615,13 +615,13 @@ static void destroy_sched_domains(struct sched_domain *sd)   * the cpumask of the domain), this allows us to quickly tell if   * two CPUs are in the same cache domain, see cpus_share_cache().   
*/ -DEFINE_PER_CPU(struct sched_domain *, sd_llc); +DEFINE_PER_CPU(struct sched_domain __rcu *, sd_llc);  DEFINE_PER_CPU(int, sd_llc_size);  DEFINE_PER_CPU(int, sd_llc_id); -DEFINE_PER_CPU(struct sched_domain_shared *, sd_llc_shared); -DEFINE_PER_CPU(struct sched_domain *, sd_numa); -DEFINE_PER_CPU(struct sched_domain *, sd_asym_packing); -DEFINE_PER_CPU(struct sched_domain *, sd_asym_cpucapacity); +DEFINE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared); +DEFINE_PER_CPU(struct sched_domain __rcu *, sd_numa); +DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing); +DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity);  DEFINE_STATIC_KEY_FALSE(sched_asym_cpucapacity);  static void update_top_cache_domain(int cpu) @@ -1059,6 +1059,7 @@ static struct sched_group *get_group(int cpu, struct sd_data *sdd)  	struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);  	struct sched_domain *child = sd->child;  	struct sched_group *sg; +	bool already_visited;  	if (child)  		cpu = cpumask_first(sched_domain_span(child)); @@ -1066,9 +1067,14 @@ static struct sched_group *get_group(int cpu, struct sd_data *sdd)  	sg = *per_cpu_ptr(sdd->sg, cpu);  	sg->sgc = *per_cpu_ptr(sdd->sgc, cpu); -	/* For claim_allocations: */ -	atomic_inc(&sg->ref); -	atomic_inc(&sg->sgc->ref); +	/* Increase refcounts for claim_allocations: */ +	already_visited = atomic_inc_return(&sg->ref) > 1; +	/* sgc visits should follow a similar trend as sg */ +	WARN_ON(already_visited != (atomic_inc_return(&sg->sgc->ref) > 1)); + +	/* If we have already visited that group, it's already initialized. */ +	if (already_visited) +		return sg;  	if (child) {  		cpumask_copy(sched_group_span(sg), sched_domain_span(child)); @@ -1087,8 +1093,8 @@ static struct sched_group *get_group(int cpu, struct sd_data *sdd)  /*   * build_sched_groups will build a circular linked list of the groups - * covered by the given span, and will set each group's ->cpumask correctly, - * and ->cpu_capacity to 0. + * covered by the given span, will set each group's ->cpumask correctly, + * and will initialize their ->sgc.   *   * Assumes the sched_domain tree is fully constructed   */ @@ -2075,9 +2081,8 @@ void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms)  }  /* - * Set up scheduler domains and groups. Callers must hold the hotplug lock. - * For now this just excludes isolated CPUs, but could be used to - * exclude other special cases in the future. + * Set up scheduler domains and groups.  For now this just excludes isolated + * CPUs, but could be used to exclude other special cases in the future.   */  int sched_init_domains(const struct cpumask *cpu_map)  { diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index 6eb1f8efd221..fa0f9adfb752 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only  /*   * Generic waiting primitives.   
* diff --git a/kernel/sched/wait_bit.c b/kernel/sched/wait_bit.c index c67c6d24adc2..45eba18a2898 100644 --- a/kernel/sched/wait_bit.c +++ b/kernel/sched/wait_bit.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only  /*   * The implementation of the wait_bit*() and related waiting APIs:   */ diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 54a0347ca812..811b4a86cdf6 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -148,8 +148,8 @@ static void populate_seccomp_data(struct seccomp_data *sd)  	unsigned long args[6];  	sd->nr = syscall_get_nr(task, regs); -	sd->arch = syscall_get_arch(); -	syscall_get_arguments(task, regs, 0, 6, args); +	sd->arch = syscall_get_arch(task); +	syscall_get_arguments(task, regs, args);  	sd->args[0] = args[0];  	sd->args[1] = args[1];  	sd->args[2] = args[2]; @@ -331,7 +331,7 @@ static int is_ancestor(struct seccomp_filter *parent,   * Expects sighand and cred_guard_mutex locks to be held.   *   * Returns 0 on success, -ve on error, or the pid of a thread which was - * either not in the correct seccomp mode or it did not have an ancestral + * either not in the correct seccomp mode or did not have an ancestral   * seccomp filter.   */  static inline pid_t seccomp_can_sync_threads(void) @@ -502,7 +502,10 @@ out:   *   * Caller must be holding current->sighand->siglock lock.   * - * Returns 0 on success, -ve on error. + * Returns 0 on success, -ve on error, or + *   - in TSYNC mode: the pid of a thread which was either not in the correct + *     seccomp mode or did not have an ancestral seccomp filter + *   - in NEW_LISTENER mode: the fd of the new listener   */  static long seccomp_attach_filter(unsigned int flags,  				  struct seccomp_filter *filter) @@ -591,7 +594,7 @@ static void seccomp_init_siginfo(kernel_siginfo_t *info, int syscall, int reason  	info->si_code = SYS_SECCOMP;  	info->si_call_addr = (void __user *)KSTK_EIP(current);  	info->si_errno = reason; -	info->si_arch = syscall_get_arch(); +	info->si_arch = syscall_get_arch(current);  	info->si_syscall = syscall;  } @@ -1258,6 +1261,16 @@ static long seccomp_set_mode_filter(unsigned int flags,  	if (flags & ~SECCOMP_FILTER_FLAG_MASK)  		return -EINVAL; +	/* +	 * In the successful case, NEW_LISTENER returns the new listener fd. +	 * But in the failure case, TSYNC returns the thread that died. If you +	 * combine these two flags, there's no way to tell whether something +	 * succeeded or failed. So, let's disallow this combination. +	 */ +	if ((flags & SECCOMP_FILTER_FLAG_TSYNC) && +	    (flags & SECCOMP_FILTER_FLAG_NEW_LISTENER)) +		return -EINVAL; +  	/* Prepare the new filter before holding any locks. 
*/  prepared = seccomp_prepare_user_filter(filter);  if (IS_ERR(prepared)) @@ -1304,7 +1317,7 @@ out:  		mutex_unlock(&current->signal->cred_guard_mutex);  out_put_fd:  	if (flags & SECCOMP_FILTER_FLAG_NEW_LISTENER) { -		if (ret < 0) { +		if (ret) {  			listener_f->private_data = NULL;  			fput(listener_f);  			put_unused_fd(listener); diff --git a/kernel/signal.c b/kernel/signal.c index b7953934aa99..d622eac9d169 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only  /*   *  linux/kernel/signal.c   * @@ -43,6 +44,7 @@  #include <linux/compiler.h>  #include <linux/posix-timers.h>  #include <linux/livepatch.h> +#include <linux/cgroup.h>  #define CREATE_TRACE_POINTS  #include <trace/events/signal.h> @@ -146,9 +148,10 @@ static inline bool has_pending_signals(sigset_t *signal, sigset_t *blocked)  static bool recalc_sigpending_tsk(struct task_struct *t)  { -	if ((t->jobctl & JOBCTL_PENDING_MASK) || +	if ((t->jobctl & (JOBCTL_PENDING_MASK | JOBCTL_TRAP_FREEZE)) ||  	    PENDING(&t->pending, &t->blocked) || -	    PENDING(&t->signal->shared_pending, &t->blocked)) { +	    PENDING(&t->signal->shared_pending, &t->blocked) || +	    cgroup_task_frozen(t)) {  		set_tsk_thread_flag(t, TIF_SIGPENDING);  		return true;  	} @@ -838,6 +841,7 @@ static int check_kill_permission(int sig, struct kernel_siginfo *info,  			 */  			if (!sid || sid == task_session(current))  				break; +			/* fall through */  		default:  			return -EPERM;  		} @@ -2108,7 +2112,9 @@ static void ptrace_stop(int exit_code, int why, int clear_code, kernel_siginfo_t  		preempt_disable();  		read_unlock(&tasklist_lock);  		preempt_enable_no_resched(); +		cgroup_enter_frozen();  		freezable_schedule(); +		cgroup_leave_frozen(true);  	} else {  		/*  		 * By the time we got the lock, our tracer went away. @@ -2286,6 +2292,7 @@ static bool do_signal_stop(int signr)  		}  		/* Now we don't run again until woken by SIGCONT or SIGKILL */ +		cgroup_enter_frozen();  		freezable_schedule();  		return true;  	} else { @@ -2332,6 +2339,43 @@ static void do_jobctl_trap(void)  	}  } +/** + * do_freezer_trap - handle the freezer jobctl trap + * + * Puts the task into frozen state, if only the task is not about to quit. + * In this case it drops JOBCTL_TRAP_FREEZE. + * + * CONTEXT: + * Must be called with @current->sighand->siglock held, + * which is always released before returning. + */ +static void do_freezer_trap(void) +	__releases(&current->sighand->siglock) +{ +	/* +	 * If there are other trap bits pending except JOBCTL_TRAP_FREEZE, +	 * let's make another loop to give it a chance to be handled. +	 * In any case, we'll return back. +	 */ +	if ((current->jobctl & (JOBCTL_PENDING_MASK | JOBCTL_TRAP_FREEZE)) != +	     JOBCTL_TRAP_FREEZE) { +		spin_unlock_irq(&current->sighand->siglock); +		return; +	} + +	/* +	 * Now we're sure that there is no pending fatal signal and no +	 * pending traps. Clear TIF_SIGPENDING to not get out of schedule() +	 * immediately (if there is a non-fatal signal pending), and +	 * put the task into sleep. 
+	 */ +	__set_current_state(TASK_INTERRUPTIBLE); +	clear_thread_flag(TIF_SIGPENDING); +	spin_unlock_irq(&current->sighand->siglock); +	cgroup_enter_frozen(); +	freezable_schedule(); +} +  static int ptrace_signal(int signr, kernel_siginfo_t *info)  {  	/* @@ -2441,6 +2485,8 @@ relock:  	if (signal_group_exit(signal)) {  		ksig->info.si_signo = signr = SIGKILL;  		sigdelset(&current->pending.signal, SIGKILL); +		trace_signal_deliver(SIGKILL, SEND_SIG_NOINFO, +				&sighand->action[SIGKILL - 1]);  		recalc_sigpending();  		goto fatal;  	} @@ -2452,9 +2498,24 @@ relock:  		    do_signal_stop(0))  			goto relock; -		if (unlikely(current->jobctl & JOBCTL_TRAP_MASK)) { -			do_jobctl_trap(); +		if (unlikely(current->jobctl & +			     (JOBCTL_TRAP_MASK | JOBCTL_TRAP_FREEZE))) { +			if (current->jobctl & JOBCTL_TRAP_MASK) { +				do_jobctl_trap(); +				spin_unlock_irq(&sighand->siglock); +			} else if (current->jobctl & JOBCTL_TRAP_FREEZE) +				do_freezer_trap(); + +			goto relock; +		} + +		/* +		 * If the task is leaving the frozen state, let's update +		 * cgroup counters and reset the frozen bit. +		 */ +		if (unlikely(cgroup_task_frozen(current))) {  			spin_unlock_irq(&sighand->siglock); +			cgroup_leave_frozen(false);  			goto relock;  		} @@ -2550,6 +2611,8 @@ relock:  	fatal:  		spin_unlock_irq(&sighand->siglock); +		if (unlikely(cgroup_task_frozen(current))) +			cgroup_leave_frozen(true);  		/*  		 * Anything else is fatal, maybe with a core dump. @@ -3513,7 +3576,6 @@ SYSCALL_DEFINE2(kill, pid_t, pid, int, sig)  	return kill_something_info(sig, &info, pid);  } -#ifdef CONFIG_PROC_FS  /*   * Verify that the signaler and signalee either are in the same pid namespace   * or that the signaler's pid namespace is an ancestor of the signalee's pid @@ -3550,13 +3612,20 @@ static int copy_siginfo_from_user_any(kernel_siginfo_t *kinfo, siginfo_t *info)  	return copy_siginfo_from_user(kinfo, info);  } +static struct pid *pidfd_to_pid(const struct file *file) +{ +	if (file->f_op == &pidfd_fops) +		return file->private_data; + +	return tgid_pidfd_to_pid(file); +} +  /** - * sys_pidfd_send_signal - send a signal to a process through a task file - *                          descriptor - * @pidfd:  the file descriptor of the process - * @sig:    signal to be sent - * @info:   the signal info - * @flags:  future flags to be passed + * sys_pidfd_send_signal - Signal a process through a pidfd + * @pidfd:  file descriptor of the process + * @sig:    signal to send + * @info:   signal info + * @flags:  future flags   *   * The syscall currently only signals via PIDTYPE_PID which covers   * kill(<positive-pid>, <signal>. It does not signal threads or process   @@ -3581,12 +3650,12 @@ SYSCALL_DEFINE4(pidfd_send_signal, int, pidfd, int, sig,  	if (flags)  		return -EINVAL; -	f = fdget_raw(pidfd); +	f = fdget(pidfd);  	if (!f.file)  		return -EBADF;  	/* Is this a pidfd? */ -	pid = tgid_pidfd_to_pid(f.file); +	pid = pidfd_to_pid(f.file);  	if (IS_ERR(pid)) {  		ret = PTR_ERR(pid);  		goto err; @@ -3605,16 +3674,11 @@ SYSCALL_DEFINE4(pidfd_send_signal, int, pidfd, int, sig,  		if (unlikely(sig != kinfo.si_signo))  			goto err; +		/* Only allow sending arbitrary signals to yourself. */ +		ret = -EPERM;  		if ((task_pid(current) != pid) && -		    (kinfo.si_code >= 0 || kinfo.si_code == SI_TKILL)) { -			/* Only allow sending arbitrary signals to yourself. */ -			ret = -EPERM; -			if (kinfo.si_code != SI_USER) -				goto err; - -			/* Turn this into a regular kill signal. 
*/ -			prepare_kill_siginfo(sig, &kinfo); -		} +		    (kinfo.si_code >= 0 || kinfo.si_code == SI_TKILL)) +			goto err;  	} else {  		prepare_kill_siginfo(sig, &kinfo);  	} @@ -3625,7 +3689,6 @@ err:  	fdput(f);  	return ret;  } -#endif /* CONFIG_PROC_FS */  static int  do_send_specific(pid_t tgid, pid_t pid, int sig, struct kernel_siginfo *info) diff --git a/kernel/smp.c b/kernel/smp.c index f4cf1b0bb3b8..d155374632eb 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only  /*   * Generic helpers for smp ipi calls   * diff --git a/kernel/smpboot.c b/kernel/smpboot.c index c230c2dd48e1..2efe1e206167 100644 --- a/kernel/smpboot.c +++ b/kernel/smpboot.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only  /*   * Common SMP CPU bringup/teardown functions   */ diff --git a/kernel/softirq.c b/kernel/softirq.c index 10277429ed84..2c3382378d94 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -573,57 +573,6 @@ void tasklet_kill(struct tasklet_struct *t)  }  EXPORT_SYMBOL(tasklet_kill); -/* - * tasklet_hrtimer - */ - -/* - * The trampoline is called when the hrtimer expires. It schedules a tasklet - * to run __tasklet_hrtimer_trampoline() which in turn will call the intended - * hrtimer callback, but from softirq context. - */ -static enum hrtimer_restart __hrtimer_tasklet_trampoline(struct hrtimer *timer) -{ -	struct tasklet_hrtimer *ttimer = -		container_of(timer, struct tasklet_hrtimer, timer); - -	tasklet_hi_schedule(&ttimer->tasklet); -	return HRTIMER_NORESTART; -} - -/* - * Helper function which calls the hrtimer callback from - * tasklet/softirq context - */ -static void __tasklet_hrtimer_trampoline(unsigned long data) -{ -	struct tasklet_hrtimer *ttimer = (void *)data; -	enum hrtimer_restart restart; - -	restart = ttimer->function(&ttimer->timer); -	if (restart != HRTIMER_NORESTART) -		hrtimer_restart(&ttimer->timer); -} - -/** - * tasklet_hrtimer_init - Init a tasklet/hrtimer combo for softirq callbacks - * @ttimer:	 tasklet_hrtimer which is initialized - * @function:	 hrtimer callback function which gets called from softirq context - * @which_clock: clock id (CLOCK_MONOTONIC/CLOCK_REALTIME) - * @mode:	 hrtimer mode (HRTIMER_MODE_ABS/HRTIMER_MODE_REL) - */ -void tasklet_hrtimer_init(struct tasklet_hrtimer *ttimer, -			  enum hrtimer_restart (*function)(struct hrtimer *), -			  clockid_t which_clock, enum hrtimer_mode mode) -{ -	hrtimer_init(&ttimer->timer, which_clock, mode); -	ttimer->timer.function = __hrtimer_tasklet_trampoline; -	tasklet_init(&ttimer->tasklet, __tasklet_hrtimer_trampoline, -		     (unsigned long)ttimer); -	ttimer->function = function; -} -EXPORT_SYMBOL_GPL(tasklet_hrtimer_init); -  void __init softirq_init(void)  {  	int cpu; diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c index f8edee9c792d..36139de0a3c4 100644 --- a/kernel/stacktrace.c +++ b/kernel/stacktrace.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only  /*   * kernel/stacktrace.c   * @@ -5,41 +6,56 @@   *   *  Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>   */ +#include <linux/sched/task_stack.h> +#include <linux/sched/debug.h>  #include <linux/sched.h>  #include <linux/kernel.h>  #include <linux/export.h>  #include <linux/kallsyms.h>  #include <linux/stacktrace.h> -void print_stack_trace(struct stack_trace *trace, int spaces) +/** + * stack_trace_print - Print the entries in the stack trace + * @entries:	Pointer to storage array + * @nr_entries:	Number of entries in the storage array + * @spaces:	Number of leading 
spaces to print + */ +void stack_trace_print(unsigned long *entries, unsigned int nr_entries, +		       int spaces)  { -	int i; +	unsigned int i; -	if (WARN_ON(!trace->entries)) +	if (WARN_ON(!entries))  		return; -	for (i = 0; i < trace->nr_entries; i++) -		printk("%*c%pS\n", 1 + spaces, ' ', (void *)trace->entries[i]); +	for (i = 0; i < nr_entries; i++) +		printk("%*c%pS\n", 1 + spaces, ' ', (void *)entries[i]);  } -EXPORT_SYMBOL_GPL(print_stack_trace); +EXPORT_SYMBOL_GPL(stack_trace_print); -int snprint_stack_trace(char *buf, size_t size, -			struct stack_trace *trace, int spaces) +/** + * stack_trace_snprint - Print the entries in the stack trace into a buffer + * @buf:	Pointer to the print buffer + * @size:	Size of the print buffer + * @entries:	Pointer to storage array + * @nr_entries:	Number of entries in the storage array + * @spaces:	Number of leading spaces to print + * + * Return: Number of bytes printed. + */ +int stack_trace_snprint(char *buf, size_t size, unsigned long *entries, +			unsigned int nr_entries, int spaces)  { -	int i; -	int generated; -	int total = 0; +	unsigned int generated, i, total = 0; -	if (WARN_ON(!trace->entries)) +	if (WARN_ON(!entries))  		return 0; -	for (i = 0; i < trace->nr_entries; i++) { +	for (i = 0; i < nr_entries && size; i++) {  		generated = snprintf(buf, size, "%*c%pS\n", 1 + spaces, ' ', -				     (void *)trace->entries[i]); +				     (void *)entries[i]);  		total += generated; - -		/* Assume that generated isn't a negative number */  		if (generated >= size) {  			buf += size;  			size = 0; @@ -51,7 +67,176 @@ int snprint_stack_trace(char *buf, size_t size,  	return total;  } -EXPORT_SYMBOL_GPL(snprint_stack_trace); +EXPORT_SYMBOL_GPL(stack_trace_snprint); + +#ifdef CONFIG_ARCH_STACKWALK + +struct stacktrace_cookie { +	unsigned long	*store; +	unsigned int	size; +	unsigned int	skip; +	unsigned int	len; +}; + +static bool stack_trace_consume_entry(void *cookie, unsigned long addr, +				      bool reliable) +{ +	struct stacktrace_cookie *c = cookie; + +	if (c->len >= c->size) +		return false; + +	if (c->skip > 0) { +		c->skip--; +		return true; +	} +	c->store[c->len++] = addr; +	return c->len < c->size; +} + +static bool stack_trace_consume_entry_nosched(void *cookie, unsigned long addr, +					      bool reliable) +{ +	if (in_sched_functions(addr)) +		return true; +	return stack_trace_consume_entry(cookie, addr, reliable); +} + +/** + * stack_trace_save - Save a stack trace into a storage array + * @store:	Pointer to storage array + * @size:	Size of the storage array + * @skipnr:	Number of entries to skip at the start of the stack trace + * + * Return: Number of trace entries stored. + */ +unsigned int stack_trace_save(unsigned long *store, unsigned int size, +			      unsigned int skipnr) +{ +	stack_trace_consume_fn consume_entry = stack_trace_consume_entry; +	struct stacktrace_cookie c = { +		.store	= store, +		.size	= size, +		.skip	= skipnr + 1, +	}; + +	arch_stack_walk(consume_entry, &c, current, NULL); +	return c.len; +} +EXPORT_SYMBOL_GPL(stack_trace_save); + +/** + * stack_trace_save_tsk - Save a task stack trace into a storage array + * @task:	The task to examine + * @store:	Pointer to storage array + * @size:	Size of the storage array + * @skipnr:	Number of entries to skip at the start of the stack trace + * + * Return: Number of trace entries stored. 
+ */ +unsigned int stack_trace_save_tsk(struct task_struct *tsk, unsigned long *store, +				  unsigned int size, unsigned int skipnr) +{ +	stack_trace_consume_fn consume_entry = stack_trace_consume_entry_nosched; +	struct stacktrace_cookie c = { +		.store	= store, +		.size	= size, +		.skip	= skipnr + 1, +	}; + +	if (!try_get_task_stack(tsk)) +		return 0; + +	arch_stack_walk(consume_entry, &c, tsk, NULL); +	put_task_stack(tsk); +	return c.len; +} + +/** + * stack_trace_save_regs - Save a stack trace based on pt_regs into a storage array + * @regs:	Pointer to pt_regs to examine + * @store:	Pointer to storage array + * @size:	Size of the storage array + * @skipnr:	Number of entries to skip at the start of the stack trace + * + * Return: Number of trace entries stored. + */ +unsigned int stack_trace_save_regs(struct pt_regs *regs, unsigned long *store, +				   unsigned int size, unsigned int skipnr) +{ +	stack_trace_consume_fn consume_entry = stack_trace_consume_entry; +	struct stacktrace_cookie c = { +		.store	= store, +		.size	= size, +		.skip	= skipnr, +	}; + +	arch_stack_walk(consume_entry, &c, current, regs); +	return c.len; +} + +#ifdef CONFIG_HAVE_RELIABLE_STACKTRACE +/** + * stack_trace_save_tsk_reliable - Save task stack with verification + * @tsk:	Pointer to the task to examine + * @store:	Pointer to storage array + * @size:	Size of the storage array + * + * Return:	An error if it detects any unreliable features of the + *		stack. Otherwise it guarantees that the stack trace is + *		reliable and returns the number of entries stored. + * + * If the task is not 'current', the caller *must* ensure the task is inactive. + */ +int stack_trace_save_tsk_reliable(struct task_struct *tsk, unsigned long *store, +				  unsigned int size) +{ +	stack_trace_consume_fn consume_entry = stack_trace_consume_entry; +	struct stacktrace_cookie c = { +		.store	= store, +		.size	= size, +	}; +	int ret; + +	/* +	 * If the task doesn't have a stack (e.g., a zombie), the stack is +	 * "reliably" empty. +	 */ +	if (!try_get_task_stack(tsk)) +		return 0; + +	ret = arch_stack_walk_reliable(consume_entry, &c, tsk); +	put_task_stack(tsk); +	return ret ? ret : c.len; +} +#endif + +#ifdef CONFIG_USER_STACKTRACE_SUPPORT +/** + * stack_trace_save_user - Save a user space stack trace into a storage array + * @store:	Pointer to storage array + * @size:	Size of the storage array + * + * Return: Number of trace entries stored. 
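
[Illustrative sketch, not part of the patch] A quick in-kernel illustration of the simplified interface documented above, replacing the old struct stack_trace plumbing. The fixed 16-entry buffer, the skip count of 0, and the wrapper function name are arbitrary choices for the example, not taken from this patch.

        #include <linux/kernel.h>
        #include <linux/stacktrace.h>

        /* Capture and print the current task's backtrace. */
        static void dump_my_backtrace(void)
        {
                unsigned long entries[16];
                unsigned int nr;

                nr = stack_trace_save(entries, ARRAY_SIZE(entries), 0);
                stack_trace_print(entries, nr, 0);
        }
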
+ */ +unsigned int stack_trace_save_user(unsigned long *store, unsigned int size) +{ +	stack_trace_consume_fn consume_entry = stack_trace_consume_entry; +	struct stacktrace_cookie c = { +		.store	= store, +		.size	= size, +	}; + +	/* Trace user stack if not a kernel thread */ +	if (!current->mm) +		return 0; + +	arch_stack_walk_user(consume_entry, &c, task_pt_regs(current)); +	return c.len; +} +#endif + +#else /* CONFIG_ARCH_STACKWALK */  /*   * Architectures that do not implement save_stack_trace_*() @@ -77,3 +262,118 @@ save_stack_trace_tsk_reliable(struct task_struct *tsk,  	WARN_ONCE(1, KERN_INFO "save_stack_tsk_reliable() not implemented yet.\n");  	return -ENOSYS;  } + +/** + * stack_trace_save - Save a stack trace into a storage array + * @store:	Pointer to storage array + * @size:	Size of the storage array + * @skipnr:	Number of entries to skip at the start of the stack trace + * + * Return: Number of trace entries stored + */ +unsigned int stack_trace_save(unsigned long *store, unsigned int size, +			      unsigned int skipnr) +{ +	struct stack_trace trace = { +		.entries	= store, +		.max_entries	= size, +		.skip		= skipnr + 1, +	}; + +	save_stack_trace(&trace); +	return trace.nr_entries; +} +EXPORT_SYMBOL_GPL(stack_trace_save); + +/** + * stack_trace_save_tsk - Save a task stack trace into a storage array + * @task:	The task to examine + * @store:	Pointer to storage array + * @size:	Size of the storage array + * @skipnr:	Number of entries to skip at the start of the stack trace + * + * Return: Number of trace entries stored + */ +unsigned int stack_trace_save_tsk(struct task_struct *task, +				  unsigned long *store, unsigned int size, +				  unsigned int skipnr) +{ +	struct stack_trace trace = { +		.entries	= store, +		.max_entries	= size, +		.skip		= skipnr + 1, +	}; + +	save_stack_trace_tsk(task, &trace); +	return trace.nr_entries; +} + +/** + * stack_trace_save_regs - Save a stack trace based on pt_regs into a storage array + * @regs:	Pointer to pt_regs to examine + * @store:	Pointer to storage array + * @size:	Size of the storage array + * @skipnr:	Number of entries to skip at the start of the stack trace + * + * Return: Number of trace entries stored + */ +unsigned int stack_trace_save_regs(struct pt_regs *regs, unsigned long *store, +				   unsigned int size, unsigned int skipnr) +{ +	struct stack_trace trace = { +		.entries	= store, +		.max_entries	= size, +		.skip		= skipnr, +	}; + +	save_stack_trace_regs(regs, &trace); +	return trace.nr_entries; +} + +#ifdef CONFIG_HAVE_RELIABLE_STACKTRACE +/** + * stack_trace_save_tsk_reliable - Save task stack with verification + * @tsk:	Pointer to the task to examine + * @store:	Pointer to storage array + * @size:	Size of the storage array + * + * Return:	An error if it detects any unreliable features of the + *		stack. Otherwise it guarantees that the stack trace is + *		reliable and returns the number of entries stored. + * + * If the task is not 'current', the caller *must* ensure the task is inactive. + */ +int stack_trace_save_tsk_reliable(struct task_struct *tsk, unsigned long *store, +				  unsigned int size) +{ +	struct stack_trace trace = { +		.entries	= store, +		.max_entries	= size, +	}; +	int ret = save_stack_trace_tsk_reliable(tsk, &trace); + +	return ret ? 
ret : trace.nr_entries; +} +#endif + +#ifdef CONFIG_USER_STACKTRACE_SUPPORT +/** + * stack_trace_save_user - Save a user space stack trace into a storage array + * @store:	Pointer to storage array + * @size:	Size of the storage array + * + * Return: Number of trace entries stored + */ +unsigned int stack_trace_save_user(unsigned long *store, unsigned int size) +{ +	struct stack_trace trace = { +		.entries	= store, +		.max_entries	= size, +	}; + +	save_stack_trace_user(&trace); +	return trace.nr_entries; +} +#endif /* CONFIG_USER_STACKTRACE_SUPPORT */ + +#endif /* !CONFIG_ARCH_STACKWALK */ diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 067cb83f37ea..2b5a6754646f 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-or-later  /*   * kernel/stop_machine.c   * @@ -5,8 +6,6 @@   * Copyright (C) 2008, 2005	Rusty Russell rusty@rustcorp.com.au   * Copyright (C) 2010		SUSE Linux Products GmbH   * Copyright (C) 2010		Tejun Heo <tj@kernel.org> - * - * This file is released under the GPLv2 and any later version.   */  #include <linux/completion.h>  #include <linux/cpu.h> @@ -513,7 +512,7 @@ repeat:  		}  		preempt_count_dec();  		WARN_ONCE(preempt_count(), -			  "cpu_stop: %pf(%p) leaked preempt count\n", fn, arg); +			  "cpu_stop: %ps(%p) leaked preempt count\n", fn, arg);  		goto repeat;  	}  } diff --git a/kernel/sys.c b/kernel/sys.c index 12df0e5434b8..2969304c29fe 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1882,13 +1882,14 @@ exit_err:  }  /* + * Check arithmetic relations of passed addresses. + *   * WARNING: we don't require any capability here so be very careful   * in what is allowed for modification from userspace.   */ -static int validate_prctl_map(struct prctl_mm_map *prctl_map) +static int validate_prctl_map_addr(struct prctl_mm_map *prctl_map)  {  	unsigned long mmap_max_addr = TASK_SIZE; -	struct mm_struct *mm = current->mm;  	int error = -EINVAL, i;  	static const unsigned char offsets[] = { @@ -1924,7 +1925,7 @@ static int validate_prctl_map(struct prctl_mm_map *prctl_map)  	((unsigned long)prctl_map->__m1 __op				\  	 (unsigned long)prctl_map->__m2) ? 0 : -EINVAL  	error  = __prctl_check_order(start_code, <, end_code); -	error |= __prctl_check_order(start_data, <, end_data); +	error |= __prctl_check_order(start_data,<=, end_data);  	error |= __prctl_check_order(start_brk, <=, brk);  	error |= __prctl_check_order(arg_start, <=, arg_end);  	error |= __prctl_check_order(env_start, <=, env_end); @@ -1949,24 +1950,6 @@ static int validate_prctl_map(struct prctl_mm_map *prctl_map)  			      prctl_map->start_data))  			goto out; -	/* -	 * Someone is trying to cheat the auxv vector. -	 */ -	if (prctl_map->auxv_size) { -		if (!prctl_map->auxv || prctl_map->auxv_size > sizeof(mm->saved_auxv)) -			goto out; -	} - -	/* -	 * Finally, make sure the caller has the rights to -	 * change /proc/pid/exe link: only local sys admin should -	 * be allowed to. -	 */ -	if (prctl_map->exe_fd != (u32)-1) { -		if (!ns_capable(current_user_ns(), CAP_SYS_ADMIN)) -			goto out; -	} -  	error = 0;  out:  	return error; @@ -1993,11 +1976,18 @@ static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data  	if (copy_from_user(&prctl_map, addr, sizeof(prctl_map)))  		return -EFAULT; -	error = validate_prctl_map(&prctl_map); +	error = validate_prctl_map_addr(&prctl_map);  	if (error)  		return error;  	if (prctl_map.auxv_size) { +		/* +		 * Someone is trying to cheat the auxv vector. 
+		 */ +		if (!prctl_map.auxv || +				prctl_map.auxv_size > sizeof(mm->saved_auxv)) +			return -EINVAL; +  		memset(user_auxv, 0, sizeof(user_auxv));  		if (copy_from_user(user_auxv,  				   (const void __user *)prctl_map.auxv, @@ -2010,6 +2000,14 @@ static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data  	}  	if (prctl_map.exe_fd != (u32)-1) { +		/* +		 * Make sure the caller has the rights to +		 * change /proc/pid/exe link: only local sys admin should +		 * be allowed to. +		 */ +		if (!ns_capable(current_user_ns(), CAP_SYS_ADMIN)) +			return -EINVAL; +  		error = prctl_set_mm_exe_file(mm, prctl_map.exe_fd);  		if (error)  			return error; @@ -2097,7 +2095,11 @@ static int prctl_set_mm(int opt, unsigned long addr,  			unsigned long arg4, unsigned long arg5)  {  	struct mm_struct *mm = current->mm; -	struct prctl_mm_map prctl_map; +	struct prctl_mm_map prctl_map = { +		.auxv = NULL, +		.auxv_size = 0, +		.exe_fd = -1, +	};  	struct vm_area_struct *vma;  	int error; @@ -2125,9 +2127,15 @@ static int prctl_set_mm(int opt, unsigned long addr,  	error = -EINVAL; -	down_write(&mm->mmap_sem); +	/* +	 * arg_lock protects concurent updates of arg boundaries, we need +	 * mmap_sem for a) concurrent sys_brk, b) finding VMA for addr +	 * validation. +	 */ +	down_read(&mm->mmap_sem);  	vma = find_vma(mm, addr); +	spin_lock(&mm->arg_lock);  	prctl_map.start_code	= mm->start_code;  	prctl_map.end_code	= mm->end_code;  	prctl_map.start_data	= mm->start_data; @@ -2139,9 +2147,6 @@ static int prctl_set_mm(int opt, unsigned long addr,  	prctl_map.arg_end	= mm->arg_end;  	prctl_map.env_start	= mm->env_start;  	prctl_map.env_end	= mm->env_end; -	prctl_map.auxv		= NULL; -	prctl_map.auxv_size	= 0; -	prctl_map.exe_fd	= -1;  	switch (opt) {  	case PR_SET_MM_START_CODE: @@ -2181,7 +2186,7 @@ static int prctl_set_mm(int opt, unsigned long addr,  		goto out;  	} -	error = validate_prctl_map(&prctl_map); +	error = validate_prctl_map_addr(&prctl_map);  	if (error)  		goto out; @@ -2218,7 +2223,8 @@ static int prctl_set_mm(int opt, unsigned long addr,  	error = 0;  out: -	up_write(&mm->mmap_sem); +	spin_unlock(&mm->arg_lock); +	up_read(&mm->mmap_sem);  	return error;  } diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index d21f4befaea4..4d9ae5ea6caf 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -167,9 +167,6 @@ COND_SYSCALL(syslog);  /* kernel/sched/core.c */ -/* kernel/signal.c */ -COND_SYSCALL(pidfd_send_signal); -  /* kernel/sys.c */  COND_SYSCALL(setregid);  COND_SYSCALL(setgid); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index e5da394d1ca3..7d1008be6173 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only  /*   * sysctl.c: General linux system control interface   * @@ -66,6 +67,7 @@  #include <linux/kexec.h>  #include <linux/bpf.h>  #include <linux/mount.h> +#include <linux/userfaultfd_k.h>  #include "../lib/kstrtox.h" @@ -128,6 +130,7 @@ static int zero;  static int __maybe_unused one = 1;  static int __maybe_unused two = 2;  static int __maybe_unused four = 4; +static unsigned long zero_ul;  static unsigned long one_ul = 1;  static unsigned long long_max = LONG_MAX;  static int one_hundred = 100; @@ -1719,6 +1722,17 @@ static struct ctl_table vm_table[] = {  		.extra2		= (void *)&mmap_rnd_compat_bits_max,  	},  #endif +#ifdef CONFIG_USERFAULTFD +	{ +		.procname	= "unprivileged_userfaultfd", +		.data		= &sysctl_unprivileged_userfaultfd, +		.maxlen		= sizeof(sysctl_unprivileged_userfaultfd), +		.mode		= 0644, +		
.proc_handler	= proc_dointvec_minmax, +		.extra1		= &zero, +		.extra2		= &one, +	}, +#endif  	{ }  }; @@ -1750,7 +1764,7 @@ static struct ctl_table fs_table[] = {  		.maxlen		= sizeof(files_stat.max_files),  		.mode		= 0644,  		.proc_handler	= proc_doulongvec_minmax, -		.extra1		= &zero, +		.extra1		= &zero_ul,  		.extra2		= &long_max,  	},  	{ @@ -2873,8 +2887,10 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int  			if (neg)  				continue;  			val = convmul * val / convdiv; -			if ((min && val < *min) || (max && val > *max)) -				continue; +			if ((min && val < *min) || (max && val > *max)) { +				err = -EINVAL; +				break; +			}  			*i = val;  		} else {  			val = convdiv * (*i) / convmul; @@ -3157,17 +3173,19 @@ int proc_do_large_bitmap(struct ctl_table *table, int write,  	if (write) {  		char *kbuf, *p; +		size_t skipped = 0; -		if (left > PAGE_SIZE - 1) +		if (left > PAGE_SIZE - 1) {  			left = PAGE_SIZE - 1; +			/* How much of the buffer we'll skip this pass */ +			skipped = *lenp - left; +		}  		p = kbuf = memdup_user_nul(buffer, left);  		if (IS_ERR(kbuf))  			return PTR_ERR(kbuf); -		tmp_bitmap = kcalloc(BITS_TO_LONGS(bitmap_len), -				     sizeof(unsigned long), -				     GFP_KERNEL); +		tmp_bitmap = bitmap_zalloc(bitmap_len, GFP_KERNEL);  		if (!tmp_bitmap) {  			kfree(kbuf);  			return -ENOMEM; @@ -3176,9 +3194,22 @@ int proc_do_large_bitmap(struct ctl_table *table, int write,  		while (!err && left) {  			unsigned long val_a, val_b;  			bool neg; +			size_t saved_left; +			/* In case we stop parsing mid-number, we can reset */ +			saved_left = left;  			err = proc_get_long(&p, &left, &val_a, &neg, tr_a,  					     sizeof(tr_a), &c); +			/* +			 * If we consumed the entirety of a truncated buffer or +			 * only one char is left (may be a "-"), then stop here, +			 * reset, & come back for more. +			 */ +			if ((left <= 1) && skipped) { +				left = saved_left; +				break; +			} +  			if (err)  				break;  			if (val_a >= bitmap_len || neg) { @@ -3196,6 +3227,15 @@ int proc_do_large_bitmap(struct ctl_table *table, int write,  				err = proc_get_long(&p, &left, &val_b,  						     &neg, tr_b, sizeof(tr_b),  						     &c); +				/* +				 * If we consumed all of a truncated buffer or +				 * then stop here, reset, & come back for more. 
+				 */ +				if (!left && skipped) { +					left = saved_left; +					break; +				} +  				if (err)  					break;  				if (val_b >= bitmap_len || neg || @@ -3214,6 +3254,7 @@ int proc_do_large_bitmap(struct ctl_table *table, int write,  			proc_skip_char(&p, &left, '\n');  		}  		kfree(kbuf); +		left += skipped;  	} else {  		unsigned long bit_a, bit_b = 0; @@ -3258,7 +3299,7 @@ int proc_do_large_bitmap(struct ctl_table *table, int write,  		*ppos += *lenp;  	} -	kfree(tmp_bitmap); +	bitmap_free(tmp_bitmap);  	return err;  } @@ -3325,6 +3366,11 @@ int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write,      return -ENOSYS;  } +int proc_do_large_bitmap(struct ctl_table *table, int write, +			 void __user *buffer, size_t *lenp, loff_t *ppos) +{ +	return -ENOSYS; +}  #endif /* CONFIG_PROC_SYSCTL */ @@ -3365,3 +3411,4 @@ EXPORT_SYMBOL(proc_dointvec_ms_jiffies);  EXPORT_SYMBOL(proc_dostring);  EXPORT_SYMBOL(proc_doulongvec_minmax);  EXPORT_SYMBOL(proc_doulongvec_ms_jiffies_minmax); +EXPORT_SYMBOL(proc_do_large_bitmap); diff --git a/kernel/taskstats.c b/kernel/taskstats.c index 4e62a4a8fa91..13a0f2e6ebc2 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -1,19 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0-or-later  /*   * taskstats.c - Export per-task statistics to userland   *   * Copyright (C) Shailabh Nagar, IBM Corp. 2006   *           (C) Balbir Singh,   IBM Corp. 2006 - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the - * GNU General Public License for more details. - *   */  #include <linux/kernel.h> @@ -375,7 +365,7 @@ static struct taskstats *mk_reply(struct sk_buff *skb, int type, u32 pid)  			? 
TASKSTATS_TYPE_AGGR_PID  			: TASKSTATS_TYPE_AGGR_TGID; -	na = nla_nest_start(skb, aggr); +	na = nla_nest_start_noflag(skb, aggr);  	if (!na)  		goto err; @@ -649,17 +639,41 @@ err:  static const struct genl_ops taskstats_ops[] = {  	{  		.cmd		= TASKSTATS_CMD_GET, +		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,  		.doit		= taskstats_user_cmd, -		.policy		= taskstats_cmd_get_policy, -		.flags		= GENL_ADMIN_PERM, +		/* policy enforced later */ +		.flags		= GENL_ADMIN_PERM | GENL_CMD_CAP_HASPOL,  	},  	{  		.cmd		= CGROUPSTATS_CMD_GET, +		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,  		.doit		= cgroupstats_user_cmd, -		.policy		= cgroupstats_cmd_get_policy, +		/* policy enforced later */ +		.flags		= GENL_CMD_CAP_HASPOL,  	},  }; +static int taskstats_pre_doit(const struct genl_ops *ops, struct sk_buff *skb, +			      struct genl_info *info) +{ +	const struct nla_policy *policy = NULL; + +	switch (ops->cmd) { +	case TASKSTATS_CMD_GET: +		policy = taskstats_cmd_get_policy; +		break; +	case CGROUPSTATS_CMD_GET: +		policy = cgroupstats_cmd_get_policy; +		break; +	default: +		return -EINVAL; +	} + +	return nlmsg_validate_deprecated(info->nlhdr, GENL_HDRLEN, +					 TASKSTATS_CMD_ATTR_MAX, policy, +					 info->extack); +} +  static struct genl_family family __ro_after_init = {  	.name		= TASKSTATS_GENL_NAME,  	.version	= TASKSTATS_GENL_VERSION, @@ -667,6 +681,7 @@ static struct genl_family family __ro_after_init = {  	.module		= THIS_MODULE,  	.ops		= taskstats_ops,  	.n_ops		= ARRAY_SIZE(taskstats_ops), +	.pre_doit	= taskstats_pre_doit,  };  /* Needed early in initialization */ diff --git a/kernel/test_kprobes.c b/kernel/test_kprobes.c index 7bca480151b0..76c997fdbc9d 100644 --- a/kernel/test_kprobes.c +++ b/kernel/test_kprobes.c @@ -1,17 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0-or-later  /*   * test_kprobes.c - simple sanity test for *probes   *   * Copyright IBM Corp. 2008 - * - * This program is free software;  you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it would be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See - * the GNU General Public License for more details.   
*/  #define pr_fmt(fmt) "Kprobe smoke test: " fmt diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index e2c038d6c13c..fcc42353f125 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0-only  #  # Timer subsystem related configuration options  # diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 2c97e8c2d29f..0519a8805aab 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -594,7 +594,7 @@ static ktime_t alarm_timer_remaining(struct k_itimer *timr, ktime_t now)  {  	struct alarm *alarm = &timr->it.alarm.alarmtimer; -	return ktime_sub(now, alarm->node.expires); +	return ktime_sub(alarm->node.expires, now);  }  /** diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 5e77662dd2d9..f5490222e134 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -611,6 +611,22 @@ void clockevents_resume(void)  }  #ifdef CONFIG_HOTPLUG_CPU + +# ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST +/** + * tick_offline_cpu - Take CPU out of the broadcast mechanism + * @cpu:	The outgoing CPU + * + * Called on the outgoing CPU after it took itself offline. + */ +void tick_offline_cpu(unsigned int cpu) +{ +	raw_spin_lock(&clockevents_lock); +	tick_broadcast_offline(cpu); +	raw_spin_unlock(&clockevents_lock); +} +# endif +  /**   * tick_cleanup_dead_cpu - Cleanup the tick and clockevents of a dead cpu   */ @@ -621,8 +637,6 @@ void tick_cleanup_dead_cpu(int cpu)  	raw_spin_lock_irqsave(&clockevents_lock, flags); -	tick_shutdown_broadcast_oneshot(cpu); -	tick_shutdown_broadcast(cpu);  	tick_shutdown(cpu);  	/*  	 * Unregister the clock event devices which were diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c index dc1b6f1929f9..d23b434c2ca7 100644 --- a/kernel/time/jiffies.c +++ b/kernel/time/jiffies.c @@ -63,7 +63,7 @@ __cacheline_aligned_in_smp DEFINE_SEQLOCK(jiffies_lock);  #if (BITS_PER_LONG < 64)  u64 get_jiffies_64(void)  { -	unsigned long seq; +	unsigned int seq;  	u64 ret;  	do { @@ -89,7 +89,7 @@ struct clocksource * __init __weak clocksource_default_clock(void)  	return &clocksource_jiffies;  } -struct clocksource refined_jiffies; +static struct clocksource refined_jiffies;  int register_refined_jiffies(long cycles_per_second)  { diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 92a90014a925..8de4f789dc1b 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -17,6 +17,7 @@  #include <linux/mm.h>  #include <linux/module.h>  #include <linux/rtc.h> +#include <linux/audit.h>  #include "ntp_internal.h"  #include "timekeeping_internal.h" @@ -690,7 +691,7 @@ static inline void process_adjtimex_modes(const struct __kernel_timex *txc,  		time_constant = max(time_constant, 0l);  	} -	if (txc->modes & ADJ_TAI && txc->constant > 0) +	if (txc->modes & ADJ_TAI && txc->constant >= 0)  		*time_tai = txc->constant;  	if (txc->modes & ADJ_OFFSET) @@ -709,7 +710,7 @@ static inline void process_adjtimex_modes(const struct __kernel_timex *txc,   * kernel time-keeping variables. used by xntpd.   
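
[Illustrative sketch, not part of the patch] The audit plumbing threaded through __do_adjtimex() below records NTP parameter changes made via adjtimex(2). For reference, a minimal userspace sketch of the interface being audited; this is a read-only query (modes == 0), whereas setting modes typically requires CAP_SYS_TIME.

        #include <stdio.h>
        #include <sys/timex.h>

        int main(void)
        {
                struct timex tx = { .modes = 0 };   /* no modifications: query only */
                int state = adjtimex(&tx);          /* clock state, e.g. TIME_OK, or -1 */

                if (state < 0) {
                        perror("adjtimex");
                        return 1;
                }
                printf("offset=%ld freq=%ld status=%d tai=%d\n",
                       tx.offset, tx.freq, tx.status, tx.tai);
                return 0;
        }
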
*/  int __do_adjtimex(struct __kernel_timex *txc, const struct timespec64 *ts, -		  s32 *time_tai) +		  s32 *time_tai, struct audit_ntp_data *ad)  {  	int result; @@ -720,14 +721,29 @@ int __do_adjtimex(struct __kernel_timex *txc, const struct timespec64 *ts,  			/* adjtime() is independent from ntp_adjtime() */  			time_adjust = txc->offset;  			ntp_update_frequency(); + +			audit_ntp_set_old(ad, AUDIT_NTP_ADJUST,	save_adjust); +			audit_ntp_set_new(ad, AUDIT_NTP_ADJUST,	time_adjust);  		}  		txc->offset = save_adjust;  	} else { -  		/* If there are input parameters, then process them: */ -		if (txc->modes) +		if (txc->modes) { +			audit_ntp_set_old(ad, AUDIT_NTP_OFFSET,	time_offset); +			audit_ntp_set_old(ad, AUDIT_NTP_FREQ,	time_freq); +			audit_ntp_set_old(ad, AUDIT_NTP_STATUS,	time_status); +			audit_ntp_set_old(ad, AUDIT_NTP_TAI,	*time_tai); +			audit_ntp_set_old(ad, AUDIT_NTP_TICK,	tick_usec); +  			process_adjtimex_modes(txc, time_tai); +			audit_ntp_set_new(ad, AUDIT_NTP_OFFSET,	time_offset); +			audit_ntp_set_new(ad, AUDIT_NTP_FREQ,	time_freq); +			audit_ntp_set_new(ad, AUDIT_NTP_STATUS,	time_status); +			audit_ntp_set_new(ad, AUDIT_NTP_TAI,	*time_tai); +			audit_ntp_set_new(ad, AUDIT_NTP_TICK,	tick_usec); +		} +  		txc->offset = shift_right(time_offset * NTP_INTERVAL_FREQ,  				  NTP_SCALE_SHIFT);  		if (!(time_status & STA_NANO)) diff --git a/kernel/time/ntp_internal.h b/kernel/time/ntp_internal.h index 40e6122e634e..908ecaa65fc3 100644 --- a/kernel/time/ntp_internal.h +++ b/kernel/time/ntp_internal.h @@ -8,6 +8,8 @@ extern void ntp_clear(void);  extern u64 ntp_tick_length(void);  extern ktime_t ntp_get_next_leap(void);  extern int second_overflow(time64_t secs); -extern int __do_adjtimex(struct __kernel_timex *txc, const struct timespec64 *ts, s32 *time_tai); +extern int __do_adjtimex(struct __kernel_timex *txc, +			 const struct timespec64 *ts, +			 s32 *time_tai, struct audit_ntp_data *ad);  extern void __hardpps(const struct timespec64 *phase_ts, const struct timespec64 *raw_ts);  #endif /* _LINUX_NTP_INTERNAL_H */ diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c index 094b82ca95e5..142b07619918 100644 --- a/kernel/time/sched_clock.c +++ b/kernel/time/sched_clock.c @@ -94,7 +94,7 @@ static inline u64 notrace cyc_to_ns(u64 cyc, u32 mult, u32 shift)  unsigned long long notrace sched_clock(void)  {  	u64 cyc, res; -	unsigned long seq; +	unsigned int seq;  	struct clock_read_data *rd;  	do { @@ -231,7 +231,7 @@ sched_clock_register(u64 (*read)(void), int bits, unsigned long rate)  	if (irqtime > 0 || (irqtime == -1 && rate >= 1000000))  		enable_sched_clock_irqtime(); -	pr_debug("Registered %pF as sched_clock source\n", read); +	pr_debug("Registered %pS as sched_clock source\n", read);  }  void __init generic_sched_clock_init(void) @@ -267,12 +267,12 @@ void __init generic_sched_clock_init(void)   */  static u64 notrace suspended_sched_clock_read(void)  { -	unsigned long seq = raw_read_seqcount(&cd.seq); +	unsigned int seq = raw_read_seqcount(&cd.seq);  	return cd.read_data[seq & 1].epoch_cyc;  } -static int sched_clock_suspend(void) +int sched_clock_suspend(void)  {  	struct clock_read_data *rd = &cd.read_data[0]; @@ -283,7 +283,7 @@ static int sched_clock_suspend(void)  	return 0;  } -static void sched_clock_resume(void) +void sched_clock_resume(void)  {  	struct clock_read_data *rd = &cd.read_data[0]; diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index ee834d4fb814..e51778c312f1 100644 --- a/kernel/time/tick-broadcast.c +++ 
b/kernel/time/tick-broadcast.c @@ -36,10 +36,16 @@ static __cacheline_aligned_in_smp DEFINE_RAW_SPINLOCK(tick_broadcast_lock);  static void tick_broadcast_setup_oneshot(struct clock_event_device *bc);  static void tick_broadcast_clear_oneshot(int cpu);  static void tick_resume_broadcast_oneshot(struct clock_event_device *bc); +# ifdef CONFIG_HOTPLUG_CPU +static void tick_broadcast_oneshot_offline(unsigned int cpu); +# endif  #else  static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc) { BUG(); }  static inline void tick_broadcast_clear_oneshot(int cpu) { }  static inline void tick_resume_broadcast_oneshot(struct clock_event_device *bc) { } +# ifdef CONFIG_HOTPLUG_CPU +static inline void tick_broadcast_oneshot_offline(unsigned int cpu) { } +# endif  #endif  /* @@ -433,27 +439,29 @@ void tick_set_periodic_handler(struct clock_event_device *dev, int broadcast)  }  #ifdef CONFIG_HOTPLUG_CPU -/* - * Remove a CPU from broadcasting - */ -void tick_shutdown_broadcast(unsigned int cpu) +static void tick_shutdown_broadcast(void)  { -	struct clock_event_device *bc; -	unsigned long flags; - -	raw_spin_lock_irqsave(&tick_broadcast_lock, flags); - -	bc = tick_broadcast_device.evtdev; -	cpumask_clear_cpu(cpu, tick_broadcast_mask); -	cpumask_clear_cpu(cpu, tick_broadcast_on); +	struct clock_event_device *bc = tick_broadcast_device.evtdev;  	if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) {  		if (bc && cpumask_empty(tick_broadcast_mask))  			clockevents_shutdown(bc);  	} +} -	raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); +/* + * Remove a CPU from broadcasting + */ +void tick_broadcast_offline(unsigned int cpu) +{ +	raw_spin_lock(&tick_broadcast_lock); +	cpumask_clear_cpu(cpu, tick_broadcast_mask); +	cpumask_clear_cpu(cpu, tick_broadcast_on); +	tick_broadcast_oneshot_offline(cpu); +	tick_shutdown_broadcast(); +	raw_spin_unlock(&tick_broadcast_lock);  } +  #endif  void tick_suspend_broadcast(void) @@ -801,13 +809,13 @@ int __tick_broadcast_oneshot_control(enum tick_broadcast_state state)  			 * either the CPU handling the broadcast  			 * interrupt or we got woken by something else.  			 * -			 * We are not longer in the broadcast mask, so +			 * We are no longer in the broadcast mask, so  			 * if the cpu local expiry time is already  			 * reached, we would reprogram the cpu local  			 * timer with an already expired event.  			 *  			 * This can lead to a ping-pong when we return -			 * to idle and therefor rearm the broadcast +			 * to idle and therefore rearm the broadcast  			 * timer before the cpu local timer was able  			 * to fire. This happens because the forced  			 * reprogramming makes sure that the event @@ -950,14 +958,10 @@ void hotplug_cpu__broadcast_tick_pull(int deadcpu)  }  /* - * Remove a dead CPU from broadcasting + * Remove a dying CPU from broadcasting   */ -void tick_shutdown_broadcast_oneshot(unsigned int cpu) +static void tick_broadcast_oneshot_offline(unsigned int cpu)  { -	unsigned long flags; - -	raw_spin_lock_irqsave(&tick_broadcast_lock, flags); -  	/*  	 * Clear the broadcast masks for the dead cpu, but do not stop  	 * the broadcast device! 
@@ -965,8 +969,6 @@ void tick_shutdown_broadcast_oneshot(unsigned int cpu)  	cpumask_clear_cpu(cpu, tick_broadcast_oneshot_mask);  	cpumask_clear_cpu(cpu, tick_broadcast_pending_mask);  	cpumask_clear_cpu(cpu, tick_broadcast_force_mask); - -	raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);  }  #endif diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index 529143b4c8d2..59225b484e4e 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -46,6 +46,14 @@ ktime_t tick_period;   *    procedure also covers cpu hotplug.   */  int tick_do_timer_cpu __read_mostly = TICK_DO_TIMER_BOOT; +#ifdef CONFIG_NO_HZ_FULL +/* + * tick_do_timer_boot_cpu indicates the boot CPU temporarily owns + * tick_do_timer_cpu and it should be taken over by an eligible secondary + * when one comes online. + */ +static int tick_do_timer_boot_cpu __read_mostly = -1; +#endif  /*   * Debugging: see timer_list.c @@ -149,7 +157,7 @@ void tick_setup_periodic(struct clock_event_device *dev, int broadcast)  	    !tick_broadcast_oneshot_active()) {  		clockevents_switch_state(dev, CLOCK_EVT_STATE_PERIODIC);  	} else { -		unsigned long seq; +		unsigned int seq;  		ktime_t next;  		do { @@ -167,6 +175,26 @@ void tick_setup_periodic(struct clock_event_device *dev, int broadcast)  	}  } +#ifdef CONFIG_NO_HZ_FULL +static void giveup_do_timer(void *info) +{ +	int cpu = *(unsigned int *)info; + +	WARN_ON(tick_do_timer_cpu != smp_processor_id()); + +	tick_do_timer_cpu = cpu; +} + +static void tick_take_do_timer_from_boot(void) +{ +	int cpu = smp_processor_id(); +	int from = tick_do_timer_boot_cpu; + +	if (from >= 0 && from != cpu) +		smp_call_function_single(from, giveup_do_timer, &cpu, 1); +} +#endif +  /*   * Setup the tick device   */ @@ -186,12 +214,26 @@ static void tick_setup_device(struct tick_device *td,  		 * this cpu:  		 */  		if (tick_do_timer_cpu == TICK_DO_TIMER_BOOT) { -			if (!tick_nohz_full_cpu(cpu)) -				tick_do_timer_cpu = cpu; -			else -				tick_do_timer_cpu = TICK_DO_TIMER_NONE; +			tick_do_timer_cpu = cpu; +  			tick_next_period = ktime_get();  			tick_period = NSEC_PER_SEC / HZ; +#ifdef CONFIG_NO_HZ_FULL +			/* +			 * The boot CPU may be nohz_full, in which case set +			 * tick_do_timer_boot_cpu so the first housekeeping +			 * secondary that comes up will take do_timer from +			 * us. 
+			 */ +			if (tick_nohz_full_cpu(cpu)) +				tick_do_timer_boot_cpu = cpu; + +		} else if (tick_do_timer_boot_cpu != -1 && +						!tick_nohz_full_cpu(cpu)) { +			tick_take_do_timer_from_boot(); +			tick_do_timer_boot_cpu = -1; +			WARN_ON(tick_do_timer_cpu != cpu); +#endif  		}  		/* @@ -487,6 +529,7 @@ void tick_freeze(void)  		trace_suspend_resume(TPS("timekeeping_freeze"),  				     smp_processor_id(), true);  		system_state = SYSTEM_SUSPEND; +		sched_clock_suspend();  		timekeeping_suspend();  	} else {  		tick_suspend_local(); @@ -510,6 +553,7 @@ void tick_unfreeze(void)  	if (tick_freeze_depth == num_online_cpus()) {  		timekeeping_resume(); +		sched_clock_resume();  		system_state = SYSTEM_RUNNING;  		trace_suspend_resume(TPS("timekeeping_freeze"),  				     smp_processor_id(), false); diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index e277284c2831..7b2496136729 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -64,7 +64,6 @@ extern ssize_t sysfs_get_uname(const char *buf, char *dst, size_t cnt);  extern int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu);  extern void tick_install_broadcast_device(struct clock_event_device *dev);  extern int tick_is_broadcast_device(struct clock_event_device *dev); -extern void tick_shutdown_broadcast(unsigned int cpu);  extern void tick_suspend_broadcast(void);  extern void tick_resume_broadcast(void);  extern bool tick_resume_check_broadcast(void); @@ -78,7 +77,6 @@ static inline void tick_install_broadcast_device(struct clock_event_device *dev)  static inline int tick_is_broadcast_device(struct clock_event_device *dev) { return 0; }  static inline int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu) { return 0; }  static inline void tick_do_periodic_broadcast(struct clock_event_device *d) { } -static inline void tick_shutdown_broadcast(unsigned int cpu) { }  static inline void tick_suspend_broadcast(void) { }  static inline void tick_resume_broadcast(void) { }  static inline bool tick_resume_check_broadcast(void) { return false; } @@ -128,19 +126,23 @@ static inline int tick_check_oneshot_change(int allow_nohz) { return 0; }  /* Functions related to oneshot broadcasting */  #if defined(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST) && defined(CONFIG_TICK_ONESHOT)  extern void tick_broadcast_switch_to_oneshot(void); -extern void tick_shutdown_broadcast_oneshot(unsigned int cpu);  extern int tick_broadcast_oneshot_active(void);  extern void tick_check_oneshot_broadcast_this_cpu(void);  bool tick_broadcast_oneshot_available(void);  extern struct cpumask *tick_get_broadcast_oneshot_mask(void);  #else /* !(BROADCAST && ONESHOT): */  static inline void tick_broadcast_switch_to_oneshot(void) { } -static inline void tick_shutdown_broadcast_oneshot(unsigned int cpu) { }  static inline int tick_broadcast_oneshot_active(void) { return 0; }  static inline void tick_check_oneshot_broadcast_this_cpu(void) { }  static inline bool tick_broadcast_oneshot_available(void) { return tick_oneshot_possible(); }  #endif /* !(BROADCAST && ONESHOT) */ +#if defined(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST) && defined(CONFIG_HOTPLUG_CPU) +extern void tick_broadcast_offline(unsigned int cpu); +#else +static inline void tick_broadcast_offline(unsigned int cpu) { } +#endif +  /* NO_HZ_FULL internal */  #ifdef CONFIG_NO_HZ_FULL  extern void tick_nohz_init(void); diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 6fa52cd6df0b..f4ee1a3428ae 100644 --- a/kernel/time/tick-sched.c 
+++ b/kernel/time/tick-sched.c @@ -121,10 +121,16 @@ static void tick_sched_do_timer(struct tick_sched *ts, ktime_t now)  	 * into a long sleep. If two CPUs happen to assign themselves to  	 * this duty, then the jiffies update is still serialized by  	 * jiffies_lock. +	 * +	 * If nohz_full is enabled, this should not happen because the +	 * tick_do_timer_cpu never relinquishes.  	 */ -	if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE) -	    && !tick_nohz_full_cpu(cpu)) +	if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE)) { +#ifdef CONFIG_NO_HZ_FULL +		WARN_ON(tick_nohz_full_running); +#endif  		tick_do_timer_cpu = cpu; +	}  #endif  	/* Check, if the jiffies need an update */ @@ -395,8 +401,8 @@ void __init tick_nohz_full_setup(cpumask_var_t cpumask)  static int tick_nohz_cpu_down(unsigned int cpu)  {  	/* -	 * The boot CPU handles housekeeping duty (unbound timers, -	 * workqueues, timekeeping, ...) on behalf of full dynticks +	 * The tick_do_timer_cpu CPU handles housekeeping duty (unbound +	 * timers, workqueues, timekeeping, ...) on behalf of full dynticks  	 * CPUs. It must remain online when nohz full is enabled.  	 */  	if (tick_nohz_full_running && tick_do_timer_cpu == cpu) @@ -423,12 +429,15 @@ void __init tick_nohz_init(void)  		return;  	} -	cpu = smp_processor_id(); +	if (IS_ENABLED(CONFIG_PM_SLEEP_SMP) && +			!IS_ENABLED(CONFIG_PM_SLEEP_SMP_NONZERO_CPU)) { +		cpu = smp_processor_id(); -	if (cpumask_test_cpu(cpu, tick_nohz_full_mask)) { -		pr_warn("NO_HZ: Clearing %d from nohz_full range for timekeeping\n", -			cpu); -		cpumask_clear_cpu(cpu, tick_nohz_full_mask); +		if (cpumask_test_cpu(cpu, tick_nohz_full_mask)) { +			pr_warn("NO_HZ: Clearing %d from nohz_full range " +				"for timekeeping\n", cpu); +			cpumask_clear_cpu(cpu, tick_nohz_full_mask); +		}  	}  	for_each_cpu(cpu, tick_nohz_full_mask) @@ -645,7 +654,8 @@ static inline bool local_timer_softirq_pending(void)  static ktime_t tick_nohz_next_event(struct tick_sched *ts, int cpu)  {  	u64 basemono, next_tick, next_tmr, next_rcu, delta, expires; -	unsigned long seq, basejiff; +	unsigned long basejiff; +	unsigned int seq;  	/* Read jiffies and the time when jiffies were updated last */  	do { @@ -904,8 +914,13 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts)  		/*  		 * Boot safety: make sure the timekeeping duty has been  		 * assigned before entering dyntick-idle mode, +		 * tick_do_timer_cpu is TICK_DO_TIMER_BOOT  		 */ -		if (tick_do_timer_cpu == TICK_DO_TIMER_NONE) +		if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_BOOT)) +			return false; + +		/* Should not happen for nohz-full */ +		if (WARN_ON_ONCE(tick_do_timer_cpu == TICK_DO_TIMER_NONE))  			return false;  	} @@ -1023,6 +1038,18 @@ bool tick_nohz_idle_got_tick(void)  }  /** + * tick_nohz_get_next_hrtimer - return the next expiration time for the hrtimer + * or the tick, whatever that expires first. Note that, if the tick has been + * stopped, it returns the next hrtimer. 
+ * + * Called from power state control code with interrupts disabled + */ +ktime_t tick_nohz_get_next_hrtimer(void) +{ +	return __this_cpu_read(tick_cpu_device.evtdev)->next_event; +} + +/**   * tick_nohz_get_sleep_length - return the expected length of the current sleep   * @delta_next: duration until the next event if the tick cannot be stopped   * diff --git a/kernel/time/tick-sched.h b/kernel/time/tick-sched.h index 6de959a854b2..4fb06527cf64 100644 --- a/kernel/time/tick-sched.h +++ b/kernel/time/tick-sched.h @@ -24,12 +24,19 @@ enum tick_nohz_mode {   * struct tick_sched - sched tick emulation and no idle tick control/stats   * @sched_timer:	hrtimer to schedule the periodic tick in high   *			resolution mode + * @check_clocks:	Notification mechanism about clocksource changes + * @nohz_mode:		Mode - one state of tick_nohz_mode + * @inidle:		Indicator that the CPU is in the tick idle mode + * @tick_stopped:	Indicator that the idle tick has been stopped + * @idle_active:	Indicator that the CPU is actively in the tick idle mode; + *			it is resetted during irq handling phases. + * @do_timer_lst:	CPU was the last one doing do_timer before going idle + * @got_idle_tick:	Tick timer function has run with @inidle set   * @last_tick:		Store the last tick expiry time when the tick   *			timer is modified for nohz sleeps. This is necessary   *			to resume the tick timer operation in the timeline   *			when the CPU returns from nohz sleep.   * @next_tick:		Next tick to be fired when in dynticks mode. - * @tick_stopped:	Indicator that the idle tick has been stopped   * @idle_jiffies:	jiffies at the entry to idle for idle time accounting   * @idle_calls:		Total number of idle calls   * @idle_sleeps:	Number of idle calls, where the sched tick was stopped @@ -40,8 +47,8 @@ enum tick_nohz_mode {   * @iowait_sleeptime:	Sum of the time slept in idle with sched tick stopped, with IO outstanding   * @timer_expires:	Anticipated timer expiration time (in case sched tick is stopped)   * @timer_expires_base:	Base time clock monotonic for @timer_expires - * @do_timer_lst:	CPU was the last one doing do_timer before going idle - * @got_idle_tick:	Tick timer function has run with @inidle set + * @next_timer:		Expiry time of next expiring timer for debugging purpose only + * @tick_dep_mask:	Tick dependency mask - is set, if someone needs the tick   */  struct tick_sched {  	struct hrtimer			sched_timer; diff --git a/kernel/time/time.c b/kernel/time/time.c index c3f756f8534b..7f7d6914ddd5 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -171,7 +171,7 @@ int do_sys_settimeofday64(const struct timespec64 *tv, const struct timezone *tz  	static int firsttime = 1;  	int error = 0; -	if (tv && !timespec64_valid(tv)) +	if (tv && !timespec64_valid_settod(tv))  		return -EINVAL;  	error = security_settime64(tv, tz); @@ -783,6 +783,16 @@ u64 jiffies64_to_nsecs(u64 j)  }  EXPORT_SYMBOL(jiffies64_to_nsecs); +u64 jiffies64_to_msecs(const u64 j) +{ +#if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ) +	return (MSEC_PER_SEC / HZ) * j; +#else +	return div_u64(j * HZ_TO_MSEC_NUM, HZ_TO_MSEC_DEN); +#endif +} +EXPORT_SYMBOL(jiffies64_to_msecs); +  /**   * nsecs_to_jiffies64 - Convert nsecs in u64 to jiffies64   * diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index f986e1918d12..85f5912d8f70 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -21,6 +21,7 @@  #include <linux/stop_machine.h>  #include <linux/pvclock_gtod.h>  #include <linux/compiler.h> +#include <linux/audit.h>  
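
[Illustrative sketch, not part of the patch] The jiffies64_to_msecs() helper added in kernel/time/time.c above mirrors jiffies_to_msecs() for 64-bit jiffies counts. A one-line in-kernel sketch of its intended use; the wrapper function name is hypothetical.

        #include <linux/jiffies.h>

        /* Convert the current 64-bit jiffies count to milliseconds. */
        static u64 jiffies_now_ms(void)
        {
                return jiffies64_to_msecs(get_jiffies_64());
        }
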
#include "tick-internal.h"  #include "ntp_internal.h" @@ -720,7 +721,7 @@ static void timekeeping_forward_now(struct timekeeper *tk)  void ktime_get_real_ts64(struct timespec64 *ts)  {  	struct timekeeper *tk = &tk_core.timekeeper; -	unsigned long seq; +	unsigned int seq;  	u64 nsecs;  	WARN_ON(timekeeping_suspended); @@ -829,7 +830,7 @@ EXPORT_SYMBOL_GPL(ktime_get_coarse_with_offset);  ktime_t ktime_mono_to_any(ktime_t tmono, enum tk_offsets offs)  {  	ktime_t *offset = offsets[offs]; -	unsigned long seq; +	unsigned int seq;  	ktime_t tconv;  	do { @@ -960,7 +961,7 @@ time64_t __ktime_get_real_seconds(void)  void ktime_get_snapshot(struct system_time_snapshot *systime_snapshot)  {  	struct timekeeper *tk = &tk_core.timekeeper; -	unsigned long seq; +	unsigned int seq;  	ktime_t base_raw;  	ktime_t base_real;  	u64 nsec_raw; @@ -1122,7 +1123,7 @@ int get_device_system_crosststamp(int (*get_time_fn)  	ktime_t base_real, base_raw;  	u64 nsec_real, nsec_raw;  	u8 cs_was_changed_seq; -	unsigned long seq; +	unsigned int seq;  	bool do_interp;  	int ret; @@ -1221,7 +1222,7 @@ int do_settimeofday64(const struct timespec64 *ts)  	unsigned long flags;  	int ret = 0; -	if (!timespec64_valid_strict(ts)) +	if (!timespec64_valid_settod(ts))  		return -EINVAL;  	raw_spin_lock_irqsave(&timekeeper_lock, flags); @@ -1250,6 +1251,9 @@ out:  	/* signal hrtimers about time change */  	clock_was_set(); +	if (!ret) +		audit_tk_injoffset(ts_delta); +  	return ret;  }  EXPORT_SYMBOL(do_settimeofday64); @@ -1278,7 +1282,7 @@ static int timekeeping_inject_offset(const struct timespec64 *ts)  	/* Make sure the proposed value is valid */  	tmp = timespec64_add(tk_xtime(tk), *ts);  	if (timespec64_compare(&tk->wall_to_monotonic, ts) > 0 || -	    !timespec64_valid_strict(&tmp)) { +	    !timespec64_valid_settod(&tmp)) {  		ret = -EINVAL;  		goto error;  	} @@ -1409,7 +1413,7 @@ int timekeeping_notify(struct clocksource *clock)  void ktime_get_raw_ts64(struct timespec64 *ts)  {  	struct timekeeper *tk = &tk_core.timekeeper; -	unsigned long seq; +	unsigned int seq;  	u64 nsecs;  	do { @@ -1431,7 +1435,7 @@ EXPORT_SYMBOL(ktime_get_raw_ts64);  int timekeeping_valid_for_hres(void)  {  	struct timekeeper *tk = &tk_core.timekeeper; -	unsigned long seq; +	unsigned int seq;  	int ret;  	do { @@ -1450,7 +1454,7 @@ int timekeeping_valid_for_hres(void)  u64 timekeeping_max_deferment(void)  {  	struct timekeeper *tk = &tk_core.timekeeper; -	unsigned long seq; +	unsigned int seq;  	u64 ret;  	do { @@ -1527,7 +1531,7 @@ void __init timekeeping_init(void)  	unsigned long flags;  	read_persistent_wall_and_boot_offset(&wall_time, &boot_offset); -	if (timespec64_valid_strict(&wall_time) && +	if (timespec64_valid_settod(&wall_time) &&  	    timespec64_to_ns(&wall_time) > 0) {  		persistent_clock_exists = true;  	} else if (timespec64_to_ns(&wall_time) != 0) { @@ -2150,7 +2154,7 @@ EXPORT_SYMBOL_GPL(getboottime64);  void ktime_get_coarse_real_ts64(struct timespec64 *ts)  {  	struct timekeeper *tk = &tk_core.timekeeper; -	unsigned long seq; +	unsigned int seq;  	do {  		seq = read_seqcount_begin(&tk_core.seq); @@ -2164,7 +2168,7 @@ void ktime_get_coarse_ts64(struct timespec64 *ts)  {  	struct timekeeper *tk = &tk_core.timekeeper;  	struct timespec64 now, mono; -	unsigned long seq; +	unsigned int seq;  	do {  		seq = read_seqcount_begin(&tk_core.seq); @@ -2303,6 +2307,7 @@ static int timekeeping_validate_timex(const struct __kernel_timex *txc)  int do_adjtimex(struct __kernel_timex *txc)  {  	struct timekeeper *tk = &tk_core.timekeeper; +	
struct audit_ntp_data ad;  	unsigned long flags;  	struct timespec64 ts;  	s32 orig_tai, tai; @@ -2322,15 +2327,19 @@ int do_adjtimex(struct __kernel_timex *txc)  		ret = timekeeping_inject_offset(&delta);  		if (ret)  			return ret; + +		audit_tk_injoffset(delta);  	} +	audit_ntp_init(&ad); +  	ktime_get_real_ts64(&ts);  	raw_spin_lock_irqsave(&timekeeper_lock, flags);  	write_seqcount_begin(&tk_core.seq);  	orig_tai = tai = tk->tai_offset; -	ret = __do_adjtimex(txc, &ts, &tai); +	ret = __do_adjtimex(txc, &ts, &tai, &ad);  	if (tai != orig_tai) {  		__timekeeping_set_tai_offset(tk, tai); @@ -2341,6 +2350,8 @@ int do_adjtimex(struct __kernel_timex *txc)  	write_seqcount_end(&tk_core.seq);  	raw_spin_unlock_irqrestore(&timekeeper_lock, flags); +	audit_ntp_log(&ad); +  	/* Update the multiplier immediately if frequency was set directly */  	if (txc->modes & (ADJ_FREQUENCY | ADJ_TICK))  		timekeeping_advance(TK_ADV_FREQ); diff --git a/kernel/time/timekeeping.h b/kernel/time/timekeeping.h index 7a9b4eb7a1d5..141ab3ab0354 100644 --- a/kernel/time/timekeeping.h +++ b/kernel/time/timekeeping.h @@ -14,6 +14,13 @@ extern u64 timekeeping_max_deferment(void);  extern void timekeeping_warp_clock(void);  extern int timekeeping_suspend(void);  extern void timekeeping_resume(void); +#ifdef CONFIG_GENERIC_SCHED_CLOCK +extern int sched_clock_suspend(void); +extern void sched_clock_resume(void); +#else +static inline int sched_clock_suspend(void) { return 0; } +static inline void sched_clock_resume(void) { } +#endif  extern void do_timer(unsigned long ticks);  extern void update_wall_time(void); diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 2fce056f8a49..343c7ba33b1c 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -536,6 +536,8 @@ static void enqueue_timer(struct timer_base *base, struct timer_list *timer,  	hlist_add_head(&timer->entry, base->vectors + idx);  	__set_bit(idx, base->pending_map);  	timer_set_idx(timer, idx); + +	trace_timer_start(timer, timer->expires, timer->flags);  }  static void @@ -757,13 +759,6 @@ static inline void debug_init(struct timer_list *timer)  	trace_timer_init(timer);  } -static inline void -debug_activate(struct timer_list *timer, unsigned long expires) -{ -	debug_timer_activate(timer); -	trace_timer_start(timer, expires, timer->flags); -} -  static inline void debug_deactivate(struct timer_list *timer)  {  	debug_timer_deactivate(timer); @@ -1037,7 +1032,7 @@ __mod_timer(struct timer_list *timer, unsigned long expires, unsigned int option  		}  	} -	debug_activate(timer, expires); +	debug_timer_activate(timer);  	timer->expires = expires;  	/* @@ -1171,7 +1166,7 @@ void add_timer_on(struct timer_list *timer, int cpu)  	}  	forward_timer_base(base); -	debug_activate(timer, timer->expires); +	debug_timer_activate(timer);  	internal_add_timer(base, timer);  	raw_spin_unlock_irqrestore(&base->lock, flags);  } @@ -1298,7 +1293,9 @@ int del_timer_sync(struct timer_list *timer)  EXPORT_SYMBOL(del_timer_sync);  #endif -static void call_timer_fn(struct timer_list *timer, void (*fn)(struct timer_list *)) +static void call_timer_fn(struct timer_list *timer, +			  void (*fn)(struct timer_list *), +			  unsigned long baseclk)  {  	int count = preempt_count(); @@ -1321,14 +1318,14 @@ static void call_timer_fn(struct timer_list *timer, void (*fn)(struct timer_list  	 */  	lock_map_acquire(&lockdep_map); -	trace_timer_expire_entry(timer); +	trace_timer_expire_entry(timer, baseclk);  	fn(timer);  	trace_timer_expire_exit(timer);  	
lock_map_release(&lockdep_map);  	if (count != preempt_count()) { -		WARN_ONCE(1, "timer: %pF preempt leak: %08x -> %08x\n", +		WARN_ONCE(1, "timer: %pS preempt leak: %08x -> %08x\n",  			  fn, count, preempt_count());  		/*  		 * Restore the preempt count. That gives us a decent @@ -1342,6 +1339,13 @@ static void call_timer_fn(struct timer_list *timer, void (*fn)(struct timer_list  static void expire_timers(struct timer_base *base, struct hlist_head *head)  { +	/* +	 * This value is required only for tracing. base->clk was +	 * incremented directly before expire_timers was called. But expiry +	 * is related to the old base->clk value. +	 */ +	unsigned long baseclk = base->clk - 1; +  	while (!hlist_empty(head)) {  		struct timer_list *timer;  		void (*fn)(struct timer_list *); @@ -1355,11 +1359,11 @@ static void expire_timers(struct timer_base *base, struct hlist_head *head)  		if (timer->flags & TIMER_IRQSAFE) {  			raw_spin_unlock(&base->lock); -			call_timer_fn(timer, fn); +			call_timer_fn(timer, fn, baseclk);  			raw_spin_lock(&base->lock);  		} else {  			raw_spin_unlock_irq(&base->lock); -			call_timer_fn(timer, fn); +			call_timer_fn(timer, fn, baseclk);  			raw_spin_lock_irq(&base->lock);  		}  	} diff --git a/kernel/torture.c b/kernel/torture.c index 8faa1a9aaeb9..17b2be9bde12 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -88,6 +88,8 @@ bool torture_offline(int cpu, long *n_offl_attempts, long *n_offl_successes,  	if (!cpu_online(cpu) || !cpu_is_hotpluggable(cpu))  		return false; +	if (num_online_cpus() <= 1) +		return false;  /* Can't offline the last CPU. */  	if (verbose > 1)  		pr_alert("%s" TORTURE_FLAG diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 8bd1d6d001d7..564e5fdb025f 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0-only  #  # Architectures that offer an FUNCTION_TRACER implementation should  #  select HAVE_FUNCTION_TRACER: @@ -774,13 +775,6 @@ config TRACE_EVAL_MAP_FILE  	If unsure, say N -config TRACING_EVENTS_GPIO -	bool "Trace gpio events" -	depends on GPIOLIB -	default y -	help -	  Enable tracing events for gpio subsystem -  config GCOV_PROFILE_FTRACE  	bool "Enable GCOV profiling on ftrace subsystem"  	depends on GCOV_KERNEL diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index d64c00afceb5..f92d6ad5e080 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -14,6 +14,8 @@  #include <linux/syscalls.h>  #include <linux/error-injection.h> +#include <asm/tlb.h> +  #include "trace_probe.h"  #include "trace.h" @@ -163,6 +165,10 @@ BPF_CALL_3(bpf_probe_write_user, void *, unsafe_ptr, const void *, src,  	 * access_ok() should prevent writing to non-user memory, but in  	 * some situations (nommu, temporary switch, etc) access_ok() does  	 * not provide enough validation, hence the check on KERNEL_DS. +	 * +	 * nmi_uaccess_okay() ensures the probe is not run in an interim +	 * state, when the task or mm are switched. This is specifically +	 * required to prevent the use of temporary mm.  	 
*/  	if (unlikely(in_interrupt() || @@ -170,6 +176,8 @@ BPF_CALL_3(bpf_probe_write_user, void *, unsafe_ptr, const void *, src,  		return -EPERM;  	if (unlikely(uaccess_kernel()))  		return -EPERM; +	if (unlikely(!nmi_uaccess_okay())) +		return -EPERM;  	if (!access_ok(unsafe_ptr, size))  		return -EPERM; @@ -569,6 +577,12 @@ tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)  		return &bpf_map_update_elem_proto;  	case BPF_FUNC_map_delete_elem:  		return &bpf_map_delete_elem_proto; +	case BPF_FUNC_map_push_elem: +		return &bpf_map_push_elem_proto; +	case BPF_FUNC_map_pop_elem: +		return &bpf_map_pop_elem_proto; +	case BPF_FUNC_map_peek_elem: +		return &bpf_map_peek_elem_proto;  	case BPF_FUNC_probe_read:  		return &bpf_probe_read_proto;  	case BPF_FUNC_ktime_get_ns: @@ -909,6 +923,27 @@ const struct bpf_verifier_ops raw_tracepoint_verifier_ops = {  const struct bpf_prog_ops raw_tracepoint_prog_ops = {  }; +static bool raw_tp_writable_prog_is_valid_access(int off, int size, +						 enum bpf_access_type type, +						 const struct bpf_prog *prog, +						 struct bpf_insn_access_aux *info) +{ +	if (off == 0) { +		if (size != sizeof(u64) || type != BPF_READ) +			return false; +		info->reg_type = PTR_TO_TP_BUFFER; +	} +	return raw_tp_prog_is_valid_access(off, size, type, prog, info); +} + +const struct bpf_verifier_ops raw_tracepoint_writable_verifier_ops = { +	.get_func_proto  = raw_tp_prog_func_proto, +	.is_valid_access = raw_tp_writable_prog_is_valid_access, +}; + +const struct bpf_prog_ops raw_tracepoint_writable_prog_ops = { +}; +  static bool pe_prog_is_valid_access(int off, int size, enum bpf_access_type type,  				    const struct bpf_prog *prog,  				    struct bpf_insn_access_aux *info) @@ -1198,6 +1233,9 @@ static int __bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog *  	if (prog->aux->max_ctx_offset > btp->num_args * sizeof(u64))  		return -EINVAL; +	if (prog->aux->max_tp_access > btp->writable_size) +		return -EINVAL; +  	return tracepoint_probe_register(tp, (void *)btp->bpf_func, prog);  } @@ -1259,7 +1297,8 @@ int bpf_get_perf_event_info(const struct perf_event *event, u32 *prog_id,  }  #ifdef CONFIG_MODULES -int bpf_event_notify(struct notifier_block *nb, unsigned long op, void *module) +static int bpf_event_notify(struct notifier_block *nb, unsigned long op, +			    void *module)  {  	struct bpf_trace_module *btm, *tmp;  	struct module *mod = module; @@ -1298,7 +1337,7 @@ static struct notifier_block bpf_module_nb = {  	.notifier_call = bpf_event_notify,  }; -int __init bpf_event_init(void) +static int __init bpf_event_init(void)  {  	register_module_notifier(&bpf_module_nb);  	return 0; diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index fa79323331b2..a12aff849c04 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -33,6 +33,7 @@  #include <linux/list.h>  #include <linux/hash.h>  #include <linux/rcupdate.h> +#include <linux/kprobes.h>  #include <trace/events/sched.h> @@ -69,12 +70,8 @@  #define INIT_OPS_HASH(opsname)	\  	.func_hash		= &opsname.local_hash,			\  	.local_hash.regex_lock	= __MUTEX_INITIALIZER(opsname.local_hash.regex_lock), -#define ASSIGN_OPS_HASH(opsname, val) \ -	.func_hash		= val, \ -	.local_hash.regex_lock	= __MUTEX_INITIALIZER(opsname.local_hash.regex_lock),  #else  #define INIT_OPS_HASH(opsname) -#define ASSIGN_OPS_HASH(opsname, val)  #endif  enum { @@ -1992,7 +1989,7 @@ static void print_bug_type(void)   * modifying the code. 
@failed should be one of either:   * EFAULT - if the problem happens on reading the @ip address   * EINVAL - if what is read at @ip is not what was expected - * EPERM - if the problem happens on writting to the @ip address + * EPERM - if the problem happens on writing to the @ip address   */  void ftrace_bug(int failed, struct dyn_ftrace *rec)  { @@ -2391,7 +2388,7 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable)  		return ftrace_modify_call(rec, ftrace_old_addr, ftrace_addr);  	} -	return -1; /* unknow ftrace bug */ +	return -1; /* unknown ftrace bug */  }  void __weak ftrace_replace_code(int mod_flags) @@ -3004,7 +3001,7 @@ ftrace_allocate_pages(unsigned long num_to_init)  	int cnt;  	if (!num_to_init) -		return 0; +		return NULL;  	start_pg = pg = kzalloc(sizeof(*pg), GFP_KERNEL);  	if (!pg) @@ -3879,7 +3876,7 @@ static int ftrace_hash_move_and_update_ops(struct ftrace_ops *ops,  static bool module_exists(const char *module)  {  	/* All modules have the symbol __this_module */ -	const char this_mod[] = "__this_module"; +	static const char this_mod[] = "__this_module";  	char modname[MAX_PARAM_PREFIX_LEN + sizeof(this_mod) + 2];  	unsigned long val;  	int n; @@ -4755,7 +4752,7 @@ static int  ftrace_set_addr(struct ftrace_ops *ops, unsigned long ip, int remove,  		int reset, int enable)  { -	return ftrace_set_hash(ops, 0, 0, ip, remove, reset, enable); +	return ftrace_set_hash(ops, NULL, 0, ip, remove, reset, enable);  }  /** @@ -5463,7 +5460,7 @@ void ftrace_create_filter_files(struct ftrace_ops *ops,  /*   * The name "destroy_filter_files" is really a misnomer. Although - * in the future, it may actualy delete the files, but this is + * in the future, it may actually delete the files, but this is   * really intended to make sure the ops passed in are disabled   * and that when this function returns, the caller is free to   * free the ops. @@ -5786,7 +5783,7 @@ void ftrace_module_enable(struct module *mod)  	/*  	 * If the tracing is enabled, go ahead and enable the record.  	 * -	 * The reason not to enable the record immediatelly is the +	 * The reason not to enable the record immediately is the  	 * inherent check of ftrace_make_nop/ftrace_make_call for  	 * correct previous instructions.  
Making first the NOP  	 * conversion puts the module to the correct state, thus @@ -6246,7 +6243,7 @@ void ftrace_reset_array_ops(struct trace_array *tr)  	tr->ops->func = ftrace_stub;  } -static inline void +static nokprobe_inline void  __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,  		       struct ftrace_ops *ignored, struct pt_regs *regs)  { @@ -6264,6 +6261,9 @@ __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,  	preempt_disable_notrace();  	do_for_each_ftrace_op(op, ftrace_ops_list) { +		/* Stub functions don't need to be called nor tested */ +		if (op->flags & FTRACE_OPS_FL_STUB) +			continue;  		/*  		 * Check the following for each ops before calling their func:  		 *  if RCU flag is set, then rcu_is_watching() must be true @@ -6306,11 +6306,13 @@ static void ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,  {  	__ftrace_ops_list_func(ip, parent_ip, NULL, regs);  } +NOKPROBE_SYMBOL(ftrace_ops_list_func);  #else  static void ftrace_ops_no_ops(unsigned long ip, unsigned long parent_ip)  {  	__ftrace_ops_list_func(ip, parent_ip, NULL, NULL);  } +NOKPROBE_SYMBOL(ftrace_ops_no_ops);  #endif  /* @@ -6337,6 +6339,7 @@ static void ftrace_ops_assist_func(unsigned long ip, unsigned long parent_ip,  	preempt_enable_notrace();  	trace_clear_recursion(bit);  } +NOKPROBE_SYMBOL(ftrace_ops_assist_func);  /**   * ftrace_ops_get_func - get the function a trampoline should call diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 41b6f96e5366..05b0b3139ebc 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -762,7 +762,7 @@ u64 ring_buffer_time_stamp(struct ring_buffer *buffer, int cpu)  	preempt_disable_notrace();  	time = rb_time_stamp(buffer); -	preempt_enable_no_resched_notrace(); +	preempt_enable_notrace();  	return time;  } @@ -4979,7 +4979,7 @@ static __init int rb_write_something(struct rb_test_data *data, bool nested)  	cnt = data->cnt + (nested ? 27 : 0);  	/* Multiply cnt by ~e, to make some unique increment */ -	size = (data->cnt * 68 / 25) % (sizeof(rb_string) - 1); +	size = (cnt * 68 / 25) % (sizeof(rb_string) - 1);  	len = size + sizeof(struct rb_item); diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c index ffba6789c0e2..0564f6db0561 100644 --- a/kernel/trace/ring_buffer_benchmark.c +++ b/kernel/trace/ring_buffer_benchmark.c @@ -362,7 +362,7 @@ static void ring_buffer_producer(void)  			hit--; /* make it non zero */  		} -		/* Caculate the average time in nanosecs */ +		/* Calculate the average time in nanosecs */  		avg = NSEC_PER_MSEC / (hit + missed);  		trace_printk("%ld ns per entry\n", avg);  	} diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 21153e64bf1c..1c80521fd436 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -159,6 +159,8 @@ static union trace_eval_map_item *trace_eval_maps;  #endif /* CONFIG_TRACE_EVAL_MAP_FILE */  static int tracing_set_tracer(struct trace_array *tr, const char *buf); +static void ftrace_trace_userstack(struct ring_buffer *buffer, +				   unsigned long flags, int pc);  #define MAX_TRACER_SIZE		100  static char bootup_tracer_buf[MAX_TRACER_SIZE] __initdata; @@ -496,8 +498,10 @@ int trace_pid_write(struct trace_pid_list *filtered_pids,  	 * not modified.  	 
*/  	pid_list = kmalloc(sizeof(*pid_list), GFP_KERNEL); -	if (!pid_list) +	if (!pid_list) { +		trace_parser_put(&parser);  		return -ENOMEM; +	}  	pid_list->pid_max = READ_ONCE(pid_max); @@ -507,6 +511,7 @@ int trace_pid_write(struct trace_pid_list *filtered_pids,  	pid_list->pids = vzalloc((pid_list->pid_max + 7) >> 3);  	if (!pid_list->pids) { +		trace_parser_put(&parser);  		kfree(pid_list);  		return -ENOMEM;  	} @@ -1722,6 +1727,10 @@ static __init int init_trace_selftests(void)  	pr_info("Running postponed tracer tests:\n");  	list_for_each_entry_safe(p, n, &postponed_selftests, list) { +		/* This loop can take minutes when sanitizers are enabled, so +		 * lets make sure we allow RCU processing. +		 */ +		cond_resched();  		ret = run_tracer_selftest(p->type);  		/* If the test fails, then warn and remove from available_tracers */  		if (ret < 0) { @@ -2749,12 +2758,21 @@ trace_function(struct trace_array *tr,  #ifdef CONFIG_STACKTRACE -#define FTRACE_STACK_MAX_ENTRIES (PAGE_SIZE / sizeof(unsigned long)) +/* Allow 4 levels of nesting: normal, softirq, irq, NMI */ +#define FTRACE_KSTACK_NESTING	4 + +#define FTRACE_KSTACK_ENTRIES	(PAGE_SIZE / FTRACE_KSTACK_NESTING) +  struct ftrace_stack { -	unsigned long		calls[FTRACE_STACK_MAX_ENTRIES]; +	unsigned long		calls[FTRACE_KSTACK_ENTRIES]; +}; + + +struct ftrace_stacks { +	struct ftrace_stack	stacks[FTRACE_KSTACK_NESTING];  }; -static DEFINE_PER_CPU(struct ftrace_stack, ftrace_stack); +static DEFINE_PER_CPU(struct ftrace_stacks, ftrace_stacks);  static DEFINE_PER_CPU(int, ftrace_stack_reserve);  static void __ftrace_trace_stack(struct ring_buffer *buffer, @@ -2763,13 +2781,10 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer,  {  	struct trace_event_call *call = &event_kernel_stack;  	struct ring_buffer_event *event; +	unsigned int size, nr_entries; +	struct ftrace_stack *fstack;  	struct stack_entry *entry; -	struct stack_trace trace; -	int use_stack; -	int size = FTRACE_STACK_ENTRIES; - -	trace.nr_entries	= 0; -	trace.skip		= skip; +	int stackidx;  	/*  	 * Add one, for this function and the call to save_stack_trace() @@ -2777,7 +2792,7 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer,  	 */  #ifndef CONFIG_UNWINDER_ORC  	if (!regs) -		trace.skip++; +		skip++;  #endif  	/* @@ -2788,53 +2803,40 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer,  	 */  	preempt_disable_notrace(); -	use_stack = __this_cpu_inc_return(ftrace_stack_reserve); +	stackidx = __this_cpu_inc_return(ftrace_stack_reserve) - 1; + +	/* This should never happen. If it does, yell once and skip */ +	if (WARN_ON_ONCE(stackidx > FTRACE_KSTACK_NESTING)) +		goto out; +  	/* -	 * We don't need any atomic variables, just a barrier. -	 * If an interrupt comes in, we don't care, because it would -	 * have exited and put the counter back to what we want. -	 * We just need a barrier to keep gcc from moving things -	 * around. +	 * The above __this_cpu_inc_return() is 'atomic' cpu local. An +	 * interrupt will either see the value pre increment or post +	 * increment. If the interrupt happens pre increment it will have +	 * restored the counter when it returns.  We just need a barrier to +	 * keep gcc from moving things around.  	 
*/  	barrier(); -	if (use_stack == 1) { -		trace.entries		= this_cpu_ptr(ftrace_stack.calls); -		trace.max_entries	= FTRACE_STACK_MAX_ENTRIES; -		if (regs) -			save_stack_trace_regs(regs, &trace); -		else -			save_stack_trace(&trace); - -		if (trace.nr_entries > size) -			size = trace.nr_entries; -	} else -		/* From now on, use_stack is a boolean */ -		use_stack = 0; +	fstack = this_cpu_ptr(ftrace_stacks.stacks) + stackidx; +	size = ARRAY_SIZE(fstack->calls); -	size *= sizeof(unsigned long); +	if (regs) { +		nr_entries = stack_trace_save_regs(regs, fstack->calls, +						   size, skip); +	} else { +		nr_entries = stack_trace_save(fstack->calls, size, skip); +	} +	size = nr_entries * sizeof(unsigned long);  	event = __trace_buffer_lock_reserve(buffer, TRACE_STACK,  					    sizeof(*entry) + size, flags, pc);  	if (!event)  		goto out;  	entry = ring_buffer_event_data(event); -	memset(&entry->caller, 0, size); - -	if (use_stack) -		memcpy(&entry->caller, trace.entries, -		       trace.nr_entries * sizeof(unsigned long)); -	else { -		trace.max_entries	= FTRACE_STACK_ENTRIES; -		trace.entries		= entry->caller; -		if (regs) -			save_stack_trace_regs(regs, &trace); -		else -			save_stack_trace(&trace); -	} - -	entry->size = trace.nr_entries; +	memcpy(&entry->caller, fstack->calls, size); +	entry->size = nr_entries;  	if (!call_filter_check_discard(call, entry, buffer, event))  		__buffer_unlock_commit(buffer, event); @@ -2904,15 +2906,15 @@ void trace_dump_stack(int skip)  }  EXPORT_SYMBOL_GPL(trace_dump_stack); +#ifdef CONFIG_USER_STACKTRACE_SUPPORT  static DEFINE_PER_CPU(int, user_stack_count); -void +static void  ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc)  {  	struct trace_event_call *call = &event_user_stack;  	struct ring_buffer_event *event;  	struct userstack_entry *entry; -	struct stack_trace trace;  	if (!(global_trace.trace_flags & TRACE_ITER_USERSTACKTRACE))  		return; @@ -2943,12 +2945,7 @@ ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc)  	entry->tgid		= current->tgid;  	memset(&entry->caller, 0, sizeof(entry->caller)); -	trace.nr_entries	= 0; -	trace.max_entries	= FTRACE_STACK_ENTRIES; -	trace.skip		= 0; -	trace.entries		= entry->caller; - -	save_stack_trace_user(&trace); +	stack_trace_save_user(entry->caller, FTRACE_STACK_ENTRIES);  	if (!call_filter_check_discard(call, entry, buffer, event))  		__buffer_unlock_commit(buffer, event); @@ -2957,13 +2954,12 @@ ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc)   out:  	preempt_enable();  } - -#ifdef UNUSED -static void __trace_userstack(struct trace_array *tr, unsigned long flags) +#else /* CONFIG_USER_STACKTRACE_SUPPORT */ +static void ftrace_trace_userstack(struct ring_buffer *buffer, +				   unsigned long flags, int pc)  { -	ftrace_trace_userstack(tr, flags, preempt_count());  } -#endif /* UNUSED */ +#endif /* !CONFIG_USER_STACKTRACE_SUPPORT */  #endif /* CONFIG_STACKTRACE */ @@ -3053,6 +3049,7 @@ void trace_printk_init_buffers(void)  	if (global_trace.trace_buffer.buffer)  		tracing_start_cmdline_record();  } +EXPORT_SYMBOL_GPL(trace_printk_init_buffers);  void trace_printk_start_comm(void)  { @@ -3213,6 +3210,7 @@ int trace_array_printk(struct trace_array *tr,  	va_end(ap);  	return ret;  } +EXPORT_SYMBOL_GPL(trace_array_printk);  __printf(3, 4)  int trace_array_printk_buf(struct ring_buffer *buffer, @@ -3491,33 +3489,68 @@ static void s_stop(struct seq_file *m, void *p)  }  static void +get_total_entries_cpu(struct trace_buffer *buf, 
unsigned long *total, +		      unsigned long *entries, int cpu) +{ +	unsigned long count; + +	count = ring_buffer_entries_cpu(buf->buffer, cpu); +	/* +	 * If this buffer has skipped entries, then we hold all +	 * entries for the trace and we need to ignore the +	 * ones before the time stamp. +	 */ +	if (per_cpu_ptr(buf->data, cpu)->skipped_entries) { +		count -= per_cpu_ptr(buf->data, cpu)->skipped_entries; +		/* total is the same as the entries */ +		*total = count; +	} else +		*total = count + +			ring_buffer_overrun_cpu(buf->buffer, cpu); +	*entries = count; +} + +static void  get_total_entries(struct trace_buffer *buf,  		  unsigned long *total, unsigned long *entries)  { -	unsigned long count; +	unsigned long t, e;  	int cpu;  	*total = 0;  	*entries = 0;  	for_each_tracing_cpu(cpu) { -		count = ring_buffer_entries_cpu(buf->buffer, cpu); -		/* -		 * If this buffer has skipped entries, then we hold all -		 * entries for the trace and we need to ignore the -		 * ones before the time stamp. -		 */ -		if (per_cpu_ptr(buf->data, cpu)->skipped_entries) { -			count -= per_cpu_ptr(buf->data, cpu)->skipped_entries; -			/* total is the same as the entries */ -			*total += count; -		} else -			*total += count + -				ring_buffer_overrun_cpu(buf->buffer, cpu); -		*entries += count; +		get_total_entries_cpu(buf, &t, &e, cpu); +		*total += t; +		*entries += e;  	}  } +unsigned long trace_total_entries_cpu(struct trace_array *tr, int cpu) +{ +	unsigned long total, entries; + +	if (!tr) +		tr = &global_trace; + +	get_total_entries_cpu(&tr->trace_buffer, &total, &entries, cpu); + +	return entries; +} + +unsigned long trace_total_entries(struct trace_array *tr) +{ +	unsigned long total, entries; + +	if (!tr) +		tr = &global_trace; + +	get_total_entries(&tr->trace_buffer, &total, &entries); + +	return entries; +} +  static void print_lat_help_header(struct seq_file *m)  {  	seq_puts(m, "#                  _------=> CPU#            \n" @@ -3556,25 +3589,18 @@ static void print_func_help_header_irq(struct trace_buffer *buf, struct seq_file  				       unsigned int flags)  {  	bool tgid = flags & TRACE_ITER_RECORD_TGID; -	const char tgid_space[] = "          "; -	const char space[] = "  "; +	const char *space = "          "; +	int prec = tgid ? 10 : 2;  	print_event_info(buf, m); -	seq_printf(m, "#                          %s  _-----=> irqs-off\n", -		   tgid ? tgid_space : space); -	seq_printf(m, "#                          %s / _----=> need-resched\n", -		   tgid ? tgid_space : space); -	seq_printf(m, "#                          %s| / _---=> hardirq/softirq\n", -		   tgid ? tgid_space : space); -	seq_printf(m, "#                          %s|| / _--=> preempt-depth\n", -		   tgid ? tgid_space : space); -	seq_printf(m, "#                          %s||| /     delay\n", -		   tgid ? tgid_space : space); -	seq_printf(m, "#           TASK-PID %sCPU#  ||||    TIMESTAMP  FUNCTION\n", -		   tgid ? "   TGID   " : space); -	seq_printf(m, "#              | |   %s  |   ||||       |         |\n", -		   tgid ? 
"     |    " : space); +	seq_printf(m, "#                          %.*s  _-----=> irqs-off\n", prec, space); +	seq_printf(m, "#                          %.*s / _----=> need-resched\n", prec, space); +	seq_printf(m, "#                          %.*s| / _---=> hardirq/softirq\n", prec, space); +	seq_printf(m, "#                          %.*s|| / _--=> preempt-depth\n", prec, space); +	seq_printf(m, "#                          %.*s||| /     delay\n", prec, space); +	seq_printf(m, "#           TASK-PID %.*sCPU#  ||||    TIMESTAMP  FUNCTION\n", prec, "   TGID   "); +	seq_printf(m, "#              | |   %.*s  |   ||||       |         |\n", prec, "     |    ");  }  void @@ -4700,6 +4726,7 @@ static const char readme_msg[] =  	"  trace_pipe\t\t- A consuming read to see the contents of the buffer\n"  	"  current_tracer\t- function and latency tracers\n"  	"  available_tracers\t- list of configured tracers for current_tracer\n" +	"  error_log\t- error log for failed commands (that support it)\n"  	"  buffer_size_kb\t- view and modify size of per cpu buffer\n"  	"  buffer_total_size_kb  - view total size of all cpu buffers\n\n"  	"  trace_clock\t\t-change the clock used to order events\n" @@ -4720,7 +4747,7 @@ static const char readme_msg[] =  	"  instances\t\t- Make sub-buffers with: mkdir instances/foo\n"  	"\t\t\t  Remove sub-buffer with rmdir\n"  	"  trace_options\t\t- Set format or modify how tracing happens\n" -	"\t\t\t  Disable an option by adding a suffix 'no' to the\n" +	"\t\t\t  Disable an option by prefixing 'no' to the\n"  	"\t\t\t  option name\n"  	"  saved_cmdlines_size\t- echo command number in here to store comm-pid list\n"  #ifdef CONFIG_DYNAMIC_FTRACE @@ -6304,13 +6331,13 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,  	struct ring_buffer *buffer;  	struct print_entry *entry;  	unsigned long irq_flags; -	const char faulted[] = "<faulted>";  	ssize_t written;  	int size;  	int len;  /* Used in tracing_mark_raw_write() as well */ -#define FAULTED_SIZE (sizeof(faulted) - 1) /* '\0' is already accounted for */ +#define FAULTED_STR "<faulted>" +#define FAULTED_SIZE (sizeof(FAULTED_STR) - 1) /* '\0' is already accounted for */  	if (tracing_disabled)  		return -EINVAL; @@ -6342,7 +6369,7 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,  	len = __copy_from_user_inatomic(&entry->buf, ubuf, cnt);  	if (len) { -		memcpy(&entry->buf, faulted, FAULTED_SIZE); +		memcpy(&entry->buf, FAULTED_STR, FAULTED_SIZE);  		cnt = FAULTED_SIZE;  		written = -EFAULT;  	} else @@ -6383,7 +6410,6 @@ tracing_mark_raw_write(struct file *filp, const char __user *ubuf,  	struct ring_buffer_event *event;  	struct ring_buffer *buffer;  	struct raw_data_entry *entry; -	const char faulted[] = "<faulted>";  	unsigned long irq_flags;  	ssize_t written;  	int size; @@ -6423,7 +6449,7 @@ tracing_mark_raw_write(struct file *filp, const char __user *ubuf,  	len = __copy_from_user_inatomic(&entry->id, ubuf, cnt);  	if (len) {  		entry->id = -1; -		memcpy(&entry->buf, faulted, FAULTED_SIZE); +		memcpy(&entry->buf, FAULTED_STR, FAULTED_SIZE);  		written = -EFAULT;  	} else  		written = cnt; @@ -6876,6 +6902,238 @@ static const struct file_operations snapshot_raw_fops = {  #endif /* CONFIG_TRACER_SNAPSHOT */ +#define TRACING_LOG_ERRS_MAX	8 +#define TRACING_LOG_LOC_MAX	128 + +#define CMD_PREFIX "  Command: " + +struct err_info { +	const char	**errs;	/* ptr to loc-specific array of err strings */ +	u8		type;	/* index into errs -> specific err string */ +	u8		pos;	/* MAX_FILTER_STR_VAL = 256 */ +	
u64		ts; +}; + +struct tracing_log_err { +	struct list_head	list; +	struct err_info		info; +	char			loc[TRACING_LOG_LOC_MAX]; /* err location */ +	char			cmd[MAX_FILTER_STR_VAL]; /* what caused err */ +}; + +static DEFINE_MUTEX(tracing_err_log_lock); + +struct tracing_log_err *get_tracing_log_err(struct trace_array *tr) +{ +	struct tracing_log_err *err; + +	if (tr->n_err_log_entries < TRACING_LOG_ERRS_MAX) { +		err = kzalloc(sizeof(*err), GFP_KERNEL); +		if (!err) +			err = ERR_PTR(-ENOMEM); +		tr->n_err_log_entries++; + +		return err; +	} + +	err = list_first_entry(&tr->err_log, struct tracing_log_err, list); +	list_del(&err->list); + +	return err; +} + +/** + * err_pos - find the position of a string within a command for error careting + * @cmd: The tracing command that caused the error + * @str: The string to position the caret at within @cmd + * + * Finds the position of the first occurrence of @str within @cmd.  The + * return value can be passed to tracing_log_err() for caret placement + * within @cmd. + * + * Returns the index within @cmd of the first occurrence of @str or 0 + * if @str was not found. + */ +unsigned int err_pos(char *cmd, const char *str) +{ +	char *found; + +	if (WARN_ON(!strlen(cmd))) +		return 0; + +	found = strstr(cmd, str); +	if (found) +		return found - cmd; + +	return 0; +} + +/** + * tracing_log_err - write an error to the tracing error log + * @tr: The associated trace array for the error (NULL for top level array) + * @loc: A string describing where the error occurred + * @cmd: The tracing command that caused the error + * @errs: The array of loc-specific static error strings + * @type: The index into errs[], which produces the specific static err string + * @pos: The position the caret should be placed in the cmd + * + * Writes an error into tracing/error_log of the form: + * + * <loc>: error: <text> + *   Command: <cmd> + *              ^ + * + * tracing/error_log is a small log file containing the last + * TRACING_LOG_ERRS_MAX errors (8).  Memory for errors isn't allocated + * unless there has been a tracing error, and the error log can be + * cleared and have its memory freed by writing the empty string in + * truncation mode to it i.e. echo > tracing/error_log. + * + * NOTE: the @errs array along with the @type param are used to + * produce a static error string - this string is not copied and saved + * when the error is logged - only a pointer to it is saved.  See + * existing callers for examples of how static strings are typically + * defined for use with tracing_log_err().
+ */ +void tracing_log_err(struct trace_array *tr, +		     const char *loc, const char *cmd, +		     const char **errs, u8 type, u8 pos) +{ +	struct tracing_log_err *err; + +	if (!tr) +		tr = &global_trace; + +	mutex_lock(&tracing_err_log_lock); +	err = get_tracing_log_err(tr); +	if (PTR_ERR(err) == -ENOMEM) { +		mutex_unlock(&tracing_err_log_lock); +		return; +	} + +	snprintf(err->loc, TRACING_LOG_LOC_MAX, "%s: error: ", loc); +	snprintf(err->cmd, MAX_FILTER_STR_VAL,"\n" CMD_PREFIX "%s\n", cmd); + +	err->info.errs = errs; +	err->info.type = type; +	err->info.pos = pos; +	err->info.ts = local_clock(); + +	list_add_tail(&err->list, &tr->err_log); +	mutex_unlock(&tracing_err_log_lock); +} + +static void clear_tracing_err_log(struct trace_array *tr) +{ +	struct tracing_log_err *err, *next; + +	mutex_lock(&tracing_err_log_lock); +	list_for_each_entry_safe(err, next, &tr->err_log, list) { +		list_del(&err->list); +		kfree(err); +	} + +	tr->n_err_log_entries = 0; +	mutex_unlock(&tracing_err_log_lock); +} + +static void *tracing_err_log_seq_start(struct seq_file *m, loff_t *pos) +{ +	struct trace_array *tr = m->private; + +	mutex_lock(&tracing_err_log_lock); + +	return seq_list_start(&tr->err_log, *pos); +} + +static void *tracing_err_log_seq_next(struct seq_file *m, void *v, loff_t *pos) +{ +	struct trace_array *tr = m->private; + +	return seq_list_next(v, &tr->err_log, pos); +} + +static void tracing_err_log_seq_stop(struct seq_file *m, void *v) +{ +	mutex_unlock(&tracing_err_log_lock); +} + +static void tracing_err_log_show_pos(struct seq_file *m, u8 pos) +{ +	u8 i; + +	for (i = 0; i < sizeof(CMD_PREFIX) - 1; i++) +		seq_putc(m, ' '); +	for (i = 0; i < pos; i++) +		seq_putc(m, ' '); +	seq_puts(m, "^\n"); +} + +static int tracing_err_log_seq_show(struct seq_file *m, void *v) +{ +	struct tracing_log_err *err = v; + +	if (err) { +		const char *err_text = err->info.errs[err->info.type]; +		u64 sec = err->info.ts; +		u32 nsec; + +		nsec = do_div(sec, NSEC_PER_SEC); +		seq_printf(m, "[%5llu.%06u] %s%s", sec, nsec / 1000, +			   err->loc, err_text); +		seq_printf(m, "%s", err->cmd); +		tracing_err_log_show_pos(m, err->info.pos); +	} + +	return 0; +} + +static const struct seq_operations tracing_err_log_seq_ops = { +	.start  = tracing_err_log_seq_start, +	.next   = tracing_err_log_seq_next, +	.stop   = tracing_err_log_seq_stop, +	.show   = tracing_err_log_seq_show +}; + +static int tracing_err_log_open(struct inode *inode, struct file *file) +{ +	struct trace_array *tr = inode->i_private; +	int ret = 0; + +	if (trace_array_get(tr) < 0) +		return -ENODEV; + +	/* If this file was opened for write, then erase contents */ +	if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) +		clear_tracing_err_log(tr); + +	if (file->f_mode & FMODE_READ) { +		ret = seq_open(file, &tracing_err_log_seq_ops); +		if (!ret) { +			struct seq_file *m = file->private_data; +			m->private = tr; +		} else { +			trace_array_put(tr); +		} +	} +	return ret; +} + +static ssize_t tracing_err_log_write(struct file *file, +				     const char __user *buffer, +				     size_t count, loff_t *ppos) +{ +	return count; +} + +static const struct file_operations tracing_err_log_fops = { +	.open           = tracing_err_log_open, +	.write		= tracing_err_log_write, +	.read           = seq_read, +	.llseek         = seq_lseek, +	.release	= tracing_release_generic_tr, +}; +  static int tracing_buffers_open(struct inode *inode, struct file *filp)  {  	struct trace_array *tr = inode->i_private; @@ -7025,35 +7283,43 @@ struct buffer_ref {  	
struct ring_buffer	*buffer;  	void			*page;  	int			cpu; -	int			ref; +	refcount_t		refcount;  }; +static void buffer_ref_release(struct buffer_ref *ref) +{ +	if (!refcount_dec_and_test(&ref->refcount)) +		return; +	ring_buffer_free_read_page(ref->buffer, ref->cpu, ref->page); +	kfree(ref); +} +  static void buffer_pipe_buf_release(struct pipe_inode_info *pipe,  				    struct pipe_buffer *buf)  {  	struct buffer_ref *ref = (struct buffer_ref *)buf->private; -	if (--ref->ref) -		return; - -	ring_buffer_free_read_page(ref->buffer, ref->cpu, ref->page); -	kfree(ref); +	buffer_ref_release(ref);  	buf->private = 0;  } -static void buffer_pipe_buf_get(struct pipe_inode_info *pipe, +static bool buffer_pipe_buf_get(struct pipe_inode_info *pipe,  				struct pipe_buffer *buf)  {  	struct buffer_ref *ref = (struct buffer_ref *)buf->private; -	ref->ref++; +	if (refcount_read(&ref->refcount) > INT_MAX/2) +		return false; + +	refcount_inc(&ref->refcount); +	return true;  }  /* Pipe buffer operations for a buffer. */  static const struct pipe_buf_operations buffer_pipe_buf_ops = {  	.confirm		= generic_pipe_buf_confirm,  	.release		= buffer_pipe_buf_release, -	.steal			= generic_pipe_buf_steal, +	.steal			= generic_pipe_buf_nosteal,  	.get			= buffer_pipe_buf_get,  }; @@ -7066,11 +7332,7 @@ static void buffer_spd_release(struct splice_pipe_desc *spd, unsigned int i)  	struct buffer_ref *ref =  		(struct buffer_ref *)spd->partial[i].private; -	if (--ref->ref) -		return; - -	ring_buffer_free_read_page(ref->buffer, ref->cpu, ref->page); -	kfree(ref); +	buffer_ref_release(ref);  	spd->partial[i].private = 0;  } @@ -7125,7 +7387,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,  			break;  		} -		ref->ref = 1; +		refcount_set(&ref->refcount, 1);  		ref->buffer = iter->trace_buffer->buffer;  		ref->page = ring_buffer_alloc_read_page(ref->buffer, iter->cpu_file);  		if (IS_ERR(ref->page)) { @@ -8037,7 +8299,7 @@ static void update_tracer_options(struct trace_array *tr)  	mutex_unlock(&trace_types_lock);  } -static int instance_mkdir(const char *name) +struct trace_array *trace_array_create(const char *name)  {  	struct trace_array *tr;  	int ret; @@ -8076,6 +8338,7 @@ static int instance_mkdir(const char *name)  	INIT_LIST_HEAD(&tr->systems);  	INIT_LIST_HEAD(&tr->events);  	INIT_LIST_HEAD(&tr->hist_vars); +	INIT_LIST_HEAD(&tr->err_log);  	if (allocate_trace_buffers(tr, trace_buf_size) < 0)  		goto out_free_tr; @@ -8101,7 +8364,7 @@ static int instance_mkdir(const char *name)  	mutex_unlock(&trace_types_lock);  	mutex_unlock(&event_mutex); -	return 0; +	return tr;   out_free_tr:  	free_trace_buffers(tr); @@ -8113,33 +8376,21 @@ static int instance_mkdir(const char *name)  	mutex_unlock(&trace_types_lock);  	mutex_unlock(&event_mutex); -	return ret; +	return ERR_PTR(ret); +} +EXPORT_SYMBOL_GPL(trace_array_create); +static int instance_mkdir(const char *name) +{ +	return PTR_ERR_OR_ZERO(trace_array_create(name));  } -static int instance_rmdir(const char *name) +static int __remove_instance(struct trace_array *tr)  { -	struct trace_array *tr; -	int found = 0; -	int ret;  	int i; -	mutex_lock(&event_mutex); -	mutex_lock(&trace_types_lock); - -	ret = -ENODEV; -	list_for_each_entry(tr, &ftrace_trace_arrays, list) { -		if (tr->name && strcmp(tr->name, name) == 0) { -			found = 1; -			break; -		} -	} -	if (!found) -		goto out_unlock; - -	ret = -EBUSY;  	if (tr->ref || (tr->current_trace && tr->current_trace->ref)) -		goto out_unlock; +		return -EBUSY;  	list_del(&tr->list); @@ -8165,10 +8416,46 @@ 
static int instance_rmdir(const char *name)  	free_cpumask_var(tr->tracing_cpumask);  	kfree(tr->name);  	kfree(tr); +	tr = NULL; -	ret = 0; +	return 0; +} + +int trace_array_destroy(struct trace_array *tr) +{ +	int ret; + +	if (!tr) +		return -EINVAL; + +	mutex_lock(&event_mutex); +	mutex_lock(&trace_types_lock); + +	ret = __remove_instance(tr); + +	mutex_unlock(&trace_types_lock); +	mutex_unlock(&event_mutex); + +	return ret; +} +EXPORT_SYMBOL_GPL(trace_array_destroy); + +static int instance_rmdir(const char *name) +{ +	struct trace_array *tr; +	int ret; + +	mutex_lock(&event_mutex); +	mutex_lock(&trace_types_lock); + +	ret = -ENODEV; +	list_for_each_entry(tr, &ftrace_trace_arrays, list) { +		if (tr->name && strcmp(tr->name, name) == 0) { +			ret = __remove_instance(tr); +			break; +		} +	} - out_unlock:  	mutex_unlock(&trace_types_lock);  	mutex_unlock(&event_mutex); @@ -8258,6 +8545,9 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer)  			  tr, &snapshot_fops);  #endif +	trace_create_file("error_log", 0644, d_tracer, +			  tr, &tracing_err_log_fops); +  	for_each_tracing_cpu(cpu)  		tracing_init_tracefs_percpu(tr, cpu); @@ -8620,12 +8910,8 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode)  		cnt++; -		/* reset all but tr, trace, and overruns */ -		memset(&iter.seq, 0, -		       sizeof(struct trace_iterator) - -		       offsetof(struct trace_iterator, seq)); +		trace_iterator_reset(&iter);  		iter.iter_flags |= TRACE_FILE_LAT_FMT; -		iter.pos = -1;  		if (trace_find_next_entry_inc(&iter) != NULL) {  			int ret; @@ -8843,6 +9129,7 @@ __init static int tracer_alloc_buffers(void)  	INIT_LIST_HEAD(&global_trace.systems);  	INIT_LIST_HEAD(&global_trace.events);  	INIT_LIST_HEAD(&global_trace.hist_vars); +	INIT_LIST_HEAD(&global_trace.err_log);  	list_add(&global_trace.list, &ftrace_trace_arrays);  	apply_trace_boot_options(); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index d80cee49e0eb..005f08629b8b 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -15,7 +15,6 @@  #include <linux/trace_seq.h>  #include <linux/trace_events.h>  #include <linux/compiler.h> -#include <linux/trace_seq.h>  #include <linux/glob.h>  #ifdef CONFIG_FTRACE_SYSCALLS @@ -293,11 +292,13 @@ struct trace_array {  	int			nr_topts;  	bool			clear_trace;  	int			buffer_percent; +	unsigned int		n_err_log_entries;  	struct tracer		*current_trace;  	unsigned int		trace_flags;  	unsigned char		trace_flags_index[TRACE_FLAGS_MAX_SIZE];  	unsigned int		flags;  	raw_spinlock_t		start_lock; +	struct list_head	err_log;  	struct dentry		*dir;  	struct dentry		*options;  	struct dentry		*percpu_dir; @@ -719,6 +720,9 @@ void trace_init_global_iter(struct trace_iterator *iter);  void tracing_iter_reset(struct trace_iterator *iter, int cpu); +unsigned long trace_total_entries_cpu(struct trace_array *tr, int cpu); +unsigned long trace_total_entries(struct trace_array *tr); +  void trace_function(struct trace_array *tr,  		    unsigned long ip,  		    unsigned long parent_ip, @@ -782,17 +786,9 @@ void update_max_tr_single(struct trace_array *tr,  #endif /* CONFIG_TRACER_MAX_TRACE */  #ifdef CONFIG_STACKTRACE -void ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, -			    int pc); -  void __trace_stack(struct trace_array *tr, unsigned long flags, int skip,  		   int pc);  #else -static inline void ftrace_trace_userstack(struct ring_buffer *buffer, -					  unsigned long flags, int pc) -{ -} -  static inline void __trace_stack(struct trace_array *tr, unsigned long 
flags,  				 int skip, int pc)  { @@ -1553,7 +1549,8 @@ extern int apply_subsystem_event_filter(struct trace_subsystem_dir *dir,  extern void print_subsystem_event_filter(struct event_subsystem *system,  					 struct trace_seq *s);  extern int filter_assign_type(const char *type); -extern int create_event_filter(struct trace_event_call *call, +extern int create_event_filter(struct trace_array *tr, +			       struct trace_event_call *call,  			       char *filter_str, bool set_str,  			       struct event_filter **filterp);  extern void free_event_filter(struct event_filter *filter); @@ -1884,6 +1881,11 @@ extern ssize_t trace_parse_run_command(struct file *file,  		const char __user *buffer, size_t count, loff_t *ppos,  		int (*createfn)(int, char**)); +extern unsigned int err_pos(char *cmd, const char *str); +extern void tracing_log_err(struct trace_array *tr, +			    const char *loc, const char *cmd, +			    const char **errs, u8 type, u8 pos); +  /*   * Normal trace_printk() and friends allocates special buffers   * to do the manipulation, as well as saves the print formats @@ -1964,4 +1966,22 @@ static inline void tracer_hardirqs_off(unsigned long a0, unsigned long a1) { }  extern struct trace_iterator *tracepoint_print_iter; +/* + * Reset the state of the trace_iterator so that it can read consumed data. + * Normally, the trace_iterator is used for reading the data when it is not + * consumed, and must retain state. + */ +static __always_inline void trace_iterator_reset(struct trace_iterator *iter) +{ +	const size_t offset = offsetof(struct trace_iterator, seq); + +	/* +	 * Keep gcc from complaining about overwriting more than just one +	 * member in the structure. +	 */ +	memset((char *)iter + offset, 0, sizeof(struct trace_iterator) - offset); + +	iter->pos = -1; +} +  #endif /* _LINUX_KERNEL_TRACE_H */ diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c index 4ad967453b6f..3ea65cdff30d 100644 --- a/kernel/trace/trace_branch.c +++ b/kernel/trace/trace_branch.c @@ -205,6 +205,8 @@ void trace_likely_condition(struct ftrace_likely_data *f, int val, int expect)  void ftrace_likely_update(struct ftrace_likely_data *f, int val,  			  int expect, int is_constant)  { +	unsigned long flags = user_access_save(); +  	/* A constant is always correct */  	if (is_constant) {  		f->constant++; @@ -223,6 +225,8 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val,  		f->data.correct++;  	else  		f->data.incorrect++; + +	user_access_restore(flags);  }  EXPORT_SYMBOL(ftrace_likely_update); diff --git a/kernel/trace/trace_dynevent.c b/kernel/trace/trace_dynevent.c index dd1f43588d70..fa100ed3b4de 100644 --- a/kernel/trace/trace_dynevent.c +++ b/kernel/trace/trace_dynevent.c @@ -74,7 +74,7 @@ int dyn_event_release(int argc, char **argv, struct dyn_event_operations *type)  static int create_dyn_event(int argc, char **argv)  {  	struct dyn_event_operations *ops; -	int ret; +	int ret = -ENODEV;  	if (argv[0][0] == '-' || argv[0][0] == '!')  		return dyn_event_release(argc, argv, NULL); diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 5b3b0c3c8a47..0ce3db67f556 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -832,6 +832,7 @@ static int ftrace_set_clr_event(struct trace_array *tr, char *buf, int set)  	return ret;  } +EXPORT_SYMBOL_GPL(ftrace_set_clr_event);  /**   * trace_set_clr_event - enable or disable an event @@ -1318,9 +1319,6 @@ event_id_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)  	
char buf[32];  	int len; -	if (*ppos) -		return 0; -  	if (unlikely(!id))  		return -ENODEV; diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 05a66493a164..5079d1db3754 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -66,7 +66,8 @@ static const char * ops[] = { OPS };  	C(INVALID_FILTER,	"Meaningless filter expression"),	\  	C(IP_FIELD_ONLY,	"Only 'ip' field is supported for function trace"), \  	C(INVALID_VALUE,	"Invalid value (did you forget quotes)?"), \ -	C(NO_FILTER,		"No filter found"), +	C(ERRNO,		"Error"),				\ +	C(NO_FILTER,		"No filter found")  #undef C  #define C(a, b)		FILT_ERR_##a @@ -76,7 +77,7 @@ enum { ERRORS };  #undef C  #define C(a, b)		b -static char *err_text[] = { ERRORS }; +static const char *err_text[] = { ERRORS };  /* Called after a '!' character but "!=" and "!~" are not "not"s */  static bool is_not(const char *str) @@ -427,7 +428,7 @@ predicate_parse(const char *str, int nr_parens, int nr_preds,  	op_stack = kmalloc_array(nr_parens, sizeof(*op_stack), GFP_KERNEL);  	if (!op_stack)  		return ERR_PTR(-ENOMEM); -	prog_stack = kmalloc_array(nr_preds, sizeof(*prog_stack), GFP_KERNEL); +	prog_stack = kcalloc(nr_preds, sizeof(*prog_stack), GFP_KERNEL);  	if (!prog_stack) {  		parse_error(pe, -ENOMEM, 0);  		goto out_free; @@ -578,7 +579,11 @@ predicate_parse(const char *str, int nr_parens, int nr_preds,  out_free:  	kfree(op_stack);  	kfree(inverts); -	kfree(prog_stack); +	if (prog_stack) { +		for (i = 0; prog_stack[i].pred; i++) +			kfree(prog_stack[i].pred); +		kfree(prog_stack); +	}  	return ERR_PTR(ret);  } @@ -919,7 +924,8 @@ static void remove_filter_string(struct event_filter *filter)  	filter->filter_string = NULL;  } -static void append_filter_err(struct filter_parse_error *pe, +static void append_filter_err(struct trace_array *tr, +			      struct filter_parse_error *pe,  			      struct event_filter *filter)  {  	struct trace_seq *s; @@ -947,8 +953,14 @@ static void append_filter_err(struct filter_parse_error *pe,  	if (pe->lasterr > 0) {  		trace_seq_printf(s, "\n%*s", pos, "^");  		trace_seq_printf(s, "\nparse_error: %s\n", err_text[pe->lasterr]); +		tracing_log_err(tr, "event filter parse error", +				filter->filter_string, err_text, +				pe->lasterr, pe->lasterr_pos);  	} else {  		trace_seq_printf(s, "\nError: (%d)\n", pe->lasterr); +		tracing_log_err(tr, "event filter parse error", +				filter->filter_string, err_text, +				FILT_ERR_ERRNO, 0);  	}  	trace_seq_putc(s, 0);  	buf = kmemdup_nul(s->buffer, s->seq.len, GFP_KERNEL); @@ -1214,30 +1226,30 @@ static int parse_pred(const char *str, void *data,  		 * (perf doesn't use it) and grab everything.  		 */  		if (strcmp(field->name, "ip") != 0) { -			 parse_error(pe, FILT_ERR_IP_FIELD_ONLY, pos + i); -			 goto err_free; -		 } -		 pred->fn = filter_pred_none; - -		 /* -		  * Quotes are not required, but if they exist then we need -		  * to read them till we hit a matching one. -		  */ -		 if (str[i] == '\'' || str[i] == '"') -			 q = str[i]; -		 else -			 q = 0; - -		 for (i++; str[i]; i++) { -			 if (q && str[i] == q) -				 break; -			 if (!q && (str[i] == ')' || str[i] == '&' || -				    str[i] == '|')) -				 break; -		 } -		 /* Skip quotes */ -		 if (q) -			 s++; +			parse_error(pe, FILT_ERR_IP_FIELD_ONLY, pos + i); +			goto err_free; +		} +		pred->fn = filter_pred_none; + +		/* +		 * Quotes are not required, but if they exist then we need +		 * to read them till we hit a matching one. 
+		 */ +		if (str[i] == '\'' || str[i] == '"') +			q = str[i]; +		else +			q = 0; + +		for (i++; str[i]; i++) { +			if (q && str[i] == q) +				break; +			if (!q && (str[i] == ')' || str[i] == '&' || +				   str[i] == '|')) +				break; +		} +		/* Skip quotes */ +		if (q) +			s++;  		len = i - s;  		if (len >= MAX_FILTER_STR_VAL) {  			parse_error(pe, FILT_ERR_OPERAND_TOO_LONG, pos + i); @@ -1600,7 +1612,7 @@ static int process_system_preds(struct trace_subsystem_dir *dir,  		if (err) {  			filter_disable(file);  			parse_error(pe, FILT_ERR_BAD_SUBSYS_FILTER, 0); -			append_filter_err(pe, filter); +			append_filter_err(tr, pe, filter);  		} else  			event_set_filtered_flag(file); @@ -1712,7 +1724,8 @@ static void create_filter_finish(struct filter_parse_error *pe)   * information if @set_str is %true and the caller is responsible for   * freeing it.   */ -static int create_filter(struct trace_event_call *call, +static int create_filter(struct trace_array *tr, +			 struct trace_event_call *call,  			 char *filter_string, bool set_str,  			 struct event_filter **filterp)  { @@ -1729,17 +1742,18 @@ static int create_filter(struct trace_event_call *call,  	err = process_preds(call, filter_string, *filterp, pe);  	if (err && set_str) -		append_filter_err(pe, *filterp); +		append_filter_err(tr, pe, *filterp);  	create_filter_finish(pe);  	return err;  } -int create_event_filter(struct trace_event_call *call, +int create_event_filter(struct trace_array *tr, +			struct trace_event_call *call,  			char *filter_str, bool set_str,  			struct event_filter **filterp)  { -	return create_filter(call, filter_str, set_str, filterp); +	return create_filter(tr, call, filter_str, set_str, filterp);  }  /** @@ -1766,7 +1780,7 @@ static int create_system_filter(struct trace_subsystem_dir *dir,  			kfree((*filterp)->filter_string);  			(*filterp)->filter_string = NULL;  		} else { -			append_filter_err(pe, *filterp); +			append_filter_err(tr, pe, *filterp);  		}  	}  	create_filter_finish(pe); @@ -1797,7 +1811,7 @@ int apply_event_filter(struct trace_event_file *file, char *filter_string)  		return 0;  	} -	err = create_filter(call, filter_string, true, &filter); +	err = create_filter(file->tr, call, filter_string, true, &filter);  	/*  	 * Always swap the call filter with the new filter @@ -2053,7 +2067,7 @@ int ftrace_profile_set_filter(struct perf_event *event, int event_id,  	if (event->filter)  		goto out_unlock; -	err = create_filter(call, filter_str, false, &filter); +	err = create_filter(NULL, call, filter_str, false, &filter);  	if (err)  		goto free_filter; @@ -2202,8 +2216,8 @@ static __init int ftrace_test_event_filter(void)  		struct test_filter_data_t *d = &test_filter_data[i];  		int err; -		err = create_filter(&event_ftrace_test_filter, d->filter, -				    false, &filter); +		err = create_filter(NULL, &event_ftrace_test_filter, +				    d->filter, false, &filter);  		if (err) {  			printk(KERN_INFO  			       "Failed to get filter for '%s', err %d\n", diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index ca46339f3009..ca6b0dff60c5 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -22,6 +22,57 @@  #define STR_VAR_LEN_MAX		32 /* must be multiple of sizeof(u64) */ +#define ERRORS								\ +	C(NONE,			"No error"),				\ +	C(DUPLICATE_VAR,	"Variable already defined"),		\ +	C(VAR_NOT_UNIQUE,	"Variable name not unique, need to use fully qualified name (subsys.event.var) for variable"), \ +	C(TOO_MANY_VARS,	"Too many variables defined"),	
	\ +	C(MALFORMED_ASSIGNMENT,	"Malformed assignment"),		\ +	C(NAMED_MISMATCH,	"Named hist trigger doesn't match existing named trigger (includes variables)"), \ +	C(TRIGGER_EEXIST,	"Hist trigger already exists"),		\ +	C(TRIGGER_ENOENT_CLEAR,	"Can't clear or continue a nonexistent hist trigger"), \ +	C(SET_CLOCK_FAIL,	"Couldn't set trace_clock"),		\ +	C(BAD_FIELD_MODIFIER,	"Invalid field modifier"),		\ +	C(TOO_MANY_SUBEXPR,	"Too many subexpressions (3 max)"),	\ +	C(TIMESTAMP_MISMATCH,	"Timestamp units in expression don't match"), \ +	C(TOO_MANY_FIELD_VARS,	"Too many field variables defined"),	\ +	C(EVENT_FILE_NOT_FOUND,	"Event file not found"),		\ +	C(HIST_NOT_FOUND,	"Matching event histogram not found"),	\ +	C(HIST_CREATE_FAIL,	"Couldn't create histogram for field"),	\ +	C(SYNTH_VAR_NOT_FOUND,	"Couldn't find synthetic variable"),	\ +	C(SYNTH_EVENT_NOT_FOUND,"Couldn't find synthetic event"),	\ +	C(SYNTH_TYPE_MISMATCH,	"Param type doesn't match synthetic event field type"), \ +	C(SYNTH_COUNT_MISMATCH,	"Param count doesn't match synthetic event field count"), \ +	C(FIELD_VAR_PARSE_FAIL,	"Couldn't parse field variable"),	\ +	C(VAR_CREATE_FIND_FAIL,	"Couldn't create or find variable"),	\ +	C(ONX_NOT_VAR,		"For onmax(x) or onchange(x), x must be a variable"), \ +	C(ONX_VAR_NOT_FOUND,	"Couldn't find onmax or onchange variable"), \ +	C(ONX_VAR_CREATE_FAIL,	"Couldn't create onmax or onchange variable"), \ +	C(FIELD_VAR_CREATE_FAIL,"Couldn't create field variable"),	\ +	C(TOO_MANY_PARAMS,	"Too many action params"),		\ +	C(PARAM_NOT_FOUND,	"Couldn't find param"),			\ +	C(INVALID_PARAM,	"Invalid action param"),		\ +	C(ACTION_NOT_FOUND,	"No action found"),			\ +	C(NO_SAVE_PARAMS,	"No params found for save()"),		\ +	C(TOO_MANY_SAVE_ACTIONS,"Can't have more than one save() action per hist"), \ +	C(ACTION_MISMATCH,	"Handler doesn't support action"),	\ +	C(NO_CLOSING_PAREN,	"No closing paren found"),		\ +	C(SUBSYS_NOT_FOUND,	"Missing subsystem"),			\ +	C(INVALID_SUBSYS_EVENT,	"Invalid subsystem or event name"),	\ +	C(INVALID_REF_KEY,	"Using variable references in keys not supported"), \ +	C(VAR_NOT_FOUND,	"Couldn't find variable"),		\ +	C(FIELD_NOT_FOUND,	"Couldn't find field"), + +#undef C +#define C(a, b)		HIST_ERR_##a + +enum { ERRORS }; + +#undef C +#define C(a, b)		b + +static const char *err_text[] = { ERRORS }; +  struct hist_field;  typedef u64 (*hist_field_fn_t) (struct hist_field *field, @@ -535,62 +586,49 @@ static struct track_data *track_data_alloc(unsigned int key_len,  	return data;  } -static char last_hist_cmd[MAX_FILTER_STR_VAL]; -static char hist_err_str[MAX_FILTER_STR_VAL]; +static char last_cmd[MAX_FILTER_STR_VAL]; +static char last_cmd_loc[MAX_FILTER_STR_VAL]; -static void last_cmd_set(char *str) +static int errpos(char *str)  { -	if (!str) -		return; - -	strncpy(last_hist_cmd, str, MAX_FILTER_STR_VAL - 1); +	return err_pos(last_cmd, str);  } -static void hist_err(char *str, char *var) +static void last_cmd_set(struct trace_event_file *file, char *str)  { -	int maxlen = MAX_FILTER_STR_VAL - 1; +	const char *system = NULL, *name = NULL; +	struct trace_event_call *call;  	if (!str)  		return; -	if (strlen(hist_err_str)) -		return; +	strncpy(last_cmd, str, MAX_FILTER_STR_VAL - 1); -	if (!var) -		var = ""; +	if (file) { +		call = file->event_call; -	if (strlen(hist_err_str) + strlen(str) + strlen(var) > maxlen) -		return; +		system = call->class->system; +		if (system) { +			name = trace_event_name(call); +			if (!name) +				system = NULL; +		} +	} -	strcat(hist_err_str, str); -	
strcat(hist_err_str, var); +	if (system) +		snprintf(last_cmd_loc, MAX_FILTER_STR_VAL, "hist:%s:%s", system, name);  } -static void hist_err_event(char *str, char *system, char *event, char *var) +static void hist_err(struct trace_array *tr, u8 err_type, u8 err_pos)  { -	char err[MAX_FILTER_STR_VAL]; - -	if (system && var) -		snprintf(err, MAX_FILTER_STR_VAL, "%s.%s.%s", system, event, var); -	else if (system) -		snprintf(err, MAX_FILTER_STR_VAL, "%s.%s", system, event); -	else -		strscpy(err, var, MAX_FILTER_STR_VAL); - -	hist_err(str, err); +	tracing_log_err(tr, last_cmd_loc, last_cmd, err_text, +			err_type, err_pos);  }  static void hist_err_clear(void)  { -	hist_err_str[0] = '\0'; -} - -static bool have_hist_err(void) -{ -	if (strlen(hist_err_str)) -		return true; - -	return false; +	last_cmd[0] = '\0'; +	last_cmd_loc[0] = '\0';  }  struct synth_trace_event { @@ -1719,7 +1757,7 @@ static struct trace_event_file *find_var_file(struct trace_array *tr,  		if (find_var_field(var_hist_data, var_name)) {  			if (found) { -				hist_err_event("Variable name not unique, need to use fully qualified name (subsys.event.var) for variable: ", system, event_name, var_name); +				hist_err(tr, HIST_ERR_VAR_NOT_UNIQUE, errpos(var_name));  				return NULL;  			} @@ -1770,7 +1808,8 @@ find_match_var(struct hist_trigger_data *hist_data, char *var_name)  			hist_field = find_file_var(file, var_name);  			if (hist_field) {  				if (found) { -					hist_err_event("Variable name not unique, need to use fully qualified name (subsys.event.var) for variable: ", system, event_name, var_name); +					hist_err(tr, HIST_ERR_VAR_NOT_UNIQUE, +						 errpos(var_name));  					return ERR_PTR(-EINVAL);  				} @@ -1815,6 +1854,9 @@ static u64 hist_field_var_ref(struct hist_field *hist_field,  	struct hist_elt_data *elt_data;  	u64 var_val = 0; +	if (WARN_ON_ONCE(!elt)) +		return var_val; +  	elt_data = elt->private_data;  	var_val = elt_data->var_ref_vals[hist_field->var_ref_idx]; @@ -2002,11 +2044,11 @@ static int parse_action(char *str, struct hist_trigger_attrs *attrs)  		attrs->n_actions++;  		ret = 0;  	} -  	return ret;  } -static int parse_assignment(char *str, struct hist_trigger_attrs *attrs) +static int parse_assignment(struct trace_array *tr, +			    char *str, struct hist_trigger_attrs *attrs)  {  	int ret = 0; @@ -2062,7 +2104,7 @@ static int parse_assignment(char *str, struct hist_trigger_attrs *attrs)  		char *assignment;  		if (attrs->n_assignments == TRACING_MAP_VARS_MAX) { -			hist_err("Too many variables defined: ", str); +			hist_err(tr, HIST_ERR_TOO_MANY_VARS, errpos(str));  			ret = -EINVAL;  			goto out;  		} @@ -2079,7 +2121,8 @@ static int parse_assignment(char *str, struct hist_trigger_attrs *attrs)  	return ret;  } -static struct hist_trigger_attrs *parse_hist_trigger_attrs(char *trigger_str) +static struct hist_trigger_attrs * +parse_hist_trigger_attrs(struct trace_array *tr, char *trigger_str)  {  	struct hist_trigger_attrs *attrs;  	int ret = 0; @@ -2092,7 +2135,7 @@ static struct hist_trigger_attrs *parse_hist_trigger_attrs(char *trigger_str)  		char *str = strsep(&trigger_str, ":");  		if (strchr(str, '=')) { -			ret = parse_assignment(str, attrs); +			ret = parse_assignment(tr, str, attrs);  			if (ret)  				goto free;  		} else if (strcmp(str, "pause") == 0) @@ -2648,6 +2691,7 @@ static struct hist_field *parse_var_ref(struct hist_trigger_data *hist_data,  					char *var_name)  {  	struct hist_field *var_field = NULL, *ref_field = NULL; +	struct trace_array *tr = hist_data->event_file->tr;  	
if (!is_var_ref(var_name))  		return NULL; @@ -2660,8 +2704,7 @@ static struct hist_field *parse_var_ref(struct hist_trigger_data *hist_data,  					   system, event_name);  	if (!ref_field) -		hist_err_event("Couldn't find variable: $", -			       system, event_name, var_name); +		hist_err(tr, HIST_ERR_VAR_NOT_FOUND, errpos(var_name));  	return ref_field;  } @@ -2672,6 +2715,7 @@ parse_field(struct hist_trigger_data *hist_data, struct trace_event_file *file,  {  	struct ftrace_event_field *field = NULL;  	char *field_name, *modifier, *str; +	struct trace_array *tr = file->tr;  	modifier = str = kstrdup(field_str, GFP_KERNEL);  	if (!modifier) @@ -2695,7 +2739,7 @@ parse_field(struct hist_trigger_data *hist_data, struct trace_event_file *file,  		else if (strcmp(modifier, "usecs") == 0)  			*flags |= HIST_FIELD_FL_TIMESTAMP_USECS;  		else { -			hist_err("Invalid field modifier: ", modifier); +			hist_err(tr, HIST_ERR_BAD_FIELD_MODIFIER, errpos(modifier));  			field = ERR_PTR(-EINVAL);  			goto out;  		} @@ -2711,7 +2755,7 @@ parse_field(struct hist_trigger_data *hist_data, struct trace_event_file *file,  	else {  		field = trace_find_event_field(file->event_call, field_name);  		if (!field || !field->size) { -			hist_err("Couldn't find field: ", field_name); +			hist_err(tr, HIST_ERR_FIELD_NOT_FOUND, errpos(field_name));  			field = ERR_PTR(-EINVAL);  			goto out;  		} @@ -2773,7 +2817,8 @@ static struct hist_field *parse_atom(struct hist_trigger_data *hist_data,  	s = local_field_var_ref(hist_data, ref_system, ref_event, ref_var);  	if (!s) { -		hist_field = parse_var_ref(hist_data, ref_system, ref_event, ref_var); +		hist_field = parse_var_ref(hist_data, ref_system, +					   ref_event, ref_var);  		if (hist_field) {  			if (var_name) {  				hist_field = create_alias(hist_data, hist_field, var_name); @@ -2822,7 +2867,7 @@ static struct hist_field *parse_unary(struct hist_trigger_data *hist_data,  	/* we support only -(xxx) i.e. 
explicit parens required */  	if (level > 3) { -		hist_err("Too many subexpressions (3 max): ", str); +		hist_err(file->tr, HIST_ERR_TOO_MANY_SUBEXPR, errpos(str));  		ret = -EINVAL;  		goto free;  	} @@ -2877,7 +2922,8 @@ static struct hist_field *parse_unary(struct hist_trigger_data *hist_data,  	return ERR_PTR(ret);  } -static int check_expr_operands(struct hist_field *operand1, +static int check_expr_operands(struct trace_array *tr, +			       struct hist_field *operand1,  			       struct hist_field *operand2)  {  	unsigned long operand1_flags = operand1->flags; @@ -2905,7 +2951,7 @@ static int check_expr_operands(struct hist_field *operand1,  	if ((operand1_flags & HIST_FIELD_FL_TIMESTAMP_USECS) !=  	    (operand2_flags & HIST_FIELD_FL_TIMESTAMP_USECS)) { -		hist_err("Timestamp units in expression don't match", NULL); +		hist_err(tr, HIST_ERR_TIMESTAMP_MISMATCH, 0);  		return -EINVAL;  	} @@ -2923,7 +2969,7 @@ static struct hist_field *parse_expr(struct hist_trigger_data *hist_data,  	char *sep, *operand1_str;  	if (level > 3) { -		hist_err("Too many subexpressions (3 max): ", str); +		hist_err(file->tr, HIST_ERR_TOO_MANY_SUBEXPR, errpos(str));  		return ERR_PTR(-EINVAL);  	} @@ -2968,7 +3014,7 @@ static struct hist_field *parse_expr(struct hist_trigger_data *hist_data,  		goto free;  	} -	ret = check_expr_operands(operand1, operand2); +	ret = check_expr_operands(file->tr, operand1, operand2);  	if (ret)  		goto free; @@ -3161,16 +3207,14 @@ create_field_var_hist(struct hist_trigger_data *target_hist_data,  	int ret;  	if (target_hist_data->n_field_var_hists >= SYNTH_FIELDS_MAX) { -		hist_err_event("trace action: Too many field variables defined: ", -			       subsys_name, event_name, field_name); +		hist_err(tr, HIST_ERR_TOO_MANY_FIELD_VARS, errpos(field_name));  		return ERR_PTR(-EINVAL);  	}  	file = event_file(tr, subsys_name, event_name);  	if (IS_ERR(file)) { -		hist_err_event("trace action: Event file not found: ", -			       subsys_name, event_name, field_name); +		hist_err(tr, HIST_ERR_EVENT_FILE_NOT_FOUND, errpos(field_name));  		ret = PTR_ERR(file);  		return ERR_PTR(ret);  	} @@ -3183,8 +3227,7 @@ create_field_var_hist(struct hist_trigger_data *target_hist_data,  	 */  	hist_data = find_compatible_hist(target_hist_data, file);  	if (!hist_data) { -		hist_err_event("trace action: Matching event histogram not found: ", -			       subsys_name, event_name, field_name); +		hist_err(tr, HIST_ERR_HIST_NOT_FOUND, errpos(field_name));  		return ERR_PTR(-EINVAL);  	} @@ -3245,8 +3288,7 @@ create_field_var_hist(struct hist_trigger_data *target_hist_data,  		kfree(cmd);  		kfree(var_hist->cmd);  		kfree(var_hist); -		hist_err_event("trace action: Couldn't create histogram for field: ", -			       subsys_name, event_name, field_name); +		hist_err(tr, HIST_ERR_HIST_CREATE_FAIL, errpos(field_name));  		return ERR_PTR(ret);  	} @@ -3258,8 +3300,7 @@ create_field_var_hist(struct hist_trigger_data *target_hist_data,  	if (IS_ERR_OR_NULL(event_var)) {  		kfree(var_hist->cmd);  		kfree(var_hist); -		hist_err_event("trace action: Couldn't find synthetic variable: ", -			       subsys_name, event_name, field_name); +		hist_err(tr, HIST_ERR_SYNTH_VAR_NOT_FOUND, errpos(field_name));  		return ERR_PTR(-EINVAL);  	} @@ -3392,25 +3433,26 @@ static struct field_var *create_field_var(struct hist_trigger_data *hist_data,  {  	struct hist_field *val = NULL, *var = NULL;  	unsigned long flags = HIST_FIELD_FL_VAR; +	struct trace_array *tr = file->tr;  	struct field_var *field_var;  	int ret = 0;  	if 
(hist_data->n_field_vars >= SYNTH_FIELDS_MAX) { -		hist_err("Too many field variables defined: ", field_name); +		hist_err(tr, HIST_ERR_TOO_MANY_FIELD_VARS, errpos(field_name));  		ret = -EINVAL;  		goto err;  	}  	val = parse_atom(hist_data, file, field_name, &flags, NULL);  	if (IS_ERR(val)) { -		hist_err("Couldn't parse field variable: ", field_name); +		hist_err(tr, HIST_ERR_FIELD_VAR_PARSE_FAIL, errpos(field_name));  		ret = PTR_ERR(val);  		goto err;  	}  	var = create_var(hist_data, file, field_name, val->size, val->type);  	if (IS_ERR(var)) { -		hist_err("Couldn't create or find variable: ", field_name); +		hist_err(tr, HIST_ERR_VAR_CREATE_FIND_FAIL, errpos(field_name));  		kfree(val);  		ret = PTR_ERR(var);  		goto err; @@ -3543,14 +3585,20 @@ static bool cond_snapshot_update(struct trace_array *tr, void *cond_data)  	struct track_data *track_data = tr->cond_snapshot->cond_data;  	struct hist_elt_data *elt_data, *track_elt_data;  	struct snapshot_context *context = cond_data; +	struct action_data *action;  	u64 track_val;  	if (!track_data)  		return false; +	action = track_data->action_data; +  	track_val = get_track_val(track_data->hist_data, context->elt,  				  track_data->action_data); +	if (!action->track_data.check_val(track_data->track_val, track_val)) +		return false; +  	track_data->track_val = track_val;  	memcpy(track_data->key, context->key, track_data->key_len); @@ -3713,7 +3761,6 @@ static void track_data_destroy(struct hist_trigger_data *hist_data,  	struct trace_event_file *file = hist_data->event_file;  	destroy_hist_field(data->track_data.track_var, 0); -	destroy_hist_field(data->track_data.var_ref, 0);  	if (data->action == ACTION_SNAPSHOT) {  		struct track_data *track_data; @@ -3738,19 +3785,20 @@ static int track_data_create(struct hist_trigger_data *hist_data,  {  	struct hist_field *var_field, *ref_field, *track_var = NULL;  	struct trace_event_file *file = hist_data->event_file; +	struct trace_array *tr = file->tr;  	char *track_data_var_str;  	int ret = 0;  	track_data_var_str = data->track_data.var_str;  	if (track_data_var_str[0] != '$') { -		hist_err("For onmax(x) or onchange(x), x must be a variable: ", track_data_var_str); +		hist_err(tr, HIST_ERR_ONX_NOT_VAR, errpos(track_data_var_str));  		return -EINVAL;  	}  	track_data_var_str++;  	var_field = find_target_event_var(hist_data, NULL, NULL, track_data_var_str);  	if (!var_field) { -		hist_err("Couldn't find onmax or onchange variable: ", track_data_var_str); +		hist_err(tr, HIST_ERR_ONX_VAR_NOT_FOUND, errpos(track_data_var_str));  		return -EINVAL;  	} @@ -3763,7 +3811,7 @@ static int track_data_create(struct hist_trigger_data *hist_data,  	if (data->handler == HANDLER_ONMAX)  		track_var = create_var(hist_data, file, "__max", sizeof(u64), "u64");  	if (IS_ERR(track_var)) { -		hist_err("Couldn't create onmax variable: ", "__max"); +		hist_err(tr, HIST_ERR_ONX_VAR_CREATE_FAIL, 0);  		ret = PTR_ERR(track_var);  		goto out;  	} @@ -3771,7 +3819,7 @@ static int track_data_create(struct hist_trigger_data *hist_data,  	if (data->handler == HANDLER_ONCHANGE)  		track_var = create_var(hist_data, file, "__change", sizeof(u64), "u64");  	if (IS_ERR(track_var)) { -		hist_err("Couldn't create onchange variable: ", "__change"); +		hist_err(tr, HIST_ERR_ONX_VAR_CREATE_FAIL, 0);  		ret = PTR_ERR(track_var);  		goto out;  	} @@ -3782,7 +3830,8 @@ static int track_data_create(struct hist_trigger_data *hist_data,  	return ret;  } -static int parse_action_params(char *params, struct action_data *data) +static int 
parse_action_params(struct trace_array *tr, char *params, +			       struct action_data *data)  {  	char *param, *saved_param;  	bool first_param = true; @@ -3790,20 +3839,20 @@ static int parse_action_params(char *params, struct action_data *data)  	while (params) {  		if (data->n_params >= SYNTH_FIELDS_MAX) { -			hist_err("Too many action params", ""); +			hist_err(tr, HIST_ERR_TOO_MANY_PARAMS, 0);  			goto out;  		}  		param = strsep(&params, ",");  		if (!param) { -			hist_err("No action param found", ""); +			hist_err(tr, HIST_ERR_PARAM_NOT_FOUND, 0);  			ret = -EINVAL;  			goto out;  		}  		param = strstrip(param);  		if (strlen(param) < 2) { -			hist_err("Invalid action param: ", param); +			hist_err(tr, HIST_ERR_INVALID_PARAM, errpos(param));  			ret = -EINVAL;  			goto out;  		} @@ -3827,7 +3876,7 @@ static int parse_action_params(char *params, struct action_data *data)  	return ret;  } -static int action_parse(char *str, struct action_data *data, +static int action_parse(struct trace_array *tr, char *str, struct action_data *data,  			enum handler_id handler)  {  	char *action_name; @@ -3835,14 +3884,14 @@ static int action_parse(char *str, struct action_data *data,  	strsep(&str, ".");  	if (!str) { -		hist_err("action parsing: No action found", ""); +		hist_err(tr, HIST_ERR_ACTION_NOT_FOUND, 0);  		ret = -EINVAL;  		goto out;  	}  	action_name = strsep(&str, "(");  	if (!action_name || !str) { -		hist_err("action parsing: No action found", ""); +		hist_err(tr, HIST_ERR_ACTION_NOT_FOUND, 0);  		ret = -EINVAL;  		goto out;  	} @@ -3851,12 +3900,12 @@ static int action_parse(char *str, struct action_data *data,  		char *params = strsep(&str, ")");  		if (!params) { -			hist_err("action parsing: No params found for %s", "save"); +			hist_err(tr, HIST_ERR_NO_SAVE_PARAMS, 0);  			ret = -EINVAL;  			goto out;  		} -		ret = parse_action_params(params, data); +		ret = parse_action_params(tr, params, data);  		if (ret)  			goto out; @@ -3865,7 +3914,7 @@ static int action_parse(char *str, struct action_data *data,  		else if (handler == HANDLER_ONCHANGE)  			data->track_data.check_val = check_track_val_changed;  		else { -			hist_err("action parsing: Handler doesn't support action: ", action_name); +			hist_err(tr, HIST_ERR_ACTION_MISMATCH, errpos(action_name));  			ret = -EINVAL;  			goto out;  		} @@ -3877,7 +3926,7 @@ static int action_parse(char *str, struct action_data *data,  		char *params = strsep(&str, ")");  		if (!str) { -			hist_err("action parsing: No closing paren found: %s", params); +			hist_err(tr, HIST_ERR_NO_CLOSING_PAREN, errpos(params));  			ret = -EINVAL;  			goto out;  		} @@ -3887,7 +3936,7 @@ static int action_parse(char *str, struct action_data *data,  		else if (handler == HANDLER_ONCHANGE)  			data->track_data.check_val = check_track_val_changed;  		else { -			hist_err("action parsing: Handler doesn't support action: ", action_name); +			hist_err(tr, HIST_ERR_ACTION_MISMATCH, errpos(action_name));  			ret = -EINVAL;  			goto out;  		} @@ -3902,7 +3951,7 @@ static int action_parse(char *str, struct action_data *data,  			data->use_trace_keyword = true;  		if (params) { -			ret = parse_action_params(params, data); +			ret = parse_action_params(tr, params, data);  			if (ret)  				goto out;  		} @@ -3955,7 +4004,7 @@ static struct action_data *track_data_parse(struct hist_trigger_data *hist_data,  		goto free;  	} -	ret = action_parse(str, data, handler); +	ret = action_parse(hist_data->event_file->tr, str, data, handler);  	if (ret)  		goto free;   out: @@ -4025,6
+4074,7 @@ trace_action_find_var(struct hist_trigger_data *hist_data,  		      struct action_data *data,  		      char *system, char *event, char *var)  { +	struct trace_array *tr = hist_data->event_file->tr;  	struct hist_field *hist_field;  	var++; /* skip '$' */ @@ -4040,7 +4090,7 @@ trace_action_find_var(struct hist_trigger_data *hist_data,  	}  	if (!hist_field) -		hist_err_event("trace action: Couldn't find param: $", system, event, var); +		hist_err(tr, HIST_ERR_PARAM_NOT_FOUND, errpos(var));  	return hist_field;  } @@ -4098,6 +4148,7 @@ trace_action_create_field_var(struct hist_trigger_data *hist_data,  static int trace_action_create(struct hist_trigger_data *hist_data,  			       struct action_data *data)  { +	struct trace_array *tr = hist_data->event_file->tr;  	char *event_name, *param, *system = NULL;  	struct hist_field *hist_field, *var_ref;  	unsigned int i, var_ref_idx; @@ -4115,7 +4166,7 @@ static int trace_action_create(struct hist_trigger_data *hist_data,  	event = find_synth_event(synth_event_name);  	if (!event) { -		hist_err("trace action: Couldn't find synthetic event: ", synth_event_name); +		hist_err(tr, HIST_ERR_SYNTH_EVENT_NOT_FOUND, errpos(synth_event_name));  		return -EINVAL;  	} @@ -4176,15 +4227,14 @@ static int trace_action_create(struct hist_trigger_data *hist_data,  			continue;  		} -		hist_err_event("trace action: Param type doesn't match synthetic event field type: ", -			       system, event_name, param); +		hist_err(tr, HIST_ERR_SYNTH_TYPE_MISMATCH, errpos(param));  		kfree(p);  		ret = -EINVAL;  		goto err;  	}  	if (field_pos != event->n_fields) { -		hist_err("trace action: Param count doesn't match synthetic event field count: ", event->name); +		hist_err(tr, HIST_ERR_SYNTH_COUNT_MISMATCH, errpos(event->name));  		ret = -EINVAL;  		goto err;  	} @@ -4203,6 +4253,7 @@ static int action_create(struct hist_trigger_data *hist_data,  			 struct action_data *data)  {  	struct trace_event_file *file = hist_data->event_file; +	struct trace_array *tr = file->tr;  	struct track_data *track_data;  	struct field_var *field_var;  	unsigned int i; @@ -4230,7 +4281,7 @@ static int action_create(struct hist_trigger_data *hist_data,  	if (data->action == ACTION_SAVE) {  		if (hist_data->n_save_vars) {  			ret = -EEXIST; -			hist_err("save action: Can't have more than one save() action per hist", ""); +			hist_err(tr, HIST_ERR_TOO_MANY_SAVE_ACTIONS, 0);  			goto out;  		} @@ -4243,7 +4294,8 @@ static int action_create(struct hist_trigger_data *hist_data,  			field_var = create_target_field_var(hist_data, NULL, NULL, param);  			if (IS_ERR(field_var)) { -				hist_err("save action: Couldn't create field variable: ", param); +				hist_err(tr, HIST_ERR_FIELD_VAR_CREATE_FAIL, +					 errpos(param));  				ret = PTR_ERR(field_var);  				kfree(param);  				goto out; @@ -4277,19 +4329,18 @@ static struct action_data *onmatch_parse(struct trace_array *tr, char *str)  	match_event = strsep(&str, ")");  	if (!match_event || !str) { -		hist_err("onmatch: Missing closing paren: ", match_event); +		hist_err(tr, HIST_ERR_NO_CLOSING_PAREN, errpos(match_event));  		goto free;  	}  	match_event_system = strsep(&match_event, ".");  	if (!match_event) { -		hist_err("onmatch: Missing subsystem for match event: ", match_event_system); +		hist_err(tr, HIST_ERR_SUBSYS_NOT_FOUND, errpos(match_event_system));  		goto free;  	}  	if (IS_ERR(event_file(tr, match_event_system, match_event))) { -		hist_err_event("onmatch: Invalid subsystem or event name: ", -			       match_event_system, 
match_event, NULL); +		hist_err(tr, HIST_ERR_INVALID_SUBSYS_EVENT, errpos(match_event));  		goto free;  	} @@ -4305,7 +4356,7 @@ static struct action_data *onmatch_parse(struct trace_array *tr, char *str)  		goto free;  	} -	ret = action_parse(str, data, HANDLER_ONMATCH); +	ret = action_parse(tr, str, data, HANDLER_ONMATCH);  	if (ret)  		goto free;   out: @@ -4374,13 +4425,14 @@ static int create_var_field(struct hist_trigger_data *hist_data,  			    struct trace_event_file *file,  			    char *var_name, char *expr_str)  { +	struct trace_array *tr = hist_data->event_file->tr;  	unsigned long flags = 0;  	if (WARN_ON(val_idx >= TRACING_MAP_VALS_MAX + TRACING_MAP_VARS_MAX))  		return -EINVAL;  	if (find_var(hist_data, file, var_name) && !hist_data->remove) { -		hist_err("Variable already defined: ", var_name); +		hist_err(tr, HIST_ERR_DUPLICATE_VAR, errpos(var_name));  		return -EINVAL;  	} @@ -4437,8 +4489,8 @@ static int create_key_field(struct hist_trigger_data *hist_data,  			    struct trace_event_file *file,  			    char *field_str)  { +	struct trace_array *tr = hist_data->event_file->tr;  	struct hist_field *hist_field = NULL; -  	unsigned long flags = 0;  	unsigned int key_size;  	int ret = 0; @@ -4460,8 +4512,8 @@ static int create_key_field(struct hist_trigger_data *hist_data,  			goto out;  		} -		if (hist_field->flags & HIST_FIELD_FL_VAR_REF) { -			hist_err("Using variable references as keys not supported: ", field_str); +		if (field_has_hist_vars(hist_field, 0))	{ +			hist_err(tr, HIST_ERR_INVALID_REF_KEY, errpos(field_str));  			destroy_hist_field(hist_field, 0);  			ret = -EINVAL;  			goto out; @@ -4562,6 +4614,7 @@ static void free_var_defs(struct hist_trigger_data *hist_data)  static int parse_var_defs(struct hist_trigger_data *hist_data)  { +	struct trace_array *tr = hist_data->event_file->tr;  	char *s, *str, *var_name, *field_str;  	unsigned int i, j, n_vars = 0;  	int ret = 0; @@ -4575,13 +4628,14 @@ static int parse_var_defs(struct hist_trigger_data *hist_data)  			var_name = strsep(&field_str, "=");  			if (!var_name || !field_str) { -				hist_err("Malformed assignment: ", var_name); +				hist_err(tr, HIST_ERR_MALFORMED_ASSIGNMENT, +					 errpos(var_name));  				ret = -EINVAL;  				goto free;  			}  			if (n_vars == TRACING_MAP_VARS_MAX) { -				hist_err("Too many variables defined: ", var_name); +				hist_err(tr, HIST_ERR_TOO_MANY_VARS, errpos(var_name));  				ret = -EINVAL;  				goto free;  			} @@ -5187,7 +5241,6 @@ static void event_hist_trigger(struct event_trigger_data *data, void *rec,  	u64 var_ref_vals[TRACING_MAP_VARS_MAX];  	char compound_key[HIST_KEY_SIZE_MAX];  	struct tracing_map_elt *elt = NULL; -	struct stack_trace stacktrace;  	struct hist_field *key_field;  	u64 field_contents;  	void *key = NULL; @@ -5199,14 +5252,9 @@ static void event_hist_trigger(struct event_trigger_data *data, void *rec,  		key_field = hist_data->fields[i];  		if (key_field->flags & HIST_FIELD_FL_STACKTRACE) { -			stacktrace.max_entries = HIST_STACKTRACE_DEPTH; -			stacktrace.entries = entries; -			stacktrace.nr_entries = 0; -			stacktrace.skip = HIST_STACKTRACE_SKIP; - -			memset(stacktrace.entries, 0, HIST_STACKTRACE_SIZE); -			save_stack_trace(&stacktrace); - +			memset(entries, 0, HIST_STACKTRACE_SIZE); +			stack_trace_save(entries, HIST_STACKTRACE_DEPTH, +					 HIST_STACKTRACE_SKIP);  			key = entries;  		} else {  			field_contents = key_field->fn(key_field, elt, rbe, rec); @@ -5247,7 +5295,7 @@ static void hist_trigger_stacktrace_print(struct seq_file *m,  	unsigned int 
i;  	for (i = 0; i < max_entries; i++) { -		if (stacktrace_entries[i] == ULONG_MAX) +		if (!stacktrace_entries[i])  			return;  		seq_printf(m, "%*c", 1 + spaces, ' '); @@ -5438,11 +5486,6 @@ static int hist_show(struct seq_file *m, void *v)  			hist_trigger_show(m, data, n++);  	} -	if (have_hist_err()) { -		seq_printf(m, "\nERROR: %s\n", hist_err_str); -		seq_printf(m, "  Last command: %s\n", last_hist_cmd); -	} -   out_unlock:  	mutex_unlock(&event_mutex); @@ -5807,6 +5850,7 @@ static int hist_register_trigger(char *glob, struct event_trigger_ops *ops,  {  	struct hist_trigger_data *hist_data = data->private_data;  	struct event_trigger_data *test, *named_data = NULL; +	struct trace_array *tr = file->tr;  	int ret = 0;  	if (hist_data->attrs->name) { @@ -5814,7 +5858,7 @@ static int hist_register_trigger(char *glob, struct event_trigger_ops *ops,  		if (named_data) {  			if (!hist_trigger_match(data, named_data, named_data,  						true)) { -				hist_err("Named hist trigger doesn't match existing named trigger (includes variables): ", hist_data->attrs->name); +				hist_err(tr, HIST_ERR_NAMED_MISMATCH, errpos(hist_data->attrs->name));  				ret = -EINVAL;  				goto out;  			} @@ -5835,7 +5879,7 @@ static int hist_register_trigger(char *glob, struct event_trigger_ops *ops,  			else if (hist_data->attrs->clear)  				hist_clear(test);  			else { -				hist_err("Hist trigger already exists", NULL); +				hist_err(tr, HIST_ERR_TRIGGER_EEXIST, 0);  				ret = -EEXIST;  			}  			goto out; @@ -5843,7 +5887,7 @@ static int hist_register_trigger(char *glob, struct event_trigger_ops *ops,  	}   new:  	if (hist_data->attrs->cont || hist_data->attrs->clear) { -		hist_err("Can't clear or continue a nonexistent hist trigger", NULL); +		hist_err(tr, HIST_ERR_TRIGGER_ENOENT_CLEAR, 0);  		ret = -ENOENT;  		goto out;  	} @@ -5868,7 +5912,7 @@ static int hist_register_trigger(char *glob, struct event_trigger_ops *ops,  		ret = tracing_set_clock(file->tr, hist_data->attrs->clock);  		if (ret) { -			hist_err("Couldn't set trace_clock: ", clock); +			hist_err(tr, HIST_ERR_SET_CLOCK_FAIL, errpos(clock));  			goto out;  		} @@ -6044,8 +6088,8 @@ static int event_hist_trigger_func(struct event_command *cmd_ops,  	lockdep_assert_held(&event_mutex);  	if (glob && strlen(glob)) { -		last_cmd_set(param);  		hist_err_clear(); +		last_cmd_set(file, param);  	}  	if (!param) @@ -6086,7 +6130,7 @@ static int event_hist_trigger_func(struct event_command *cmd_ops,  		trigger = strstrip(trigger);  	} -	attrs = parse_hist_trigger_attrs(trigger); +	attrs = parse_hist_trigger_attrs(file->tr, trigger);  	if (IS_ERR(attrs))  		return PTR_ERR(attrs); diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index cd12ecb66eb9..2a2912cb4533 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -731,7 +731,8 @@ int set_trigger_filter(char *filter_str,  		goto out;  	/* The filter is for the 'trigger' event, not the triggered event */ -	ret = create_event_filter(file->event_call, filter_str, false, &filter); +	ret = create_event_filter(file->tr, file->event_call, +				  filter_str, false, &filter);  	/*  	 * If create_event_filter() fails, filter still needs to be freed.  	 * Which the calling code will do with data->filter. 
diff --git a/kernel/trace/trace_kdb.c b/kernel/trace/trace_kdb.c index 810d78a8d14c..cca65044c14c 100644 --- a/kernel/trace/trace_kdb.c +++ b/kernel/trace/trace_kdb.c @@ -17,36 +17,28 @@  #include "trace.h"  #include "trace_output.h" -static void ftrace_dump_buf(int skip_lines, long cpu_file) +static struct trace_iterator iter; +static struct ring_buffer_iter *buffer_iter[CONFIG_NR_CPUS]; + +static void ftrace_dump_buf(int skip_entries, long cpu_file)  { -	/* use static because iter can be a bit big for the stack */ -	static struct trace_iterator iter; -	static struct ring_buffer_iter *buffer_iter[CONFIG_NR_CPUS];  	struct trace_array *tr;  	unsigned int old_userobj;  	int cnt = 0, cpu; -	trace_init_global_iter(&iter); -	iter.buffer_iter = buffer_iter;  	tr = iter.tr; -	for_each_tracing_cpu(cpu) { -		atomic_inc(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled); -	} -  	old_userobj = tr->trace_flags;  	/* don't look at user memory in panic mode */  	tr->trace_flags &= ~TRACE_ITER_SYM_USEROBJ;  	kdb_printf("Dumping ftrace buffer:\n"); +	if (skip_entries) +		kdb_printf("(skipping %d entries)\n", skip_entries); -	/* reset all but tr, trace, and overruns */ -	memset(&iter.seq, 0, -		   sizeof(struct trace_iterator) - -		   offsetof(struct trace_iterator, seq)); +	trace_iterator_reset(&iter);  	iter.iter_flags |= TRACE_FILE_LAT_FMT; -	iter.pos = -1;  	if (cpu_file == RING_BUFFER_ALL_CPUS) {  		for_each_tracing_cpu(cpu) { @@ -70,11 +62,11 @@ static void ftrace_dump_buf(int skip_lines, long cpu_file)  			kdb_printf("---------------------------------\n");  		cnt++; -		if (!skip_lines) { +		if (!skip_entries) {  			print_trace_line(&iter);  			trace_printk_seq(&iter.seq);  		} else { -			skip_lines--; +			skip_entries--;  		}  		if (KDB_FLAG(CMD_INTERRUPT)) @@ -90,10 +82,6 @@ out:  	tr->trace_flags = old_userobj;  	for_each_tracing_cpu(cpu) { -		atomic_dec(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled); -	} - -	for_each_tracing_cpu(cpu) {  		if (iter.buffer_iter[cpu]) {  			ring_buffer_read_finish(iter.buffer_iter[cpu]);  			iter.buffer_iter[cpu] = NULL; @@ -106,17 +94,19 @@ out:   */  static int kdb_ftdump(int argc, const char **argv)  { -	int skip_lines = 0; +	int skip_entries = 0;  	long cpu_file;  	char *cp; +	int cnt; +	int cpu;  	if (argc > 2)  		return KDB_ARGCOUNT;  	if (argc) { -		skip_lines = simple_strtol(argv[1], &cp, 0); +		skip_entries = simple_strtol(argv[1], &cp, 0);  		if (*cp) -			skip_lines = 0; +			skip_entries = 0;  	}  	if (argc == 2) { @@ -129,7 +119,29 @@ static int kdb_ftdump(int argc, const char **argv)  	}  	kdb_trap_printk++; -	ftrace_dump_buf(skip_lines, cpu_file); + +	trace_init_global_iter(&iter); +	iter.buffer_iter = buffer_iter; + +	for_each_tracing_cpu(cpu) { +		atomic_inc(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled); +	} + +	/* A negative skip_entries means skip all but the last entries */ +	if (skip_entries < 0) { +		if (cpu_file == RING_BUFFER_ALL_CPUS) +			cnt = trace_total_entries(NULL); +		else +			cnt = trace_total_entries_cpu(NULL, cpu_file); +		skip_entries = max(cnt + skip_entries, 0); +	} + +	ftrace_dump_buf(skip_entries, cpu_file); + +	for_each_tracing_cpu(cpu) { +		atomic_dec(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled); +	} +  	kdb_trap_printk--;  	return 0; @@ -137,8 +149,9 @@ static int kdb_ftdump(int argc, const char **argv)  static __init int kdb_ftrace_register(void)  { -	kdb_register_flags("ftdump", kdb_ftdump, "[skip_#lines] [cpu]", -			    "Dump ftrace log", 0, KDB_ENABLE_ALWAYS_SAFE); +	kdb_register_flags("ftdump", 
kdb_ftdump, "[skip_#entries] [cpu]", +			    "Dump ftrace log; -skip dumps last #entries", 0, +			    KDB_ENABLE_ALWAYS_SAFE);  	return 0;  } diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 5d5129b05df7..7d736248a070 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -441,13 +441,8 @@ static int __register_trace_kprobe(struct trace_kprobe *tk)  	else  		ret = register_kprobe(&tk->rp.kp); -	if (ret == 0) { +	if (ret == 0)  		tk->tp.flags |= TP_FLAG_REGISTERED; -	} else if (ret == -EILSEQ) { -		pr_warn("Probing address(0x%p) is not an instruction boundary.\n", -			tk->rp.kp.addr); -		ret = -EINVAL; -	}  	return ret;  } @@ -591,7 +586,7 @@ static int trace_kprobe_create(int argc, const char *argv[])  	 * Type of args:  	 *  FETCHARG:TYPE : use TYPE instead of unsigned long.  	 */ -	struct trace_kprobe *tk; +	struct trace_kprobe *tk = NULL;  	int i, len, ret = 0;  	bool is_return = false;  	char *symbol = NULL, *tmp = NULL; @@ -615,44 +610,50 @@ static int trace_kprobe_create(int argc, const char *argv[])  	if (argc < 2)  		return -ECANCELED; +	trace_probe_log_init("trace_kprobe", argc, argv); +  	event = strchr(&argv[0][1], ':');  	if (event)  		event++;  	if (isdigit(argv[0][1])) {  		if (!is_return) { -			pr_info("Maxactive is not for kprobe"); -			return -EINVAL; +			trace_probe_log_err(1, MAXACT_NO_KPROBE); +			goto parse_error;  		}  		if (event)  			len = event - &argv[0][1] - 1;  		else  			len = strlen(&argv[0][1]); -		if (len > MAX_EVENT_NAME_LEN - 1) -			return -E2BIG; +		if (len > MAX_EVENT_NAME_LEN - 1) { +			trace_probe_log_err(1, BAD_MAXACT); +			goto parse_error; +		}  		memcpy(buf, &argv[0][1], len);  		buf[len] = '\0';  		ret = kstrtouint(buf, 0, &maxactive);  		if (ret || !maxactive) { -			pr_info("Invalid maxactive number\n"); -			return ret; +			trace_probe_log_err(1, BAD_MAXACT); +			goto parse_error;  		}  		/* kretprobes instances are iterated over via a list. The  		 * maximum should stay reasonable.  		 */  		if (maxactive > KRETPROBE_MAXACTIVE_MAX) { -			pr_info("Maxactive is too big (%d > %d).\n", -				maxactive, KRETPROBE_MAXACTIVE_MAX); -			return -E2BIG; +			trace_probe_log_err(1, MAXACT_TOO_BIG); +			goto parse_error;  		}  	}  	/* try to parse an address. if that fails, try to read the  	 * input as a symbol. 
*/  	if (kstrtoul(argv[1], 0, (unsigned long *)&addr)) { +		trace_probe_log_set_index(1);  		/* Check whether uprobe event specified */ -		if (strchr(argv[1], '/') && strchr(argv[1], ':')) -			return -ECANCELED; +		if (strchr(argv[1], '/') && strchr(argv[1], ':')) { +			ret = -ECANCELED; +			goto error; +		}  		/* a symbol specified */  		symbol = kstrdup(argv[1], GFP_KERNEL);  		if (!symbol) @@ -660,23 +661,23 @@ static int trace_kprobe_create(int argc, const char *argv[])  		/* TODO: support .init module functions */  		ret = traceprobe_split_symbol_offset(symbol, &offset);  		if (ret || offset < 0 || offset > UINT_MAX) { -			pr_info("Failed to parse either an address or a symbol.\n"); -			goto out; +			trace_probe_log_err(0, BAD_PROBE_ADDR); +			goto parse_error;  		}  		if (kprobe_on_func_entry(NULL, symbol, offset))  			flags |= TPARG_FL_FENTRY;  		if (offset && is_return && !(flags & TPARG_FL_FENTRY)) { -			pr_info("Given offset is not valid for return probe.\n"); -			ret = -EINVAL; -			goto out; +			trace_probe_log_err(0, BAD_RETPROBE); +			goto parse_error;  		}  	} -	argc -= 2; argv += 2; +	trace_probe_log_set_index(0);  	if (event) { -		ret = traceprobe_parse_event_name(&event, &group, buf); +		ret = traceprobe_parse_event_name(&event, &group, buf, +						  event - argv[0]);  		if (ret) -			goto out; +			goto parse_error;  	} else {  		/* Make a new event name */  		if (symbol) @@ -691,13 +692,14 @@ static int trace_kprobe_create(int argc, const char *argv[])  	/* setup a probe */  	tk = alloc_trace_kprobe(group, event, addr, symbol, offset, maxactive, -			       argc, is_return); +			       argc - 2, is_return);  	if (IS_ERR(tk)) {  		ret = PTR_ERR(tk); -		/* This must return -ENOMEM otherwise there is a bug */ +		/* This must return -ENOMEM, else there is a bug */  		WARN_ON_ONCE(ret != -ENOMEM); -		goto out; +		goto out;	/* We know tk is not allocated */  	} +	argc -= 2; argv += 2;  	/* parse arguments */  	for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) { @@ -707,19 +709,32 @@ static int trace_kprobe_create(int argc, const char *argv[])  			goto error;  		} +		trace_probe_log_set_index(i + 2);  		ret = traceprobe_parse_probe_arg(&tk->tp, i, tmp, flags);  		kfree(tmp);  		if (ret) -			goto error; +			goto error;	/* This can be -ENOMEM */  	}  	ret = register_trace_kprobe(tk); -	if (ret) +	if (ret) { +		trace_probe_log_set_index(1); +		if (ret == -EILSEQ) +			trace_probe_log_err(0, BAD_INSN_BNDRY); +		else if (ret == -ENOENT) +			trace_probe_log_err(0, BAD_PROBE_ADDR); +		else if (ret != -ENOMEM) +			trace_probe_log_err(0, FAIL_REG_PROBE);  		goto error; +	} +  out: +	trace_probe_log_clear();  	kfree(symbol);  	return ret; +parse_error: +	ret = -EINVAL;  error:  	free_trace_kprobe(tk);  	goto out; diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index 8f8411e7835f..a347faced959 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -13,6 +13,11 @@  #include "trace_probe.h" +#undef C +#define C(a, b)		b + +static const char *trace_probe_err_text[] = { ERRORS }; +  static const char *reserved_field_names[] = {  	"common_type",  	"common_flags", @@ -133,6 +138,60 @@ fail:  	return NULL;  } +static struct trace_probe_log trace_probe_log; + +void trace_probe_log_init(const char *subsystem, int argc, const char **argv) +{ +	trace_probe_log.subsystem = subsystem; +	trace_probe_log.argc = argc; +	trace_probe_log.argv = argv; +	trace_probe_log.index = 0; +} + +void trace_probe_log_clear(void) +{ +	memset(&trace_probe_log, 0, 
sizeof(trace_probe_log)); +} + +void trace_probe_log_set_index(int index) +{ +	trace_probe_log.index = index; +} + +void __trace_probe_log_err(int offset, int err_type) +{ +	char *command, *p; +	int i, len = 0, pos = 0; + +	if (!trace_probe_log.argv) +		return; + +	/* Recalcurate the length and allocate buffer */ +	for (i = 0; i < trace_probe_log.argc; i++) { +		if (i == trace_probe_log.index) +			pos = len; +		len += strlen(trace_probe_log.argv[i]) + 1; +	} +	command = kzalloc(len, GFP_KERNEL); +	if (!command) +		return; + +	/* And make a command string from argv array */ +	p = command; +	for (i = 0; i < trace_probe_log.argc; i++) { +		len = strlen(trace_probe_log.argv[i]); +		strcpy(p, trace_probe_log.argv[i]); +		p[len] = ' '; +		p += len + 1; +	} +	*(p - 1) = '\0'; + +	tracing_log_err(NULL, trace_probe_log.subsystem, command, +			trace_probe_err_text, err_type, pos + offset); + +	kfree(command); +} +  /* Split symbol and offset. */  int traceprobe_split_symbol_offset(char *symbol, long *offset)  { @@ -156,7 +215,7 @@ int traceprobe_split_symbol_offset(char *symbol, long *offset)  /* @buf must has MAX_EVENT_NAME_LEN size */  int traceprobe_parse_event_name(const char **pevent, const char **pgroup, -				char *buf) +				char *buf, int offset)  {  	const char *slash, *event = *pevent;  	int len; @@ -164,32 +223,33 @@ int traceprobe_parse_event_name(const char **pevent, const char **pgroup,  	slash = strchr(event, '/');  	if (slash) {  		if (slash == event) { -			pr_info("Group name is not specified\n"); +			trace_probe_log_err(offset, NO_GROUP_NAME);  			return -EINVAL;  		}  		if (slash - event + 1 > MAX_EVENT_NAME_LEN) { -			pr_info("Group name is too long\n"); -			return -E2BIG; +			trace_probe_log_err(offset, GROUP_TOO_LONG); +			return -EINVAL;  		}  		strlcpy(buf, event, slash - event + 1);  		if (!is_good_name(buf)) { -			pr_info("Group name must follow the same rules as C identifiers\n"); +			trace_probe_log_err(offset, BAD_GROUP_NAME);  			return -EINVAL;  		}  		*pgroup = buf;  		*pevent = slash + 1; +		offset += slash - event + 1;  		event = *pevent;  	}  	len = strlen(event);  	if (len == 0) { -		pr_info("Event name is not specified\n"); +		trace_probe_log_err(offset, NO_EVENT_NAME);  		return -EINVAL;  	} else if (len > MAX_EVENT_NAME_LEN) { -		pr_info("Event name is too long\n"); -		return -E2BIG; +		trace_probe_log_err(offset, EVENT_TOO_LONG); +		return -EINVAL;  	}  	if (!is_good_name(event)) { -		pr_info("Event name must follow the same rules as C identifiers\n"); +		trace_probe_log_err(offset, BAD_EVENT_NAME);  		return -EINVAL;  	}  	return 0; @@ -198,56 +258,67 @@ int traceprobe_parse_event_name(const char **pevent, const char **pgroup,  #define PARAM_MAX_STACK (THREAD_SIZE / sizeof(unsigned long))  static int parse_probe_vars(char *arg, const struct fetch_type *t, -			    struct fetch_insn *code, unsigned int flags) +			struct fetch_insn *code, unsigned int flags, int offs)  {  	unsigned long param;  	int ret = 0;  	int len;  	if (strcmp(arg, "retval") == 0) { -		if (flags & TPARG_FL_RETURN) +		if (flags & TPARG_FL_RETURN) {  			code->op = FETCH_OP_RETVAL; -		else +		} else { +			trace_probe_log_err(offs, RETVAL_ON_PROBE);  			ret = -EINVAL; +		}  	} else if ((len = str_has_prefix(arg, "stack"))) {  		if (arg[len] == '\0') {  			code->op = FETCH_OP_STACKP;  		} else if (isdigit(arg[len])) {  			ret = kstrtoul(arg + len, 10, &param); -			if (ret || ((flags & TPARG_FL_KERNEL) && -				    param > PARAM_MAX_STACK)) +			if (ret) { +				goto inval_var; +			} else if ((flags &
TPARG_FL_KERNEL) && +				    param > PARAM_MAX_STACK) { +				trace_probe_log_err(offs, BAD_STACK_NUM);  				ret = -EINVAL; -			else { +			} else {  				code->op = FETCH_OP_STACK;  				code->param = (unsigned int)param;  			}  		} else -			ret = -EINVAL; +			goto inval_var;  	} else if (strcmp(arg, "comm") == 0) {  		code->op = FETCH_OP_COMM;  #ifdef CONFIG_HAVE_FUNCTION_ARG_ACCESS_API  	} else if (((flags & TPARG_FL_MASK) ==  		    (TPARG_FL_KERNEL | TPARG_FL_FENTRY)) &&  		   (len = str_has_prefix(arg, "arg"))) { -		if (!isdigit(arg[len])) -			return -EINVAL;  		ret = kstrtoul(arg + len, 10, &param); -		if (ret || !param || param > PARAM_MAX_STACK) +		if (ret) { +			goto inval_var; +		} else if (!param || param > PARAM_MAX_STACK) { +			trace_probe_log_err(offs, BAD_ARG_NUM);  			return -EINVAL; +		}  		code->op = FETCH_OP_ARG;  		code->param = (unsigned int)param - 1;  #endif  	} else -		ret = -EINVAL; +		goto inval_var;  	return ret; + +inval_var: +	trace_probe_log_err(offs, BAD_VAR); +	return -EINVAL;  }  /* Recursive argument parser */  static int  parse_probe_arg(char *arg, const struct fetch_type *type,  		struct fetch_insn **pcode, struct fetch_insn *end, -		unsigned int flags) +		unsigned int flags, int offs)  {  	struct fetch_insn *code = *pcode;  	unsigned long param; @@ -257,7 +328,7 @@ parse_probe_arg(char *arg, const struct fetch_type *type,  	switch (arg[0]) {  	case '$': -		ret = parse_probe_vars(arg + 1, type, code, flags); +		ret = parse_probe_vars(arg + 1, type, code, flags, offs);  		break;  	case '%':	/* named register */ @@ -266,47 +337,57 @@ parse_probe_arg(char *arg, const struct fetch_type *type,  			code->op = FETCH_OP_REG;  			code->param = (unsigned int)ret;  			ret = 0; -		} +		} else +			trace_probe_log_err(offs, BAD_REG_NAME);  		break;  	case '@':	/* memory, file-offset or symbol */  		if (isdigit(arg[1])) {  			ret = kstrtoul(arg + 1, 0, &param);  			if (ret) { +				trace_probe_log_err(offs, BAD_MEM_ADDR);  				break; +			}  			/* load address */  			code->op = FETCH_OP_IMM;  			code->immediate = param;  		} else if (arg[1] == '+') {  			/* kprobes don't support file offsets */ -			if (flags & TPARG_FL_KERNEL) +			if (flags & TPARG_FL_KERNEL) { +				trace_probe_log_err(offs, FILE_ON_KPROBE);  				return -EINVAL; - +			}  			ret = kstrtol(arg + 2, 0, &offset); -			if (ret) +			if (ret) { +				trace_probe_log_err(offs, BAD_FILE_OFFS);  				break; +			}  			code->op = FETCH_OP_FOFFS;  			code->immediate = (unsigned long)offset;  // imm64?  		
} else {  			/* uprobes don't support symbols */ -			if (!(flags & TPARG_FL_KERNEL)) +			if (!(flags & TPARG_FL_KERNEL)) { +				trace_probe_log_err(offs, SYM_ON_UPROBE);  				return -EINVAL; - +			}  			/* Preserve symbol for updating */  			code->op = FETCH_NOP_SYMBOL;  			code->data = kstrdup(arg + 1, GFP_KERNEL);  			if (!code->data)  				return -ENOMEM; -			if (++code == end) -				return -E2BIG; - +			if (++code == end) { +				trace_probe_log_err(offs, TOO_MANY_OPS); +				return -EINVAL; +			}  			code->op = FETCH_OP_IMM;  			code->immediate = 0;  		}  		/* These are fetching from memory */ -		if (++code == end) -			return -E2BIG; +		if (++code == end) { +			trace_probe_log_err(offs, TOO_MANY_OPS); +			return -EINVAL; +		}  		*pcode = code;  		code->op = FETCH_OP_DEREF;  		code->offset = offset; @@ -317,28 +398,38 @@ parse_probe_arg(char *arg, const struct fetch_type *type,  		/* fall through */  	case '-':  		tmp = strchr(arg, '('); -		if (!tmp) +		if (!tmp) { +			trace_probe_log_err(offs, DEREF_NEED_BRACE);  			return -EINVAL; - +		}  		*tmp = '\0';  		ret = kstrtol(arg, 0, &offset); -		if (ret) +		if (ret) { +			trace_probe_log_err(offs, BAD_DEREF_OFFS);  			break; - +		} +		offs += (tmp + 1 - arg) + (arg[0] != '-' ? 1 : 0);  		arg = tmp + 1;  		tmp = strrchr(arg, ')'); - -		if (tmp) { +		if (!tmp) { +			trace_probe_log_err(offs + strlen(arg), +					    DEREF_OPEN_BRACE); +			return -EINVAL; +		} else {  			const struct fetch_type *t2 = find_fetch_type(NULL);  			*tmp = '\0'; -			ret = parse_probe_arg(arg, t2, &code, end, flags); +			ret = parse_probe_arg(arg, t2, &code, end, flags, offs);  			if (ret)  				break; -			if (code->op == FETCH_OP_COMM) +			if (code->op == FETCH_OP_COMM) { +				trace_probe_log_err(offs, COMM_CANT_DEREF);  				return -EINVAL; -			if (++code == end) -				return -E2BIG; +			} +			if (++code == end) { +				trace_probe_log_err(offs, TOO_MANY_OPS); +				return -EINVAL; +			}  			*pcode = code;  			code->op = FETCH_OP_DEREF; @@ -348,6 +439,7 @@ parse_probe_arg(char *arg, const struct fetch_type *type,  	}  	if (!ret && code->op == FETCH_OP_NOP) {  		/* Parsed, but do not find fetch method */ +		trace_probe_log_err(offs, BAD_FETCH_ARG);  		ret = -EINVAL;  	}  	return ret; @@ -379,7 +471,7 @@ static int __parse_bitfield_probe_arg(const char *bf,  		return -EINVAL;  	code++;  	if (code->op != FETCH_OP_NOP) -		return -E2BIG; +		return -EINVAL;  	*pcode = code;  	code->op = FETCH_OP_MOD_BF; @@ -392,44 +484,66 @@ static int __parse_bitfield_probe_arg(const char *bf,  /* String length checking wrapper */  static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size, -		struct probe_arg *parg, unsigned int flags) +		struct probe_arg *parg, unsigned int flags, int offset)  {  	struct fetch_insn *code, *scode, *tmp = NULL; -	char *t, *t2; +	char *t, *t2, *t3;  	int ret, len; -	if (strlen(arg) > MAX_ARGSTR_LEN) { -		pr_info("Argument is too long.: %s\n",  arg); -		return -ENOSPC; +	len = strlen(arg); +	if (len > MAX_ARGSTR_LEN) { +		trace_probe_log_err(offset, ARG_TOO_LONG); +		return -EINVAL; +	} else if (len == 0) { +		trace_probe_log_err(offset, NO_ARG_BODY); +		return -EINVAL;  	} +  	parg->comm = kstrdup(arg, GFP_KERNEL); -	if (!parg->comm) { -		pr_info("Failed to allocate memory for command '%s'.\n", arg); +	if (!parg->comm)  		return -ENOMEM; -	} +  	t = strchr(arg, ':');  	if (t) {  		*t = '\0';  		t2 = strchr(++t, '[');  		if (t2) { -			*t2 = '\0'; -			parg->count = simple_strtoul(t2 + 1, &t2, 0); -			if (strcmp(t2, "]") || parg->count == 0) +			*t2++ = 
'\0'; +			t3 = strchr(t2, ']'); +			if (!t3) { +				offset += t2 + strlen(t2) - arg; +				trace_probe_log_err(offset, +						    ARRAY_NO_CLOSE); +				return -EINVAL; +			} else if (t3[1] != '\0') { +				trace_probe_log_err(offset + t3 + 1 - arg, +						    BAD_ARRAY_SUFFIX);  				return -EINVAL; -			if (parg->count > MAX_ARRAY_LEN) -				return -E2BIG; +			} +			*t3 = '\0'; +			if (kstrtouint(t2, 0, &parg->count) || !parg->count) { +				trace_probe_log_err(offset + t2 - arg, +						    BAD_ARRAY_NUM); +				return -EINVAL; +			} +			if (parg->count > MAX_ARRAY_LEN) { +				trace_probe_log_err(offset + t2 - arg, +						    ARRAY_TOO_BIG); +				return -EINVAL; +			}  		}  	} -	/* -	 * The default type of $comm should be "string", and it can't be -	 * dereferenced. -	 */ -	if (!t && strcmp(arg, "$comm") == 0) + +	/* Since $comm can not be dereferred, we can find $comm by strcmp */ +	if (strcmp(arg, "$comm") == 0) { +		/* The type of $comm must be "string", and not an array. */ +		if (parg->count || (t && strcmp(t, "string"))) +			return -EINVAL;  		parg->type = find_fetch_type("string"); -	else +	} else  		parg->type = find_fetch_type(t);  	if (!parg->type) { -		pr_info("Unsupported type: %s\n", t); +		trace_probe_log_err(offset + (t ? (t - arg) : 0), BAD_TYPE);  		return -EINVAL;  	}  	parg->offset = *size; @@ -444,13 +558,13 @@ static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size,  			 parg->count);  	} -	code = tmp = kzalloc(sizeof(*code) * FETCH_INSN_MAX, GFP_KERNEL); +	code = tmp = kcalloc(FETCH_INSN_MAX, sizeof(*code), GFP_KERNEL);  	if (!code)  		return -ENOMEM;  	code[FETCH_INSN_MAX - 1].op = FETCH_OP_END;  	ret = parse_probe_arg(arg, parg->type, &code, &code[FETCH_INSN_MAX - 1], -			      flags); +			      flags, offset);  	if (ret)  		goto fail; @@ -458,7 +572,8 @@ static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size,  	if (!strcmp(parg->type->name, "string")) {  		if (code->op != FETCH_OP_DEREF && code->op != FETCH_OP_IMM &&  		    code->op != FETCH_OP_COMM) { -			pr_info("string only accepts memory or address.\n"); +			trace_probe_log_err(offset + (t ? (t - arg) : 0), +					    BAD_STRING);  			ret = -EINVAL;  			goto fail;  		} @@ -470,7 +585,8 @@ static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size,  			 */  			code++;  			if (code->op != FETCH_OP_NOP) { -				ret = -E2BIG; +				trace_probe_log_err(offset, TOO_MANY_OPS); +				ret = -EINVAL;  				goto fail;  			}  		} @@ -483,7 +599,8 @@ static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size,  	} else {  		code++;  		if (code->op != FETCH_OP_NOP) { -			ret = -E2BIG; +			trace_probe_log_err(offset, TOO_MANY_OPS); +			ret = -EINVAL;  			goto fail;  		}  		code->op = FETCH_OP_ST_RAW; @@ -493,20 +610,24 @@ static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size,  	/* Modify operation */  	if (t != NULL) {  		ret = __parse_bitfield_probe_arg(t, parg->type, &code); -		if (ret) +		if (ret) { +			trace_probe_log_err(offset + t - arg, BAD_BITFIELD);  			goto fail; +		}  	}  	/* Loop(Array) operation */  	if (parg->count) {  		if (scode->op != FETCH_OP_ST_MEM &&  		    scode->op != FETCH_OP_ST_STRING) { -			pr_info("array only accepts memory or address\n"); +			trace_probe_log_err(offset + (t ? 
(t - arg) : 0), +					    BAD_STRING);  			ret = -EINVAL;  			goto fail;  		}  		code++;  		if (code->op != FETCH_OP_NOP) { -			ret = -E2BIG; +			trace_probe_log_err(offset, TOO_MANY_OPS); +			ret = -EINVAL;  			goto fail;  		}  		code->op = FETCH_OP_LP_ARRAY; @@ -516,7 +637,7 @@ static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size,  	code->op = FETCH_OP_END;  	/* Shrink down the code buffer */ -	parg->code = kzalloc(sizeof(*code) * (code - tmp + 1), GFP_KERNEL); +	parg->code = kcalloc(code - tmp + 1, sizeof(*code), GFP_KERNEL);  	if (!parg->code)  		ret = -ENOMEM;  	else @@ -555,15 +676,19 @@ int traceprobe_parse_probe_arg(struct trace_probe *tp, int i, char *arg,  {  	struct probe_arg *parg = &tp->args[i];  	char *body; -	int ret;  	/* Increment count for freeing args in error case */  	tp->nr_args++;  	body = strchr(arg, '=');  	if (body) { -		if (body - arg > MAX_ARG_NAME_LEN || body == arg) +		if (body - arg > MAX_ARG_NAME_LEN) { +			trace_probe_log_err(0, ARG_NAME_TOO_LONG); +			return -EINVAL; +		} else if (body == arg) { +			trace_probe_log_err(0, NO_ARG_NAME);  			return -EINVAL; +		}  		parg->name = kmemdup_nul(arg, body - arg, GFP_KERNEL);  		body++;  	} else { @@ -575,22 +700,16 @@ int traceprobe_parse_probe_arg(struct trace_probe *tp, int i, char *arg,  		return -ENOMEM;  	if (!is_good_name(parg->name)) { -		pr_info("Invalid argument[%d] name: %s\n", -			i, parg->name); +		trace_probe_log_err(0, BAD_ARG_NAME);  		return -EINVAL;  	} -  	if (traceprobe_conflict_field_name(parg->name, tp->args, i)) { -		pr_info("Argument[%d]: '%s' conflicts with another field.\n", -			i, parg->name); +		trace_probe_log_err(0, USED_ARG_NAME);  		return -EINVAL;  	} -  	/* Parse fetch argument */ -	ret = traceprobe_parse_probe_arg_body(body, &tp->size, parg, flags); -	if (ret) -		pr_info("Parse error at argument[%d]. 
(%d)\n", i, ret); -	return ret; +	return traceprobe_parse_probe_arg_body(body, &tp->size, parg, flags, +					       body - arg);  }  void traceprobe_free_probe_arg(struct probe_arg *arg) diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index 2177c206de15..f9a8c632188b 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -124,6 +124,7 @@ struct fetch_insn {  /* fetch + deref*N + store + mod + end <= 16, this allows N=12, enough */  #define FETCH_INSN_MAX	16 +#define FETCH_TOKEN_COMM	(-ECOMM)  /* Fetch type information table */  struct fetch_type { @@ -280,8 +281,8 @@ extern int traceprobe_update_arg(struct probe_arg *arg);  extern void traceprobe_free_probe_arg(struct probe_arg *arg);  extern int traceprobe_split_symbol_offset(char *symbol, long *offset); -extern int traceprobe_parse_event_name(const char **pevent, -				       const char **pgroup, char *buf); +int traceprobe_parse_event_name(const char **pevent, const char **pgroup, +				char *buf, int offset);  extern int traceprobe_set_print_fmt(struct trace_probe *tp, bool is_return); @@ -298,3 +299,76 @@ extern void destroy_local_trace_uprobe(struct trace_event_call *event_call);  #endif  extern int traceprobe_define_arg_fields(struct trace_event_call *event_call,  					size_t offset, struct trace_probe *tp); + +#undef ERRORS +#define ERRORS	\ +	C(FILE_NOT_FOUND,	"Failed to find the given file"),	\ +	C(NO_REGULAR_FILE,	"Not a regular file"),			\ +	C(BAD_REFCNT,		"Invalid reference counter offset"),	\ +	C(REFCNT_OPEN_BRACE,	"Reference counter brace is not closed"), \ +	C(BAD_REFCNT_SUFFIX,	"Reference counter has wrong suffix"),	\ +	C(BAD_UPROBE_OFFS,	"Invalid uprobe offset"),		\ +	C(MAXACT_NO_KPROBE,	"Maxactive is not for kprobe"),		\ +	C(BAD_MAXACT,		"Invalid maxactive number"),		\ +	C(MAXACT_TOO_BIG,	"Maxactive is too big"),		\ +	C(BAD_PROBE_ADDR,	"Invalid probed address or symbol"),	\ +	C(BAD_RETPROBE,		"Retprobe address must be an function entry"), \ +	C(NO_GROUP_NAME,	"Group name is not specified"),		\ +	C(GROUP_TOO_LONG,	"Group name is too long"),		\ +	C(BAD_GROUP_NAME,	"Group name must follow the same rules as C identifiers"), \ +	C(NO_EVENT_NAME,	"Event name is not specified"),		\ +	C(EVENT_TOO_LONG,	"Event name is too long"),		\ +	C(BAD_EVENT_NAME,	"Event name must follow the same rules as C identifiers"), \ +	C(RETVAL_ON_PROBE,	"$retval is not available on probe"),	\ +	C(BAD_STACK_NUM,	"Invalid stack number"),		\ +	C(BAD_ARG_NUM,		"Invalid argument number"),		\ +	C(BAD_VAR,		"Invalid $-valiable specified"),	\ +	C(BAD_REG_NAME,		"Invalid register name"),		\ +	C(BAD_MEM_ADDR,		"Invalid memory address"),		\ +	C(FILE_ON_KPROBE,	"File offset is not available with kprobe"), \ +	C(BAD_FILE_OFFS,	"Invalid file offset value"),		\ +	C(SYM_ON_UPROBE,	"Symbol is not available with uprobe"),	\ +	C(TOO_MANY_OPS,		"Dereference is too much nested"), 	\ +	C(DEREF_NEED_BRACE,	"Dereference needs a brace"),		\ +	C(BAD_DEREF_OFFS,	"Invalid dereference offset"),		\ +	C(DEREF_OPEN_BRACE,	"Dereference brace is not closed"),	\ +	C(COMM_CANT_DEREF,	"$comm can not be dereferenced"),	\ +	C(BAD_FETCH_ARG,	"Invalid fetch argument"),		\ +	C(ARRAY_NO_CLOSE,	"Array is not closed"),			\ +	C(BAD_ARRAY_SUFFIX,	"Array has wrong suffix"),		\ +	C(BAD_ARRAY_NUM,	"Invalid array size"),			\ +	C(ARRAY_TOO_BIG,	"Array number is too big"),		\ +	C(BAD_TYPE,		"Unknown type is specified"),		\ +	C(BAD_STRING,		"String accepts only memory argument"),	\ +	C(BAD_BITFIELD,		"Invalid bitfield"),			\ +	C(ARG_NAME_TOO_LONG,	"Argument name 
is too long"),		\ +	C(NO_ARG_NAME,		"Argument name is not specified"),	\ +	C(BAD_ARG_NAME,		"Argument name must follow the same rules as C identifiers"), \ +	C(USED_ARG_NAME,	"This argument name is already used"),	\ +	C(ARG_TOO_LONG,		"Argument expression is too long"),	\ +	C(NO_ARG_BODY,		"No argument expression"),		\ +	C(BAD_INSN_BNDRY,	"Probe point is not an instruction boundary"),\ +	C(FAIL_REG_PROBE,	"Failed to register probe event"), + +#undef C +#define C(a, b)		TP_ERR_##a + +/* Define TP_ERR_ */ +enum { ERRORS }; + +/* Error text is defined in trace_probe.c */ + +struct trace_probe_log { +	const char	*subsystem; +	const char	**argv; +	int		argc; +	int		index; +}; + +void trace_probe_log_init(const char *subsystem, int argc, const char **argv); +void trace_probe_log_set_index(int index); +void trace_probe_log_clear(void); +void __trace_probe_log_err(int offset, int err); + +#define trace_probe_log_err(offs, err)	\ +	__trace_probe_log_err(offs, TP_ERR_##err) diff --git a/kernel/trace/trace_probe_tmpl.h b/kernel/trace/trace_probe_tmpl.h index 4737bb8c07a3..c30c61f12ddd 100644 --- a/kernel/trace/trace_probe_tmpl.h +++ b/kernel/trace/trace_probe_tmpl.h @@ -88,7 +88,7 @@ stage3:  	/* 3rd stage: store value to buffer */  	if (unlikely(!dest)) {  		if (code->op == FETCH_OP_ST_STRING) { -			ret += fetch_store_strlen(val + code->offset); +			ret = fetch_store_strlen(val + code->offset);  			code++;  			goto array;  		} else diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 9d402e7fc949..69ee8ef12cee 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -792,7 +792,10 @@ trace_selftest_startup_function_graph(struct tracer *trace,  	/* check the trace buffer */  	ret = trace_test_buffer(&tr->trace_buffer, &count); -	trace->reset(tr); +	/* Need to also simulate the tr->reset to remove this fgraph_ops */ +	tracing_stop_cmdline_record(); +	unregister_ftrace_graph(&fgraph_ops); +  	tracing_start();  	if (!ret && !count) { diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index eec648a0d673..5d16f73898db 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -18,44 +18,32 @@  #include "trace.h" -static unsigned long stack_dump_trace[STACK_TRACE_ENTRIES+1] = -	 { [0 ... (STACK_TRACE_ENTRIES)] = ULONG_MAX }; -unsigned stack_trace_index[STACK_TRACE_ENTRIES]; +#define STACK_TRACE_ENTRIES 500 -/* - * Reserve one entry for the passed in ip. This will allow - * us to remove most or all of the stack size overhead - * added by the stack tracer itself. 
- */ -struct stack_trace stack_trace_max = { -	.max_entries		= STACK_TRACE_ENTRIES - 1, -	.entries		= &stack_dump_trace[0], -}; +static unsigned long stack_dump_trace[STACK_TRACE_ENTRIES]; +static unsigned stack_trace_index[STACK_TRACE_ENTRIES]; -unsigned long stack_trace_max_size; -arch_spinlock_t stack_trace_max_lock = +static unsigned int stack_trace_nr_entries; +static unsigned long stack_trace_max_size; +static arch_spinlock_t stack_trace_max_lock =  	(arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;  DEFINE_PER_CPU(int, disable_stack_tracer);  static DEFINE_MUTEX(stack_sysctl_mutex);  int stack_tracer_enabled; -static int last_stack_tracer_enabled; -void stack_trace_print(void) +static void print_max_stack(void)  {  	long i;  	int size;  	pr_emerg("        Depth    Size   Location    (%d entries)\n"  			   "        -----    ----   --------\n", -			   stack_trace_max.nr_entries); +			   stack_trace_nr_entries); -	for (i = 0; i < stack_trace_max.nr_entries; i++) { -		if (stack_dump_trace[i] == ULONG_MAX) -			break; -		if (i+1 == stack_trace_max.nr_entries || -				stack_dump_trace[i+1] == ULONG_MAX) +	for (i = 0; i < stack_trace_nr_entries; i++) { +		if (i + 1 == stack_trace_nr_entries)  			size = stack_trace_index[i];  		else  			size = stack_trace_index[i] - stack_trace_index[i+1]; @@ -65,16 +53,7 @@ void stack_trace_print(void)  	}  } -/* - * When arch-specific code overrides this function, the following - * data should be filled up, assuming stack_trace_max_lock is held to - * prevent concurrent updates. - *     stack_trace_index[] - *     stack_trace_max - *     stack_trace_max_size - */ -void __weak -check_stack(unsigned long ip, unsigned long *stack) +static void check_stack(unsigned long ip, unsigned long *stack)  {  	unsigned long this_size, flags; unsigned long *p, *top, *start;  	static int tracer_frame; @@ -110,13 +89,12 @@ check_stack(unsigned long ip, unsigned long *stack)  	stack_trace_max_size = this_size; -	stack_trace_max.nr_entries = 0; -	stack_trace_max.skip = 0; - -	save_stack_trace(&stack_trace_max); +	stack_trace_nr_entries = stack_trace_save(stack_dump_trace, +					       ARRAY_SIZE(stack_dump_trace) - 1, +					       0);  	/* Skip over the overhead of the stack tracer itself */ -	for (i = 0; i < stack_trace_max.nr_entries; i++) { +	for (i = 0; i < stack_trace_nr_entries; i++) {  		if (stack_dump_trace[i] == ip)  			break;  	} @@ -125,7 +103,7 @@ check_stack(unsigned long ip, unsigned long *stack)  	 * Some archs may not have the passed in ip in the dump.  	 * If that happens, we need to show everything.  	 */ -	if (i == stack_trace_max.nr_entries) +	if (i == stack_trace_nr_entries)  		i = 0;  	/* @@ -143,15 +121,13 @@ check_stack(unsigned long ip, unsigned long *stack)  	 * loop will only happen once. This code only takes place  	 * on a new max, so it is far from a fast path.  	 */ -	while (i < stack_trace_max.nr_entries) { +	while (i < stack_trace_nr_entries) {  		int found = 0;  		stack_trace_index[x] = this_size;  		p = start; -		for (; p < top && i < stack_trace_max.nr_entries; p++) { -			if (stack_dump_trace[i] == ULONG_MAX) -				break; +		for (; p < top && i < stack_trace_nr_entries; p++) {  			/*  			 * The READ_ONCE_NOCHECK is used to let KASAN know that  			 * this is not a stack-out-of-bounds error. 
@@ -182,12 +158,10 @@ check_stack(unsigned long ip, unsigned long *stack)  			i++;  	} -	stack_trace_max.nr_entries = x; -	for (; x < i; x++) -		stack_dump_trace[x] = ULONG_MAX; +	stack_trace_nr_entries = x;  	if (task_stack_end_corrupted(current)) { -		stack_trace_print(); +		print_max_stack();  		BUG();  	} @@ -286,7 +260,7 @@ __next(struct seq_file *m, loff_t *pos)  {  	long n = *pos - 1; -	if (n >= stack_trace_max.nr_entries || stack_dump_trace[n] == ULONG_MAX) +	if (n >= stack_trace_nr_entries)  		return NULL;  	m->private = (void *)n; @@ -350,7 +324,7 @@ static int t_show(struct seq_file *m, void *v)  		seq_printf(m, "        Depth    Size   Location"  			   "    (%d entries)\n"  			   "        -----    ----   --------\n", -			   stack_trace_max.nr_entries); +			   stack_trace_nr_entries);  		if (!stack_tracer_enabled && !stack_trace_max_size)  			print_disabled(m); @@ -360,12 +334,10 @@ static int t_show(struct seq_file *m, void *v)  	i = *(long *)v; -	if (i >= stack_trace_max.nr_entries || -	    stack_dump_trace[i] == ULONG_MAX) +	if (i >= stack_trace_nr_entries)  		return 0; -	if (i+1 == stack_trace_max.nr_entries || -	    stack_dump_trace[i+1] == ULONG_MAX) +	if (i + 1 == stack_trace_nr_entries)  		size = stack_trace_index[i];  	else  		size = stack_trace_index[i] - stack_trace_index[i+1]; @@ -422,23 +394,21 @@ stack_trace_sysctl(struct ctl_table *table, int write,  		   void __user *buffer, size_t *lenp,  		   loff_t *ppos)  { +	int was_enabled;  	int ret;  	mutex_lock(&stack_sysctl_mutex); +	was_enabled = !!stack_tracer_enabled;  	ret = proc_dointvec(table, write, buffer, lenp, ppos); -	if (ret || !write || -	    (last_stack_tracer_enabled == !!stack_tracer_enabled)) +	if (ret || !write || (was_enabled == !!stack_tracer_enabled))  		goto out; -	last_stack_tracer_enabled = !!stack_tracer_enabled; -  	if (stack_tracer_enabled)  		register_ftrace_function(&trace_ops);  	else  		unregister_ftrace_function(&trace_ops); -   out:  	mutex_unlock(&stack_sysctl_mutex);  	return ret; @@ -454,7 +424,6 @@ static __init int enable_stacktrace(char *str)  		strncpy(stack_trace_filter_buf, str + len, COMMAND_LINE_SIZE);  	stack_tracer_enabled = 1; -	last_stack_tracer_enabled = 1;  	return 1;  }  __setup("stacktrace", enable_stacktrace); diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index f93a56d2db27..fa8fbff736d6 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -314,6 +314,7 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)  	struct ring_buffer_event *event;  	struct ring_buffer *buffer;  	unsigned long irq_flags; +	unsigned long args[6];  	int pc;  	int syscall_nr;  	int size; @@ -347,7 +348,8 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)  	entry = ring_buffer_event_data(event);  	entry->nr = syscall_nr; -	syscall_get_arguments(current, regs, 0, sys_data->nb_args, entry->args); +	syscall_get_arguments(current, regs, args); +	memcpy(entry->args, args, sizeof(unsigned long) * sys_data->nb_args);  	event_trigger_unlock_commit(trace_file, buffer, event, entry,  				    irq_flags, pc); @@ -583,6 +585,7 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)  	struct syscall_metadata *sys_data;  	struct syscall_trace_enter *rec;  	struct hlist_head *head; +	unsigned long args[6];  	bool valid_prog_array;  	int syscall_nr;  	int rctx; @@ -613,8 +616,8 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)  		return;  	rec->nr = 
syscall_nr; -	syscall_get_arguments(current, regs, 0, sys_data->nb_args, -			       (unsigned long *)&rec->args); +	syscall_get_arguments(current, regs, args); +	memcpy(&rec->args, args, sizeof(unsigned long) * sys_data->nb_args);  	if ((valid_prog_array &&  	     !perf_call_bpf_enter(sys_data->enter_event, regs, sys_data, rec)) || diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index be78d99ee6bc..eb7e06b54741 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -156,7 +156,10 @@ fetch_store_string(unsigned long addr, void *dest, void *base)  	if (unlikely(!maxlen))  		return -ENOMEM; -	ret = strncpy_from_user(dst, src, maxlen); +	if (addr == FETCH_TOKEN_COMM) +		ret = strlcpy(dst, current->comm, maxlen); +	else +		ret = strncpy_from_user(dst, src, maxlen);  	if (ret >= 0) {  		if (ret == maxlen)  			dst[ret - 1] = '\0'; @@ -180,7 +183,10 @@ fetch_store_strlen(unsigned long addr)  	int len;  	void __user *vaddr = (void __force __user *) addr; -	len = strnlen_user(vaddr, MAX_STRING_SIZE); +	if (addr == FETCH_TOKEN_COMM) +		len = strlen(current->comm) + 1; +	else +		len = strnlen_user(vaddr, MAX_STRING_SIZE);  	return (len > MAX_STRING_SIZE) ? 0 : len;  } @@ -220,6 +226,9 @@ process_fetch_insn(struct fetch_insn *code, struct pt_regs *regs, void *dest,  	case FETCH_OP_IMM:  		val = code->immediate;  		break; +	case FETCH_OP_COMM: +		val = FETCH_TOKEN_COMM; +		break;  	case FETCH_OP_FOFFS:  		val = translate_user_vaddr(code->immediate);  		break; @@ -457,13 +466,19 @@ static int trace_uprobe_create(int argc, const char **argv)  		return -ECANCELED;  	} +	trace_probe_log_init("trace_uprobe", argc, argv); +	trace_probe_log_set_index(1);	/* filename is the 2nd argument */ +  	*arg++ = '\0';  	ret = kern_path(filename, LOOKUP_FOLLOW, &path);  	if (ret) { +		trace_probe_log_err(0, FILE_NOT_FOUND);  		kfree(filename); +		trace_probe_log_clear();  		return ret;  	}  	if (!d_is_reg(path.dentry)) { +		trace_probe_log_err(0, NO_REGULAR_FILE);  		ret = -EINVAL;  		goto fail_address_parse;  	} @@ -472,9 +487,16 @@ static int trace_uprobe_create(int argc, const char **argv)  	rctr = strchr(arg, '(');  	if (rctr) {  		rctr_end = strchr(rctr, ')'); -		if (rctr > rctr_end || *(rctr_end + 1) != 0) { +		if (!rctr_end) { +			ret = -EINVAL; +			rctr_end = rctr + strlen(rctr); +			trace_probe_log_err(rctr_end - filename, +					    REFCNT_OPEN_BRACE); +			goto fail_address_parse; +		} else if (rctr_end[1] != '\0') {  			ret = -EINVAL; -			pr_info("Invalid reference counter offset.\n"); +			trace_probe_log_err(rctr_end + 1 - filename, +					    BAD_REFCNT_SUFFIX);  			goto fail_address_parse;  		} @@ -482,22 +504,23 @@ static int trace_uprobe_create(int argc, const char **argv)  		*rctr_end = '\0';  		ret = kstrtoul(rctr, 0, &ref_ctr_offset);  		if (ret) { -			pr_info("Invalid reference counter offset.\n"); +			trace_probe_log_err(rctr - filename, BAD_REFCNT);  			goto fail_address_parse;  		}  	}  	/* Parse uprobe offset. 
*/  	ret = kstrtoul(arg, 0, &offset); -	if (ret) +	if (ret) { +		trace_probe_log_err(arg - filename, BAD_UPROBE_OFFS);  		goto fail_address_parse; - -	argc -= 2; -	argv += 2; +	}  	/* setup a probe */ +	trace_probe_log_set_index(0);  	if (event) { -		ret = traceprobe_parse_event_name(&event, &group, buf); +		ret = traceprobe_parse_event_name(&event, &group, buf, +						  event - argv[0]);  		if (ret)  			goto fail_address_parse;  	} else { @@ -519,6 +542,9 @@ static int trace_uprobe_create(int argc, const char **argv)  		kfree(tail);  	} +	argc -= 2; +	argv += 2; +  	tu = alloc_trace_uprobe(group, event, argc, is_return);  	if (IS_ERR(tu)) {  		ret = PTR_ERR(tu); @@ -539,6 +565,7 @@ static int trace_uprobe_create(int argc, const char **argv)  			goto error;  		} +		trace_probe_log_set_index(i + 2);  		ret = traceprobe_parse_probe_arg(&tu->tp, i, tmp,  					is_return ? TPARG_FL_RETURN : 0);  		kfree(tmp); @@ -547,20 +574,20 @@ static int trace_uprobe_create(int argc, const char **argv)  	}  	ret = register_trace_uprobe(tu); -	if (ret) -		goto error; -	return 0; +	if (!ret) +		goto out;  error:  	free_trace_uprobe(tu); +out: +	trace_probe_log_clear();  	return ret;  fail_address_parse: +	trace_probe_log_clear();  	path_put(&path);  	kfree(filename); -	pr_info("Failed to parse address or file.\n"); -  	return ret;  } diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index 46f2ab1e08a9..df3ade14ccbd 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -1,19 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0-or-later  /*   * Copyright (C) 2008-2014 Mathieu Desnoyers - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.   */  #include <linux/module.h>  #include <linux/mutex.h> diff --git a/kernel/tsacct.c b/kernel/tsacct.c index 370724b45391..7be3e7530841 100644 --- a/kernel/tsacct.c +++ b/kernel/tsacct.c @@ -1,19 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0-or-later  /*   * tsacct.c - System accounting over taskstats interface   *   * Copyright (C) Jay Lan,	<jlan@sgi.com> - * - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the - * GNU General Public License for more details. 
- *   */  #include <linux/kernel.h> diff --git a/kernel/umh.c b/kernel/umh.c index d937cbad903a..7f255b5a8845 100644 --- a/kernel/umh.c +++ b/kernel/umh.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only  /*   * umh - the kernel usermode helper   */ diff --git a/kernel/up.c b/kernel/up.c index ff536f9cc8a2..483c9962c999 100644 --- a/kernel/up.c +++ b/kernel/up.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only  /*   * Uniprocessor-only support functions.  The counterpart to kernel/smp.c   */ diff --git a/kernel/user-return-notifier.c b/kernel/user-return-notifier.c index 9586b670a5b2..870ecd7c63ed 100644 --- a/kernel/user-return-notifier.c +++ b/kernel/user-return-notifier.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only  #include <linux/user-return-notifier.h>  #include <linux/percpu.h> diff --git a/kernel/user.c b/kernel/user.c index 0df9b1640b2a..78b17e36e705 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only  /*   * The "user cache".   * @@ -185,7 +186,7 @@ struct user_struct *alloc_uid(kuid_t uid)  	if (!up) {  		new = kmem_cache_zalloc(uid_cachep, GFP_KERNEL);  		if (!new) -			goto out_unlock; +			return NULL;  		new->uid = uid;  		refcount_set(&new->__count, 1); @@ -199,8 +200,6 @@ struct user_struct *alloc_uid(kuid_t uid)  		spin_lock_irq(&uidhash_lock);  		up = uid_hash_find(uid, hashent);  		if (up) { -			key_put(new->uid_keyring); -			key_put(new->session_keyring);  			kmem_cache_free(uid_cachep, new);  		} else {  			uid_hash_insert(new, hashent); @@ -210,9 +209,6 @@ struct user_struct *alloc_uid(kuid_t uid)  	}  	return up; - -out_unlock: -	return NULL;  }  static int __init uid_cache_init(void) diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 8fbfda94a67b..7f9e7b9306fe 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -42,9 +42,9 @@ int __read_mostly watchdog_user_enabled = 1;  int __read_mostly nmi_watchdog_user_enabled = NMI_WATCHDOG_DEFAULT;  int __read_mostly soft_watchdog_user_enabled = 1;  int __read_mostly watchdog_thresh = 10; -int __read_mostly nmi_watchdog_available; +static int __read_mostly nmi_watchdog_available; -struct cpumask watchdog_allowed_mask __read_mostly; +static struct cpumask watchdog_allowed_mask __read_mostly;  struct cpumask watchdog_cpumask __read_mostly;  unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask); @@ -554,13 +554,15 @@ static void softlockup_start_all(void)  int lockup_detector_online_cpu(unsigned int cpu)  { -	watchdog_enable(cpu); +	if (cpumask_test_cpu(cpu, &watchdog_allowed_mask)) +		watchdog_enable(cpu);  	return 0;  }  int lockup_detector_offline_cpu(unsigned int cpu)  { -	watchdog_disable(cpu); +	if (cpumask_test_cpu(cpu, &watchdog_allowed_mask)) +		watchdog_disable(cpu);  	return 0;  } @@ -588,7 +590,7 @@ static void lockup_detector_reconfigure(void)   * Create the watchdog thread infrastructure and configure the detector(s).   *   * The threads are not unparked as watchdog_allowed_mask is empty.  When - * the threads are sucessfully initialized, take the proper locks and + * the threads are successfully initialized, take the proper locks and   * unpark the threads in the watchdog_cpumask if the watchdog is enabled.   
*/  static __init void lockup_detector_setup(void) diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c index 71381168dede..247bf0b1582c 100644 --- a/kernel/watchdog_hld.c +++ b/kernel/watchdog_hld.c @@ -135,7 +135,8 @@ static void watchdog_overflow_callback(struct perf_event *event,  		if (__this_cpu_read(hard_watchdog_warn) == true)  			return; -		pr_emerg("Watchdog detected hard LOCKUP on cpu %d", this_cpu); +		pr_emerg("Watchdog detected hard LOCKUP on cpu %d\n", +			 this_cpu);  		print_modules();  		print_irqtrace_events(current);  		if (regs) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 4026d1871407..95aea04ff722 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only  /*   * kernel/workqueue.c - generic async execution with shared worker pool   * @@ -127,16 +128,16 @@ enum {   *   * PL: wq_pool_mutex protected.   * - * PR: wq_pool_mutex protected for writes.  Sched-RCU protected for reads. + * PR: wq_pool_mutex protected for writes.  RCU protected for reads.   *   * PW: wq_pool_mutex and wq->mutex protected for writes.  Either for reads.   *   * PWR: wq_pool_mutex and wq->mutex protected for writes.  Either or - *      sched-RCU for reads. + *      RCU for reads.   *   * WQ: wq->mutex protected.   * - * WR: wq->mutex protected for writes.  Sched-RCU protected for reads. + * WR: wq->mutex protected for writes.  RCU protected for reads.   *   * MD: wq_mayday_lock protected.   */ @@ -183,7 +184,7 @@ struct worker_pool {  	atomic_t		nr_running ____cacheline_aligned_in_smp;  	/* -	 * Destruction of pool is sched-RCU protected to allow dereferences +	 * Destruction of pool is RCU protected to allow dereferences  	 * from get_work_pool().  	 */  	struct rcu_head		rcu; @@ -212,7 +213,7 @@ struct pool_workqueue {  	/*  	 * Release of unbound pwq is punted to system_wq.  See put_pwq()  	 * and pwq_unbound_release_workfn() for details.  pool_workqueue -	 * itself is also sched-RCU protected so that the first pwq can be +	 * itself is also RCU protected so that the first pwq can be  	 * determined without grabbing wq->mutex.  	 */  	struct work_struct	unbound_release_work; @@ -266,8 +267,8 @@ struct workqueue_struct {  	char			name[WQ_NAME_LEN]; /* I: workqueue name */  	/* -	 * Destruction of workqueue_struct is sched-RCU protected to allow -	 * walking the workqueues list without grabbing wq_pool_mutex. +	 * Destruction of workqueue_struct is RCU protected to allow walking +	 * the workqueues list without grabbing wq_pool_mutex.  	 * This is used to dump all workqueues from sysrq.  	 
*/  	struct rcu_head		rcu; @@ -359,20 +360,20 @@ static void workqueue_sysfs_unregister(struct workqueue_struct *wq);  #include <trace/events/workqueue.h>  #define assert_rcu_or_pool_mutex()					\ -	RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() &&			\ +	RCU_LOCKDEP_WARN(!rcu_read_lock_held() &&			\  			 !lockdep_is_held(&wq_pool_mutex),		\ -			 "sched RCU or wq_pool_mutex should be held") +			 "RCU or wq_pool_mutex should be held")  #define assert_rcu_or_wq_mutex(wq)					\ -	RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() &&			\ +	RCU_LOCKDEP_WARN(!rcu_read_lock_held() &&			\  			 !lockdep_is_held(&wq->mutex),			\ -			 "sched RCU or wq->mutex should be held") +			 "RCU or wq->mutex should be held")  #define assert_rcu_or_wq_mutex_or_pool_mutex(wq)			\ -	RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() &&			\ +	RCU_LOCKDEP_WARN(!rcu_read_lock_held() &&			\  			 !lockdep_is_held(&wq->mutex) &&		\  			 !lockdep_is_held(&wq_pool_mutex),		\ -			 "sched RCU, wq->mutex or wq_pool_mutex should be held") +			 "RCU, wq->mutex or wq_pool_mutex should be held")  #define for_each_cpu_worker_pool(pool, cpu)				\  	for ((pool) = &per_cpu(cpu_worker_pools, cpu)[0];		\ @@ -384,7 +385,7 @@ static void workqueue_sysfs_unregister(struct workqueue_struct *wq);   * @pool: iteration cursor   * @pi: integer used for iteration   * - * This must be called either with wq_pool_mutex held or sched RCU read + * This must be called either with wq_pool_mutex held or RCU read   * locked.  If the pool needs to be used beyond the locking in effect, the   * caller is responsible for guaranteeing that the pool stays online.   * @@ -416,7 +417,7 @@ static void workqueue_sysfs_unregister(struct workqueue_struct *wq);   * @pwq: iteration cursor   * @wq: the target workqueue   * - * This must be called either with wq->mutex held or sched RCU read locked. + * This must be called either with wq->mutex held or RCU read locked.   * If the pwq needs to be used beyond the locking in effect, the caller is   * responsible for guaranteeing that the pwq stays online.   * @@ -552,7 +553,7 @@ static int worker_pool_assign_id(struct worker_pool *pool)   * @wq: the target workqueue   * @node: the node ID   * - * This must be called with any of wq_pool_mutex, wq->mutex or sched RCU + * This must be called with any of wq_pool_mutex, wq->mutex or RCU   * read locked.   * If the pwq needs to be used beyond the locking in effect, the caller is   * responsible for guaranteeing that the pwq stays online. @@ -696,8 +697,8 @@ static struct pool_workqueue *get_work_pwq(struct work_struct *work)   * @work: the work item of interest   *   * Pools are created and destroyed under wq_pool_mutex, and allows read - * access under sched-RCU read lock.  As such, this function should be - * called under wq_pool_mutex or with preemption disabled. + * access under RCU read lock.  As such, this function should be + * called under wq_pool_mutex or inside of a rcu_read_lock() region.   *   * All fields of the returned pool are accessible as long as the above   * mentioned locking is in effect.  If the returned pool needs to be used @@ -841,43 +842,32 @@ static void wake_up_worker(struct worker_pool *pool)  }  /** - * wq_worker_waking_up - a worker is waking up + * wq_worker_running - a worker is running again   * @task: task waking up - * @cpu: CPU @task is waking up to   * - * This function is called during try_to_wake_up() when a worker is - * being awoken. 
- * - * CONTEXT: - * spin_lock_irq(rq->lock) + * This function is called when a worker returns from schedule()   */ -void wq_worker_waking_up(struct task_struct *task, int cpu) +void wq_worker_running(struct task_struct *task)  {  	struct worker *worker = kthread_data(task); -	if (!(worker->flags & WORKER_NOT_RUNNING)) { -		WARN_ON_ONCE(worker->pool->cpu != cpu); +	if (!worker->sleeping) +		return; +	if (!(worker->flags & WORKER_NOT_RUNNING))  		atomic_inc(&worker->pool->nr_running); -	} +	worker->sleeping = 0;  }  /**   * wq_worker_sleeping - a worker is going to sleep   * @task: task going to sleep   * - * This function is called during schedule() when a busy worker is - * going to sleep.  Worker on the same cpu can be woken up by - * returning pointer to its task. - * - * CONTEXT: - * spin_lock_irq(rq->lock) - * - * Return: - * Worker task on @cpu to wake up, %NULL if none. + * This function is called from schedule() when a busy worker is + * going to sleep.   */ -struct task_struct *wq_worker_sleeping(struct task_struct *task) +void wq_worker_sleeping(struct task_struct *task)  { -	struct worker *worker = kthread_data(task), *to_wakeup = NULL; +	struct worker *next, *worker = kthread_data(task);  	struct worker_pool *pool;  	/* @@ -886,13 +876,15 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task)  	 * checking NOT_RUNNING.  	 */  	if (worker->flags & WORKER_NOT_RUNNING) -		return NULL; +		return;  	pool = worker->pool; -	/* this can only happen on the local cpu */ -	if (WARN_ON_ONCE(pool->cpu != raw_smp_processor_id())) -		return NULL; +	if (WARN_ON_ONCE(worker->sleeping)) +		return; + +	worker->sleeping = 1; +	spin_lock_irq(&pool->lock);  	/*  	 * The counterpart of the following dec_and_test, implied mb, @@ -906,13 +898,17 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task)  	 * lock is safe.  	 */  	if (atomic_dec_and_test(&pool->nr_running) && -	    !list_empty(&pool->worklist)) -		to_wakeup = first_idle_worker(pool); -	return to_wakeup ? to_wakeup->task : NULL; +	    !list_empty(&pool->worklist)) { +		next = first_idle_worker(pool); +		if (next) +			wake_up_process(next->task); +	} +	spin_unlock_irq(&pool->lock);  }  /**   * wq_worker_last_func - retrieve worker's last work function + * @task: Task to retrieve last work function of.   *   * Determine the last function a worker executed. This is called from   * the scheduler to get a worker's last known identity. @@ -1132,7 +1128,7 @@ static void put_pwq_unlocked(struct pool_workqueue *pwq)  {  	if (pwq) {  		/* -		 * As both pwqs and pools are sched-RCU protected, the +		 * As both pwqs and pools are RCU protected, the  		 * following lock operations are safe.  		 */  		spin_lock_irq(&pwq->pool->lock); @@ -1260,6 +1256,7 @@ static int try_to_grab_pending(struct work_struct *work, bool is_dwork,  	if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work)))  		return 0; +	rcu_read_lock();  	/*  	 * The queueing is in progress, or it is already queued. Try to  	 * steal it from ->worklist without clearing WORK_STRUCT_PENDING. 
@@ -1298,10 +1295,12 @@ static int try_to_grab_pending(struct work_struct *work, bool is_dwork,  		set_work_pool_and_keep_pending(work, pool->id);  		spin_unlock(&pool->lock); +		rcu_read_unlock();  		return 1;  	}  	spin_unlock(&pool->lock);  fail: +	rcu_read_unlock();  	local_irq_restore(*flags);  	if (work_is_canceling(work))  		return -ENOENT; @@ -1415,6 +1414,7 @@ static void __queue_work(int cpu, struct workqueue_struct *wq,  	if (unlikely(wq->flags & __WQ_DRAINING) &&  	    WARN_ON_ONCE(!is_chained_work(wq)))  		return; +	rcu_read_lock();  retry:  	if (req_cpu == WORK_CPU_UNBOUND)  		cpu = wq_select_unbound_cpu(raw_smp_processor_id()); @@ -1471,10 +1471,8 @@ retry:  	/* pwq determined, queue */  	trace_workqueue_queue_work(req_cpu, pwq, work); -	if (WARN_ON(!list_empty(&work->entry))) { -		spin_unlock(&pwq->pool->lock); -		return; -	} +	if (WARN_ON(!list_empty(&work->entry))) +		goto out;  	pwq->nr_in_flight[pwq->work_color]++;  	work_flags = work_color_to_flags(pwq->work_color); @@ -1492,7 +1490,9 @@ retry:  	insert_work(pwq, work, worklist, work_flags); +out:  	spin_unlock(&pwq->pool->lock); +	rcu_read_unlock();  }  /** @@ -2277,7 +2277,7 @@ __acquires(&pool->lock)  	if (unlikely(in_atomic() || lockdep_depth(current) > 0)) {  		pr_err("BUG: workqueue leaked lock or atomic: %s/0x%08x/%d\n" -		       "     last function: %pf\n", +		       "     last function: %ps\n",  		       current->comm, preempt_count(), task_pid_nr(current),  		       worker->current_func);  		debug_show_held_locks(current); @@ -2596,11 +2596,11 @@ static void check_flush_dependency(struct workqueue_struct *target_wq,  	worker = current_wq_worker();  	WARN_ONCE(current->flags & PF_MEMALLOC, -		  "workqueue: PF_MEMALLOC task %d(%s) is flushing !WQ_MEM_RECLAIM %s:%pf", +		  "workqueue: PF_MEMALLOC task %d(%s) is flushing !WQ_MEM_RECLAIM %s:%ps",  		  current->pid, current->comm, target_wq->name, target_func);  	WARN_ONCE(worker && ((worker->current_pwq->wq->flags &  			      (WQ_MEM_RECLAIM | __WQ_LEGACY)) == WQ_MEM_RECLAIM), -		  "workqueue: WQ_MEM_RECLAIM %s:%pf is flushing !WQ_MEM_RECLAIM %s:%pf", +		  "workqueue: WQ_MEM_RECLAIM %s:%ps is flushing !WQ_MEM_RECLAIM %s:%ps",  		  worker->current_pwq->wq->name, worker->current_func,  		  target_wq->name, target_func);  } @@ -2974,14 +2974,14 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr,  	might_sleep(); -	local_irq_disable(); +	rcu_read_lock();  	pool = get_work_pool(work);  	if (!pool) { -		local_irq_enable(); +		rcu_read_unlock();  		return false;  	} -	spin_lock(&pool->lock); +	spin_lock_irq(&pool->lock);  	/* see the comment in try_to_grab_pending() with the same code */  	pwq = get_work_pwq(work);  	if (pwq) { @@ -3013,10 +3013,11 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr,  		lock_map_acquire(&pwq->wq->lockdep_map);  		lock_map_release(&pwq->wq->lockdep_map);  	} - +	rcu_read_unlock();  	return true;  already_gone:  	spin_unlock_irq(&pool->lock); +	rcu_read_unlock();  	return false;  } @@ -3503,7 +3504,7 @@ static void rcu_free_pool(struct rcu_head *rcu)   * put_unbound_pool - put a worker_pool   * @pool: worker_pool to put   * - * Put @pool.  If its refcnt reaches zero, it gets destroyed in sched-RCU + * Put @pool.  If its refcnt reaches zero, it gets destroyed in RCU   * safe manner.  get_unbound_pool() calls this function on its failure path   * and this function should be able to release pools which went through,   * successfully or not, init_worker_pool(). 
@@ -3557,7 +3558,7 @@ static void put_unbound_pool(struct worker_pool *pool)  	del_timer_sync(&pool->idle_timer);  	del_timer_sync(&pool->mayday_timer); -	/* sched-RCU protected to allow dereferences from get_work_pool() */ +	/* RCU protected to allow dereferences from get_work_pool() */  	call_rcu(&pool->rcu, rcu_free_pool);  } @@ -4208,6 +4209,7 @@ static int init_rescuer(struct workqueue_struct *wq)  	return 0;  } +__printf(1, 4)  struct workqueue_struct *alloc_workqueue(const char *fmt,  					 unsigned int flags,  					 int max_active, ...) @@ -4266,7 +4268,7 @@ struct workqueue_struct *alloc_workqueue(const char *fmt,  	INIT_LIST_HEAD(&wq->list);  	if (alloc_and_link_pwqs(wq) < 0) -		goto err_free_wq; +		goto err_unreg_lockdep;  	if (wq_online && init_rescuer(wq) < 0)  		goto err_destroy; @@ -4292,9 +4294,10 @@ struct workqueue_struct *alloc_workqueue(const char *fmt,  	return wq; -err_free_wq: +err_unreg_lockdep:  	wq_unregister_lockdep(wq);  	wq_free_lockdep(wq); +err_free_wq:  	free_workqueue_attrs(wq->unbound_attrs);  	kfree(wq);  	return NULL; @@ -4470,7 +4473,8 @@ bool workqueue_congested(int cpu, struct workqueue_struct *wq)  	struct pool_workqueue *pwq;  	bool ret; -	rcu_read_lock_sched(); +	rcu_read_lock(); +	preempt_disable();  	if (cpu == WORK_CPU_UNBOUND)  		cpu = smp_processor_id(); @@ -4481,7 +4485,8 @@ bool workqueue_congested(int cpu, struct workqueue_struct *wq)  		pwq = unbound_pwq_by_node(wq, cpu_to_node(cpu));  	ret = !list_empty(&pwq->delayed_works); -	rcu_read_unlock_sched(); +	preempt_enable(); +	rcu_read_unlock();  	return ret;  } @@ -4507,15 +4512,15 @@ unsigned int work_busy(struct work_struct *work)  	if (work_pending(work))  		ret |= WORK_BUSY_PENDING; -	local_irq_save(flags); +	rcu_read_lock();  	pool = get_work_pool(work);  	if (pool) { -		spin_lock(&pool->lock); +		spin_lock_irqsave(&pool->lock, flags);  		if (find_worker_executing_work(pool, work))  			ret |= WORK_BUSY_RUNNING; -		spin_unlock(&pool->lock); +		spin_unlock_irqrestore(&pool->lock, flags);  	} -	local_irq_restore(flags); +	rcu_read_unlock();  	return ret;  } @@ -4586,7 +4591,7 @@ void print_worker_info(const char *log_lvl, struct task_struct *task)  	probe_kernel_read(desc, worker->desc, sizeof(desc) - 1);  	if (fn || name[0] || desc[0]) { -		printk("%sWorkqueue: %s %pf", log_lvl, name, fn); +		printk("%sWorkqueue: %s %ps", log_lvl, name, fn);  		if (strcmp(name, desc))  			pr_cont(" (%s)", desc);  		pr_cont("\n"); @@ -4611,7 +4616,7 @@ static void pr_cont_work(bool comma, struct work_struct *work)  		pr_cont("%s BAR(%d)", comma ? "," : "",  			task_pid_nr(barr->task));  	} else { -		pr_cont("%s %pf", comma ? "," : "", work->func); +		pr_cont("%s %ps", comma ? "," : "", work->func);  	}  } @@ -4643,7 +4648,7 @@ static void show_pwq(struct pool_workqueue *pwq)  			if (worker->current_pwq != pwq)  				continue; -			pr_cont("%s %d%s:%pf", comma ? "," : "", +			pr_cont("%s %d%s:%ps", comma ? "," : "",  				task_pid_nr(worker->task),  				worker == pwq->wq->rescuer ? 
"(RESCUER)" : "",  				worker->current_func); @@ -4699,7 +4704,7 @@ void show_workqueue_state(void)  	unsigned long flags;  	int pi; -	rcu_read_lock_sched(); +	rcu_read_lock();  	pr_info("Showing busy workqueues and worker pools:\n"); @@ -4764,7 +4769,7 @@ void show_workqueue_state(void)  		touch_nmi_watchdog();  	} -	rcu_read_unlock_sched(); +	rcu_read_unlock();  }  /* used to show worker information through /proc/PID/{comm,stat,status} */ @@ -4928,7 +4933,7 @@ static void rebind_workers(struct worker_pool *pool)  		 *  		 * WRITE_ONCE() is necessary because @worker->flags may be  		 * tested without holding any lock in -		 * wq_worker_waking_up().  Without it, NOT_RUNNING test may +		 * wq_worker_running().  Without it, NOT_RUNNING test may  		 * fail incorrectly leading to premature concurrency  		 * management operations.  		 */ @@ -5151,16 +5156,16 @@ bool freeze_workqueues_busy(void)  		 * nr_active is monotonically decreasing.  It's safe  		 * to peek without lock.  		 */ -		rcu_read_lock_sched(); +		rcu_read_lock();  		for_each_pwq(pwq, wq) {  			WARN_ON_ONCE(pwq->nr_active < 0);  			if (pwq->nr_active) {  				busy = true; -				rcu_read_unlock_sched(); +				rcu_read_unlock();  				goto out_unlock;  			}  		} -		rcu_read_unlock_sched(); +		rcu_read_unlock();  	}  out_unlock:  	mutex_unlock(&wq_pool_mutex); @@ -5355,7 +5360,8 @@ static ssize_t wq_pool_ids_show(struct device *dev,  	const char *delim = "";  	int node, written = 0; -	rcu_read_lock_sched(); +	get_online_cpus(); +	rcu_read_lock();  	for_each_node(node) {  		written += scnprintf(buf + written, PAGE_SIZE - written,  				     "%s%d:%d", delim, node, @@ -5363,7 +5369,8 @@ static ssize_t wq_pool_ids_show(struct device *dev,  		delim = " ";  	}  	written += scnprintf(buf + written, PAGE_SIZE - written, "\n"); -	rcu_read_unlock_sched(); +	rcu_read_unlock(); +	put_online_cpus();  	return written;  } diff --git a/kernel/workqueue_internal.h b/kernel/workqueue_internal.h index cb68b03ca89a..498de0e909a4 100644 --- a/kernel/workqueue_internal.h +++ b/kernel/workqueue_internal.h @@ -44,6 +44,7 @@ struct worker {  	unsigned long		last_active;	/* L: last active timestamp */  	unsigned int		flags;		/* X: flags */  	int			id;		/* I: worker id */ +	int			sleeping;	/* None */  	/*  	 * Opaque string set with work_set_desc().  Printed out with task @@ -72,8 +73,8 @@ static inline struct worker *current_wq_worker(void)   * Scheduler hooks for concurrency managed workqueue.  Only to be used from   * sched/ and workqueue.c.   */ -void wq_worker_waking_up(struct task_struct *task, int cpu); -struct task_struct *wq_worker_sleeping(struct task_struct *task); +void wq_worker_running(struct task_struct *task); +void wq_worker_sleeping(struct task_struct *task);  work_func_t wq_worker_last_func(struct task_struct *task);  #endif /* _KERNEL_WORKQUEUE_INTERNAL_H */  | 

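The trace_uprobe.c changes replace the old blanket pr_info() failure messages in trace_uprobe_create() with trace_probe_log_err() calls that carry a byte offset computed by pointer subtraction against the start of the argument (rctr_end - filename, rctr_end + 1 - filename, arg - filename), so the error can point at the exact column that failed to parse. The sketch below models that offset-based reporting under assumed names; parse_spec(), log_err_at() and the FILE:OFFSET(REFCTR) grammar are simplified stand-ins, not the kernel interface.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Print the spec and a caret under the byte offset where parsing failed. */
static void log_err_at(const char *spec, long offs, const char *msg)
{
	fprintf(stderr, "error: %s\n  %s\n  %*s^\n", msg, spec, (int)offs, "");
}

/* Parse "FILE:OFFSET(REFCTR)"; error offsets mirror rctr_end - filename etc. */
static int parse_spec(const char *spec)
{
	char buf[256];
	char *arg, *rctr, *rctr_end;
	unsigned long offset, ref_ctr = 0;

	if (strlen(spec) >= sizeof(buf))
		return -1;
	strcpy(buf, spec);

	arg = strchr(buf, ':');
	if (!arg) {
		log_err_at(spec, (long)strlen(spec), "no offset given");
		return -1;
	}
	*arg++ = '\0';

	rctr = strchr(arg, '(');
	if (rctr) {
		rctr_end = strchr(rctr, ')');
		if (!rctr_end) {
			/* like REFCNT_OPEN_BRACE: point past the unterminated group */
			log_err_at(spec, rctr + strlen(rctr) - buf, "unclosed '('");
			return -1;
		} else if (rctr_end[1] != '\0') {
			/* like BAD_REFCNT_SUFFIX: point at the trailing junk */
			log_err_at(spec, rctr_end + 1 - buf, "trailing characters");
			return -1;
		}
		*rctr++ = '\0';
		*rctr_end = '\0';
		ref_ctr = strtoul(rctr, NULL, 0);
	}

	offset = strtoul(arg, NULL, 0);
	printf("file=%s offset=%#lx ref_ctr=%#lx\n", buf, offset, ref_ctr);
	return 0;
}

int main(void)
{
	parse_spec("/lib/libc.so:0x1000(0x200)");	/* valid */
	parse_spec("/lib/libc.so:0x1000(0x200)x");	/* trailing junk */
	parse_spec("/lib/libc.so:0x1000(0x200");	/* unclosed brace */
	return 0;
}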

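The stack_trace_sysctl() rework in trace_stack.c drops the last_stack_tracer_enabled shadow variable: the handler snapshots was_enabled before proc_dointvec() runs and only registers or unregisters the ftrace ops when the value actually flipped. A small stand-alone model of that snapshot-and-compare pattern follows; register_tracer() and unregister_tracer() are placeholders for register_ftrace_function()/unregister_ftrace_function().

#include <stdio.h>
#include <stdbool.h>

static int tracer_enabled;	/* the sysctl knob */
static bool registered;

static void register_tracer(void)   { registered = true;  puts("register"); }
static void unregister_tracer(void) { registered = false; puts("unregister"); }

/* Models the reworked handler: compare against the pre-write snapshot. */
static void sysctl_write(int new_value)
{
	int was_enabled = !!tracer_enabled;	/* snapshot before the update */

	tracer_enabled = new_value;		/* what proc_dointvec() would do */

	if (was_enabled == !!tracer_enabled)
		return;				/* no transition: nothing to do */

	if (tracer_enabled)
		register_tracer();
	else
		unregister_tracer();
}

int main(void)
{
	sysctl_write(1);	/* 0 -> 1: registers   */
	sysctl_write(1);	/* 1 -> 1: no-op       */
	sysctl_write(0);	/* 1 -> 0: unregisters */
	sysctl_write(0);	/* 0 -> 0: no-op       */
	printf("registered=%d\n", registered);
	return 0;
}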