diff options
Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/Makefile | 101 | ||||
-rw-r--r-- | kernel/audit.c | 2 | ||||
-rw-r--r-- | kernel/audit.h | 18 | ||||
-rw-r--r-- | kernel/audit_fsnotify.c | 216 | ||||
-rw-r--r-- | kernel/audit_tree.c | 2 | ||||
-rw-r--r-- | kernel/audit_watch.c | 56 | ||||
-rw-r--r-- | kernel/auditfilter.c | 83 | ||||
-rw-r--r-- | kernel/auditsc.c | 9 | ||||
-rw-r--r-- | kernel/memremap.c | 190 | ||||
-rw-r--r-- | kernel/module_signing.c | 213 | ||||
-rw-r--r-- | kernel/ptrace.c | 13 | ||||
-rw-r--r-- | kernel/resource.c | 61 | ||||
-rw-r--r-- | kernel/seccomp.c | 17 | ||||
-rw-r--r-- | kernel/system_certificates.S | 20 | ||||
-rw-r--r-- | kernel/system_keyring.c | 106 | ||||
-rw-r--r-- | kernel/trace/ftrace.c | 9 | ||||
-rw-r--r-- | kernel/trace/ring_buffer.c | 764 | ||||
-rw-r--r-- | kernel/trace/trace.c | 4 | ||||
-rw-r--r-- | kernel/trace/trace_events.c | 25 | ||||
-rw-r--r-- | kernel/trace/trace_events_filter.c | 54 | ||||
-rw-r--r-- | kernel/trace/trace_functions_graph.c | 4 | ||||
-rw-r--r-- | kernel/trace/trace_output.c | 4 | ||||
-rw-r--r-- | kernel/trace/trace_stack.c | 68 |
23 files changed, 1140 insertions, 899 deletions
diff --git a/kernel/Makefile b/kernel/Makefile index 718fb8afab7a..e0d7587e7684 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -45,7 +45,6 @@ ifneq ($(CONFIG_SMP),y) obj-y += up.o endif obj-$(CONFIG_UID16) += uid16.o -obj-$(CONFIG_SYSTEM_TRUSTED_KEYRING) += system_keyring.o system_certificates.o obj-$(CONFIG_MODULES) += module.o obj-$(CONFIG_MODULE_SIG) += module_signing.o obj-$(CONFIG_KALLSYMS) += kallsyms.o @@ -65,7 +64,7 @@ obj-$(CONFIG_SMP) += stop_machine.o obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o obj-$(CONFIG_AUDIT) += audit.o auditfilter.o obj-$(CONFIG_AUDITSYSCALL) += auditsc.o -obj-$(CONFIG_AUDIT_WATCH) += audit_watch.o +obj-$(CONFIG_AUDIT_WATCH) += audit_watch.o audit_fsnotify.o obj-$(CONFIG_AUDIT_TREE) += audit_tree.o obj-$(CONFIG_GCOV_KERNEL) += gcov/ obj-$(CONFIG_KPROBES) += kprobes.o @@ -100,6 +99,8 @@ obj-$(CONFIG_JUMP_LABEL) += jump_label.o obj-$(CONFIG_CONTEXT_TRACKING) += context_tracking.o obj-$(CONFIG_TORTURE_TEST) += torture.o +obj-$(CONFIG_HAS_IOMEM) += memremap.o + $(obj)/configs.o: $(obj)/config_data.h # config_data.h contains the same information as ikconfig.h but gzipped. @@ -112,99 +113,3 @@ $(obj)/config_data.gz: $(KCONFIG_CONFIG) FORCE targets += config_data.h $(obj)/config_data.h: $(obj)/config_data.gz FORCE $(call filechk,ikconfiggz) - -############################################################################### -# -# Roll all the X.509 certificates that we can find together and pull them into -# the kernel so that they get loaded into the system trusted keyring during -# boot. -# -# We look in the source root and the build root for all files whose name ends -# in ".x509". Unfortunately, this will generate duplicate filenames, so we -# have make canonicalise the pathnames and then sort them to discard the -# duplicates. -# -############################################################################### -ifeq ($(CONFIG_SYSTEM_TRUSTED_KEYRING),y) -X509_CERTIFICATES-y := $(wildcard *.x509) $(wildcard $(srctree)/*.x509) -X509_CERTIFICATES-$(CONFIG_MODULE_SIG) += $(objtree)/signing_key.x509 -X509_CERTIFICATES-raw := $(sort $(foreach CERT,$(X509_CERTIFICATES-y), \ - $(or $(realpath $(CERT)),$(CERT)))) -X509_CERTIFICATES := $(subst $(realpath $(objtree))/,,$(X509_CERTIFICATES-raw)) - -ifeq ($(X509_CERTIFICATES),) -$(warning *** No X.509 certificates found ***) -endif - -ifneq ($(wildcard $(obj)/.x509.list),) -ifneq ($(shell cat $(obj)/.x509.list),$(X509_CERTIFICATES)) -$(warning X.509 certificate list changed to "$(X509_CERTIFICATES)" from "$(shell cat $(obj)/.x509.list)") -$(shell rm $(obj)/.x509.list) -endif -endif - -kernel/system_certificates.o: $(obj)/x509_certificate_list - -quiet_cmd_x509certs = CERTS $@ - cmd_x509certs = cat $(X509_CERTIFICATES) /dev/null >$@ $(foreach X509,$(X509_CERTIFICATES),; $(kecho) " - Including cert $(X509)") - -targets += $(obj)/x509_certificate_list -$(obj)/x509_certificate_list: $(X509_CERTIFICATES) $(obj)/.x509.list - $(call if_changed,x509certs) - -targets += $(obj)/.x509.list -$(obj)/.x509.list: - @echo $(X509_CERTIFICATES) >$@ -endif - -clean-files := x509_certificate_list .x509.list - -ifeq ($(CONFIG_MODULE_SIG),y) -############################################################################### -# -# If module signing is requested, say by allyesconfig, but a key has not been -# supplied, then one will need to be generated to make sure the build does not -# fail and that the kernel may be used afterwards. -# -############################################################################### -ifndef CONFIG_MODULE_SIG_HASH -$(error Could not determine digest type to use from kernel config) -endif - -signing_key.priv signing_key.x509: x509.genkey - @echo "###" - @echo "### Now generating an X.509 key pair to be used for signing modules." - @echo "###" - @echo "### If this takes a long time, you might wish to run rngd in the" - @echo "### background to keep the supply of entropy topped up. It" - @echo "### needs to be run as root, and uses a hardware random" - @echo "### number generator if one is available." - @echo "###" - openssl req -new -nodes -utf8 -$(CONFIG_MODULE_SIG_HASH) -days 36500 \ - -batch -x509 -config x509.genkey \ - -outform DER -out signing_key.x509 \ - -keyout signing_key.priv 2>&1 - @echo "###" - @echo "### Key pair generated." - @echo "###" - -x509.genkey: - @echo Generating X.509 key generation config - @echo >x509.genkey "[ req ]" - @echo >>x509.genkey "default_bits = 4096" - @echo >>x509.genkey "distinguished_name = req_distinguished_name" - @echo >>x509.genkey "prompt = no" - @echo >>x509.genkey "string_mask = utf8only" - @echo >>x509.genkey "x509_extensions = myexts" - @echo >>x509.genkey - @echo >>x509.genkey "[ req_distinguished_name ]" - @echo >>x509.genkey "#O = Unspecified company" - @echo >>x509.genkey "CN = Build time autogenerated kernel key" - @echo >>x509.genkey "#emailAddress = unspecified.user@unspecified.company" - @echo >>x509.genkey - @echo >>x509.genkey "[ myexts ]" - @echo >>x509.genkey "basicConstraints=critical,CA:FALSE" - @echo >>x509.genkey "keyUsage=digitalSignature" - @echo >>x509.genkey "subjectKeyIdentifier=hash" - @echo >>x509.genkey "authorityKeyIdentifier=keyid" -endif diff --git a/kernel/audit.c b/kernel/audit.c index f9e6065346db..662c007635fb 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1761,7 +1761,7 @@ void audit_log_name(struct audit_context *context, struct audit_names *n, } else audit_log_format(ab, " name=(null)"); - if (n->ino != (unsigned long)-1) + if (n->ino != AUDIT_INO_UNSET) audit_log_format(ab, " inode=%lu" " dev=%02x:%02x mode=%#ho" " ouid=%u ogid=%u rdev=%02x:%02x", diff --git a/kernel/audit.h b/kernel/audit.h index d641f9bb3ed0..dadf86a0e59e 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -50,6 +50,7 @@ enum audit_state { /* Rule lists */ struct audit_watch; +struct audit_fsnotify_mark; struct audit_tree; struct audit_chunk; @@ -252,6 +253,7 @@ struct audit_net { extern int selinux_audit_rule_update(void); extern struct mutex audit_filter_mutex; +extern int audit_del_rule(struct audit_entry *); extern void audit_free_rule_rcu(struct rcu_head *); extern struct list_head audit_filter_list[]; @@ -269,6 +271,15 @@ extern int audit_add_watch(struct audit_krule *krule, struct list_head **list); extern void audit_remove_watch_rule(struct audit_krule *krule); extern char *audit_watch_path(struct audit_watch *watch); extern int audit_watch_compare(struct audit_watch *watch, unsigned long ino, dev_t dev); + +extern struct audit_fsnotify_mark *audit_alloc_mark(struct audit_krule *krule, char *pathname, int len); +extern char *audit_mark_path(struct audit_fsnotify_mark *mark); +extern void audit_remove_mark(struct audit_fsnotify_mark *audit_mark); +extern void audit_remove_mark_rule(struct audit_krule *krule); +extern int audit_mark_compare(struct audit_fsnotify_mark *mark, unsigned long ino, dev_t dev); +extern int audit_dupe_exe(struct audit_krule *new, struct audit_krule *old); +extern int audit_exe_compare(struct task_struct *tsk, struct audit_fsnotify_mark *mark); + #else #define audit_put_watch(w) {} #define audit_get_watch(w) {} @@ -278,6 +289,13 @@ extern int audit_watch_compare(struct audit_watch *watch, unsigned long ino, dev #define audit_watch_path(w) "" #define audit_watch_compare(w, i, d) 0 +#define audit_alloc_mark(k, p, l) (ERR_PTR(-EINVAL)) +#define audit_mark_path(m) "" +#define audit_remove_mark(m) +#define audit_remove_mark_rule(k) +#define audit_mark_compare(m, i, d) 0 +#define audit_exe_compare(t, m) (-EINVAL) +#define audit_dupe_exe(n, o) (-EINVAL) #endif /* CONFIG_AUDIT_WATCH */ #ifdef CONFIG_AUDIT_TREE diff --git a/kernel/audit_fsnotify.c b/kernel/audit_fsnotify.c new file mode 100644 index 000000000000..27c6046c2c3d --- /dev/null +++ b/kernel/audit_fsnotify.c @@ -0,0 +1,216 @@ +/* audit_fsnotify.c -- tracking inodes + * + * Copyright 2003-2009,2014-2015 Red Hat, Inc. + * Copyright 2005 Hewlett-Packard Development Company, L.P. + * Copyright 2005 IBM Corporation + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#include <linux/kernel.h> +#include <linux/audit.h> +#include <linux/kthread.h> +#include <linux/mutex.h> +#include <linux/fs.h> +#include <linux/fsnotify_backend.h> +#include <linux/namei.h> +#include <linux/netlink.h> +#include <linux/sched.h> +#include <linux/slab.h> +#include <linux/security.h> +#include "audit.h" + +/* + * this mark lives on the parent directory of the inode in question. + * but dev, ino, and path are about the child + */ +struct audit_fsnotify_mark { + dev_t dev; /* associated superblock device */ + unsigned long ino; /* associated inode number */ + char *path; /* insertion path */ + struct fsnotify_mark mark; /* fsnotify mark on the inode */ + struct audit_krule *rule; +}; + +/* fsnotify handle. */ +static struct fsnotify_group *audit_fsnotify_group; + +/* fsnotify events we care about. */ +#define AUDIT_FS_EVENTS (FS_MOVE | FS_CREATE | FS_DELETE | FS_DELETE_SELF |\ + FS_MOVE_SELF | FS_EVENT_ON_CHILD) + +static void audit_fsnotify_mark_free(struct audit_fsnotify_mark *audit_mark) +{ + kfree(audit_mark->path); + kfree(audit_mark); +} + +static void audit_fsnotify_free_mark(struct fsnotify_mark *mark) +{ + struct audit_fsnotify_mark *audit_mark; + + audit_mark = container_of(mark, struct audit_fsnotify_mark, mark); + audit_fsnotify_mark_free(audit_mark); +} + +char *audit_mark_path(struct audit_fsnotify_mark *mark) +{ + return mark->path; +} + +int audit_mark_compare(struct audit_fsnotify_mark *mark, unsigned long ino, dev_t dev) +{ + if (mark->ino == AUDIT_INO_UNSET) + return 0; + return (mark->ino == ino) && (mark->dev == dev); +} + +static void audit_update_mark(struct audit_fsnotify_mark *audit_mark, + struct inode *inode) +{ + audit_mark->dev = inode ? inode->i_sb->s_dev : AUDIT_DEV_UNSET; + audit_mark->ino = inode ? inode->i_ino : AUDIT_INO_UNSET; +} + +struct audit_fsnotify_mark *audit_alloc_mark(struct audit_krule *krule, char *pathname, int len) +{ + struct audit_fsnotify_mark *audit_mark; + struct path path; + struct dentry *dentry; + struct inode *inode; + int ret; + + if (pathname[0] != '/' || pathname[len-1] == '/') + return ERR_PTR(-EINVAL); + + dentry = kern_path_locked(pathname, &path); + if (IS_ERR(dentry)) + return (void *)dentry; /* returning an error */ + inode = path.dentry->d_inode; + mutex_unlock(&inode->i_mutex); + + audit_mark = kzalloc(sizeof(*audit_mark), GFP_KERNEL); + if (unlikely(!audit_mark)) { + audit_mark = ERR_PTR(-ENOMEM); + goto out; + } + + fsnotify_init_mark(&audit_mark->mark, audit_fsnotify_free_mark); + audit_mark->mark.mask = AUDIT_FS_EVENTS; + audit_mark->path = pathname; + audit_update_mark(audit_mark, dentry->d_inode); + audit_mark->rule = krule; + + ret = fsnotify_add_mark(&audit_mark->mark, audit_fsnotify_group, inode, NULL, true); + if (ret < 0) { + audit_fsnotify_mark_free(audit_mark); + audit_mark = ERR_PTR(ret); + } +out: + dput(dentry); + path_put(&path); + return audit_mark; +} + +static void audit_mark_log_rule_change(struct audit_fsnotify_mark *audit_mark, char *op) +{ + struct audit_buffer *ab; + struct audit_krule *rule = audit_mark->rule; + + if (!audit_enabled) + return; + ab = audit_log_start(NULL, GFP_NOFS, AUDIT_CONFIG_CHANGE); + if (unlikely(!ab)) + return; + audit_log_format(ab, "auid=%u ses=%u op=", + from_kuid(&init_user_ns, audit_get_loginuid(current)), + audit_get_sessionid(current)); + audit_log_string(ab, op); + audit_log_format(ab, " path="); + audit_log_untrustedstring(ab, audit_mark->path); + audit_log_key(ab, rule->filterkey); + audit_log_format(ab, " list=%d res=1", rule->listnr); + audit_log_end(ab); +} + +void audit_remove_mark(struct audit_fsnotify_mark *audit_mark) +{ + fsnotify_destroy_mark(&audit_mark->mark, audit_fsnotify_group); + fsnotify_put_mark(&audit_mark->mark); +} + +void audit_remove_mark_rule(struct audit_krule *krule) +{ + struct audit_fsnotify_mark *mark = krule->exe; + + audit_remove_mark(mark); +} + +static void audit_autoremove_mark_rule(struct audit_fsnotify_mark *audit_mark) +{ + struct audit_krule *rule = audit_mark->rule; + struct audit_entry *entry = container_of(rule, struct audit_entry, rule); + + audit_mark_log_rule_change(audit_mark, "autoremove_rule"); + audit_del_rule(entry); +} + +/* Update mark data in audit rules based on fsnotify events. */ +static int audit_mark_handle_event(struct fsnotify_group *group, + struct inode *to_tell, + struct fsnotify_mark *inode_mark, + struct fsnotify_mark *vfsmount_mark, + u32 mask, void *data, int data_type, + const unsigned char *dname, u32 cookie) +{ + struct audit_fsnotify_mark *audit_mark; + struct inode *inode = NULL; + + audit_mark = container_of(inode_mark, struct audit_fsnotify_mark, mark); + + BUG_ON(group != audit_fsnotify_group); + + switch (data_type) { + case (FSNOTIFY_EVENT_PATH): + inode = ((struct path *)data)->dentry->d_inode; + break; + case (FSNOTIFY_EVENT_INODE): + inode = (struct inode *)data; + break; + default: + BUG(); + return 0; + }; + + if (mask & (FS_CREATE|FS_MOVED_TO|FS_DELETE|FS_MOVED_FROM)) { + if (audit_compare_dname_path(dname, audit_mark->path, AUDIT_NAME_FULL)) + return 0; + audit_update_mark(audit_mark, inode); + } else if (mask & (FS_DELETE_SELF|FS_UNMOUNT|FS_MOVE_SELF)) + audit_autoremove_mark_rule(audit_mark); + + return 0; +} + +static const struct fsnotify_ops audit_mark_fsnotify_ops = { + .handle_event = audit_mark_handle_event, +}; + +static int __init audit_fsnotify_init(void) +{ + audit_fsnotify_group = fsnotify_alloc_group(&audit_mark_fsnotify_ops); + if (IS_ERR(audit_fsnotify_group)) { + audit_fsnotify_group = NULL; + audit_panic("cannot create audit fsnotify group"); + } + return 0; +} +device_initcall(audit_fsnotify_init); diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index b0f9877273fc..94ecdabda8e6 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -479,6 +479,8 @@ static void kill_rules(struct audit_tree *tree) if (rule->tree) { /* not a half-baked one */ audit_tree_log_remove_rule(rule); + if (entry->rule.exe) + audit_remove_mark(entry->rule.exe); rule->tree = NULL; list_del_rcu(&entry->list); list_del(&entry->rule.list); diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index 6e30024d9aac..656c7e93ac0d 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -138,7 +138,7 @@ char *audit_watch_path(struct audit_watch *watch) int audit_watch_compare(struct audit_watch *watch, unsigned long ino, dev_t dev) { - return (watch->ino != (unsigned long)-1) && + return (watch->ino != AUDIT_INO_UNSET) && (watch->ino == ino) && (watch->dev == dev); } @@ -179,8 +179,8 @@ static struct audit_watch *audit_init_watch(char *path) INIT_LIST_HEAD(&watch->rules); atomic_set(&watch->count, 1); watch->path = path; - watch->dev = (dev_t)-1; - watch->ino = (unsigned long)-1; + watch->dev = AUDIT_DEV_UNSET; + watch->ino = AUDIT_INO_UNSET; return watch; } @@ -203,7 +203,6 @@ int audit_to_watch(struct audit_krule *krule, char *path, int len, u32 op) if (IS_ERR(watch)) return PTR_ERR(watch); - audit_get_watch(watch); krule->watch = watch; return 0; @@ -313,6 +312,8 @@ static void audit_update_watch(struct audit_parent *parent, list_replace(&oentry->rule.list, &nentry->rule.list); } + if (oentry->rule.exe) + audit_remove_mark(oentry->rule.exe); audit_watch_log_rule_change(r, owatch, "updated_rules"); @@ -343,6 +344,8 @@ static void audit_remove_parent_watches(struct audit_parent *parent) list_for_each_entry_safe(r, nextr, &w->rules, rlist) { e = container_of(r, struct audit_entry, rule); audit_watch_log_rule_change(r, w, "remove_rule"); + if (e->rule.exe) + audit_remove_mark(e->rule.exe); list_del(&r->rlist); list_del(&r->list); list_del_rcu(&e->list); @@ -387,19 +390,20 @@ static void audit_add_to_parent(struct audit_krule *krule, watch_found = 1; - /* put krule's and initial refs to temporary watch */ - audit_put_watch(watch); + /* put krule's ref to temporary watch */ audit_put_watch(watch); audit_get_watch(w); krule->watch = watch = w; + + audit_put_parent(parent); break; } if (!watch_found) { - audit_get_parent(parent); watch->parent = parent; + audit_get_watch(watch); list_add(&watch->wlist, &parent->watches); } list_add(&krule->rlist, &watch->rules); @@ -437,9 +441,6 @@ int audit_add_watch(struct audit_krule *krule, struct list_head **list) audit_add_to_parent(krule, parent); - /* match get in audit_find_parent or audit_init_parent */ - audit_put_parent(parent); - h = audit_hash_ino((u32)watch->ino); *list = &audit_inode_hash[h]; error: @@ -496,7 +497,7 @@ static int audit_watch_handle_event(struct fsnotify_group *group, if (mask & (FS_CREATE|FS_MOVED_TO) && inode) audit_update_watch(parent, dname, inode->i_sb->s_dev, inode->i_ino, 0); else if (mask & (FS_DELETE|FS_MOVED_FROM)) - audit_update_watch(parent, dname, (dev_t)-1, (unsigned long)-1, 1); + audit_update_watch(parent, dname, AUDIT_DEV_UNSET, AUDIT_INO_UNSET, 1); else if (mask & (FS_DELETE_SELF|FS_UNMOUNT|FS_MOVE_SELF)) audit_remove_parent_watches(parent); @@ -517,3 +518,36 @@ static int __init audit_watch_init(void) return 0; } device_initcall(audit_watch_init); + +int audit_dupe_exe(struct audit_krule *new, struct audit_krule *old) +{ + struct audit_fsnotify_mark *audit_mark; + char *pathname; + + pathname = kstrdup(audit_mark_path(old->exe), GFP_KERNEL); + if (!pathname) + return -ENOMEM; + + audit_mark = audit_alloc_mark(new, pathname, strlen(pathname)); + if (IS_ERR(audit_mark)) { + kfree(pathname); + return PTR_ERR(audit_mark); + } + new->exe = audit_mark; + + return 0; +} + +int audit_exe_compare(struct task_struct *tsk, struct audit_fsnotify_mark *mark) +{ + struct file *exe_file; + unsigned long ino; + dev_t dev; + + rcu_read_lock(); + exe_file = rcu_dereference(tsk->mm->exe_file); + ino = exe_file->f_inode->i_ino; + dev = exe_file->f_inode->i_sb->s_dev; + rcu_read_unlock(); + return audit_mark_compare(mark, ino, dev); +} diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 72e1660a79a3..7714d93edb85 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -405,6 +405,12 @@ static int audit_field_valid(struct audit_entry *entry, struct audit_field *f) if (f->val > AUDIT_MAX_FIELD_COMPARE) return -EINVAL; break; + case AUDIT_EXE: + if (f->op != Audit_equal) + return -EINVAL; + if (entry->rule.listnr != AUDIT_FILTER_EXIT) + return -EINVAL; + break; }; return 0; } @@ -419,6 +425,7 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, size_t remain = datasz - sizeof(struct audit_rule_data); int i; char *str; + struct audit_fsnotify_mark *audit_mark; entry = audit_to_entry_common(data); if (IS_ERR(entry)) @@ -539,6 +546,24 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, entry->rule.buflen += f->val; entry->rule.filterkey = str; break; + case AUDIT_EXE: + if (entry->rule.exe || f->val > PATH_MAX) + goto exit_free; + str = audit_unpack_string(&bufp, &remain, f->val); + if (IS_ERR(str)) { + err = PTR_ERR(str); + goto exit_free; + } + entry->rule.buflen += f->val; + + audit_mark = audit_alloc_mark(&entry->rule, str, f->val); + if (IS_ERR(audit_mark)) { + kfree(str); + err = PTR_ERR(audit_mark); + goto exit_free; + } + entry->rule.exe = audit_mark; + break; } } @@ -549,10 +574,10 @@ exit_nofree: return entry; exit_free: - if (entry->rule.watch) - audit_put_watch(entry->rule.watch); /* matches initial get */ if (entry->rule.tree) audit_put_tree(entry->rule.tree); /* that's the temporary one */ + if (entry->rule.exe) + audit_remove_mark(entry->rule.exe); /* that's the template one */ audit_free_rule(entry); return ERR_PTR(err); } @@ -617,6 +642,10 @@ static struct audit_rule_data *audit_krule_to_data(struct audit_krule *krule) data->buflen += data->values[i] = audit_pack_string(&bufp, krule->filterkey); break; + case AUDIT_EXE: + data->buflen += data->values[i] = + audit_pack_string(&bufp, audit_mark_path(krule->exe)); + break; case AUDIT_LOGINUID_SET: if (krule->pflags & AUDIT_LOGINUID_LEGACY && !f->val) { data->fields[i] = AUDIT_LOGINUID; @@ -680,6 +709,12 @@ static int audit_compare_rule(struct audit_krule *a, struct audit_krule *b) if (strcmp(a->filterkey, b->filterkey)) return 1; break; + case AUDIT_EXE: + /* both paths exist based on above type compare */ + if (strcmp(audit_mark_path(a->exe), + audit_mark_path(b->exe))) + return 1; + break; case AUDIT_UID: case AUDIT_EUID: case AUDIT_SUID: @@ -801,8 +836,14 @@ struct audit_entry *audit_dupe_rule(struct audit_krule *old) err = -ENOMEM; else new->filterkey = fk; + break; + case AUDIT_EXE: + err = audit_dupe_exe(new, old); + break; } if (err) { + if (new->exe) + audit_remove_mark(new->exe); audit_free_rule(entry); return ERR_PTR(err); } @@ -863,7 +904,7 @@ static inline int audit_add_rule(struct audit_entry *entry) struct audit_watch *watch = entry->rule.watch; struct audit_tree *tree = entry->rule.tree; struct list_head *list; - int err; + int err = 0; #ifdef CONFIG_AUDITSYSCALL int dont_count = 0; @@ -881,7 +922,7 @@ static inline int audit_add_rule(struct audit_entry *entry) /* normally audit_add_tree_rule() will free it on failure */ if (tree) audit_put_tree(tree); - goto error; + return err; } if (watch) { @@ -895,14 +936,14 @@ static inline int audit_add_rule(struct audit_entry *entry) */ if (tree) audit_put_tree(tree); - goto error; + return err; } } if (tree) { err = audit_add_tree_rule(&entry->rule); if (err) { mutex_unlock(&audit_filter_mutex); - goto error; + return err; } } @@ -933,19 +974,13 @@ static inline int audit_add_rule(struct audit_entry *entry) #endif mutex_unlock(&audit_filter_mutex); - return 0; - -error: - if (watch) - audit_put_watch(watch); /* tmp watch, matches initial get */ return err; } /* Remove an existing rule from filterlist. */ -static inline int audit_del_rule(struct audit_entry *entry) +int audit_del_rule(struct audit_entry *entry) { struct audit_entry *e; - struct audit_watch *watch = entry->rule.watch; struct audit_tree *tree = entry->rule.tree; struct list_head *list; int ret = 0; @@ -961,7 +996,6 @@ static inline int audit_del_rule(struct audit_entry *entry) mutex_lock(&audit_filter_mutex); e = audit_find_rule(entry, &list); if (!e) { - mutex_unlock(&audit_filter_mutex); ret = -ENOENT; goto out; } @@ -972,9 +1006,8 @@ static inline int audit_del_rule(struct audit_entry *entry) if (e->rule.tree) audit_remove_tree_rule(&e->rule); - list_del_rcu(&e->list); - list_del(&e->rule.list); - call_rcu(&e->rcu, audit_free_rule_rcu); + if (e->rule.exe) + audit_remove_mark_rule(&e->rule); #ifdef CONFIG_AUDITSYSCALL if (!dont_count) @@ -983,11 +1016,14 @@ static inline int audit_del_rule(struct audit_entry *entry) if (!audit_match_signal(entry)) audit_signals--; #endif - mutex_unlock(&audit_filter_mutex); + + list_del_rcu(&e->list); + list_del(&e->rule.list); + call_rcu(&e->rcu, audit_free_rule_rcu); out: - if (watch) - audit_put_watch(watch); /* match initial get */ + mutex_unlock(&audit_filter_mutex); + if (tree) audit_put_tree(tree); /* that's the temporary one */ @@ -1077,8 +1113,11 @@ int audit_rule_change(int type, __u32 portid, int seq, void *data, WARN_ON(1); } - if (err || type == AUDIT_DEL_RULE) + if (err || type == AUDIT_DEL_RULE) { + if (entry->rule.exe) + audit_remove_mark(entry->rule.exe); audit_free_rule(entry); + } return err; } @@ -1370,6 +1409,8 @@ static int update_lsm_rule(struct audit_krule *r) return 0; nentry = audit_dupe_rule(r); + if (entry->rule.exe) + audit_remove_mark(entry->rule.exe); if (IS_ERR(nentry)) { /* save the first error encountered for the * return value */ diff --git a/kernel/auditsc.c b/kernel/auditsc.c index e85bdfd15fed..b86cc04959de 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -180,7 +180,7 @@ static int audit_match_filetype(struct audit_context *ctx, int val) return 0; list_for_each_entry(n, &ctx->names_list, list) { - if ((n->ino != -1) && + if ((n->ino != AUDIT_INO_UNSET) && ((n->mode & S_IFMT) == mode)) return 1; } @@ -466,6 +466,9 @@ static int audit_filter_rules(struct task_struct *tsk, result = audit_comparator(ctx->ppid, f->op, f->val); } break; + case AUDIT_EXE: + result = audit_exe_compare(tsk, rule->exe); + break; case AUDIT_UID: result = audit_uid_comparator(cred->uid, f->op, f->uid); break; @@ -1680,7 +1683,7 @@ static struct audit_names *audit_alloc_name(struct audit_context *context, aname->should_free = true; } - aname->ino = (unsigned long)-1; + aname->ino = AUDIT_INO_UNSET; aname->type = type; list_add_tail(&aname->list, &context->names_list); @@ -1922,7 +1925,7 @@ void __audit_inode_child(const struct inode *parent, if (inode) audit_copy_inode(found_child, dentry, inode); else - found_child->ino = (unsigned long)-1; + found_child->ino = AUDIT_INO_UNSET; } EXPORT_SYMBOL_GPL(__audit_inode_child); diff --git a/kernel/memremap.c b/kernel/memremap.c new file mode 100644 index 000000000000..72b0c66628b6 --- /dev/null +++ b/kernel/memremap.c @@ -0,0 +1,190 @@ +/* + * Copyright(c) 2015 Intel Corporation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ +#include <linux/device.h> +#include <linux/types.h> +#include <linux/io.h> +#include <linux/mm.h> +#include <linux/memory_hotplug.h> + +#ifndef ioremap_cache +/* temporary while we convert existing ioremap_cache users to memremap */ +__weak void __iomem *ioremap_cache(resource_size_t offset, unsigned long size) +{ + return ioremap(offset, size); +} +#endif + +/** + * memremap() - remap an iomem_resource as cacheable memory + * @offset: iomem resource start address + * @size: size of remap + * @flags: either MEMREMAP_WB or MEMREMAP_WT + * + * memremap() is "ioremap" for cases where it is known that the resource + * being mapped does not have i/o side effects and the __iomem + * annotation is not applicable. + * + * MEMREMAP_WB - matches the default mapping for "System RAM" on + * the architecture. This is usually a read-allocate write-back cache. + * Morever, if MEMREMAP_WB is specified and the requested remap region is RAM + * memremap() will bypass establishing a new mapping and instead return + * a pointer into the direct map. + * + * MEMREMAP_WT - establish a mapping whereby writes either bypass the + * cache or are written through to memory and never exist in a + * cache-dirty state with respect to program visibility. Attempts to + * map "System RAM" with this mapping type will fail. + */ +void *memremap(resource_size_t offset, size_t size, unsigned long flags) +{ + int is_ram = region_intersects(offset, size, "System RAM"); + void *addr = NULL; + + if (is_ram == REGION_MIXED) { + WARN_ONCE(1, "memremap attempted on mixed range %pa size: %#lx\n", + &offset, (unsigned long) size); + return NULL; + } + + /* Try all mapping types requested until one returns non-NULL */ + if (flags & MEMREMAP_WB) { + flags &= ~MEMREMAP_WB; + /* + * MEMREMAP_WB is special in that it can be satisifed + * from the direct map. Some archs depend on the + * capability of memremap() to autodetect cases where + * the requested range is potentially in "System RAM" + */ + if (is_ram == REGION_INTERSECTS) + addr = __va(offset); + else + addr = ioremap_cache(offset, size); + } + + /* + * If we don't have a mapping yet and more request flags are + * pending then we will be attempting to establish a new virtual + * address mapping. Enforce that this mapping is not aliasing + * "System RAM" + */ + if (!addr && is_ram == REGION_INTERSECTS && flags) { + WARN_ONCE(1, "memremap attempted on ram %pa size: %#lx\n", + &offset, (unsigned long) size); + return NULL; + } + + if (!addr && (flags & MEMREMAP_WT)) { + flags &= ~MEMREMAP_WT; + addr = ioremap_wt(offset, size); + } + + return addr; +} +EXPORT_SYMBOL(memremap); + +void memunmap(void *addr) +{ + if (is_vmalloc_addr(addr)) + iounmap((void __iomem *) addr); +} +EXPORT_SYMBOL(memunmap); + +static void devm_memremap_release(struct device *dev, void *res) +{ + memunmap(res); +} + +static int devm_memremap_match(struct device *dev, void *res, void *match_data) +{ + return *(void **)res == match_data; +} + +void *devm_memremap(struct device *dev, resource_size_t offset, + size_t size, unsigned long flags) +{ + void **ptr, *addr; + + ptr = devres_alloc(devm_memremap_release, sizeof(*ptr), GFP_KERNEL); + if (!ptr) + return NULL; + + addr = memremap(offset, size, flags); + if (addr) { + *ptr = addr; + devres_add(dev, ptr); + } else + devres_free(ptr); + + return addr; +} +EXPORT_SYMBOL(devm_memremap); + +void devm_memunmap(struct device *dev, void *addr) +{ + WARN_ON(devres_destroy(dev, devm_memremap_release, devm_memremap_match, + addr)); + memunmap(addr); +} +EXPORT_SYMBOL(devm_memunmap); + +#ifdef CONFIG_ZONE_DEVICE +struct page_map { + struct resource res; +}; + +static void devm_memremap_pages_release(struct device *dev, void *res) +{ + struct page_map *page_map = res; + + /* pages are dead and unused, undo the arch mapping */ + arch_remove_memory(page_map->res.start, resource_size(&page_map->res)); +} + +void *devm_memremap_pages(struct device *dev, struct resource *res) +{ + int is_ram = region_intersects(res->start, resource_size(res), + "System RAM"); + struct page_map *page_map; + int error, nid; + + if (is_ram == REGION_MIXED) { + WARN_ONCE(1, "%s attempted on mixed region %pr\n", + __func__, res); + return ERR_PTR(-ENXIO); + } + + if (is_ram == REGION_INTERSECTS) + return __va(res->start); + + page_map = devres_alloc(devm_memremap_pages_release, + sizeof(*page_map), GFP_KERNEL); + if (!page_map) + return ERR_PTR(-ENOMEM); + + memcpy(&page_map->res, res, sizeof(*res)); + + nid = dev_to_node(dev); + if (nid < 0) + nid = 0; + + error = arch_add_memory(nid, res->start, resource_size(res), true); + if (error) { + devres_free(page_map); + return ERR_PTR(error); + } + + devres_add(dev, page_map); + return __va(res->start); +} +EXPORT_SYMBOL(devm_memremap_pages); +#endif /* CONFIG_ZONE_DEVICE */ diff --git a/kernel/module_signing.c b/kernel/module_signing.c index be5b8fac4bd0..bd62f5cda746 100644 --- a/kernel/module_signing.c +++ b/kernel/module_signing.c @@ -10,11 +10,8 @@ */ #include <linux/kernel.h> -#include <linux/err.h> -#include <crypto/public_key.h> -#include <crypto/hash.h> -#include <keys/asymmetric-type.h> #include <keys/system_keyring.h> +#include <crypto/public_key.h> #include "module-internal.h" /* @@ -28,170 +25,22 @@ * - Information block */ struct module_signature { - u8 algo; /* Public-key crypto algorithm [enum pkey_algo] */ - u8 hash; /* Digest algorithm [enum hash_algo] */ - u8 id_type; /* Key identifier type [enum pkey_id_type] */ - u8 signer_len; /* Length of signer's name */ - u8 key_id_len; /* Length of key identifier */ + u8 algo; /* Public-key crypto algorithm [0] */ + u8 hash; /* Digest algorithm [0] */ + u8 id_type; /* Key identifier type [PKEY_ID_PKCS7] */ + u8 signer_len; /* Length of signer's name [0] */ + u8 key_id_len; /* Length of key identifier [0] */ u8 __pad[3]; __be32 sig_len; /* Length of signature data */ }; /* - * Digest the module contents. - */ -static struct public_key_signature *mod_make_digest(enum hash_algo hash, - const void *mod, - unsigned long modlen) -{ - struct public_key_signature *pks; - struct crypto_shash *tfm; - struct shash_desc *desc; - size_t digest_size, desc_size; - int ret; - - pr_devel("==>%s()\n", __func__); - - /* Allocate the hashing algorithm we're going to need and find out how - * big the hash operational data will be. - */ - tfm = crypto_alloc_shash(hash_algo_name[hash], 0, 0); - if (IS_ERR(tfm)) - return (PTR_ERR(tfm) == -ENOENT) ? ERR_PTR(-ENOPKG) : ERR_CAST(tfm); - - desc_size = crypto_shash_descsize(tfm) + sizeof(*desc); - digest_size = crypto_shash_digestsize(tfm); - - /* We allocate the hash operational data storage on the end of our - * context data and the digest output buffer on the end of that. - */ - ret = -ENOMEM; - pks = kzalloc(digest_size + sizeof(*pks) + desc_size, GFP_KERNEL); - if (!pks) - goto error_no_pks; - - pks->pkey_hash_algo = hash; - pks->digest = (u8 *)pks + sizeof(*pks) + desc_size; - pks->digest_size = digest_size; - - desc = (void *)pks + sizeof(*pks); - desc->tfm = tfm; - desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP; - - ret = crypto_shash_init(desc); - if (ret < 0) - goto error; - - ret = crypto_shash_finup(desc, mod, modlen, pks->digest); - if (ret < 0) - goto error; - - crypto_free_shash(tfm); - pr_devel("<==%s() = ok\n", __func__); - return pks; - -error: - kfree(pks); -error_no_pks: - crypto_free_shash(tfm); - pr_devel("<==%s() = %d\n", __func__, ret); - return ERR_PTR(ret); -} - -/* - * Extract an MPI array from the signature data. This represents the actual - * signature. Each raw MPI is prefaced by a BE 2-byte value indicating the - * size of the MPI in bytes. - * - * RSA signatures only have one MPI, so currently we only read one. - */ -static int mod_extract_mpi_array(struct public_key_signature *pks, - const void *data, size_t len) -{ - size_t nbytes; - MPI mpi; - - if (len < 3) - return -EBADMSG; - nbytes = ((const u8 *)data)[0] << 8 | ((const u8 *)data)[1]; - data += 2; - len -= 2; - if (len != nbytes) - return -EBADMSG; - - mpi = mpi_read_raw_data(data, nbytes); - if (!mpi) - return -ENOMEM; - pks->mpi[0] = mpi; - pks->nr_mpi = 1; - return 0; -} - -/* - * Request an asymmetric key. - */ -static struct key *request_asymmetric_key(const char *signer, size_t signer_len, - const u8 *key_id, size_t key_id_len) -{ - key_ref_t key; - size_t i; - char *id, *q; - - pr_devel("==>%s(,%zu,,%zu)\n", __func__, signer_len, key_id_len); - - /* Construct an identifier. */ - id = kmalloc(signer_len + 2 + key_id_len * 2 + 1, GFP_KERNEL); - if (!id) - return ERR_PTR(-ENOKEY); - - memcpy(id, signer, signer_len); - - q = id + signer_len; - *q++ = ':'; - *q++ = ' '; - for (i = 0; i < key_id_len; i++) { - *q++ = hex_asc[*key_id >> 4]; - *q++ = hex_asc[*key_id++ & 0x0f]; - } - - *q = 0; - - pr_debug("Look up: \"%s\"\n", id); - - key = keyring_search(make_key_ref(system_trusted_keyring, 1), - &key_type_asymmetric, id); - if (IS_ERR(key)) - pr_warn("Request for unknown module key '%s' err %ld\n", - id, PTR_ERR(key)); - kfree(id); - - if (IS_ERR(key)) { - switch (PTR_ERR(key)) { - /* Hide some search errors */ - case -EACCES: - case -ENOTDIR: - case -EAGAIN: - return ERR_PTR(-ENOKEY); - default: - return ERR_CAST(key); - } - } - - pr_devel("<==%s() = 0 [%x]\n", __func__, key_serial(key_ref_to_ptr(key))); - return key_ref_to_ptr(key); -} - -/* * Verify the signature on a module. */ int mod_verify_sig(const void *mod, unsigned long *_modlen) { - struct public_key_signature *pks; struct module_signature ms; - struct key *key; - const void *sig; size_t modlen = *_modlen, sig_len; - int ret; pr_devel("==>%s(,%zu)\n", __func__, modlen); @@ -205,46 +54,24 @@ int mod_verify_sig(const void *mod, unsigned long *_modlen) if (sig_len >= modlen) return -EBADMSG; modlen -= sig_len; - if ((size_t)ms.signer_len + ms.key_id_len >= modlen) - return -EBADMSG; - modlen -= (size_t)ms.signer_len + ms.key_id_len; - *_modlen = modlen; - sig = mod + modlen; - - /* For the moment, only support RSA and X.509 identifiers */ - if (ms.algo != PKEY_ALGO_RSA || - ms.id_type != PKEY_ID_X509) - return -ENOPKG; - if (ms.hash >= PKEY_HASH__LAST || - !hash_algo_name[ms.hash]) + if (ms.id_type != PKEY_ID_PKCS7) { + pr_err("Module is not signed with expected PKCS#7 message\n"); return -ENOPKG; - - key = request_asymmetric_key(sig, ms.signer_len, - sig + ms.signer_len, ms.key_id_len); - if (IS_ERR(key)) - return PTR_ERR(key); - - pks = mod_make_digest(ms.hash, mod, modlen); - if (IS_ERR(pks)) { - ret = PTR_ERR(pks); - goto error_put_key; } - ret = mod_extract_mpi_array(pks, sig + ms.signer_len + ms.key_id_len, - sig_len); - if (ret < 0) - goto error_free_pks; - - ret = verify_signature(key, pks); - pr_devel("verify_signature() = %d\n", ret); + if (ms.algo != 0 || + ms.hash != 0 || + ms.signer_len != 0 || + ms.key_id_len != 0 || + ms.__pad[0] != 0 || + ms.__pad[1] != 0 || + ms.__pad[2] != 0) { + pr_err("PKCS#7 signature info has unexpected non-zero params\n"); + return -EBADMSG; + } -error_free_pks: - mpi_free(pks->rsa.s); - kfree(pks); -error_put_key: - key_put(key); - pr_devel("<==%s() = %d\n", __func__, ret); - return ret; + return system_verify_data(mod, modlen, mod + modlen, sig_len, + VERIFYING_MODULE_SIGNATURE); } diff --git a/kernel/ptrace.c b/kernel/ptrace.c index c8e0e050a36a..787320de68e0 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -556,6 +556,19 @@ static int ptrace_setoptions(struct task_struct *child, unsigned long data) if (data & ~(unsigned long)PTRACE_O_MASK) return -EINVAL; + if (unlikely(data & PTRACE_O_SUSPEND_SECCOMP)) { + if (!config_enabled(CONFIG_CHECKPOINT_RESTORE) || + !config_enabled(CONFIG_SECCOMP)) + return -EINVAL; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (seccomp_mode(¤t->seccomp) != SECCOMP_MODE_DISABLED || + current->ptrace & PT_SUSPEND_SECCOMP) + return -EPERM; + } + /* Avoid intermediate state when all opts are cleared */ flags = child->ptrace; flags &= ~(PTRACE_O_MASK << PT_OPT_FLAG_SHIFT); diff --git a/kernel/resource.c b/kernel/resource.c index fed052a1bc9f..f150dbbe6f62 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -492,40 +492,51 @@ int __weak page_is_ram(unsigned long pfn) } EXPORT_SYMBOL_GPL(page_is_ram); -/* - * Search for a resouce entry that fully contains the specified region. - * If found, return 1 if it is RAM, 0 if not. - * If not found, or region is not fully contained, return -1 +/** + * region_intersects() - determine intersection of region with known resources + * @start: region start address + * @size: size of region + * @name: name of resource (in iomem_resource) * - * Used by the ioremap functions to ensure the user is not remapping RAM and is - * a vast speed up over walking through the resource table page by page. + * Check if the specified region partially overlaps or fully eclipses a + * resource identified by @name. Return REGION_DISJOINT if the region + * does not overlap @name, return REGION_MIXED if the region overlaps + * @type and another resource, and return REGION_INTERSECTS if the + * region overlaps @type and no other defined resource. Note, that + * REGION_INTERSECTS is also returned in the case when the specified + * region overlaps RAM and undefined memory holes. + * + * region_intersect() is used by memory remapping functions to ensure + * the user is not remapping RAM and is a vast speed up over walking + * through the resource table page by page. */ -int region_is_ram(resource_size_t start, unsigned long size) +int region_intersects(resource_size_t start, size_t size, const char *name) { - struct resource *p; - resource_size_t end = start + size - 1; unsigned long flags = IORESOURCE_MEM | IORESOURCE_BUSY; - const char *name = "System RAM"; - int ret = -1; + resource_size_t end = start + size - 1; + int type = 0; int other = 0; + struct resource *p; read_lock(&resource_lock); for (p = iomem_resource.child; p ; p = p->sibling) { - if (p->end < start) - continue; - - if (p->start <= start && end <= p->end) { - /* resource fully contains region */ - if ((p->flags != flags) || strcmp(p->name, name)) - ret = 0; - else - ret = 1; - break; - } - if (end < p->start) - break; /* not found */ + bool is_type = strcmp(p->name, name) == 0 && p->flags == flags; + + if (start >= p->start && start <= p->end) + is_type ? type++ : other++; + if (end >= p->start && end <= p->end) + is_type ? type++ : other++; + if (p->start >= start && p->end <= end) + is_type ? type++ : other++; } read_unlock(&resource_lock); - return ret; + + if (other == 0) + return type ? REGION_INTERSECTS : REGION_DISJOINT; + + if (type) + return REGION_MIXED; + + return REGION_DISJOINT; } void __weak arch_remove_reservations(struct resource *avail) diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 245df6b32b81..5bd4779282df 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -175,17 +175,16 @@ static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen) */ static u32 seccomp_run_filters(struct seccomp_data *sd) { - struct seccomp_filter *f = ACCESS_ONCE(current->seccomp.filter); struct seccomp_data sd_local; u32 ret = SECCOMP_RET_ALLOW; + /* Make sure cross-thread synced filter points somewhere sane. */ + struct seccomp_filter *f = + lockless_dereference(current->seccomp.filter); /* Ensure unexpected behavior doesn't result in failing open. */ if (unlikely(WARN_ON(f == NULL))) return SECCOMP_RET_KILL; - /* Make sure cross-thread synced filter points somewhere sane. */ - smp_read_barrier_depends(); - if (!sd) { populate_seccomp_data(&sd_local); sd = &sd_local; @@ -549,7 +548,11 @@ void secure_computing_strict(int this_syscall) { int mode = current->seccomp.mode; - if (mode == 0) + if (config_enabled(CONFIG_CHECKPOINT_RESTORE) && + unlikely(current->ptrace & PT_SUSPEND_SECCOMP)) + return; + + if (mode == SECCOMP_MODE_DISABLED) return; else if (mode == SECCOMP_MODE_STRICT) __secure_computing_strict(this_syscall); @@ -650,6 +653,10 @@ u32 seccomp_phase1(struct seccomp_data *sd) int this_syscall = sd ? sd->nr : syscall_get_nr(current, task_pt_regs(current)); + if (config_enabled(CONFIG_CHECKPOINT_RESTORE) && + unlikely(current->ptrace & PT_SUSPEND_SECCOMP)) + return SECCOMP_PHASE1_OK; + switch (mode) { case SECCOMP_MODE_STRICT: __secure_computing_strict(this_syscall); /* may call do_exit */ diff --git a/kernel/system_certificates.S b/kernel/system_certificates.S deleted file mode 100644 index 3e9868d47535..000000000000 --- a/kernel/system_certificates.S +++ /dev/null @@ -1,20 +0,0 @@ -#include <linux/export.h> -#include <linux/init.h> - - __INITRODATA - - .align 8 - .globl VMLINUX_SYMBOL(system_certificate_list) -VMLINUX_SYMBOL(system_certificate_list): -__cert_list_start: - .incbin "kernel/x509_certificate_list" -__cert_list_end: - - .align 8 - .globl VMLINUX_SYMBOL(system_certificate_list_size) -VMLINUX_SYMBOL(system_certificate_list_size): -#ifdef CONFIG_64BIT - .quad __cert_list_end - __cert_list_start -#else - .long __cert_list_end - __cert_list_start -#endif diff --git a/kernel/system_keyring.c b/kernel/system_keyring.c deleted file mode 100644 index 875f64e8935b..000000000000 --- a/kernel/system_keyring.c +++ /dev/null @@ -1,106 +0,0 @@ -/* System trusted keyring for trusted public keys - * - * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved. - * Written by David Howells (dhowells@redhat.com) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public Licence - * as published by the Free Software Foundation; either version - * 2 of the Licence, or (at your option) any later version. - */ - -#include <linux/export.h> -#include <linux/kernel.h> -#include <linux/sched.h> -#include <linux/cred.h> -#include <linux/err.h> -#include <keys/asymmetric-type.h> -#include <keys/system_keyring.h> -#include "module-internal.h" - -struct key *system_trusted_keyring; -EXPORT_SYMBOL_GPL(system_trusted_keyring); - -extern __initconst const u8 system_certificate_list[]; -extern __initconst const unsigned long system_certificate_list_size; - -/* - * Load the compiled-in keys - */ -static __init int system_trusted_keyring_init(void) -{ - pr_notice("Initialise system trusted keyring\n"); - - system_trusted_keyring = - keyring_alloc(".system_keyring", - KUIDT_INIT(0), KGIDT_INIT(0), current_cred(), - ((KEY_POS_ALL & ~KEY_POS_SETATTR) | - KEY_USR_VIEW | KEY_USR_READ | KEY_USR_SEARCH), - KEY_ALLOC_NOT_IN_QUOTA, NULL); - if (IS_ERR(system_trusted_keyring)) - panic("Can't allocate system trusted keyring\n"); - - set_bit(KEY_FLAG_TRUSTED_ONLY, &system_trusted_keyring->flags); - return 0; -} - -/* - * Must be initialised before we try and load the keys into the keyring. - */ -device_initcall(system_trusted_keyring_init); - -/* - * Load the compiled-in list of X.509 certificates. - */ -static __init int load_system_certificate_list(void) -{ - key_ref_t key; - const u8 *p, *end; - size_t plen; - - pr_notice("Loading compiled-in X.509 certificates\n"); - - p = system_certificate_list; - end = p + system_certificate_list_size; - while (p < end) { - /* Each cert begins with an ASN.1 SEQUENCE tag and must be more - * than 256 bytes in size. - */ - if (end - p < 4) - goto dodgy_cert; - if (p[0] != 0x30 && - p[1] != 0x82) - goto dodgy_cert; - plen = (p[2] << 8) | p[3]; - plen += 4; - if (plen > end - p) - goto dodgy_cert; - - key = key_create_or_update(make_key_ref(system_trusted_keyring, 1), - "asymmetric", - NULL, - p, - plen, - ((KEY_POS_ALL & ~KEY_POS_SETATTR) | - KEY_USR_VIEW | KEY_USR_READ), - KEY_ALLOC_NOT_IN_QUOTA | - KEY_ALLOC_TRUSTED); - if (IS_ERR(key)) { - pr_err("Problem loading in-kernel X.509 certificate (%ld)\n", - PTR_ERR(key)); - } else { - set_bit(KEY_FLAG_BUILTIN, &key_ref_to_ptr(key)->flags); - pr_notice("Loaded X.509 cert '%s'\n", - key_ref_to_ptr(key)->description); - key_ref_put(key); - } - p += plen; - } - - return 0; - -dodgy_cert: - pr_err("Problem parsing in-kernel X.509 certificate list\n"); - return 0; -} -late_initcall(load_system_certificate_list); diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index eb11011b5292..b0623ac785a2 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -630,13 +630,18 @@ static int function_stat_show(struct seq_file *m, void *v) goto out; } +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + avg = rec->time; + do_div(avg, rec->counter); + if (tracing_thresh && (avg < tracing_thresh)) + goto out; +#endif + kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); seq_printf(m, " %-30.30s %10lu", str, rec->counter); #ifdef CONFIG_FUNCTION_GRAPH_TRACER seq_puts(m, " "); - avg = rec->time; - do_div(avg, rec->counter); /* Sample standard deviation (s^2) */ if (rec->counter <= 1) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 6260717c18e3..fc347f8b1bca 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -400,6 +400,17 @@ struct rb_irq_work { }; /* + * Structure to hold event state and handle nested events. + */ +struct rb_event_info { + u64 ts; + u64 delta; + unsigned long length; + struct buffer_page *tail_page; + int add_timestamp; +}; + +/* * Used for which event context the event is in. * NMI = 0 * IRQ = 1 @@ -1876,73 +1887,6 @@ rb_event_index(struct ring_buffer_event *event) return (addr & ~PAGE_MASK) - BUF_PAGE_HDR_SIZE; } -static inline int -rb_event_is_commit(struct ring_buffer_per_cpu *cpu_buffer, - struct ring_buffer_event *event) -{ - unsigned long addr = (unsigned long)event; - unsigned long index; - - index = rb_event_index(event); - addr &= PAGE_MASK; - - return cpu_buffer->commit_page->page == (void *)addr && - rb_commit_index(cpu_buffer) == index; -} - -static void -rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer) -{ - unsigned long max_count; - - /* - * We only race with interrupts and NMIs on this CPU. - * If we own the commit event, then we can commit - * all others that interrupted us, since the interruptions - * are in stack format (they finish before they come - * back to us). This allows us to do a simple loop to - * assign the commit to the tail. - */ - again: - max_count = cpu_buffer->nr_pages * 100; - - while (cpu_buffer->commit_page != cpu_buffer->tail_page) { - if (RB_WARN_ON(cpu_buffer, !(--max_count))) - return; - if (RB_WARN_ON(cpu_buffer, - rb_is_reader_page(cpu_buffer->tail_page))) - return; - local_set(&cpu_buffer->commit_page->page->commit, - rb_page_write(cpu_buffer->commit_page)); - rb_inc_page(cpu_buffer, &cpu_buffer->commit_page); - cpu_buffer->write_stamp = - cpu_buffer->commit_page->page->time_stamp; - /* add barrier to keep gcc from optimizing too much */ - barrier(); - } - while (rb_commit_index(cpu_buffer) != - rb_page_write(cpu_buffer->commit_page)) { - - local_set(&cpu_buffer->commit_page->page->commit, - rb_page_write(cpu_buffer->commit_page)); - RB_WARN_ON(cpu_buffer, - local_read(&cpu_buffer->commit_page->page->commit) & - ~RB_WRITE_MASK); - barrier(); - } - - /* again, keep gcc from optimizing */ - barrier(); - - /* - * If an interrupt came in just after the first while loop - * and pushed the tail page forward, we will be left with - * a dangling commit that will never go forward. - */ - if (unlikely(cpu_buffer->commit_page != cpu_buffer->tail_page)) - goto again; -} - static void rb_reset_reader_page(struct ring_buffer_per_cpu *cpu_buffer) { cpu_buffer->read_stamp = cpu_buffer->reader_page->page->time_stamp; @@ -1968,64 +1912,6 @@ static void rb_inc_iter(struct ring_buffer_iter *iter) iter->head = 0; } -/* Slow path, do not inline */ -static noinline struct ring_buffer_event * -rb_add_time_stamp(struct ring_buffer_event *event, u64 delta) -{ - event->type_len = RINGBUF_TYPE_TIME_EXTEND; - - /* Not the first event on the page? */ - if (rb_event_index(event)) { - event->time_delta = delta & TS_MASK; - event->array[0] = delta >> TS_SHIFT; - } else { - /* nope, just zero it */ - event->time_delta = 0; - event->array[0] = 0; - } - - return skip_time_extend(event); -} - -/** - * rb_update_event - update event type and data - * @event: the event to update - * @type: the type of event - * @length: the size of the event field in the ring buffer - * - * Update the type and data fields of the event. The length - * is the actual size that is written to the ring buffer, - * and with this, we can determine what to place into the - * data field. - */ -static void -rb_update_event(struct ring_buffer_per_cpu *cpu_buffer, - struct ring_buffer_event *event, unsigned length, - int add_timestamp, u64 delta) -{ - /* Only a commit updates the timestamp */ - if (unlikely(!rb_event_is_commit(cpu_buffer, event))) - delta = 0; - - /* - * If we need to add a timestamp, then we - * add it to the start of the resevered space. - */ - if (unlikely(add_timestamp)) { - event = rb_add_time_stamp(event, delta); - length -= RB_LEN_TIME_EXTEND; - delta = 0; - } - - event->time_delta = delta; - length -= RB_EVNT_HDR_SIZE; - if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT) { - event->type_len = 0; - event->array[0] = length; - } else - event->type_len = DIV_ROUND_UP(length, RB_ALIGNMENT); -} - /* * rb_handle_head_page - writer hit the head page * @@ -2184,29 +2070,13 @@ rb_handle_head_page(struct ring_buffer_per_cpu *cpu_buffer, return 0; } -static unsigned rb_calculate_event_length(unsigned length) -{ - struct ring_buffer_event event; /* Used only for sizeof array */ - - /* zero length can cause confusions */ - if (!length) - length++; - - if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT) - length += sizeof(event.array[0]); - - length += RB_EVNT_HDR_SIZE; - length = ALIGN(length, RB_ARCH_ALIGNMENT); - - return length; -} - static inline void rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer, - struct buffer_page *tail_page, - unsigned long tail, unsigned long length) + unsigned long tail, struct rb_event_info *info) { + struct buffer_page *tail_page = info->tail_page; struct ring_buffer_event *event; + unsigned long length = info->length; /* * Only the event that crossed the page boundary @@ -2276,13 +2146,14 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer, */ static noinline struct ring_buffer_event * rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, - unsigned long length, unsigned long tail, - struct buffer_page *tail_page, u64 ts) + unsigned long tail, struct rb_event_info *info) { + struct buffer_page *tail_page = info->tail_page; struct buffer_page *commit_page = cpu_buffer->commit_page; struct ring_buffer *buffer = cpu_buffer->buffer; struct buffer_page *next_page; int ret; + u64 ts; next_page = tail_page; @@ -2368,74 +2239,120 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, out_again: - rb_reset_tail(cpu_buffer, tail_page, tail, length); + rb_reset_tail(cpu_buffer, tail, info); /* fail and let the caller try again */ return ERR_PTR(-EAGAIN); out_reset: /* reset write */ - rb_reset_tail(cpu_buffer, tail_page, tail, length); + rb_reset_tail(cpu_buffer, tail, info); return NULL; } -static struct ring_buffer_event * -__rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, - unsigned long length, u64 ts, - u64 delta, int add_timestamp) +/* Slow path, do not inline */ +static noinline struct ring_buffer_event * +rb_add_time_stamp(struct ring_buffer_event *event, u64 delta) { - struct buffer_page *tail_page; - struct ring_buffer_event *event; - unsigned long tail, write; + event->type_len = RINGBUF_TYPE_TIME_EXTEND; - /* - * If the time delta since the last event is too big to - * hold in the time field of the event, then we append a - * TIME EXTEND event ahead of the data event. - */ - if (unlikely(add_timestamp)) - length += RB_LEN_TIME_EXTEND; + /* Not the first event on the page? */ + if (rb_event_index(event)) { + event->time_delta = delta & TS_MASK; + event->array[0] = delta >> TS_SHIFT; + } else { + /* nope, just zero it */ + event->time_delta = 0; + event->array[0] = 0; + } - tail_page = cpu_buffer->tail_page; - write = local_add_return(length, &tail_page->write); + return skip_time_extend(event); +} - /* set write to only the index of the write */ - write &= RB_WRITE_MASK; - tail = write - length; +static inline int rb_event_is_commit(struct ring_buffer_per_cpu *cpu_buffer, + struct ring_buffer_event *event); + +/** + * rb_update_event - update event type and data + * @event: the event to update + * @type: the type of event + * @length: the size of the event field in the ring buffer + * + * Update the type and data fields of the event. The length + * is the actual size that is written to the ring buffer, + * and with this, we can determine what to place into the + * data field. + */ +static void +rb_update_event(struct ring_buffer_per_cpu *cpu_buffer, + struct ring_buffer_event *event, + struct rb_event_info *info) +{ + unsigned length = info->length; + u64 delta = info->delta; + + /* Only a commit updates the timestamp */ + if (unlikely(!rb_event_is_commit(cpu_buffer, event))) + delta = 0; /* - * If this is the first commit on the page, then it has the same - * timestamp as the page itself. + * If we need to add a timestamp, then we + * add it to the start of the resevered space. */ - if (!tail) + if (unlikely(info->add_timestamp)) { + event = rb_add_time_stamp(event, delta); + length -= RB_LEN_TIME_EXTEND; delta = 0; + } - /* See if we shot pass the end of this buffer page */ - if (unlikely(write > BUF_PAGE_SIZE)) - return rb_move_tail(cpu_buffer, length, tail, - tail_page, ts); + event->time_delta = delta; + length -= RB_EVNT_HDR_SIZE; + if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT) { + event->type_len = 0; + event->array[0] = length; + } else + event->type_len = DIV_ROUND_UP(length, RB_ALIGNMENT); +} - /* We reserved something on the buffer */ +static unsigned rb_calculate_event_length(unsigned length) +{ + struct ring_buffer_event event; /* Used only for sizeof array */ - event = __rb_page_index(tail_page, tail); - kmemcheck_annotate_bitfield(event, bitfield); - rb_update_event(cpu_buffer, event, length, add_timestamp, delta); + /* zero length can cause confusions */ + if (!length) + length++; - local_inc(&tail_page->entries); + if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT) + length += sizeof(event.array[0]); + + length += RB_EVNT_HDR_SIZE; + length = ALIGN(length, RB_ARCH_ALIGNMENT); /* - * If this is the first commit on the page, then update - * its timestamp. + * In case the time delta is larger than the 27 bits for it + * in the header, we need to add a timestamp. If another + * event comes in when trying to discard this one to increase + * the length, then the timestamp will be added in the allocated + * space of this event. If length is bigger than the size needed + * for the TIME_EXTEND, then padding has to be used. The events + * length must be either RB_LEN_TIME_EXTEND, or greater than or equal + * to RB_LEN_TIME_EXTEND + 8, as 8 is the minimum size for padding. + * As length is a multiple of 4, we only need to worry if it + * is 12 (RB_LEN_TIME_EXTEND + 4). */ - if (!tail) - tail_page->page->time_stamp = ts; + if (length == RB_LEN_TIME_EXTEND + RB_ALIGNMENT) + length += RB_ALIGNMENT; - /* account for these added bytes */ - local_add(length, &cpu_buffer->entries_bytes); + return length; +} - return event; +#ifndef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK +static inline bool sched_clock_stable(void) +{ + return true; } +#endif static inline int rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer, @@ -2483,6 +2400,59 @@ static void rb_start_commit(struct ring_buffer_per_cpu *cpu_buffer) local_inc(&cpu_buffer->commits); } +static void +rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer) +{ + unsigned long max_count; + + /* + * We only race with interrupts and NMIs on this CPU. + * If we own the commit event, then we can commit + * all others that interrupted us, since the interruptions + * are in stack format (they finish before they come + * back to us). This allows us to do a simple loop to + * assign the commit to the tail. + */ + again: + max_count = cpu_buffer->nr_pages * 100; + + while (cpu_buffer->commit_page != cpu_buffer->tail_page) { + if (RB_WARN_ON(cpu_buffer, !(--max_count))) + return; + if (RB_WARN_ON(cpu_buffer, + rb_is_reader_page(cpu_buffer->tail_page))) + return; + local_set(&cpu_buffer->commit_page->page->commit, + rb_page_write(cpu_buffer->commit_page)); + rb_inc_page(cpu_buffer, &cpu_buffer->commit_page); + cpu_buffer->write_stamp = + cpu_buffer->commit_page->page->time_stamp; + /* add barrier to keep gcc from optimizing too much */ + barrier(); + } + while (rb_commit_index(cpu_buffer) != + rb_page_write(cpu_buffer->commit_page)) { + + local_set(&cpu_buffer->commit_page->page->commit, + rb_page_write(cpu_buffer->commit_page)); + RB_WARN_ON(cpu_buffer, + local_read(&cpu_buffer->commit_page->page->commit) & + ~RB_WRITE_MASK); + barrier(); + } + + /* again, keep gcc from optimizing */ + barrier(); + + /* + * If an interrupt came in just after the first while loop + * and pushed the tail page forward, we will be left with + * a dangling commit that will never go forward. + */ + if (unlikely(cpu_buffer->commit_page != cpu_buffer->tail_page)) + goto again; +} + static inline void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer) { unsigned long commits; @@ -2515,91 +2485,94 @@ static inline void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer) } } -static struct ring_buffer_event * -rb_reserve_next_event(struct ring_buffer *buffer, - struct ring_buffer_per_cpu *cpu_buffer, - unsigned long length) +static inline void rb_event_discard(struct ring_buffer_event *event) { - struct ring_buffer_event *event; - u64 ts, delta; - int nr_loops = 0; - int add_timestamp; - u64 diff; + if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) + event = skip_time_extend(event); - rb_start_commit(cpu_buffer); + /* array[0] holds the actual length for the discarded event */ + event->array[0] = rb_event_data_length(event) - RB_EVNT_HDR_SIZE; + event->type_len = RINGBUF_TYPE_PADDING; + /* time delta must be non zero */ + if (!event->time_delta) + event->time_delta = 1; +} -#ifdef CONFIG_RING_BUFFER_ALLOW_SWAP - /* - * Due to the ability to swap a cpu buffer from a buffer - * it is possible it was swapped before we committed. - * (committing stops a swap). We check for it here and - * if it happened, we have to fail the write. - */ - barrier(); - if (unlikely(ACCESS_ONCE(cpu_buffer->buffer) != buffer)) { - local_dec(&cpu_buffer->committing); - local_dec(&cpu_buffer->commits); - return NULL; - } -#endif +static inline int +rb_event_is_commit(struct ring_buffer_per_cpu *cpu_buffer, + struct ring_buffer_event *event) +{ + unsigned long addr = (unsigned long)event; + unsigned long index; - length = rb_calculate_event_length(length); - again: - add_timestamp = 0; - delta = 0; + index = rb_event_index(event); + addr &= PAGE_MASK; + + return cpu_buffer->commit_page->page == (void *)addr && + rb_commit_index(cpu_buffer) == index; +} + +static void +rb_update_write_stamp(struct ring_buffer_per_cpu *cpu_buffer, + struct ring_buffer_event *event) +{ + u64 delta; /* - * We allow for interrupts to reenter here and do a trace. - * If one does, it will cause this original code to loop - * back here. Even with heavy interrupts happening, this - * should only happen a few times in a row. If this happens - * 1000 times in a row, there must be either an interrupt - * storm or we have something buggy. - * Bail! + * The event first in the commit queue updates the + * time stamp. */ - if (RB_WARN_ON(cpu_buffer, ++nr_loops > 1000)) - goto out_fail; + if (rb_event_is_commit(cpu_buffer, event)) { + /* + * A commit event that is first on a page + * updates the write timestamp with the page stamp + */ + if (!rb_event_index(event)) + cpu_buffer->write_stamp = + cpu_buffer->commit_page->page->time_stamp; + else if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) { + delta = event->array[0]; + delta <<= TS_SHIFT; + delta += event->time_delta; + cpu_buffer->write_stamp += delta; + } else + cpu_buffer->write_stamp += event->time_delta; + } +} - ts = rb_time_stamp(cpu_buffer->buffer); - diff = ts - cpu_buffer->write_stamp; +static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer, + struct ring_buffer_event *event) +{ + local_inc(&cpu_buffer->entries); + rb_update_write_stamp(cpu_buffer, event); + rb_end_commit(cpu_buffer); +} - /* make sure this diff is calculated here */ - barrier(); +static __always_inline void +rb_wakeups(struct ring_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer) +{ + bool pagebusy; - /* Did the write stamp get updated already? */ - if (likely(ts >= cpu_buffer->write_stamp)) { - delta = diff; - if (unlikely(test_time_stamp(delta))) { - int local_clock_stable = 1; -#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK - local_clock_stable = sched_clock_stable(); -#endif - WARN_ONCE(delta > (1ULL << 59), - KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n%s", - (unsigned long long)delta, - (unsigned long long)ts, - (unsigned long long)cpu_buffer->write_stamp, - local_clock_stable ? "" : - "If you just came from a suspend/resume,\n" - "please switch to the trace global clock:\n" - " echo global > /sys/kernel/debug/tracing/trace_clock\n"); - add_timestamp = 1; - } + if (buffer->irq_work.waiters_pending) { + buffer->irq_work.waiters_pending = false; + /* irq_work_queue() supplies it's own memory barriers */ + irq_work_queue(&buffer->irq_work.work); } - event = __rb_reserve_next(cpu_buffer, length, ts, - delta, add_timestamp); - if (unlikely(PTR_ERR(event) == -EAGAIN)) - goto again; - - if (!event) - goto out_fail; + if (cpu_buffer->irq_work.waiters_pending) { + cpu_buffer->irq_work.waiters_pending = false; + /* irq_work_queue() supplies it's own memory barriers */ + irq_work_queue(&cpu_buffer->irq_work.work); + } - return event; + pagebusy = cpu_buffer->reader_page == cpu_buffer->commit_page; - out_fail: - rb_end_commit(cpu_buffer); - return NULL; + if (!pagebusy && cpu_buffer->irq_work.full_waiters_pending) { + cpu_buffer->irq_work.wakeup_full = true; + cpu_buffer->irq_work.full_waiters_pending = false; + /* irq_work_queue() supplies it's own memory barriers */ + irq_work_queue(&cpu_buffer->irq_work.work); + } } /* @@ -2672,6 +2645,178 @@ trace_recursive_unlock(struct ring_buffer_per_cpu *cpu_buffer) } /** + * ring_buffer_unlock_commit - commit a reserved + * @buffer: The buffer to commit to + * @event: The event pointer to commit. + * + * This commits the data to the ring buffer, and releases any locks held. + * + * Must be paired with ring_buffer_lock_reserve. + */ +int ring_buffer_unlock_commit(struct ring_buffer *buffer, + struct ring_buffer_event *event) +{ + struct ring_buffer_per_cpu *cpu_buffer; + int cpu = raw_smp_processor_id(); + + cpu_buffer = buffer->buffers[cpu]; + + rb_commit(cpu_buffer, event); + + rb_wakeups(buffer, cpu_buffer); + + trace_recursive_unlock(cpu_buffer); + + preempt_enable_notrace(); + + return 0; +} +EXPORT_SYMBOL_GPL(ring_buffer_unlock_commit); + +static noinline void +rb_handle_timestamp(struct ring_buffer_per_cpu *cpu_buffer, + struct rb_event_info *info) +{ + WARN_ONCE(info->delta > (1ULL << 59), + KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n%s", + (unsigned long long)info->delta, + (unsigned long long)info->ts, + (unsigned long long)cpu_buffer->write_stamp, + sched_clock_stable() ? "" : + "If you just came from a suspend/resume,\n" + "please switch to the trace global clock:\n" + " echo global > /sys/kernel/debug/tracing/trace_clock\n"); + info->add_timestamp = 1; +} + +static struct ring_buffer_event * +__rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, + struct rb_event_info *info) +{ + struct ring_buffer_event *event; + struct buffer_page *tail_page; + unsigned long tail, write; + + /* + * If the time delta since the last event is too big to + * hold in the time field of the event, then we append a + * TIME EXTEND event ahead of the data event. + */ + if (unlikely(info->add_timestamp)) + info->length += RB_LEN_TIME_EXTEND; + + tail_page = info->tail_page = cpu_buffer->tail_page; + write = local_add_return(info->length, &tail_page->write); + + /* set write to only the index of the write */ + write &= RB_WRITE_MASK; + tail = write - info->length; + + /* + * If this is the first commit on the page, then it has the same + * timestamp as the page itself. + */ + if (!tail) + info->delta = 0; + + /* See if we shot pass the end of this buffer page */ + if (unlikely(write > BUF_PAGE_SIZE)) + return rb_move_tail(cpu_buffer, tail, info); + + /* We reserved something on the buffer */ + + event = __rb_page_index(tail_page, tail); + kmemcheck_annotate_bitfield(event, bitfield); + rb_update_event(cpu_buffer, event, info); + + local_inc(&tail_page->entries); + + /* + * If this is the first commit on the page, then update + * its timestamp. + */ + if (!tail) + tail_page->page->time_stamp = info->ts; + + /* account for these added bytes */ + local_add(info->length, &cpu_buffer->entries_bytes); + + return event; +} + +static struct ring_buffer_event * +rb_reserve_next_event(struct ring_buffer *buffer, + struct ring_buffer_per_cpu *cpu_buffer, + unsigned long length) +{ + struct ring_buffer_event *event; + struct rb_event_info info; + int nr_loops = 0; + u64 diff; + + rb_start_commit(cpu_buffer); + +#ifdef CONFIG_RING_BUFFER_ALLOW_SWAP + /* + * Due to the ability to swap a cpu buffer from a buffer + * it is possible it was swapped before we committed. + * (committing stops a swap). We check for it here and + * if it happened, we have to fail the write. + */ + barrier(); + if (unlikely(ACCESS_ONCE(cpu_buffer->buffer) != buffer)) { + local_dec(&cpu_buffer->committing); + local_dec(&cpu_buffer->commits); + return NULL; + } +#endif + + info.length = rb_calculate_event_length(length); + again: + info.add_timestamp = 0; + info.delta = 0; + + /* + * We allow for interrupts to reenter here and do a trace. + * If one does, it will cause this original code to loop + * back here. Even with heavy interrupts happening, this + * should only happen a few times in a row. If this happens + * 1000 times in a row, there must be either an interrupt + * storm or we have something buggy. + * Bail! + */ + if (RB_WARN_ON(cpu_buffer, ++nr_loops > 1000)) + goto out_fail; + + info.ts = rb_time_stamp(cpu_buffer->buffer); + diff = info.ts - cpu_buffer->write_stamp; + + /* make sure this diff is calculated here */ + barrier(); + + /* Did the write stamp get updated already? */ + if (likely(info.ts >= cpu_buffer->write_stamp)) { + info.delta = diff; + if (unlikely(test_time_stamp(info.delta))) + rb_handle_timestamp(cpu_buffer, &info); + } + + event = __rb_reserve_next(cpu_buffer, &info); + + if (unlikely(PTR_ERR(event) == -EAGAIN)) + goto again; + + if (!event) + goto out_fail; + + return event; + + out_fail: + rb_end_commit(cpu_buffer); + return NULL; +} + +/** * ring_buffer_lock_reserve - reserve a part of the buffer * @buffer: the ring buffer to reserve from * @length: the length of the data to reserve (excluding event header) @@ -2729,111 +2874,6 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length) } EXPORT_SYMBOL_GPL(ring_buffer_lock_reserve); -static void -rb_update_write_stamp(struct ring_buffer_per_cpu *cpu_buffer, - struct ring_buffer_event *event) -{ - u64 delta; - - /* - * The event first in the commit queue updates the - * time stamp. - */ - if (rb_event_is_commit(cpu_buffer, event)) { - /* - * A commit event that is first on a page - * updates the write timestamp with the page stamp - */ - if (!rb_event_index(event)) - cpu_buffer->write_stamp = - cpu_buffer->commit_page->page->time_stamp; - else if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) { - delta = event->array[0]; - delta <<= TS_SHIFT; - delta += event->time_delta; - cpu_buffer->write_stamp += delta; - } else - cpu_buffer->write_stamp += event->time_delta; - } -} - -static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer, - struct ring_buffer_event *event) -{ - local_inc(&cpu_buffer->entries); - rb_update_write_stamp(cpu_buffer, event); - rb_end_commit(cpu_buffer); -} - -static __always_inline void -rb_wakeups(struct ring_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer) -{ - bool pagebusy; - - if (buffer->irq_work.waiters_pending) { - buffer->irq_work.waiters_pending = false; - /* irq_work_queue() supplies it's own memory barriers */ - irq_work_queue(&buffer->irq_work.work); - } - - if (cpu_buffer->irq_work.waiters_pending) { - cpu_buffer->irq_work.waiters_pending = false; - /* irq_work_queue() supplies it's own memory barriers */ - irq_work_queue(&cpu_buffer->irq_work.work); - } - - pagebusy = cpu_buffer->reader_page == cpu_buffer->commit_page; - - if (!pagebusy && cpu_buffer->irq_work.full_waiters_pending) { - cpu_buffer->irq_work.wakeup_full = true; - cpu_buffer->irq_work.full_waiters_pending = false; - /* irq_work_queue() supplies it's own memory barriers */ - irq_work_queue(&cpu_buffer->irq_work.work); - } -} - -/** - * ring_buffer_unlock_commit - commit a reserved - * @buffer: The buffer to commit to - * @event: The event pointer to commit. - * - * This commits the data to the ring buffer, and releases any locks held. - * - * Must be paired with ring_buffer_lock_reserve. - */ -int ring_buffer_unlock_commit(struct ring_buffer *buffer, - struct ring_buffer_event *event) -{ - struct ring_buffer_per_cpu *cpu_buffer; - int cpu = raw_smp_processor_id(); - - cpu_buffer = buffer->buffers[cpu]; - - rb_commit(cpu_buffer, event); - - rb_wakeups(buffer, cpu_buffer); - - trace_recursive_unlock(cpu_buffer); - - preempt_enable_notrace(); - - return 0; -} -EXPORT_SYMBOL_GPL(ring_buffer_unlock_commit); - -static inline void rb_event_discard(struct ring_buffer_event *event) -{ - if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) - event = skip_time_extend(event); - - /* array[0] holds the actual length for the discarded event */ - event->array[0] = rb_event_data_length(event) - RB_EVNT_HDR_SIZE; - event->type_len = RINGBUF_TYPE_PADDING; - /* time delta must be non zero */ - if (!event->time_delta) - event->time_delta = 1; -} - /* * Decrement the entries to the page that an event is on. * The event does not even need to exist, only the pointer diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index abcbf7ff8743..6e79408674aa 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3035,7 +3035,7 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot) if (!iter) return ERR_PTR(-ENOMEM); - iter->buffer_iter = kzalloc(sizeof(*iter->buffer_iter) * num_possible_cpus(), + iter->buffer_iter = kcalloc(nr_cpu_ids, sizeof(*iter->buffer_iter), GFP_KERNEL); if (!iter->buffer_iter) goto release; @@ -6990,7 +6990,7 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) trace_init_global_iter(&iter); for_each_tracing_cpu(cpu) { - atomic_inc(&per_cpu_ptr(iter.tr->trace_buffer.data, cpu)->disabled); + atomic_inc(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled); } old_userobj = trace_flags & TRACE_ITER_SYM_USEROBJ; diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 404a372ad85a..7ca09cdc20c2 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -30,6 +30,7 @@ DEFINE_MUTEX(event_mutex); LIST_HEAD(ftrace_events); +static LIST_HEAD(ftrace_generic_fields); static LIST_HEAD(ftrace_common_fields); #define GFP_TRACE (GFP_KERNEL | __GFP_ZERO) @@ -94,6 +95,10 @@ trace_find_event_field(struct trace_event_call *call, char *name) struct ftrace_event_field *field; struct list_head *head; + field = __find_event_field(&ftrace_generic_fields, name); + if (field) + return field; + field = __find_event_field(&ftrace_common_fields, name); if (field) return field; @@ -144,6 +149,13 @@ int trace_define_field(struct trace_event_call *call, const char *type, } EXPORT_SYMBOL_GPL(trace_define_field); +#define __generic_field(type, item, filter_type) \ + ret = __trace_define_field(&ftrace_generic_fields, #type, \ + #item, 0, 0, is_signed_type(type), \ + filter_type); \ + if (ret) \ + return ret; + #define __common_field(type, item) \ ret = __trace_define_field(&ftrace_common_fields, #type, \ "common_" #item, \ @@ -153,6 +165,16 @@ EXPORT_SYMBOL_GPL(trace_define_field); if (ret) \ return ret; +static int trace_define_generic_fields(void) +{ + int ret; + + __generic_field(int, cpu, FILTER_OTHER); + __generic_field(char *, comm, FILTER_PTR_STRING); + + return ret; +} + static int trace_define_common_fields(void) { int ret; @@ -2671,6 +2693,9 @@ static __init int event_trace_init(void) if (!entry) pr_warn("Could not create tracefs 'available_events' entry\n"); + if (trace_define_generic_fields()) + pr_warn("tracing: Failed to allocated generic fields"); + if (trace_define_common_fields()) pr_warn("tracing: Failed to allocate common fields"); diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index d81d6f302b14..bd1bf184c5c9 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -252,6 +252,50 @@ static int filter_pred_strloc(struct filter_pred *pred, void *event) return match; } +/* Filter predicate for CPUs. */ +static int filter_pred_cpu(struct filter_pred *pred, void *event) +{ + int cpu, cmp; + int match = 0; + + cpu = raw_smp_processor_id(); + cmp = pred->val; + + switch (pred->op) { + case OP_EQ: + match = cpu == cmp; + break; + case OP_LT: + match = cpu < cmp; + break; + case OP_LE: + match = cpu <= cmp; + break; + case OP_GT: + match = cpu > cmp; + break; + case OP_GE: + match = cpu >= cmp; + break; + default: + break; + } + + return !!match == !pred->not; +} + +/* Filter predicate for COMM. */ +static int filter_pred_comm(struct filter_pred *pred, void *event) +{ + int cmp, match; + + cmp = pred->regex.match(current->comm, &pred->regex, + pred->regex.field_len); + match = cmp ^ pred->not; + + return match; +} + static int filter_pred_none(struct filter_pred *pred, void *event) { return 0; @@ -1002,7 +1046,10 @@ static int init_pred(struct filter_parse_state *ps, if (is_string_field(field)) { filter_build_regex(pred); - if (field->filter_type == FILTER_STATIC_STRING) { + if (!strcmp(field->name, "comm")) { + fn = filter_pred_comm; + pred->regex.field_len = TASK_COMM_LEN; + } else if (field->filter_type == FILTER_STATIC_STRING) { fn = filter_pred_string; pred->regex.field_len = field->size; } else if (field->filter_type == FILTER_DYN_STRING) @@ -1025,7 +1072,10 @@ static int init_pred(struct filter_parse_state *ps, } pred->val = val; - fn = select_comparison_fn(pred->op, field->size, + if (!strcmp(field->name, "cpu")) + fn = filter_pred_cpu; + else + fn = select_comparison_fn(pred->op, field->size, field->is_signed); if (!fn) { parse_error(ps, FILT_ERR_INVALID_OP, 0); diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 8968bf720c12..ca98445782ac 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -715,13 +715,13 @@ trace_print_graph_duration(unsigned long long duration, struct trace_seq *s) snprintf(nsecs_str, slen, "%03lu", nsecs_rem); trace_seq_printf(s, ".%s", nsecs_str); - len += strlen(nsecs_str); + len += strlen(nsecs_str) + 1; } trace_seq_puts(s, " us "); /* Print remaining spaces to fit the row's width */ - for (i = len; i < 7; i++) + for (i = len; i < 8; i++) trace_seq_putc(s, ' '); } diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index dfab253727dc..8e481a84aeea 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -496,6 +496,8 @@ static const struct trace_mark { char sym; } mark[] = { MARK(1000000000ULL , '$'), /* 1 sec */ + MARK(100000000ULL , '@'), /* 100 msec */ + MARK(10000000ULL , '*'), /* 10 msec */ MARK(1000000ULL , '#'), /* 1000 usecs */ MARK(100000ULL , '!'), /* 100 usecs */ MARK(10000ULL , '+'), /* 10 usecs */ @@ -508,7 +510,7 @@ char trace_find_mark(unsigned long long d) int size = ARRAY_SIZE(mark); for (i = 0; i < size; i++) { - if (d >= mark[i].val) + if (d > mark[i].val) break; } diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 3f34496244e9..b746399ab59c 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -18,12 +18,6 @@ #define STACK_TRACE_ENTRIES 500 -#ifdef CC_USING_FENTRY -# define fentry 1 -#else -# define fentry 0 -#endif - static unsigned long stack_dump_trace[STACK_TRACE_ENTRIES+1] = { [0 ... (STACK_TRACE_ENTRIES)] = ULONG_MAX }; static unsigned stack_dump_index[STACK_TRACE_ENTRIES]; @@ -35,7 +29,7 @@ static unsigned stack_dump_index[STACK_TRACE_ENTRIES]; */ static struct stack_trace max_stack_trace = { .max_entries = STACK_TRACE_ENTRIES - 1, - .entries = &stack_dump_trace[1], + .entries = &stack_dump_trace[0], }; static unsigned long max_stack_size; @@ -55,7 +49,7 @@ static inline void print_max_stack(void) pr_emerg(" Depth Size Location (%d entries)\n" " ----- ---- --------\n", - max_stack_trace.nr_entries - 1); + max_stack_trace.nr_entries); for (i = 0; i < max_stack_trace.nr_entries; i++) { if (stack_dump_trace[i] == ULONG_MAX) @@ -77,7 +71,7 @@ check_stack(unsigned long ip, unsigned long *stack) unsigned long this_size, flags; unsigned long *p, *top, *start; static int tracer_frame; int frame_size = ACCESS_ONCE(tracer_frame); - int i; + int i, x; this_size = ((unsigned long)stack) & (THREAD_SIZE-1); this_size = THREAD_SIZE - this_size; @@ -105,26 +99,20 @@ check_stack(unsigned long ip, unsigned long *stack) max_stack_size = this_size; max_stack_trace.nr_entries = 0; - - if (using_ftrace_ops_list_func()) - max_stack_trace.skip = 4; - else - max_stack_trace.skip = 3; + max_stack_trace.skip = 3; save_stack_trace(&max_stack_trace); - /* - * Add the passed in ip from the function tracer. - * Searching for this on the stack will skip over - * most of the overhead from the stack tracer itself. - */ - stack_dump_trace[0] = ip; - max_stack_trace.nr_entries++; + /* Skip over the overhead of the stack tracer itself */ + for (i = 0; i < max_stack_trace.nr_entries; i++) { + if (stack_dump_trace[i] == ip) + break; + } /* * Now find where in the stack these are. */ - i = 0; + x = 0; start = stack; top = (unsigned long *) (((unsigned long)start & ~(THREAD_SIZE-1)) + THREAD_SIZE); @@ -139,12 +127,15 @@ check_stack(unsigned long ip, unsigned long *stack) while (i < max_stack_trace.nr_entries) { int found = 0; - stack_dump_index[i] = this_size; + stack_dump_index[x] = this_size; p = start; for (; p < top && i < max_stack_trace.nr_entries; p++) { + if (stack_dump_trace[i] == ULONG_MAX) + break; if (*p == stack_dump_trace[i]) { - this_size = stack_dump_index[i++] = + stack_dump_trace[x] = stack_dump_trace[i++]; + this_size = stack_dump_index[x++] = (top - p) * sizeof(unsigned long); found = 1; /* Start the search from here */ @@ -156,7 +147,7 @@ check_stack(unsigned long ip, unsigned long *stack) * out what that is, then figure it out * now. */ - if (unlikely(!tracer_frame) && i == 1) { + if (unlikely(!tracer_frame)) { tracer_frame = (p - stack) * sizeof(unsigned long); max_stack_size -= tracer_frame; @@ -168,6 +159,10 @@ check_stack(unsigned long ip, unsigned long *stack) i++; } + max_stack_trace.nr_entries = x; + for (; x < i; x++) + stack_dump_trace[x] = ULONG_MAX; + if (task_stack_end_corrupted(current)) { print_max_stack(); BUG(); @@ -192,24 +187,7 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip, if (per_cpu(trace_active, cpu)++ != 0) goto out; - /* - * When fentry is used, the traced function does not get - * its stack frame set up, and we lose the parent. - * The ip is pretty useless because the function tracer - * was called before that function set up its stack frame. - * In this case, we use the parent ip. - * - * By adding the return address of either the parent ip - * or the current ip we can disregard most of the stack usage - * caused by the stack tracer itself. - * - * The function tracer always reports the address of where the - * mcount call was, but the stack will hold the return address. - */ - if (fentry) - ip = parent_ip; - else - ip += MCOUNT_INSN_SIZE; + ip += MCOUNT_INSN_SIZE; check_stack(ip, &stack); @@ -284,7 +262,7 @@ __next(struct seq_file *m, loff_t *pos) { long n = *pos - 1; - if (n >= max_stack_trace.nr_entries || stack_dump_trace[n] == ULONG_MAX) + if (n > max_stack_trace.nr_entries || stack_dump_trace[n] == ULONG_MAX) return NULL; m->private = (void *)n; @@ -354,7 +332,7 @@ static int t_show(struct seq_file *m, void *v) seq_printf(m, " Depth Size Location" " (%d entries)\n" " ----- ---- --------\n", - max_stack_trace.nr_entries - 1); + max_stack_trace.nr_entries); if (!stack_tracer_enabled && !max_stack_size) print_disabled(m); |