summaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Makefile3
-rw-r--r--kernel/acct.c2
-rw-r--r--kernel/audit.c4
-rw-r--r--kernel/audit.h2
-rw-r--r--kernel/auditsc.c32
-rw-r--r--kernel/bpf/Makefile8
-rw-r--r--kernel/bpf/arraymap.c33
-rw-r--r--kernel/bpf/bpf_lru_list.h3
-rw-r--r--kernel/bpf/core.c63
-rw-r--r--kernel/bpf/devmap.c411
-rw-r--r--kernel/bpf/hashtab.c60
-rw-r--r--kernel/bpf/lpm_trie.c9
-rw-r--r--kernel/bpf/sockmap.c873
-rw-r--r--kernel/bpf/stackmap.c8
-rw-r--r--kernel/bpf/syscall.c144
-rw-r--r--kernel/bpf/tnum.c180
-rw-r--r--kernel/bpf/verifier.c2472
-rw-r--r--kernel/cgroup/cgroup-internal.h12
-rw-r--r--kernel/cgroup/cgroup-v1.c75
-rw-r--r--kernel/cgroup/cgroup.c973
-rw-r--r--kernel/cgroup/cpuset.c64
-rw-r--r--kernel/cgroup/debug.c53
-rw-r--r--kernel/cgroup/freezer.c6
-rw-r--r--kernel/cgroup/pids.c1
-rw-r--r--kernel/compat.c23
-rw-r--r--kernel/cpu.c506
-rw-r--r--kernel/cpu_pm.c50
-rw-r--r--kernel/events/core.c14
-rw-r--r--kernel/events/ring_buffer.c20
-rw-r--r--kernel/exit.c27
-rw-r--r--kernel/extable.c45
-rw-r--r--kernel/fork.c25
-rw-r--r--kernel/futex.c33
-rw-r--r--kernel/irq/chip.c2
-rw-r--r--kernel/irq/generic-chip.c1
-rw-r--r--kernel/irq/irqdesc.c24
-rw-r--r--kernel/irq/irqdomain.c15
-rw-r--r--kernel/irq/manage.c4
-rw-r--r--kernel/irq/msi.c5
-rw-r--r--kernel/kcmp.c2
-rw-r--r--kernel/kcov.c1
-rw-r--r--kernel/kmod.c563
-rw-r--r--kernel/locking/rtmutex-debug.c2
-rw-r--r--kernel/locking/rtmutex.c35
-rw-r--r--kernel/locking/rtmutex_common.h12
-rw-r--r--kernel/locking/rwsem-xadd.c27
-rw-r--r--kernel/locking/test-ww_mutex.c2
-rw-r--r--kernel/memremap.c116
-rw-r--r--kernel/module.c12
-rw-r--r--kernel/params.c35
-rw-r--r--kernel/pid_namespace.c4
-rw-r--r--kernel/power/hibernate.c29
-rw-r--r--kernel/power/main.c64
-rw-r--r--kernel/power/power.h5
-rw-r--r--kernel/power/process.c5
-rw-r--r--kernel/power/suspend.c184
-rw-r--r--kernel/power/suspend_test.c4
-rw-r--r--kernel/power/swap.c5
-rw-r--r--kernel/printk/printk.c72
-rw-r--r--kernel/ptrace.c6
-rw-r--r--kernel/rcu/tree.c12
-rw-r--r--kernel/rcu/tree_plugin.h2
-rw-r--r--kernel/sched/core.c35
-rw-r--r--kernel/sched/cpufreq_schedutil.c98
-rw-r--r--kernel/sched/deadline.c52
-rw-r--r--kernel/sched/debug.c9
-rw-r--r--kernel/sched/fair.c64
-rw-r--r--kernel/sched/idle.c8
-rw-r--r--kernel/sched/rt.c2
-rw-r--r--kernel/sched/sched.h21
-rw-r--r--kernel/sched/topology.c6
-rw-r--r--kernel/sched/wait.c85
-rw-r--r--kernel/seccomp.c344
-rw-r--r--kernel/signal.c72
-rw-r--r--kernel/smp.c2
-rw-r--r--kernel/sys.c8
-rw-r--r--kernel/sysctl.c5
-rw-r--r--kernel/sysctl_binary.c21
-rw-r--r--kernel/time/timekeeping.c2
-rw-r--r--kernel/time/timekeeping_debug.c5
-rw-r--r--kernel/trace/blktrace.c279
-rw-r--r--kernel/trace/ftrace.c82
-rw-r--r--kernel/trace/trace.c32
-rw-r--r--kernel/trace/trace.h3
-rw-r--r--kernel/trace/trace_events.c17
-rw-r--r--kernel/trace/trace_events_filter.c2
-rw-r--r--kernel/trace/trace_functions_graph.c2
-rw-r--r--kernel/trace/trace_mmiotrace.c1
-rw-r--r--kernel/trace/trace_output.c21
-rw-r--r--kernel/trace/trace_sched_wakeup.c8
-rw-r--r--kernel/trace/trace_selftest.c2
-rw-r--r--kernel/trace/trace_stack.c15
-rw-r--r--kernel/trace/trace_syscalls.c53
-rw-r--r--kernel/umh.c568
-rw-r--r--kernel/user_namespace.c20
-rw-r--r--kernel/workqueue.c6
96 files changed, 6691 insertions, 2743 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 9c323a6daa46..ed470aac53da 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -5,12 +5,13 @@
obj-y = fork.o exec_domain.o panic.o \
cpu.o exit.o softirq.o resource.o \
sysctl.o sysctl_binary.o capability.o ptrace.o user.o \
- signal.o sys.o kmod.o workqueue.o pid.o task_work.o \
+ signal.o sys.o umh.o workqueue.o pid.o task_work.o \
extable.o params.o \
kthread.o sys_ni.o nsproxy.o \
notifier.o ksysfs.o cred.o reboot.o \
async.o range.o smpboot.o ucount.o
+obj-$(CONFIG_MODULES) += kmod.o
obj-$(CONFIG_MULTIUSER) += groups.o
ifdef CONFIG_FUNCTION_TRACER
diff --git a/kernel/acct.c b/kernel/acct.c
index 5b1284370367..5e72af29ab73 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -516,7 +516,7 @@ static void do_acct_process(struct bsd_acct_struct *acct)
if (file_start_write_trylock(file)) {
/* it's been opened O_APPEND, so position is irrelevant */
loff_t pos = 0;
- __kernel_write(file, (char *)&ac, sizeof(acct_t), &pos);
+ __kernel_write(file, &ac, sizeof(acct_t), &pos);
file_end_write(file);
}
out:
diff --git a/kernel/audit.c b/kernel/audit.c
index 6dd556931739..be1c28fd4d57 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -1662,7 +1662,7 @@ static inline void audit_get_stamp(struct audit_context *ctx,
struct timespec64 *t, unsigned int *serial)
{
if (!ctx || !auditsc_get_stamp(ctx, t, serial)) {
- ktime_get_real_ts64(t);
+ *t = current_kernel_time64();
*serial = audit_serial();
}
}
@@ -1833,7 +1833,7 @@ void audit_log_format(struct audit_buffer *ab, const char *fmt, ...)
}
/**
- * audit_log_hex - convert a buffer to hex and append it to the audit skb
+ * audit_log_n_hex - convert a buffer to hex and append it to the audit skb
* @ab: the audit_buffer
* @buf: buffer to convert to hex
* @len: length of @buf to be converted
diff --git a/kernel/audit.h b/kernel/audit.h
index b331d9b83f63..9b110ae17ee3 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -182,7 +182,7 @@ struct audit_context {
mqd_t mqdes;
size_t msg_len;
unsigned int msg_prio;
- struct timespec abs_timeout;
+ struct timespec64 abs_timeout;
} mq_sendrecv;
struct {
int oflag;
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 3260ba2312a9..ecc23e25c9eb 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -1235,11 +1235,11 @@ static void show_special(struct audit_context *context, int *call_panic)
case AUDIT_MQ_SENDRECV:
audit_log_format(ab,
"mqdes=%d msg_len=%zd msg_prio=%u "
- "abs_timeout_sec=%ld abs_timeout_nsec=%ld",
+ "abs_timeout_sec=%lld abs_timeout_nsec=%ld",
context->mq_sendrecv.mqdes,
context->mq_sendrecv.msg_len,
context->mq_sendrecv.msg_prio,
- context->mq_sendrecv.abs_timeout.tv_sec,
+ (long long) context->mq_sendrecv.abs_timeout.tv_sec,
context->mq_sendrecv.abs_timeout.tv_nsec);
break;
case AUDIT_MQ_NOTIFY:
@@ -1462,7 +1462,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
}
/**
- * audit_free - free a per-task audit context
+ * __audit_free - free a per-task audit context
* @tsk: task whose audit context block to free
*
* Called from copy_process and do_exit
@@ -1489,7 +1489,7 @@ void __audit_free(struct task_struct *tsk)
}
/**
- * audit_syscall_entry - fill in an audit record at syscall entry
+ * __audit_syscall_entry - fill in an audit record at syscall entry
* @major: major syscall type (function)
* @a1: additional syscall register 1
* @a2: additional syscall register 2
@@ -1536,14 +1536,14 @@ void __audit_syscall_entry(int major, unsigned long a1, unsigned long a2,
return;
context->serial = 0;
- ktime_get_real_ts64(&context->ctime);
+ context->ctime = current_kernel_time64();
context->in_syscall = 1;
context->current_state = state;
context->ppid = 0;
}
/**
- * audit_syscall_exit - deallocate audit context after a system call
+ * __audit_syscall_exit - deallocate audit context after a system call
* @success: success value of the syscall
* @return_code: return value of the syscall
*
@@ -1705,7 +1705,7 @@ static struct audit_names *audit_alloc_name(struct audit_context *context,
}
/**
- * audit_reusename - fill out filename with info from existing entry
+ * __audit_reusename - fill out filename with info from existing entry
* @uptr: userland ptr to pathname
*
* Search the audit_names list for the current audit context. If there is an
@@ -1730,7 +1730,7 @@ __audit_reusename(const __user char *uptr)
}
/**
- * audit_getname - add a name to the list
+ * __audit_getname - add a name to the list
* @name: name to add
*
* Add a name to the list of audit names for this context.
@@ -2083,15 +2083,15 @@ void __audit_mq_open(int oflag, umode_t mode, struct mq_attr *attr)
*
*/
void __audit_mq_sendrecv(mqd_t mqdes, size_t msg_len, unsigned int msg_prio,
- const struct timespec *abs_timeout)
+ const struct timespec64 *abs_timeout)
{
struct audit_context *context = current->audit_context;
- struct timespec *p = &context->mq_sendrecv.abs_timeout;
+ struct timespec64 *p = &context->mq_sendrecv.abs_timeout;
if (abs_timeout)
- memcpy(p, abs_timeout, sizeof(struct timespec));
+ memcpy(p, abs_timeout, sizeof(*p));
else
- memset(p, 0, sizeof(struct timespec));
+ memset(p, 0, sizeof(*p));
context->mq_sendrecv.mqdes = mqdes;
context->mq_sendrecv.msg_len = msg_len;
@@ -2135,7 +2135,7 @@ void __audit_mq_getsetattr(mqd_t mqdes, struct mq_attr *mqstat)
}
/**
- * audit_ipc_obj - record audit data for ipc object
+ * __audit_ipc_obj - record audit data for ipc object
* @ipcp: ipc permissions
*
*/
@@ -2151,7 +2151,7 @@ void __audit_ipc_obj(struct kern_ipc_perm *ipcp)
}
/**
- * audit_ipc_set_perm - record audit data for new ipc permissions
+ * __audit_ipc_set_perm - record audit data for new ipc permissions
* @qbytes: msgq bytes
* @uid: msgq user id
* @gid: msgq group id
@@ -2180,7 +2180,7 @@ void __audit_bprm(struct linux_binprm *bprm)
/**
- * audit_socketcall - record audit data for sys_socketcall
+ * __audit_socketcall - record audit data for sys_socketcall
* @nargs: number of args, which should not be more than AUDITSC_ARGS.
* @args: args array
*
@@ -2211,7 +2211,7 @@ void __audit_fd_pair(int fd1, int fd2)
}
/**
- * audit_sockaddr - record audit data for sys_bind, sys_connect, sys_sendto
+ * __audit_sockaddr - record audit data for sys_bind, sys_connect, sys_sendto
* @len: data length in user space
* @a: data address in kernel space
*
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index e1e5e658f2db..897daa005b23 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -1,7 +1,13 @@
obj-y := core.o
-obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o
+obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o
obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o
+ifeq ($(CONFIG_NET),y)
+obj-$(CONFIG_BPF_SYSCALL) += devmap.o
+ifeq ($(CONFIG_STREAM_PARSER),y)
+obj-$(CONFIG_BPF_SYSCALL) += sockmap.o
+endif
+endif
ifeq ($(CONFIG_PERF_EVENTS),y)
obj-$(CONFIG_BPF_SYSCALL) += stackmap.o
endif
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index d771a3872500..98c0f00c3f5e 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -49,13 +49,15 @@ static int bpf_array_alloc_percpu(struct bpf_array *array)
static struct bpf_map *array_map_alloc(union bpf_attr *attr)
{
bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY;
+ int numa_node = bpf_map_attr_numa_node(attr);
struct bpf_array *array;
u64 array_size;
u32 elem_size;
/* check sanity of attributes */
if (attr->max_entries == 0 || attr->key_size != 4 ||
- attr->value_size == 0 || attr->map_flags)
+ attr->value_size == 0 || attr->map_flags & ~BPF_F_NUMA_NODE ||
+ (percpu && numa_node != NUMA_NO_NODE))
return ERR_PTR(-EINVAL);
if (attr->value_size > KMALLOC_MAX_SIZE)
@@ -77,7 +79,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
return ERR_PTR(-ENOMEM);
/* allocate all map elements and zero-initialize them */
- array = bpf_map_area_alloc(array_size);
+ array = bpf_map_area_alloc(array_size, numa_node);
if (!array)
return ERR_PTR(-ENOMEM);
@@ -87,6 +89,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
array->map.value_size = attr->value_size;
array->map.max_entries = attr->max_entries;
array->map.map_flags = attr->map_flags;
+ array->map.numa_node = numa_node;
array->elem_size = elem_size;
if (!percpu)
@@ -603,6 +606,31 @@ static void *array_of_map_lookup_elem(struct bpf_map *map, void *key)
return READ_ONCE(*inner_map);
}
+static u32 array_of_map_gen_lookup(struct bpf_map *map,
+ struct bpf_insn *insn_buf)
+{
+ u32 elem_size = round_up(map->value_size, 8);
+ struct bpf_insn *insn = insn_buf;
+ const int ret = BPF_REG_0;
+ const int map_ptr = BPF_REG_1;
+ const int index = BPF_REG_2;
+
+ *insn++ = BPF_ALU64_IMM(BPF_ADD, map_ptr, offsetof(struct bpf_array, value));
+ *insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0);
+ *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 5);
+ if (is_power_of_2(elem_size))
+ *insn++ = BPF_ALU64_IMM(BPF_LSH, ret, ilog2(elem_size));
+ else
+ *insn++ = BPF_ALU64_IMM(BPF_MUL, ret, elem_size);
+ *insn++ = BPF_ALU64_REG(BPF_ADD, ret, map_ptr);
+ *insn++ = BPF_LDX_MEM(BPF_DW, ret, ret, 0);
+ *insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 1);
+ *insn++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1);
+ *insn++ = BPF_MOV64_IMM(ret, 0);
+
+ return insn - insn_buf;
+}
+
const struct bpf_map_ops array_of_maps_map_ops = {
.map_alloc = array_of_map_alloc,
.map_free = array_of_map_free,
@@ -612,4 +640,5 @@ const struct bpf_map_ops array_of_maps_map_ops = {
.map_fd_get_ptr = bpf_map_fd_get_ptr,
.map_fd_put_ptr = bpf_map_fd_put_ptr,
.map_fd_sys_lookup_elem = bpf_map_fd_sys_lookup_elem,
+ .map_gen_lookup = array_of_map_gen_lookup,
};
diff --git a/kernel/bpf/bpf_lru_list.h b/kernel/bpf/bpf_lru_list.h
index 5c35a98d02bf..7d4f89b7cb84 100644
--- a/kernel/bpf/bpf_lru_list.h
+++ b/kernel/bpf/bpf_lru_list.h
@@ -69,7 +69,8 @@ static inline void bpf_lru_node_set_ref(struct bpf_lru_node *node)
/* ref is an approximation on access frequency. It does not
* have to be very accurate. Hence, no protection is used.
*/
- node->ref = 1;
+ if (!node->ref)
+ node->ref = 1;
}
int bpf_lru_init(struct bpf_lru *lru, bool percpu, u32 hash_offset,
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index ad5f55922a13..7b62df86be1d 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -595,9 +595,13 @@ static int bpf_jit_blind_insn(const struct bpf_insn *from,
case BPF_JMP | BPF_JEQ | BPF_K:
case BPF_JMP | BPF_JNE | BPF_K:
case BPF_JMP | BPF_JGT | BPF_K:
+ case BPF_JMP | BPF_JLT | BPF_K:
case BPF_JMP | BPF_JGE | BPF_K:
+ case BPF_JMP | BPF_JLE | BPF_K:
case BPF_JMP | BPF_JSGT | BPF_K:
+ case BPF_JMP | BPF_JSLT | BPF_K:
case BPF_JMP | BPF_JSGE | BPF_K:
+ case BPF_JMP | BPF_JSLE | BPF_K:
case BPF_JMP | BPF_JSET | BPF_K:
/* Accommodate for extra offset in case of a backjump. */
off = from->off;
@@ -833,12 +837,20 @@ static unsigned int ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn,
[BPF_JMP | BPF_JNE | BPF_K] = &&JMP_JNE_K,
[BPF_JMP | BPF_JGT | BPF_X] = &&JMP_JGT_X,
[BPF_JMP | BPF_JGT | BPF_K] = &&JMP_JGT_K,
+ [BPF_JMP | BPF_JLT | BPF_X] = &&JMP_JLT_X,
+ [BPF_JMP | BPF_JLT | BPF_K] = &&JMP_JLT_K,
[BPF_JMP | BPF_JGE | BPF_X] = &&JMP_JGE_X,
[BPF_JMP | BPF_JGE | BPF_K] = &&JMP_JGE_K,
+ [BPF_JMP | BPF_JLE | BPF_X] = &&JMP_JLE_X,
+ [BPF_JMP | BPF_JLE | BPF_K] = &&JMP_JLE_K,
[BPF_JMP | BPF_JSGT | BPF_X] = &&JMP_JSGT_X,
[BPF_JMP | BPF_JSGT | BPF_K] = &&JMP_JSGT_K,
+ [BPF_JMP | BPF_JSLT | BPF_X] = &&JMP_JSLT_X,
+ [BPF_JMP | BPF_JSLT | BPF_K] = &&JMP_JSLT_K,
[BPF_JMP | BPF_JSGE | BPF_X] = &&JMP_JSGE_X,
[BPF_JMP | BPF_JSGE | BPF_K] = &&JMP_JSGE_K,
+ [BPF_JMP | BPF_JSLE | BPF_X] = &&JMP_JSLE_X,
+ [BPF_JMP | BPF_JSLE | BPF_K] = &&JMP_JSLE_K,
[BPF_JMP | BPF_JSET | BPF_X] = &&JMP_JSET_X,
[BPF_JMP | BPF_JSET | BPF_K] = &&JMP_JSET_K,
/* Program return */
@@ -1010,7 +1022,7 @@ select_insn:
struct bpf_map *map = (struct bpf_map *) (unsigned long) BPF_R2;
struct bpf_array *array = container_of(map, struct bpf_array, map);
struct bpf_prog *prog;
- u64 index = BPF_R3;
+ u32 index = BPF_R3;
if (unlikely(index >= array->map.max_entries))
goto out;
@@ -1073,6 +1085,18 @@ out:
CONT_JMP;
}
CONT;
+ JMP_JLT_X:
+ if (DST < SRC) {
+ insn += insn->off;
+ CONT_JMP;
+ }
+ CONT;
+ JMP_JLT_K:
+ if (DST < IMM) {
+ insn += insn->off;
+ CONT_JMP;
+ }
+ CONT;
JMP_JGE_X:
if (DST >= SRC) {
insn += insn->off;
@@ -1085,6 +1109,18 @@ out:
CONT_JMP;
}
CONT;
+ JMP_JLE_X:
+ if (DST <= SRC) {
+ insn += insn->off;
+ CONT_JMP;
+ }
+ CONT;
+ JMP_JLE_K:
+ if (DST <= IMM) {
+ insn += insn->off;
+ CONT_JMP;
+ }
+ CONT;
JMP_JSGT_X:
if (((s64) DST) > ((s64) SRC)) {
insn += insn->off;
@@ -1097,6 +1133,18 @@ out:
CONT_JMP;
}
CONT;
+ JMP_JSLT_X:
+ if (((s64) DST) < ((s64) SRC)) {
+ insn += insn->off;
+ CONT_JMP;
+ }
+ CONT;
+ JMP_JSLT_K:
+ if (((s64) DST) < ((s64) IMM)) {
+ insn += insn->off;
+ CONT_JMP;
+ }
+ CONT;
JMP_JSGE_X:
if (((s64) DST) >= ((s64) SRC)) {
insn += insn->off;
@@ -1109,6 +1157,18 @@ out:
CONT_JMP;
}
CONT;
+ JMP_JSLE_X:
+ if (((s64) DST) <= ((s64) SRC)) {
+ insn += insn->off;
+ CONT_JMP;
+ }
+ CONT;
+ JMP_JSLE_K:
+ if (((s64) DST) <= ((s64) IMM)) {
+ insn += insn->off;
+ CONT_JMP;
+ }
+ CONT;
JMP_JSET_X:
if (DST & SRC) {
insn += insn->off;
@@ -1378,6 +1438,7 @@ const struct bpf_func_proto bpf_ktime_get_ns_proto __weak;
const struct bpf_func_proto bpf_get_current_pid_tgid_proto __weak;
const struct bpf_func_proto bpf_get_current_uid_gid_proto __weak;
const struct bpf_func_proto bpf_get_current_comm_proto __weak;
+const struct bpf_func_proto bpf_sock_map_update_proto __weak;
const struct bpf_func_proto * __weak bpf_get_trace_printk_proto(void)
{
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
new file mode 100644
index 000000000000..e093d9a2c4dd
--- /dev/null
+++ b/kernel/bpf/devmap.c
@@ -0,0 +1,411 @@
+/* Copyright (c) 2017 Covalent IO, Inc. http://covalent.io
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+
+/* Devmap's primary use is as a backend map for the XDP BPF helper call
+ * bpf_redirect_map(). Because XDP is mostly concerned with performance we
+ * spent some effort to ensure the datapath with redirect maps does not use
+ * any locking. This is a quick note on the details.
+ *
+ * We have three possible paths to get into the devmap control plane: bpf
+ * syscalls, bpf programs, and driver side xmit/flush operations. A bpf syscall
+ * will invoke an update, delete, or lookup operation. To ensure updates and
+ * deletes appear atomic from the datapath side xchg() is used to modify the
+ * netdev_map array. Then because the datapath does a lookup into the netdev_map
+ * array (read-only) from an RCU critical section we use call_rcu() to wait for
+ * an rcu grace period before free'ing the old data structures. This ensures the
+ * datapath always has a valid copy. However, the datapath does a "flush"
+ * operation that pushes any pending packets in the driver outside the RCU
+ * critical section. Each bpf_dtab_netdev tracks these pending operations using
+ * an atomic per-cpu bitmap. The bpf_dtab_netdev object will not be destroyed
+ * until all bits are cleared indicating outstanding flush operations have
+ * completed.
+ *
+ * BPF syscalls may race with BPF program calls on any of the update, delete
+ * or lookup operations. As noted above the xchg() operation also keeps the
+ * netdev_map consistent in this case. From the devmap side BPF programs
+ * calling into these operations are the same as multiple user space threads
+ * making system calls.
+ *
+ * Finally, any of the above may race with a netdev_unregister notifier. The
+ * unregister notifier must search for net devices in the map structure that
+ * contain a reference to the net device and remove them. This is a two step
+ * process (a) dereference the bpf_dtab_netdev object in netdev_map and (b)
+ * check to see if the ifindex is the same as the net_device being removed.
+ * When removing the dev a cmpxchg() is used to ensure the correct dev is
+ * removed, in the case of a concurrent update or delete operation it is
+ * possible that the initially referenced dev is no longer in the map. As the
+ * notifier hook walks the map we know that new dev references can not be
+ * added by the user because core infrastructure ensures dev_get_by_index()
+ * calls will fail at this point.
+ */
+#include <linux/bpf.h>
+#include <linux/filter.h>
+
+struct bpf_dtab_netdev {
+ struct net_device *dev;
+ struct bpf_dtab *dtab;
+ unsigned int bit;
+ struct rcu_head rcu;
+};
+
+struct bpf_dtab {
+ struct bpf_map map;
+ struct bpf_dtab_netdev **netdev_map;
+ unsigned long __percpu *flush_needed;
+ struct list_head list;
+};
+
+static DEFINE_SPINLOCK(dev_map_lock);
+static LIST_HEAD(dev_map_list);
+
+static u64 dev_map_bitmap_size(const union bpf_attr *attr)
+{
+ return BITS_TO_LONGS(attr->max_entries) * sizeof(unsigned long);
+}
+
+static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
+{
+ struct bpf_dtab *dtab;
+ int err = -EINVAL;
+ u64 cost;
+
+ /* check sanity of attributes */
+ if (attr->max_entries == 0 || attr->key_size != 4 ||
+ attr->value_size != 4 || attr->map_flags & ~BPF_F_NUMA_NODE)
+ return ERR_PTR(-EINVAL);
+
+ dtab = kzalloc(sizeof(*dtab), GFP_USER);
+ if (!dtab)
+ return ERR_PTR(-ENOMEM);
+
+ /* mandatory map attributes */
+ dtab->map.map_type = attr->map_type;
+ dtab->map.key_size = attr->key_size;
+ dtab->map.value_size = attr->value_size;
+ dtab->map.max_entries = attr->max_entries;
+ dtab->map.map_flags = attr->map_flags;
+ dtab->map.numa_node = bpf_map_attr_numa_node(attr);
+
+ /* make sure page count doesn't overflow */
+ cost = (u64) dtab->map.max_entries * sizeof(struct bpf_dtab_netdev *);
+ cost += dev_map_bitmap_size(attr) * num_possible_cpus();
+ if (cost >= U32_MAX - PAGE_SIZE)
+ goto free_dtab;
+
+ dtab->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
+
+ /* if map size is larger than memlock limit, reject it early */
+ err = bpf_map_precharge_memlock(dtab->map.pages);
+ if (err)
+ goto free_dtab;
+
+ err = -ENOMEM;
+
+ /* A per cpu bitfield with a bit per possible net device */
+ dtab->flush_needed = __alloc_percpu(dev_map_bitmap_size(attr),
+ __alignof__(unsigned long));
+ if (!dtab->flush_needed)
+ goto free_dtab;
+
+ dtab->netdev_map = bpf_map_area_alloc(dtab->map.max_entries *
+ sizeof(struct bpf_dtab_netdev *),
+ dtab->map.numa_node);
+ if (!dtab->netdev_map)
+ goto free_dtab;
+
+ spin_lock(&dev_map_lock);
+ list_add_tail_rcu(&dtab->list, &dev_map_list);
+ spin_unlock(&dev_map_lock);
+
+ return &dtab->map;
+free_dtab:
+ free_percpu(dtab->flush_needed);
+ kfree(dtab);
+ return ERR_PTR(err);
+}
+
+static void dev_map_free(struct bpf_map *map)
+{
+ struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
+ int i, cpu;
+
+ /* At this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
+ * so the programs (can be more than one that used this map) were
+ * disconnected from events. Wait for outstanding critical sections in
+ * these programs to complete. The rcu critical section only guarantees
+ * no further reads against netdev_map. It does __not__ ensure pending
+ * flush operations (if any) are complete.
+ */
+
+ spin_lock(&dev_map_lock);
+ list_del_rcu(&dtab->list);
+ spin_unlock(&dev_map_lock);
+
+ synchronize_rcu();
+
+ /* To ensure all pending flush operations have completed wait for flush
+ * bitmap to indicate all flush_needed bits to be zero on _all_ cpus.
+ * Because the above synchronize_rcu() ensures the map is disconnected
+ * from the program we can assume no new bits will be set.
+ */
+ for_each_online_cpu(cpu) {
+ unsigned long *bitmap = per_cpu_ptr(dtab->flush_needed, cpu);
+
+ while (!bitmap_empty(bitmap, dtab->map.max_entries))
+ cond_resched();
+ }
+
+ for (i = 0; i < dtab->map.max_entries; i++) {
+ struct bpf_dtab_netdev *dev;
+
+ dev = dtab->netdev_map[i];
+ if (!dev)
+ continue;
+
+ dev_put(dev->dev);
+ kfree(dev);
+ }
+
+ free_percpu(dtab->flush_needed);
+ bpf_map_area_free(dtab->netdev_map);
+ kfree(dtab);
+}
+
+static int dev_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
+{
+ struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
+ u32 index = key ? *(u32 *)key : U32_MAX;
+ u32 *next = next_key;
+
+ if (index >= dtab->map.max_entries) {
+ *next = 0;
+ return 0;
+ }
+
+ if (index == dtab->map.max_entries - 1)
+ return -ENOENT;
+ *next = index + 1;
+ return 0;
+}
+
+void __dev_map_insert_ctx(struct bpf_map *map, u32 bit)
+{
+ struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
+ unsigned long *bitmap = this_cpu_ptr(dtab->flush_needed);
+
+ __set_bit(bit, bitmap);
+}
+
+/* __dev_map_flush is called from xdp_do_flush_map() which _must_ be signaled
+ * from the driver before returning from its napi->poll() routine. The poll()
+ * routine is called either from busy_poll context or net_rx_action signaled
+ * from NET_RX_SOFTIRQ. Either way the poll routine must complete before the
+ * net device can be torn down. On devmap tear down we ensure the ctx bitmap
+ * is zeroed before completing to ensure all flush operations have completed.
+ */
+void __dev_map_flush(struct bpf_map *map)
+{
+ struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
+ unsigned long *bitmap = this_cpu_ptr(dtab->flush_needed);
+ u32 bit;
+
+ for_each_set_bit(bit, bitmap, map->max_entries) {
+ struct bpf_dtab_netdev *dev = READ_ONCE(dtab->netdev_map[bit]);
+ struct net_device *netdev;
+
+ /* This is possible if the dev entry is removed by user space
+ * between xdp redirect and flush op.
+ */
+ if (unlikely(!dev))
+ continue;
+
+ __clear_bit(bit, bitmap);
+ netdev = dev->dev;
+ if (likely(netdev->netdev_ops->ndo_xdp_flush))
+ netdev->netdev_ops->ndo_xdp_flush(netdev);
+ }
+}
+
+/* rcu_read_lock (from syscall and BPF contexts) ensures that if a delete and/or
+ * update happens in parallel here a dev_put won't happen until after reading the
+ * ifindex.
+ */
+struct net_device *__dev_map_lookup_elem(struct bpf_map *map, u32 key)
+{
+ struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
+ struct bpf_dtab_netdev *dev;
+
+ if (key >= map->max_entries)
+ return NULL;
+
+ dev = READ_ONCE(dtab->netdev_map[key]);
+ return dev ? dev->dev : NULL;
+}
+
+static void *dev_map_lookup_elem(struct bpf_map *map, void *key)
+{
+ struct net_device *dev = __dev_map_lookup_elem(map, *(u32 *)key);
+
+ return dev ? &dev->ifindex : NULL;
+}
+
+static void dev_map_flush_old(struct bpf_dtab_netdev *dev)
+{
+ if (dev->dev->netdev_ops->ndo_xdp_flush) {
+ struct net_device *fl = dev->dev;
+ unsigned long *bitmap;
+ int cpu;
+
+ for_each_online_cpu(cpu) {
+ bitmap = per_cpu_ptr(dev->dtab->flush_needed, cpu);
+ __clear_bit(dev->bit, bitmap);
+
+ fl->netdev_ops->ndo_xdp_flush(dev->dev);
+ }
+ }
+}
+
+static void __dev_map_entry_free(struct rcu_head *rcu)
+{
+ struct bpf_dtab_netdev *dev;
+
+ dev = container_of(rcu, struct bpf_dtab_netdev, rcu);
+ dev_map_flush_old(dev);
+ dev_put(dev->dev);
+ kfree(dev);
+}
+
+static int dev_map_delete_elem(struct bpf_map *map, void *key)
+{
+ struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
+ struct bpf_dtab_netdev *old_dev;
+ int k = *(u32 *)key;
+
+ if (k >= map->max_entries)
+ return -EINVAL;
+
+ /* Use call_rcu() here to ensure any rcu critical sections have
+ * completed, but this does not guarantee a flush has happened
+ * yet, because driver side rcu_read_lock/unlock only protects the
+ * running XDP program. However, for pending flush operations the
+ * dev and ctx are stored in another per cpu map. And additionally,
+ * the driver tear down ensures all soft irqs are complete before
+ * removing the net device in the case of dev_put equals zero.
+ */
+ old_dev = xchg(&dtab->netdev_map[k], NULL);
+ if (old_dev)
+ call_rcu(&old_dev->rcu, __dev_map_entry_free);
+ return 0;
+}
+
+static int dev_map_update_elem(struct bpf_map *map, void *key, void *value,
+ u64 map_flags)
+{
+ struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
+ struct net *net = current->nsproxy->net_ns;
+ struct bpf_dtab_netdev *dev, *old_dev;
+ u32 i = *(u32 *)key;
+ u32 ifindex = *(u32 *)value;
+
+ if (unlikely(map_flags > BPF_EXIST))
+ return -EINVAL;
+ if (unlikely(i >= dtab->map.max_entries))
+ return -E2BIG;
+ if (unlikely(map_flags == BPF_NOEXIST))
+ return -EEXIST;
+
+ if (!ifindex) {
+ dev = NULL;
+ } else {
+ dev = kmalloc_node(sizeof(*dev), GFP_ATOMIC | __GFP_NOWARN,
+ map->numa_node);
+ if (!dev)
+ return -ENOMEM;
+
+ dev->dev = dev_get_by_index(net, ifindex);
+ if (!dev->dev) {
+ kfree(dev);
+ return -EINVAL;
+ }
+
+ dev->bit = i;
+ dev->dtab = dtab;
+ }
+
+ /* Use call_rcu() here to ensure rcu critical sections have completed.
+ * Remember that the driver side flush operation will happen before the
+ * net device is removed.
+ */
+ old_dev = xchg(&dtab->netdev_map[i], dev);
+ if (old_dev)
+ call_rcu(&old_dev->rcu, __dev_map_entry_free);
+
+ return 0;
+}
+
+const struct bpf_map_ops dev_map_ops = {
+ .map_alloc = dev_map_alloc,
+ .map_free = dev_map_free,
+ .map_get_next_key = dev_map_get_next_key,
+ .map_lookup_elem = dev_map_lookup_elem,
+ .map_update_elem = dev_map_update_elem,
+ .map_delete_elem = dev_map_delete_elem,
+};
+
+static int dev_map_notification(struct notifier_block *notifier,
+ ulong event, void *ptr)
+{
+ struct net_device *netdev = netdev_notifier_info_to_dev(ptr);
+ struct bpf_dtab *dtab;
+ int i;
+
+ switch (event) {
+ case NETDEV_UNREGISTER:
+ /* This rcu_read_lock/unlock pair is needed because
+ * dev_map_list is an RCU list AND to ensure a delete
+ * operation does not free a netdev_map entry while we
+ * are comparing it against the netdev being unregistered.
+ */
+ rcu_read_lock();
+ list_for_each_entry_rcu(dtab, &dev_map_list, list) {
+ for (i = 0; i < dtab->map.max_entries; i++) {
+ struct bpf_dtab_netdev *dev, *odev;
+
+ dev = READ_ONCE(dtab->netdev_map[i]);
+ if (!dev ||
+ dev->dev->ifindex != netdev->ifindex)
+ continue;
+ odev = cmpxchg(&dtab->netdev_map[i], dev, NULL);
+ if (dev == odev)
+ call_rcu(&dev->rcu,
+ __dev_map_entry_free);
+ }
+ }
+ rcu_read_unlock();
+ break;
+ default:
+ break;
+ }
+ return NOTIFY_OK;
+}
+
+static struct notifier_block dev_map_notifier = {
+ .notifier_call = dev_map_notification,
+};
+
+static int __init dev_map_init(void)
+{
+ register_netdevice_notifier(&dev_map_notifier);
+ return 0;
+}
+
+subsys_initcall(dev_map_init);
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index d11c8181f4c5..431126f31ea3 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -18,6 +18,9 @@
#include "bpf_lru_list.h"
#include "map_in_map.h"
+#define HTAB_CREATE_FLAG_MASK \
+ (BPF_F_NO_PREALLOC | BPF_F_NO_COMMON_LRU | BPF_F_NUMA_NODE)
+
struct bucket {
struct hlist_nulls_head head;
raw_spinlock_t lock;
@@ -138,7 +141,8 @@ static int prealloc_init(struct bpf_htab *htab)
if (!htab_is_percpu(htab) && !htab_is_lru(htab))
num_entries += num_possible_cpus();
- htab->elems = bpf_map_area_alloc(htab->elem_size * num_entries);
+ htab->elems = bpf_map_area_alloc(htab->elem_size * num_entries,
+ htab->map.numa_node);
if (!htab->elems)
return -ENOMEM;
@@ -233,6 +237,7 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
*/
bool percpu_lru = (attr->map_flags & BPF_F_NO_COMMON_LRU);
bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC);
+ int numa_node = bpf_map_attr_numa_node(attr);
struct bpf_htab *htab;
int err, i;
u64 cost;
@@ -248,7 +253,7 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
*/
return ERR_PTR(-EPERM);
- if (attr->map_flags & ~(BPF_F_NO_PREALLOC | BPF_F_NO_COMMON_LRU))
+ if (attr->map_flags & ~HTAB_CREATE_FLAG_MASK)
/* reserved bits should not be used */
return ERR_PTR(-EINVAL);
@@ -258,6 +263,9 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
if (lru && !prealloc)
return ERR_PTR(-ENOTSUPP);
+ if (numa_node != NUMA_NO_NODE && (percpu || percpu_lru))
+ return ERR_PTR(-EINVAL);
+
htab = kzalloc(sizeof(*htab), GFP_USER);
if (!htab)
return ERR_PTR(-ENOMEM);
@@ -268,6 +276,7 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
htab->map.value_size = attr->value_size;
htab->map.max_entries = attr->max_entries;
htab->map.map_flags = attr->map_flags;
+ htab->map.numa_node = numa_node;
/* check sanity of attributes.
* value_size == 0 may be allowed in the future to use map as a set
@@ -346,7 +355,8 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
err = -ENOMEM;
htab->buckets = bpf_map_area_alloc(htab->n_buckets *
- sizeof(struct bucket));
+ sizeof(struct bucket),
+ htab->map.numa_node);
if (!htab->buckets)
goto free_htab;
@@ -504,6 +514,29 @@ static void *htab_lru_map_lookup_elem(struct bpf_map *map, void *key)
return NULL;
}
+/* Inline-lookup generator for LRU hash maps.
+ *
+ * Fills insn_buf with a BPF sequence that calls __htab_map_lookup_elem()
+ * and, if an element came back in R0, sets the element's LRU ref byte when
+ * it is still zero and then moves R0 past the 8-byte-rounded key.
+ *
+ * Returns the number of instructions written.
+ */
+static u32 htab_lru_map_gen_lookup(struct bpf_map *map,
+				   struct bpf_insn *insn_buf)
+{
+	const int ref_off = offsetof(struct htab_elem, lru_node) +
+			    offsetof(struct bpf_lru_node, ref);
+	const int past_key_off = offsetof(struct htab_elem, key) +
+				 round_up(map->key_size, 8);
+	u32 cnt = 0;
+
+	insn_buf[cnt++] = BPF_EMIT_CALL((u64 (*)(u64, u64, u64, u64, u64))__htab_map_lookup_elem);
+	/* R0 == NULL: jump over the four instructions that follow */
+	insn_buf[cnt++] = BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 4);
+	insn_buf[cnt++] = BPF_LDX_MEM(BPF_B, BPF_REG_1, BPF_REG_0, ref_off);
+	/* ref byte already non-zero: jump over the store */
+	insn_buf[cnt++] = BPF_JMP_IMM(BPF_JNE, BPF_REG_1, 0, 1);
+	insn_buf[cnt++] = BPF_ST_MEM(BPF_B, BPF_REG_0, ref_off, 1);
+	insn_buf[cnt++] = BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, past_key_off);
+
+	return cnt;
+}
+
/* It is called from the bpf_lru_list when the LRU needs to delete
* older elements from the htab.
*/
@@ -704,7 +737,8 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
atomic_dec(&htab->count);
return ERR_PTR(-E2BIG);
}
- l_new = kmalloc(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN);
+ l_new = kmalloc_node(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN,
+ htab->map.numa_node);
if (!l_new)
return ERR_PTR(-ENOMEM);
}
@@ -1126,6 +1160,7 @@ const struct bpf_map_ops htab_lru_map_ops = {
.map_lookup_elem = htab_lru_map_lookup_elem,
.map_update_elem = htab_lru_map_update_elem,
.map_delete_elem = htab_lru_map_delete_elem,
+ .map_gen_lookup = htab_lru_map_gen_lookup,
};
/* Called from eBPF program */
@@ -1315,6 +1350,22 @@ static void *htab_of_map_lookup_elem(struct bpf_map *map, void *key)
return READ_ONCE(*inner_map);
}
+/* Inline-lookup generator for hash-of-maps.
+ *
+ * Fills insn_buf with a BPF sequence that calls __htab_map_lookup_elem()
+ * and, if an element came back in R0, moves R0 past the 8-byte-rounded key
+ * and replaces it with the 64-bit word stored there.
+ *
+ * Returns the number of instructions written.
+ */
+static u32 htab_of_map_gen_lookup(struct bpf_map *map,
+				  struct bpf_insn *insn_buf)
+{
+	const int past_key_off = offsetof(struct htab_elem, key) +
+				 round_up(map->key_size, 8);
+	u32 cnt = 0;
+
+	insn_buf[cnt++] = BPF_EMIT_CALL((u64 (*)(u64, u64, u64, u64, u64))__htab_map_lookup_elem);
+	/* R0 == NULL: jump over the two instructions that follow */
+	insn_buf[cnt++] = BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2);
+	insn_buf[cnt++] = BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, past_key_off);
+	insn_buf[cnt++] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_0, 0);
+
+	return cnt;
+}
+
static void htab_of_map_free(struct bpf_map *map)
{
bpf_map_meta_free(map->inner_map_meta);
@@ -1330,4 +1381,5 @@ const struct bpf_map_ops htab_of_maps_map_ops = {
.map_fd_get_ptr = bpf_map_fd_get_ptr,
.map_fd_put_ptr = bpf_map_fd_put_ptr,
.map_fd_sys_lookup_elem = bpf_map_fd_sys_lookup_elem,
+ .map_gen_lookup = htab_of_map_gen_lookup,
};
diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c
index b09185f0f17d..1b767844a76f 100644
--- a/kernel/bpf/lpm_trie.c
+++ b/kernel/bpf/lpm_trie.c
@@ -244,7 +244,8 @@ static struct lpm_trie_node *lpm_trie_node_alloc(const struct lpm_trie *trie,
if (value)
size += trie->map.value_size;
- node = kmalloc(size, GFP_ATOMIC | __GFP_NOWARN);
+ node = kmalloc_node(size, GFP_ATOMIC | __GFP_NOWARN,
+ trie->map.numa_node);
if (!node)
return NULL;
@@ -405,6 +406,8 @@ static int trie_delete_elem(struct bpf_map *map, void *key)
#define LPM_KEY_SIZE_MAX LPM_KEY_SIZE(LPM_DATA_SIZE_MAX)
#define LPM_KEY_SIZE_MIN LPM_KEY_SIZE(LPM_DATA_SIZE_MIN)
+#define LPM_CREATE_FLAG_MASK (BPF_F_NO_PREALLOC | BPF_F_NUMA_NODE)
+
static struct bpf_map *trie_alloc(union bpf_attr *attr)
{
struct lpm_trie *trie;
@@ -416,7 +419,8 @@ static struct bpf_map *trie_alloc(union bpf_attr *attr)
/* check sanity of attributes */
if (attr->max_entries == 0 ||
- attr->map_flags != BPF_F_NO_PREALLOC ||
+ !(attr->map_flags & BPF_F_NO_PREALLOC) ||
+ attr->map_flags & ~LPM_CREATE_FLAG_MASK ||
attr->key_size < LPM_KEY_SIZE_MIN ||
attr->key_size > LPM_KEY_SIZE_MAX ||
attr->value_size < LPM_VAL_SIZE_MIN ||
@@ -433,6 +437,7 @@ static struct bpf_map *trie_alloc(union bpf_attr *attr)
trie->map.value_size = attr->value_size;
trie->map.max_entries = attr->max_entries;
trie->map.map_flags = attr->map_flags;
+ trie->map.numa_node = bpf_map_attr_numa_node(attr);
trie->data_size = attr->key_size -
offsetof(struct bpf_lpm_trie_key, data);
trie->max_prefixlen = trie->data_size * 8;
diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
new file mode 100644
index 000000000000..6424ce0e4969
--- /dev/null
+++ b/kernel/bpf/sockmap.c
@@ -0,0 +1,873 @@
+/* Copyright (c) 2017 Covalent IO, Inc. http://covalent.io
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+
+/* A BPF sock_map is used to store sock objects. This is primarily used
+ * for doing socket redirect with BPF helper routines.
+ *
+ * A sock map may have BPF programs attached to it, currently a program
+ * used to parse packets and a program to provide a verdict and redirect
+ * decision on the packet are supported. Any programs attached to a sock
+ * map are inherited by sock objects when they are added to the map. If
+ * no BPF programs are attached the sock object may only be used for sock
+ * redirect.
+ *
+ * A sock object may be in multiple maps, but can only inherit a single
+ * parse or verdict program. If adding a sock object to a map would result
+ * in having multiple parsing programs the update will return an EBUSY error.
+ *
+ * For reference, this program is similar to devmap used in XDP context;
+ * reviewing these together may be useful. For an example please review
+ * ./samples/bpf/sockmap/.
+ */
+#include <linux/bpf.h>
+#include <net/sock.h>
+#include <linux/filter.h>
+#include <linux/errno.h>
+#include <linux/file.h>
+#include <linux/kernel.h>
+#include <linux/net.h>
+#include <linux/skbuff.h>
+#include <linux/workqueue.h>
+#include <linux/list.h>
+#include <net/strparser.h>
+
+struct bpf_stab { /* one sockmap: generic map header plus the sock array and attached programs */
+ struct bpf_map map; /* embedded header; container_of() on it recovers the bpf_stab */
+ struct sock **sock_map; /* map.max_entries slots; an empty slot holds NULL */
+ struct bpf_prog *bpf_parse; /* parser program set through sock_map_prog() */
+ struct bpf_prog *bpf_verdict; /* verdict program set through sock_map_prog() */
+};
+
+enum smap_psock_state { /* bit numbers used with set_bit()/test_bit() on smap_psock.state */
+ SMAP_TX_RUNNING, /* set when a psock is created; cleared on a hard tx error and on release */
+};
+
+struct smap_psock_map_entry { /* records one map slot that refers to a psock's socket */
+ struct list_head list; /* node on smap_psock.maps */
+ struct sock **entry; /* address of the sock_map[] slot */
+};
+
+struct smap_psock { /* per-socket sockmap state, stored in the socket's user data */
+ struct rcu_head rcu; /* used by smap_release_sock() to defer destruction */
+ /* refcnt is used inside sk_callback_lock */
+ u32 refcnt; /* number of map slots referring to this psock */
+
+ /* datapath variables */
+ struct sk_buff_head rxqueue; /* redirected skbs waiting for smap_tx_work() */
+ bool strp_enabled; /* true while the smap_* socket callbacks are installed */
+
+ /* datapath error path cache across tx work invocations */
+ int save_rem; /* bytes of save_skb still to send */
+ int save_off; /* offset in save_skb to resume from */
+ struct sk_buff *save_skb; /* skb parked after -EAGAIN, NULL otherwise */
+
+ struct strparser strp;
+ struct bpf_prog *bpf_parse;
+ struct bpf_prog *bpf_verdict;
+ struct list_head maps; /* list of smap_psock_map_entry */
+
+ /* Back reference used when sock callback trigger sockmap operations */
+ struct sock *sock;
+ unsigned long state; /* bitmask indexed by enum smap_psock_state */
+
+ struct work_struct tx_work; /* runs smap_tx_work() */
+ struct work_struct gc_work; /* runs smap_gc_work() */
+
+ void (*save_data_ready)(struct sock *sk); /* original callbacks, restored by smap_stop_sock() */
+ void (*save_write_space)(struct sock *sk);
+ void (*save_state_change)(struct sock *sk);
+};
+
+/* Return the psock kept in sk's user data (RCU dereference), or NULL when
+ * none is attached.
+ */
+static inline struct smap_psock *smap_psock_sk(const struct sock *sk)
+{
+	struct smap_psock *psock = rcu_dereference_sk_user_data(sk);
+
+	return psock;
+}
+
+/* Run the psock's verdict program on skb and return its result, or SK_DROP
+ * when no verdict program is attached.  skb->sk points at the psock's
+ * socket only while the program runs.
+ */
+static int smap_verdict_func(struct smap_psock *psock, struct sk_buff *skb)
+{
+	struct bpf_prog *verdict = READ_ONCE(psock->bpf_verdict);
+	int ret = SK_DROP;
+
+	if (likely(verdict)) {
+		skb_orphan(skb);
+		skb->sk = psock->sock;
+		bpf_compute_data_end(skb);
+		ret = verdict->bpf_func(skb, verdict->insnsi);
+		skb->sk = NULL;
+	}
+
+	return ret;
+}
+
+/* Apply the verdict program to skb.  On SK_REDIRECT the skb is queued on
+ * the target socket's psock and that psock's tx work is scheduled, provided
+ * the target is usable; in every other case the skb is freed.
+ */
+static void smap_do_verdict(struct smap_psock *psock, struct sk_buff *skb)
+{
+	struct smap_psock *peer;
+	struct sock *dst;
+	int verdict;
+
+	/* The BPF program hands its redirect target to do_sk_redirect_map()
+	 * through per cpu values, so we must not be preempted between the
+	 * two. RCU read lock is not sufficient with CONFIG_PREEMPT_RCU
+	 * enabled, hence the explicit preempt_disable().
+	 */
+	preempt_disable();
+	verdict = smap_verdict_func(psock, skb);
+	if (verdict != SK_REDIRECT) {
+		preempt_enable();
+		goto drop;
+	}
+	dst = do_sk_redirect_map();
+	preempt_enable();
+	if (unlikely(!dst))
+		goto drop;
+
+	peer = smap_psock_sk(dst);
+	if (unlikely(!peer ||
+		     !test_bit(SMAP_TX_RUNNING, &peer->state) ||
+		     sock_flag(dst, SOCK_DEAD) ||
+		     !sock_writeable(dst)))
+		goto drop;
+
+	skb_set_owner_w(skb, dst);
+	skb_queue_tail(&peer->rxqueue, skb);
+	schedule_work(&peer->tx_work);
+	return;
+drop:
+	kfree_skb(skb);
+}
+
+/* Record err (a positive errno) on the psock's socket and invoke the
+ * socket's error report callback.
+ */
+static void smap_report_sk_error(struct smap_psock *psock, int err)
+{
+	struct sock *target = psock->sock;
+
+	target->sk_err = err;
+	target->sk_error_report(target);
+}
+
+static void smap_release_sock(struct smap_psock *psock, struct sock *sock);
+
+/* sk_state_change callback installed by smap_start_sock().
+ *
+ * On TCP_CLOSE every map slot that still points at sk is cleared and the
+ * matching psock reference is dropped.  States not listed explicitly get
+ * EPIPE reported on the socket.  Sleepers on the socket wait queue are
+ * woken in all cases.
+ *
+ * Called with lock_sock(sk) held.
+ */
+static void smap_state_change(struct sock *sk)
+{
+	struct smap_psock_map_entry *e, *tmp;
+	struct smap_psock *psock;
+	struct socket_wq *wq;
+	struct sock *osk;
+
+	rcu_read_lock();
+
+	/* Allowing the syn_sent, syn_recv and established states lets a
+	 * sock be bound to a smap object before the connection is
+	 * established.
+	 */
+	switch (sk->sk_state) {
+	case TCP_SYN_SENT:
+	case TCP_SYN_RECV:
+	case TCP_ESTABLISHED:
+		break;
+	case TCP_CLOSE_WAIT:
+	case TCP_CLOSING:
+	case TCP_LAST_ACK:
+	case TCP_FIN_WAIT1:
+	case TCP_FIN_WAIT2:
+	case TCP_LISTEN:
+		break;
+	case TCP_CLOSE:
+		/* Only release if the map entry is in fact the sock in
+		 * question. There is a case where the operator deletes
+		 * the sock from the map, but the TCP sock is closed before
+		 * the psock is detached. Use cmpxchg to verify correct
+		 * sock is removed.
+		 */
+		psock = smap_psock_sk(sk);
+		if (unlikely(!psock))
+			break;
+		write_lock_bh(&sk->sk_callback_lock);
+		list_for_each_entry_safe(e, tmp, &psock->maps, list) {
+			osk = cmpxchg(e->entry, sk, NULL);
+			if (osk == sk) {
+				list_del(&e->list);
+				/* Once unlinked the node is unreachable,
+				 * so free it here or it is leaked.
+				 */
+				kfree(e);
+				smap_release_sock(psock, sk);
+			}
+		}
+		write_unlock_bh(&sk->sk_callback_lock);
+		break;
+	default:
+		psock = smap_psock_sk(sk);
+		if (unlikely(!psock))
+			break;
+		smap_report_sk_error(psock, EPIPE);
+		break;
+	}
+
+	wq = rcu_dereference(sk->sk_wq);
+	if (skwq_has_sleeper(wq))
+		wake_up_interruptible_all(&wq->wait);
+	rcu_read_unlock();
+}
+
+/* strparser rcv_msg callback: hands the skb to the verdict path of the
+ * psock that embeds strp, inside an RCU read-side section.
+ */
+static void smap_read_sock_strparser(struct strparser *strp,
+				     struct sk_buff *skb)
+{
+	struct smap_psock *owner = container_of(strp, struct smap_psock, strp);
+
+	rcu_read_lock();
+	smap_do_verdict(owner, skb);
+	rcu_read_unlock();
+}
+
+/* sk_data_ready callback installed by smap_start_sock(): forwards the
+ * event to the psock's strparser under sk_callback_lock.  Does nothing when
+ * the socket has no psock.
+ *
+ * Called with lock held on socket.
+ */
+static void smap_data_ready(struct sock *sk)
+{
+	struct smap_psock *psock;
+
+	rcu_read_lock();
+	psock = smap_psock_sk(sk);
+	if (unlikely(!psock))
+		goto unlock;
+
+	write_lock_bh(&sk->sk_callback_lock);
+	strp_data_ready(&psock->strp);
+	write_unlock_bh(&sk->sk_callback_lock);
+unlock:
+	rcu_read_unlock();
+}
+
+/* Send rem bytes of skb, starting at off, on psock->sock, which the caller
+ * has locked.
+ *
+ * Returns true once the whole skb has been sent and freed.  Returns false
+ * when transmission has to stop: on -EAGAIN the skb and the current
+ * position are parked in the psock for the next run; on any other failure
+ * the error is reported on the socket, SMAP_TX_RUNNING is cleared and the
+ * skb is freed.
+ */
+static bool smap_tx_skb(struct smap_psock *psock, struct sk_buff *skb,
+			int off, int rem)
+{
+	int sent;
+
+	do {
+		if (unlikely(!psock->sock->sk_socket))
+			sent = -EINVAL;
+		else
+			sent = skb_send_sock_locked(psock->sock,
+						    skb, off, rem);
+
+		if (sent == -EAGAIN) {
+			/* Retry when space is available */
+			psock->save_skb = skb;
+			psock->save_rem = rem;
+			psock->save_off = off;
+			return false;
+		}
+		if (sent <= 0) {
+			/* Hard errors break pipe and stop xmit */
+			smap_report_sk_error(psock, sent ? -sent : EPIPE);
+			clear_bit(SMAP_TX_RUNNING, &psock->state);
+			kfree_skb(skb);
+			return false;
+		}
+		rem -= sent;
+		off += sent;
+	} while (rem);
+
+	kfree_skb(skb);
+	return true;
+}
+
+/* Work item that drains psock->rxqueue onto the psock's socket.  An skb
+ * parked by an earlier -EAGAIN is resumed first, from its saved position.
+ */
+static void smap_tx_work(struct work_struct *w)
+{
+	struct smap_psock *psock = container_of(w, struct smap_psock, tx_work);
+	struct sk_buff *skb;
+
+	/* lock sock to avoid losing sk_socket at some point during loop */
+	lock_sock(psock->sock);
+
+	skb = psock->save_skb;
+	if (skb) {
+		psock->save_skb = NULL;
+		if (!smap_tx_skb(psock, skb, psock->save_off,
+				 psock->save_rem))
+			goto out;
+	}
+
+	while ((skb = skb_dequeue(&psock->rxqueue))) {
+		if (!smap_tx_skb(psock, skb, 0, skb->len))
+			goto out;
+	}
+out:
+	release_sock(psock->sock);
+}
+
+/* sk_write_space callback installed by smap_start_sock(): reschedules the
+ * psock's tx work while SMAP_TX_RUNNING is still set.
+ */
+static void smap_write_space(struct sock *sk)
+{
+	struct smap_psock *psock;
+
+	rcu_read_lock();
+	psock = smap_psock_sk(sk);
+	if (unlikely(!psock))
+		goto unlock;
+	if (likely(test_bit(SMAP_TX_RUNNING, &psock->state)))
+		schedule_work(&psock->tx_work);
+unlock:
+	rcu_read_unlock();
+}
+
+/* Undo smap_start_sock(): put the saved socket callbacks back, forget the
+ * saved pointers, stop the strparser and mark it disabled.  No-op when the
+ * strparser is not enabled.
+ */
+static void smap_stop_sock(struct smap_psock *psock, struct sock *sk)
+{
+	if (psock->strp_enabled) {
+		sk->sk_data_ready = psock->save_data_ready;
+		sk->sk_write_space = psock->save_write_space;
+		sk->sk_state_change = psock->save_state_change;
+
+		psock->save_data_ready = NULL;
+		psock->save_write_space = NULL;
+		psock->save_state_change = NULL;
+
+		strp_stop(&psock->strp);
+		psock->strp_enabled = false;
+	}
+}
+
+/* RCU callback queued by smap_release_sock().
+ *
+ * A grace period has elapsed, so the sockmap no longer references this
+ * sock.  The teardown itself uses workqueue sync operations, which are not
+ * allowed in RCU callback context, so it is handed to gc_work.
+ */
+static void smap_destroy_psock(struct rcu_head *rcu)
+{
+	struct smap_psock *dying;
+
+	dying = container_of(rcu, struct smap_psock, rcu);
+	schedule_work(&dying->gc_work);
+}
+
+/* Drop one map reference on psock.  When the last reference goes away the
+ * socket callbacks are restored, tx is marked stopped, the socket's user
+ * data is cleared and destruction is deferred through RCU.
+ */
+static void smap_release_sock(struct smap_psock *psock, struct sock *sock)
+{
+	if (--psock->refcnt)
+		return;
+
+	smap_stop_sock(psock, sock);
+	clear_bit(SMAP_TX_RUNNING, &psock->state);
+	rcu_assign_sk_user_data(sock, NULL);
+	call_rcu_sched(&psock->rcu, smap_destroy_psock);
+}
+
+/* strparser parse_msg callback: returns the parse program's result for
+ * skb, or skb->len when the psock has no parse program.
+ */
+static int smap_parse_func_strparser(struct strparser *strp,
+				     struct sk_buff *skb)
+{
+	struct smap_psock *psock = container_of(strp, struct smap_psock, strp);
+	struct bpf_prog *parser;
+	int ret;
+
+	rcu_read_lock();
+	parser = READ_ONCE(psock->bpf_parse);
+	if (unlikely(!parser)) {
+		ret = skb->len;
+	} else {
+		/* Attach socket for bpf program to use if needed we can do
+		 * this because strparser clones the skb before handing it
+		 * to a upper layer, meaning skb_orphan has been called. We
+		 * NULL sk on the way out to ensure we don't trigger a BUG_ON
+		 * in skb/sk operations later and because we are not charging
+		 * the memory of this skb to any socket yet.
+		 */
+		skb->sk = psock->sock;
+		bpf_compute_data_end(skb);
+		ret = parser->bpf_func(skb, parser->insnsi);
+		skb->sk = NULL;
+	}
+	rcu_read_unlock();
+
+	return ret;
+}
+
+
+static int smap_read_sock_done(struct strparser *strp, int err)
+{ /* strparser read_sock_done callback (see smap_init_sock): passes err through unchanged */
+ return err;
+}
+
+/* Initialise the psock's strparser on sk with the sockmap callbacks.
+ * Returns the result of strp_init().
+ */
+static int smap_init_sock(struct smap_psock *psock,
+			  struct sock *sk)
+{
+	static const struct strp_callbacks smap_strp_cb = {
+		.parse_msg = smap_parse_func_strparser,
+		.rcv_msg = smap_read_sock_strparser,
+		.read_sock_done = smap_read_sock_done,
+	};
+	int err;
+
+	err = strp_init(&psock->strp, sk, &smap_strp_cb);
+	return err;
+}
+
+/* Install parse and verdict as the psock's programs and drop the
+ * references held on whatever programs they replace.  stab is not used.
+ */
+static void smap_init_progs(struct smap_psock *psock,
+			    struct bpf_stab *stab,
+			    struct bpf_prog *verdict,
+			    struct bpf_prog *parse)
+{
+	struct bpf_prog *old_parse = xchg(&psock->bpf_parse, parse);
+	struct bpf_prog *old_verdict = xchg(&psock->bpf_verdict, verdict);
+
+	if (old_verdict)
+		bpf_prog_put(old_verdict);
+	if (old_parse)
+		bpf_prog_put(old_parse);
+}
+
+/* Save sk's data_ready, write_space and state_change callbacks in the
+ * psock, install the sockmap versions and mark the strparser enabled.
+ * No-op when sk already uses smap_data_ready.
+ */
+static void smap_start_sock(struct smap_psock *psock, struct sock *sk)
+{
+	if (sk->sk_data_ready != smap_data_ready) {
+		psock->save_data_ready = sk->sk_data_ready;
+		sk->sk_data_ready = smap_data_ready;
+
+		psock->save_write_space = sk->sk_write_space;
+		sk->sk_write_space = smap_write_space;
+
+		psock->save_state_change = sk->sk_state_change;
+		sk->sk_state_change = smap_state_change;
+
+		psock->strp_enabled = true;
+	}
+}
+
+static void sock_map_remove_complete(struct bpf_stab *stab)
+{ /* frees the sock array allocated in sock_map_alloc(), then the bpf_stab itself */
+ bpf_map_area_free(stab->sock_map);
+ kfree(stab);
+}
+
+/* Work item that finishes psock destruction: completes the strparser if
+ * still enabled, cancels tx work, purges queued skbs, drops the program
+ * references, frees remaining map-entry nodes, releases the socket
+ * reference and frees the psock.
+ */
+static void smap_gc_work(struct work_struct *w)
+{
+	struct smap_psock *psock = container_of(w, struct smap_psock, gc_work);
+	struct smap_psock_map_entry *node;
+
+	/* no callback lock needed because we already detached sockmap ops */
+	if (psock->strp_enabled)
+		strp_done(&psock->strp);
+
+	cancel_work_sync(&psock->tx_work);
+	__skb_queue_purge(&psock->rxqueue);
+
+	/* At this point all strparser and xmit work must be complete */
+	if (psock->bpf_parse)
+		bpf_prog_put(psock->bpf_parse);
+	if (psock->bpf_verdict)
+		bpf_prog_put(psock->bpf_verdict);
+
+	while (!list_empty(&psock->maps)) {
+		node = list_first_entry(&psock->maps,
+					struct smap_psock_map_entry, list);
+		list_del(&node->list);
+		kfree(node);
+	}
+
+	sock_put(psock->sock);
+	kfree(psock);
+}
+
+/* Allocate a psock for sock on the map's NUMA node, with one reference,
+ * publish it in the socket's user data and take a reference on the socket.
+ * Returns the psock, or ERR_PTR(-ENOMEM) when allocation fails.
+ */
+static struct smap_psock *smap_init_psock(struct sock *sock,
+					  struct bpf_stab *stab)
+{
+	struct smap_psock *fresh;
+
+	fresh = kzalloc_node(sizeof(*fresh), GFP_ATOMIC | __GFP_NOWARN,
+			     stab->map.numa_node);
+	if (!fresh)
+		return ERR_PTR(-ENOMEM);
+
+	fresh->refcnt = 1;
+	fresh->sock = sock;
+	INIT_LIST_HEAD(&fresh->maps);
+	skb_queue_head_init(&fresh->rxqueue);
+	INIT_WORK(&fresh->tx_work, smap_tx_work);
+	INIT_WORK(&fresh->gc_work, smap_gc_work);
+
+	rcu_assign_sk_user_data(sock, fresh);
+	sock_hold(sock);
+	return fresh;
+}
+
+/* map_alloc op: create a sockmap from attr.
+ *
+ * Requires max_entries != 0, key_size == 4, value_size == 4 and no map
+ * flags other than BPF_F_NUMA_NODE.  Returns the embedded bpf_map, or an
+ * ERR_PTR: -EINVAL for bad attributes or an oversized map, the error from
+ * bpf_map_precharge_memlock(), or -ENOMEM.
+ */
+static struct bpf_map *sock_map_alloc(union bpf_attr *attr)
+{
+	struct bpf_stab *stab;
+	int err = -EINVAL;
+	u64 cost;
+
+	/* check sanity of attributes; value_size is pinned to 4 here, so
+	 * no separate upper bound check on it is needed
+	 */
+	if (attr->max_entries == 0 || attr->key_size != 4 ||
+	    attr->value_size != 4 || attr->map_flags & ~BPF_F_NUMA_NODE)
+		return ERR_PTR(-EINVAL);
+
+	stab = kzalloc(sizeof(*stab), GFP_USER);
+	if (!stab)
+		return ERR_PTR(-ENOMEM);
+
+	/* mandatory map attributes */
+	stab->map.map_type = attr->map_type;
+	stab->map.key_size = attr->key_size;
+	stab->map.value_size = attr->value_size;
+	stab->map.max_entries = attr->max_entries;
+	stab->map.map_flags = attr->map_flags;
+	stab->map.numa_node = bpf_map_attr_numa_node(attr);
+
+	/* make sure page count doesn't overflow */
+	cost = (u64) stab->map.max_entries * sizeof(struct sock *);
+	if (cost >= U32_MAX - PAGE_SIZE)
+		goto free_stab;
+
+	stab->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
+
+	/* if map size is larger than memlock limit, reject it early */
+	err = bpf_map_precharge_memlock(stab->map.pages);
+	if (err)
+		goto free_stab;
+
+	err = -ENOMEM;
+	stab->sock_map = bpf_map_area_alloc(stab->map.max_entries *
+					    sizeof(struct sock *),
+					    stab->map.numa_node);
+	if (!stab->sock_map)
+		goto free_stab;
+
+	return &stab->map;
+free_stab:
+	kfree(stab);
+	return ERR_PTR(err);
+}
+
+/* Unlink and free the first node on psock->maps that refers to the map
+ * slot at entry.  Does nothing when no node matches.
+ */
+static void smap_list_remove(struct smap_psock *psock, struct sock **entry)
+{
+	struct smap_psock_map_entry *e, *tmp;
+
+	list_for_each_entry_safe(e, tmp, &psock->maps, list) {
+		if (e->entry != entry)
+			continue;
+		list_del(&e->list);
+		/* Once unlinked the node is unreachable (smap_gc_work only
+		 * frees nodes still on the list), so free it here.
+		 */
+		kfree(e);
+		break;
+	}
+}
+
+/* map_free op: empty every slot, drop the psock reference held for each
+ * sock found, release the map's own program references and free the map.
+ */
+static void sock_map_free(struct bpf_map *map)
+{
+	struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
+	int i;
+
+	synchronize_rcu();
+
+	/* At this point no update, lookup or delete operations can happen.
+	 * However, be aware we can still get socket state event updates,
+	 * and data ready callbacks that reference the psock from sk_user_data
+	 * Also psock worker threads are still in-flight. So smap_release_sock
+	 * will only free the psock after cancel_sync on the worker threads
+	 * and a grace period expire to ensure psock is really safe to remove.
+	 */
+	rcu_read_lock();
+	for (i = 0; i < stab->map.max_entries; i++) {
+		struct smap_psock *psock;
+		struct sock *sock;
+
+		sock = xchg(&stab->sock_map[i], NULL);
+		if (!sock)
+			continue;
+
+		write_lock_bh(&sock->sk_callback_lock);
+		psock = smap_psock_sk(sock);
+		/* A socket state event may have dropped the last reference
+		 * and cleared the user data before we took the lock, so
+		 * psock can be NULL here (sock_map_delete_elem makes the
+		 * same check).
+		 */
+		if (likely(psock)) {
+			smap_list_remove(psock, &stab->sock_map[i]);
+			smap_release_sock(psock, sock);
+		}
+		write_unlock_bh(&sock->sk_callback_lock);
+	}
+	rcu_read_unlock();
+
+	if (stab->bpf_verdict)
+		bpf_prog_put(stab->bpf_verdict);
+	if (stab->bpf_parse)
+		bpf_prog_put(stab->bpf_parse);
+
+	sock_map_remove_complete(stab);
+}
+
+/* map_get_next_key op.  A NULL or out-of-range key yields index 0, the
+ * last valid index yields -ENOENT, and any other key yields key + 1.
+ */
+static int sock_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
+{
+	struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
+	const u32 limit = stab->map.max_entries;
+	u32 *out = next_key;
+	u32 cur = U32_MAX;
+
+	if (key)
+		cur = *(u32 *)key;
+
+	if (cur >= limit)
+		*out = 0;
+	else if (cur == limit - 1)
+		return -ENOENT;
+	else
+		*out = cur + 1;
+
+	return 0;
+}
+
+/* Return the sock stored at index key, or NULL when key is out of range
+ * or the slot is empty.
+ */
+struct sock *__sock_map_lookup_elem(struct bpf_map *map, u32 key)
+{
+	struct bpf_stab *stab;
+
+	if (key < map->max_entries) {
+		stab = container_of(map, struct bpf_stab, map);
+		return READ_ONCE(stab->sock_map[key]);
+	}
+
+	return NULL;
+}
+
+/* map_delete_elem op: empty the slot at *key and drop the psock reference
+ * held for the sock that was stored there.  Returns -EINVAL when the index
+ * is out of range or the slot was already empty, 0 otherwise.
+ */
+static int sock_map_delete_elem(struct bpf_map *map, void *key)
+{
+	struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
+	int k = *(u32 *)key;
+	struct smap_psock *psock;
+	struct sock **slot;
+	struct sock *sock;
+
+	if (k >= map->max_entries)
+		return -EINVAL;
+
+	slot = &stab->sock_map[k];
+	sock = xchg(slot, NULL);
+	if (!sock)
+		return -EINVAL;
+
+	write_lock_bh(&sock->sk_callback_lock);
+	psock = smap_psock_sk(sock);
+	if (psock) {
+		if (psock->bpf_parse)
+			smap_stop_sock(psock, sock);
+		smap_list_remove(psock, slot);
+		smap_release_sock(psock, sock);
+	}
+	write_unlock_bh(&sock->sk_callback_lock);
+	return 0;
+}
+
+/* Locking notes: Concurrent updates, deletes, and lookups are allowed and are
+ * done inside rcu critical sections. This ensures on updates that the psock
+ * will not be released via smap_release_sock() until concurrent updates/deletes
+ * complete. All operations operate on sock_map using cmpxchg and xchg
+ * operations to ensure we do not get stale references. Any reads into the
+ * map must be done with READ_ONCE() because of this.
+ *
+ * A psock is destroyed via call_rcu and after any worker threads are cancelled
+ * and syncd so we are certain all references from the update/lookup/delete
+ * operations as well as references in the data path are no longer in use.
+ *
+ * Psocks may exist in multiple maps, but only a single set of parse/verdict
+ * programs may be inherited from the maps it belongs to. A reference count
+ * is kept with the total number of references to the psock from all maps. The
+ * psock will not be released until this reaches zero. The psock and sock
+ * user data use the sk_callback_lock to protect critical data structures
+ * from concurrent access. This allows us to avoid two updates from modifying
+ * the user data in sock and the lock is required anyways for modifying
+ * callbacks, we simply increase its scope slightly.
+ *
+ * Rules to follow,
+ * - psock must always be read inside RCU critical section
+ * - sk_user_data must only be modified inside sk_callback_lock and read
+ * inside RCU critical section.
+ * - psock->maps list must only be read & modified inside sk_callback_lock
+ * - sock_map must use READ_ONCE and (cmp)xchg operations
+ * - BPF verdict/parse programs must use READ_ONCE and xchg operations
+ */
+/* Store skops->sk in map slot *key.
+ *
+ * flags follows the usual BPF_ANY/BPF_NOEXIST/BPF_EXIST convention.  When
+ * the map has both a parse and a verdict program attached they are
+ * inherited by the sock.  A sock previously held in the slot loses the
+ * reference that slot had on its psock.
+ *
+ * Returns 0 on success, -EINVAL for unknown flags, -E2BIG for an index out
+ * of range, -ENOENT/-EEXIST when the slot state contradicts flags, -EBUSY
+ * when the sock already has a parse program and the map carries one,
+ * -ENOMEM on allocation failure, or the error from bpf_prog_inc_not_zero()
+ * or strp_init().
+ */
+static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops,
+				    struct bpf_map *map,
+				    void *key, u64 flags)
+{
+	struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
+	struct smap_psock_map_entry *e = NULL;
+	struct bpf_prog *verdict, *parse;
+	struct sock *osock, *sock;
+	struct smap_psock *psock;
+	u32 i = *(u32 *)key;
+	int err;
+
+	if (unlikely(flags > BPF_EXIST))
+		return -EINVAL;
+
+	if (unlikely(i >= stab->map.max_entries))
+		return -E2BIG;
+
+	sock = READ_ONCE(stab->sock_map[i]);
+	if (flags == BPF_EXIST && !sock)
+		return -ENOENT;
+	else if (flags == BPF_NOEXIST && sock)
+		return -EEXIST;
+
+	sock = skops->sk;
+
+	/* 1. If sock map has BPF programs those will be inherited by the
+	 * sock being added. If the sock is already attached to BPF programs
+	 * this results in an error.
+	 */
+	verdict = READ_ONCE(stab->bpf_verdict);
+	parse = READ_ONCE(stab->bpf_parse);
+
+	if (parse && verdict) {
+		/* bpf prog refcnt may be zero if a concurrent attach operation
+		 * removes the program after the above READ_ONCE() but before
+		 * we increment the refcnt. If this is the case abort with an
+		 * error.
+		 */
+		verdict = bpf_prog_inc_not_zero(stab->bpf_verdict);
+		if (IS_ERR(verdict))
+			return PTR_ERR(verdict);
+
+		parse = bpf_prog_inc_not_zero(stab->bpf_parse);
+		if (IS_ERR(parse)) {
+			bpf_prog_put(verdict);
+			return PTR_ERR(parse);
+		}
+	}
+
+	write_lock_bh(&sock->sk_callback_lock);
+	psock = smap_psock_sk(sock);
+
+	/* 2. Do not allow inheriting programs if psock exists and has
+	 * already inherited programs. This would create confusion on
+	 * which parser/verdict program is running. If no psock exists
+	 * create one. Inside sk_callback_lock to ensure concurrent create
+	 * doesn't update user data.
+	 */
+	if (psock) {
+		if (READ_ONCE(psock->bpf_parse) && parse) {
+			err = -EBUSY;
+			goto out_progs;
+		}
+		psock->refcnt++;
+	} else {
+		psock = smap_init_psock(sock, stab);
+		if (IS_ERR(psock)) {
+			err = PTR_ERR(psock);
+			goto out_progs;
+		}
+
+		set_bit(SMAP_TX_RUNNING, &psock->state);
+	}
+
+	e = kzalloc(sizeof(*e), GFP_ATOMIC | __GFP_NOWARN);
+	if (!e) {
+		/* We already hold a psock reference (or just created the
+		 * psock), so it has to be dropped again on this path.
+		 */
+		err = -ENOMEM;
+		goto out_free;
+	}
+	e->entry = &stab->sock_map[i];
+
+	/* 3. At this point we have a reference to a valid psock that is
+	 * running. Attach any BPF programs needed.
+	 */
+	if (parse && verdict && !psock->strp_enabled) {
+		err = smap_init_sock(psock, sock);
+		if (err)
+			goto out_free;
+		smap_init_progs(psock, stab, verdict, parse);
+		smap_start_sock(psock, sock);
+	}
+
+	/* 4. Place psock in sockmap for use and stop any programs on
+	 * the old sock assuming its not the same sock we are replacing
+	 * it with. Because we can only have a single set of programs if
+	 * old_sock has a strp we can stop it.
+	 */
+	list_add_tail(&e->list, &psock->maps);
+	write_unlock_bh(&sock->sk_callback_lock);
+
+	osock = xchg(&stab->sock_map[i], sock);
+	if (osock) {
+		struct smap_psock *opsock;
+
+		write_lock_bh(&osock->sk_callback_lock);
+		/* Read the user data under the lock and tolerate NULL, as
+		 * sock_map_delete_elem does: a racing close may already have
+		 * detached the psock.
+		 */
+		opsock = smap_psock_sk(osock);
+		if (likely(opsock)) {
+			if (osock != sock && parse)
+				smap_stop_sock(opsock, osock);
+			smap_list_remove(opsock, &stab->sock_map[i]);
+			smap_release_sock(opsock, osock);
+		}
+		write_unlock_bh(&osock->sk_callback_lock);
+	}
+	return 0;
+out_free:
+	smap_release_sock(psock, sock);
+out_progs:
+	/* Program references were only taken when both programs were set;
+	 * with a single program attached there is nothing to put.
+	 */
+	if (parse && verdict) {
+		bpf_prog_put(verdict);
+		bpf_prog_put(parse);
+	}
+	write_unlock_bh(&sock->sk_callback_lock);
+	kfree(e);
+	return err;
+}
+
+/* Install prog (may be NULL) as the map's parser or verdict program,
+ * selected by type, and drop the reference on the program it replaces.
+ * Returns -EINVAL when map is not a sockmap and -EOPNOTSUPP for any other
+ * type value.
+ */
+int sock_map_prog(struct bpf_map *map, struct bpf_prog *prog, u32 type)
+{
+	struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
+	struct bpf_prog **slot;
+	struct bpf_prog *prev;
+
+	if (unlikely(map->map_type != BPF_MAP_TYPE_SOCKMAP))
+		return -EINVAL;
+
+	if (type == BPF_SK_SKB_STREAM_PARSER)
+		slot = &stab->bpf_parse;
+	else if (type == BPF_SK_SKB_STREAM_VERDICT)
+		slot = &stab->bpf_verdict;
+	else
+		return -EOPNOTSUPP;
+
+	prev = xchg(slot, prog);
+	if (prev)
+		bpf_prog_put(prev);
+
+	return 0;
+}
+
+static void *sock_map_lookup(struct bpf_map *map, void *key)
+{ /* map_lookup_elem op: always returns NULL, whatever the key */
+ return NULL;
+}
+
+/* map_update_elem op: value holds a socket file descriptor.  The socket
+ * is looked up and stored in the map through sock_map_ctx_update_elem();
+ * the file reference taken by the lookup is dropped before returning.
+ * Returns the lookup error, -EINVAL when the socket has no sock, or the
+ * result of the update.
+ */
+static int sock_map_update_elem(struct bpf_map *map,
+				void *key, void *value, u64 flags)
+{
+	struct bpf_sock_ops_kern skops;
+	struct socket *socket;
+	u32 ufd = *(u32 *)value;
+	int err;
+
+	socket = sockfd_lookup(ufd, &err);
+	if (!socket)
+		return err;
+
+	skops.sk = socket->sk;
+	if (skops.sk)
+		err = sock_map_ctx_update_elem(&skops, map, key, flags);
+	else
+		err = -EINVAL;
+
+	fput(socket->file);
+	return err;
+}
+
+const struct bpf_map_ops sock_map_ops = { /* map operations table for the sockmap */
+ .map_alloc = sock_map_alloc,
+ .map_free = sock_map_free,
+ .map_lookup_elem = sock_map_lookup, /* always returns NULL */
+ .map_get_next_key = sock_map_get_next_key,
+ .map_update_elem = sock_map_update_elem, /* value is a socket fd */
+ .map_delete_elem = sock_map_delete_elem,
+};
+
+BPF_CALL_4(bpf_sock_map_update, struct bpf_sock_ops_kern *, bpf_sock,
+ struct bpf_map *, map, void *, key, u64, flags)
+{ /* BPF helper: stores bpf_sock->sk at map[*key] via sock_map_ctx_update_elem() */
+ WARN_ON_ONCE(!rcu_read_lock_held()); /* the update path expects an RCU read-side section */
+ return sock_map_ctx_update_elem(bpf_sock, map, key, flags);
+}
+
+const struct bpf_func_proto bpf_sock_map_update_proto = { /* signature of bpf_sock_map_update */
+ .func = bpf_sock_map_update,
+ .gpl_only = false,
+ .pkt_access = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX, /* bpf_sock */
+ .arg2_type = ARG_CONST_MAP_PTR, /* map */
+ .arg3_type = ARG_PTR_TO_MAP_KEY, /* key */
+ .arg4_type = ARG_ANYTHING, /* flags */
+};
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index 31147d730abf..135be433e9a0 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -31,7 +31,8 @@ static int prealloc_elems_and_freelist(struct bpf_stack_map *smap)
u32 elem_size = sizeof(struct stack_map_bucket) + smap->map.value_size;
int err;
- smap->elems = bpf_map_area_alloc(elem_size * smap->map.max_entries);
+ smap->elems = bpf_map_area_alloc(elem_size * smap->map.max_entries,
+ smap->map.numa_node);
if (!smap->elems)
return -ENOMEM;
@@ -59,7 +60,7 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
if (!capable(CAP_SYS_ADMIN))
return ERR_PTR(-EPERM);
- if (attr->map_flags)
+ if (attr->map_flags & ~BPF_F_NUMA_NODE)
return ERR_PTR(-EINVAL);
/* check sanity of attributes */
@@ -75,7 +76,7 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
if (cost >= U32_MAX - PAGE_SIZE)
return ERR_PTR(-E2BIG);
- smap = bpf_map_area_alloc(cost);
+ smap = bpf_map_area_alloc(cost, bpf_map_attr_numa_node(attr));
if (!smap)
return ERR_PTR(-ENOMEM);
@@ -91,6 +92,7 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
smap->map.map_flags = attr->map_flags;
smap->n_buckets = n_buckets;
smap->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
+ smap->map.numa_node = bpf_map_attr_numa_node(attr);
err = bpf_map_precharge_memlock(smap->map.pages);
if (err)
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 6c772adabad2..25d074920a00 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -48,6 +48,47 @@ static const struct bpf_map_ops * const bpf_map_types[] = {
#undef BPF_MAP_TYPE
};
+/*
+ * User space may hand us a struct larger than the one this kernel knows.
+ * Accept it only when every byte past expected_size is zero, i.e. newer
+ * user space is not relying on a kernel extension we do not implement.
+ *
+ * Returns 0 when the tail is absent or all zero, -E2BIG when actual_size
+ * exceeds PAGE_SIZE or a tail byte is non-zero, -EFAULT when the buffer is
+ * not readable, or the get_user() error.
+ *
+ * There is a ToCToU between this function call and the following
+ * copy_from_user() call. However, this is not a concern since this function is
+ * meant to be a future-proofing of bits.
+ */
+static int check_uarg_tail_zero(void __user *uaddr,
+				size_t expected_size,
+				size_t actual_size)
+{
+	unsigned char __user *cur;
+	unsigned char byte;
+	size_t left;
+	int ret;
+
+	if (unlikely(actual_size > PAGE_SIZE))	/* silly large */
+		return -E2BIG;
+
+	if (unlikely(!access_ok(VERIFY_READ, uaddr, actual_size)))
+		return -EFAULT;
+
+	if (actual_size <= expected_size)
+		return 0;
+
+	cur = (unsigned char __user *)uaddr + expected_size;
+	for (left = actual_size - expected_size; left; left--, cur++) {
+		ret = get_user(byte, cur);
+		if (ret)
+			return ret;
+		if (byte)
+			return -E2BIG;
+	}
+
+	return 0;
+}
+
static struct bpf_map *find_and_alloc_map(union bpf_attr *attr)
{
struct bpf_map *map;
@@ -64,7 +105,7 @@ static struct bpf_map *find_and_alloc_map(union bpf_attr *attr)
return map;
}
-void *bpf_map_area_alloc(size_t size)
+void *bpf_map_area_alloc(size_t size, int numa_node)
{
/* We definitely need __GFP_NORETRY, so OOM killer doesn't
* trigger under memory pressure as we really just want to
@@ -74,12 +115,13 @@ void *bpf_map_area_alloc(size_t size)
void *area;
if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) {
- area = kmalloc(size, GFP_USER | flags);
+ area = kmalloc_node(size, GFP_USER | flags, numa_node);
if (area != NULL)
return area;
}
- return __vmalloc(size, GFP_KERNEL | flags, PAGE_KERNEL);
+ return __vmalloc_node_flags_caller(size, numa_node, GFP_KERNEL | flags,
+ __builtin_return_address(0));
}
void bpf_map_area_free(void *area)
@@ -144,15 +186,17 @@ static int bpf_map_alloc_id(struct bpf_map *map)
static void bpf_map_free_id(struct bpf_map *map, bool do_idr_lock)
{
+ unsigned long flags;
+
if (do_idr_lock)
- spin_lock_bh(&map_idr_lock);
+ spin_lock_irqsave(&map_idr_lock, flags);
else
__acquire(&map_idr_lock);
idr_remove(&map_idr, map->id);
if (do_idr_lock)
- spin_unlock_bh(&map_idr_lock);
+ spin_unlock_irqrestore(&map_idr_lock, flags);
else
__release(&map_idr_lock);
}
@@ -268,10 +312,11 @@ int bpf_map_new_fd(struct bpf_map *map)
offsetof(union bpf_attr, CMD##_LAST_FIELD) - \
sizeof(attr->CMD##_LAST_FIELD)) != NULL
-#define BPF_MAP_CREATE_LAST_FIELD inner_map_fd
+#define BPF_MAP_CREATE_LAST_FIELD numa_node
/* called via syscall */
static int map_create(union bpf_attr *attr)
{
+ int numa_node = bpf_map_attr_numa_node(attr);
struct bpf_map *map;
int err;
@@ -279,6 +324,11 @@ static int map_create(union bpf_attr *attr)
if (err)
return -EINVAL;
+ if (numa_node != NUMA_NO_NODE &&
+ ((unsigned int)numa_node >= nr_node_ids ||
+ !node_online(numa_node)))
+ return -EINVAL;
+
/* find map type and init map: hashtable vs rbtree vs bloom vs ... */
map = find_and_alloc_map(attr);
if (IS_ERR(map))
@@ -870,7 +920,7 @@ struct bpf_prog *bpf_prog_inc(struct bpf_prog *prog)
EXPORT_SYMBOL_GPL(bpf_prog_inc);
/* prog_idr_lock should have been held */
-static struct bpf_prog *bpf_prog_inc_not_zero(struct bpf_prog *prog)
+struct bpf_prog *bpf_prog_inc_not_zero(struct bpf_prog *prog)
{
int refold;
@@ -886,6 +936,7 @@ static struct bpf_prog *bpf_prog_inc_not_zero(struct bpf_prog *prog)
return prog;
}
+EXPORT_SYMBOL_GPL(bpf_prog_inc_not_zero);
static struct bpf_prog *__bpf_prog_get(u32 ufd, enum bpf_prog_type *type)
{
@@ -1047,6 +1098,40 @@ static int bpf_obj_get(const union bpf_attr *attr)
#define BPF_PROG_ATTACH_LAST_FIELD attach_flags
+/* Attach (@attach true) or detach (@attach false) an SK_SKB program on the
+ * map referenced by attr->target_fd. Returns 0 on success or a negative
+ * errno; the program reference taken for an attach is dropped again if
+ * sock_map_prog() fails.
+ */
+static int sockmap_get_from_fd(const union bpf_attr *attr, bool attach)
+{
+	struct bpf_prog *prog = NULL;
+	struct bpf_map *map;
+	struct fd f;
+	int err;
+
+	f = fdget(attr->target_fd);
+	map = __bpf_map_get(f);
+	/* NOTE(review): no fdput() here - presumably __bpf_map_get() drops
+	 * the fd reference itself on failure; confirm.
+	 */
+	if (IS_ERR(map))
+		return PTR_ERR(map);
+
+	if (attach) {
+		prog = bpf_prog_get_type(attr->attach_bpf_fd,
+					 BPF_PROG_TYPE_SK_SKB);
+		if (IS_ERR(prog)) {
+			err = PTR_ERR(prog);
+			goto out_fdput;
+		}
+	}
+
+	err = sock_map_prog(map, prog, attr->attach_type);
+	/* only an attach holds a program reference that needs undoing */
+	if (err && prog)
+		bpf_prog_put(prog);
+
+out_fdput:
+	fdput(f);
+	return err;
+}
+
static int bpf_prog_attach(const union bpf_attr *attr)
{
enum bpf_prog_type ptype;
@@ -1074,6 +1159,9 @@ static int bpf_prog_attach(const union bpf_attr *attr)
case BPF_CGROUP_SOCK_OPS:
ptype = BPF_PROG_TYPE_SOCK_OPS;
break;
+ case BPF_SK_SKB_STREAM_PARSER:
+ case BPF_SK_SKB_STREAM_VERDICT:
+ return sockmap_get_from_fd(attr, true);
default:
return -EINVAL;
}
@@ -1122,7 +1210,10 @@ static int bpf_prog_detach(const union bpf_attr *attr)
ret = cgroup_bpf_update(cgrp, NULL, attr->attach_type, false);
cgroup_put(cgrp);
break;
-
+ case BPF_SK_SKB_STREAM_PARSER:
+ case BPF_SK_SKB_STREAM_VERDICT:
+ ret = sockmap_get_from_fd(attr, false);
+ break;
default:
return -EINVAL;
}
@@ -1246,32 +1337,6 @@ static int bpf_map_get_fd_by_id(const union bpf_attr *attr)
return fd;
}
-static int check_uarg_tail_zero(void __user *uaddr,
- size_t expected_size,
- size_t actual_size)
-{
- unsigned char __user *addr;
- unsigned char __user *end;
- unsigned char val;
- int err;
-
- if (actual_size <= expected_size)
- return 0;
-
- addr = uaddr + expected_size;
- end = uaddr + actual_size;
-
- for (; addr < end; addr++) {
- err = get_user(val, addr);
- if (err)
- return err;
- if (val)
- return -E2BIG;
- }
-
- return 0;
-}
-
static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,
const union bpf_attr *attr,
union bpf_attr __user *uattr)
@@ -1393,17 +1458,6 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
if (!capable(CAP_SYS_ADMIN) && sysctl_unprivileged_bpf_disabled)
return -EPERM;
- if (!access_ok(VERIFY_READ, uattr, 1))
- return -EFAULT;
-
- if (size > PAGE_SIZE) /* silly large */
- return -E2BIG;
-
- /* If we're handed a bigger struct than we know of,
- * ensure all the unknown bits are 0 - i.e. new
- * user-space does not rely on any kernel feature
- * extensions we dont know about yet.
- */
err = check_uarg_tail_zero(uattr, sizeof(attr), size);
if (err)
return err;
diff --git a/kernel/bpf/tnum.c b/kernel/bpf/tnum.c
new file mode 100644
index 000000000000..1f4bf68c12db
--- /dev/null
+++ b/kernel/bpf/tnum.c
@@ -0,0 +1,180 @@
+/* tnum: tracked (or tristate) numbers
+ *
+ * A tnum tracks knowledge about the bits of a value. Each bit can be either
+ * known (0 or 1), or unknown (x). Arithmetic operations on tnums will
+ * propagate the unknown bits such that the tnum result represents all the
+ * possible results for possible values of the operands.
+ */
+#include <linux/kernel.h>
+#include <linux/tnum.h>
+
+/* Build a struct tnum compound literal from a value/mask pair */
+#define TNUM(_v, _m) (struct tnum){.value = _v, .mask = _m}
+/* A completely unknown value: no known bits, every mask bit set */
+const struct tnum tnum_unknown = { .value = 0, .mask = -1 };
+
+/* Return the tnum for a constant: @value as the known bits, empty mask */
+struct tnum tnum_const(u64 value)
+{
+	struct tnum known = TNUM(value, 0);
+
+	return known;
+}
+
+/* Return a tnum for the range [@min, @max]: bits above the highest bit in
+ * which the two bounds differ are taken from @min, every bit at or below it
+ * is marked unknown.
+ */
+struct tnum tnum_range(u64 min, u64 max)
+{
+	u64 diff = min ^ max;
+	u8 nbits = fls64(diff);
+	u64 low_mask;
+
+	/* special case, needed because 1ULL << 64 is undefined */
+	if (nbits > 63)
+		return tnum_unknown;
+	/* e.g. diff = 4 gives nbits = 3 and low_mask = 7; diff = 0 gives
+	 * nbits = 0 and low_mask = 0, so the result is the constant min
+	 * (since min == max).
+	 */
+	low_mask = (1ULL << nbits) - 1;
+	return TNUM(min & ~low_mask, low_mask);
+}
+
+/* Shift both the known bits and the unknown-bit mask of @a left by @shift.
+ * NOTE(review): @shift is not range-checked here; presumably callers keep it
+ * below the width of the value/mask fields - confirm.
+ */
+struct tnum tnum_lshift(struct tnum a, u8 shift)
+{
+	u64 value = a.value << shift;
+	u64 mask = a.mask << shift;
+
+	return TNUM(value, mask);
+}
+
+/* Shift both the known bits and the unknown-bit mask of @a right by @shift.
+ * NOTE(review): @shift is not range-checked here; presumably callers keep it
+ * below the width of the value/mask fields - confirm.
+ */
+struct tnum tnum_rshift(struct tnum a, u8 shift)
+{
+	u64 value = a.value >> shift;
+	u64 mask = a.mask >> shift;
+
+	return TNUM(value, mask);
+}
+
+/* Add two tnums. A result bit is unknown if it is unknown in either operand
+ * or if it changes when both masks are added on top of the value sum.
+ */
+struct tnum tnum_add(struct tnum a, struct tnum b)
+{
+	u64 mask_sum = a.mask + b.mask;
+	u64 value_sum = a.value + b.value;
+	/* bits of the value sum that flip once the masks are added in */
+	u64 flipped = (mask_sum + value_sum) ^ value_sum;
+	u64 unknown = flipped | a.mask | b.mask;
+
+	return TNUM(value_sum & ~unknown, unknown);
+}
+
+/* Subtract @b from @a. A result bit is unknown if it is unknown in either
+ * operand or if it differs between (difference + a.mask) and
+ * (difference - b.mask).
+ */
+struct tnum tnum_sub(struct tnum a, struct tnum b)
+{
+	u64 diff = a.value - b.value;
+	u64 upper = diff + a.mask;
+	u64 lower = diff - b.mask;
+	u64 unknown = (upper ^ lower) | a.mask | b.mask;
+
+	return TNUM(diff & ~unknown, unknown);
+}
+
+/* Bitwise AND of two tnums: a bit is a known 1 only if it is a known 1 in
+ * both operands, and unknown if it may be set in both but is not a known 1.
+ */
+struct tnum tnum_and(struct tnum a, struct tnum b)
+{
+	u64 known_ones = a.value & b.value;
+	u64 a_maybe = a.value | a.mask;
+	u64 b_maybe = b.value | b.mask;
+
+	return TNUM(known_ones, a_maybe & b_maybe & ~known_ones);
+}
+
+/* Bitwise OR of two tnums: a known 1 in either operand is a known 1 in the
+ * result; the remaining bits are unknown if unknown in either operand.
+ */
+struct tnum tnum_or(struct tnum a, struct tnum b)
+{
+	u64 ones = a.value | b.value;
+	u64 unknown = (a.mask | b.mask) & ~ones;
+
+	return TNUM(ones, unknown);
+}
+
+/* Bitwise XOR of two tnums: a bit is unknown if it is unknown in either
+ * operand; all other bits are the XOR of the known bits.
+ */
+struct tnum tnum_xor(struct tnum a, struct tnum b)
+{
+	u64 unknown = a.mask | b.mask;
+	u64 bits = (a.value ^ b.value) & ~unknown;
+
+	return TNUM(bits, unknown);
+}
+
+/* half-multiply add: acc += (unknown * mask * value).
+ * An intermediate step in the multiply algorithm: for every set bit of
+ * @mask, a tnum whose unknown bits are @value (shifted left to that bit's
+ * position) is added to @acc.
+ */
+static struct tnum hma(struct tnum acc, u64 value, u64 mask)
+{
+	for (; mask; mask >>= 1, value <<= 1) {
+		if (mask & 1)
+			acc = tnum_add(acc, TNUM(0, value));
+	}
+	return acc;
+}
+
+/* Multiply two tnums: start from the product of the known bits, then fold
+ * in the unknown contributions with two hma() passes.
+ */
+struct tnum tnum_mul(struct tnum a, struct tnum b)
+{
+	u64 product = a.value * b.value;
+	struct tnum partial;
+
+	partial = hma(TNUM(product, 0), a.mask, b.mask | b.value);
+	return hma(partial, b.mask, a.value);
+}
+
+/* Combine the knowledge of @a and @b: a bit stays unknown only if it is
+ * unknown in both. Note that if a and b disagree - i.e. one has a 'known 1'
+ * where the other has a 'known 0' - this will return a 'known 1' for that
+ * bit.
+ */
+struct tnum tnum_intersect(struct tnum a, struct tnum b)
+{
+	u64 unknown = a.mask & b.mask;
+	u64 ones = (a.value | b.value) & ~unknown;
+
+	return TNUM(ones, unknown);
+}
+
+/* Truncate @a to its low @size bytes, clearing the higher bits of both the
+ * known value and the unknown-bit mask. A @size of 8 or more leaves @a
+ * unchanged.
+ */
+struct tnum tnum_cast(struct tnum a, u8 size)
+{
+	u64 keep;
+
+	/* size * 8 >= 64 would make the shift below undefined */
+	if (size >= 8)
+		return a;
+
+	keep = (1ULL << (size * 8)) - 1;
+	a.value &= keep;
+	a.mask &= keep;
+	return a;
+}
+
+/* Return true if no bit that may be set in @a (known 1 or unknown) falls
+ * within @size - 1. A @size of 0 is always reported as aligned.
+ */
+bool tnum_is_aligned(struct tnum a, u64 size)
+{
+	u64 maybe_set = a.value | a.mask;
+
+	return !size || !(maybe_set & (size - 1));
+}
+
+/* Return true if @b has no unknown bit where @a has a known one, and @b's
+ * value agrees with @a's on every bit that @a knows.
+ */
+bool tnum_in(struct tnum a, struct tnum b)
+{
+	/* b is unknown somewhere a is known */
+	if (b.mask & ~a.mask)
+		return false;
+	return (b.value & ~a.mask) == a.value;
+}
+
+/* Format @a into @str as "(value; mask)" in hex, writing at most @size
+ * bytes. Returns whatever snprintf() returns.
+ */
+int tnum_strn(char *str, size_t size, struct tnum a)
+{
+	int len;
+
+	len = snprintf(str, size, "(%#llx; %#llx)", a.value, a.mask);
+	return len;
+}
+EXPORT_SYMBOL_GPL(tnum_strn);
+
+/* Render @a as a 64-character binary string in @str, most significant bit
+ * first: 'x' for an unknown bit, otherwise '1' or '0'. Characters that do
+ * not fit in @size bytes (including the terminator) are dropped. Always
+ * returns 64; nothing is written when @size is 0.
+ */
+int tnum_sbin(char *str, size_t size, struct tnum a)
+{
+	size_t n;
+
+	/* walk from the least significant bit, which is str[63] */
+	for (n = 64; n; n--) {
+		if (n < size) {
+			if (a.mask & 1)
+				str[n - 1] = 'x';
+			else if (a.value & 1)
+				str[n - 1] = '1';
+			else
+				str[n - 1] = '0';
+		}
+		a.mask >>= 1;
+		a.value >>= 1;
+	}
+	/* size == 0 would wrap size - 1 and put the terminator at str[64] */
+	if (size)
+		str[min(size - 1, (size_t)64)] = 0;
+	return 64;
+}
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 664d93972373..b914fbe1383e 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -61,12 +61,12 @@
* (and -20 constant is saved for further stack bounds checking).
* Meaning that this reg is a pointer to stack plus known immediate constant.
*
- * Most of the time the registers have UNKNOWN_VALUE type, which
+ * Most of the time the registers have SCALAR_VALUE type, which
* means the register has some value, but it's not a valid pointer.
- * (like pointer plus pointer becomes UNKNOWN_VALUE type)
+ * (like pointer plus pointer becomes SCALAR_VALUE type)
*
* When verifier sees load or store instructions the type of base register
- * can be: PTR_TO_MAP_VALUE, PTR_TO_CTX, FRAME_PTR. These are three pointer
+ * can be: PTR_TO_MAP_VALUE, PTR_TO_CTX, PTR_TO_STACK. These are three pointer
* types recognized by check_mem_access() function.
*
* PTR_TO_MAP_VALUE means that this register is pointing to 'map element value'
@@ -140,7 +140,7 @@ struct bpf_verifier_stack_elem {
struct bpf_verifier_stack_elem *next;
};
-#define BPF_COMPLEXITY_LIMIT_INSNS 98304
+#define BPF_COMPLEXITY_LIMIT_INSNS 131072
#define BPF_COMPLEXITY_LIMIT_STACK 1024
#define BPF_MAP_PTR_POISON ((void *)0xeB9F + POISON_POINTER_DELTA)
@@ -180,15 +180,12 @@ static __printf(1, 2) void verbose(const char *fmt, ...)
/* string representation of 'enum bpf_reg_type' */
static const char * const reg_type_str[] = {
[NOT_INIT] = "?",
- [UNKNOWN_VALUE] = "inv",
+ [SCALAR_VALUE] = "inv",
[PTR_TO_CTX] = "ctx",
[CONST_PTR_TO_MAP] = "map_ptr",
[PTR_TO_MAP_VALUE] = "map_value",
[PTR_TO_MAP_VALUE_OR_NULL] = "map_value_or_null",
- [PTR_TO_MAP_VALUE_ADJ] = "map_value_adj",
- [FRAME_PTR] = "fp",
[PTR_TO_STACK] = "fp",
- [CONST_IMM] = "imm",
[PTR_TO_PACKET] = "pkt",
[PTR_TO_PACKET_END] = "pkt_end",
};
@@ -221,32 +218,52 @@ static void print_verifier_state(struct bpf_verifier_state *state)
if (t == NOT_INIT)
continue;
verbose(" R%d=%s", i, reg_type_str[t]);
- if (t == CONST_IMM || t == PTR_TO_STACK)
- verbose("%lld", reg->imm);
- else if (t == PTR_TO_PACKET)
- verbose("(id=%d,off=%d,r=%d)",
- reg->id, reg->off, reg->range);
- else if (t == UNKNOWN_VALUE && reg->imm)
- verbose("%lld", reg->imm);
- else if (t == CONST_PTR_TO_MAP || t == PTR_TO_MAP_VALUE ||
- t == PTR_TO_MAP_VALUE_OR_NULL ||
- t == PTR_TO_MAP_VALUE_ADJ)
- verbose("(ks=%d,vs=%d,id=%u)",
- reg->map_ptr->key_size,
- reg->map_ptr->value_size,
- reg->id);
- if (reg->min_value != BPF_REGISTER_MIN_RANGE)
- verbose(",min_value=%lld",
- (long long)reg->min_value);
- if (reg->max_value != BPF_REGISTER_MAX_RANGE)
- verbose(",max_value=%llu",
- (unsigned long long)reg->max_value);
- if (reg->min_align)
- verbose(",min_align=%u", reg->min_align);
- if (reg->aux_off)
- verbose(",aux_off=%u", reg->aux_off);
- if (reg->aux_off_align)
- verbose(",aux_off_align=%u", reg->aux_off_align);
+ if ((t == SCALAR_VALUE || t == PTR_TO_STACK) &&
+ tnum_is_const(reg->var_off)) {
+ /* reg->off should be 0 for SCALAR_VALUE */
+ verbose("%lld", reg->var_off.value + reg->off);
+ } else {
+ verbose("(id=%d", reg->id);
+ if (t != SCALAR_VALUE)
+ verbose(",off=%d", reg->off);
+ if (t == PTR_TO_PACKET)
+ verbose(",r=%d", reg->range);
+ else if (t == CONST_PTR_TO_MAP ||
+ t == PTR_TO_MAP_VALUE ||
+ t == PTR_TO_MAP_VALUE_OR_NULL)
+ verbose(",ks=%d,vs=%d",
+ reg->map_ptr->key_size,
+ reg->map_ptr->value_size);
+ if (tnum_is_const(reg->var_off)) {
+ /* Typically an immediate SCALAR_VALUE, but
+ * could be a pointer whose offset is too big
+ * for reg->off
+ */
+ verbose(",imm=%llx", reg->var_off.value);
+ } else {
+ if (reg->smin_value != reg->umin_value &&
+ reg->smin_value != S64_MIN)
+ verbose(",smin_value=%lld",
+ (long long)reg->smin_value);
+ if (reg->smax_value != reg->umax_value &&
+ reg->smax_value != S64_MAX)
+ verbose(",smax_value=%lld",
+ (long long)reg->smax_value);
+ if (reg->umin_value != 0)
+ verbose(",umin_value=%llu",
+ (unsigned long long)reg->umin_value);
+ if (reg->umax_value != U64_MAX)
+ verbose(",umax_value=%llu",
+ (unsigned long long)reg->umax_value);
+ if (!tnum_is_unknown(reg->var_off)) {
+ char tn_buf[48];
+
+ tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
+ verbose(",var_off=%s", tn_buf);
+ }
+ }
+ verbose(")");
+ }
}
for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) {
if (state->stack_slot_type[i] == STACK_SPILL)
@@ -295,11 +312,15 @@ static const char *const bpf_jmp_string[16] = {
[BPF_JA >> 4] = "jmp",
[BPF_JEQ >> 4] = "==",
[BPF_JGT >> 4] = ">",
+ [BPF_JLT >> 4] = "<",
[BPF_JGE >> 4] = ">=",
+ [BPF_JLE >> 4] = "<=",
[BPF_JSET >> 4] = "&",
[BPF_JNE >> 4] = "!=",
[BPF_JSGT >> 4] = "s>",
+ [BPF_JSLT >> 4] = "s<",
[BPF_JSGE >> 4] = "s>=",
+ [BPF_JSLE >> 4] = "s<=",
[BPF_CALL >> 4] = "call",
[BPF_EXIT >> 4] = "exit",
};
@@ -463,56 +484,163 @@ static const int caller_saved[CALLER_SAVED_REGS] = {
BPF_REG_0, BPF_REG_1, BPF_REG_2, BPF_REG_3, BPF_REG_4, BPF_REG_5
};
-static void mark_reg_not_init(struct bpf_reg_state *regs, u32 regno)
+static void __mark_reg_not_init(struct bpf_reg_state *reg);
+
+/* Mark the unknown part of a register (variable offset or scalar value) as
+ * known to have the value @imm.
+ */
+static void __mark_reg_known(struct bpf_reg_state *reg, u64 imm)
{
- BUG_ON(regno >= MAX_BPF_REG);
+ reg->id = 0;
+ reg->var_off = tnum_const(imm);
+ reg->smin_value = (s64)imm;
+ reg->smax_value = (s64)imm;
+ reg->umin_value = imm;
+ reg->umax_value = imm;
+}
- memset(&regs[regno], 0, sizeof(regs[regno]));
- regs[regno].type = NOT_INIT;
- regs[regno].min_value = BPF_REGISTER_MIN_RANGE;
- regs[regno].max_value = BPF_REGISTER_MAX_RANGE;
+/* Mark the 'variable offset' part of a register as zero. This should be
+ * used only on registers holding a pointer type.
+ */
+static void __mark_reg_known_zero(struct bpf_reg_state *reg)
+{
+ __mark_reg_known(reg, 0);
}
-static void init_reg_state(struct bpf_reg_state *regs)
+static void mark_reg_known_zero(struct bpf_reg_state *regs, u32 regno)
{
- int i;
+ if (WARN_ON(regno >= MAX_BPF_REG)) {
+ verbose("mark_reg_known_zero(regs, %u)\n", regno);
+ /* Something bad happened, let's kill all regs */
+ for (regno = 0; regno < MAX_BPF_REG; regno++)
+ __mark_reg_not_init(regs + regno);
+ return;
+ }
+ __mark_reg_known_zero(regs + regno);
+}
- for (i = 0; i < MAX_BPF_REG; i++)
- mark_reg_not_init(regs, i);
+/* Tighten the signed and unsigned min/max of @reg using the known and
+ * unknown bits recorded in reg->var_off. Bounds are only ever narrowed.
+ */
+static void __update_reg_bounds(struct bpf_reg_state *reg)
+{
+	u64 value = reg->var_off.value;
+	u64 mask = reg->var_off.mask;
+	/* min signed is max(sign bit) | min(other bits) */
+	u64 smin_bits = value | (mask & S64_MIN);
+	/* max signed is min(sign bit) | max(other bits) */
+	u64 smax_bits = value | (mask & S64_MAX);
+	u64 umax_bits = value | mask;
+
+	reg->smin_value = max_t(s64, reg->smin_value, smin_bits);
+	reg->smax_value = min_t(s64, reg->smax_value, smax_bits);
+	reg->umin_value = max(reg->umin_value, value);
+	reg->umax_value = min(reg->umax_value, umax_bits);
+}
- /* frame pointer */
- regs[BPF_REG_FP].type = FRAME_PTR;
+/* Uses signed min/max values to inform unsigned, and vice-versa.
+ * Three cases are handled: the signed range lies on one side of zero, the
+ * unsigned maximum has its sign bit clear, or the unsigned minimum has its
+ * sign bit set. In any other situation the bounds are left untouched.
+ */
+static void __reg_deduce_bounds(struct bpf_reg_state *reg)
+{
+ /* Learn sign from signed bounds.
+ * If we cannot cross the sign boundary, then signed and unsigned bounds
+ * are the same, so combine. This works even in the negative case, e.g.
+ * -3 s<= x s<= -1 implies 0xf...fd u<= x u<= 0xf...ff.
+ */
+ if (reg->smin_value >= 0 || reg->smax_value < 0) {
+ /* both comparisons are done as u64, then stored to both views */
+ reg->smin_value = reg->umin_value = max_t(u64, reg->smin_value,
+ reg->umin_value);
+ reg->smax_value = reg->umax_value = min_t(u64, reg->smax_value,
+ reg->umax_value);
+ return;
+ }
+ /* Learn sign from unsigned bounds. Signed bounds cross the sign
+ * boundary, so we must be careful.
+ */
+ if ((s64)reg->umax_value >= 0) {
+ /* Positive. We can't learn anything from the smin, but smax
+ * is positive, hence safe.
+ */
+ reg->smin_value = reg->umin_value;
+ reg->smax_value = reg->umax_value = min_t(u64, reg->smax_value,
+ reg->umax_value);
+ } else if ((s64)reg->umin_value < 0) {
+ /* Negative. We can't learn anything from the smax, but smin
+ * is negative, hence safe.
+ */
+ reg->smin_value = reg->umin_value = max_t(u64, reg->smin_value,
+ reg->umin_value);
+ reg->smax_value = reg->umax_value;
+ }
+}
- /* 1st arg to a function */
- regs[BPF_REG_1].type = PTR_TO_CTX;
+/* Narrow reg->var_off by intersecting it with the tnum that covers the
+ * register's unsigned [umin_value, umax_value] range.
+ */
+static void __reg_bound_offset(struct bpf_reg_state *reg)
+{
+	struct tnum range = tnum_range(reg->umin_value, reg->umax_value);
+
+	reg->var_off = tnum_intersect(reg->var_off, range);
+}
+
+/* Reset the min/max bounds of a register */
+static void __mark_reg_unbounded(struct bpf_reg_state *reg)
+{
+ reg->smin_value = S64_MIN;
+ reg->smax_value = S64_MAX;
+ reg->umin_value = 0;
+ reg->umax_value = U64_MAX;
}
-static void __mark_reg_unknown_value(struct bpf_reg_state *regs, u32 regno)
+/* Mark a register as having a completely unknown (scalar) value. */
+static void __mark_reg_unknown(struct bpf_reg_state *reg)
{
- regs[regno].type = UNKNOWN_VALUE;
- regs[regno].id = 0;
- regs[regno].imm = 0;
+ reg->type = SCALAR_VALUE;
+ reg->id = 0;
+ reg->off = 0;
+ reg->var_off = tnum_unknown;
+ __mark_reg_unbounded(reg);
}
-static void mark_reg_unknown_value(struct bpf_reg_state *regs, u32 regno)
+static void mark_reg_unknown(struct bpf_reg_state *regs, u32 regno)
{
- BUG_ON(regno >= MAX_BPF_REG);
- __mark_reg_unknown_value(regs, regno);
+ if (WARN_ON(regno >= MAX_BPF_REG)) {
+ verbose("mark_reg_unknown(regs, %u)\n", regno);
+ /* Something bad happened, let's kill all regs */
+ for (regno = 0; regno < MAX_BPF_REG; regno++)
+ __mark_reg_not_init(regs + regno);
+ return;
+ }
+ __mark_reg_unknown(regs + regno);
}
-static void reset_reg_range_values(struct bpf_reg_state *regs, u32 regno)
+static void __mark_reg_not_init(struct bpf_reg_state *reg)
{
- regs[regno].min_value = BPF_REGISTER_MIN_RANGE;
- regs[regno].max_value = BPF_REGISTER_MAX_RANGE;
- regs[regno].value_from_signed = false;
- regs[regno].min_align = 0;
+ __mark_reg_unknown(reg);
+ reg->type = NOT_INIT;
}
-static void mark_reg_unknown_value_and_range(struct bpf_reg_state *regs,
- u32 regno)
+static void mark_reg_not_init(struct bpf_reg_state *regs, u32 regno)
{
- mark_reg_unknown_value(regs, regno);
- reset_reg_range_values(regs, regno);
+ if (WARN_ON(regno >= MAX_BPF_REG)) {
+ verbose("mark_reg_not_init(regs, %u)\n", regno);
+ /* Something bad happened, let's kill all regs */
+ for (regno = 0; regno < MAX_BPF_REG; regno++)
+ __mark_reg_not_init(regs + regno);
+ return;
+ }
+ __mark_reg_not_init(regs + regno);
+}
+
+static void init_reg_state(struct bpf_reg_state *regs)
+{
+ int i;
+
+ for (i = 0; i < MAX_BPF_REG; i++) {
+ mark_reg_not_init(regs, i);
+ regs[i].live = REG_LIVE_NONE;
+ }
+
+ /* frame pointer */
+ regs[BPF_REG_FP].type = PTR_TO_STACK;
+ mark_reg_known_zero(regs, BPF_REG_FP);
+
+ /* 1st arg to a function */
+ regs[BPF_REG_1].type = PTR_TO_CTX;
+ mark_reg_known_zero(regs, BPF_REG_1);
}
enum reg_arg_type {
@@ -521,9 +649,26 @@ enum reg_arg_type {
DST_OP_NO_MARK /* same as above, check only, don't mark */
};
-static int check_reg_arg(struct bpf_reg_state *regs, u32 regno,
+/* Record a read of register @regno: walk up the chain of parent states,
+ * setting REG_LIVE_READ on each parent's copy, and stop at the first state
+ * in which the register is marked REG_LIVE_WRITTEN.
+ */
+static void mark_reg_read(const struct bpf_verifier_state *state, u32 regno)
+{
+	struct bpf_verifier_state *parent;
+
+	for (parent = state->parent; parent; parent = state->parent) {
+		/* the read was screened by an earlier write in this state */
+		if (state->regs[regno].live & REG_LIVE_WRITTEN)
+			return;
+		/* otherwise we depend on the parent's value */
+		parent->regs[regno].live |= REG_LIVE_READ;
+		state = parent;
+	}
+}
+
+static int check_reg_arg(struct bpf_verifier_env *env, u32 regno,
enum reg_arg_type t)
{
+ struct bpf_reg_state *regs = env->cur_state.regs;
+
if (regno >= MAX_BPF_REG) {
verbose("R%d is invalid\n", regno);
return -EINVAL;
@@ -535,14 +680,16 @@ static int check_reg_arg(struct bpf_reg_state *regs, u32 regno,
verbose("R%d !read_ok\n", regno);
return -EACCES;
}
+ mark_reg_read(&env->cur_state, regno);
} else {
/* check whether register used as dest operand can be written to */
if (regno == BPF_REG_FP) {
verbose("frame pointer is read only\n");
return -EACCES;
}
+ regs[regno].live |= REG_LIVE_WRITTEN;
if (t == DST_OP)
- mark_reg_unknown_value(regs, regno);
+ mark_reg_unknown(regs, regno);
}
return 0;
}
@@ -552,12 +699,10 @@ static bool is_spillable_regtype(enum bpf_reg_type type)
switch (type) {
case PTR_TO_MAP_VALUE:
case PTR_TO_MAP_VALUE_OR_NULL:
- case PTR_TO_MAP_VALUE_ADJ:
case PTR_TO_STACK:
case PTR_TO_CTX:
case PTR_TO_PACKET:
case PTR_TO_PACKET_END:
- case FRAME_PTR:
case CONST_PTR_TO_MAP:
return true;
default:
@@ -571,7 +716,7 @@ static bool is_spillable_regtype(enum bpf_reg_type type)
static int check_stack_write(struct bpf_verifier_state *state, int off,
int size, int value_regno)
{
- int i;
+ int i, spi = (MAX_BPF_STACK + off) / BPF_REG_SIZE;
/* caller checked that off % size == 0 and -MAX_BPF_STACK <= off < 0,
* so it's aligned access and [off, off + size) are within stack limits
*/
@@ -586,15 +731,14 @@ static int check_stack_write(struct bpf_verifier_state *state, int off,
}
/* save register state */
- state->spilled_regs[(MAX_BPF_STACK + off) / BPF_REG_SIZE] =
- state->regs[value_regno];
+ state->spilled_regs[spi] = state->regs[value_regno];
+ state->spilled_regs[spi].live |= REG_LIVE_WRITTEN;
for (i = 0; i < BPF_REG_SIZE; i++)
state->stack_slot_type[MAX_BPF_STACK + off + i] = STACK_SPILL;
} else {
/* regular write of data into stack */
- state->spilled_regs[(MAX_BPF_STACK + off) / BPF_REG_SIZE] =
- (struct bpf_reg_state) {};
+ state->spilled_regs[spi] = (struct bpf_reg_state) {};
for (i = 0; i < size; i++)
state->stack_slot_type[MAX_BPF_STACK + off + i] = STACK_MISC;
@@ -602,11 +746,26 @@ static int check_stack_write(struct bpf_verifier_state *state, int off,
return 0;
}
+/* Record a read of spilled-register slot @slot: walk up the chain of parent
+ * states, setting REG_LIVE_READ on each parent's copy, and stop at the first
+ * state in which the slot is marked REG_LIVE_WRITTEN.
+ */
+static void mark_stack_slot_read(const struct bpf_verifier_state *state, int slot)
+{
+	struct bpf_verifier_state *parent;
+
+	for (parent = state->parent; parent; parent = state->parent) {
+		/* the read was screened by an earlier write in this state */
+		if (state->spilled_regs[slot].live & REG_LIVE_WRITTEN)
+			return;
+		/* otherwise we depend on the parent's value */
+		parent->spilled_regs[slot].live |= REG_LIVE_READ;
+		state = parent;
+	}
+}
+
static int check_stack_read(struct bpf_verifier_state *state, int off, int size,
int value_regno)
{
u8 *slot_type;
- int i;
+ int i, spi;
slot_type = &state->stack_slot_type[MAX_BPF_STACK + off];
@@ -622,10 +781,13 @@ static int check_stack_read(struct bpf_verifier_state *state, int off, int size,
}
}
- if (value_regno >= 0)
+ spi = (MAX_BPF_STACK + off) / BPF_REG_SIZE;
+
+ if (value_regno >= 0) {
/* restore register state from stack */
- state->regs[value_regno] =
- state->spilled_regs[(MAX_BPF_STACK + off) / BPF_REG_SIZE];
+ state->regs[value_regno] = state->spilled_regs[spi];
+ mark_stack_slot_read(state, spi);
+ }
return 0;
} else {
for (i = 0; i < size; i++) {
@@ -637,14 +799,13 @@ static int check_stack_read(struct bpf_verifier_state *state, int off, int size,
}
if (value_regno >= 0)
/* have read misc data from the stack */
- mark_reg_unknown_value_and_range(state->regs,
- value_regno);
+ mark_reg_unknown(state->regs, value_regno);
return 0;
}
}
/* check read/write into map element returned by bpf_map_lookup_elem() */
-static int check_map_access(struct bpf_verifier_env *env, u32 regno, int off,
+static int __check_map_access(struct bpf_verifier_env *env, u32 regno, int off,
int size)
{
struct bpf_map *map = env->cur_state.regs[regno].map_ptr;
@@ -657,49 +818,50 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno, int off,
return 0;
}
-/* check read/write into an adjusted map element */
-static int check_map_access_adj(struct bpf_verifier_env *env, u32 regno,
+/* check read/write into a map element with possible variable offset */
+static int check_map_access(struct bpf_verifier_env *env, u32 regno,
int off, int size)
{
struct bpf_verifier_state *state = &env->cur_state;
struct bpf_reg_state *reg = &state->regs[regno];
int err;
- /* We adjusted the register to this map value, so we
- * need to change off and size to min_value and max_value
- * respectively to make sure our theoretical access will be
- * safe.
+ /* We may have adjusted the register to this map value, so we
+ * need to try adding each of min_value and max_value to off
+ * to make sure our theoretical access will be safe.
*/
if (log_level)
print_verifier_state(state);
- env->varlen_map_value_access = true;
/* The minimum value is only important with signed
* comparisons where we can't assume the floor of a
* value is 0. If we are using signed variables for our
* index'es we need to make sure that whatever we use
* will have a set floor within our range.
*/
- if (reg->min_value < 0) {
+ if (reg->smin_value < 0) {
verbose("R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n",
regno);
return -EACCES;
}
- err = check_map_access(env, regno, reg->min_value + off, size);
+ err = __check_map_access(env, regno, reg->smin_value + off, size);
if (err) {
- verbose("R%d min value is outside of the array range\n",
- regno);
+ verbose("R%d min value is outside of the array range\n", regno);
return err;
}
- /* If we haven't set a max value then we need to bail
- * since we can't be sure we won't do bad things.
+ /* If we haven't set a max value then we need to bail since we can't be
+ * sure we won't do bad things.
+ * If reg->umax_value + off could overflow, treat that as unbounded too.
*/
- if (reg->max_value == BPF_REGISTER_MAX_RANGE) {
+ if (reg->umax_value >= BPF_MAX_VAR_OFF) {
verbose("R%d unbounded memory access, make sure to bounds check any array access into a map\n",
regno);
return -EACCES;
}
- return check_map_access(env, regno, reg->max_value + off, size);
+ err = __check_map_access(env, regno, reg->umax_value + off, size);
+ if (err)
+ verbose("R%d max value is outside of the array range\n", regno);
+ return err;
}
#define MAX_PACKET_OFF 0xffff
@@ -719,6 +881,7 @@ static bool may_access_direct_pkt_data(struct bpf_verifier_env *env,
case BPF_PROG_TYPE_SCHED_ACT:
case BPF_PROG_TYPE_XDP:
case BPF_PROG_TYPE_LWT_XMIT:
+ case BPF_PROG_TYPE_SK_SKB:
if (meta)
return meta->pkt_access;
@@ -729,14 +892,13 @@ static bool may_access_direct_pkt_data(struct bpf_verifier_env *env,
}
}
-static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off,
- int size)
+static int __check_packet_access(struct bpf_verifier_env *env, u32 regno,
+ int off, int size)
{
struct bpf_reg_state *regs = env->cur_state.regs;
struct bpf_reg_state *reg = &regs[regno];
- off += reg->off;
- if (off < 0 || size <= 0 || off + size > reg->range) {
+ if (off < 0 || size <= 0 || (u64)off + size > reg->range) {
verbose("invalid access to packet, off=%d size=%d, R%d(id=%d,off=%d,r=%d)\n",
off, size, regno, reg->id, reg->off, reg->range);
return -EACCES;
@@ -744,7 +906,35 @@ static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off,
return 0;
}
-/* check access to 'struct bpf_context' fields */
+/* Check an access of @size bytes at fixed offset @off through the packet
+ * pointer in register @regno. Returns 0 if allowed, -EACCES if the
+ * register's signed minimum is negative, or the error from
+ * __check_packet_access().
+ */
+static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off,
+			       int size)
+{
+	struct bpf_reg_state *reg = &env->cur_state.regs[regno];
+	int err;
+
+	/* We may have added a variable offset to the packet pointer; but any
+	 * reg->range we have comes after that. We are only checking the fixed
+	 * offset.
+	 */
+
+	/* We don't allow negative numbers, because we aren't tracking enough
+	 * detail to prove they're safe.
+	 */
+	if (reg->smin_value < 0) {
+		verbose("R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n",
+			regno);
+		return -EACCES;
+	}
+
+	err = __check_packet_access(env, regno, off, size);
+	if (err)
+		verbose("R%d offset is outside of the packet\n", regno);
+	return err;
+}
+
+/* check access to 'struct bpf_context' fields. Supports fixed offsets only */
static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, int size,
enum bpf_access_type t, enum bpf_reg_type *reg_type)
{
@@ -784,13 +974,7 @@ static bool __is_pointer_value(bool allow_ptr_leaks,
if (allow_ptr_leaks)
return false;
- switch (reg->type) {
- case UNKNOWN_VALUE:
- case CONST_IMM:
- return false;
- default:
- return true;
- }
+ return reg->type != SCALAR_VALUE;
}
static bool is_pointer_value(struct bpf_verifier_env *env, int regno)
@@ -801,23 +985,13 @@ static bool is_pointer_value(struct bpf_verifier_env *env, int regno)
static int check_pkt_ptr_alignment(const struct bpf_reg_state *reg,
int off, int size, bool strict)
{
+ struct tnum reg_off;
int ip_align;
- int reg_off;
/* Byte size accesses are always allowed. */
if (!strict || size == 1)
return 0;
- reg_off = reg->off;
- if (reg->id) {
- if (reg->aux_off_align % size) {
- verbose("Packet access is only %u byte aligned, %d byte access not allowed\n",
- reg->aux_off_align, size);
- return -EACCES;
- }
- reg_off += reg->aux_off;
- }
-
/* For platforms that do not have a Kconfig enabling
* CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS the value of
* NET_IP_ALIGN is universally set to '2'. And on platforms
@@ -827,20 +1001,37 @@ static int check_pkt_ptr_alignment(const struct bpf_reg_state *reg,
* unconditional IP align value of '2'.
*/
ip_align = 2;
- if ((ip_align + reg_off + off) % size != 0) {
- verbose("misaligned packet access off %d+%d+%d size %d\n",
- ip_align, reg_off, off, size);
+
+ reg_off = tnum_add(reg->var_off, tnum_const(ip_align + reg->off + off));
+ if (!tnum_is_aligned(reg_off, size)) {
+ char tn_buf[48];
+
+ tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
+ verbose("misaligned packet access off %d+%s+%d+%d size %d\n",
+ ip_align, tn_buf, reg->off, off, size);
return -EACCES;
}
return 0;
}
-static int check_val_ptr_alignment(const struct bpf_reg_state *reg,
- int size, bool strict)
+static int check_generic_ptr_alignment(const struct bpf_reg_state *reg,
+ const char *pointer_desc,
+ int off, int size, bool strict)
{
- if (strict && size != 1) {
- verbose("Unknown alignment. Only byte-sized access allowed in value access.\n");
+ struct tnum reg_off;
+
+ /* Byte size accesses are always allowed. */
+ if (!strict || size == 1)
+ return 0;
+
+ reg_off = tnum_add(reg->var_off, tnum_const(reg->off + off));
+ if (!tnum_is_aligned(reg_off, size)) {
+ char tn_buf[48];
+
+ tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
+ verbose("misaligned %saccess off %s+%d+%d size %d\n",
+ pointer_desc, tn_buf, reg->off, off, size);
return -EACCES;
}
@@ -852,21 +1043,25 @@ static int check_ptr_alignment(struct bpf_verifier_env *env,
int off, int size)
{
bool strict = env->strict_alignment;
+ const char *pointer_desc = "";
switch (reg->type) {
case PTR_TO_PACKET:
+ /* special case, because of NET_IP_ALIGN */
return check_pkt_ptr_alignment(reg, off, size, strict);
- case PTR_TO_MAP_VALUE_ADJ:
- return check_val_ptr_alignment(reg, size, strict);
+ case PTR_TO_MAP_VALUE:
+ pointer_desc = "value ";
+ break;
+ case PTR_TO_CTX:
+ pointer_desc = "context ";
+ break;
+ case PTR_TO_STACK:
+ pointer_desc = "stack ";
+ break;
default:
- if (off % size != 0) {
- verbose("misaligned access off %d size %d\n",
- off, size);
- return -EACCES;
- }
-
- return 0;
+ break;
}
+ return check_generic_ptr_alignment(reg, pointer_desc, off, size, strict);
}
/* check whether memory at (regno + off) is accessible for t = (read | write)
@@ -883,52 +1078,79 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
struct bpf_reg_state *reg = &state->regs[regno];
int size, err = 0;
- if (reg->type == PTR_TO_STACK)
- off += reg->imm;
-
size = bpf_size_to_bytes(bpf_size);
if (size < 0)
return size;
+ /* alignment checks will add in reg->off themselves */
err = check_ptr_alignment(env, reg, off, size);
if (err)
return err;
- if (reg->type == PTR_TO_MAP_VALUE ||
- reg->type == PTR_TO_MAP_VALUE_ADJ) {
+ /* for access checks, reg->off is just part of off */
+ off += reg->off;
+
+ if (reg->type == PTR_TO_MAP_VALUE) {
if (t == BPF_WRITE && value_regno >= 0 &&
is_pointer_value(env, value_regno)) {
verbose("R%d leaks addr into map\n", value_regno);
return -EACCES;
}
- if (reg->type == PTR_TO_MAP_VALUE_ADJ)
- err = check_map_access_adj(env, regno, off, size);
- else
- err = check_map_access(env, regno, off, size);
+ err = check_map_access(env, regno, off, size);
if (!err && t == BPF_READ && value_regno >= 0)
- mark_reg_unknown_value_and_range(state->regs,
- value_regno);
+ mark_reg_unknown(state->regs, value_regno);
} else if (reg->type == PTR_TO_CTX) {
- enum bpf_reg_type reg_type = UNKNOWN_VALUE;
+ enum bpf_reg_type reg_type = SCALAR_VALUE;
if (t == BPF_WRITE && value_regno >= 0 &&
is_pointer_value(env, value_regno)) {
verbose("R%d leaks addr into ctx\n", value_regno);
return -EACCES;
}
+ /* ctx accesses must be at a fixed offset, so that we can
+ * determine what type of data were returned. A register whose
+ * var_off is not a known constant is therefore rejected here,
+ * before the constant part is folded into 'off' below.
+ */
+ if (!tnum_is_const(reg->var_off)) {
+ char tn_buf[48];
+
+ tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
+ /* terminate the line like every other verifier message */
+ verbose("variable ctx access var_off=%s off=%d size=%d\n",
+ tn_buf, off, size);
+ return -EACCES;
+ }
+ off += reg->var_off.value;
err = check_ctx_access(env, insn_idx, off, size, t, &reg_type);
if (!err && t == BPF_READ && value_regno >= 0) {
- mark_reg_unknown_value_and_range(state->regs,
- value_regno);
- /* note that reg.[id|off|range] == 0 */
+ /* ctx access returns either a scalar, or a
+ * PTR_TO_PACKET[_END]. In the latter case, we know
+ * the offset is zero.
+ */
+ if (reg_type == SCALAR_VALUE)
+ mark_reg_unknown(state->regs, value_regno);
+ else
+ mark_reg_known_zero(state->regs, value_regno);
+ state->regs[value_regno].id = 0;
+ state->regs[value_regno].off = 0;
+ state->regs[value_regno].range = 0;
state->regs[value_regno].type = reg_type;
- state->regs[value_regno].aux_off = 0;
- state->regs[value_regno].aux_off_align = 0;
}
- } else if (reg->type == FRAME_PTR || reg->type == PTR_TO_STACK) {
+ } else if (reg->type == PTR_TO_STACK) {
+ /* stack accesses must be at a fixed offset, so that we can
+ * determine what type of data were returned.
+ * See check_stack_read(). A register whose var_off is not a
+ * known constant is therefore rejected here, before the constant
+ * part is folded into 'off' below.
+ */
+ if (!tnum_is_const(reg->var_off)) {
+ char tn_buf[48];
+
+ tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
+ /* terminate the line like every other verifier message */
+ verbose("variable stack access var_off=%s off=%d size=%d\n",
+ tn_buf, off, size);
+ return -EACCES;
+ }
+ off += reg->var_off.value;
if (off >= 0 || off < -MAX_BPF_STACK) {
verbose("invalid stack off=%d size=%d\n", off, size);
return -EACCES;
@@ -948,7 +1170,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
} else {
err = check_stack_read(state, off, size, value_regno);
}
- } else if (state->regs[regno].type == PTR_TO_PACKET) {
+ } else if (reg->type == PTR_TO_PACKET) {
if (t == BPF_WRITE && !may_access_direct_pkt_data(env, NULL, t)) {
verbose("cannot write into packet\n");
return -EACCES;
@@ -960,28 +1182,25 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
}
err = check_packet_access(env, regno, off, size);
if (!err && t == BPF_READ && value_regno >= 0)
- mark_reg_unknown_value_and_range(state->regs,
- value_regno);
+ mark_reg_unknown(state->regs, value_regno);
} else {
verbose("R%d invalid mem access '%s'\n",
regno, reg_type_str[reg->type]);
return -EACCES;
}
- if (!err && size <= 2 && value_regno >= 0 && env->allow_ptr_leaks &&
- state->regs[value_regno].type == UNKNOWN_VALUE) {
- /* 1 or 2 byte load zero-extends, determine the number of
- * zero upper bits. Not doing it fo 4 byte load, since
- * such values cannot be added to ptr_to_packet anyway.
- */
- state->regs[value_regno].imm = 64 - size * 8;
+ if (!err && size < BPF_REG_SIZE && value_regno >= 0 && t == BPF_READ &&
+ state->regs[value_regno].type == SCALAR_VALUE) {
+ /* b/h/w load zero-extends, mark upper bits as known 0 */
+ state->regs[value_regno].var_off = tnum_cast(
+ state->regs[value_regno].var_off, size);
+ __update_reg_bounds(&state->regs[value_regno]);
}
return err;
}
static int check_xadd(struct bpf_verifier_env *env, int insn_idx, struct bpf_insn *insn)
{
- struct bpf_reg_state *regs = env->cur_state.regs;
int err;
if ((BPF_SIZE(insn->code) != BPF_W && BPF_SIZE(insn->code) != BPF_DW) ||
@@ -991,12 +1210,12 @@ static int check_xadd(struct bpf_verifier_env *env, int insn_idx, struct bpf_ins
}
/* check src1 operand */
- err = check_reg_arg(regs, insn->src_reg, SRC_OP);
+ err = check_reg_arg(env, insn->src_reg, SRC_OP);
if (err)
return err;
/* check src2 operand */
- err = check_reg_arg(regs, insn->dst_reg, SRC_OP);
+ err = check_reg_arg(env, insn->dst_reg, SRC_OP);
if (err)
return err;
@@ -1016,9 +1235,17 @@ static int check_xadd(struct bpf_verifier_env *env, int insn_idx, struct bpf_ins
BPF_SIZE(insn->code), BPF_WRITE, -1);
}
+/* True only for a scalar register whose tracked value is the constant 0. */
+static bool register_is_null(struct bpf_reg_state reg)
+{
+ /* Pointers (and any other non-scalar type) are never "NULL" here */
+ if (reg.type != SCALAR_VALUE)
+ return false;
+
+ return tnum_equals_const(reg.var_off, 0);
+}
+
/* when register 'regno' is passed into function that will read 'access_size'
* bytes from that pointer, make sure that it's within stack boundary
- * and all elements of stack are initialized
+ * and all elements of stack are initialized.
+ * Unlike most pointer bounds-checking functions, this one doesn't take an
+ * 'off' argument, so it has to add in reg->off itself.
*/
static int check_stack_boundary(struct bpf_verifier_env *env, int regno,
int access_size, bool zero_size_allowed,
@@ -1029,9 +1256,9 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno,
int off, i;
if (regs[regno].type != PTR_TO_STACK) {
+ /* Allow zero-byte read from NULL, regardless of pointer type */
if (zero_size_allowed && access_size == 0 &&
- regs[regno].type == CONST_IMM &&
- regs[regno].imm == 0)
+ register_is_null(regs[regno]))
return 0;
verbose("R%d type=%s expected=%s\n", regno,
@@ -1040,7 +1267,15 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno,
return -EACCES;
}
- off = regs[regno].imm;
+ /* Only allow fixed-offset stack reads. The offset computed below uses
+ * var_off.value, which is only the full offset when var_off is a known
+ * constant, so a variable offset must be rejected rather than merely
+ * reported.
+ */
+ if (!tnum_is_const(regs[regno].var_off)) {
+ char tn_buf[48];
+
+ tnum_strn(tn_buf, sizeof(tn_buf), regs[regno].var_off);
+ verbose("invalid variable stack read R%d var_off=%s\n",
+ regno, tn_buf);
+ return -EACCES;
+ }
+ off = regs[regno].off + regs[regno].var_off.value;
if (off >= 0 || off < -MAX_BPF_STACK || off + access_size > 0 ||
access_size <= 0) {
verbose("invalid stack type R%d off=%d access_size=%d\n",
@@ -1071,16 +1306,14 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,
int access_size, bool zero_size_allowed,
struct bpf_call_arg_meta *meta)
{
- struct bpf_reg_state *regs = env->cur_state.regs;
+ struct bpf_reg_state *regs = env->cur_state.regs, *reg = &regs[regno];
- switch (regs[regno].type) {
+ switch (reg->type) {
case PTR_TO_PACKET:
- return check_packet_access(env, regno, 0, access_size);
+ return check_packet_access(env, regno, reg->off, access_size);
case PTR_TO_MAP_VALUE:
- return check_map_access(env, regno, 0, access_size);
- case PTR_TO_MAP_VALUE_ADJ:
- return check_map_access_adj(env, regno, 0, access_size);
- default: /* const_imm|ptr_to_stack or invalid ptr */
+ return check_map_access(env, regno, reg->off, access_size);
+ default: /* scalar_value|ptr_to_stack or invalid ptr */
return check_stack_boundary(env, regno, access_size,
zero_size_allowed, meta);
}
@@ -1097,10 +1330,9 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
if (arg_type == ARG_DONTCARE)
return 0;
- if (type == NOT_INIT) {
- verbose("R%d !read_ok\n", regno);
- return -EACCES;
- }
+ err = check_reg_arg(env, regno, SRC_OP);
+ if (err)
+ return err;
if (arg_type == ARG_ANYTHING) {
if (is_pointer_value(env, regno)) {
@@ -1123,11 +1355,8 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
goto err_type;
} else if (arg_type == ARG_CONST_SIZE ||
arg_type == ARG_CONST_SIZE_OR_ZERO) {
- expected_type = CONST_IMM;
- /* One exception. Allow UNKNOWN_VALUE registers when the
- * boundaries are known and don't cause unsafe memory accesses
- */
- if (type != UNKNOWN_VALUE && type != expected_type)
+ expected_type = SCALAR_VALUE;
+ if (type != expected_type)
goto err_type;
} else if (arg_type == ARG_CONST_MAP_PTR) {
expected_type = CONST_PTR_TO_MAP;
@@ -1141,13 +1370,13 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
arg_type == ARG_PTR_TO_UNINIT_MEM) {
expected_type = PTR_TO_STACK;
/* One exception here. In case function allows for NULL to be
- * passed in as argument, it's a CONST_IMM type. Final test
+ * passed in as argument, it's a SCALAR_VALUE type. Final test
* happens during stack boundary checking.
*/
- if (type == CONST_IMM && reg->imm == 0)
+ if (register_is_null(*reg))
/* final test in check_stack_boundary() */;
else if (type != PTR_TO_PACKET && type != PTR_TO_MAP_VALUE &&
- type != PTR_TO_MAP_VALUE_ADJ && type != expected_type)
+ type != expected_type)
goto err_type;
meta->raw_mode = arg_type == ARG_PTR_TO_UNINIT_MEM;
} else {
@@ -1173,7 +1402,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
return -EACCES;
}
if (type == PTR_TO_PACKET)
- err = check_packet_access(env, regno, 0,
+ err = check_packet_access(env, regno, reg->off,
meta->map_ptr->key_size);
else
err = check_stack_boundary(env, regno,
@@ -1189,7 +1418,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
return -EACCES;
}
if (type == PTR_TO_PACKET)
- err = check_packet_access(env, regno, 0,
+ err = check_packet_access(env, regno, reg->off,
meta->map_ptr->value_size);
else
err = check_stack_boundary(env, regno,
@@ -1209,10 +1438,11 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
return -EACCES;
}
- /* If the register is UNKNOWN_VALUE, the access check happens
- * using its boundaries. Otherwise, just use its imm
+ /* The register is SCALAR_VALUE; the access check
+ * happens using its boundaries.
*/
- if (type == UNKNOWN_VALUE) {
+
+ if (!tnum_is_const(reg->var_off))
/* For unprivileged variable accesses, disable raw
* mode so that the program is required to
* initialize all the memory that the helper could
@@ -1220,35 +1450,28 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
*/
meta = NULL;
- if (reg->min_value < 0) {
- verbose("R%d min value is negative, either use unsigned or 'var &= const'\n",
- regno);
- return -EACCES;
- }
-
- if (reg->min_value == 0) {
- err = check_helper_mem_access(env, regno - 1, 0,
- zero_size_allowed,
- meta);
- if (err)
- return err;
- }
+ if (reg->smin_value < 0) {
+ verbose("R%d min value is negative, either use unsigned or 'var &= const'\n",
+ regno);
+ return -EACCES;
+ }
- if (reg->max_value == BPF_REGISTER_MAX_RANGE) {
- verbose("R%d unbounded memory access, use 'var &= const' or 'if (var < const)'\n",
- regno);
- return -EACCES;
- }
- err = check_helper_mem_access(env, regno - 1,
- reg->max_value,
- zero_size_allowed, meta);
+ if (reg->umin_value == 0) {
+ err = check_helper_mem_access(env, regno - 1, 0,
+ zero_size_allowed,
+ meta);
if (err)
return err;
- } else {
- /* register is CONST_IMM */
- err = check_helper_mem_access(env, regno - 1, reg->imm,
- zero_size_allowed, meta);
}
+
+ if (reg->umax_value >= BPF_MAX_VAR_SIZ) {
+ verbose("R%d unbounded memory access, use 'var &= const' or 'if (var < const)'\n",
+ regno);
+ return -EACCES;
+ }
+ err = check_helper_mem_access(env, regno - 1,
+ reg->umax_value,
+ zero_size_allowed, meta);
}
return err;
@@ -1283,10 +1506,25 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id)
func_id != BPF_FUNC_current_task_under_cgroup)
goto error;
break;
+ /* devmap returns a pointer to a live net_device ifindex that we cannot
+ * allow to be modified from bpf side. So do not allow lookup elements
+ * for now.
+ */
+ case BPF_MAP_TYPE_DEVMAP:
+ if (func_id != BPF_FUNC_redirect_map)
+ goto error;
+ break;
case BPF_MAP_TYPE_ARRAY_OF_MAPS:
case BPF_MAP_TYPE_HASH_OF_MAPS:
if (func_id != BPF_FUNC_map_lookup_elem)
goto error;
+ break;
+ case BPF_MAP_TYPE_SOCKMAP:
+ if (func_id != BPF_FUNC_sk_redirect_map &&
+ func_id != BPF_FUNC_sock_map_update &&
+ func_id != BPF_FUNC_map_delete_elem)
+ goto error;
+ break;
default:
break;
}
@@ -1311,6 +1549,18 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id)
if (map->map_type != BPF_MAP_TYPE_CGROUP_ARRAY)
goto error;
break;
+ case BPF_FUNC_redirect_map:
+ if (map->map_type != BPF_MAP_TYPE_DEVMAP)
+ goto error;
+ break;
+ case BPF_FUNC_sk_redirect_map:
+ if (map->map_type != BPF_MAP_TYPE_SOCKMAP)
+ goto error;
+ break;
+ case BPF_FUNC_sock_map_update:
+ if (map->map_type != BPF_MAP_TYPE_SOCKMAP)
+ goto error;
+ break;
default:
break;
}
@@ -1340,6 +1590,9 @@ static int check_raw_mode(const struct bpf_func_proto *fn)
return count > 1 ? -EINVAL : 0;
}
+/* Packet data might have moved, any old PTR_TO_PACKET[_END] are now invalid,
+ * so turn them into unknown SCALAR_VALUE.
+ */
static void clear_all_pkt_pointers(struct bpf_verifier_env *env)
{
struct bpf_verifier_state *state = &env->cur_state;
@@ -1349,7 +1602,7 @@ static void clear_all_pkt_pointers(struct bpf_verifier_env *env)
for (i = 0; i < MAX_BPF_REG; i++)
if (regs[i].type == PTR_TO_PACKET ||
regs[i].type == PTR_TO_PACKET_END)
- mark_reg_unknown_value(regs, i);
+ mark_reg_unknown(regs, i);
for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) {
if (state->stack_slot_type[i] != STACK_SPILL)
@@ -1358,8 +1611,7 @@ static void clear_all_pkt_pointers(struct bpf_verifier_env *env)
if (reg->type != PTR_TO_PACKET &&
reg->type != PTR_TO_PACKET_END)
continue;
- __mark_reg_unknown_value(state->spilled_regs,
- i / BPF_REG_SIZE);
+ __mark_reg_unknown(reg);
}
}
@@ -1434,19 +1686,24 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx)
}
/* reset caller saved regs */
- for (i = 0; i < CALLER_SAVED_REGS; i++)
+ for (i = 0; i < CALLER_SAVED_REGS; i++) {
mark_reg_not_init(regs, caller_saved[i]);
+ check_reg_arg(env, caller_saved[i], DST_OP_NO_MARK);
+ }
- /* update return register */
+ /* update return register (already marked as written above) */
if (fn->ret_type == RET_INTEGER) {
- regs[BPF_REG_0].type = UNKNOWN_VALUE;
+ /* sets type to SCALAR_VALUE */
+ mark_reg_unknown(regs, BPF_REG_0);
} else if (fn->ret_type == RET_VOID) {
regs[BPF_REG_0].type = NOT_INIT;
} else if (fn->ret_type == RET_PTR_TO_MAP_VALUE_OR_NULL) {
struct bpf_insn_aux_data *insn_aux;
regs[BPF_REG_0].type = PTR_TO_MAP_VALUE_OR_NULL;
- regs[BPF_REG_0].max_value = regs[BPF_REG_0].min_value = 0;
+ /* There is no offset yet applied, variable or fixed */
+ mark_reg_known_zero(regs, BPF_REG_0);
+ regs[BPF_REG_0].off = 0;
/* remember map_ptr, so that check_map_access()
* can check 'value_size' boundary of memory access
* to map element returned from bpf_map_lookup_elem()
@@ -1477,494 +1734,551 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx)
return 0;
}
-static int check_packet_ptr_add(struct bpf_verifier_env *env,
- struct bpf_insn *insn)
+static void coerce_reg_to_32(struct bpf_reg_state *reg)
{
- struct bpf_reg_state *regs = env->cur_state.regs;
- struct bpf_reg_state *dst_reg = &regs[insn->dst_reg];
- struct bpf_reg_state *src_reg = &regs[insn->src_reg];
- struct bpf_reg_state tmp_reg;
- s32 imm;
-
- if (BPF_SRC(insn->code) == BPF_K) {
- /* pkt_ptr += imm */
- imm = insn->imm;
-
-add_imm:
- if (imm < 0) {
- verbose("addition of negative constant to packet pointer is not allowed\n");
- return -EACCES;
- }
- if (imm >= MAX_PACKET_OFF ||
- imm + dst_reg->off >= MAX_PACKET_OFF) {
- verbose("constant %d is too large to add to packet pointer\n",
- imm);
- return -EACCES;
- }
- /* a constant was added to pkt_ptr.
- * Remember it while keeping the same 'id'
- */
- dst_reg->off += imm;
- } else {
- bool had_id;
-
- if (src_reg->type == PTR_TO_PACKET) {
- /* R6=pkt(id=0,off=0,r=62) R7=imm22; r7 += r6 */
- tmp_reg = *dst_reg; /* save r7 state */
- *dst_reg = *src_reg; /* copy pkt_ptr state r6 into r7 */
- src_reg = &tmp_reg; /* pretend it's src_reg state */
- /* if the checks below reject it, the copy won't matter,
- * since we're rejecting the whole program. If all ok,
- * then imm22 state will be added to r7
- * and r7 will be pkt(id=0,off=22,r=62) while
- * r6 will stay as pkt(id=0,off=0,r=62)
- */
- }
+ /* clear high 32 bits: narrow the tracked value (var_off) to its low
+ * 4 bytes via tnum_cast()
+ */
+ reg->var_off = tnum_cast(reg->var_off, 4);
+ /* Update bounds (presumably re-derived from the narrowed var_off;
+ * confirm against __update_reg_bounds())
+ */
+ __update_reg_bounds(reg);
+}
- if (src_reg->type == CONST_IMM) {
- /* pkt_ptr += reg where reg is known constant */
- imm = src_reg->imm;
- goto add_imm;
- }
- /* disallow pkt_ptr += reg
- * if reg is not uknown_value with guaranteed zero upper bits
- * otherwise pkt_ptr may overflow and addition will become
- * subtraction which is not allowed
- */
- if (src_reg->type != UNKNOWN_VALUE) {
- verbose("cannot add '%s' to ptr_to_packet\n",
- reg_type_str[src_reg->type]);
- return -EACCES;
- }
- if (src_reg->imm < 48) {
- verbose("cannot add integer value with %lld upper zero bits to ptr_to_packet\n",
- src_reg->imm);
- return -EACCES;
- }
+static bool signed_add_overflows(s64 a, s64 b)
+{
+ /* Do the add in u64, where overflow is well-defined.
+ * The wrapped sum is then compared against 'a': adding a negative
+ * 'b' must not yield a result above 'a', and adding a non-negative
+ * 'b' must not yield a result below 'a'. If that ordering is
+ * violated, the signed 64-bit sum a + b overflowed and we return
+ * true.
+ */
+ s64 res = (s64)((u64)a + (u64)b);
- had_id = (dst_reg->id != 0);
+ if (b < 0)
+ return res > a;
+ return res < a;
+}
- /* dst_reg stays as pkt_ptr type and since some positive
- * integer value was added to the pointer, increment its 'id'
- */
- dst_reg->id = ++env->id_gen;
-
- /* something was added to pkt_ptr, set range to zero */
- dst_reg->aux_off += dst_reg->off;
- dst_reg->off = 0;
- dst_reg->range = 0;
- if (had_id)
- dst_reg->aux_off_align = min(dst_reg->aux_off_align,
- src_reg->min_align);
- else
- dst_reg->aux_off_align = src_reg->min_align;
- }
- return 0;
+static bool signed_sub_overflows(s64 a, s64 b)
+{
+ /* Do the sub in u64, where overflow is well-defined.
+ * The wrapped difference is then compared against 'a': subtracting
+ * a negative 'b' must not yield a result below 'a', and subtracting
+ * a non-negative 'b' must not yield a result above 'a'. If that
+ * ordering is violated, the signed 64-bit difference a - b
+ * overflowed and we return true.
+ */
+ s64 res = (s64)((u64)a - (u64)b);
+
+ if (b < 0)
+ return res < a;
+ return res > a;
}
-static int evaluate_reg_alu(struct bpf_verifier_env *env, struct bpf_insn *insn)
+/* Handles arithmetic on a pointer and a scalar: computes new min/max and var_off.
+ * Caller should also handle BPF_MOV case separately.
+ * If we return -EACCES, caller may want to try again treating pointer as a
+ * scalar. So we only emit a diagnostic if !env->allow_ptr_leaks.
+ */
+static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
+ struct bpf_insn *insn,
+ const struct bpf_reg_state *ptr_reg,
+ const struct bpf_reg_state *off_reg)
{
- struct bpf_reg_state *regs = env->cur_state.regs;
- struct bpf_reg_state *dst_reg = &regs[insn->dst_reg];
+ struct bpf_reg_state *regs = env->cur_state.regs, *dst_reg;
+ bool known = tnum_is_const(off_reg->var_off);
+ s64 smin_val = off_reg->smin_value, smax_val = off_reg->smax_value,
+ smin_ptr = ptr_reg->smin_value, smax_ptr = ptr_reg->smax_value;
+ u64 umin_val = off_reg->umin_value, umax_val = off_reg->umax_value,
+ umin_ptr = ptr_reg->umin_value, umax_ptr = ptr_reg->umax_value;
u8 opcode = BPF_OP(insn->code);
- s64 imm_log2;
+ u32 dst = insn->dst_reg;
- /* for type == UNKNOWN_VALUE:
- * imm > 0 -> number of zero upper bits
- * imm == 0 -> don't track which is the same as all bits can be non-zero
- */
+ dst_reg = &regs[dst];
- if (BPF_SRC(insn->code) == BPF_X) {
- struct bpf_reg_state *src_reg = &regs[insn->src_reg];
-
- if (src_reg->type == UNKNOWN_VALUE && src_reg->imm > 0 &&
- dst_reg->imm && opcode == BPF_ADD) {
- /* dreg += sreg
- * where both have zero upper bits. Adding them
- * can only result making one more bit non-zero
- * in the larger value.
- * Ex. 0xffff (imm=48) + 1 (imm=63) = 0x10000 (imm=47)
- * 0xffff (imm=48) + 0xffff = 0x1fffe (imm=47)
- */
- dst_reg->imm = min(dst_reg->imm, src_reg->imm);
- dst_reg->imm--;
- return 0;
- }
- if (src_reg->type == CONST_IMM && src_reg->imm > 0 &&
- dst_reg->imm && opcode == BPF_ADD) {
- /* dreg += sreg
- * where dreg has zero upper bits and sreg is const.
- * Adding them can only result making one more bit
- * non-zero in the larger value.
- */
- imm_log2 = __ilog2_u64((long long)src_reg->imm);
- dst_reg->imm = min(dst_reg->imm, 63 - imm_log2);
- dst_reg->imm--;
- return 0;
- }
- /* all other cases non supported yet, just mark dst_reg */
- dst_reg->imm = 0;
- return 0;
+ if (WARN_ON_ONCE(known && (smin_val != smax_val))) {
+ print_verifier_state(&env->cur_state);
+ verbose("verifier internal error: known but bad sbounds\n");
+ return -EINVAL;
+ }
+ if (WARN_ON_ONCE(known && (umin_val != umax_val))) {
+ print_verifier_state(&env->cur_state);
+ verbose("verifier internal error: known but bad ubounds\n");
+ return -EINVAL;
}
- /* sign extend 32-bit imm into 64-bit to make sure that
- * negative values occupy bit 63. Note ilog2() would have
- * been incorrect, since sizeof(insn->imm) == 4
- */
- imm_log2 = __ilog2_u64((long long)insn->imm);
-
- if (dst_reg->imm && opcode == BPF_LSH) {
- /* reg <<= imm
- * if reg was a result of 2 byte load, then its imm == 48
- * which means that upper 48 bits are zero and shifting this reg
- * left by 4 would mean that upper 44 bits are still zero
- */
- dst_reg->imm -= insn->imm;
- } else if (dst_reg->imm && opcode == BPF_MUL) {
- /* reg *= imm
- * if multiplying by 14 subtract 4
- * This is conservative calculation of upper zero bits.
- * It's not trying to special case insn->imm == 1 or 0 cases
- */
- dst_reg->imm -= imm_log2 + 1;
- } else if (opcode == BPF_AND) {
- /* reg &= imm */
- dst_reg->imm = 63 - imm_log2;
- } else if (dst_reg->imm && opcode == BPF_ADD) {
- /* reg += imm */
- dst_reg->imm = min(dst_reg->imm, 63 - imm_log2);
- dst_reg->imm--;
- } else if (opcode == BPF_RSH) {
- /* reg >>= imm
- * which means that after right shift, upper bits will be zero
- * note that verifier already checked that
- * 0 <= imm < 64 for shift insn
- */
- dst_reg->imm += insn->imm;
- if (unlikely(dst_reg->imm > 64))
- /* some dumb code did:
- * r2 = *(u32 *)mem;
- * r2 >>= 32;
- * and all bits are zero now */
- dst_reg->imm = 64;
- } else {
- /* all other alu ops, means that we don't know what will
- * happen to the value, mark it with unknown number of zero bits
- */
- dst_reg->imm = 0;
+ if (BPF_CLASS(insn->code) != BPF_ALU64) {
+ /* 32-bit ALU ops on pointers produce (meaningless) scalars */
+ if (!env->allow_ptr_leaks)
+ verbose("R%d 32-bit pointer arithmetic prohibited\n",
+ dst);
+ return -EACCES;
}
- if (dst_reg->imm < 0) {
- /* all 64 bits of the register can contain non-zero bits
- * and such value cannot be added to ptr_to_packet, since it
- * may overflow, mark it as unknown to avoid further eval
- */
- dst_reg->imm = 0;
+ if (ptr_reg->type == PTR_TO_MAP_VALUE_OR_NULL) {
+ if (!env->allow_ptr_leaks)
+ verbose("R%d pointer arithmetic on PTR_TO_MAP_VALUE_OR_NULL prohibited, null-check it first\n",
+ dst);
+ return -EACCES;
+ }
+ if (ptr_reg->type == CONST_PTR_TO_MAP) {
+ if (!env->allow_ptr_leaks)
+ verbose("R%d pointer arithmetic on CONST_PTR_TO_MAP prohibited\n",
+ dst);
+ return -EACCES;
+ }
+ if (ptr_reg->type == PTR_TO_PACKET_END) {
+ if (!env->allow_ptr_leaks)
+ verbose("R%d pointer arithmetic on PTR_TO_PACKET_END prohibited\n",
+ dst);
+ return -EACCES;
}
- return 0;
-}
-static int evaluate_reg_imm_alu_unknown(struct bpf_verifier_env *env,
- struct bpf_insn *insn)
-{
- struct bpf_reg_state *regs = env->cur_state.regs;
- struct bpf_reg_state *dst_reg = &regs[insn->dst_reg];
- struct bpf_reg_state *src_reg = &regs[insn->src_reg];
- u8 opcode = BPF_OP(insn->code);
- s64 imm_log2 = __ilog2_u64((long long)dst_reg->imm);
-
- /* BPF_X code with src_reg->type UNKNOWN_VALUE here. */
- if (src_reg->imm > 0 && dst_reg->imm) {
- switch (opcode) {
- case BPF_ADD:
- /* dreg += sreg
- * where both have zero upper bits. Adding them
- * can only result making one more bit non-zero
- * in the larger value.
- * Ex. 0xffff (imm=48) + 1 (imm=63) = 0x10000 (imm=47)
- * 0xffff (imm=48) + 0xffff = 0x1fffe (imm=47)
- */
- dst_reg->imm = min(src_reg->imm, 63 - imm_log2);
- dst_reg->imm--;
- break;
- case BPF_AND:
- /* dreg &= sreg
- * AND can not extend zero bits only shrink
- * Ex. 0x00..00ffffff
- * & 0x0f..ffffffff
- * ----------------
- * 0x00..00ffffff
- */
- dst_reg->imm = max(src_reg->imm, 63 - imm_log2);
+ /* In case of 'scalar += pointer', dst_reg inherits pointer type and id.
+ * The id may be overwritten later if we create a new variable offset.
+ */
+ dst_reg->type = ptr_reg->type;
+ dst_reg->id = ptr_reg->id;
+
+ switch (opcode) {
+ case BPF_ADD:
+ /* We can take a fixed offset as long as it doesn't overflow
+ * the s32 'off' field
+ */
+ if (known && (ptr_reg->off + smin_val ==
+ (s64)(s32)(ptr_reg->off + smin_val))) {
+ /* pointer += K. Accumulate it into fixed offset */
+ dst_reg->smin_value = smin_ptr;
+ dst_reg->smax_value = smax_ptr;
+ dst_reg->umin_value = umin_ptr;
+ dst_reg->umax_value = umax_ptr;
+ dst_reg->var_off = ptr_reg->var_off;
+ dst_reg->off = ptr_reg->off + smin_val;
+ dst_reg->range = ptr_reg->range;
break;
- case BPF_OR:
- /* dreg |= sreg
- * OR can only extend zero bits
- * Ex. 0x00..00ffffff
- * | 0x0f..ffffffff
- * ----------------
- * 0x0f..00ffffff
- */
- dst_reg->imm = min(src_reg->imm, 63 - imm_log2);
+ }
+ /* A new variable offset is created. Note that off_reg->off
+ * == 0, since it's a scalar.
+ * dst_reg gets the pointer type and since some positive
+ * integer value was added to the pointer, give it a new 'id'
+ * if it's a PTR_TO_PACKET.
+ * this creates a new 'base' pointer, off_reg (variable) gets
+ * added into the variable offset, and we copy the fixed offset
+ * from ptr_reg.
+ */
+ if (signed_add_overflows(smin_ptr, smin_val) ||
+ signed_add_overflows(smax_ptr, smax_val)) {
+ dst_reg->smin_value = S64_MIN;
+ dst_reg->smax_value = S64_MAX;
+ } else {
+ dst_reg->smin_value = smin_ptr + smin_val;
+ dst_reg->smax_value = smax_ptr + smax_val;
+ }
+ if (umin_ptr + umin_val < umin_ptr ||
+ umax_ptr + umax_val < umax_ptr) {
+ dst_reg->umin_value = 0;
+ dst_reg->umax_value = U64_MAX;
+ } else {
+ dst_reg->umin_value = umin_ptr + umin_val;
+ dst_reg->umax_value = umax_ptr + umax_val;
+ }
+ dst_reg->var_off = tnum_add(ptr_reg->var_off, off_reg->var_off);
+ dst_reg->off = ptr_reg->off;
+ if (ptr_reg->type == PTR_TO_PACKET) {
+ dst_reg->id = ++env->id_gen;
+ /* something was added to pkt_ptr, set range to zero */
+ dst_reg->range = 0;
+ }
+ break;
+ case BPF_SUB:
+ if (dst_reg == off_reg) {
+ /* scalar -= pointer. Creates an unknown scalar */
+ if (!env->allow_ptr_leaks)
+ verbose("R%d tried to subtract pointer from scalar\n",
+ dst);
+ return -EACCES;
+ }
+ /* We don't allow subtraction from FP, because (according to
+ * test_verifier.c test "invalid fp arithmetic") JITs might not
+ * be able to deal with it.
+ */
+ if (ptr_reg->type == PTR_TO_STACK) {
+ if (!env->allow_ptr_leaks)
+ verbose("R%d subtraction from stack pointer prohibited\n",
+ dst);
+ return -EACCES;
+ }
+ if (known && (ptr_reg->off - smin_val ==
+ (s64)(s32)(ptr_reg->off - smin_val))) {
+ /* pointer -= K. Subtract it from fixed offset */
+ dst_reg->smin_value = smin_ptr;
+ dst_reg->smax_value = smax_ptr;
+ dst_reg->umin_value = umin_ptr;
+ dst_reg->umax_value = umax_ptr;
+ dst_reg->var_off = ptr_reg->var_off;
+ dst_reg->id = ptr_reg->id;
+ dst_reg->off = ptr_reg->off - smin_val;
+ dst_reg->range = ptr_reg->range;
break;
- case BPF_SUB:
- case BPF_MUL:
- case BPF_RSH:
- case BPF_LSH:
- /* These may be flushed out later */
- default:
- mark_reg_unknown_value(regs, insn->dst_reg);
}
- } else {
- mark_reg_unknown_value(regs, insn->dst_reg);
+ /* A new variable offset is created. If the subtrahend is known
+ * nonnegative, then any reg->range we had before is still good.
+ */
+ if (signed_sub_overflows(smin_ptr, smax_val) ||
+ signed_sub_overflows(smax_ptr, smin_val)) {
+ /* Overflow possible, we know nothing */
+ dst_reg->smin_value = S64_MIN;
+ dst_reg->smax_value = S64_MAX;
+ } else {
+ dst_reg->smin_value = smin_ptr - smax_val;
+ dst_reg->smax_value = smax_ptr - smin_val;
+ }
+ if (umin_ptr < umax_val) {
+ /* Overflow possible, we know nothing */
+ dst_reg->umin_value = 0;
+ dst_reg->umax_value = U64_MAX;
+ } else {
+ /* Cannot overflow (as long as bounds are consistent) */
+ dst_reg->umin_value = umin_ptr - umax_val;
+ dst_reg->umax_value = umax_ptr - umin_val;
+ }
+ dst_reg->var_off = tnum_sub(ptr_reg->var_off, off_reg->var_off);
+ dst_reg->off = ptr_reg->off;
+ if (ptr_reg->type == PTR_TO_PACKET) {
+ dst_reg->id = ++env->id_gen;
+ /* the subtrahend may be negative, i.e. something may have been added to pkt_ptr, so set range to zero */
+ if (smin_val < 0)
+ dst_reg->range = 0;
+ }
+ break;
+ case BPF_AND:
+ case BPF_OR:
+ case BPF_XOR:
+ /* bitwise ops on pointers are troublesome, prohibit for now.
+ * (However, in principle we could allow some cases, e.g.
+ * ptr &= ~3 which would reduce min_value by 3.)
+ */
+ if (!env->allow_ptr_leaks)
+ verbose("R%d bitwise operator %s on pointer prohibited\n",
+ dst, bpf_alu_string[opcode >> 4]);
+ return -EACCES;
+ default:
+ /* other operators (e.g. MUL,LSH) produce non-pointer results */
+ if (!env->allow_ptr_leaks)
+ verbose("R%d pointer arithmetic with %s operator prohibited\n",
+ dst, bpf_alu_string[opcode >> 4]);
+ return -EACCES;
}
- dst_reg->type = UNKNOWN_VALUE;
+ __update_reg_bounds(dst_reg);
+ __reg_deduce_bounds(dst_reg);
+ __reg_bound_offset(dst_reg);
return 0;
}
-static int evaluate_reg_imm_alu(struct bpf_verifier_env *env,
- struct bpf_insn *insn)
+static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
+ struct bpf_insn *insn,
+ struct bpf_reg_state *dst_reg,
+ struct bpf_reg_state src_reg)
{
struct bpf_reg_state *regs = env->cur_state.regs;
- struct bpf_reg_state *dst_reg = &regs[insn->dst_reg];
- struct bpf_reg_state *src_reg = &regs[insn->src_reg];
u8 opcode = BPF_OP(insn->code);
- u64 dst_imm = dst_reg->imm;
-
- if (BPF_SRC(insn->code) == BPF_X && src_reg->type == UNKNOWN_VALUE)
- return evaluate_reg_imm_alu_unknown(env, insn);
-
- /* dst_reg->type == CONST_IMM here. Simulate execution of insns
- * containing ALU ops. Don't care about overflow or negative
- * values, just add/sub/... them; registers are in u64.
- */
- if (opcode == BPF_ADD && BPF_SRC(insn->code) == BPF_K) {
- dst_imm += insn->imm;
- } else if (opcode == BPF_ADD && BPF_SRC(insn->code) == BPF_X &&
- src_reg->type == CONST_IMM) {
- dst_imm += src_reg->imm;
- } else if (opcode == BPF_SUB && BPF_SRC(insn->code) == BPF_K) {
- dst_imm -= insn->imm;
- } else if (opcode == BPF_SUB && BPF_SRC(insn->code) == BPF_X &&
- src_reg->type == CONST_IMM) {
- dst_imm -= src_reg->imm;
- } else if (opcode == BPF_MUL && BPF_SRC(insn->code) == BPF_K) {
- dst_imm *= insn->imm;
- } else if (opcode == BPF_MUL && BPF_SRC(insn->code) == BPF_X &&
- src_reg->type == CONST_IMM) {
- dst_imm *= src_reg->imm;
- } else if (opcode == BPF_OR && BPF_SRC(insn->code) == BPF_K) {
- dst_imm |= insn->imm;
- } else if (opcode == BPF_OR && BPF_SRC(insn->code) == BPF_X &&
- src_reg->type == CONST_IMM) {
- dst_imm |= src_reg->imm;
- } else if (opcode == BPF_AND && BPF_SRC(insn->code) == BPF_K) {
- dst_imm &= insn->imm;
- } else if (opcode == BPF_AND && BPF_SRC(insn->code) == BPF_X &&
- src_reg->type == CONST_IMM) {
- dst_imm &= src_reg->imm;
- } else if (opcode == BPF_RSH && BPF_SRC(insn->code) == BPF_K) {
- dst_imm >>= insn->imm;
- } else if (opcode == BPF_RSH && BPF_SRC(insn->code) == BPF_X &&
- src_reg->type == CONST_IMM) {
- dst_imm >>= src_reg->imm;
- } else if (opcode == BPF_LSH && BPF_SRC(insn->code) == BPF_K) {
- dst_imm <<= insn->imm;
- } else if (opcode == BPF_LSH && BPF_SRC(insn->code) == BPF_X &&
- src_reg->type == CONST_IMM) {
- dst_imm <<= src_reg->imm;
- } else {
- mark_reg_unknown_value(regs, insn->dst_reg);
- goto out;
- }
-
- dst_reg->imm = dst_imm;
-out:
- return 0;
-}
-
-static void check_reg_overflow(struct bpf_reg_state *reg)
-{
- if (reg->max_value > BPF_REGISTER_MAX_RANGE)
- reg->max_value = BPF_REGISTER_MAX_RANGE;
- if (reg->min_value < BPF_REGISTER_MIN_RANGE ||
- reg->min_value > BPF_REGISTER_MAX_RANGE)
- reg->min_value = BPF_REGISTER_MIN_RANGE;
-}
-
-static u32 calc_align(u32 imm)
-{
- if (!imm)
- return 1U << 31;
- return imm - ((imm - 1) & imm);
-}
-
-static void adjust_reg_min_max_vals(struct bpf_verifier_env *env,
- struct bpf_insn *insn)
-{
- struct bpf_reg_state *regs = env->cur_state.regs, *dst_reg;
- s64 min_val = BPF_REGISTER_MIN_RANGE;
- u64 max_val = BPF_REGISTER_MAX_RANGE;
- u8 opcode = BPF_OP(insn->code);
- u32 dst_align, src_align;
-
- dst_reg = &regs[insn->dst_reg];
- src_align = 0;
- if (BPF_SRC(insn->code) == BPF_X) {
- check_reg_overflow(&regs[insn->src_reg]);
- min_val = regs[insn->src_reg].min_value;
- max_val = regs[insn->src_reg].max_value;
-
- /* If the source register is a random pointer then the
- * min_value/max_value values represent the range of the known
- * accesses into that value, not the actual min/max value of the
- * register itself. In this case we have to reset the reg range
- * values so we know it is not safe to look at.
- */
- if (regs[insn->src_reg].type != CONST_IMM &&
- regs[insn->src_reg].type != UNKNOWN_VALUE) {
- min_val = BPF_REGISTER_MIN_RANGE;
- max_val = BPF_REGISTER_MAX_RANGE;
- src_align = 0;
- } else {
- src_align = regs[insn->src_reg].min_align;
- }
- } else if (insn->imm < BPF_REGISTER_MAX_RANGE &&
- (s64)insn->imm > BPF_REGISTER_MIN_RANGE) {
- min_val = max_val = insn->imm;
- src_align = calc_align(insn->imm);
- }
-
- dst_align = dst_reg->min_align;
-
- /* We don't know anything about what was done to this register, mark it
- * as unknown. Also, if both derived bounds came from signed/unsigned
- * mixed compares and one side is unbounded, we cannot really do anything
- * with them as boundaries cannot be trusted. Thus, arithmetic of two
- * regs of such kind will get invalidated bounds on the dst side.
- */
- if ((min_val == BPF_REGISTER_MIN_RANGE &&
- max_val == BPF_REGISTER_MAX_RANGE) ||
- (BPF_SRC(insn->code) == BPF_X &&
- ((min_val != BPF_REGISTER_MIN_RANGE &&
- max_val == BPF_REGISTER_MAX_RANGE) ||
- (min_val == BPF_REGISTER_MIN_RANGE &&
- max_val != BPF_REGISTER_MAX_RANGE) ||
- (dst_reg->min_value != BPF_REGISTER_MIN_RANGE &&
- dst_reg->max_value == BPF_REGISTER_MAX_RANGE) ||
- (dst_reg->min_value == BPF_REGISTER_MIN_RANGE &&
- dst_reg->max_value != BPF_REGISTER_MAX_RANGE)) &&
- regs[insn->dst_reg].value_from_signed !=
- regs[insn->src_reg].value_from_signed)) {
- reset_reg_range_values(regs, insn->dst_reg);
- return;
- }
-
- /* If one of our values was at the end of our ranges then we can't just
- * do our normal operations to the register, we need to set the values
- * to the min/max since they are undefined.
- */
- if (opcode != BPF_SUB) {
- if (min_val == BPF_REGISTER_MIN_RANGE)
- dst_reg->min_value = BPF_REGISTER_MIN_RANGE;
- if (max_val == BPF_REGISTER_MAX_RANGE)
- dst_reg->max_value = BPF_REGISTER_MAX_RANGE;
+ bool src_known, dst_known;
+ s64 smin_val, smax_val;
+ u64 umin_val, umax_val;
+
+ if (BPF_CLASS(insn->code) != BPF_ALU64) {
+ /* 32-bit ALU ops are (32,32)->64 */
+ coerce_reg_to_32(dst_reg);
+ coerce_reg_to_32(&src_reg);
}
+ smin_val = src_reg.smin_value;
+ smax_val = src_reg.smax_value;
+ umin_val = src_reg.umin_value;
+ umax_val = src_reg.umax_value;
+ src_known = tnum_is_const(src_reg.var_off);
+ dst_known = tnum_is_const(dst_reg->var_off);
switch (opcode) {
case BPF_ADD:
- if (dst_reg->min_value != BPF_REGISTER_MIN_RANGE)
- dst_reg->min_value += min_val;
- if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE)
- dst_reg->max_value += max_val;
- dst_reg->min_align = min(src_align, dst_align);
+ if (signed_add_overflows(dst_reg->smin_value, smin_val) ||
+ signed_add_overflows(dst_reg->smax_value, smax_val)) {
+ dst_reg->smin_value = S64_MIN;
+ dst_reg->smax_value = S64_MAX;
+ } else {
+ dst_reg->smin_value += smin_val;
+ dst_reg->smax_value += smax_val;
+ }
+ if (dst_reg->umin_value + umin_val < umin_val ||
+ dst_reg->umax_value + umax_val < umax_val) {
+ dst_reg->umin_value = 0;
+ dst_reg->umax_value = U64_MAX;
+ } else {
+ dst_reg->umin_value += umin_val;
+ dst_reg->umax_value += umax_val;
+ }
+ dst_reg->var_off = tnum_add(dst_reg->var_off, src_reg.var_off);
break;
case BPF_SUB:
- /* If one of our values was at the end of our ranges, then the
- * _opposite_ value in the dst_reg goes to the end of our range.
- */
- if (min_val == BPF_REGISTER_MIN_RANGE)
- dst_reg->max_value = BPF_REGISTER_MAX_RANGE;
- if (max_val == BPF_REGISTER_MAX_RANGE)
- dst_reg->min_value = BPF_REGISTER_MIN_RANGE;
- if (dst_reg->min_value != BPF_REGISTER_MIN_RANGE)
- dst_reg->min_value -= max_val;
- if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE)
- dst_reg->max_value -= min_val;
- dst_reg->min_align = min(src_align, dst_align);
+ if (signed_sub_overflows(dst_reg->smin_value, smax_val) ||
+ signed_sub_overflows(dst_reg->smax_value, smin_val)) {
+ /* Overflow possible, we know nothing */
+ dst_reg->smin_value = S64_MIN;
+ dst_reg->smax_value = S64_MAX;
+ } else {
+ dst_reg->smin_value -= smax_val;
+ dst_reg->smax_value -= smin_val;
+ }
+ if (dst_reg->umin_value < umax_val) {
+ /* Overflow possible, we know nothing */
+ dst_reg->umin_value = 0;
+ dst_reg->umax_value = U64_MAX;
+ } else {
+ /* Cannot overflow (as long as bounds are consistent) */
+ dst_reg->umin_value -= umax_val;
+ dst_reg->umax_value -= umin_val;
+ }
+ dst_reg->var_off = tnum_sub(dst_reg->var_off, src_reg.var_off);
break;
case BPF_MUL:
- if (dst_reg->min_value != BPF_REGISTER_MIN_RANGE)
- dst_reg->min_value *= min_val;
- if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE)
- dst_reg->max_value *= max_val;
- dst_reg->min_align = max(src_align, dst_align);
+ dst_reg->var_off = tnum_mul(dst_reg->var_off, src_reg.var_off);
+ if (smin_val < 0 || dst_reg->smin_value < 0) {
+ /* Ain't nobody got time to multiply that sign */
+ __mark_reg_unbounded(dst_reg);
+ __update_reg_bounds(dst_reg);
+ break;
+ }
+ /* Both values are non-negative, so we can work with unsigned and
+ * copy the result to signed (unless it exceeds S64_MAX).
+ */
+ if (umax_val > U32_MAX || dst_reg->umax_value > U32_MAX) {
+ /* Potential overflow, we know nothing */
+ __mark_reg_unbounded(dst_reg);
+ /* (except what we can learn from the var_off) */
+ __update_reg_bounds(dst_reg);
+ break;
+ }
+ dst_reg->umin_value *= umin_val;
+ dst_reg->umax_value *= umax_val;
+ if (dst_reg->umax_value > S64_MAX) {
+ /* Overflow possible, we know nothing */
+ dst_reg->smin_value = S64_MIN;
+ dst_reg->smax_value = S64_MAX;
+ } else {
+ dst_reg->smin_value = dst_reg->umin_value;
+ dst_reg->smax_value = dst_reg->umax_value;
+ }
break;
case BPF_AND:
- /* Disallow AND'ing of negative numbers, ain't nobody got time
- * for that. Otherwise the minimum is 0 and the max is the max
- * value we could AND against.
+ if (src_known && dst_known) {
+ __mark_reg_known(dst_reg, dst_reg->var_off.value &
+ src_reg.var_off.value);
+ break;
+ }
+ /* We get our minimum from the var_off, since that's inherently
+ * bitwise. Our maximum is the minimum of the operands' maxima.
*/
- if (min_val < 0)
- dst_reg->min_value = BPF_REGISTER_MIN_RANGE;
- else
- dst_reg->min_value = 0;
- dst_reg->max_value = max_val;
- dst_reg->min_align = max(src_align, dst_align);
+ dst_reg->var_off = tnum_and(dst_reg->var_off, src_reg.var_off);
+ dst_reg->umin_value = dst_reg->var_off.value;
+ dst_reg->umax_value = min(dst_reg->umax_value, umax_val);
+ if (dst_reg->smin_value < 0 || smin_val < 0) {
+ /* Lose signed bounds when ANDing negative numbers,
+ * ain't nobody got time for that.
+ */
+ dst_reg->smin_value = S64_MIN;
+ dst_reg->smax_value = S64_MAX;
+ } else {
+ /* ANDing two positives gives a positive, so safe to
+ * cast result into s64.
+ */
+ dst_reg->smin_value = dst_reg->umin_value;
+ dst_reg->smax_value = dst_reg->umax_value;
+ }
+ /* We may learn something more from the var_off */
+ __update_reg_bounds(dst_reg);
break;
- case BPF_LSH:
- /* Gotta have special overflow logic here, if we're shifting
- * more than MAX_RANGE then just assume we have an invalid
- * range.
+ case BPF_OR:
+ if (src_known && dst_known) {
+ __mark_reg_known(dst_reg, dst_reg->var_off.value |
+ src_reg.var_off.value);
+ break;
+ }
+ /* We get our maximum from the var_off, and our minimum is the
+ * maximum of the operands' minima
*/
- if (min_val > ilog2(BPF_REGISTER_MAX_RANGE)) {
- dst_reg->min_value = BPF_REGISTER_MIN_RANGE;
- dst_reg->min_align = 1;
+ dst_reg->var_off = tnum_or(dst_reg->var_off, src_reg.var_off);
+ dst_reg->umin_value = max(dst_reg->umin_value, umin_val);
+ dst_reg->umax_value = dst_reg->var_off.value |
+ dst_reg->var_off.mask;
+ if (dst_reg->smin_value < 0 || smin_val < 0) {
+ /* Lose signed bounds when ORing negative numbers,
+ * ain't nobody got time for that.
+ */
+ dst_reg->smin_value = S64_MIN;
+ dst_reg->smax_value = S64_MAX;
} else {
- if (dst_reg->min_value != BPF_REGISTER_MIN_RANGE)
- dst_reg->min_value <<= min_val;
- if (!dst_reg->min_align)
- dst_reg->min_align = 1;
- dst_reg->min_align <<= min_val;
- }
- if (max_val > ilog2(BPF_REGISTER_MAX_RANGE))
- dst_reg->max_value = BPF_REGISTER_MAX_RANGE;
- else if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE)
- dst_reg->max_value <<= max_val;
+ /* ORing two positives gives a positive, so safe to
+ * cast result into s64.
+ */
+ dst_reg->smin_value = dst_reg->umin_value;
+ dst_reg->smax_value = dst_reg->umax_value;
+ }
+ /* We may learn something more from the var_off */
+ __update_reg_bounds(dst_reg);
break;
- case BPF_RSH:
- /* RSH by a negative number is undefined, and the BPF_RSH is an
- * unsigned shift, so make the appropriate casts.
+ case BPF_LSH:
+ if (umax_val > 63) {
+ /* Shifts greater than 63 are undefined. This includes
+ * shifts by a negative number.
+ */
+ mark_reg_unknown(regs, insn->dst_reg);
+ break;
+ }
+ /* We lose all sign bit information (except what we can pick
+ * up from var_off)
*/
- if (min_val < 0 || dst_reg->min_value < 0) {
- dst_reg->min_value = BPF_REGISTER_MIN_RANGE;
+ dst_reg->smin_value = S64_MIN;
+ dst_reg->smax_value = S64_MAX;
+ /* If we might shift our top bit out, then we know nothing */
+ if (dst_reg->umax_value > 1ULL << (63 - umax_val)) {
+ dst_reg->umin_value = 0;
+ dst_reg->umax_value = U64_MAX;
} else {
- dst_reg->min_value =
- (u64)(dst_reg->min_value) >> min_val;
+ dst_reg->umin_value <<= umin_val;
+ dst_reg->umax_value <<= umax_val;
}
- if (min_val < 0) {
- dst_reg->min_align = 1;
+ if (src_known)
+ dst_reg->var_off = tnum_lshift(dst_reg->var_off, umin_val);
+ else
+ dst_reg->var_off = tnum_lshift(tnum_unknown, umin_val);
+ /* We may learn something more from the var_off */
+ __update_reg_bounds(dst_reg);
+ break;
+ case BPF_RSH:
+ if (umax_val > 63) {
+ /* Shifts greater than 63 are undefined. This includes
+ * shifts by a negative number.
+ */
+ mark_reg_unknown(regs, insn->dst_reg);
+ break;
+ }
+ /* BPF_RSH is an unsigned shift, so make the appropriate casts */
+ if (dst_reg->smin_value < 0) {
+ if (umin_val) {
+ /* Sign bit will be cleared */
+ dst_reg->smin_value = 0;
+ } else {
+ /* Lost sign bit information */
+ dst_reg->smin_value = S64_MIN;
+ dst_reg->smax_value = S64_MAX;
+ }
} else {
- dst_reg->min_align >>= (u64) min_val;
- if (!dst_reg->min_align)
- dst_reg->min_align = 1;
+ dst_reg->smin_value =
+ (u64)(dst_reg->smin_value) >> umax_val;
}
- if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE)
- dst_reg->max_value >>= max_val;
+ if (src_known)
+ dst_reg->var_off = tnum_rshift(dst_reg->var_off,
+ umin_val);
+ else
+ dst_reg->var_off = tnum_rshift(tnum_unknown, umin_val);
+ dst_reg->umin_value >>= umax_val;
+ dst_reg->umax_value >>= umin_val;
+ /* We may learn something more from the var_off */
+ __update_reg_bounds(dst_reg);
break;
default:
- reset_reg_range_values(regs, insn->dst_reg);
+ mark_reg_unknown(regs, insn->dst_reg);
break;
}
- check_reg_overflow(dst_reg);
+ __reg_deduce_bounds(dst_reg);
+ __reg_bound_offset(dst_reg);
+ return 0;
+}
+
+/* Handles ALU ops other than BPF_END, BPF_NEG and BPF_MOV: computes new min/max
+ * and var_off.
+ */
+static int adjust_reg_min_max_vals(struct bpf_verifier_env *env,
+ struct bpf_insn *insn)
+{
+ struct bpf_reg_state *regs = env->cur_state.regs, *dst_reg, *src_reg;
+ struct bpf_reg_state *ptr_reg = NULL, off_reg = {0};
+ u8 opcode = BPF_OP(insn->code);
+ int rc;
+
+ dst_reg = &regs[insn->dst_reg];
+ src_reg = NULL;
+ if (dst_reg->type != SCALAR_VALUE)
+ ptr_reg = dst_reg;
+ if (BPF_SRC(insn->code) == BPF_X) {
+ src_reg = &regs[insn->src_reg];
+ if (src_reg->type != SCALAR_VALUE) {
+ if (dst_reg->type != SCALAR_VALUE) {
+ /* Combining two pointers by any ALU op yields
+ * an arbitrary scalar.
+ */
+ if (!env->allow_ptr_leaks) {
+ verbose("R%d pointer %s pointer prohibited\n",
+ insn->dst_reg,
+ bpf_alu_string[opcode >> 4]);
+ return -EACCES;
+ }
+ mark_reg_unknown(regs, insn->dst_reg);
+ return 0;
+ } else {
+ /* scalar += pointer
+ * This is legal, but we have to reverse our
+ * src/dest handling in computing the range
+ */
+ rc = adjust_ptr_min_max_vals(env, insn,
+ src_reg, dst_reg);
+ if (rc == -EACCES && env->allow_ptr_leaks) {
+ /* scalar += unknown scalar */
+ __mark_reg_unknown(&off_reg);
+ return adjust_scalar_min_max_vals(
+ env, insn,
+ dst_reg, off_reg);
+ }
+ return rc;
+ }
+ } else if (ptr_reg) {
+ /* pointer += scalar */
+ rc = adjust_ptr_min_max_vals(env, insn,
+ dst_reg, src_reg);
+ if (rc == -EACCES && env->allow_ptr_leaks) {
+ /* unknown scalar += scalar */
+ __mark_reg_unknown(dst_reg);
+ return adjust_scalar_min_max_vals(
+ env, insn, dst_reg, *src_reg);
+ }
+ return rc;
+ }
+ } else {
+ /* Pretend the src is a reg with a known value, since we only
+ * need to be able to read from this state.
+ */
+ off_reg.type = SCALAR_VALUE;
+ __mark_reg_known(&off_reg, insn->imm);
+ src_reg = &off_reg;
+ if (ptr_reg) { /* pointer += K */
+ rc = adjust_ptr_min_max_vals(env, insn,
+ ptr_reg, src_reg);
+ if (rc == -EACCES && env->allow_ptr_leaks) {
+ /* unknown scalar += K */
+ __mark_reg_unknown(dst_reg);
+ return adjust_scalar_min_max_vals(
+ env, insn, dst_reg, off_reg);
+ }
+ return rc;
+ }
+ }
+
+ /* Getting here implies both operands are SCALAR_VALUEs */
+ if (WARN_ON_ONCE(ptr_reg)) {
+ print_verifier_state(&env->cur_state);
+ verbose("verifier internal error: unexpected ptr_reg\n");
+ return -EINVAL;
+ }
+ if (WARN_ON(!src_reg)) {
+ print_verifier_state(&env->cur_state);
+ verbose("verifier internal error: no src_reg\n");
+ return -EINVAL;
+ }
+ return adjust_scalar_min_max_vals(env, insn, dst_reg, *src_reg);
}
/* check validity of 32-bit and 64-bit arithmetic operations */
static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
{
- struct bpf_reg_state *regs = env->cur_state.regs, *dst_reg;
+ struct bpf_reg_state *regs = env->cur_state.regs;
u8 opcode = BPF_OP(insn->code);
int err;
@@ -1978,14 +2292,15 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
}
} else {
if (insn->src_reg != BPF_REG_0 || insn->off != 0 ||
- (insn->imm != 16 && insn->imm != 32 && insn->imm != 64)) {
+ (insn->imm != 16 && insn->imm != 32 && insn->imm != 64) ||
+ BPF_CLASS(insn->code) == BPF_ALU64) {
verbose("BPF_END uses reserved fields\n");
return -EINVAL;
}
}
/* check src operand */
- err = check_reg_arg(regs, insn->dst_reg, SRC_OP);
+ err = check_reg_arg(env, insn->dst_reg, SRC_OP);
if (err)
return err;
@@ -1996,7 +2311,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
}
/* check dest operand */
- err = check_reg_arg(regs, insn->dst_reg, DST_OP);
+ err = check_reg_arg(env, insn->dst_reg, DST_OP);
if (err)
return err;
@@ -2009,7 +2324,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
}
/* check src operand */
- err = check_reg_arg(regs, insn->src_reg, SRC_OP);
+ err = check_reg_arg(env, insn->src_reg, SRC_OP);
if (err)
return err;
} else {
@@ -2020,15 +2335,10 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
}
/* check dest operand */
- err = check_reg_arg(regs, insn->dst_reg, DST_OP);
+ err = check_reg_arg(env, insn->dst_reg, DST_OP);
if (err)
return err;
- /* we are setting our register to something new, we need to
- * reset its range values.
- */
- reset_reg_range_values(regs, insn->dst_reg);
-
if (BPF_SRC(insn->code) == BPF_X) {
if (BPF_CLASS(insn->code) == BPF_ALU64) {
/* case: R1 = R2
@@ -2036,24 +2346,24 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
*/
regs[insn->dst_reg] = regs[insn->src_reg];
} else {
+ /* R1 = (u32) R2 */
if (is_pointer_value(env, insn->src_reg)) {
verbose("R%d partial copy of pointer\n",
insn->src_reg);
return -EACCES;
}
- mark_reg_unknown_value(regs, insn->dst_reg);
+ mark_reg_unknown(regs, insn->dst_reg);
+ /* high 32 bits are known zero. */
+ regs[insn->dst_reg].var_off = tnum_cast(
+ regs[insn->dst_reg].var_off, 4);
+ __update_reg_bounds(&regs[insn->dst_reg]);
}
} else {
/* case: R = imm
* remember the value we stored into this reg
*/
- regs[insn->dst_reg].type = CONST_IMM;
- regs[insn->dst_reg].imm = insn->imm;
- regs[insn->dst_reg].id = 0;
- regs[insn->dst_reg].max_value = insn->imm;
- regs[insn->dst_reg].min_value = insn->imm;
- regs[insn->dst_reg].min_align = calc_align(insn->imm);
- regs[insn->dst_reg].value_from_signed = false;
+ regs[insn->dst_reg].type = SCALAR_VALUE;
+ __mark_reg_known(regs + insn->dst_reg, insn->imm);
}
} else if (opcode > BPF_END) {
@@ -2068,7 +2378,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
return -EINVAL;
}
/* check src1 operand */
- err = check_reg_arg(regs, insn->src_reg, SRC_OP);
+ err = check_reg_arg(env, insn->src_reg, SRC_OP);
if (err)
return err;
} else {
@@ -2079,7 +2389,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
}
/* check src2 operand */
- err = check_reg_arg(regs, insn->dst_reg, SRC_OP);
+ err = check_reg_arg(env, insn->dst_reg, SRC_OP);
if (err)
return err;
@@ -2100,72 +2410,11 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
}
/* check dest operand */
- err = check_reg_arg(regs, insn->dst_reg, DST_OP_NO_MARK);
+ err = check_reg_arg(env, insn->dst_reg, DST_OP_NO_MARK);
if (err)
return err;
- dst_reg = &regs[insn->dst_reg];
-
- /* first we want to adjust our ranges. */
- adjust_reg_min_max_vals(env, insn);
-
- /* pattern match 'bpf_add Rx, imm' instruction */
- if (opcode == BPF_ADD && BPF_CLASS(insn->code) == BPF_ALU64 &&
- dst_reg->type == FRAME_PTR && BPF_SRC(insn->code) == BPF_K) {
- dst_reg->type = PTR_TO_STACK;
- dst_reg->imm = insn->imm;
- return 0;
- } else if (opcode == BPF_ADD &&
- BPF_CLASS(insn->code) == BPF_ALU64 &&
- dst_reg->type == PTR_TO_STACK &&
- ((BPF_SRC(insn->code) == BPF_X &&
- regs[insn->src_reg].type == CONST_IMM) ||
- BPF_SRC(insn->code) == BPF_K)) {
- if (BPF_SRC(insn->code) == BPF_X)
- dst_reg->imm += regs[insn->src_reg].imm;
- else
- dst_reg->imm += insn->imm;
- return 0;
- } else if (opcode == BPF_ADD &&
- BPF_CLASS(insn->code) == BPF_ALU64 &&
- (dst_reg->type == PTR_TO_PACKET ||
- (BPF_SRC(insn->code) == BPF_X &&
- regs[insn->src_reg].type == PTR_TO_PACKET))) {
- /* ptr_to_packet += K|X */
- return check_packet_ptr_add(env, insn);
- } else if (BPF_CLASS(insn->code) == BPF_ALU64 &&
- dst_reg->type == UNKNOWN_VALUE &&
- env->allow_ptr_leaks) {
- /* unknown += K|X */
- return evaluate_reg_alu(env, insn);
- } else if (BPF_CLASS(insn->code) == BPF_ALU64 &&
- dst_reg->type == CONST_IMM &&
- env->allow_ptr_leaks) {
- /* reg_imm += K|X */
- return evaluate_reg_imm_alu(env, insn);
- } else if (is_pointer_value(env, insn->dst_reg)) {
- verbose("R%d pointer arithmetic prohibited\n",
- insn->dst_reg);
- return -EACCES;
- } else if (BPF_SRC(insn->code) == BPF_X &&
- is_pointer_value(env, insn->src_reg)) {
- verbose("R%d pointer arithmetic prohibited\n",
- insn->src_reg);
- return -EACCES;
- }
-
- /* If we did pointer math on a map value then just set it to our
- * PTR_TO_MAP_VALUE_ADJ type so we can deal with any stores or
- * loads to this register appropriately, otherwise just mark the
- * register as unknown.
- */
- if (env->allow_ptr_leaks &&
- BPF_CLASS(insn->code) == BPF_ALU64 && opcode == BPF_ADD &&
- (dst_reg->type == PTR_TO_MAP_VALUE ||
- dst_reg->type == PTR_TO_MAP_VALUE_ADJ))
- dst_reg->type = PTR_TO_MAP_VALUE_ADJ;
- else
- mark_reg_unknown_value(regs, insn->dst_reg);
+ return adjust_reg_min_max_vals(env, insn);
}
return 0;
@@ -2177,27 +2426,48 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *state,
struct bpf_reg_state *regs = state->regs, *reg;
int i;
- /* LLVM can generate two kind of checks:
+ if (dst_reg->off < 0)
+ /* This doesn't give us any range */
+ return;
+
+ if (dst_reg->umax_value > MAX_PACKET_OFF ||
+ dst_reg->umax_value + dst_reg->off > MAX_PACKET_OFF)
+ /* Risk of overflow. For instance, ptr + (1<<63) may be less
+ * than pkt_end, but that's because it's also less than pkt.
+ */
+ return;
+
+ /* LLVM can generate four kinds of checks:
*
- * Type 1:
+ * Type 1/2:
*
* r2 = r3;
* r2 += 8;
* if (r2 > pkt_end) goto <handle exception>
* <access okay>
*
+ * r2 = r3;
+ * r2 += 8;
+ * if (r2 < pkt_end) goto <access okay>
+ * <handle exception>
+ *
* Where:
* r2 == dst_reg, pkt_end == src_reg
* r2=pkt(id=n,off=8,r=0)
* r3=pkt(id=n,off=0,r=0)
*
- * Type 2:
+ * Type 3/4:
*
* r2 = r3;
* r2 += 8;
* if (pkt_end >= r2) goto <access okay>
* <handle exception>
*
+ * r2 = r3;
+ * r2 += 8;
+ * if (pkt_end <= r2) goto <handle exception>
+ * <access okay>
+ *
* Where:
* pkt_end == dst_reg, r2 == src_reg
* r2=pkt(id=n,off=8,r=0)
@@ -2207,193 +2477,247 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *state,
* so that range of bytes [r3, r3 + 8) is safe to access.
*/
+ /* If our ids match, then we must have the same max_value. And we
+ * don't care about the other reg's fixed offset, since if it's too big
+ * the range won't allow anything.
+ * dst_reg->off is known <= MAX_PACKET_OFF, therefore it fits in a u16.
+ */
for (i = 0; i < MAX_BPF_REG; i++)
if (regs[i].type == PTR_TO_PACKET && regs[i].id == dst_reg->id)
/* keep the maximum range already checked */
- regs[i].range = max(regs[i].range, dst_reg->off);
+ regs[i].range = max_t(u16, regs[i].range, dst_reg->off);
for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) {
if (state->stack_slot_type[i] != STACK_SPILL)
continue;
reg = &state->spilled_regs[i / BPF_REG_SIZE];
if (reg->type == PTR_TO_PACKET && reg->id == dst_reg->id)
- reg->range = max(reg->range, dst_reg->off);
+ reg->range = max_t(u16, reg->range, dst_reg->off);
}
}
/* Adjusts the register min/max values in the case that the dst_reg is the
* variable register that we are working on, and src_reg is a constant or we're
* simply doing a BPF_K check.
+ * In JEQ/JNE cases we also adjust the var_off values.
*/
static void reg_set_min_max(struct bpf_reg_state *true_reg,
struct bpf_reg_state *false_reg, u64 val,
u8 opcode)
{
- bool value_from_signed = true;
- bool is_range = true;
+ /* If the dst_reg is a pointer, we can't learn anything about its
+ * variable offset from the compare (unless src_reg were a pointer into
+ * the same object, but we don't bother with that).
+ * Since false_reg and true_reg have the same type by construction, we
+ * only need to check one of them for pointerness.
+ */
+ if (__is_pointer_value(false, false_reg))
+ return;
switch (opcode) {
case BPF_JEQ:
/* If this is false then we know nothing Jon Snow, but if it is
* true then we know for sure.
*/
- true_reg->max_value = true_reg->min_value = val;
- is_range = false;
+ __mark_reg_known(true_reg, val);
break;
case BPF_JNE:
/* If this is true we know nothing Jon Snow, but if it is false
* we know the value for sure;
*/
- false_reg->max_value = false_reg->min_value = val;
- is_range = false;
+ __mark_reg_known(false_reg, val);
break;
case BPF_JGT:
- value_from_signed = false;
- /* fallthrough */
+ false_reg->umax_value = min(false_reg->umax_value, val);
+ true_reg->umin_value = max(true_reg->umin_value, val + 1);
+ break;
case BPF_JSGT:
- if (true_reg->value_from_signed != value_from_signed)
- reset_reg_range_values(true_reg, 0);
- if (false_reg->value_from_signed != value_from_signed)
- reset_reg_range_values(false_reg, 0);
- if (opcode == BPF_JGT) {
- /* Unsigned comparison, the minimum value is 0. */
- false_reg->min_value = 0;
- }
- /* If this is false then we know the maximum val is val,
- * otherwise we know the min val is val+1.
- */
- false_reg->max_value = val;
- false_reg->value_from_signed = value_from_signed;
- true_reg->min_value = val + 1;
- true_reg->value_from_signed = value_from_signed;
+ false_reg->smax_value = min_t(s64, false_reg->smax_value, val);
+ true_reg->smin_value = max_t(s64, true_reg->smin_value, val + 1);
+ break;
+ case BPF_JLT:
+ false_reg->umin_value = max(false_reg->umin_value, val);
+ true_reg->umax_value = min(true_reg->umax_value, val - 1);
+ break;
+ case BPF_JSLT:
+ false_reg->smin_value = max_t(s64, false_reg->smin_value, val);
+ true_reg->smax_value = min_t(s64, true_reg->smax_value, val - 1);
break;
case BPF_JGE:
- value_from_signed = false;
- /* fallthrough */
+ false_reg->umax_value = min(false_reg->umax_value, val - 1);
+ true_reg->umin_value = max(true_reg->umin_value, val);
+ break;
case BPF_JSGE:
- if (true_reg->value_from_signed != value_from_signed)
- reset_reg_range_values(true_reg, 0);
- if (false_reg->value_from_signed != value_from_signed)
- reset_reg_range_values(false_reg, 0);
- if (opcode == BPF_JGE) {
- /* Unsigned comparison, the minimum value is 0. */
- false_reg->min_value = 0;
- }
- /* If this is false then we know the maximum value is val - 1,
- * otherwise we know the mimimum value is val.
- */
- false_reg->max_value = val - 1;
- false_reg->value_from_signed = value_from_signed;
- true_reg->min_value = val;
- true_reg->value_from_signed = value_from_signed;
+ false_reg->smax_value = min_t(s64, false_reg->smax_value, val - 1);
+ true_reg->smin_value = max_t(s64, true_reg->smin_value, val);
+ break;
+ case BPF_JLE:
+ false_reg->umin_value = max(false_reg->umin_value, val + 1);
+ true_reg->umax_value = min(true_reg->umax_value, val);
+ break;
+ case BPF_JSLE:
+ false_reg->smin_value = max_t(s64, false_reg->smin_value, val + 1);
+ true_reg->smax_value = min_t(s64, true_reg->smax_value, val);
break;
default:
break;
}
- check_reg_overflow(false_reg);
- check_reg_overflow(true_reg);
- if (is_range) {
- if (__is_pointer_value(false, false_reg))
- reset_reg_range_values(false_reg, 0);
- if (__is_pointer_value(false, true_reg))
- reset_reg_range_values(true_reg, 0);
- }
+ __reg_deduce_bounds(false_reg);
+ __reg_deduce_bounds(true_reg);
+ /* We might have learned some bits from the bounds. */
+ __reg_bound_offset(false_reg);
+ __reg_bound_offset(true_reg);
+ /* Intersecting with the old var_off might have improved our bounds
+ * slightly. e.g. if umax was 0x7f...f and var_off was (0; 0xf...fc),
+ * then new var_off is (0; 0x7f...fc) which improves our umax.
+ */
+ __update_reg_bounds(false_reg);
+ __update_reg_bounds(true_reg);
}
-/* Same as above, but for the case that dst_reg is a CONST_IMM reg and src_reg
- * is the variable reg.
+/* Same as above, but for the case that dst_reg holds a constant and src_reg is
+ * the variable reg.
*/
static void reg_set_min_max_inv(struct bpf_reg_state *true_reg,
struct bpf_reg_state *false_reg, u64 val,
u8 opcode)
{
- bool value_from_signed = true;
- bool is_range = true;
+ if (__is_pointer_value(false, false_reg))
+ return;
switch (opcode) {
case BPF_JEQ:
/* If this is false then we know nothing Jon Snow, but if it is
* true then we know for sure.
*/
- true_reg->max_value = true_reg->min_value = val;
- is_range = false;
+ __mark_reg_known(true_reg, val);
break;
case BPF_JNE:
/* If this is true we know nothing Jon Snow, but if it is false
* we know the value for sure;
*/
- false_reg->max_value = false_reg->min_value = val;
- is_range = false;
+ __mark_reg_known(false_reg, val);
break;
case BPF_JGT:
- value_from_signed = false;
- /* fallthrough */
+ true_reg->umax_value = min(true_reg->umax_value, val - 1);
+ false_reg->umin_value = max(false_reg->umin_value, val);
+ break;
case BPF_JSGT:
- if (true_reg->value_from_signed != value_from_signed)
- reset_reg_range_values(true_reg, 0);
- if (false_reg->value_from_signed != value_from_signed)
- reset_reg_range_values(false_reg, 0);
- if (opcode == BPF_JGT) {
- /* Unsigned comparison, the minimum value is 0. */
- true_reg->min_value = 0;
- }
- /*
- * If this is false, then the val is <= the register, if it is
- * true the register <= to the val.
- */
- false_reg->min_value = val;
- false_reg->value_from_signed = value_from_signed;
- true_reg->max_value = val - 1;
- true_reg->value_from_signed = value_from_signed;
+ true_reg->smax_value = min_t(s64, true_reg->smax_value, val - 1);
+ false_reg->smin_value = max_t(s64, false_reg->smin_value, val);
+ break;
+ case BPF_JLT:
+ true_reg->umin_value = max(true_reg->umin_value, val + 1);
+ false_reg->umax_value = min(false_reg->umax_value, val);
+ break;
+ case BPF_JSLT:
+ true_reg->smin_value = max_t(s64, true_reg->smin_value, val + 1);
+ false_reg->smax_value = min_t(s64, false_reg->smax_value, val);
break;
case BPF_JGE:
- value_from_signed = false;
- /* fallthrough */
+ true_reg->umax_value = min(true_reg->umax_value, val);
+ false_reg->umin_value = max(false_reg->umin_value, val + 1);
+ break;
case BPF_JSGE:
- if (true_reg->value_from_signed != value_from_signed)
- reset_reg_range_values(true_reg, 0);
- if (false_reg->value_from_signed != value_from_signed)
- reset_reg_range_values(false_reg, 0);
- if (opcode == BPF_JGE) {
- /* Unsigned comparison, the minimum value is 0. */
- true_reg->min_value = 0;
- }
- /* If this is false then constant < register, if it is true then
- * the register < constant.
- */
- false_reg->min_value = val + 1;
- false_reg->value_from_signed = value_from_signed;
- true_reg->max_value = val;
- true_reg->value_from_signed = value_from_signed;
+ true_reg->smax_value = min_t(s64, true_reg->smax_value, val);
+ false_reg->smin_value = max_t(s64, false_reg->smin_value, val + 1);
+ break;
+ case BPF_JLE:
+ true_reg->umin_value = max(true_reg->umin_value, val);
+ false_reg->umax_value = min(false_reg->umax_value, val - 1);
+ break;
+ case BPF_JSLE:
+ true_reg->smin_value = max_t(s64, true_reg->smin_value, val);
+ false_reg->smax_value = min_t(s64, false_reg->smax_value, val - 1);
break;
default:
break;
}
- check_reg_overflow(false_reg);
- check_reg_overflow(true_reg);
- if (is_range) {
- if (__is_pointer_value(false, false_reg))
- reset_reg_range_values(false_reg, 0);
- if (__is_pointer_value(false, true_reg))
- reset_reg_range_values(true_reg, 0);
+ __reg_deduce_bounds(false_reg);
+ __reg_deduce_bounds(true_reg);
+ /* We might have learned some bits from the bounds. */
+ __reg_bound_offset(false_reg);
+ __reg_bound_offset(true_reg);
+ /* Intersecting with the old var_off might have improved our bounds
+ * slightly. e.g. if umax was 0x7f...f and var_off was (0; 0xf...fc),
+ * then new var_off is (0; 0x7f...fc) which improves our umax.
+ */
+ __update_reg_bounds(false_reg);
+ __update_reg_bounds(true_reg);
+}
+
+/* Regs are known to be equal, so intersect their min/max/var_off */
+static void __reg_combine_min_max(struct bpf_reg_state *src_reg,
+ struct bpf_reg_state *dst_reg)
+{
+ src_reg->umin_value = dst_reg->umin_value = max(src_reg->umin_value,
+ dst_reg->umin_value);
+ src_reg->umax_value = dst_reg->umax_value = min(src_reg->umax_value,
+ dst_reg->umax_value);
+ src_reg->smin_value = dst_reg->smin_value = max(src_reg->smin_value,
+ dst_reg->smin_value);
+ src_reg->smax_value = dst_reg->smax_value = min(src_reg->smax_value,
+ dst_reg->smax_value);
+ src_reg->var_off = dst_reg->var_off = tnum_intersect(src_reg->var_off,
+ dst_reg->var_off);
+ /* We might have learned new bounds from the var_off. */
+ __update_reg_bounds(src_reg);
+ __update_reg_bounds(dst_reg);
+ /* We might have learned something about the sign bit. */
+ __reg_deduce_bounds(src_reg);
+ __reg_deduce_bounds(dst_reg);
+ /* We might have learned some bits from the bounds. */
+ __reg_bound_offset(src_reg);
+ __reg_bound_offset(dst_reg);
+ /* Intersecting with the old var_off might have improved our bounds
+ * slightly. e.g. if umax was 0x7f...f and var_off was (0; 0xf...fc),
+ * then new var_off is (0; 0x7f...fc) which improves our umax.
+ */
+ __update_reg_bounds(src_reg);
+ __update_reg_bounds(dst_reg);
+}
+
+static void reg_combine_min_max(struct bpf_reg_state *true_src,
+ struct bpf_reg_state *true_dst,
+ struct bpf_reg_state *false_src,
+ struct bpf_reg_state *false_dst,
+ u8 opcode)
+{
+ switch (opcode) {
+ case BPF_JEQ:
+ __reg_combine_min_max(true_src, true_dst);
+ break;
+ case BPF_JNE:
+ __reg_combine_min_max(false_src, false_dst);
+ break;
}
}
static void mark_map_reg(struct bpf_reg_state *regs, u32 regno, u32 id,
- enum bpf_reg_type type)
+ bool is_null)
{
struct bpf_reg_state *reg = &regs[regno];
if (reg->type == PTR_TO_MAP_VALUE_OR_NULL && reg->id == id) {
- if (type == UNKNOWN_VALUE) {
- __mark_reg_unknown_value(regs, regno);
+ /* Old offset (both fixed and variable parts) should
+ * have been known-zero, because we don't allow pointer
+ * arithmetic on pointers that might be NULL.
+ */
+ if (WARN_ON_ONCE(reg->smin_value || reg->smax_value ||
+ !tnum_equals_const(reg->var_off, 0) ||
+ reg->off)) {
+ __mark_reg_known_zero(reg);
+ reg->off = 0;
+ }
+ if (is_null) {
+ reg->type = SCALAR_VALUE;
} else if (reg->map_ptr->inner_map_meta) {
reg->type = CONST_PTR_TO_MAP;
reg->map_ptr = reg->map_ptr->inner_map_meta;
} else {
- reg->type = type;
+ reg->type = PTR_TO_MAP_VALUE;
}
/* We don't need id from this point onwards anymore, thus we
* should better reset it, so that state pruning has chances
@@ -2407,19 +2731,19 @@ static void mark_map_reg(struct bpf_reg_state *regs, u32 regno, u32 id,
* be folded together at some point.
*/
static void mark_map_regs(struct bpf_verifier_state *state, u32 regno,
- enum bpf_reg_type type)
+ bool is_null)
{
struct bpf_reg_state *regs = state->regs;
u32 id = regs[regno].id;
int i;
for (i = 0; i < MAX_BPF_REG; i++)
- mark_map_reg(regs, i, id, type);
+ mark_map_reg(regs, i, id, is_null);
for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) {
if (state->stack_slot_type[i] != STACK_SPILL)
continue;
- mark_map_reg(state->spilled_regs, i / BPF_REG_SIZE, id, type);
+ mark_map_reg(state->spilled_regs, i / BPF_REG_SIZE, id, is_null);
}
}
@@ -2431,7 +2755,7 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
u8 opcode = BPF_OP(insn->code);
int err;
- if (opcode > BPF_EXIT) {
+ if (opcode > BPF_JSLE) {
verbose("invalid BPF_JMP opcode %x\n", opcode);
return -EINVAL;
}
@@ -2443,7 +2767,7 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
}
/* check src1 operand */
- err = check_reg_arg(regs, insn->src_reg, SRC_OP);
+ err = check_reg_arg(env, insn->src_reg, SRC_OP);
if (err)
return err;
@@ -2460,7 +2784,7 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
}
/* check src2 operand */
- err = check_reg_arg(regs, insn->dst_reg, SRC_OP);
+ err = check_reg_arg(env, insn->dst_reg, SRC_OP);
if (err)
return err;
@@ -2469,7 +2793,8 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
/* detect if R == 0 where R was initialized to zero earlier */
if (BPF_SRC(insn->code) == BPF_K &&
(opcode == BPF_JEQ || opcode == BPF_JNE) &&
- dst_reg->type == CONST_IMM && dst_reg->imm == insn->imm) {
+ dst_reg->type == SCALAR_VALUE &&
+ tnum_equals_const(dst_reg->var_off, insn->imm)) {
if (opcode == BPF_JEQ) {
/* if (imm == imm) goto pc+off;
* only follow the goto, ignore fall-through
@@ -2491,17 +2816,30 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
/* detect if we are comparing against a constant value so we can adjust
* our min/max values for our dst register.
+ * this is only legit if both are scalars (or pointers to the same
+ * object, I suppose, but we don't support that right now), because
+ * otherwise the different base pointers mean the offsets aren't
+ * comparable.
*/
if (BPF_SRC(insn->code) == BPF_X) {
- if (regs[insn->src_reg].type == CONST_IMM)
- reg_set_min_max(&other_branch->regs[insn->dst_reg],
- dst_reg, regs[insn->src_reg].imm,
- opcode);
- else if (dst_reg->type == CONST_IMM)
- reg_set_min_max_inv(&other_branch->regs[insn->src_reg],
- &regs[insn->src_reg], dst_reg->imm,
- opcode);
- } else {
+ if (dst_reg->type == SCALAR_VALUE &&
+ regs[insn->src_reg].type == SCALAR_VALUE) {
+ if (tnum_is_const(regs[insn->src_reg].var_off))
+ reg_set_min_max(&other_branch->regs[insn->dst_reg],
+ dst_reg, regs[insn->src_reg].var_off.value,
+ opcode);
+ else if (tnum_is_const(dst_reg->var_off))
+ reg_set_min_max_inv(&other_branch->regs[insn->src_reg],
+ &regs[insn->src_reg],
+ dst_reg->var_off.value, opcode);
+ else if (opcode == BPF_JEQ || opcode == BPF_JNE)
+ /* Comparing for equality, we can combine knowledge */
+ reg_combine_min_max(&other_branch->regs[insn->src_reg],
+ &other_branch->regs[insn->dst_reg],
+ &regs[insn->src_reg],
+ &regs[insn->dst_reg], opcode);
+ }
+ } else if (dst_reg->type == SCALAR_VALUE) {
reg_set_min_max(&other_branch->regs[insn->dst_reg],
dst_reg, insn->imm, opcode);
}
@@ -2513,18 +2851,24 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
/* Mark all identical map registers in each branch as either
* safe or unknown depending R == 0 or R != 0 conditional.
*/
- mark_map_regs(this_branch, insn->dst_reg,
- opcode == BPF_JEQ ? PTR_TO_MAP_VALUE : UNKNOWN_VALUE);
- mark_map_regs(other_branch, insn->dst_reg,
- opcode == BPF_JEQ ? UNKNOWN_VALUE : PTR_TO_MAP_VALUE);
+ mark_map_regs(this_branch, insn->dst_reg, opcode == BPF_JNE);
+ mark_map_regs(other_branch, insn->dst_reg, opcode == BPF_JEQ);
} else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGT &&
dst_reg->type == PTR_TO_PACKET &&
regs[insn->src_reg].type == PTR_TO_PACKET_END) {
find_good_pkt_pointers(this_branch, dst_reg);
+ } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JLT &&
+ dst_reg->type == PTR_TO_PACKET &&
+ regs[insn->src_reg].type == PTR_TO_PACKET_END) {
+ find_good_pkt_pointers(other_branch, dst_reg);
} else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGE &&
dst_reg->type == PTR_TO_PACKET_END &&
regs[insn->src_reg].type == PTR_TO_PACKET) {
find_good_pkt_pointers(other_branch, &regs[insn->src_reg]);
+ } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JLE &&
+ dst_reg->type == PTR_TO_PACKET_END &&
+ regs[insn->src_reg].type == PTR_TO_PACKET) {
+ find_good_pkt_pointers(this_branch, &regs[insn->src_reg]);
} else if (is_pointer_value(env, insn->dst_reg)) {
verbose("R%d pointer comparison prohibited\n", insn->dst_reg);
return -EACCES;
@@ -2557,16 +2901,15 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn)
return -EINVAL;
}
- err = check_reg_arg(regs, insn->dst_reg, DST_OP);
+ err = check_reg_arg(env, insn->dst_reg, DST_OP);
if (err)
return err;
if (insn->src_reg == 0) {
u64 imm = ((u64)(insn + 1)->imm << 32) | (u32)insn->imm;
- regs[insn->dst_reg].type = CONST_IMM;
- regs[insn->dst_reg].imm = imm;
- regs[insn->dst_reg].id = 0;
+ regs[insn->dst_reg].type = SCALAR_VALUE;
+ __mark_reg_known(&regs[insn->dst_reg], imm);
return 0;
}
@@ -2624,7 +2967,7 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)
}
/* check whether implicit source operand (register R6) is readable */
- err = check_reg_arg(regs, BPF_REG_6, SRC_OP);
+ err = check_reg_arg(env, BPF_REG_6, SRC_OP);
if (err)
return err;
@@ -2635,19 +2978,22 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)
if (mode == BPF_IND) {
/* check explicit source operand */
- err = check_reg_arg(regs, insn->src_reg, SRC_OP);
+ err = check_reg_arg(env, insn->src_reg, SRC_OP);
if (err)
return err;
}
/* reset caller saved regs to unreadable */
- for (i = 0; i < CALLER_SAVED_REGS; i++)
+ for (i = 0; i < CALLER_SAVED_REGS; i++) {
mark_reg_not_init(regs, caller_saved[i]);
+ check_reg_arg(env, caller_saved[i], DST_OP_NO_MARK);
+ }
/* mark destination R0 register as readable, since it contains
- * the value fetched from the packet
+ * the value fetched from the packet.
+ * Already marked as written above.
*/
- regs[BPF_REG_0].type = UNKNOWN_VALUE;
+ mark_reg_unknown(regs, BPF_REG_0);
return 0;
}
@@ -2850,57 +3196,144 @@ err_free:
return ret;
}
-/* the following conditions reduce the number of explored insns
- * from ~140k to ~80k for ultra large programs that use a lot of ptr_to_packet
+/* check %cur's range satisfies %old's */
+static bool range_within(struct bpf_reg_state *old,
+ struct bpf_reg_state *cur)
+{
+ return old->umin_value <= cur->umin_value &&
+ old->umax_value >= cur->umax_value &&
+ old->smin_value <= cur->smin_value &&
+ old->smax_value >= cur->smax_value;
+}
+
+/* Maximum number of register states that can exist at once */
+#define ID_MAP_SIZE (MAX_BPF_REG + MAX_BPF_STACK / BPF_REG_SIZE)
+struct idpair {
+ u32 old;
+ u32 cur;
+};
+
+/* If in the old state two registers had the same id, then they need to have
+ * the same id in the new state as well. But that id could be different from
+ * the old state, so we need to track the mapping from old to new ids.
+ * Once we have seen that, say, a reg with old id 5 had new id 9, any subsequent
+ * regs with old id 5 must also have new id 9 for the new state to be safe. But
+ * regs with a different old id could still have new id 9, we don't care about
+ * that.
+ * So we look through our idmap to see if this old id has been seen before. If
+ * so, we require the new id to match; otherwise, we add the id pair to the map.
*/
-static bool compare_ptrs_to_packet(struct bpf_verifier_env *env,
- struct bpf_reg_state *old,
- struct bpf_reg_state *cur)
+static bool check_ids(u32 old_id, u32 cur_id, struct idpair *idmap)
{
- if (old->id != cur->id)
- return false;
+ unsigned int i;
+
+ for (i = 0; i < ID_MAP_SIZE; i++) {
+ if (!idmap[i].old) {
+ /* Reached an empty slot; haven't seen this id before */
+ idmap[i].old = old_id;
+ idmap[i].cur = cur_id;
+ return true;
+ }
+ if (idmap[i].old == old_id)
+ return idmap[i].cur == cur_id;
+ }
+ /* We ran out of idmap slots, which should be impossible */
+ WARN_ON_ONCE(1);
+ return false;
+}
- /* old ptr_to_packet is more conservative, since it allows smaller
- * range. Ex:
- * old(off=0,r=10) is equal to cur(off=0,r=20), because
- * old(off=0,r=10) means that with range=10 the verifier proceeded
- * further and found no issues with the program. Now we're in the same
- * spot with cur(off=0,r=20), so we're safe too, since anything further
- * will only be looking at most 10 bytes after this pointer.
- */
- if (old->off == cur->off && old->range < cur->range)
+/* Returns true if (rold safe implies rcur safe) */
+static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur,
+ struct idpair *idmap)
+{
+ if (!(rold->live & REG_LIVE_READ))
+ /* explored state didn't use this */
return true;
- /* old(off=20,r=10) is equal to cur(off=22,re=22 or 5 or 0)
- * since both cannot be used for packet access and safe(old)
- * pointer has smaller off that could be used for further
- * 'if (ptr > data_end)' check
- * Ex:
- * old(off=20,r=10) and cur(off=22,r=22) and cur(off=22,r=0) mean
- * that we cannot access the packet.
- * The safe range is:
- * [ptr, ptr + range - off)
- * so whenever off >=range, it means no safe bytes from this pointer.
- * When comparing old->off <= cur->off, it means that older code
- * went with smaller offset and that offset was later
- * used to figure out the safe range after 'if (ptr > data_end)' check
- * Say, 'old' state was explored like:
- * ... R3(off=0, r=0)
- * R4 = R3 + 20
- * ... now R4(off=20,r=0) <-- here
- * if (R4 > data_end)
- * ... R4(off=20,r=20), R3(off=0,r=20) and R3 can be used to access.
- * ... the code further went all the way to bpf_exit.
- * Now the 'cur' state at the mark 'here' has R4(off=30,r=0).
- * old_R4(off=20,r=0) equal to cur_R4(off=30,r=0), since if the verifier
- * goes further, such cur_R4 will give larger safe packet range after
- * 'if (R4 > data_end)' and all further insn were already good with r=20,
- * so they will be good with r=30 and we can prune the search.
- */
- if (!env->strict_alignment && old->off <= cur->off &&
- old->off >= old->range && cur->off >= cur->range)
+ if (memcmp(rold, rcur, offsetof(struct bpf_reg_state, live)) == 0)
return true;
+ if (rold->type == NOT_INIT)
+ /* explored state can't have used this */
+ return true;
+ if (rcur->type == NOT_INIT)
+ return false;
+ switch (rold->type) {
+ case SCALAR_VALUE:
+ if (rcur->type == SCALAR_VALUE) {
+ /* new val must satisfy old val knowledge */
+ return range_within(rold, rcur) &&
+ tnum_in(rold->var_off, rcur->var_off);
+ } else {
+ /* if we knew anything about the old value, we're not
+ * equal, because we can't know anything about the
+ * scalar value of the pointer in the new value.
+ */
+ return rold->umin_value == 0 &&
+ rold->umax_value == U64_MAX &&
+ rold->smin_value == S64_MIN &&
+ rold->smax_value == S64_MAX &&
+ tnum_is_unknown(rold->var_off);
+ }
+ case PTR_TO_MAP_VALUE:
+ /* If the new min/max/var_off satisfy the old ones and
+ * everything else matches, we are OK.
+ * We don't care about the 'id' value, because nothing
+ * uses it for PTR_TO_MAP_VALUE (only for ..._OR_NULL)
+ */
+ return memcmp(rold, rcur, offsetof(struct bpf_reg_state, id)) == 0 &&
+ range_within(rold, rcur) &&
+ tnum_in(rold->var_off, rcur->var_off);
+ case PTR_TO_MAP_VALUE_OR_NULL:
+ /* a PTR_TO_MAP_VALUE could be safe to use as a
+ * PTR_TO_MAP_VALUE_OR_NULL into the same map.
+ * However, if the old PTR_TO_MAP_VALUE_OR_NULL then got NULL-
+ * checked, doing so could have affected others with the same
+ * id, and we can't check for that because we lost the id when
+ * we converted to a PTR_TO_MAP_VALUE.
+ */
+ if (rcur->type != PTR_TO_MAP_VALUE_OR_NULL)
+ return false;
+ if (memcmp(rold, rcur, offsetof(struct bpf_reg_state, id)))
+ return false;
+ /* Check our ids match any regs they're supposed to */
+ return check_ids(rold->id, rcur->id, idmap);
+ case PTR_TO_PACKET:
+ if (rcur->type != PTR_TO_PACKET)
+ return false;
+ /* We must have at least as much range as the old ptr
+ * did, so that any accesses which were safe before are
+ * still safe. This is true even if old range < old off,
+ * since someone could have accessed through (ptr - k), or
+ * even done ptr -= k in a register, to get a safe access.
+ */
+ if (rold->range > rcur->range)
+ return false;
+ /* If the offsets don't match, we can't trust our alignment;
+ * nor can we be sure that we won't fall out of range.
+ */
+ if (rold->off != rcur->off)
+ return false;
+ /* id relations must be preserved */
+ if (rold->id && !check_ids(rold->id, rcur->id, idmap))
+ return false;
+ /* new val must satisfy old val knowledge */
+ return range_within(rold, rcur) &&
+ tnum_in(rold->var_off, rcur->var_off);
+ case PTR_TO_CTX:
+ case CONST_PTR_TO_MAP:
+ case PTR_TO_STACK:
+ case PTR_TO_PACKET_END:
+ /* Only valid matches are exact, which memcmp() above
+ * would have accepted
+ */
+ default:
+ /* Don't know what's going on, just say it's not safe */
+ return false;
+ }
+
+ /* Shouldn't get here; if we do, say it's not safe */
+ WARN_ON_ONCE(1);
return false;
}
@@ -2934,44 +3367,18 @@ static bool states_equal(struct bpf_verifier_env *env,
struct bpf_verifier_state *old,
struct bpf_verifier_state *cur)
{
- bool varlen_map_access = env->varlen_map_value_access;
- struct bpf_reg_state *rold, *rcur;
+ struct idpair *idmap;
+ bool ret = false;
int i;
- for (i = 0; i < MAX_BPF_REG; i++) {
- rold = &old->regs[i];
- rcur = &cur->regs[i];
-
- if (memcmp(rold, rcur, sizeof(*rold)) == 0)
- continue;
-
- /* If the ranges were not the same, but everything else was and
- * we didn't do a variable access into a map then we are a-ok.
- */
- if (!varlen_map_access &&
- memcmp(rold, rcur, offsetofend(struct bpf_reg_state, id)) == 0)
- continue;
-
- /* If we didn't map access then again we don't care about the
- * mismatched range values and it's ok if our old type was
- * UNKNOWN and we didn't go to a NOT_INIT'ed reg.
- */
- if (rold->type == NOT_INIT ||
- (!varlen_map_access && rold->type == UNKNOWN_VALUE &&
- rcur->type != NOT_INIT))
- continue;
-
- /* Don't care about the reg->id in this case. */
- if (rold->type == PTR_TO_MAP_VALUE_OR_NULL &&
- rcur->type == PTR_TO_MAP_VALUE_OR_NULL &&
- rold->map_ptr == rcur->map_ptr)
- continue;
-
- if (rold->type == PTR_TO_PACKET && rcur->type == PTR_TO_PACKET &&
- compare_ptrs_to_packet(env, rold, rcur))
- continue;
-
+ idmap = kcalloc(ID_MAP_SIZE, sizeof(struct idpair), GFP_KERNEL);
+ /* If we failed to allocate the idmap, just say it's not safe */
+ if (!idmap)
return false;
+
+ for (i = 0; i < MAX_BPF_REG; i++) {
+ if (!regsafe(&old->regs[i], &cur->regs[i], idmap))
+ goto out_free;
}
for (i = 0; i < MAX_BPF_STACK; i++) {
@@ -2983,35 +3390,104 @@ static bool states_equal(struct bpf_verifier_env *env,
* this verifier states are not equivalent,
* return false to continue verification of this path
*/
- return false;
+ goto out_free;
if (i % BPF_REG_SIZE)
continue;
if (old->stack_slot_type[i] != STACK_SPILL)
continue;
- if (memcmp(&old->spilled_regs[i / BPF_REG_SIZE],
- &cur->spilled_regs[i / BPF_REG_SIZE],
- sizeof(old->spilled_regs[0])))
- /* when explored and current stack slot types are
- * the same, check that stored pointers types
+ if (!regsafe(&old->spilled_regs[i / BPF_REG_SIZE],
+ &cur->spilled_regs[i / BPF_REG_SIZE],
+ idmap))
+ /* when explored and current stack slot are both storing
+ * spilled registers, check that stored pointers types
* are the same as well.
* Ex: explored safe path could have stored
- * (bpf_reg_state) {.type = PTR_TO_STACK, .imm = -8}
+ * (bpf_reg_state) {.type = PTR_TO_STACK, .off = -8}
* but current path has stored:
- * (bpf_reg_state) {.type = PTR_TO_STACK, .imm = -16}
+ * (bpf_reg_state) {.type = PTR_TO_STACK, .off = -16}
* such verifier states are not equivalent.
* return false to continue verification of this path
*/
- return false;
+ goto out_free;
else
continue;
}
- return true;
+ ret = true;
+out_free:
+ kfree(idmap);
+ return ret;
+}
+
+/* A write screens off any subsequent reads; but write marks come from the
+ * straight-line code between a state and its parent. When we arrive at a
+ * jump target (in the first iteration of the propagate_liveness() loop),
+ * we didn't arrive by the straight-line code, so read marks in state must
+ * propagate to parent regardless of state's write marks.
+ */
+static bool do_propagate_liveness(const struct bpf_verifier_state *state,
+ struct bpf_verifier_state *parent)
+{
+ bool writes = parent == state->parent; /* Observe write marks */
+ bool touched = false; /* any changes made? */
+ int i;
+
+ if (!parent)
+ return touched;
+ /* Propagate read liveness of registers... */
+ BUILD_BUG_ON(BPF_REG_FP + 1 != MAX_BPF_REG);
+ /* We don't need to worry about FP liveness because it's read-only */
+ for (i = 0; i < BPF_REG_FP; i++) {
+ if (parent->regs[i].live & REG_LIVE_READ)
+ continue;
+ if (writes && (state->regs[i].live & REG_LIVE_WRITTEN))
+ continue;
+ if (state->regs[i].live & REG_LIVE_READ) {
+ parent->regs[i].live |= REG_LIVE_READ;
+ touched = true;
+ }
+ }
+ /* ... and stack slots */
+ for (i = 0; i < MAX_BPF_STACK / BPF_REG_SIZE; i++) {
+ if (parent->stack_slot_type[i * BPF_REG_SIZE] != STACK_SPILL)
+ continue;
+ if (state->stack_slot_type[i * BPF_REG_SIZE] != STACK_SPILL)
+ continue;
+ if (parent->spilled_regs[i].live & REG_LIVE_READ)
+ continue;
+ if (writes && (state->spilled_regs[i].live & REG_LIVE_WRITTEN))
+ continue;
+ if (state->spilled_regs[i].live & REG_LIVE_READ) {
+ parent->spilled_regs[i].live |= REG_LIVE_READ;
+ touched = true;
+ }
+ }
+ return touched;
+}
+
+/* "parent" is "a state from which we reach the current state", but initially
+ * it is not the state->parent (i.e. "the state whose straight-line code leads
+ * to the current state"), instead it is the state that happened to arrive at
+ * a (prunable) equivalent of the current state. See comment above
+ * do_propagate_liveness() for consequences of this.
+ * This function is just a more efficient way of calling mark_reg_read() or
+ * mark_stack_slot_read() on each reg in "parent" that is read in "state",
+ * though it requires that parent != state->parent in the call arguments.
+ */
+static void propagate_liveness(const struct bpf_verifier_state *state,
+ struct bpf_verifier_state *parent)
+{
+ while (do_propagate_liveness(state, parent)) {
+ /* Something changed, so we need to feed those changes onward */
+ state = parent;
+ parent = state->parent;
+ }
}
static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
{
struct bpf_verifier_state_list *new_sl;
struct bpf_verifier_state_list *sl;
+ int i;
sl = env->explored_states[insn_idx];
if (!sl)
@@ -3021,11 +3497,20 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
return 0;
while (sl != STATE_LIST_MARK) {
- if (states_equal(env, &sl->state, &env->cur_state))
+ if (states_equal(env, &sl->state, &env->cur_state)) {
/* reached equivalent register/stack state,
- * prune the search
+ * prune the search.
+ * Registers read by the continuation are read by us.
+ * If we have any write marks in env->cur_state, they
+ * will prevent corresponding reads in the continuation
+ * from reaching our parent (an explored_state). Our
+ * own state will get the read marks recorded, but
+ * they'll be immediately forgotten as we're pruning
+ * this state and will pop a new one.
*/
+ propagate_liveness(&sl->state, &env->cur_state);
return 1;
+ }
sl = sl->next;
}
@@ -3043,6 +3528,19 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
memcpy(&new_sl->state, &env->cur_state, sizeof(env->cur_state));
new_sl->next = env->explored_states[insn_idx];
env->explored_states[insn_idx] = new_sl;
+ /* connect new state to parentage chain */
+ env->cur_state.parent = &new_sl->state;
+ /* clear write marks in current state: the writes we did are not writes
+ * our child did, so they don't screen off its reads from us.
+ * (There are no read marks in current state, because reads always mark
+ * their parent and current state never has children yet. Only
+ * explored_states can get read marks.)
+ */
+ for (i = 0; i < BPF_REG_FP; i++)
+ env->cur_state.regs[i].live = REG_LIVE_NONE;
+ for (i = 0; i < MAX_BPF_STACK / BPF_REG_SIZE; i++)
+ if (env->cur_state.stack_slot_type[i * BPF_REG_SIZE] == STACK_SPILL)
+ env->cur_state.spilled_regs[i].live = REG_LIVE_NONE;
return 0;
}
@@ -3066,8 +3564,8 @@ static int do_check(struct bpf_verifier_env *env)
bool do_print_state = false;
init_reg_state(regs);
+ state->parent = NULL;
insn_idx = 0;
- env->varlen_map_value_access = false;
for (;;) {
struct bpf_insn *insn;
u8 class;
@@ -3136,11 +3634,11 @@ static int do_check(struct bpf_verifier_env *env)
/* check for reserved fields is already done */
/* check src operand */
- err = check_reg_arg(regs, insn->src_reg, SRC_OP);
+ err = check_reg_arg(env, insn->src_reg, SRC_OP);
if (err)
return err;
- err = check_reg_arg(regs, insn->dst_reg, DST_OP_NO_MARK);
+ err = check_reg_arg(env, insn->dst_reg, DST_OP_NO_MARK);
if (err)
return err;
@@ -3190,11 +3688,11 @@ static int do_check(struct bpf_verifier_env *env)
}
/* check src1 operand */
- err = check_reg_arg(regs, insn->src_reg, SRC_OP);
+ err = check_reg_arg(env, insn->src_reg, SRC_OP);
if (err)
return err;
/* check src2 operand */
- err = check_reg_arg(regs, insn->dst_reg, SRC_OP);
+ err = check_reg_arg(env, insn->dst_reg, SRC_OP);
if (err)
return err;
@@ -3225,7 +3723,7 @@ static int do_check(struct bpf_verifier_env *env)
return -EINVAL;
}
/* check src operand */
- err = check_reg_arg(regs, insn->dst_reg, SRC_OP);
+ err = check_reg_arg(env, insn->dst_reg, SRC_OP);
if (err)
return err;
@@ -3279,7 +3777,7 @@ static int do_check(struct bpf_verifier_env *env)
* of bpf_exit, which means that program wrote
* something into it earlier
*/
- err = check_reg_arg(regs, BPF_REG_0, SRC_OP);
+ err = check_reg_arg(env, BPF_REG_0, SRC_OP);
if (err)
return err;
@@ -3319,7 +3817,6 @@ process_bpf_exit:
verbose("invalid BPF_LD mode\n");
return -EINVAL;
}
- reset_reg_range_values(regs, insn->dst_reg);
} else {
verbose("unknown insn class %d\n", class);
return -EINVAL;
@@ -3678,7 +4175,11 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
continue;
}
- if (ebpf_jit_enabled() && insn->imm == BPF_FUNC_map_lookup_elem) {
+ /* BPF_EMIT_CALL() assumptions in some of the map_gen_lookup
+ * handlers are currently limited to 64 bit only.
+ */
+ if (ebpf_jit_enabled() && BITS_PER_LONG == 64 &&
+ insn->imm == BPF_FUNC_map_lookup_elem) {
map_ptr = env->insn_aux_data[i + delta].map_ptr;
if (map_ptr == BPF_MAP_PTR_POISON ||
!map_ptr->ops->map_gen_lookup)
@@ -3703,6 +4204,27 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
continue;
}
+ if (insn->imm == BPF_FUNC_redirect_map) {
+ /* Note, we cannot use prog directly as imm as subsequent
+ * rewrites would still change the prog pointer. The only
+ * stable address we can use is aux, which also works with
+ * prog clones during blinding.
+ */
+ u64 addr = (unsigned long)prog->aux;
+ struct bpf_insn r4_ld[] = {
+ BPF_LD_IMM64(BPF_REG_4, addr),
+ *insn,
+ };
+ cnt = ARRAY_SIZE(r4_ld);
+
+ new_prog = bpf_patch_insn_data(env, i + delta, r4_ld, cnt);
+ if (!new_prog)
+ return -ENOMEM;
+
+ delta += cnt - 1;
+ env->prog = prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
+ }
patch_call_imm:
fn = prog->aux->ops->get_func_proto(insn->imm);
/* all functions that have prototype and verifier allowed
diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h
index 8b4c3c2f2509..5151ff256c29 100644
--- a/kernel/cgroup/cgroup-internal.h
+++ b/kernel/cgroup/cgroup-internal.h
@@ -156,6 +156,8 @@ static inline void get_css_set(struct css_set *cset)
bool cgroup_ssid_enabled(int ssid);
bool cgroup_on_dfl(const struct cgroup *cgrp);
+bool cgroup_is_thread_root(struct cgroup *cgrp);
+bool cgroup_is_threaded(struct cgroup *cgrp);
struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root);
struct cgroup *task_cgroup_from_root(struct task_struct *task,
@@ -173,7 +175,7 @@ struct dentry *cgroup_do_mount(struct file_system_type *fs_type, int flags,
struct cgroup_root *root, unsigned long magic,
struct cgroup_namespace *ns);
-bool cgroup_may_migrate_to(struct cgroup *dst_cgrp);
+int cgroup_migrate_vet_dst(struct cgroup *dst_cgrp);
void cgroup_migrate_finish(struct cgroup_mgctx *mgctx);
void cgroup_migrate_add_src(struct css_set *src_cset, struct cgroup *dst_cgrp,
struct cgroup_mgctx *mgctx);
@@ -183,10 +185,10 @@ int cgroup_migrate(struct task_struct *leader, bool threadgroup,
int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader,
bool threadgroup);
-ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
- size_t nbytes, loff_t off, bool threadgroup);
-ssize_t cgroup_procs_write(struct kernfs_open_file *of, char *buf, size_t nbytes,
- loff_t off);
+struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup)
+ __acquires(&cgroup_threadgroup_rwsem);
+void cgroup_procs_write_finish(struct task_struct *task)
+ __releases(&cgroup_threadgroup_rwsem);
void cgroup_lock_and_drain_offline(struct cgroup *cgrp);
diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
index 7bf4b1533f34..024085daab1a 100644
--- a/kernel/cgroup/cgroup-v1.c
+++ b/kernel/cgroup/cgroup-v1.c
@@ -99,8 +99,9 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
if (cgroup_on_dfl(to))
return -EINVAL;
- if (!cgroup_may_migrate_to(to))
- return -EBUSY;
+ ret = cgroup_migrate_vet_dst(to);
+ if (ret)
+ return ret;
mutex_lock(&cgroup_mutex);
@@ -121,7 +122,7 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
* ->can_attach() fails.
*/
do {
- css_task_iter_start(&from->self, &it);
+ css_task_iter_start(&from->self, 0, &it);
task = css_task_iter_next(&it);
if (task)
get_task_struct(task);
@@ -373,7 +374,7 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
if (!array)
return -ENOMEM;
/* now, populate the array */
- css_task_iter_start(&cgrp->self, &it);
+ css_task_iter_start(&cgrp->self, 0, &it);
while ((tsk = css_task_iter_next(&it))) {
if (unlikely(n == length))
break;
@@ -510,10 +511,58 @@ static int cgroup_pidlist_show(struct seq_file *s, void *v)
return 0;
}
-static ssize_t cgroup_tasks_write(struct kernfs_open_file *of,
- char *buf, size_t nbytes, loff_t off)
+static ssize_t __cgroup1_procs_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off,
+ bool threadgroup)
{
- return __cgroup_procs_write(of, buf, nbytes, off, false);
+ struct cgroup *cgrp;
+ struct task_struct *task;
+ const struct cred *cred, *tcred;
+ ssize_t ret;
+
+ cgrp = cgroup_kn_lock_live(of->kn, false);
+ if (!cgrp)
+ return -ENODEV;
+
+ task = cgroup_procs_write_start(buf, threadgroup);
+ ret = PTR_ERR_OR_ZERO(task);
+ if (ret)
+ goto out_unlock; /* @task is an error pointer; skip the _finish() call */
+
+ /*
+ * Even if we're attaching all tasks in the thread group, we only
+ * need to check permissions on one of them.  The write is refused
+ * with -EACCES unless the writer's euid is GLOBAL_ROOT_UID or
+ * equals the target task's uid or suid.
+ */
+ cred = current_cred();
+ tcred = get_task_cred(task);
+ if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) &&
+ !uid_eq(cred->euid, tcred->uid) &&
+ !uid_eq(cred->euid, tcred->suid))
+ ret = -EACCES;
+ put_cred(tcred);
+ if (ret)
+ goto out_finish;
+
+ ret = cgroup_attach_task(cgrp, task, threadgroup);
+
+out_finish:
+ cgroup_procs_write_finish(task);
+out_unlock:
+ cgroup_kn_unlock(of->kn);
+
+ return ret ?: nbytes; /* error code if set, otherwise @nbytes */
+}
+
+static ssize_t cgroup1_procs_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ return __cgroup1_procs_write(of, buf, nbytes, off, true); /* threadgroup */
+}
+
+static ssize_t cgroup1_tasks_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ return __cgroup1_procs_write(of, buf, nbytes, off, false); /* !threadgroup */
}
static ssize_t cgroup_release_agent_write(struct kernfs_open_file *of,
@@ -592,7 +641,7 @@ struct cftype cgroup1_base_files[] = {
.seq_stop = cgroup_pidlist_stop,
.seq_show = cgroup_pidlist_show,
.private = CGROUP_FILE_PROCS,
- .write = cgroup_procs_write,
+ .write = cgroup1_procs_write,
},
{
.name = "cgroup.clone_children",
@@ -611,7 +660,7 @@ struct cftype cgroup1_base_files[] = {
.seq_stop = cgroup_pidlist_stop,
.seq_show = cgroup_pidlist_show,
.private = CGROUP_FILE_TASKS,
- .write = cgroup_tasks_write,
+ .write = cgroup1_tasks_write,
},
{
.name = "notify_on_release",
@@ -701,7 +750,7 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
}
rcu_read_unlock();
- css_task_iter_start(&cgrp->self, &it);
+ css_task_iter_start(&cgrp->self, 0, &it);
while ((tsk = css_task_iter_next(&it))) {
switch (tsk->state) {
case TASK_RUNNING:
@@ -846,6 +895,8 @@ static int cgroup1_show_options(struct seq_file *seq, struct kernfs_root *kf_roo
seq_puts(seq, ",noprefix");
if (root->flags & CGRP_ROOT_XATTR)
seq_puts(seq, ",xattr");
+ if (root->flags & CGRP_ROOT_CPUSET_V2_MODE)
+ seq_puts(seq, ",cpuset_v2_mode");
spin_lock(&release_agent_path_lock);
if (strlen(root->release_agent_path))
@@ -900,6 +951,10 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
opts->cpuset_clone_children = true;
continue;
}
+ if (!strcmp(token, "cpuset_v2_mode")) {
+ opts->flags |= CGRP_ROOT_CPUSET_V2_MODE;
+ continue;
+ }
if (!strcmp(token, "xattr")) {
opts->flags |= CGRP_ROOT_XATTR;
continue;
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index df2e0f14a95d..44857278eb8a 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -162,6 +162,9 @@ static u16 cgrp_dfl_inhibit_ss_mask;
/* some controllers are implicitly enabled on the default hierarchy */
static u16 cgrp_dfl_implicit_ss_mask;
+/* some controllers can be threaded on the default hierarchy */
+static u16 cgrp_dfl_threaded_ss_mask;
+
/* The list of hierarchy roots */
LIST_HEAD(cgroup_roots);
static int cgroup_root_count;
@@ -316,13 +319,87 @@ static void cgroup_idr_remove(struct idr *idr, int id)
spin_unlock_bh(&cgroup_idr_lock);
}
-static struct cgroup *cgroup_parent(struct cgroup *cgrp)
+static bool cgroup_has_tasks(struct cgroup *cgrp)
{
- struct cgroup_subsys_state *parent_css = cgrp->self.parent;
+ return cgrp->nr_populated_csets; /* non-zero count converts to true */
+}
- if (parent_css)
- return container_of(parent_css, struct cgroup, self);
- return NULL;
+bool cgroup_is_threaded(struct cgroup *cgrp)
+{
+ return cgrp->dom_cgrp != cgrp; /* true when @cgrp's domain is another cgroup */
+}
+
+/*
+ * Can @cgrp host both domain and threaded children?
+ *
+ * Returns true only when @cgrp has no parent.
+ */
+static bool cgroup_is_mixable(struct cgroup *cgrp)
+{
+ /*
+ * Root isn't under domain level resource control exempting it from
+ * the no-internal-process constraint, so it can serve as a thread
+ * root and a parent of resource domains at the same time.
+ */
+ return !cgroup_parent(cgrp);
+}
+
+/*
+ * Can @cgrp become a thread root?  Should always be true for a thread root.
+ *
+ * Mixable cgroups always qualify.  Otherwise @cgrp must not be threaded
+ * itself, must have no populated domain children, and must have nothing
+ * enabled in ->subtree_control outside cgrp_dfl_threaded_ss_mask.
+ */
+static bool cgroup_can_be_thread_root(struct cgroup *cgrp)
+{
+ /* mixables don't care */
+ if (cgroup_is_mixable(cgrp))
+ return true;
+
+ /* domain roots can't be nested under threaded */
+ if (cgroup_is_threaded(cgrp))
+ return false;
+
+ /* can only have either domain or threaded children */
+ if (cgrp->nr_populated_domain_children)
+ return false;
+
+ /* and no domain controllers can be enabled */
+ if (cgrp->subtree_control & ~cgrp_dfl_threaded_ss_mask)
+ return false;
+
+ return true;
+}
+
+/*
+ * Is @cgrp root of a threaded subtree?
+ *
+ * True for a non-threaded cgroup which either has threaded children, or
+ * has tasks together with at least one controller from
+ * cgrp_dfl_threaded_ss_mask enabled in ->subtree_control.
+ */
+bool cgroup_is_thread_root(struct cgroup *cgrp)
+{
+ /* thread root should be a domain */
+ if (cgroup_is_threaded(cgrp))
+ return false;
+
+ /* a domain w/ threaded children is a thread root */
+ if (cgrp->nr_threaded_children)
+ return true;
+
+ /*
+ * A domain which has tasks and explicit threaded controllers
+ * enabled is a thread root.
+ */
+ if (cgroup_has_tasks(cgrp) &&
+ (cgrp->subtree_control & cgrp_dfl_threaded_ss_mask))
+ return true;
+
+ return false;
+}
+
+/*
+ * A domain which isn't connected to the root w/o breakage can't be used.
+ *
+ * Returns false if @cgrp is threaded, or if any of its ancestors is
+ * threaded or is a non-mixable thread root; returns true otherwise.
+ */
+static bool cgroup_is_valid_domain(struct cgroup *cgrp)
+{
+ /* the cgroup itself can be a thread root */
+ if (cgroup_is_threaded(cgrp))
+ return false;
+
+ /* but the ancestors can't be unless mixable */
+ while ((cgrp = cgroup_parent(cgrp))) {
+ if (!cgroup_is_mixable(cgrp) && cgroup_is_thread_root(cgrp))
+ return false;
+ if (cgroup_is_threaded(cgrp))
+ return false;
+ }
+
+ return true;
}
/* subsystems visibly enabled on a cgroup */
@@ -331,8 +408,14 @@ static u16 cgroup_control(struct cgroup *cgrp)
struct cgroup *parent = cgroup_parent(cgrp);
u16 root_ss_mask = cgrp->root->subsys_mask;
- if (parent)
- return parent->subtree_control;
+ if (parent) {
+ u16 ss_mask = parent->subtree_control;
+
+ /* threaded cgroups can only have threaded controllers */
+ if (cgroup_is_threaded(cgrp))
+ ss_mask &= cgrp_dfl_threaded_ss_mask;
+ return ss_mask;
+ }
if (cgroup_on_dfl(cgrp))
root_ss_mask &= ~(cgrp_dfl_inhibit_ss_mask |
@@ -345,8 +428,14 @@ static u16 cgroup_ss_mask(struct cgroup *cgrp)
{
struct cgroup *parent = cgroup_parent(cgrp);
- if (parent)
- return parent->subtree_ss_mask;
+ if (parent) {
+ u16 ss_mask = parent->subtree_ss_mask;
+
+ /* threaded cgroups can only have threaded controllers */
+ if (cgroup_is_threaded(cgrp))
+ ss_mask &= cgrp_dfl_threaded_ss_mask;
+ return ss_mask;
+ }
return cgrp->root->subsys_mask;
}
@@ -436,22 +525,12 @@ out_unlock:
return css;
}
-static void __maybe_unused cgroup_get(struct cgroup *cgrp)
-{
- css_get(&cgrp->self);
-}
-
static void cgroup_get_live(struct cgroup *cgrp)
{
WARN_ON_ONCE(cgroup_is_dead(cgrp));
css_get(&cgrp->self);
}
-static bool cgroup_tryget(struct cgroup *cgrp)
-{
- return css_tryget(&cgrp->self);
-}
-
struct cgroup_subsys_state *of_css(struct kernfs_open_file *of)
{
struct cgroup *cgrp = of->kn->parent->priv;
@@ -560,9 +639,11 @@ EXPORT_SYMBOL_GPL(of_css);
*/
struct css_set init_css_set = {
.refcount = REFCOUNT_INIT(1),
+ .dom_cset = &init_css_set,
.tasks = LIST_HEAD_INIT(init_css_set.tasks),
.mg_tasks = LIST_HEAD_INIT(init_css_set.mg_tasks),
.task_iters = LIST_HEAD_INIT(init_css_set.task_iters),
+ .threaded_csets = LIST_HEAD_INIT(init_css_set.threaded_csets),
.cgrp_links = LIST_HEAD_INIT(init_css_set.cgrp_links),
.mg_preload_node = LIST_HEAD_INIT(init_css_set.mg_preload_node),
.mg_node = LIST_HEAD_INIT(init_css_set.mg_node),
@@ -570,6 +651,11 @@ struct css_set init_css_set = {
static int css_set_count = 1; /* 1 for init_css_set */
+static bool css_set_threaded(struct css_set *cset)
+{
+ return cset->dom_cset != cset; /* true when @cset's domain is another css_set */
+}
+
/**
* css_set_populated - does a css_set contain any tasks?
* @cset: target css_set
@@ -587,39 +673,48 @@ static bool css_set_populated(struct css_set *cset)
}
/**
- * cgroup_update_populated - updated populated count of a cgroup
+ * cgroup_update_populated - update the populated count of a cgroup
* @cgrp: the target cgroup
* @populated: inc or dec populated count
*
* One of the css_sets associated with @cgrp is either getting its first
- * task or losing the last. Update @cgrp->populated_cnt accordingly. The
- * count is propagated towards root so that a given cgroup's populated_cnt
- * is zero iff the cgroup and all its descendants don't contain any tasks.
+ * task or losing the last. Update @cgrp->nr_populated_* accordingly. The
+ * count is propagated towards root so that a given cgroup's
+ * nr_populated_children is zero iff none of its descendants contain any
+ * tasks.
*
- * @cgrp's interface file "cgroup.populated" is zero if
- * @cgrp->populated_cnt is zero and 1 otherwise. When @cgrp->populated_cnt
- * changes from or to zero, userland is notified that the content of the
- * interface file has changed. This can be used to detect when @cgrp and
- * its descendants become populated or empty.
+ * @cgrp's interface file "cgroup.populated" is zero if both
+ * @cgrp->nr_populated_csets and @cgrp->nr_populated_children are zero and
+ * 1 otherwise. When the sum changes from or to zero, userland is notified
+ * that the content of the interface file has changed. This can be used to
+ * detect when @cgrp and its descendants become populated or empty.
*/
static void cgroup_update_populated(struct cgroup *cgrp, bool populated)
{
+ struct cgroup *child = NULL; /* cgroup handled by the previous iteration */
+ int adj = populated ? 1 : -1;
+
lockdep_assert_held(&css_set_lock);
do {
- bool trigger;
+ bool was_populated = cgroup_is_populated(cgrp);
- if (populated)
- trigger = !cgrp->populated_cnt++;
- else
- trigger = !--cgrp->populated_cnt;
+ if (!child) { /* first iteration: adjust @cgrp's own css_set count */
+ cgrp->nr_populated_csets += adj;
+ } else {
+ if (cgroup_is_threaded(child))
+ cgrp->nr_populated_threaded_children += adj;
+ else
+ cgrp->nr_populated_domain_children += adj;
+ }
- if (!trigger)
+ if (was_populated == cgroup_is_populated(cgrp)) /* state unchanged: stop */
break;
cgroup1_check_for_release(cgrp);
cgroup_file_notify(&cgrp->events_file);
+ child = cgrp;
cgrp = cgroup_parent(cgrp);
} while (cgrp);
}
@@ -630,7 +725,7 @@ static void cgroup_update_populated(struct cgroup *cgrp, bool populated)
* @populated: whether @cset is populated or depopulated
*
* @cset is either getting the first task or losing the last. Update the
- * ->populated_cnt of all associated cgroups accordingly.
+ * populated counters of all associated cgroups accordingly.
*/
static void css_set_update_populated(struct css_set *cset, bool populated)
{
@@ -653,7 +748,7 @@ static void css_set_update_populated(struct css_set *cset, bool populated)
* css_set, @from_cset can be NULL. If @task is being disassociated
* instead of moved, @to_cset can be NULL.
*
- * This function automatically handles populated_cnt updates and
+ * This function automatically handles populated counter updates and
* css_task_iter adjustments but the caller is responsible for managing
* @from_cset and @to_cset's reference counts.
*/
@@ -737,6 +832,8 @@ void put_css_set_locked(struct css_set *cset)
if (!refcount_dec_and_test(&cset->refcount))
return;
+ WARN_ON_ONCE(!list_empty(&cset->threaded_csets));
+
/* This css_set is dead. unlink it and release cgroup and css refs */
for_each_subsys(ss, ssid) {
list_del(&cset->e_cset_node[ssid]);
@@ -753,6 +850,11 @@ void put_css_set_locked(struct css_set *cset)
kfree(link);
}
+ if (css_set_threaded(cset)) {
+ list_del(&cset->threaded_csets_node);
+ put_css_set_locked(cset->dom_cset);
+ }
+
kfree_rcu(cset, rcu_head);
}
@@ -771,6 +873,7 @@ static bool compare_css_sets(struct css_set *cset,
struct cgroup *new_cgrp,
struct cgroup_subsys_state *template[])
{
+ struct cgroup *new_dfl_cgrp;
struct list_head *l1, *l2;
/*
@@ -781,6 +884,16 @@ static bool compare_css_sets(struct css_set *cset,
if (memcmp(template, cset->subsys, sizeof(cset->subsys)))
return false;
+
+ /* @cset's domain should match the default cgroup's */
+ if (cgroup_on_dfl(new_cgrp))
+ new_dfl_cgrp = new_cgrp;
+ else
+ new_dfl_cgrp = old_cset->dfl_cgrp;
+
+ if (new_dfl_cgrp->dom_cgrp != cset->dom_cset->dfl_cgrp)
+ return false;
+
/*
* Compare cgroup pointers in order to distinguish between
* different cgroups in hierarchies. As different cgroups may
@@ -988,9 +1101,11 @@ static struct css_set *find_css_set(struct css_set *old_cset,
}
refcount_set(&cset->refcount, 1);
+ cset->dom_cset = cset;
INIT_LIST_HEAD(&cset->tasks);
INIT_LIST_HEAD(&cset->mg_tasks);
INIT_LIST_HEAD(&cset->task_iters);
+ INIT_LIST_HEAD(&cset->threaded_csets);
INIT_HLIST_NODE(&cset->hlist);
INIT_LIST_HEAD(&cset->cgrp_links);
INIT_LIST_HEAD(&cset->mg_preload_node);
@@ -1028,6 +1143,28 @@ static struct css_set *find_css_set(struct css_set *old_cset,
spin_unlock_irq(&css_set_lock);
+ /*
+ * If @cset should be threaded, look up the matching dom_cset and
+ * link them up. We first fully initialize @cset then look for the
+ * dom_cset. It's simpler this way and safe as @cset is guaranteed
+ * to stay empty until we return.
+ */
+ if (cgroup_is_threaded(cset->dfl_cgrp)) {
+ struct css_set *dcset;
+
+ dcset = find_css_set(cset, cset->dfl_cgrp->dom_cgrp);
+ if (!dcset) {
+ put_css_set(cset);
+ return NULL;
+ }
+
+ spin_lock_irq(&css_set_lock);
+ cset->dom_cset = dcset;
+ list_add_tail(&cset->threaded_csets_node,
+ &dcset->threaded_csets);
+ spin_unlock_irq(&css_set_lock);
+ }
+
return cset;
}
@@ -1155,6 +1292,8 @@ static struct cgroup *cset_cgroup_from_root(struct css_set *cset,
if (cset == &init_css_set) {
res = &root->cgrp;
+ } else if (root == &cgrp_dfl_root) {
+ res = cset->dfl_cgrp;
} else {
struct cgrp_cset_link *link;
@@ -1670,6 +1809,9 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
mutex_init(&cgrp->pidlist_mutex);
cgrp->self.cgroup = cgrp;
cgrp->self.flags |= CSS_ONLINE;
+ cgrp->dom_cgrp = cgrp;
+ cgrp->max_descendants = INT_MAX;
+ cgrp->max_depth = INT_MAX;
for_each_subsys(ss, ssid)
INIT_LIST_HEAD(&cgrp->e_csets[ssid]);
@@ -1737,7 +1879,8 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask, int ref_flags)
&cgroup_kf_syscall_ops : &cgroup1_kf_syscall_ops;
root->kf_root = kernfs_create_root(kf_sops,
- KERNFS_ROOT_CREATE_DEACTIVATED,
+ KERNFS_ROOT_CREATE_DEACTIVATED |
+ KERNFS_ROOT_SUPPORT_EXPORTOP,
root_cgrp);
if (IS_ERR(root->kf_root)) {
ret = PTR_ERR(root->kf_root);
@@ -2168,21 +2311,52 @@ out_release_tset:
list_del_init(&cset->mg_node);
}
spin_unlock_irq(&css_set_lock);
+
+ /*
+ * Re-initialize the cgroup_taskset structure in case it is reused
+ * again in another cgroup_migrate_add_task()/cgroup_migrate_execute()
+ * iteration.
+ */
+ tset->nr_tasks = 0;
+ tset->csets = &tset->src_csets;
return ret;
}
/**
- * cgroup_may_migrate_to - verify whether a cgroup can be migration destination
+ * cgroup_migrate_vet_dst - verify whether a cgroup can be migration destination
* @dst_cgrp: destination cgroup to test
*
- * On the default hierarchy, except for the root, subtree_control must be
- * zero for migration destination cgroups with tasks so that child cgroups
- * don't compete against tasks.
+ * On the default hierarchy, except for the mixable, (possible) thread root
+ * and threaded cgroups, subtree_control must be zero for migration
+ * destination cgroups with tasks so that child cgroups don't compete
+ * against tasks.
+ *
+ * Returns 0 if @dst_cgrp may be used, -EOPNOTSUPP if its domain cgroup is
+ * not a valid domain, or -EBUSY if the no-internal-process constraint
+ * applies and subtree_control is non-zero.
*/
-bool cgroup_may_migrate_to(struct cgroup *dst_cgrp)
+int cgroup_migrate_vet_dst(struct cgroup *dst_cgrp)
{
- return !cgroup_on_dfl(dst_cgrp) || !cgroup_parent(dst_cgrp) ||
- !dst_cgrp->subtree_control;
+ /* v1 doesn't have any restriction */
+ if (!cgroup_on_dfl(dst_cgrp))
+ return 0;
+
+ /* verify @dst_cgrp can host resources */
+ if (!cgroup_is_valid_domain(dst_cgrp->dom_cgrp))
+ return -EOPNOTSUPP;
+
+ /* mixables don't care */
+ if (cgroup_is_mixable(dst_cgrp))
+ return 0;
+
+ /*
+ * If @dst_cgrp is already or can become a thread root or is
+ * threaded, it doesn't matter.
+ */
+ if (cgroup_can_be_thread_root(dst_cgrp) || cgroup_is_threaded(dst_cgrp))
+ return 0;
+
+ /* apply no-internal-process constraint */
+ if (dst_cgrp->subtree_control)
+ return -EBUSY;
+
+ return 0;
}
/**
@@ -2387,8 +2561,9 @@ int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader,
struct task_struct *task;
int ret;
- if (!cgroup_may_migrate_to(dst_cgrp))
- return -EBUSY;
+ ret = cgroup_migrate_vet_dst(dst_cgrp);
+ if (ret)
+ return ret;
/* look up all src csets */
spin_lock_irq(&css_set_lock);
@@ -2415,96 +2590,23 @@ int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader,
return ret;
}
-static int cgroup_procs_write_permission(struct task_struct *task,
- struct cgroup *dst_cgrp,
- struct kernfs_open_file *of)
-{
- struct super_block *sb = of->file->f_path.dentry->d_sb;
- struct cgroup_namespace *ns = current->nsproxy->cgroup_ns;
- struct cgroup *root_cgrp = ns->root_cset->dfl_cgrp;
- struct cgroup *src_cgrp, *com_cgrp;
- struct inode *inode;
- int ret;
-
- if (!cgroup_on_dfl(dst_cgrp)) {
- const struct cred *cred = current_cred();
- const struct cred *tcred = get_task_cred(task);
-
- /*
- * even if we're attaching all tasks in the thread group,
- * we only need to check permissions on one of them.
- */
- if (uid_eq(cred->euid, GLOBAL_ROOT_UID) ||
- uid_eq(cred->euid, tcred->uid) ||
- uid_eq(cred->euid, tcred->suid))
- ret = 0;
- else
- ret = -EACCES;
-
- put_cred(tcred);
- return ret;
- }
-
- /* find the source cgroup */
- spin_lock_irq(&css_set_lock);
- src_cgrp = task_cgroup_from_root(task, &cgrp_dfl_root);
- spin_unlock_irq(&css_set_lock);
-
- /* and the common ancestor */
- com_cgrp = src_cgrp;
- while (!cgroup_is_descendant(dst_cgrp, com_cgrp))
- com_cgrp = cgroup_parent(com_cgrp);
-
- /* %current should be authorized to migrate to the common ancestor */
- inode = kernfs_get_inode(sb, com_cgrp->procs_file.kn);
- if (!inode)
- return -ENOMEM;
-
- ret = inode_permission(inode, MAY_WRITE);
- iput(inode);
- if (ret)
- return ret;
-
- /*
- * If namespaces are delegation boundaries, %current must be able
- * to see both source and destination cgroups from its namespace.
- */
- if ((cgrp_dfl_root.flags & CGRP_ROOT_NS_DELEGATE) &&
- (!cgroup_is_descendant(src_cgrp, root_cgrp) ||
- !cgroup_is_descendant(dst_cgrp, root_cgrp)))
- return -ENOENT;
-
- return 0;
-}
-
-/*
- * Find the task_struct of the task to attach by vpid and pass it along to the
- * function to attach either it or all tasks in its threadgroup. Will lock
- * cgroup_mutex and threadgroup.
- */
-ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
- size_t nbytes, loff_t off, bool threadgroup)
+struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup)
+ __acquires(&cgroup_threadgroup_rwsem)
{
struct task_struct *tsk;
- struct cgroup_subsys *ss;
- struct cgroup *cgrp;
pid_t pid;
- int ssid, ret;
if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0)
- return -EINVAL;
-
- cgrp = cgroup_kn_lock_live(of->kn, false);
- if (!cgrp)
- return -ENODEV;
+ return ERR_PTR(-EINVAL);
percpu_down_write(&cgroup_threadgroup_rwsem);
+
rcu_read_lock();
if (pid) {
tsk = find_task_by_vpid(pid);
if (!tsk) {
- ret = -ESRCH;
- goto out_unlock_rcu;
+ tsk = ERR_PTR(-ESRCH);
+ goto out_unlock_threadgroup;
}
} else {
tsk = current;
@@ -2520,35 +2622,33 @@ ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
* cgroup with no rt_runtime allocated. Just say no.
*/
if (tsk->no_cgroup_migration || (tsk->flags & PF_NO_SETAFFINITY)) {
- ret = -EINVAL;
- goto out_unlock_rcu;
+ tsk = ERR_PTR(-EINVAL);
+ goto out_unlock_threadgroup;
}
get_task_struct(tsk);
+ goto out_unlock_rcu;
+
+out_unlock_threadgroup:
+ percpu_up_write(&cgroup_threadgroup_rwsem);
+out_unlock_rcu:
rcu_read_unlock();
+ return tsk;
+}
- ret = cgroup_procs_write_permission(tsk, cgrp, of);
- if (!ret)
- ret = cgroup_attach_task(cgrp, tsk, threadgroup);
+void cgroup_procs_write_finish(struct task_struct *task)
+ __releases(&cgroup_threadgroup_rwsem)
+{
+ struct cgroup_subsys *ss;
+ int ssid;
- put_task_struct(tsk);
- goto out_unlock_threadgroup;
+ /*
+ * Release the reference from cgroup_procs_write_start().  After
+ * that, cgroup_threadgroup_rwsem is dropped and each subsystem's
+ * ->post_attach() callback, where one is set, is invoked.
+ */
+ put_task_struct(task);
-out_unlock_rcu:
- rcu_read_unlock();
-out_unlock_threadgroup:
percpu_up_write(&cgroup_threadgroup_rwsem);
for_each_subsys(ss, ssid)
if (ss->post_attach)
ss->post_attach();
- cgroup_kn_unlock(of->kn);
- return ret ?: nbytes;
-}
-
-ssize_t cgroup_procs_write(struct kernfs_open_file *of, char *buf, size_t nbytes,
- loff_t off)
-{
- return __cgroup_procs_write(of, buf, nbytes, off, true);
}
static void cgroup_print_ss_mask(struct seq_file *seq, u16 ss_mask)
@@ -2891,6 +2991,46 @@ static void cgroup_finalize_control(struct cgroup *cgrp, int ret)
cgroup_apply_control_disable(cgrp);
}
+static int cgroup_vet_subtree_control_enable(struct cgroup *cgrp, u16 enable)
+{
+ u16 domain_enable = enable & ~cgrp_dfl_threaded_ss_mask; /* bits of @enable outside the threaded mask */
+
+ /*
+ * Vet a request to enable the controllers in @enable for @cgrp.
+ * Returns 0 when allowed, -EOPNOTSUPP or -EBUSY otherwise.
+ *
+ * If nothing is getting enabled, nothing to worry about.
+ */
+ if (!enable)
+ return 0;
+
+ /* can @cgrp host any resources? */
+ if (!cgroup_is_valid_domain(cgrp->dom_cgrp))
+ return -EOPNOTSUPP;
+
+ /* mixables don't care */
+ if (cgroup_is_mixable(cgrp))
+ return 0;
+
+ if (domain_enable) {
+ /* can't enable domain controllers inside a thread subtree */
+ if (cgroup_is_thread_root(cgrp) || cgroup_is_threaded(cgrp))
+ return -EOPNOTSUPP;
+ } else {
+ /*
+ * Threaded controllers can handle internal competitions
+ * and are always allowed inside a (prospective) thread
+ * subtree.
+ */
+ if (cgroup_can_be_thread_root(cgrp) || cgroup_is_threaded(cgrp))
+ return 0;
+ }
+
+ /*
+ * Controllers can't be enabled for a cgroup with tasks to avoid
+ * child cgroups competing against tasks.
+ */
+ if (cgroup_has_tasks(cgrp))
+ return -EBUSY;
+
+ return 0;
+}
+
/* change the enabled child controllers for a cgroup in the default hierarchy */
static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
char *buf, size_t nbytes,
@@ -2966,33 +3106,9 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
goto out_unlock;
}
- /*
- * Except for the root, subtree_control must be zero for a cgroup
- * with tasks so that child cgroups don't compete against tasks.
- */
- if (enable && cgroup_parent(cgrp)) {
- struct cgrp_cset_link *link;
-
- /*
- * Because namespaces pin csets too, @cgrp->cset_links
- * might not be empty even when @cgrp is empty. Walk and
- * verify each cset.
- */
- spin_lock_irq(&css_set_lock);
-
- ret = 0;
- list_for_each_entry(link, &cgrp->cset_links, cset_link) {
- if (css_set_populated(link->cset)) {
- ret = -EBUSY;
- break;
- }
- }
-
- spin_unlock_irq(&css_set_lock);
-
- if (ret)
- goto out_unlock;
- }
+ ret = cgroup_vet_subtree_control_enable(cgrp, enable);
+ if (ret)
+ goto out_unlock;
/* save and update control masks and prepare csses */
cgroup_save_control(cgrp);
@@ -3011,6 +3127,172 @@ out_unlock:
return ret ?: nbytes;
}
+/**
+ * cgroup_enable_threaded - make @cgrp threaded
+ * @cgrp: the target cgroup
+ *
+ * Called when "threaded" is written to the cgroup.type interface file and
+ * tries to make @cgrp threaded and join the parent's resource domain.
+ * This function is never called on the root cgroup as cgroup.type doesn't
+ * exist on it.
+ *
+ * Must be called with cgroup_mutex held.
+ *
+ * Returns 0 on success or when @cgrp is already threaded, -EOPNOTSUPP if
+ * the parent's domain is not a valid domain or cannot be a thread root,
+ * or the error returned by cgroup_apply_control().
+ */
+static int cgroup_enable_threaded(struct cgroup *cgrp)
+{
+ struct cgroup *parent = cgroup_parent(cgrp);
+ struct cgroup *dom_cgrp = parent->dom_cgrp;
+ int ret;
+
+ lockdep_assert_held(&cgroup_mutex);
+
+ /* noop if already threaded */
+ if (cgroup_is_threaded(cgrp))
+ return 0;
+
+ /* we're joining the parent's domain, ensure its validity */
+ if (!cgroup_is_valid_domain(dom_cgrp) ||
+ !cgroup_can_be_thread_root(dom_cgrp))
+ return -EOPNOTSUPP;
+
+ /*
+ * The following shouldn't cause actual migrations and should
+ * always succeed.
+ */
+ cgroup_save_control(cgrp);
+
+ cgrp->dom_cgrp = dom_cgrp;
+ ret = cgroup_apply_control(cgrp);
+ if (!ret)
+ parent->nr_threaded_children++;
+ else
+ cgrp->dom_cgrp = cgrp; /* failed: point @cgrp back at itself */
+
+ cgroup_finalize_control(cgrp, ret);
+ return ret;
+}
+
+static int cgroup_type_show(struct seq_file *seq, void *v)
+{
+ struct cgroup *cgrp = seq_css(seq)->cgroup;
+
+ if (cgroup_is_threaded(cgrp)) /* checks are ordered; only the first match is printed */
+ seq_puts(seq, "threaded\n");
+ else if (!cgroup_is_valid_domain(cgrp))
+ seq_puts(seq, "domain invalid\n");
+ else if (cgroup_is_thread_root(cgrp))
+ seq_puts(seq, "domain threaded\n");
+ else
+ seq_puts(seq, "domain\n");
+
+ return 0;
+}
+
+static ssize_t cgroup_type_write(struct kernfs_open_file *of, char *buf,
+ size_t nbytes, loff_t off)
+{
+ struct cgroup *cgrp;
+ int ret;
+
+ /*
+ * Only switching to threaded mode is supported: any input other
+ * than "threaded" (compared after strstrip()) gets -EINVAL.
+ */
+ if (strcmp(strstrip(buf), "threaded"))
+ return -EINVAL;
+
+ cgrp = cgroup_kn_lock_live(of->kn, false);
+ if (!cgrp)
+ return -ENOENT;
+
+ /* threaded can only be enabled */
+ ret = cgroup_enable_threaded(cgrp);
+
+ cgroup_kn_unlock(of->kn);
+ return ret ?: nbytes; /* error code if set, otherwise @nbytes */
+}
+
+static int cgroup_max_descendants_show(struct seq_file *seq, void *v)
+{
+ struct cgroup *cgrp = seq_css(seq)->cgroup;
+ int descendants = READ_ONCE(cgrp->max_descendants);
+
+ if (descendants == INT_MAX) /* INT_MAX is reported as the keyword "max" */
+ seq_puts(seq, "max\n");
+ else
+ seq_printf(seq, "%d\n", descendants);
+
+ return 0;
+}
+
+static ssize_t cgroup_max_descendants_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ struct cgroup *cgrp;
+ int descendants;
+ ssize_t ret;
+
+ buf = strstrip(buf);
+ if (!strcmp(buf, "max")) { /* keyword "max" is stored as INT_MAX */
+ descendants = INT_MAX;
+ } else {
+ ret = kstrtoint(buf, 0, &descendants);
+ if (ret)
+ return ret;
+ }
+
+ if (descendants < 0) /* negative limits are rejected */
+ return -ERANGE;
+
+ cgrp = cgroup_kn_lock_live(of->kn, false);
+ if (!cgrp)
+ return -ENOENT;
+
+ cgrp->max_descendants = descendants;
+
+ cgroup_kn_unlock(of->kn);
+
+ return nbytes; /* success returns @nbytes */
+}
+
+static int cgroup_max_depth_show(struct seq_file *seq, void *v)
+{
+ struct cgroup *cgrp = seq_css(seq)->cgroup;
+ int depth = READ_ONCE(cgrp->max_depth);
+
+ if (depth == INT_MAX) /* INT_MAX is reported as the keyword "max" */
+ seq_puts(seq, "max\n");
+ else
+ seq_printf(seq, "%d\n", depth);
+
+ return 0;
+}
+
+static ssize_t cgroup_max_depth_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ struct cgroup *cgrp;
+ ssize_t ret;
+ int depth;
+
+ buf = strstrip(buf);
+ if (!strcmp(buf, "max")) { /* keyword "max" is stored as INT_MAX */
+ depth = INT_MAX;
+ } else {
+ ret = kstrtoint(buf, 0, &depth);
+ if (ret)
+ return ret;
+ }
+
+ if (depth < 0) /* negative limits are rejected */
+ return -ERANGE;
+
+ cgrp = cgroup_kn_lock_live(of->kn, false);
+ if (!cgrp)
+ return -ENOENT;
+
+ cgrp->max_depth = depth;
+
+ cgroup_kn_unlock(of->kn);
+
+ return nbytes; /* success returns @nbytes */
+}
+
static int cgroup_events_show(struct seq_file *seq, void *v)
{
seq_printf(seq, "populated %d\n",
@@ -3018,6 +3300,18 @@ static int cgroup_events_show(struct seq_file *seq, void *v)
return 0;
}
+static int cgroup_stat_show(struct seq_file *seq, void *v)
+{
+ struct cgroup *cgroup = seq_css(seq)->cgroup;
+
+ seq_printf(seq, "nr_descendants %d\n",
+ cgroup->nr_descendants);
+ seq_printf(seq, "nr_dying_descendants %d\n",
+ cgroup->nr_dying_descendants);
+
+ return 0; /* two "<name> <count>" lines written; always returns 0 */
+}
+
static int cgroup_file_open(struct kernfs_open_file *of)
{
struct cftype *cft = of->kn->priv;
@@ -3234,7 +3528,6 @@ restart:
static int cgroup_apply_cftypes(struct cftype *cfts, bool is_add)
{
- LIST_HEAD(pending);
struct cgroup_subsys *ss = cfts[0].ss;
struct cgroup *root = &ss->root->cgrp;
struct cgroup_subsys_state *css;
@@ -3659,6 +3952,58 @@ bool css_has_online_children(struct cgroup_subsys_state *css)
return ret;
}
+static struct css_set *css_task_iter_next_css_set(struct css_task_iter *it)
+{
+ struct list_head *l;
+ struct cgrp_cset_link *link;
+ struct css_set *cset;
+
+ lockdep_assert_held(&css_set_lock);
+
+ /*
+ * Find the next threaded cset.  While a threaded walk is active
+ * (it->tcset_pos set), entries of the current ->threaded_csets
+ * list are returned before moving on to the next cset.
+ */
+ if (it->tcset_pos) {
+ l = it->tcset_pos->next;
+
+ if (l != it->tcset_head) {
+ it->tcset_pos = l;
+ return container_of(l, struct css_set,
+ threaded_csets_node);
+ }
+
+ it->tcset_pos = NULL;
+ }
+
+ /*
+ * Find the next cset.  Returns NULL and clears it->cset_pos once
+ * the walk gets back to it->cset_head.
+ */
+ l = it->cset_pos;
+ l = l->next;
+ if (l == it->cset_head) {
+ it->cset_pos = NULL;
+ return NULL;
+ }
+
+ if (it->ss) {
+ cset = container_of(l, struct css_set, e_cset_node[it->ss->id]);
+ } else {
+ link = list_entry(l, struct cgrp_cset_link, cset_link);
+ cset = link->cset;
+ }
+
+ it->cset_pos = l;
+
+ /*
+ * Initialize threaded css_set walking.  A reference on @cset is
+ * held in it->cur_dcset, replacing (and putting) any previous one.
+ */
+ if (it->flags & CSS_TASK_ITER_THREADED) {
+ if (it->cur_dcset)
+ put_css_set_locked(it->cur_dcset);
+ it->cur_dcset = cset;
+ get_css_set(cset);
+
+ it->tcset_head = &cset->threaded_csets;
+ it->tcset_pos = &cset->threaded_csets;
+ }
+
+ return cset;
+}
+
/**
* css_task_iter_advance_css_set - advance a task iterator to the next css_set
* @it: the iterator to advance
@@ -3667,32 +4012,19 @@ bool css_has_online_children(struct cgroup_subsys_state *css)
*/
static void css_task_iter_advance_css_set(struct css_task_iter *it)
{
- struct list_head *l = it->cset_pos;
- struct cgrp_cset_link *link;
struct css_set *cset;
lockdep_assert_held(&css_set_lock);
/* Advance to the next non-empty css_set */
do {
- l = l->next;
- if (l == it->cset_head) {
- it->cset_pos = NULL;
+ cset = css_task_iter_next_css_set(it);
+ if (!cset) {
it->task_pos = NULL;
return;
}
-
- if (it->ss) {
- cset = container_of(l, struct css_set,
- e_cset_node[it->ss->id]);
- } else {
- link = list_entry(l, struct cgrp_cset_link, cset_link);
- cset = link->cset;
- }
} while (!css_set_populated(cset));
- it->cset_pos = l;
-
if (!list_empty(&cset->tasks))
it->task_pos = cset->tasks.next;
else
@@ -3732,6 +4064,7 @@ static void css_task_iter_advance(struct css_task_iter *it)
lockdep_assert_held(&css_set_lock);
WARN_ON_ONCE(!l);
+repeat:
/*
* Advance iterator to find next entry. cset->tasks is consumed
* first and then ->mg_tasks. After ->mg_tasks, we move onto the
@@ -3746,11 +4079,18 @@ static void css_task_iter_advance(struct css_task_iter *it)
css_task_iter_advance_css_set(it);
else
it->task_pos = l;
+
+ /* if PROCS, skip over tasks which aren't group leaders */
+ if ((it->flags & CSS_TASK_ITER_PROCS) && it->task_pos &&
+ !thread_group_leader(list_entry(it->task_pos, struct task_struct,
+ cg_list)))
+ goto repeat;
}
/**
* css_task_iter_start - initiate task iteration
* @css: the css to walk tasks of
+ * @flags: CSS_TASK_ITER_* flags
* @it: the task iterator to use
*
* Initiate iteration through the tasks of @css. The caller can call
@@ -3758,7 +4098,7 @@ static void css_task_iter_advance(struct css_task_iter *it)
* returns NULL. On completion of iteration, css_task_iter_end() must be
* called.
*/
-void css_task_iter_start(struct cgroup_subsys_state *css,
+void css_task_iter_start(struct cgroup_subsys_state *css, unsigned int flags,
struct css_task_iter *it)
{
/* no one should try to iterate before mounting cgroups */
@@ -3769,6 +4109,7 @@ void css_task_iter_start(struct cgroup_subsys_state *css,
spin_lock_irq(&css_set_lock);
it->ss = css->ss;
+ it->flags = flags;
if (it->ss)
it->cset_pos = &css->cgroup->e_csets[css->ss->id];
@@ -3826,6 +4167,9 @@ void css_task_iter_end(struct css_task_iter *it)
spin_unlock_irq(&css_set_lock);
}
+ if (it->cur_dcset)
+ put_css_set(it->cur_dcset);
+
if (it->cur_task)
put_task_struct(it->cur_task);
}
@@ -3842,16 +4186,12 @@ static void *cgroup_procs_next(struct seq_file *s, void *v, loff_t *pos)
{
struct kernfs_open_file *of = s->private;
struct css_task_iter *it = of->priv;
- struct task_struct *task;
- do {
- task = css_task_iter_next(it);
- } while (task && !thread_group_leader(task));
-
- return task;
+ return css_task_iter_next(it);
}
-static void *cgroup_procs_start(struct seq_file *s, loff_t *pos)
+static void *__cgroup_procs_start(struct seq_file *s, loff_t *pos,
+ unsigned int iter_flags)
{
struct kernfs_open_file *of = s->private;
struct cgroup *cgrp = seq_css(s)->cgroup;
@@ -3869,24 +4209,169 @@ static void *cgroup_procs_start(struct seq_file *s, loff_t *pos)
if (!it)
return ERR_PTR(-ENOMEM);
of->priv = it;
- css_task_iter_start(&cgrp->self, it);
+ css_task_iter_start(&cgrp->self, iter_flags, it);
} else if (!(*pos)++) {
css_task_iter_end(it);
- css_task_iter_start(&cgrp->self, it);
+ css_task_iter_start(&cgrp->self, iter_flags, it);
}
return cgroup_procs_next(s, NULL, NULL);
}
+/*
+ * Start callback for reading cgroup.procs.
+ *
+ * Returns ERR_PTR(-EOPNOTSUPP) when cgroup_is_threaded() is true for the
+ * cgroup behind @s. Otherwise defers to __cgroup_procs_start() with
+ * CSS_TASK_ITER_PROCS | CSS_TASK_ITER_THREADED and returns its result.
+ */
+static void *cgroup_procs_start(struct seq_file *s, loff_t *pos)
+{
+ struct cgroup *cgrp = seq_css(s)->cgroup;
+
+ /*
+ * All processes of a threaded subtree belong to the domain cgroup
+ * of the subtree. Only threads can be distributed across the
+ * subtree. Reject reads on cgroup.procs in the subtree proper.
+ * They're always empty anyway.
+ */
+ if (cgroup_is_threaded(cgrp))
+ return ERR_PTR(-EOPNOTSUPP);
+
+ return __cgroup_procs_start(s, pos, CSS_TASK_ITER_PROCS |
+ CSS_TASK_ITER_THREADED);
+}
+
static int cgroup_procs_show(struct seq_file *s, void *v)
{
- seq_printf(s, "%d\n", task_tgid_vnr(v));
+ seq_printf(s, "%d\n", task_pid_vnr(v));
+ return 0;
+}
+
+/*
+ * Check whether %current may migrate a task from @src_cgrp to @dst_cgrp.
+ * @src_cgrp: cgroup the task is currently in
+ * @dst_cgrp: cgroup the task is being moved to
+ * @sb: super_block used to look up the inode of the cgroup.procs file
+ *
+ * Must be called with cgroup_mutex held (asserted via lockdep).
+ *
+ * Returns 0 when the migration is allowed, -ENOMEM when the inode of the
+ * common ancestor's procs file cannot be obtained, the error from
+ * inode_permission() when write access is denied, or -ENOENT when
+ * CGRP_ROOT_NS_DELEGATE is set and either cgroup is not a descendant of
+ * the root cgroup of %current's cgroup namespace.
+ */
+static int cgroup_procs_write_permission(struct cgroup *src_cgrp,
+ struct cgroup *dst_cgrp,
+ struct super_block *sb)
+{
+ struct cgroup_namespace *ns = current->nsproxy->cgroup_ns;
+ struct cgroup *com_cgrp = src_cgrp;
+ struct inode *inode;
+ int ret;
+
+ lockdep_assert_held(&cgroup_mutex);
+
+ /* find the common ancestor */
+ /* walk up from @src_cgrp until @dst_cgrp is a descendant of it */
+ while (!cgroup_is_descendant(dst_cgrp, com_cgrp))
+ com_cgrp = cgroup_parent(com_cgrp);
+
+ /* %current should be authorized to migrate to the common ancestor */
+ inode = kernfs_get_inode(sb, com_cgrp->procs_file.kn);
+ if (!inode)
+ return -ENOMEM;
+
+ ret = inode_permission(inode, MAY_WRITE);
+ /* the inode was only needed for the permission check */
+ iput(inode);
+ if (ret)
+ return ret;
+
+ /*
+ * If namespaces are delegation boundaries, %current must be able
+ * to see both source and destination cgroups from its namespace.
+ */
+ if ((cgrp_dfl_root.flags & CGRP_ROOT_NS_DELEGATE) &&
+ (!cgroup_is_descendant(src_cgrp, ns->root_cset->dfl_cgrp) ||
+ !cgroup_is_descendant(dst_cgrp, ns->root_cset->dfl_cgrp)))
+ return -ENOENT;
+
return 0;
}
+/*
+ * Write handler for cgroup.procs: migrate the task named in @buf into the
+ * cgroup the file belongs to.
+ *
+ * The destination cgroup is pinned with cgroup_kn_lock_live(); the task is
+ * resolved by cgroup_procs_write_start(); the move is authorized by
+ * cgroup_procs_write_permission() and carried out by cgroup_attach_task().
+ * Both cgroup_procs_write_start() and cgroup_attach_task() are passed
+ * "true" here, where cgroup_threads_write() passes "false" (presumably
+ * selecting whole-process rather than single-thread migration - confirm
+ * against those helpers).
+ *
+ * Returns @nbytes on success, -ENODEV when the destination cgroup cannot be
+ * locked live, or the negative error of the first step that failed.
+ */
+static ssize_t cgroup_procs_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ struct cgroup *src_cgrp, *dst_cgrp;
+ struct task_struct *task;
+ ssize_t ret;
+
+ dst_cgrp = cgroup_kn_lock_live(of->kn, false);
+ if (!dst_cgrp)
+ return -ENODEV;
+
+ task = cgroup_procs_write_start(buf, true);
+ ret = PTR_ERR_OR_ZERO(task);
+ if (ret)
+ goto out_unlock;
+
+ /* find the source cgroup */
+ spin_lock_irq(&css_set_lock);
+ src_cgrp = task_cgroup_from_root(task, &cgrp_dfl_root);
+ spin_unlock_irq(&css_set_lock);
+
+ ret = cgroup_procs_write_permission(src_cgrp, dst_cgrp,
+ of->file->f_path.dentry->d_sb);
+ if (ret)
+ goto out_finish;
+
+ ret = cgroup_attach_task(dst_cgrp, task, true);
+
+out_finish:
+ cgroup_procs_write_finish(task);
+out_unlock:
+ cgroup_kn_unlock(of->kn);
+
+ /* a zero result means success: report the whole write as consumed */
+ return ret ?: nbytes;
+}
+
+/*
+ * Start callback for reading cgroup.threads (installed as .seq_start of
+ * that file). Uses the shared __cgroup_procs_start() with no iterator
+ * flags, so CSS_TASK_ITER_PROCS is not set and the iterator does not skip
+ * tasks that are not thread group leaders.
+ */
+static void *cgroup_threads_start(struct seq_file *s, loff_t *pos)
+{
+ return __cgroup_procs_start(s, pos, 0);
+}
+
+/*
+ * Write handler for cgroup.threads: migrate the task named in @buf into the
+ * cgroup the file belongs to.
+ *
+ * Follows the same sequence as cgroup_procs_write(), with two differences
+ * visible here: "false" is passed to cgroup_procs_write_start() and
+ * cgroup_attach_task(), and the source and destination cgroups must share
+ * the same ->dom_cgrp or the write fails with -EOPNOTSUPP.
+ *
+ * Returns @nbytes on success, -ENODEV when the destination cgroup cannot be
+ * locked live, or the negative error of the first step that failed.
+ */
+static ssize_t cgroup_threads_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ struct cgroup *src_cgrp, *dst_cgrp;
+ struct task_struct *task;
+ ssize_t ret;
+
+ /* trim surrounding whitespace from the user-supplied string */
+ buf = strstrip(buf);
+
+ dst_cgrp = cgroup_kn_lock_live(of->kn, false);
+ if (!dst_cgrp)
+ return -ENODEV;
+
+ task = cgroup_procs_write_start(buf, false);
+ ret = PTR_ERR_OR_ZERO(task);
+ if (ret)
+ goto out_unlock;
+
+ /* find the source cgroup */
+ spin_lock_irq(&css_set_lock);
+ src_cgrp = task_cgroup_from_root(task, &cgrp_dfl_root);
+ spin_unlock_irq(&css_set_lock);
+
+ /* thread migrations follow the cgroup.procs delegation rule */
+ ret = cgroup_procs_write_permission(src_cgrp, dst_cgrp,
+ of->file->f_path.dentry->d_sb);
+ if (ret)
+ goto out_finish;
+
+ /* and must be contained in the same domain */
+ ret = -EOPNOTSUPP;
+ if (src_cgrp->dom_cgrp != dst_cgrp->dom_cgrp)
+ goto out_finish;
+
+ ret = cgroup_attach_task(dst_cgrp, task, false);
+
+out_finish:
+ cgroup_procs_write_finish(task);
+out_unlock:
+ cgroup_kn_unlock(of->kn);
+
+ /* a zero result means success: report the whole write as consumed */
+ return ret ?: nbytes;
+}
+
/* cgroup core interface files for the default hierarchy */
static struct cftype cgroup_base_files[] = {
{
+ .name = "cgroup.type",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .seq_show = cgroup_type_show,
+ .write = cgroup_type_write,
+ },
+ {
.name = "cgroup.procs",
.flags = CFTYPE_NS_DELEGATABLE,
.file_offset = offsetof(struct cgroup, procs_file),
@@ -3897,6 +4382,14 @@ static struct cftype cgroup_base_files[] = {
.write = cgroup_procs_write,
},
{
+ .name = "cgroup.threads",
+ .release = cgroup_procs_release,
+ .seq_start = cgroup_threads_start,
+ .seq_next = cgroup_procs_next,
+ .seq_show = cgroup_procs_show,
+ .write = cgroup_threads_write,
+ },
+ {
.name = "cgroup.controllers",
.seq_show = cgroup_controllers_show,
},
@@ -3912,6 +4405,20 @@ static struct cftype cgroup_base_files[] = {
.file_offset = offsetof(struct cgroup, events_file),
.seq_show = cgroup_events_show,
},
+ {
+ .name = "cgroup.max.descendants",
+ .seq_show = cgroup_max_descendants_show,
+ .write = cgroup_max_descendants_write,
+ },
+ {
+ .name = "cgroup.max.depth",
+ .seq_show = cgroup_max_depth_show,
+ .write = cgroup_max_depth_write,
+ },
+ {
+ .name = "cgroup.stat",
+ .seq_show = cgroup_stat_show,
+ },
{ } /* terminate */
};
@@ -4011,9 +4518,15 @@ static void css_release_work_fn(struct work_struct *work)
if (ss->css_released)
ss->css_released(css);
} else {
+ struct cgroup *tcgrp;
+
/* cgroup release path */
trace_cgroup_release(cgrp);
+ for (tcgrp = cgroup_parent(cgrp); tcgrp;
+ tcgrp = cgroup_parent(tcgrp))
+ tcgrp->nr_dying_descendants--;
+
cgroup_idr_remove(&cgrp->root->cgroup_idr, cgrp->id);
cgrp->id = -1;
@@ -4100,9 +4613,6 @@ static void offline_css(struct cgroup_subsys_state *css)
if (!(css->flags & CSS_ONLINE))
return;
- if (ss->css_reset)
- ss->css_reset(css);
-
if (ss->css_offline)
ss->css_offline(css);
@@ -4212,9 +4722,13 @@ static struct cgroup *cgroup_create(struct cgroup *parent)
cgrp->root = root;
cgrp->level = level;
- for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp))
+ for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp)) {
cgrp->ancestor_ids[tcgrp->level] = tcgrp->id;
+ if (tcgrp != cgrp)
+ tcgrp->nr_descendants++;
+ }
+
if (notify_on_release(parent))
set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
@@ -4255,6 +4769,29 @@ out_free_cgrp:
return ERR_PTR(ret);
}
+/*
+ * Check whether a new child cgroup may be created under @parent.
+ * @parent: cgroup the new child would be created in
+ *
+ * Walks from @parent up to the root. Creation is refused when any cgroup
+ * on that path already has nr_descendants >= max_descendants, or when the
+ * new child's depth below that cgroup (1 for @parent, 2 for its parent,
+ * ...) would exceed the cgroup's max_depth.
+ *
+ * Must be called with cgroup_mutex held (asserted via lockdep).
+ *
+ * Returns true when creation is allowed, false otherwise.
+ */
+static bool cgroup_check_hierarchy_limits(struct cgroup *parent)
+{
+ struct cgroup *cgroup;
+ int level = 1;
+
+ lockdep_assert_held(&cgroup_mutex);
+
+ for (cgroup = parent; cgroup; cgroup = cgroup_parent(cgroup)) {
+ /* this ancestor has no descendant budget left */
+ if (cgroup->nr_descendants >= cgroup->max_descendants)
+ return false;
+
+ /* the new child would sit @level levels below this ancestor */
+ if (level > cgroup->max_depth)
+ return false;
+
+ level++;
+ }
+
+ return true;
+}
+
int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode)
{
struct cgroup *parent, *cgrp;
@@ -4269,6 +4806,11 @@ int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode)
if (!parent)
return -ENODEV;
+ if (!cgroup_check_hierarchy_limits(parent)) {
+ ret = -EAGAIN;
+ goto out_unlock;
+ }
+
cgrp = cgroup_create(parent);
if (IS_ERR(cgrp)) {
ret = PTR_ERR(cgrp);
@@ -4420,6 +4962,7 @@ static void kill_css(struct cgroup_subsys_state *css)
static int cgroup_destroy_locked(struct cgroup *cgrp)
__releases(&cgroup_mutex) __acquires(&cgroup_mutex)
{
+ struct cgroup *tcgrp, *parent = cgroup_parent(cgrp);
struct cgroup_subsys_state *css;
struct cgrp_cset_link *link;
int ssid;
@@ -4464,7 +5007,15 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
*/
kernfs_remove(cgrp->kn);
- cgroup1_check_for_release(cgroup_parent(cgrp));
+ if (parent && cgroup_is_threaded(cgrp))
+ parent->nr_threaded_children--;
+
+ for (tcgrp = cgroup_parent(cgrp); tcgrp; tcgrp = cgroup_parent(tcgrp)) {
+ tcgrp->nr_descendants--;
+ tcgrp->nr_dying_descendants++;
+ }
+
+ cgroup1_check_for_release(parent);
/* put the base reference */
percpu_ref_kill(&cgrp->self.refcnt);
@@ -4659,11 +5210,17 @@ int __init cgroup_init(void)
cgrp_dfl_root.subsys_mask |= 1 << ss->id;
+ /* implicit controllers must be threaded too */
+ WARN_ON(ss->implicit_on_dfl && !ss->threaded);
+
if (ss->implicit_on_dfl)
cgrp_dfl_implicit_ss_mask |= 1 << ss->id;
else if (!ss->dfl_cftypes)
cgrp_dfl_inhibit_ss_mask |= 1 << ss->id;
+ if (ss->threaded)
+ cgrp_dfl_threaded_ss_mask |= 1 << ss->id;
+
if (ss->dfl_cftypes == ss->legacy_cftypes) {
WARN_ON(cgroup_add_cftypes(ss, ss->dfl_cftypes));
} else {
@@ -4708,6 +5265,18 @@ static int __init cgroup_wq_init(void)
}
core_initcall(cgroup_wq_init);
+/*
+ * Write the path of the kernfs node identified by @id into @buf.
+ * @id: kernfs node id to look up in the default hierarchy's kernfs root
+ * @buf: destination buffer for the path
+ * @buflen: size of @buf
+ *
+ * The node is looked up in cgrp_dfl_root.kf_root only. If no node is
+ * found the function returns without writing to @buf, so callers that
+ * need a defined result must initialize @buf beforehand.
+ */
+void cgroup_path_from_kernfs_id(const union kernfs_node_id *id,
+ char *buf, size_t buflen)
+{
+ struct kernfs_node *kn;
+
+ kn = kernfs_get_node_by_id(cgrp_dfl_root.kf_root, id);
+ if (!kn)
+ return;
+ kernfs_path(kn, buf, buflen);
+ /* presumably drops the reference taken by the lookup above - confirm */
+ kernfs_put(kn);
+}
+
/*
* proc_cgroup_show()
* - Print task's cgroup paths into seq_file, one line for each hierarchy
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 2f4039bafebb..4657e2924ecb 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -56,6 +56,7 @@
#include <linux/time64.h>
#include <linux/backing-dev.h>
#include <linux/sort.h>
+#include <linux/oom.h>
#include <linux/uaccess.h>
#include <linux/atomic.h>
@@ -300,6 +301,16 @@ static DECLARE_WORK(cpuset_hotplug_work, cpuset_hotplug_workfn);
static DECLARE_WAIT_QUEUE_HEAD(cpuset_attach_wq);
/*
+ * Cgroup v2 behavior is used when on default hierarchy or the
+ * cgroup_v2_mode flag is set.
+ */
+static inline bool is_in_v2_mode(void)
+{
+ return cgroup_subsys_on_dfl(cpuset_cgrp_subsys) ||
+ (cpuset_cgrp_subsys.root->flags & CGRP_ROOT_CPUSET_V2_MODE);
+}
+
+/*
* This is ugly, but preserves the userspace API for existing cpuset
* users. If someone tries to mount the "cpuset" filesystem, we
* silently switch it to mount "cgroup" instead
@@ -489,8 +500,7 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial)
/* On legacy hiearchy, we must be a subset of our parent cpuset. */
ret = -EACCES;
- if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
- !is_cpuset_subset(trial, par))
+ if (!is_in_v2_mode() && !is_cpuset_subset(trial, par))
goto out;
/*
@@ -869,7 +879,7 @@ static void update_tasks_cpumask(struct cpuset *cs)
struct css_task_iter it;
struct task_struct *task;
- css_task_iter_start(&cs->css, &it);
+ css_task_iter_start(&cs->css, 0, &it);
while ((task = css_task_iter_next(&it)))
set_cpus_allowed_ptr(task, cs->effective_cpus);
css_task_iter_end(&it);
@@ -903,8 +913,7 @@ static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus)
* If it becomes empty, inherit the effective mask of the
* parent, which is guaranteed to have some CPUs.
*/
- if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
- cpumask_empty(new_cpus))
+ if (is_in_v2_mode() && cpumask_empty(new_cpus))
cpumask_copy(new_cpus, parent->effective_cpus);
/* Skip the whole subtree if the cpumask remains the same. */
@@ -921,7 +930,7 @@ static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus)
cpumask_copy(cp->effective_cpus, new_cpus);
spin_unlock_irq(&callback_lock);
- WARN_ON(!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
+ WARN_ON(!is_in_v2_mode() &&
!cpumask_equal(cp->cpus_allowed, cp->effective_cpus));
update_tasks_cpumask(cp);
@@ -1099,7 +1108,7 @@ static void update_tasks_nodemask(struct cpuset *cs)
* It's ok if we rebind the same mm twice; mpol_rebind_mm()
* is idempotent. Also migrate pages in each mm to new nodes.
*/
- css_task_iter_start(&cs->css, &it);
+ css_task_iter_start(&cs->css, 0, &it);
while ((task = css_task_iter_next(&it))) {
struct mm_struct *mm;
bool migrate;
@@ -1157,8 +1166,7 @@ static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems)
* If it becomes empty, inherit the effective mask of the
* parent, which is guaranteed to have some MEMs.
*/
- if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
- nodes_empty(*new_mems))
+ if (is_in_v2_mode() && nodes_empty(*new_mems))
*new_mems = parent->effective_mems;
/* Skip the whole subtree if the nodemask remains the same. */
@@ -1175,7 +1183,7 @@ static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems)
cp->effective_mems = *new_mems;
spin_unlock_irq(&callback_lock);
- WARN_ON(!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
+ WARN_ON(!is_in_v2_mode() &&
!nodes_equal(cp->mems_allowed, cp->effective_mems));
update_tasks_nodemask(cp);
@@ -1292,7 +1300,7 @@ static void update_tasks_flags(struct cpuset *cs)
struct css_task_iter it;
struct task_struct *task;
- css_task_iter_start(&cs->css, &it);
+ css_task_iter_start(&cs->css, 0, &it);
while ((task = css_task_iter_next(&it)))
cpuset_update_task_spread_flag(cs, task);
css_task_iter_end(&it);
@@ -1467,7 +1475,7 @@ static int cpuset_can_attach(struct cgroup_taskset *tset)
/* allow moving tasks into an empty cpuset if on default hierarchy */
ret = -ENOSPC;
- if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
+ if (!is_in_v2_mode() &&
(cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)))
goto out_unlock;
@@ -1986,7 +1994,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
cpuset_inc();
spin_lock_irq(&callback_lock);
- if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) {
+ if (is_in_v2_mode()) {
cpumask_copy(cs->effective_cpus, parent->effective_cpus);
cs->effective_mems = parent->effective_mems;
}
@@ -2063,7 +2071,7 @@ static void cpuset_bind(struct cgroup_subsys_state *root_css)
mutex_lock(&cpuset_mutex);
spin_lock_irq(&callback_lock);
- if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) {
+ if (is_in_v2_mode()) {
cpumask_copy(top_cpuset.cpus_allowed, cpu_possible_mask);
top_cpuset.mems_allowed = node_possible_map;
} else {
@@ -2257,7 +2265,7 @@ retry:
cpus_updated = !cpumask_equal(&new_cpus, cs->effective_cpus);
mems_updated = !nodes_equal(new_mems, cs->effective_mems);
- if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys))
+ if (is_in_v2_mode())
hotplug_update_tasks(cs, &new_cpus, &new_mems,
cpus_updated, mems_updated);
else
@@ -2267,6 +2275,13 @@ retry:
mutex_unlock(&cpuset_mutex);
}
+/*
+ * Set by cpuset_force_rebuild(); checked and cleared in
+ * cpuset_hotplug_workfn(), which then calls rebuild_sched_domains() even
+ * when cpus_updated is false.
+ */
+static bool force_rebuild;
+
+/*
+ * Request that the next run of the cpuset hotplug work rebuilds the sched
+ * domains unconditionally. Only sets the flag; it does not schedule the
+ * work itself.
+ */
+void cpuset_force_rebuild(void)
+{
+ force_rebuild = true;
+}
+
/**
* cpuset_hotplug_workfn - handle CPU/memory hotunplug for a cpuset
*
@@ -2288,7 +2303,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
static cpumask_t new_cpus;
static nodemask_t new_mems;
bool cpus_updated, mems_updated;
- bool on_dfl = cgroup_subsys_on_dfl(cpuset_cgrp_subsys);
+ bool on_dfl = is_in_v2_mode();
mutex_lock(&cpuset_mutex);
@@ -2341,8 +2356,10 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
}
/* rebuild sched domains if cpus_allowed has changed */
- if (cpus_updated)
+ if (cpus_updated || force_rebuild) {
+ force_rebuild = false;
rebuild_sched_domains();
+ }
}
void cpuset_update_active_cpus(void)
@@ -2355,6 +2372,11 @@ void cpuset_update_active_cpus(void)
schedule_work(&cpuset_hotplug_work);
}
+/*
+ * Wait for cpuset_hotplug_work (the work item scheduled by
+ * cpuset_update_active_cpus()) to finish, by flushing it.
+ */
+void cpuset_wait_for_hotplug(void)
+{
+ flush_work(&cpuset_hotplug_work);
+}
+
/*
* Keep top_cpuset.mems_allowed tracking node_states[N_MEMORY].
* Call this routine anytime after node_states[N_MEMORY] changes.
@@ -2500,12 +2522,12 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
* If we're in interrupt, yes, we can always allocate. If @node is set in
* current's mems_allowed, yes. If it's not a __GFP_HARDWALL request and this
* node is set in the nearest hardwalled cpuset ancestor to current's cpuset,
- * yes. If current has access to memory reserves due to TIF_MEMDIE, yes.
+ * yes. If current has access to memory reserves as an oom victim, yes.
* Otherwise, no.
*
* GFP_USER allocations are marked with the __GFP_HARDWALL bit,
* and do not allow allocations outside the current tasks cpuset
- * unless the task has been OOM killed as is marked TIF_MEMDIE.
+ * unless the task has been OOM killed.
* GFP_KERNEL allocations are not so marked, so can escape to the
* nearest enclosing hardwalled ancestor cpuset.
*
@@ -2528,7 +2550,7 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
* affect that:
* in_interrupt - any node ok (current task context irrelevant)
* GFP_ATOMIC - any node ok
- * TIF_MEMDIE - any node ok
+ * tsk_is_oom_victim - any node ok
* GFP_KERNEL - any node in enclosing hardwalled cpuset ok
* GFP_USER - only nodes in current tasks mems allowed ok.
*/
@@ -2546,7 +2568,7 @@ bool __cpuset_node_allowed(int node, gfp_t gfp_mask)
* Allow tasks that have access to memory reserves because they have
* been OOM killed to get memory anywhere.
*/
- if (unlikely(test_thread_flag(TIF_MEMDIE)))
+ if (unlikely(tsk_is_oom_victim(current)))
return true;
if (gfp_mask & __GFP_HARDWALL) /* If hardwall request, stop here */
return false;
diff --git a/kernel/cgroup/debug.c b/kernel/cgroup/debug.c
index dac46af22782..f661b4cc5efd 100644
--- a/kernel/cgroup/debug.c
+++ b/kernel/cgroup/debug.c
@@ -114,27 +114,49 @@ static int cgroup_css_links_read(struct seq_file *seq, void *v)
{
struct cgroup_subsys_state *css = seq_css(seq);
struct cgrp_cset_link *link;
- int dead_cnt = 0, extra_refs = 0;
+ int dead_cnt = 0, extra_refs = 0, threaded_csets = 0;
spin_lock_irq(&css_set_lock);
+
list_for_each_entry(link, &css->cgroup->cset_links, cset_link) {
struct css_set *cset = link->cset;
struct task_struct *task;
int count = 0;
int refcnt = refcount_read(&cset->refcount);
- seq_printf(seq, " %d", refcnt);
- if (refcnt - cset->nr_tasks > 0) {
- int extra = refcnt - cset->nr_tasks;
-
- seq_printf(seq, " +%d", extra);
- /*
- * Take out the one additional reference in
- * init_css_set.
- */
- if (cset == &init_css_set)
- extra--;
- extra_refs += extra;
+ /*
+ * Print out the proc_cset and threaded_cset relationship
+ * and highlight difference between refcount and task_count.
+ */
+ seq_printf(seq, "css_set %pK", cset);
+ if (rcu_dereference_protected(cset->dom_cset, 1) != cset) {
+ threaded_csets++;
+ seq_printf(seq, "=>%pK", cset->dom_cset);
+ }
+ if (!list_empty(&cset->threaded_csets)) {
+ struct css_set *tcset;
+ int idx = 0;
+
+ list_for_each_entry(tcset, &cset->threaded_csets,
+ threaded_csets_node) {
+ seq_puts(seq, idx ? "," : "<=");
+ seq_printf(seq, "%pK", tcset);
+ idx++;
+ }
+ } else {
+ seq_printf(seq, " %d", refcnt);
+ if (refcnt - cset->nr_tasks > 0) {
+ int extra = refcnt - cset->nr_tasks;
+
+ seq_printf(seq, " +%d", extra);
+ /*
+ * Take out the one additional reference in
+ * init_css_set.
+ */
+ if (cset == &init_css_set)
+ extra--;
+ extra_refs += extra;
+ }
}
seq_puts(seq, "\n");
@@ -163,10 +185,12 @@ static int cgroup_css_links_read(struct seq_file *seq, void *v)
}
spin_unlock_irq(&css_set_lock);
- if (!dead_cnt && !extra_refs)
+ if (!dead_cnt && !extra_refs && !threaded_csets)
return 0;
seq_puts(seq, "\n");
+ if (threaded_csets)
+ seq_printf(seq, "threaded css_sets = %d\n", threaded_csets);
if (extra_refs)
seq_printf(seq, "extra references = %d\n", extra_refs);
if (dead_cnt)
@@ -352,6 +376,7 @@ static int __init enable_cgroup_debug(char *str)
{
debug_cgrp_subsys.dfl_cftypes = debug_files;
debug_cgrp_subsys.implicit_on_dfl = true;
+ debug_cgrp_subsys.threaded = true;
return 1;
}
__setup("cgroup_debug", enable_cgroup_debug);
diff --git a/kernel/cgroup/freezer.c b/kernel/cgroup/freezer.c
index 1b72d56edce5..08236798d173 100644
--- a/kernel/cgroup/freezer.c
+++ b/kernel/cgroup/freezer.c
@@ -268,7 +268,7 @@ static void update_if_frozen(struct cgroup_subsys_state *css)
rcu_read_unlock();
/* are all tasks frozen? */
- css_task_iter_start(css, &it);
+ css_task_iter_start(css, 0, &it);
while ((task = css_task_iter_next(&it))) {
if (freezing(task)) {
@@ -320,7 +320,7 @@ static void freeze_cgroup(struct freezer *freezer)
struct css_task_iter it;
struct task_struct *task;
- css_task_iter_start(&freezer->css, &it);
+ css_task_iter_start(&freezer->css, 0, &it);
while ((task = css_task_iter_next(&it)))
freeze_task(task);
css_task_iter_end(&it);
@@ -331,7 +331,7 @@ static void unfreeze_cgroup(struct freezer *freezer)
struct css_task_iter it;
struct task_struct *task;
- css_task_iter_start(&freezer->css, &it);
+ css_task_iter_start(&freezer->css, 0, &it);
while ((task = css_task_iter_next(&it)))
__thaw_task(task);
css_task_iter_end(&it);
diff --git a/kernel/cgroup/pids.c b/kernel/cgroup/pids.c
index 2237201d66d5..9829c67ebc0a 100644
--- a/kernel/cgroup/pids.c
+++ b/kernel/cgroup/pids.c
@@ -345,4 +345,5 @@ struct cgroup_subsys pids_cgrp_subsys = {
.free = pids_free,
.legacy_cftypes = pids_files,
.dfl_cftypes = pids_files,
+ .threaded = true,
};
diff --git a/kernel/compat.c b/kernel/compat.c
index 6f0a0e723a06..772e038d04d9 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -200,29 +200,6 @@ int compat_put_timespec(const struct timespec *ts, void __user *uts)
}
EXPORT_SYMBOL_GPL(compat_put_timespec);
-int compat_convert_timespec(struct timespec __user **kts,
- const void __user *cts)
-{
- struct timespec ts;
- struct timespec __user *uts;
-
- if (!cts || COMPAT_USE_64BIT_TIME) {
- *kts = (struct timespec __user *)cts;
- return 0;
- }
-
- uts = compat_alloc_user_space(sizeof(ts));
- if (!uts)
- return -EFAULT;
- if (compat_get_timespec(&ts, cts))
- return -EFAULT;
- if (copy_to_user(uts, &ts, sizeof(ts)))
- return -EFAULT;
-
- *kts = uts;
- return 0;
-}
-
int get_compat_itimerval(struct itimerval *o, const struct compat_itimerval __user *i)
{
struct compat_itimerval v32;
diff --git a/kernel/cpu.c b/kernel/cpu.c
index acf5308fad51..8de11a29e495 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -46,11 +46,13 @@
* @bringup: Single callback bringup or teardown selector
* @cb_state: The state for a single callback (install/uninstall)
* @result: Result of the operation
- * @done: Signal completion to the issuer of the task
+ * @done_up: Signal completion to the issuer of the task for cpu-up
+ * @done_down: Signal completion to the issuer of the task for cpu-down
*/
struct cpuhp_cpu_state {
enum cpuhp_state state;
enum cpuhp_state target;
+ enum cpuhp_state fail;
#ifdef CONFIG_SMP
struct task_struct *thread;
bool should_run;
@@ -58,18 +60,39 @@ struct cpuhp_cpu_state {
bool single;
bool bringup;
struct hlist_node *node;
+ struct hlist_node *last;
enum cpuhp_state cb_state;
int result;
- struct completion done;
+ struct completion done_up;
+ struct completion done_down;
#endif
};
-static DEFINE_PER_CPU(struct cpuhp_cpu_state, cpuhp_state);
+static DEFINE_PER_CPU(struct cpuhp_cpu_state, cpuhp_state) = {
+ .fail = CPUHP_INVALID,
+};
#if defined(CONFIG_LOCKDEP) && defined(CONFIG_SMP)
-static struct lock_class_key cpuhp_state_key;
-static struct lockdep_map cpuhp_state_lock_map =
- STATIC_LOCKDEP_MAP_INIT("cpuhp_state", &cpuhp_state_key);
+static struct lockdep_map cpuhp_state_up_map =
+ STATIC_LOCKDEP_MAP_INIT("cpuhp_state-up", &cpuhp_state_up_map);
+static struct lockdep_map cpuhp_state_down_map =
+ STATIC_LOCKDEP_MAP_INIT("cpuhp_state-down", &cpuhp_state_down_map);
+
+
+/*
+ * Lockdep annotations for the hotplug state machine. Bringup and teardown
+ * use separate lockdep maps; @bringup selects cpuhp_state_up_map when true
+ * and cpuhp_state_down_map when false.
+ */
+static inline void cpuhp_lock_acquire(bool bringup)
+{
+ lock_map_acquire(bringup ? &cpuhp_state_up_map : &cpuhp_state_down_map);
+}
+
+static inline void cpuhp_lock_release(bool bringup)
+{
+ lock_map_release(bringup ? &cpuhp_state_up_map : &cpuhp_state_down_map);
+}
+#else
+
+/* Without CONFIG_LOCKDEP && CONFIG_SMP the annotations compile to nothing */
+static inline void cpuhp_lock_acquire(bool bringup) { }
+static inline void cpuhp_lock_release(bool bringup) { }
+
#endif
/**
@@ -123,13 +146,16 @@ static struct cpuhp_step *cpuhp_get_step(enum cpuhp_state state)
/**
* cpuhp_invoke_callback _ Invoke the callbacks for a given state
* @cpu: The cpu for which the callback should be invoked
- * @step: The step in the state machine
+ * @state: The state to do callbacks for
* @bringup: True if the bringup callback should be invoked
+ * @node: For multi-instance, do a single entry callback for install/remove
+ * @lastp: For multi-instance rollback, remember how far we got
*
* Called from cpu hotplug and from the state register machinery.
*/
static int cpuhp_invoke_callback(unsigned int cpu, enum cpuhp_state state,
- bool bringup, struct hlist_node *node)
+ bool bringup, struct hlist_node *node,
+ struct hlist_node **lastp)
{
struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
struct cpuhp_step *step = cpuhp_get_step(state);
@@ -137,7 +163,17 @@ static int cpuhp_invoke_callback(unsigned int cpu, enum cpuhp_state state,
int (*cb)(unsigned int cpu);
int ret, cnt;
+ if (st->fail == state) {
+ st->fail = CPUHP_INVALID;
+
+ if (!(bringup ? step->startup.single : step->teardown.single))
+ return 0;
+
+ return -EAGAIN;
+ }
+
if (!step->multi_instance) {
+ WARN_ON_ONCE(lastp && *lastp);
cb = bringup ? step->startup.single : step->teardown.single;
if (!cb)
return 0;
@@ -152,6 +188,7 @@ static int cpuhp_invoke_callback(unsigned int cpu, enum cpuhp_state state,
/* Single invocation for instance add/remove */
if (node) {
+ WARN_ON_ONCE(lastp && *lastp);
trace_cpuhp_multi_enter(cpu, st->target, state, cbm, node);
ret = cbm(cpu, node);
trace_cpuhp_exit(cpu, st->state, state, ret);
@@ -161,13 +198,23 @@ static int cpuhp_invoke_callback(unsigned int cpu, enum cpuhp_state state,
/* State transition. Invoke on all instances */
cnt = 0;
hlist_for_each(node, &step->list) {
+ if (lastp && node == *lastp)
+ break;
+
trace_cpuhp_multi_enter(cpu, st->target, state, cbm, node);
ret = cbm(cpu, node);
trace_cpuhp_exit(cpu, st->state, state, ret);
- if (ret)
- goto err;
+ if (ret) {
+ if (!lastp)
+ goto err;
+
+ *lastp = node;
+ return ret;
+ }
cnt++;
}
+ if (lastp)
+ *lastp = NULL;
return 0;
err:
/* Rollback the instances if one failed */
@@ -178,12 +225,39 @@ err:
hlist_for_each(node, &step->list) {
if (!cnt--)
break;
- cbm(cpu, node);
+
+ trace_cpuhp_multi_enter(cpu, st->target, state, cbm, node);
+ ret = cbm(cpu, node);
+ trace_cpuhp_exit(cpu, st->state, state, ret);
+ /*
+ * Rollback must not fail,
+ */
+ WARN_ON_ONCE(ret);
}
return ret;
}
#ifdef CONFIG_SMP
+/*
+ * Block until the completion matching the requested direction is
+ * signalled: st->done_up when @bringup is true, st->done_down otherwise.
+ */
+static inline void wait_for_ap_thread(struct cpuhp_cpu_state *st, bool bringup)
+{
+ struct completion *done = bringup ? &st->done_up : &st->done_down;
+ wait_for_completion(done);
+}
+
+/*
+ * Signal the completion matching the given direction: st->done_up when
+ * @bringup is true, st->done_down otherwise. Counterpart of
+ * wait_for_ap_thread().
+ */
+static inline void complete_ap_thread(struct cpuhp_cpu_state *st, bool bringup)
+{
+ struct completion *done = bringup ? &st->done_up : &st->done_down;
+ complete(done);
+}
+
+/*
+ * The former STARTING/DYING states: they run with IRQs disabled and must
+ * not fail.
+ *
+ * Returns true when @state lies in [CPUHP_AP_IDLE_DEAD, CPUHP_AP_ONLINE),
+ * i.e. CPUHP_AP_IDLE_DEAD itself is included and CPUHP_AP_ONLINE is not.
+ */
+static bool cpuhp_is_atomic_state(enum cpuhp_state state)
+{
+ return CPUHP_AP_IDLE_DEAD <= state && state < CPUHP_AP_ONLINE;
+}
+
/* Serializes the updates to cpu_online_mask, cpu_present_mask */
static DEFINE_MUTEX(cpu_add_remove_lock);
bool cpuhp_tasks_frozen;
@@ -271,14 +345,79 @@ void cpu_hotplug_enable(void)
EXPORT_SYMBOL_GPL(cpu_hotplug_enable);
#endif /* CONFIG_HOTPLUG_CPU */
-static void __cpuhp_kick_ap_work(struct cpuhp_cpu_state *st);
+/*
+ * Prepare @st for a regular (non-single, non-rollback) transition to
+ * @target: clears ->rollback, ->last and ->single, stores @target and
+ * derives the direction (->bringup is true when the current state is
+ * below @target).
+ *
+ * Returns the state @st was in before the call, for use with
+ * cpuhp_reset_state() if the transition fails.
+ */
+static inline enum cpuhp_state
+cpuhp_set_state(struct cpuhp_cpu_state *st, enum cpuhp_state target)
+{
+ enum cpuhp_state prev_state = st->state;
+
+ st->rollback = false;
+ st->last = NULL;
+
+ st->target = target;
+ st->single = false;
+ st->bringup = st->state < target;
+
+ return prev_state;
+}
+
+/*
+ * Turn a failed transition around: mark @st as rolling back, retarget it
+ * at @prev_state (the value returned by cpuhp_set_state()) and invert the
+ * direction.
+ */
+static inline void
+cpuhp_reset_state(struct cpuhp_cpu_state *st, enum cpuhp_state prev_state)
+{
+ st->rollback = true;
+
+ /*
+ * If we have st->last we need to undo partial multi_instance of this
+ * state first. Otherwise start undo at the previous state.
+ */
+ if (!st->last) {
+ /* step back one state against the direction that just failed */
+ if (st->bringup)
+ st->state--;
+ else
+ st->state++;
+ }
+
+ st->target = prev_state;
+ st->bringup = !st->bringup;
+}
+
+/*
+ * Regular hotplug invocation of the AP hotplug thread
+ *
+ * Clears st->result, sets st->should_run, wakes st->thread and then blocks
+ * on the completion for the current direction (st->bringup). Callers read
+ * the outcome from st->result afterwards.
+ */
+static void __cpuhp_kick_ap(struct cpuhp_cpu_state *st)
+{
+ /* nothing to do: not a single-callback request and already at target */
+ if (!st->single && st->state == st->target)
+ return;
+
+ st->result = 0;
+ /*
+ * Make sure the above stores are visible before should_run becomes
+ * true. Paired with the mb() above in cpuhp_thread_fun()
+ */
+ smp_mb();
+ st->should_run = true;
+ wake_up_process(st->thread);
+ wait_for_ap_thread(st, st->bringup);
+}
+
+/*
+ * Drive the AP hotplug thread of @st towards @target and wait for it.
+ *
+ * If the transition reports a non-zero st->result, the state is reset via
+ * cpuhp_reset_state() and the thread is kicked a second time to roll back
+ * to the state it started from.
+ *
+ * Returns 0 on success, or the result of the failed first pass; the
+ * result of the rollback pass is not returned.
+ */
+static int cpuhp_kick_ap(struct cpuhp_cpu_state *st, enum cpuhp_state target)
+{
+ enum cpuhp_state prev_state;
+ int ret;
+
+ prev_state = cpuhp_set_state(st, target);
+ __cpuhp_kick_ap(st);
+ if ((ret = st->result)) {
+ cpuhp_reset_state(st, prev_state);
+ __cpuhp_kick_ap(st);
+ }
+
+ return ret;
+}
static int bringup_wait_for_ap(unsigned int cpu)
{
struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
/* Wait for the CPU to reach CPUHP_AP_ONLINE_IDLE */
- wait_for_completion(&st->done);
+ wait_for_ap_thread(st, true);
if (WARN_ON_ONCE((!cpu_online(cpu))))
return -ECANCELED;
@@ -286,12 +425,10 @@ static int bringup_wait_for_ap(unsigned int cpu)
stop_machine_unpark(cpu);
kthread_unpark(st->thread);
- /* Should we go further up ? */
- if (st->target > CPUHP_AP_ONLINE_IDLE) {
- __cpuhp_kick_ap_work(st);
- wait_for_completion(&st->done);
- }
- return st->result;
+ if (st->target <= CPUHP_AP_ONLINE_IDLE)
+ return 0;
+
+ return cpuhp_kick_ap(st, st->target);
}
static int bringup_cpu(unsigned int cpu)
@@ -317,32 +454,6 @@ static int bringup_cpu(unsigned int cpu)
/*
* Hotplug state machine related functions
*/
-static void undo_cpu_down(unsigned int cpu, struct cpuhp_cpu_state *st)
-{
- for (st->state++; st->state < st->target; st->state++) {
- struct cpuhp_step *step = cpuhp_get_step(st->state);
-
- if (!step->skip_onerr)
- cpuhp_invoke_callback(cpu, st->state, true, NULL);
- }
-}
-
-static int cpuhp_down_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st,
- enum cpuhp_state target)
-{
- enum cpuhp_state prev_state = st->state;
- int ret = 0;
-
- for (; st->state > target; st->state--) {
- ret = cpuhp_invoke_callback(cpu, st->state, false, NULL);
- if (ret) {
- st->target = prev_state;
- undo_cpu_down(cpu, st);
- break;
- }
- }
- return ret;
-}
static void undo_cpu_up(unsigned int cpu, struct cpuhp_cpu_state *st)
{
@@ -350,7 +461,7 @@ static void undo_cpu_up(unsigned int cpu, struct cpuhp_cpu_state *st)
struct cpuhp_step *step = cpuhp_get_step(st->state);
if (!step->skip_onerr)
- cpuhp_invoke_callback(cpu, st->state, false, NULL);
+ cpuhp_invoke_callback(cpu, st->state, false, NULL, NULL);
}
}
@@ -362,7 +473,7 @@ static int cpuhp_up_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st,
while (st->state < target) {
st->state++;
- ret = cpuhp_invoke_callback(cpu, st->state, true, NULL);
+ ret = cpuhp_invoke_callback(cpu, st->state, true, NULL, NULL);
if (ret) {
st->target = prev_state;
undo_cpu_up(cpu, st);
@@ -379,7 +490,8 @@ static void cpuhp_create(unsigned int cpu)
{
struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
- init_completion(&st->done);
+ init_completion(&st->done_up);
+ init_completion(&st->done_down);
}
static int cpuhp_should_run(unsigned int cpu)
@@ -389,69 +501,90 @@ static int cpuhp_should_run(unsigned int cpu)
return st->should_run;
}
-/* Execute the teardown callbacks. Used to be CPU_DOWN_PREPARE */
-static int cpuhp_ap_offline(unsigned int cpu, struct cpuhp_cpu_state *st)
-{
- enum cpuhp_state target = max((int)st->target, CPUHP_TEARDOWN_CPU);
-
- return cpuhp_down_callbacks(cpu, st, target);
-}
-
-/* Execute the online startup callbacks. Used to be CPU_ONLINE */
-static int cpuhp_ap_online(unsigned int cpu, struct cpuhp_cpu_state *st)
-{
- return cpuhp_up_callbacks(cpu, st, st->target);
-}
-
/*
* Execute teardown/startup callbacks on the plugged cpu. Also used to invoke
* callbacks when a state gets [un]installed at runtime.
+ *
+ * Each invocation of this function by the smpboot thread does a single AP
+ * state callback.
+ *
+ * It has 3 modes of operation:
+ * - single: runs st->cb_state
+ * - up: runs ++st->state, while st->state < st->target
+ * - down: runs st->state--, while st->state > st->target
+ *
+ * When complete or on error, should_run is cleared and the completion is fired.
*/
static void cpuhp_thread_fun(unsigned int cpu)
{
struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state);
- int ret = 0;
+ bool bringup = st->bringup;
+ enum cpuhp_state state;
/*
- * Paired with the mb() in cpuhp_kick_ap_work and
- * cpuhp_invoke_ap_callback, so the work set is consistent visible.
+ * ACQUIRE for the cpuhp_should_run() load of ->should_run. Ensures
+ * that if we see ->should_run we also see the rest of the state.
*/
smp_mb();
- if (!st->should_run)
+
+ if (WARN_ON_ONCE(!st->should_run))
return;
- st->should_run = false;
+ cpuhp_lock_acquire(bringup);
- lock_map_acquire(&cpuhp_state_lock_map);
- /* Single callback invocation for [un]install ? */
if (st->single) {
- if (st->cb_state < CPUHP_AP_ONLINE) {
- local_irq_disable();
- ret = cpuhp_invoke_callback(cpu, st->cb_state,
- st->bringup, st->node);
- local_irq_enable();
+ state = st->cb_state;
+ st->should_run = false;
+ } else {
+ if (bringup) {
+ st->state++;
+ state = st->state;
+ st->should_run = (st->state < st->target);
+ WARN_ON_ONCE(st->state > st->target);
} else {
- ret = cpuhp_invoke_callback(cpu, st->cb_state,
- st->bringup, st->node);
+ state = st->state;
+ st->state--;
+ st->should_run = (st->state > st->target);
+ WARN_ON_ONCE(st->state < st->target);
}
- } else if (st->rollback) {
- BUG_ON(st->state < CPUHP_AP_ONLINE_IDLE);
+ }
+
+ WARN_ON_ONCE(!cpuhp_is_ap_state(state));
- undo_cpu_down(cpu, st);
- st->rollback = false;
+ if (st->rollback) {
+ struct cpuhp_step *step = cpuhp_get_step(state);
+ if (step->skip_onerr)
+ goto next;
+ }
+
+ if (cpuhp_is_atomic_state(state)) {
+ local_irq_disable();
+ st->result = cpuhp_invoke_callback(cpu, state, bringup, st->node, &st->last);
+ local_irq_enable();
+
+ /*
+ * STARTING/DYING must not fail!
+ */
+ WARN_ON_ONCE(st->result);
} else {
- /* Cannot happen .... */
- BUG_ON(st->state < CPUHP_AP_ONLINE_IDLE);
-
- /* Regular hotplug work */
- if (st->state < st->target)
- ret = cpuhp_ap_online(cpu, st);
- else if (st->state > st->target)
- ret = cpuhp_ap_offline(cpu, st);
+ st->result = cpuhp_invoke_callback(cpu, state, bringup, st->node, &st->last);
+ }
+
+ if (st->result) {
+ /*
+ * If we fail on a rollback, we're up a creek without no
+ * paddle, no way forward, no way back. We lose, thanks for
+ * playing.
+ */
+ WARN_ON_ONCE(st->rollback);
+ st->should_run = false;
}
- lock_map_release(&cpuhp_state_lock_map);
- st->result = ret;
- complete(&st->done);
+
+next:
+ cpuhp_lock_release(bringup);
+
+ if (!st->should_run)
+ complete_ap_thread(st, bringup);
}
/* Invoke a single callback on a remote cpu */
@@ -460,62 +593,64 @@ cpuhp_invoke_ap_callback(int cpu, enum cpuhp_state state, bool bringup,
struct hlist_node *node)
{
struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
+ int ret;
if (!cpu_online(cpu))
return 0;
- lock_map_acquire(&cpuhp_state_lock_map);
- lock_map_release(&cpuhp_state_lock_map);
+ cpuhp_lock_acquire(false);
+ cpuhp_lock_release(false);
+
+ cpuhp_lock_acquire(true);
+ cpuhp_lock_release(true);
/*
* If we are up and running, use the hotplug thread. For early calls
* we invoke the thread function directly.
*/
if (!st->thread)
- return cpuhp_invoke_callback(cpu, state, bringup, node);
+ return cpuhp_invoke_callback(cpu, state, bringup, node, NULL);
+ st->rollback = false;
+ st->last = NULL;
+
+ st->node = node;
+ st->bringup = bringup;
st->cb_state = state;
st->single = true;
- st->bringup = bringup;
- st->node = node;
- /*
- * Make sure the above stores are visible before should_run becomes
- * true. Paired with the mb() above in cpuhp_thread_fun()
- */
- smp_mb();
- st->should_run = true;
- wake_up_process(st->thread);
- wait_for_completion(&st->done);
- return st->result;
-}
+ __cpuhp_kick_ap(st);
-/* Regular hotplug invocation of the AP hotplug thread */
-static void __cpuhp_kick_ap_work(struct cpuhp_cpu_state *st)
-{
- st->result = 0;
- st->single = false;
/*
- * Make sure the above stores are visible before should_run becomes
- * true. Paired with the mb() above in cpuhp_thread_fun()
+ * If we failed and did a partial, do a rollback.
*/
- smp_mb();
- st->should_run = true;
- wake_up_process(st->thread);
+ if ((ret = st->result) && st->last) {
+ st->rollback = true;
+ st->bringup = !bringup;
+
+ __cpuhp_kick_ap(st);
+ }
+
+ return ret;
}
static int cpuhp_kick_ap_work(unsigned int cpu)
{
struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
- enum cpuhp_state state = st->state;
+ enum cpuhp_state prev_state = st->state;
+ int ret;
+
+ cpuhp_lock_acquire(false);
+ cpuhp_lock_release(false);
+
+ cpuhp_lock_acquire(true);
+ cpuhp_lock_release(true);
+
+ trace_cpuhp_enter(cpu, st->target, prev_state, cpuhp_kick_ap_work);
+ ret = cpuhp_kick_ap(st, st->target);
+ trace_cpuhp_exit(cpu, st->state, prev_state, ret);
- trace_cpuhp_enter(cpu, st->target, state, cpuhp_kick_ap_work);
- lock_map_acquire(&cpuhp_state_lock_map);
- lock_map_release(&cpuhp_state_lock_map);
- __cpuhp_kick_ap_work(st);
- wait_for_completion(&st->done);
- trace_cpuhp_exit(cpu, st->state, state, st->result);
- return st->result;
+ return ret;
}
static struct smp_hotplug_thread cpuhp_threads = {
@@ -581,6 +716,7 @@ static int take_cpu_down(void *_param)
struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state);
enum cpuhp_state target = max((int)st->target, CPUHP_AP_OFFLINE);
int err, cpu = smp_processor_id();
+ int ret;
/* Ensure this CPU doesn't handle any more interrupts. */
err = __cpu_disable();
@@ -594,8 +730,13 @@ static int take_cpu_down(void *_param)
WARN_ON(st->state != CPUHP_TEARDOWN_CPU);
st->state--;
/* Invoke the former CPU_DYING callbacks */
- for (; st->state > target; st->state--)
- cpuhp_invoke_callback(cpu, st->state, false, NULL);
+ for (; st->state > target; st->state--) {
+ ret = cpuhp_invoke_callback(cpu, st->state, false, NULL, NULL);
+ /*
+ * DYING must not fail!
+ */
+ WARN_ON_ONCE(ret);
+ }
/* Give up timekeeping duties */
tick_handover_do_timer();
@@ -639,7 +780,7 @@ static int takedown_cpu(unsigned int cpu)
*
* Wait for the stop thread to go away.
*/
- wait_for_completion(&st->done);
+ wait_for_ap_thread(st, false);
BUG_ON(st->state != CPUHP_AP_IDLE_DEAD);
/* Interrupts are moved away from the dying cpu, reenable alloc/free */
@@ -658,7 +799,7 @@ static void cpuhp_complete_idle_dead(void *arg)
{
struct cpuhp_cpu_state *st = arg;
- complete(&st->done);
+ complete_ap_thread(st, false);
}
void cpuhp_report_idle_dead(void)
@@ -676,11 +817,32 @@ void cpuhp_report_idle_dead(void)
cpuhp_complete_idle_dead, st, 0);
}
-#else
-#define takedown_cpu NULL
-#endif
+static void undo_cpu_down(unsigned int cpu, struct cpuhp_cpu_state *st)
+{
+ for (st->state++; st->state < st->target; st->state++) {
+ struct cpuhp_step *step = cpuhp_get_step(st->state);
-#ifdef CONFIG_HOTPLUG_CPU
+ if (!step->skip_onerr)
+ cpuhp_invoke_callback(cpu, st->state, true, NULL, NULL);
+ }
+}
+
+static int cpuhp_down_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st,
+ enum cpuhp_state target)
+{
+ enum cpuhp_state prev_state = st->state;
+ int ret = 0;
+
+ for (; st->state > target; st->state--) {
+ ret = cpuhp_invoke_callback(cpu, st->state, false, NULL, NULL);
+ if (ret) {
+ st->target = prev_state;
+ undo_cpu_down(cpu, st);
+ break;
+ }
+ }
+ return ret;
+}
/* Requires cpu_add_remove_lock to be held */
static int __ref _cpu_down(unsigned int cpu, int tasks_frozen,
@@ -699,13 +861,13 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen,
cpuhp_tasks_frozen = tasks_frozen;
- prev_state = st->state;
- st->target = target;
+ prev_state = cpuhp_set_state(st, target);
/*
* If the current CPU state is in the range of the AP hotplug thread,
* then we need to kick the thread.
*/
if (st->state > CPUHP_TEARDOWN_CPU) {
+ st->target = max((int)target, CPUHP_TEARDOWN_CPU);
ret = cpuhp_kick_ap_work(cpu);
/*
* The AP side has done the error rollback already. Just
@@ -720,6 +882,8 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen,
*/
if (st->state > CPUHP_TEARDOWN_CPU)
goto out;
+
+ st->target = target;
}
/*
* The AP brought itself down to CPUHP_TEARDOWN_CPU. So we need
@@ -727,9 +891,8 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen,
*/
ret = cpuhp_down_callbacks(cpu, st, target);
if (ret && st->state > CPUHP_TEARDOWN_CPU && st->state < prev_state) {
- st->target = prev_state;
- st->rollback = true;
- cpuhp_kick_ap_work(cpu);
+ cpuhp_reset_state(st, prev_state);
+ __cpuhp_kick_ap(st);
}
out:
@@ -754,11 +917,15 @@ out:
cpu_maps_update_done();
return err;
}
+
int cpu_down(unsigned int cpu)
{
return do_cpu_down(cpu, CPUHP_OFFLINE);
}
EXPORT_SYMBOL(cpu_down);
+
+#else
+#define takedown_cpu NULL
#endif /*CONFIG_HOTPLUG_CPU*/
/**
@@ -772,11 +939,16 @@ void notify_cpu_starting(unsigned int cpu)
{
struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
enum cpuhp_state target = min((int)st->target, CPUHP_AP_ONLINE);
+ int ret;
rcu_cpu_starting(cpu); /* Enables RCU usage on this CPU. */
while (st->state < target) {
st->state++;
- cpuhp_invoke_callback(cpu, st->state, true, NULL);
+ ret = cpuhp_invoke_callback(cpu, st->state, true, NULL, NULL);
+ /*
+ * STARTING must not fail!
+ */
+ WARN_ON_ONCE(ret);
}
}
@@ -794,7 +966,7 @@ void cpuhp_online_idle(enum cpuhp_state state)
return;
st->state = CPUHP_AP_ONLINE_IDLE;
- complete(&st->done);
+ complete_ap_thread(st, true);
}
/* Requires cpu_add_remove_lock to be held */
@@ -829,7 +1001,7 @@ static int _cpu_up(unsigned int cpu, int tasks_frozen, enum cpuhp_state target)
cpuhp_tasks_frozen = tasks_frozen;
- st->target = target;
+ cpuhp_set_state(st, target);
/*
* If the current CPU state is in the range of the AP hotplug thread,
* then we need to kick the thread once more.
@@ -1296,6 +1468,10 @@ static int cpuhp_issue_call(int cpu, enum cpuhp_state state, bool bringup,
struct cpuhp_step *sp = cpuhp_get_step(state);
int ret;
+ /*
+ * If there's nothing to do, we're done.
+ * Relies on the union for multi_instance.
+ */
if ((bringup && !sp->startup.single) ||
(!bringup && !sp->teardown.single))
return 0;
@@ -1307,9 +1483,9 @@ static int cpuhp_issue_call(int cpu, enum cpuhp_state state, bool bringup,
if (cpuhp_is_ap_state(state))
ret = cpuhp_invoke_ap_callback(cpu, state, bringup, node);
else
- ret = cpuhp_invoke_callback(cpu, state, bringup, node);
+ ret = cpuhp_invoke_callback(cpu, state, bringup, node, NULL);
#else
- ret = cpuhp_invoke_callback(cpu, state, bringup, node);
+ ret = cpuhp_invoke_callback(cpu, state, bringup, node, NULL);
#endif
BUG_ON(ret && !bringup);
return ret;
@@ -1641,9 +1817,55 @@ static ssize_t show_cpuhp_target(struct device *dev,
}
static DEVICE_ATTR(target, 0644, show_cpuhp_target, write_cpuhp_target);
+
+static ssize_t write_cpuhp_fail(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, dev->id);
+ struct cpuhp_step *sp;
+ int fail, ret;
+
+ ret = kstrtoint(buf, 10, &fail);
+ if (ret)
+ return ret;
+
+ /*
+ * Cannot fail STARTING/DYING callbacks.
+ */
+ if (cpuhp_is_atomic_state(fail))
+ return -EINVAL;
+
+ /*
+ * Cannot fail anything that doesn't have callbacks.
+ */
+ mutex_lock(&cpuhp_state_mutex);
+ sp = cpuhp_get_step(fail);
+ if (!sp->startup.single && !sp->teardown.single)
+ ret = -EINVAL;
+ mutex_unlock(&cpuhp_state_mutex);
+ if (ret)
+ return ret;
+
+ st->fail = fail;
+
+ return count;
+}
+
+static ssize_t show_cpuhp_fail(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, dev->id);
+
+ return sprintf(buf, "%d\n", st->fail);
+}
+
+static DEVICE_ATTR(fail, 0644, show_cpuhp_fail, write_cpuhp_fail);
+
static struct attribute *cpuhp_cpu_attrs[] = {
&dev_attr_state.attr,
&dev_attr_target.attr,
+ &dev_attr_fail.attr,
NULL
};
diff --git a/kernel/cpu_pm.c b/kernel/cpu_pm.c
index 009cc9a17d95..67b02e138a47 100644
--- a/kernel/cpu_pm.c
+++ b/kernel/cpu_pm.c
@@ -22,15 +22,21 @@
#include <linux/spinlock.h>
#include <linux/syscore_ops.h>
-static DEFINE_RWLOCK(cpu_pm_notifier_lock);
-static RAW_NOTIFIER_HEAD(cpu_pm_notifier_chain);
+static ATOMIC_NOTIFIER_HEAD(cpu_pm_notifier_chain);
static int cpu_pm_notify(enum cpu_pm_event event, int nr_to_call, int *nr_calls)
{
int ret;
- ret = __raw_notifier_call_chain(&cpu_pm_notifier_chain, event, NULL,
+ /*
+ * __atomic_notifier_call_chain has a RCU read critical section, which
+ * could be dysfunctional in cpu idle. Copy RCU_NONIDLE code to let
+ * RCU know this.
+ */
+ rcu_irq_enter_irqson();
+ ret = __atomic_notifier_call_chain(&cpu_pm_notifier_chain, event, NULL,
nr_to_call, nr_calls);
+ rcu_irq_exit_irqson();
return notifier_to_errno(ret);
}
@@ -47,14 +53,7 @@ static int cpu_pm_notify(enum cpu_pm_event event, int nr_to_call, int *nr_calls)
*/
int cpu_pm_register_notifier(struct notifier_block *nb)
{
- unsigned long flags;
- int ret;
-
- write_lock_irqsave(&cpu_pm_notifier_lock, flags);
- ret = raw_notifier_chain_register(&cpu_pm_notifier_chain, nb);
- write_unlock_irqrestore(&cpu_pm_notifier_lock, flags);
-
- return ret;
+ return atomic_notifier_chain_register(&cpu_pm_notifier_chain, nb);
}
EXPORT_SYMBOL_GPL(cpu_pm_register_notifier);
@@ -69,14 +68,7 @@ EXPORT_SYMBOL_GPL(cpu_pm_register_notifier);
*/
int cpu_pm_unregister_notifier(struct notifier_block *nb)
{
- unsigned long flags;
- int ret;
-
- write_lock_irqsave(&cpu_pm_notifier_lock, flags);
- ret = raw_notifier_chain_unregister(&cpu_pm_notifier_chain, nb);
- write_unlock_irqrestore(&cpu_pm_notifier_lock, flags);
-
- return ret;
+ return atomic_notifier_chain_unregister(&cpu_pm_notifier_chain, nb);
}
EXPORT_SYMBOL_GPL(cpu_pm_unregister_notifier);
@@ -100,7 +92,6 @@ int cpu_pm_enter(void)
int nr_calls;
int ret = 0;
- read_lock(&cpu_pm_notifier_lock);
ret = cpu_pm_notify(CPU_PM_ENTER, -1, &nr_calls);
if (ret)
/*
@@ -108,7 +99,6 @@ int cpu_pm_enter(void)
* PM entry who are notified earlier to prepare for it.
*/
cpu_pm_notify(CPU_PM_ENTER_FAILED, nr_calls - 1, NULL);
- read_unlock(&cpu_pm_notifier_lock);
return ret;
}
@@ -128,13 +118,7 @@ EXPORT_SYMBOL_GPL(cpu_pm_enter);
*/
int cpu_pm_exit(void)
{
- int ret;
-
- read_lock(&cpu_pm_notifier_lock);
- ret = cpu_pm_notify(CPU_PM_EXIT, -1, NULL);
- read_unlock(&cpu_pm_notifier_lock);
-
- return ret;
+ return cpu_pm_notify(CPU_PM_EXIT, -1, NULL);
}
EXPORT_SYMBOL_GPL(cpu_pm_exit);
@@ -159,7 +143,6 @@ int cpu_cluster_pm_enter(void)
int nr_calls;
int ret = 0;
- read_lock(&cpu_pm_notifier_lock);
ret = cpu_pm_notify(CPU_CLUSTER_PM_ENTER, -1, &nr_calls);
if (ret)
/*
@@ -167,7 +150,6 @@ int cpu_cluster_pm_enter(void)
* PM entry who are notified earlier to prepare for it.
*/
cpu_pm_notify(CPU_CLUSTER_PM_ENTER_FAILED, nr_calls - 1, NULL);
- read_unlock(&cpu_pm_notifier_lock);
return ret;
}
@@ -190,13 +172,7 @@ EXPORT_SYMBOL_GPL(cpu_cluster_pm_enter);
*/
int cpu_cluster_pm_exit(void)
{
- int ret;
-
- read_lock(&cpu_pm_notifier_lock);
- ret = cpu_pm_notify(CPU_CLUSTER_PM_EXIT, -1, NULL);
- read_unlock(&cpu_pm_notifier_lock);
-
- return ret;
+ return cpu_pm_notify(CPU_CLUSTER_PM_EXIT, -1, NULL);
}
EXPORT_SYMBOL_GPL(cpu_cluster_pm_exit);
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 294f1927f944..6bc21e202ae4 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -8134,7 +8134,7 @@ static void perf_event_free_bpf_handler(struct perf_event *event)
static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
{
- bool is_kprobe, is_tracepoint;
+ bool is_kprobe, is_tracepoint, is_syscall_tp;
struct bpf_prog *prog;
if (event->attr.type != PERF_TYPE_TRACEPOINT)
@@ -8145,7 +8145,8 @@ static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
is_kprobe = event->tp_event->flags & TRACE_EVENT_FL_UKPROBE;
is_tracepoint = event->tp_event->flags & TRACE_EVENT_FL_TRACEPOINT;
- if (!is_kprobe && !is_tracepoint)
+ is_syscall_tp = is_syscall_trace_event(event->tp_event);
+ if (!is_kprobe && !is_tracepoint && !is_syscall_tp)
/* bpf programs can only be attached to u/kprobe or tracepoint */
return -EINVAL;
@@ -8154,13 +8155,14 @@ static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
return PTR_ERR(prog);
if ((is_kprobe && prog->type != BPF_PROG_TYPE_KPROBE) ||
- (is_tracepoint && prog->type != BPF_PROG_TYPE_TRACEPOINT)) {
+ (is_tracepoint && prog->type != BPF_PROG_TYPE_TRACEPOINT) ||
+ (is_syscall_tp && prog->type != BPF_PROG_TYPE_TRACEPOINT)) {
/* valid fd, but invalid bpf program type */
bpf_prog_put(prog);
return -EINVAL;
}
- if (is_tracepoint) {
+ if (is_tracepoint || is_syscall_tp) {
int off = trace_event_get_offsets(event->tp_event);
if (prog->aux->max_ctx_offset > off) {
@@ -8169,6 +8171,7 @@ static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
}
}
event->tp_event->prog = prog;
+ event->tp_event->bpf_prog_owner = event;
return 0;
}
@@ -8183,7 +8186,7 @@ static void perf_event_free_bpf_prog(struct perf_event *event)
return;
prog = event->tp_event->prog;
- if (prog) {
+ if (prog && event->tp_event->bpf_prog_owner == event) {
event->tp_event->prog = NULL;
bpf_prog_put(prog);
}
@@ -11291,5 +11294,6 @@ struct cgroup_subsys perf_event_cgrp_subsys = {
* controller is not mounted on a legacy hierarchy.
*/
.implicit_on_dfl = true,
+ .threaded = true,
};
#endif /* CONFIG_CGROUP_PERF */
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index af71a84e12ee..f684d8e5fa2b 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -412,6 +412,19 @@ err:
return NULL;
}
+static bool __always_inline rb_need_aux_wakeup(struct ring_buffer *rb)
+{
+ if (rb->aux_overwrite)
+ return false;
+
+ if (rb->aux_head - rb->aux_wakeup >= rb->aux_watermark) {
+ rb->aux_wakeup = rounddown(rb->aux_head, rb->aux_watermark);
+ return true;
+ }
+
+ return false;
+}
+
/*
* Commit the data written by hardware into the ring buffer by adjusting
* aux_head and posting a PERF_RECORD_AUX into the perf buffer. It is the
@@ -451,10 +464,8 @@ void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size)
}
rb->user_page->aux_head = rb->aux_head;
- if (rb->aux_head - rb->aux_wakeup >= rb->aux_watermark) {
+ if (rb_need_aux_wakeup(rb))
wakeup = true;
- rb->aux_wakeup = rounddown(rb->aux_head, rb->aux_watermark);
- }
if (wakeup) {
if (handle->aux_flags & PERF_AUX_FLAG_TRUNCATED)
@@ -484,9 +495,8 @@ int perf_aux_output_skip(struct perf_output_handle *handle, unsigned long size)
rb->aux_head += size;
rb->user_page->aux_head = rb->aux_head;
- if (rb->aux_head - rb->aux_wakeup >= rb->aux_watermark) {
+ if (rb_need_aux_wakeup(rb)) {
perf_output_wakeup(handle);
- rb->aux_wakeup = rounddown(rb->aux_head, rb->aux_watermark);
handle->wakeup = rb->aux_wakeup + rb->aux_watermark;
}
diff --git a/kernel/exit.c b/kernel/exit.c
index a35d8a17e01f..f2cd53e92147 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1600,12 +1600,10 @@ SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *,
struct waitid_info info = {.status = 0};
long err = kernel_waitid(which, upid, &info, options, ru ? &r : NULL);
int signo = 0;
+
if (err > 0) {
signo = SIGCHLD;
err = 0;
- }
-
- if (!err) {
if (ru && copy_to_user(ru, &r, sizeof(struct rusage)))
return -EFAULT;
}
@@ -1615,7 +1613,7 @@ SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *,
user_access_begin();
unsafe_put_user(signo, &infop->si_signo, Efault);
unsafe_put_user(0, &infop->si_errno, Efault);
- unsafe_put_user((short)info.cause, &infop->si_code, Efault);
+ unsafe_put_user(info.cause, &infop->si_code, Efault);
unsafe_put_user(info.pid, &infop->si_pid, Efault);
unsafe_put_user(info.uid, &infop->si_uid, Efault);
unsafe_put_user(info.status, &infop->si_status, Efault);
@@ -1723,16 +1721,15 @@ COMPAT_SYSCALL_DEFINE5(waitid,
if (err > 0) {
signo = SIGCHLD;
err = 0;
- }
-
- if (!err && uru) {
- /* kernel_waitid() overwrites everything in ru */
- if (COMPAT_USE_64BIT_TIME)
- err = copy_to_user(uru, &ru, sizeof(ru));
- else
- err = put_compat_rusage(&ru, uru);
- if (err)
- return -EFAULT;
+ if (uru) {
+ /* kernel_waitid() overwrites everything in ru */
+ if (COMPAT_USE_64BIT_TIME)
+ err = copy_to_user(uru, &ru, sizeof(ru));
+ else
+ err = put_compat_rusage(&ru, uru);
+ if (err)
+ return -EFAULT;
+ }
}
if (!infop)
@@ -1741,7 +1738,7 @@ COMPAT_SYSCALL_DEFINE5(waitid,
user_access_begin();
unsafe_put_user(signo, &infop->si_signo, Efault);
unsafe_put_user(0, &infop->si_errno, Efault);
- unsafe_put_user((short)info.cause, &infop->si_code, Efault);
+ unsafe_put_user(info.cause, &infop->si_code, Efault);
unsafe_put_user(info.pid, &infop->si_pid, Efault);
unsafe_put_user(info.uid, &infop->si_uid, Efault);
unsafe_put_user(info.status, &infop->si_status, Efault);
diff --git a/kernel/extable.c b/kernel/extable.c
index 38c2412401a1..9aa1cc41ecf7 100644
--- a/kernel/extable.c
+++ b/kernel/extable.c
@@ -102,15 +102,7 @@ int core_kernel_data(unsigned long addr)
int __kernel_text_address(unsigned long addr)
{
- if (core_kernel_text(addr))
- return 1;
- if (is_module_text_address(addr))
- return 1;
- if (is_ftrace_trampoline(addr))
- return 1;
- if (is_kprobe_optinsn_slot(addr) || is_kprobe_insn_slot(addr))
- return 1;
- if (is_bpf_text_address(addr))
+ if (kernel_text_address(addr))
return 1;
/*
* There might be init symbols in saved stacktraces.
@@ -127,17 +119,42 @@ int __kernel_text_address(unsigned long addr)
int kernel_text_address(unsigned long addr)
{
+ bool no_rcu;
+ int ret = 1;
+
if (core_kernel_text(addr))
return 1;
+
+ /*
+ * If a stack dump happens while RCU is not watching, then
+ * RCU needs to be notified that it requires to start
+ * watching again. This can happen either by tracing that
+ * triggers a stack trace, or a WARN() that happens during
+ * coming back from idle, or cpu on or offlining.
+ *
+ * is_module_text_address() as well as the kprobe slots
+ * and is_bpf_text_address() require RCU to be watching.
+ */
+ no_rcu = !rcu_is_watching();
+
+ /* Treat this like an NMI as it can happen anywhere */
+ if (no_rcu)
+ rcu_nmi_enter();
+
if (is_module_text_address(addr))
- return 1;
+ goto out;
if (is_ftrace_trampoline(addr))
- return 1;
+ goto out;
if (is_kprobe_optinsn_slot(addr) || is_kprobe_insn_slot(addr))
- return 1;
+ goto out;
if (is_bpf_text_address(addr))
- return 1;
- return 0;
+ goto out;
+ ret = 0;
+out:
+ if (no_rcu)
+ rcu_nmi_exit();
+
+ return ret;
}
/*
diff --git a/kernel/fork.c b/kernel/fork.c
index 4e5345c07344..e702cb9ffbd8 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -37,6 +37,7 @@
#include <linux/binfmts.h>
#include <linux/mman.h>
#include <linux/mmu_notifier.h>
+#include <linux/hmm.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/vmacache.h>
@@ -657,7 +658,12 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
retval = dup_userfaultfd(tmp, &uf);
if (retval)
goto fail_nomem_anon_vma_fork;
- if (anon_vma_fork(tmp, mpnt))
+ if (tmp->vm_flags & VM_WIPEONFORK) {
+ /* VM_WIPEONFORK gets a clean slate in the child. */
+ tmp->anon_vma = NULL;
+ if (anon_vma_prepare(tmp))
+ goto fail_nomem_anon_vma_fork;
+ } else if (anon_vma_fork(tmp, mpnt))
goto fail_nomem_anon_vma_fork;
tmp->vm_flags &= ~(VM_LOCKED | VM_LOCKONFAULT);
tmp->vm_next = tmp->vm_prev = NULL;
@@ -701,7 +707,8 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
rb_parent = &tmp->vm_rb;
mm->map_count++;
- retval = copy_page_range(mm, oldmm, mpnt);
+ if (!(tmp->vm_flags & VM_WIPEONFORK))
+ retval = copy_page_range(mm, oldmm, mpnt);
if (tmp->vm_ops && tmp->vm_ops->open)
tmp->vm_ops->open(tmp);
@@ -818,6 +825,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
mm_init_owner(mm, p);
RCU_INIT_POINTER(mm->exe_file, NULL);
mmu_notifier_mm_init(mm);
+ hmm_mm_init(mm);
init_tlb_flush_pending(mm);
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
mm->pmd_huge_pte = NULL;
@@ -897,6 +905,7 @@ void __mmdrop(struct mm_struct *mm)
BUG_ON(mm == &init_mm);
mm_free_pgd(mm);
destroy_context(mm);
+ hmm_mm_destroy(mm);
mmu_notifier_mm_destroy(mm);
check_mm(mm);
put_user_ns(mm->user_ns);
@@ -922,7 +931,6 @@ static inline void __mmput(struct mm_struct *mm)
}
if (mm->binfmt)
module_put(mm->binfmt->module);
- set_bit(MMF_OOM_SKIP, &mm->flags);
mmdrop(mm);
}
@@ -941,7 +949,9 @@ EXPORT_SYMBOL_GPL(mmput);
#ifdef CONFIG_MMU
static void mmput_async_fn(struct work_struct *work)
{
- struct mm_struct *mm = container_of(work, struct mm_struct, async_put_work);
+ struct mm_struct *mm = container_of(work, struct mm_struct,
+ async_put_work);
+
__mmput(mm);
}
@@ -1470,8 +1480,7 @@ static void rt_mutex_init_task(struct task_struct *p)
{
raw_spin_lock_init(&p->pi_lock);
#ifdef CONFIG_RT_MUTEXES
- p->pi_waiters = RB_ROOT;
- p->pi_waiters_leftmost = NULL;
+ p->pi_waiters = RB_ROOT_CACHED;
p->pi_top_task = NULL;
p->pi_blocked_on = NULL;
#endif
@@ -1578,10 +1587,6 @@ static __latent_entropy struct task_struct *copy_process(
return ERR_PTR(-EINVAL);
}
- retval = security_task_create(clone_flags);
- if (retval)
- goto fork_out;
-
retval = -ENOMEM;
p = dup_task_struct(current, node);
if (!p)
diff --git a/kernel/futex.c b/kernel/futex.c
index 3d38eaf05492..0518a0bfc746 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -821,8 +821,6 @@ static void get_pi_state(struct futex_pi_state *pi_state)
/*
* Drops a reference to the pi_state object and frees or caches it
* when the last reference is gone.
- *
- * Must be called with the hb lock held.
*/
static void put_pi_state(struct futex_pi_state *pi_state)
{
@@ -837,16 +835,22 @@ static void put_pi_state(struct futex_pi_state *pi_state)
* and has cleaned up the pi_state already
*/
if (pi_state->owner) {
- raw_spin_lock_irq(&pi_state->owner->pi_lock);
- list_del_init(&pi_state->list);
- raw_spin_unlock_irq(&pi_state->owner->pi_lock);
+ struct task_struct *owner;
- rt_mutex_proxy_unlock(&pi_state->pi_mutex, pi_state->owner);
+ raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
+ owner = pi_state->owner;
+ if (owner) {
+ raw_spin_lock(&owner->pi_lock);
+ list_del_init(&pi_state->list);
+ raw_spin_unlock(&owner->pi_lock);
+ }
+ rt_mutex_proxy_unlock(&pi_state->pi_mutex, owner);
+ raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
}
- if (current->pi_state_cache)
+ if (current->pi_state_cache) {
kfree(pi_state);
- else {
+ } else {
/*
* pi_state->list is already empty.
* clear pi_state->owner.
@@ -907,13 +911,14 @@ void exit_pi_state_list(struct task_struct *curr)
raw_spin_unlock_irq(&curr->pi_lock);
spin_lock(&hb->lock);
-
- raw_spin_lock_irq(&curr->pi_lock);
+ raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
+ raw_spin_lock(&curr->pi_lock);
/*
* We dropped the pi-lock, so re-check whether this
* task still owns the PI-state:
*/
if (head->next != next) {
+ raw_spin_unlock(&pi_state->pi_mutex.wait_lock);
spin_unlock(&hb->lock);
continue;
}
@@ -922,9 +927,10 @@ void exit_pi_state_list(struct task_struct *curr)
WARN_ON(list_empty(&pi_state->list));
list_del_init(&pi_state->list);
pi_state->owner = NULL;
- raw_spin_unlock_irq(&curr->pi_lock);
+ raw_spin_unlock(&curr->pi_lock);
get_pi_state(pi_state);
+ raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
spin_unlock(&hb->lock);
rt_mutex_futex_unlock(&pi_state->pi_mutex);
@@ -1208,6 +1214,10 @@ static int attach_to_pi_owner(u32 uval, union futex_key *key,
WARN_ON(!list_empty(&pi_state->list));
list_add(&pi_state->list, &p->pi_state_list);
+ /*
+ * Assignment without holding pi_state->pi_mutex.wait_lock is safe
+ * because there is no concurrency as the object is not published yet.
+ */
pi_state->owner = p;
raw_spin_unlock_irq(&p->pi_lock);
@@ -2878,6 +2888,7 @@ retry:
raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
spin_unlock(&hb->lock);
+ /* drops pi_state->pi_mutex.wait_lock */
ret = wake_futex_pi(uaddr, uval, pi_state);
put_pi_state(pi_state);
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index f51b7b6d2451..6fc89fd93824 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -202,7 +202,7 @@ __irq_startup_managed(struct irq_desc *desc, struct cpumask *aff, bool force)
irqd_clr_managed_shutdown(d);
- if (cpumask_any_and(aff, cpu_online_mask) > nr_cpu_ids) {
+ if (cpumask_any_and(aff, cpu_online_mask) >= nr_cpu_ids) {
/*
* Catch code which fiddles with enable_irq() on a managed
* and potentially shutdown IRQ. Chained interrupt
diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c
index f7086b78ad6e..5270a54b9fa4 100644
--- a/kernel/irq/generic-chip.c
+++ b/kernel/irq/generic-chip.c
@@ -322,7 +322,6 @@ int __irq_alloc_domain_generic_chips(struct irq_domain *d, int irqs_per_chip,
/* Calc pointer to the next generic chip */
tmp += sizeof(*gc) + num_ct * sizeof(struct irq_chip_type);
}
- d->name = name;
return 0;
}
EXPORT_SYMBOL_GPL(__irq_alloc_domain_generic_chips);
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 73be2b3909bd..82afb7ed369f 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -421,10 +421,8 @@ static void free_desc(unsigned int irq)
* The sysfs entry must be serialized against a concurrent
* irq_sysfs_init() as well.
*/
- mutex_lock(&sparse_irq_lock);
kobject_del(&desc->kobj);
delete_irq_desc(irq);
- mutex_unlock(&sparse_irq_lock);
/*
* We free the descriptor, masks and stat fields via RCU. That
@@ -462,20 +460,15 @@ static int alloc_descs(unsigned int start, unsigned int cnt, int node,
desc = alloc_desc(start + i, node, flags, mask, owner);
if (!desc)
goto err;
- mutex_lock(&sparse_irq_lock);
irq_insert_desc(start + i, desc);
irq_sysfs_add(start + i, desc);
- mutex_unlock(&sparse_irq_lock);
}
+ bitmap_set(allocated_irqs, start, cnt);
return start;
err:
for (i--; i >= 0; i--)
free_desc(start + i);
-
- mutex_lock(&sparse_irq_lock);
- bitmap_clear(allocated_irqs, start, cnt);
- mutex_unlock(&sparse_irq_lock);
return -ENOMEM;
}
@@ -575,6 +568,7 @@ static inline int alloc_descs(unsigned int start, unsigned int cnt, int node,
desc->owner = owner;
}
+ bitmap_set(allocated_irqs, start, cnt);
return start;
}
@@ -670,10 +664,10 @@ void irq_free_descs(unsigned int from, unsigned int cnt)
if (from >= nr_irqs || (from + cnt) > nr_irqs)
return;
+ mutex_lock(&sparse_irq_lock);
for (i = 0; i < cnt; i++)
free_desc(from + i);
- mutex_lock(&sparse_irq_lock);
bitmap_clear(allocated_irqs, from, cnt);
mutex_unlock(&sparse_irq_lock);
}
@@ -720,19 +714,15 @@ __irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node,
from, cnt, 0);
ret = -EEXIST;
if (irq >=0 && start != irq)
- goto err;
+ goto unlock;
if (start + cnt > nr_irqs) {
ret = irq_expand_nr_irqs(start + cnt);
if (ret)
- goto err;
+ goto unlock;
}
-
- bitmap_set(allocated_irqs, start, cnt);
- mutex_unlock(&sparse_irq_lock);
- return alloc_descs(start, cnt, node, affinity, owner);
-
-err:
+ ret = alloc_descs(start, cnt, node, affinity, owner);
+unlock:
mutex_unlock(&sparse_irq_lock);
return ret;
}
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index d62351714f3e..ac4644e92b49 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -41,6 +41,9 @@ static inline void debugfs_add_domain_dir(struct irq_domain *d) { }
static inline void debugfs_remove_domain_dir(struct irq_domain *d) { }
#endif
+const struct fwnode_operations irqchip_fwnode_ops;
+EXPORT_SYMBOL_GPL(irqchip_fwnode_ops);
+
/**
* irq_domain_alloc_fwnode - Allocate a fwnode_handle suitable for
* identifying an irq domain
@@ -86,7 +89,7 @@ struct fwnode_handle *__irq_domain_alloc_fwnode(unsigned int type, int id,
fwid->type = type;
fwid->name = n;
fwid->data = data;
- fwid->fwnode.type = FWNODE_IRQCHIP;
+ fwid->fwnode.ops = &irqchip_fwnode_ops;
return &fwid->fwnode;
}
EXPORT_SYMBOL_GPL(__irq_domain_alloc_fwnode);
@@ -193,10 +196,8 @@ struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, int size,
}
if (!domain->name) {
- if (fwnode) {
- pr_err("Invalid fwnode type (%d) for irqdomain\n",
- fwnode->type);
- }
+ if (fwnode)
+ pr_err("Invalid fwnode type for irqdomain\n");
domain->name = kasprintf(GFP_KERNEL, "unknown-%d",
atomic_inc_return(&unknown_domains));
if (!domain->name) {
@@ -944,7 +945,7 @@ static int virq_debug_show(struct seq_file *m, void *private)
struct irq_desc *desc;
struct irq_domain *domain;
struct radix_tree_iter iter;
- void **slot;
+ void __rcu **slot;
int i;
seq_printf(m, " %-16s %-6s %-10s %-10s %s\n",
@@ -1452,7 +1453,7 @@ out_free_desc:
/* The irq_data was moved, fix the revmap to refer to the new location */
static void irq_domain_fix_revmap(struct irq_data *d)
{
- void **slot;
+ void __rcu **slot;
if (d->hwirq < d->domain->revmap_size)
return; /* Not using radix tree. */
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 573dc52b0806..d00132b5c325 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -1643,6 +1643,10 @@ const void *free_irq(unsigned int irq, void *dev_id)
#endif
action = __free_irq(irq, dev_id);
+
+ if (!action)
+ return NULL;
+
devname = action->name;
kfree(action);
return devname;
diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c
index 48eadf416c24..3fa4bd59f569 100644
--- a/kernel/irq/msi.c
+++ b/kernel/irq/msi.c
@@ -315,11 +315,12 @@ int msi_domain_populate_irqs(struct irq_domain *domain, struct device *dev,
ops->set_desc(arg, desc);
/* Assumes the domain mutex is held! */
- ret = irq_domain_alloc_irqs_hierarchy(domain, virq, 1, arg);
+ ret = irq_domain_alloc_irqs_hierarchy(domain, desc->irq, 1,
+ arg);
if (ret)
break;
- irq_set_msi_desc_off(virq, 0, desc);
+ irq_set_msi_desc_off(desc->irq, 0, desc);
}
if (ret) {
diff --git a/kernel/kcmp.c b/kernel/kcmp.c
index ea34ed8bb952..055bb2962a0b 100644
--- a/kernel/kcmp.c
+++ b/kernel/kcmp.c
@@ -131,7 +131,7 @@ static int kcmp_epoll_target(struct task_struct *task1,
if (filp_epoll) {
filp_tgt = get_epoll_tfile_raw_ptr(filp_epoll, slot.tfd, slot.toff);
fput(filp_epoll);
- } else
+ }
if (IS_ERR(filp_tgt))
return PTR_ERR(filp_tgt);
diff --git a/kernel/kcov.c b/kernel/kcov.c
index cd771993f96f..3f693a0f6f3e 100644
--- a/kernel/kcov.c
+++ b/kernel/kcov.c
@@ -270,6 +270,7 @@ static long kcov_ioctl(struct file *filep, unsigned int cmd, unsigned long arg)
static const struct file_operations kcov_fops = {
.open = kcov_open,
.unlocked_ioctl = kcov_ioctl,
+ .compat_ioctl = kcov_ioctl,
.mmap = kcov_mmap,
.release = kcov_close,
};
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 2f37acde640b..bc6addd9152b 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -1,23 +1,6 @@
/*
- kmod, the new module loader (replaces kerneld)
- Kirk Petersen
-
- Reorganized not to be a daemon by Adam Richter, with guidance
- from Greg Zornetzer.
-
- Modified to avoid chroot and file sharing problems.
- Mikael Pettersson
-
- Limit the concurrent number of kmod modprobes to catch loops from
- "modprobe needs a service that is in a module".
- Keith Owens <kaos@ocs.com.au> December 1999
-
- Unblock all signals when we exec a usermode process.
- Shuu Yamaguchi <shuu@wondernetworkresources.com> December 2000
-
- call_usermodehelper wait flag, and remove exec_usermodehelper.
- Rusty Russell <rusty@rustcorp.com.au> Jan 2003
-*/
+ * kmod - the kernel module loader
+ */
#include <linux/module.h>
#include <linux/sched.h>
#include <linux/sched/task.h>
@@ -45,15 +28,6 @@
#include <trace/events/module.h>
-#define CAP_BSET (void *)1
-#define CAP_PI (void *)2
-
-static kernel_cap_t usermodehelper_bset = CAP_FULL_SET;
-static kernel_cap_t usermodehelper_inheritable = CAP_FULL_SET;
-static DEFINE_SPINLOCK(umh_sysctl_lock);
-static DECLARE_RWSEM(umhelper_sem);
-
-#ifdef CONFIG_MODULES
/*
* Assuming:
*
@@ -202,536 +176,3 @@ int __request_module(bool wait, const char *fmt, ...)
return ret;
}
EXPORT_SYMBOL(__request_module);
-
-#endif /* CONFIG_MODULES */
-
-static void call_usermodehelper_freeinfo(struct subprocess_info *info)
-{
- if (info->cleanup)
- (*info->cleanup)(info);
- kfree(info);
-}
-
-static void umh_complete(struct subprocess_info *sub_info)
-{
- struct completion *comp = xchg(&sub_info->complete, NULL);
- /*
- * See call_usermodehelper_exec(). If xchg() returns NULL
- * we own sub_info, the UMH_KILLABLE caller has gone away
- * or the caller used UMH_NO_WAIT.
- */
- if (comp)
- complete(comp);
- else
- call_usermodehelper_freeinfo(sub_info);
-}
-
-/*
- * This is the task which runs the usermode application
- */
-static int call_usermodehelper_exec_async(void *data)
-{
- struct subprocess_info *sub_info = data;
- struct cred *new;
- int retval;
-
- spin_lock_irq(&current->sighand->siglock);
- flush_signal_handlers(current, 1);
- spin_unlock_irq(&current->sighand->siglock);
-
- /*
- * Our parent (unbound workqueue) runs with elevated scheduling
- * priority. Avoid propagating that into the userspace child.
- */
- set_user_nice(current, 0);
-
- retval = -ENOMEM;
- new = prepare_kernel_cred(current);
- if (!new)
- goto out;
-
- spin_lock(&umh_sysctl_lock);
- new->cap_bset = cap_intersect(usermodehelper_bset, new->cap_bset);
- new->cap_inheritable = cap_intersect(usermodehelper_inheritable,
- new->cap_inheritable);
- spin_unlock(&umh_sysctl_lock);
-
- if (sub_info->init) {
- retval = sub_info->init(sub_info, new);
- if (retval) {
- abort_creds(new);
- goto out;
- }
- }
-
- commit_creds(new);
-
- retval = do_execve(getname_kernel(sub_info->path),
- (const char __user *const __user *)sub_info->argv,
- (const char __user *const __user *)sub_info->envp);
-out:
- sub_info->retval = retval;
- /*
- * call_usermodehelper_exec_sync() will call umh_complete
- * if UHM_WAIT_PROC.
- */
- if (!(sub_info->wait & UMH_WAIT_PROC))
- umh_complete(sub_info);
- if (!retval)
- return 0;
- do_exit(0);
-}
-
-/* Handles UMH_WAIT_PROC. */
-static void call_usermodehelper_exec_sync(struct subprocess_info *sub_info)
-{
- pid_t pid;
-
- /* If SIGCLD is ignored sys_wait4 won't populate the status. */
- kernel_sigaction(SIGCHLD, SIG_DFL);
- pid = kernel_thread(call_usermodehelper_exec_async, sub_info, SIGCHLD);
- if (pid < 0) {
- sub_info->retval = pid;
- } else {
- int ret = -ECHILD;
- /*
- * Normally it is bogus to call wait4() from in-kernel because
- * wait4() wants to write the exit code to a userspace address.
- * But call_usermodehelper_exec_sync() always runs as kernel
- * thread (workqueue) and put_user() to a kernel address works
- * OK for kernel threads, due to their having an mm_segment_t
- * which spans the entire address space.
- *
- * Thus the __user pointer cast is valid here.
- */
- sys_wait4(pid, (int __user *)&ret, 0, NULL);
-
- /*
- * If ret is 0, either call_usermodehelper_exec_async failed and
- * the real error code is already in sub_info->retval or
- * sub_info->retval is 0 anyway, so don't mess with it then.
- */
- if (ret)
- sub_info->retval = ret;
- }
-
- /* Restore default kernel sig handler */
- kernel_sigaction(SIGCHLD, SIG_IGN);
-
- umh_complete(sub_info);
-}
-
-/*
- * We need to create the usermodehelper kernel thread from a task that is affine
- * to an optimized set of CPUs (or nohz housekeeping ones) such that they
- * inherit a widest affinity irrespective of call_usermodehelper() callers with
- * possibly reduced affinity (eg: per-cpu workqueues). We don't want
- * usermodehelper targets to contend a busy CPU.
- *
- * Unbound workqueues provide such wide affinity and allow to block on
- * UMH_WAIT_PROC requests without blocking pending request (up to some limit).
- *
- * Besides, workqueues provide the privilege level that caller might not have
- * to perform the usermodehelper request.
- *
- */
-static void call_usermodehelper_exec_work(struct work_struct *work)
-{
- struct subprocess_info *sub_info =
- container_of(work, struct subprocess_info, work);
-
- if (sub_info->wait & UMH_WAIT_PROC) {
- call_usermodehelper_exec_sync(sub_info);
- } else {
- pid_t pid;
- /*
- * Use CLONE_PARENT to reparent it to kthreadd; we do not
- * want to pollute current->children, and we need a parent
- * that always ignores SIGCHLD to ensure auto-reaping.
- */
- pid = kernel_thread(call_usermodehelper_exec_async, sub_info,
- CLONE_PARENT | SIGCHLD);
- if (pid < 0) {
- sub_info->retval = pid;
- umh_complete(sub_info);
- }
- }
-}
-
-/*
- * If set, call_usermodehelper_exec() will exit immediately returning -EBUSY
- * (used for preventing user land processes from being created after the user
- * land has been frozen during a system-wide hibernation or suspend operation).
- * Should always be manipulated under umhelper_sem acquired for write.
- */
-static enum umh_disable_depth usermodehelper_disabled = UMH_DISABLED;
-
-/* Number of helpers running */
-static atomic_t running_helpers = ATOMIC_INIT(0);
-
-/*
- * Wait queue head used by usermodehelper_disable() to wait for all running
- * helpers to finish.
- */
-static DECLARE_WAIT_QUEUE_HEAD(running_helpers_waitq);
-
-/*
- * Used by usermodehelper_read_lock_wait() to wait for usermodehelper_disabled
- * to become 'false'.
- */
-static DECLARE_WAIT_QUEUE_HEAD(usermodehelper_disabled_waitq);
-
-/*
- * Time to wait for running_helpers to become zero before the setting of
- * usermodehelper_disabled in usermodehelper_disable() fails
- */
-#define RUNNING_HELPERS_TIMEOUT (5 * HZ)
-
-int usermodehelper_read_trylock(void)
-{
- DEFINE_WAIT(wait);
- int ret = 0;
-
- down_read(&umhelper_sem);
- for (;;) {
- prepare_to_wait(&usermodehelper_disabled_waitq, &wait,
- TASK_INTERRUPTIBLE);
- if (!usermodehelper_disabled)
- break;
-
- if (usermodehelper_disabled == UMH_DISABLED)
- ret = -EAGAIN;
-
- up_read(&umhelper_sem);
-
- if (ret)
- break;
-
- schedule();
- try_to_freeze();
-
- down_read(&umhelper_sem);
- }
- finish_wait(&usermodehelper_disabled_waitq, &wait);
- return ret;
-}
-EXPORT_SYMBOL_GPL(usermodehelper_read_trylock);
-
-long usermodehelper_read_lock_wait(long timeout)
-{
- DEFINE_WAIT(wait);
-
- if (timeout < 0)
- return -EINVAL;
-
- down_read(&umhelper_sem);
- for (;;) {
- prepare_to_wait(&usermodehelper_disabled_waitq, &wait,
- TASK_UNINTERRUPTIBLE);
- if (!usermodehelper_disabled)
- break;
-
- up_read(&umhelper_sem);
-
- timeout = schedule_timeout(timeout);
- if (!timeout)
- break;
-
- down_read(&umhelper_sem);
- }
- finish_wait(&usermodehelper_disabled_waitq, &wait);
- return timeout;
-}
-EXPORT_SYMBOL_GPL(usermodehelper_read_lock_wait);
-
-void usermodehelper_read_unlock(void)
-{
- up_read(&umhelper_sem);
-}
-EXPORT_SYMBOL_GPL(usermodehelper_read_unlock);
-
-/**
- * __usermodehelper_set_disable_depth - Modify usermodehelper_disabled.
- * @depth: New value to assign to usermodehelper_disabled.
- *
- * Change the value of usermodehelper_disabled (under umhelper_sem locked for
- * writing) and wakeup tasks waiting for it to change.
- */
-void __usermodehelper_set_disable_depth(enum umh_disable_depth depth)
-{
- down_write(&umhelper_sem);
- usermodehelper_disabled = depth;
- wake_up(&usermodehelper_disabled_waitq);
- up_write(&umhelper_sem);
-}
-
-/**
- * __usermodehelper_disable - Prevent new helpers from being started.
- * @depth: New value to assign to usermodehelper_disabled.
- *
- * Set usermodehelper_disabled to @depth and wait for running helpers to exit.
- */
-int __usermodehelper_disable(enum umh_disable_depth depth)
-{
- long retval;
-
- if (!depth)
- return -EINVAL;
-
- down_write(&umhelper_sem);
- usermodehelper_disabled = depth;
- up_write(&umhelper_sem);
-
- /*
- * From now on call_usermodehelper_exec() won't start any new
- * helpers, so it is sufficient if running_helpers turns out to
- * be zero at one point (it may be increased later, but that
- * doesn't matter).
- */
- retval = wait_event_timeout(running_helpers_waitq,
- atomic_read(&running_helpers) == 0,
- RUNNING_HELPERS_TIMEOUT);
- if (retval)
- return 0;
-
- __usermodehelper_set_disable_depth(UMH_ENABLED);
- return -EAGAIN;
-}
-
-static void helper_lock(void)
-{
- atomic_inc(&running_helpers);
- smp_mb__after_atomic();
-}
-
-static void helper_unlock(void)
-{
- if (atomic_dec_and_test(&running_helpers))
- wake_up(&running_helpers_waitq);
-}
-
-/**
- * call_usermodehelper_setup - prepare to call a usermode helper
- * @path: path to usermode executable
- * @argv: arg vector for process
- * @envp: environment for process
- * @gfp_mask: gfp mask for memory allocation
- * @cleanup: a cleanup function
- * @init: an init function
- * @data: arbitrary context sensitive data
- *
- * Returns either %NULL on allocation failure, or a subprocess_info
- * structure. This should be passed to call_usermodehelper_exec to
- * exec the process and free the structure.
- *
- * The init function is used to customize the helper process prior to
- * exec. A non-zero return code causes the process to error out, exit,
- * and return the failure to the calling process
- *
- * The cleanup function is just before ethe subprocess_info is about to
- * be freed. This can be used for freeing the argv and envp. The
- * Function must be runnable in either a process context or the
- * context in which call_usermodehelper_exec is called.
- */
-struct subprocess_info *call_usermodehelper_setup(const char *path, char **argv,
- char **envp, gfp_t gfp_mask,
- int (*init)(struct subprocess_info *info, struct cred *new),
- void (*cleanup)(struct subprocess_info *info),
- void *data)
-{
- struct subprocess_info *sub_info;
- sub_info = kzalloc(sizeof(struct subprocess_info), gfp_mask);
- if (!sub_info)
- goto out;
-
- INIT_WORK(&sub_info->work, call_usermodehelper_exec_work);
-
-#ifdef CONFIG_STATIC_USERMODEHELPER
- sub_info->path = CONFIG_STATIC_USERMODEHELPER_PATH;
-#else
- sub_info->path = path;
-#endif
- sub_info->argv = argv;
- sub_info->envp = envp;
-
- sub_info->cleanup = cleanup;
- sub_info->init = init;
- sub_info->data = data;
- out:
- return sub_info;
-}
-EXPORT_SYMBOL(call_usermodehelper_setup);
-
-/**
- * call_usermodehelper_exec - start a usermode application
- * @sub_info: information about the subprocessa
- * @wait: wait for the application to finish and return status.
- * when UMH_NO_WAIT don't wait at all, but you get no useful error back
- * when the program couldn't be exec'ed. This makes it safe to call
- * from interrupt context.
- *
- * Runs a user-space application. The application is started
- * asynchronously if wait is not set, and runs as a child of system workqueues.
- * (ie. it runs with full root capabilities and optimized affinity).
- */
-int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait)
-{
- DECLARE_COMPLETION_ONSTACK(done);
- int retval = 0;
-
- if (!sub_info->path) {
- call_usermodehelper_freeinfo(sub_info);
- return -EINVAL;
- }
- helper_lock();
- if (usermodehelper_disabled) {
- retval = -EBUSY;
- goto out;
- }
-
- /*
- * If there is no binary for us to call, then just return and get out of
- * here. This allows us to set STATIC_USERMODEHELPER_PATH to "" and
- * disable all call_usermodehelper() calls.
- */
- if (strlen(sub_info->path) == 0)
- goto out;
-
- /*
- * Set the completion pointer only if there is a waiter.
- * This makes it possible to use umh_complete to free
- * the data structure in case of UMH_NO_WAIT.
- */
- sub_info->complete = (wait == UMH_NO_WAIT) ? NULL : &done;
- sub_info->wait = wait;
-
- queue_work(system_unbound_wq, &sub_info->work);
- if (wait == UMH_NO_WAIT) /* task has freed sub_info */
- goto unlock;
-
- if (wait & UMH_KILLABLE) {
- retval = wait_for_completion_killable(&done);
- if (!retval)
- goto wait_done;
-
- /* umh_complete() will see NULL and free sub_info */
- if (xchg(&sub_info->complete, NULL))
- goto unlock;
- /* fallthrough, umh_complete() was already called */
- }
-
- wait_for_completion(&done);
-wait_done:
- retval = sub_info->retval;
-out:
- call_usermodehelper_freeinfo(sub_info);
-unlock:
- helper_unlock();
- return retval;
-}
-EXPORT_SYMBOL(call_usermodehelper_exec);
-
-/**
- * call_usermodehelper() - prepare and start a usermode application
- * @path: path to usermode executable
- * @argv: arg vector for process
- * @envp: environment for process
- * @wait: wait for the application to finish and return status.
- * when UMH_NO_WAIT don't wait at all, but you get no useful error back
- * when the program couldn't be exec'ed. This makes it safe to call
- * from interrupt context.
- *
- * This function is the equivalent to use call_usermodehelper_setup() and
- * call_usermodehelper_exec().
- */
-int call_usermodehelper(const char *path, char **argv, char **envp, int wait)
-{
- struct subprocess_info *info;
- gfp_t gfp_mask = (wait == UMH_NO_WAIT) ? GFP_ATOMIC : GFP_KERNEL;
-
- info = call_usermodehelper_setup(path, argv, envp, gfp_mask,
- NULL, NULL, NULL);
- if (info == NULL)
- return -ENOMEM;
-
- return call_usermodehelper_exec(info, wait);
-}
-EXPORT_SYMBOL(call_usermodehelper);
-
-static int proc_cap_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
-{
- struct ctl_table t;
- unsigned long cap_array[_KERNEL_CAPABILITY_U32S];
- kernel_cap_t new_cap;
- int err, i;
-
- if (write && (!capable(CAP_SETPCAP) ||
- !capable(CAP_SYS_MODULE)))
- return -EPERM;
-
- /*
- * convert from the global kernel_cap_t to the ulong array to print to
- * userspace if this is a read.
- */
- spin_lock(&umh_sysctl_lock);
- for (i = 0; i < _KERNEL_CAPABILITY_U32S; i++) {
- if (table->data == CAP_BSET)
- cap_array[i] = usermodehelper_bset.cap[i];
- else if (table->data == CAP_PI)
- cap_array[i] = usermodehelper_inheritable.cap[i];
- else
- BUG();
- }
- spin_unlock(&umh_sysctl_lock);
-
- t = *table;
- t.data = &cap_array;
-
- /*
- * actually read or write and array of ulongs from userspace. Remember
- * these are least significant 32 bits first
- */
- err = proc_doulongvec_minmax(&t, write, buffer, lenp, ppos);
- if (err < 0)
- return err;
-
- /*
- * convert from the sysctl array of ulongs to the kernel_cap_t
- * internal representation
- */
- for (i = 0; i < _KERNEL_CAPABILITY_U32S; i++)
- new_cap.cap[i] = cap_array[i];
-
- /*
- * Drop everything not in the new_cap (but don't add things)
- */
- spin_lock(&umh_sysctl_lock);
- if (write) {
- if (table->data == CAP_BSET)
- usermodehelper_bset = cap_intersect(usermodehelper_bset, new_cap);
- if (table->data == CAP_PI)
- usermodehelper_inheritable = cap_intersect(usermodehelper_inheritable, new_cap);
- }
- spin_unlock(&umh_sysctl_lock);
-
- return 0;
-}
-
-struct ctl_table usermodehelper_table[] = {
- {
- .procname = "bset",
- .data = CAP_BSET,
- .maxlen = _KERNEL_CAPABILITY_U32S * sizeof(unsigned long),
- .mode = 0600,
- .proc_handler = proc_cap_handler,
- },
- {
- .procname = "inheritable",
- .data = CAP_PI,
- .maxlen = _KERNEL_CAPABILITY_U32S * sizeof(unsigned long),
- .mode = 0600,
- .proc_handler = proc_cap_handler,
- },
- { }
-};
diff --git a/kernel/locking/rtmutex-debug.c b/kernel/locking/rtmutex-debug.c
index ac35e648b0e5..f4a74e78d467 100644
--- a/kernel/locking/rtmutex-debug.c
+++ b/kernel/locking/rtmutex-debug.c
@@ -58,7 +58,7 @@ static void printk_lock(struct rt_mutex *lock, int print_owner)
void rt_mutex_debug_task_free(struct task_struct *task)
{
- DEBUG_LOCKS_WARN_ON(!RB_EMPTY_ROOT(&task->pi_waiters));
+ DEBUG_LOCKS_WARN_ON(!RB_EMPTY_ROOT(&task->pi_waiters.rb_root));
DEBUG_LOCKS_WARN_ON(task->pi_blocked_on);
}
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index 649dc9d3951a..6f3dba6e4e9e 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -271,10 +271,10 @@ rt_mutex_waiter_equal(struct rt_mutex_waiter *left,
static void
rt_mutex_enqueue(struct rt_mutex *lock, struct rt_mutex_waiter *waiter)
{
- struct rb_node **link = &lock->waiters.rb_node;
+ struct rb_node **link = &lock->waiters.rb_root.rb_node;
struct rb_node *parent = NULL;
struct rt_mutex_waiter *entry;
- int leftmost = 1;
+ bool leftmost = true;
while (*link) {
parent = *link;
@@ -283,15 +283,12 @@ rt_mutex_enqueue(struct rt_mutex *lock, struct rt_mutex_waiter *waiter)
link = &parent->rb_left;
} else {
link = &parent->rb_right;
- leftmost = 0;
+ leftmost = false;
}
}
- if (leftmost)
- lock->waiters_leftmost = &waiter->tree_entry;
-
rb_link_node(&waiter->tree_entry, parent, link);
- rb_insert_color(&waiter->tree_entry, &lock->waiters);
+ rb_insert_color_cached(&waiter->tree_entry, &lock->waiters, leftmost);
}
static void
@@ -300,20 +297,17 @@ rt_mutex_dequeue(struct rt_mutex *lock, struct rt_mutex_waiter *waiter)
if (RB_EMPTY_NODE(&waiter->tree_entry))
return;
- if (lock->waiters_leftmost == &waiter->tree_entry)
- lock->waiters_leftmost = rb_next(&waiter->tree_entry);
-
- rb_erase(&waiter->tree_entry, &lock->waiters);
+ rb_erase_cached(&waiter->tree_entry, &lock->waiters);
RB_CLEAR_NODE(&waiter->tree_entry);
}
static void
rt_mutex_enqueue_pi(struct task_struct *task, struct rt_mutex_waiter *waiter)
{
- struct rb_node **link = &task->pi_waiters.rb_node;
+ struct rb_node **link = &task->pi_waiters.rb_root.rb_node;
struct rb_node *parent = NULL;
struct rt_mutex_waiter *entry;
- int leftmost = 1;
+ bool leftmost = true;
while (*link) {
parent = *link;
@@ -322,15 +316,12 @@ rt_mutex_enqueue_pi(struct task_struct *task, struct rt_mutex_waiter *waiter)
link = &parent->rb_left;
} else {
link = &parent->rb_right;
- leftmost = 0;
+ leftmost = false;
}
}
- if (leftmost)
- task->pi_waiters_leftmost = &waiter->pi_tree_entry;
-
rb_link_node(&waiter->pi_tree_entry, parent, link);
- rb_insert_color(&waiter->pi_tree_entry, &task->pi_waiters);
+ rb_insert_color_cached(&waiter->pi_tree_entry, &task->pi_waiters, leftmost);
}
static void
@@ -339,10 +330,7 @@ rt_mutex_dequeue_pi(struct task_struct *task, struct rt_mutex_waiter *waiter)
if (RB_EMPTY_NODE(&waiter->pi_tree_entry))
return;
- if (task->pi_waiters_leftmost == &waiter->pi_tree_entry)
- task->pi_waiters_leftmost = rb_next(&waiter->pi_tree_entry);
-
- rb_erase(&waiter->pi_tree_entry, &task->pi_waiters);
+ rb_erase_cached(&waiter->pi_tree_entry, &task->pi_waiters);
RB_CLEAR_NODE(&waiter->pi_tree_entry);
}
@@ -1657,8 +1645,7 @@ void __rt_mutex_init(struct rt_mutex *lock, const char *name,
{
lock->owner = NULL;
raw_spin_lock_init(&lock->wait_lock);
- lock->waiters = RB_ROOT;
- lock->waiters_leftmost = NULL;
+ lock->waiters = RB_ROOT_CACHED;
if (name && key)
debug_rt_mutex_init(lock, name, key);
diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h
index 8d039b928d61..7453be0485a5 100644
--- a/kernel/locking/rtmutex_common.h
+++ b/kernel/locking/rtmutex_common.h
@@ -45,7 +45,7 @@ struct rt_mutex_waiter {
static inline int rt_mutex_has_waiters(struct rt_mutex *lock)
{
- return !RB_EMPTY_ROOT(&lock->waiters);
+ return !RB_EMPTY_ROOT(&lock->waiters.rb_root);
}
static inline struct rt_mutex_waiter *
@@ -53,8 +53,8 @@ rt_mutex_top_waiter(struct rt_mutex *lock)
{
struct rt_mutex_waiter *w;
- w = rb_entry(lock->waiters_leftmost, struct rt_mutex_waiter,
- tree_entry);
+ w = rb_entry(lock->waiters.rb_leftmost,
+ struct rt_mutex_waiter, tree_entry);
BUG_ON(w->lock != lock);
return w;
@@ -62,14 +62,14 @@ rt_mutex_top_waiter(struct rt_mutex *lock)
static inline int task_has_pi_waiters(struct task_struct *p)
{
- return !RB_EMPTY_ROOT(&p->pi_waiters);
+ return !RB_EMPTY_ROOT(&p->pi_waiters.rb_root);
}
static inline struct rt_mutex_waiter *
task_top_pi_waiter(struct task_struct *p)
{
- return rb_entry(p->pi_waiters_leftmost, struct rt_mutex_waiter,
- pi_tree_entry);
+ return rb_entry(p->pi_waiters.rb_leftmost,
+ struct rt_mutex_waiter, pi_tree_entry);
}
#else
diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c
index 02f660666ab8..1fefe6dcafd7 100644
--- a/kernel/locking/rwsem-xadd.c
+++ b/kernel/locking/rwsem-xadd.c
@@ -613,6 +613,33 @@ struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem)
DEFINE_WAKE_Q(wake_q);
/*
+ * __rwsem_down_write_failed_common(sem)
+ * rwsem_optimistic_spin(sem)
+ * osq_unlock(sem->osq)
+ * ...
+ * atomic_long_add_return(&sem->count)
+ *
+ * - VS -
+ *
+ * __up_write()
+ * if (atomic_long_sub_return_release(&sem->count) < 0)
+ * rwsem_wake(sem)
+ * osq_is_locked(&sem->osq)
+ *
+ * And __up_write() must observe !osq_is_locked() when it observes the
+ * atomic_long_add_return() in order to not miss a wakeup.
+ *
+ * This boils down to:
+ *
+ * [S.rel] X = 1 [RmW] r0 = (Y += 0)
+ * MB RMB
+ * [RmW] Y += 1 [L] r1 = X
+ *
+ * exists (r0=1 /\ r1=0)
+ */
+ smp_rmb();
+
+ /*
* If a spinner is present, it is not necessary to do the wakeup.
* Try to do wakeup only if the trylock succeeds to minimize
* spinlock contention which may introduce too much delay in the
diff --git a/kernel/locking/test-ww_mutex.c b/kernel/locking/test-ww_mutex.c
index 39f56c870051..0e4cd64ad2c0 100644
--- a/kernel/locking/test-ww_mutex.c
+++ b/kernel/locking/test-ww_mutex.c
@@ -362,7 +362,7 @@ static int *get_random_order(int count)
int *order;
int n, r, tmp;
- order = kmalloc_array(count, sizeof(*order), GFP_TEMPORARY);
+ order = kmalloc_array(count, sizeof(*order), GFP_KERNEL);
if (!order)
return order;
diff --git a/kernel/memremap.c b/kernel/memremap.c
index 9afdc434fb49..403ab9cdb949 100644
--- a/kernel/memremap.c
+++ b/kernel/memremap.c
@@ -11,13 +11,14 @@
* General Public License for more details.
*/
#include <linux/radix-tree.h>
-#include <linux/memremap.h>
#include <linux/device.h>
#include <linux/types.h>
#include <linux/pfn_t.h>
#include <linux/io.h>
#include <linux/mm.h>
#include <linux/memory_hotplug.h>
+#include <linux/swap.h>
+#include <linux/swapops.h>
#ifndef ioremap_cache
/* temporary while we convert existing ioremap_cache users to memremap */
@@ -194,18 +195,69 @@ struct page_map {
struct vmem_altmap altmap;
};
-static void pgmap_radix_release(struct resource *res)
+static unsigned long order_at(struct resource *res, unsigned long pgoff)
{
- resource_size_t key, align_start, align_size, align_end;
+ unsigned long phys_pgoff = PHYS_PFN(res->start) + pgoff;
+ unsigned long nr_pages, mask;
- align_start = res->start & ~(SECTION_SIZE - 1);
- align_size = ALIGN(resource_size(res), SECTION_SIZE);
- align_end = align_start + align_size - 1;
+ nr_pages = PHYS_PFN(resource_size(res));
+ if (nr_pages == pgoff)
+ return ULONG_MAX;
+
+ /*
+ * What is the largest aligned power-of-2 range available from
+ * this resource pgoff to the end of the resource range,
+ * considering the alignment of the current pgoff?
+ */
+ mask = phys_pgoff | rounddown_pow_of_two(nr_pages - pgoff);
+ if (!mask)
+ return ULONG_MAX;
+
+ return find_first_bit(&mask, BITS_PER_LONG);
+}
+
+#define foreach_order_pgoff(res, order, pgoff) \
+ for (pgoff = 0, order = order_at((res), pgoff); order < ULONG_MAX; \
+ pgoff += 1UL << order, order = order_at((res), pgoff))
+
+#if IS_ENABLED(CONFIG_DEVICE_PRIVATE)
+int device_private_entry_fault(struct vm_area_struct *vma,
+ unsigned long addr,
+ swp_entry_t entry,
+ unsigned int flags,
+ pmd_t *pmdp)
+{
+ struct page *page = device_private_entry_to_page(entry);
+
+ /*
+ * The page_fault() callback must migrate page back to system memory
+ * so that CPU can access it. This might fail for various reasons
+ * (device issue, device was unsafely unplugged, ...). When such
+ * error conditions happen, the callback must return VM_FAULT_SIGBUS.
+ *
+ * Note that because memory cgroup charges are accounted to the device
+ * memory, this should never fail because of memory restrictions (but
+ * allocation of regular system page might still fail because we are
+ * out of memory).
+ *
+ * There is a more in-depth description of what that callback can and
+ * cannot do, in include/linux/memremap.h
+ */
+ return page->pgmap->page_fault(vma, addr, page, flags, pmdp);
+}
+EXPORT_SYMBOL(device_private_entry_fault);
+#endif /* CONFIG_DEVICE_PRIVATE */
+
+static void pgmap_radix_release(struct resource *res)
+{
+ unsigned long pgoff, order;
mutex_lock(&pgmap_lock);
- for (key = res->start; key <= res->end; key += SECTION_SIZE)
- radix_tree_delete(&pgmap_radix, key >> PA_SECTION_SHIFT);
+ foreach_order_pgoff(res, order, pgoff)
+ radix_tree_delete(&pgmap_radix, PHYS_PFN(res->start) + pgoff);
mutex_unlock(&pgmap_lock);
+
+ synchronize_rcu();
}
static unsigned long pfn_first(struct page_map *page_map)
@@ -268,7 +320,7 @@ struct dev_pagemap *find_dev_pagemap(resource_size_t phys)
WARN_ON_ONCE(!rcu_read_lock_held());
- page_map = radix_tree_lookup(&pgmap_radix, phys >> PA_SECTION_SHIFT);
+ page_map = radix_tree_lookup(&pgmap_radix, PHYS_PFN(phys));
return page_map ? &page_map->pgmap : NULL;
}
@@ -293,12 +345,12 @@ struct dev_pagemap *find_dev_pagemap(resource_size_t phys)
void *devm_memremap_pages(struct device *dev, struct resource *res,
struct percpu_ref *ref, struct vmem_altmap *altmap)
{
- resource_size_t key, align_start, align_size, align_end;
+ resource_size_t align_start, align_size, align_end;
+ unsigned long pfn, pgoff, order;
pgprot_t pgprot = PAGE_KERNEL;
struct dev_pagemap *pgmap;
struct page_map *page_map;
- int error, nid, is_ram;
- unsigned long pfn;
+ int error, nid, is_ram, i = 0;
align_start = res->start & ~(SECTION_SIZE - 1);
align_size = ALIGN(res->start + resource_size(res), SECTION_SIZE)
@@ -333,15 +385,20 @@ void *devm_memremap_pages(struct device *dev, struct resource *res,
}
pgmap->ref = ref;
pgmap->res = &page_map->res;
+ pgmap->type = MEMORY_DEVICE_HOST;
+ pgmap->page_fault = NULL;
+ pgmap->page_free = NULL;
+ pgmap->data = NULL;
mutex_lock(&pgmap_lock);
error = 0;
align_end = align_start + align_size - 1;
- for (key = align_start; key <= align_end; key += SECTION_SIZE) {
+
+ foreach_order_pgoff(res, order, pgoff) {
struct dev_pagemap *dup;
rcu_read_lock();
- dup = find_dev_pagemap(key);
+ dup = find_dev_pagemap(res->start + PFN_PHYS(pgoff));
rcu_read_unlock();
if (dup) {
dev_err(dev, "%s: %pr collides with mapping for %s\n",
@@ -349,8 +406,8 @@ void *devm_memremap_pages(struct device *dev, struct resource *res,
error = -EBUSY;
break;
}
- error = radix_tree_insert(&pgmap_radix, key >> PA_SECTION_SHIFT,
- page_map);
+ error = __radix_tree_insert(&pgmap_radix,
+ PHYS_PFN(res->start) + pgoff, order, page_map);
if (error) {
dev_err(dev, "%s: failed: %d\n", __func__, error);
break;
@@ -391,6 +448,8 @@ void *devm_memremap_pages(struct device *dev, struct resource *res,
list_del(&page->lru);
page->pgmap = pgmap;
percpu_ref_get(ref);
+ if (!(++i % 1024))
+ cond_resched();
}
devres_add(dev, page_map);
return __va(res->start);
@@ -442,3 +501,28 @@ struct vmem_altmap *to_vmem_altmap(unsigned long memmap_start)
return pgmap ? pgmap->altmap : NULL;
}
#endif /* CONFIG_ZONE_DEVICE */
+
+
+#if IS_ENABLED(CONFIG_DEVICE_PRIVATE) || IS_ENABLED(CONFIG_DEVICE_PUBLIC)
+void put_zone_device_private_or_public_page(struct page *page)
+{
+ int count = page_ref_dec_return(page);
+
+ /*
+ * If refcount is 1 then page is freed and refcount is stable as nobody
+ * holds a reference on the page.
+ */
+ if (count == 1) {
+ /* Clear Active bit in case of parallel mark_page_accessed */
+ __ClearPageActive(page);
+ __ClearPageWaiters(page);
+
+ page->mapping = NULL;
+ mem_cgroup_uncharge(page);
+
+ page->pgmap->page_free(page, page->pgmap->data);
+ } else if (!count)
+ __put_page(page);
+}
+EXPORT_SYMBOL(put_zone_device_private_or_public_page);
+#endif /* CONFIG_DEVICE_PRIVATE || CONFIG_DEVICE_PUBLIC */
diff --git a/kernel/module.c b/kernel/module.c
index 40f983cbea81..de66ec825992 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -2707,21 +2707,21 @@ static void add_kallsyms(struct module *mod, const struct load_info *info)
}
#endif /* CONFIG_KALLSYMS */
-static void dynamic_debug_setup(struct _ddebug *debug, unsigned int num)
+static void dynamic_debug_setup(struct module *mod, struct _ddebug *debug, unsigned int num)
{
if (!debug)
return;
#ifdef CONFIG_DYNAMIC_DEBUG
- if (ddebug_add_module(debug, num, debug->modname))
+ if (ddebug_add_module(debug, num, mod->name))
pr_err("dynamic debug error adding module: %s\n",
debug->modname);
#endif
}
-static void dynamic_debug_remove(struct _ddebug *debug)
+static void dynamic_debug_remove(struct module *mod, struct _ddebug *debug)
{
if (debug)
- ddebug_remove_module(debug->modname);
+ ddebug_remove_module(mod->name);
}
void * __weak module_alloc(unsigned long size)
@@ -3715,7 +3715,7 @@ static int load_module(struct load_info *info, const char __user *uargs,
goto free_arch_cleanup;
}
- dynamic_debug_setup(info->debug, info->num_debug);
+ dynamic_debug_setup(mod, info->debug, info->num_debug);
/* Ftrace init must be called in the MODULE_STATE_UNFORMED state */
ftrace_module_init(mod);
@@ -3779,7 +3779,7 @@ static int load_module(struct load_info *info, const char __user *uargs,
module_disable_nx(mod);
ddebug_cleanup:
- dynamic_debug_remove(info->debug);
+ dynamic_debug_remove(mod, info->debug);
synchronize_sched();
kfree(mod->args);
free_arch_cleanup:
diff --git a/kernel/params.c b/kernel/params.c
index 60b2d8101355..cc9108c2a1fd 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -224,7 +224,7 @@ char *parse_args(const char *doing,
} \
int param_get_##name(char *buffer, const struct kernel_param *kp) \
{ \
- return scnprintf(buffer, PAGE_SIZE, format, \
+ return scnprintf(buffer, PAGE_SIZE, format "\n", \
*((type *)kp->arg)); \
} \
const struct kernel_param_ops param_ops_##name = { \
@@ -236,14 +236,14 @@ char *parse_args(const char *doing,
EXPORT_SYMBOL(param_ops_##name)
-STANDARD_PARAM_DEF(byte, unsigned char, "%hhu", kstrtou8);
-STANDARD_PARAM_DEF(short, short, "%hi", kstrtos16);
-STANDARD_PARAM_DEF(ushort, unsigned short, "%hu", kstrtou16);
-STANDARD_PARAM_DEF(int, int, "%i", kstrtoint);
-STANDARD_PARAM_DEF(uint, unsigned int, "%u", kstrtouint);
-STANDARD_PARAM_DEF(long, long, "%li", kstrtol);
-STANDARD_PARAM_DEF(ulong, unsigned long, "%lu", kstrtoul);
-STANDARD_PARAM_DEF(ullong, unsigned long long, "%llu", kstrtoull);
+STANDARD_PARAM_DEF(byte, unsigned char, "%hhu", kstrtou8);
+STANDARD_PARAM_DEF(short, short, "%hi", kstrtos16);
+STANDARD_PARAM_DEF(ushort, unsigned short, "%hu", kstrtou16);
+STANDARD_PARAM_DEF(int, int, "%i", kstrtoint);
+STANDARD_PARAM_DEF(uint, unsigned int, "%u", kstrtouint);
+STANDARD_PARAM_DEF(long, long, "%li", kstrtol);
+STANDARD_PARAM_DEF(ulong, unsigned long, "%lu", kstrtoul);
+STANDARD_PARAM_DEF(ullong, unsigned long long, "%llu", kstrtoull);
int param_set_charp(const char *val, const struct kernel_param *kp)
{
@@ -270,7 +270,7 @@ EXPORT_SYMBOL(param_set_charp);
int param_get_charp(char *buffer, const struct kernel_param *kp)
{
- return scnprintf(buffer, PAGE_SIZE, "%s", *((char **)kp->arg));
+ return scnprintf(buffer, PAGE_SIZE, "%s\n", *((char **)kp->arg));
}
EXPORT_SYMBOL(param_get_charp);
@@ -301,7 +301,7 @@ EXPORT_SYMBOL(param_set_bool);
int param_get_bool(char *buffer, const struct kernel_param *kp)
{
/* Y and N chosen as being relatively non-coder friendly */
- return sprintf(buffer, "%c", *(bool *)kp->arg ? 'Y' : 'N');
+ return sprintf(buffer, "%c\n", *(bool *)kp->arg ? 'Y' : 'N');
}
EXPORT_SYMBOL(param_get_bool);
@@ -360,7 +360,7 @@ EXPORT_SYMBOL(param_set_invbool);
int param_get_invbool(char *buffer, const struct kernel_param *kp)
{
- return sprintf(buffer, "%c", (*(bool *)kp->arg) ? 'N' : 'Y');
+ return sprintf(buffer, "%c\n", (*(bool *)kp->arg) ? 'N' : 'Y');
}
EXPORT_SYMBOL(param_get_invbool);
@@ -460,8 +460,9 @@ static int param_array_get(char *buffer, const struct kernel_param *kp)
struct kernel_param p = *kp;
for (i = off = 0; i < (arr->num ? *arr->num : arr->max); i++) {
+ /* Replace \n with comma */
if (i)
- buffer[off++] = ',';
+ buffer[off - 1] = ',';
p.arg = arr->elem + arr->elemsize * i;
check_kparam_locked(p.mod);
ret = arr->ops->get(buffer + off, &p);
@@ -507,7 +508,7 @@ EXPORT_SYMBOL(param_set_copystring);
int param_get_string(char *buffer, const struct kernel_param *kp)
{
const struct kparam_string *kps = kp->str;
- return strlcpy(buffer, kps->string, kps->maxlen);
+ return scnprintf(buffer, PAGE_SIZE, "%s\n", kps->string);
}
EXPORT_SYMBOL(param_get_string);
@@ -549,10 +550,6 @@ static ssize_t param_attr_show(struct module_attribute *mattr,
kernel_param_lock(mk->mod);
count = attribute->param->ops->get(buf, attribute->param);
kernel_param_unlock(mk->mod);
- if (count > 0) {
- strcat(buf, "\n");
- ++count;
- }
return count;
}
@@ -600,7 +597,7 @@ EXPORT_SYMBOL(kernel_param_unlock);
/*
* add_sysfs_param - add a parameter to sysfs
* @mk: struct module_kobject
- * @kparam: the actual parameter definition to add to sysfs
+ * @kp: the actual parameter definition to add to sysfs
* @name: name of parameter
*
* Create a kobject if for a (per-module) parameter if mp NULL, and
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index 74a5a7255b4d..4918314893bc 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -101,6 +101,10 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns
int i;
int err;
+ err = -EINVAL;
+ if (!in_userns(parent_pid_ns->user_ns, user_ns))
+ goto out;
+
err = -ENOSPC;
if (level > MAX_PID_NS_LEVEL)
goto out;
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index e1914c7b85b1..a5c36e9c56a6 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -651,7 +651,7 @@ static int load_image_and_restore(void)
int error;
unsigned int flags;
- pr_debug("Loading hibernation image.\n");
+ pm_pr_dbg("Loading hibernation image.\n");
lock_device_hotplug();
error = create_basic_memory_bitmaps();
@@ -681,7 +681,7 @@ int hibernate(void)
bool snapshot_test = false;
if (!hibernation_available()) {
- pr_debug("Hibernation not available.\n");
+ pm_pr_dbg("Hibernation not available.\n");
return -EPERM;
}
@@ -692,6 +692,7 @@ int hibernate(void)
goto Unlock;
}
+ pr_info("hibernation entry\n");
pm_prepare_console();
error = __pm_notifier_call_chain(PM_HIBERNATION_PREPARE, -1, &nr_calls);
if (error) {
@@ -727,7 +728,7 @@ int hibernate(void)
else
flags |= SF_CRC32_MODE;
- pr_debug("Writing image.\n");
+ pm_pr_dbg("Writing image.\n");
error = swsusp_write(flags);
swsusp_free();
if (!error) {
@@ -739,7 +740,7 @@ int hibernate(void)
in_suspend = 0;
pm_restore_gfp_mask();
} else {
- pr_debug("Image restored successfully.\n");
+ pm_pr_dbg("Image restored successfully.\n");
}
Free_bitmaps:
@@ -747,7 +748,7 @@ int hibernate(void)
Thaw:
unlock_device_hotplug();
if (snapshot_test) {
- pr_debug("Checking hibernation image\n");
+ pm_pr_dbg("Checking hibernation image\n");
error = swsusp_check();
if (!error)
error = load_image_and_restore();
@@ -762,6 +763,8 @@ int hibernate(void)
atomic_inc(&snapshot_device_available);
Unlock:
unlock_system_sleep();
+ pr_info("hibernation exit\n");
+
return error;
}
@@ -811,7 +814,7 @@ static int software_resume(void)
goto Unlock;
}
- pr_debug("Checking hibernation image partition %s\n", resume_file);
+ pm_pr_dbg("Checking hibernation image partition %s\n", resume_file);
if (resume_delay) {
pr_info("Waiting %dsec before reading resume device ...\n",
@@ -853,10 +856,10 @@ static int software_resume(void)
}
Check_image:
- pr_debug("Hibernation image partition %d:%d present\n",
+ pm_pr_dbg("Hibernation image partition %d:%d present\n",
MAJOR(swsusp_resume_device), MINOR(swsusp_resume_device));
- pr_debug("Looking for hibernation image.\n");
+ pm_pr_dbg("Looking for hibernation image.\n");
error = swsusp_check();
if (error)
goto Unlock;
@@ -868,6 +871,7 @@ static int software_resume(void)
goto Unlock;
}
+ pr_info("resume from hibernation\n");
pm_prepare_console();
error = __pm_notifier_call_chain(PM_RESTORE_PREPARE, -1, &nr_calls);
if (error) {
@@ -875,7 +879,7 @@ static int software_resume(void)
goto Close_Finish;
}
- pr_debug("Preparing processes for restore.\n");
+ pm_pr_dbg("Preparing processes for restore.\n");
error = freeze_processes();
if (error)
goto Close_Finish;
@@ -884,11 +888,12 @@ static int software_resume(void)
Finish:
__pm_notifier_call_chain(PM_POST_RESTORE, nr_calls, NULL);
pm_restore_console();
+ pr_info("resume from hibernation failed (%d)\n", error);
atomic_inc(&snapshot_device_available);
/* For success case, the suspend path will release the lock */
Unlock:
mutex_unlock(&pm_mutex);
- pr_debug("Hibernation image not present or could not be loaded.\n");
+ pm_pr_dbg("Hibernation image not present or could not be loaded.\n");
return error;
Close_Finish:
swsusp_close(FMODE_READ);
@@ -1012,8 +1017,8 @@ static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr,
error = -EINVAL;
if (!error)
- pr_debug("Hibernation mode set to '%s'\n",
- hibernation_modes[mode]);
+ pm_pr_dbg("Hibernation mode set to '%s'\n",
+ hibernation_modes[mode]);
unlock_system_sleep();
return error ? error : n;
}
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 42bd800a6755..3a2ca9066583 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -150,7 +150,7 @@ static ssize_t mem_sleep_store(struct kobject *kobj, struct kobj_attribute *attr
power_attr(mem_sleep);
#endif /* CONFIG_SUSPEND */
-#ifdef CONFIG_PM_DEBUG
+#ifdef CONFIG_PM_SLEEP_DEBUG
int pm_test_level = TEST_NONE;
static const char * const pm_tests[__TEST_AFTER_LAST] = {
@@ -211,7 +211,7 @@ static ssize_t pm_test_store(struct kobject *kobj, struct kobj_attribute *attr,
}
power_attr(pm_test);
-#endif /* CONFIG_PM_DEBUG */
+#endif /* CONFIG_PM_SLEEP_DEBUG */
#ifdef CONFIG_DEBUG_FS
static char *suspend_step_name(enum suspend_stat_step step)
@@ -361,6 +361,61 @@ static ssize_t pm_wakeup_irq_show(struct kobject *kobj,
power_attr_ro(pm_wakeup_irq);
+bool pm_debug_messages_on __read_mostly;
+
+static ssize_t pm_debug_messages_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sprintf(buf, "%d\n", pm_debug_messages_on);
+}
+
+static ssize_t pm_debug_messages_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t n)
+{
+ unsigned long val;
+
+ if (kstrtoul(buf, 10, &val))
+ return -EINVAL;
+
+ if (val > 1)
+ return -EINVAL;
+
+ pm_debug_messages_on = !!val;
+ return n;
+}
+
+power_attr(pm_debug_messages);
+
+/**
+ * __pm_pr_dbg - Print a suspend debug message to the kernel log.
+ * @defer: Whether or not to use printk_deferred() to print the message.
+ * @fmt: Message format.
+ *
+ * The message will be emitted if enabled through the pm_debug_messages
+ * sysfs attribute.
+ */
+void __pm_pr_dbg(bool defer, const char *fmt, ...)
+{
+ struct va_format vaf;
+ va_list args;
+
+ if (!pm_debug_messages_on)
+ return;
+
+ va_start(args, fmt);
+
+ vaf.fmt = fmt;
+ vaf.va = &args;
+
+ if (defer)
+ printk_deferred(KERN_DEBUG "PM: %pV", &vaf);
+ else
+ printk(KERN_DEBUG "PM: %pV", &vaf);
+
+ va_end(args);
+}
+
#else /* !CONFIG_PM_SLEEP_DEBUG */
static inline void pm_print_times_init(void) {}
#endif /* CONFIG_PM_SLEEP_DEBUG */
@@ -691,12 +746,11 @@ static struct attribute * g[] = {
&wake_lock_attr.attr,
&wake_unlock_attr.attr,
#endif
-#ifdef CONFIG_PM_DEBUG
- &pm_test_attr.attr,
-#endif
#ifdef CONFIG_PM_SLEEP_DEBUG
+ &pm_test_attr.attr,
&pm_print_times_attr.attr,
&pm_wakeup_irq_attr.attr,
+ &pm_debug_messages_attr.attr,
#endif
#endif
#ifdef CONFIG_FREEZER
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 7fdc40d31b7d..1d2d761e3c25 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -192,7 +192,6 @@ extern void swsusp_show_speed(ktime_t, ktime_t, unsigned int, char *);
extern const char * const pm_labels[];
extern const char *pm_states[];
extern const char *mem_sleep_states[];
-extern suspend_state_t mem_sleep_current;
extern int suspend_devices_and_enter(suspend_state_t state);
#else /* !CONFIG_SUSPEND */
@@ -245,7 +244,11 @@ enum {
#define TEST_FIRST TEST_NONE
#define TEST_MAX (__TEST_AFTER_LAST - 1)
+#ifdef CONFIG_PM_SLEEP_DEBUG
extern int pm_test_level;
+#else
+#define pm_test_level (TEST_NONE)
+#endif
#ifdef CONFIG_SUSPEND_FREEZER
static inline int suspend_freeze_processes(void)
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 78672d324a6e..50f25cb370c6 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -20,8 +20,9 @@
#include <linux/workqueue.h>
#include <linux/kmod.h>
#include <trace/events/power.h>
+#include <linux/cpuset.h>
-/*
+/*
* Timeout for stopping processes
*/
unsigned int __read_mostly freeze_timeout_msecs = 20 * MSEC_PER_SEC;
@@ -202,6 +203,8 @@ void thaw_processes(void)
__usermodehelper_set_disable_depth(UMH_FREEZING);
thaw_workqueues();
+ cpuset_wait_for_hotplug();
+
read_lock(&tasklist_lock);
for_each_process_thread(g, p) {
/* No other threads should have PF_SUSPEND_TASK set */
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 3ecf275d7e44..3e2b4f519009 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -8,6 +8,8 @@
* This file is released under the GPLv2.
*/
+#define pr_fmt(fmt) "PM: " fmt
+
#include <linux/string.h>
#include <linux/delay.h>
#include <linux/errno.h>
@@ -33,53 +35,55 @@
#include "power.h"
const char * const pm_labels[] = {
- [PM_SUSPEND_FREEZE] = "freeze",
+ [PM_SUSPEND_TO_IDLE] = "freeze",
[PM_SUSPEND_STANDBY] = "standby",
[PM_SUSPEND_MEM] = "mem",
};
const char *pm_states[PM_SUSPEND_MAX];
static const char * const mem_sleep_labels[] = {
- [PM_SUSPEND_FREEZE] = "s2idle",
+ [PM_SUSPEND_TO_IDLE] = "s2idle",
[PM_SUSPEND_STANDBY] = "shallow",
[PM_SUSPEND_MEM] = "deep",
};
const char *mem_sleep_states[PM_SUSPEND_MAX];
-suspend_state_t mem_sleep_current = PM_SUSPEND_FREEZE;
-static suspend_state_t mem_sleep_default = PM_SUSPEND_MEM;
+suspend_state_t mem_sleep_current = PM_SUSPEND_TO_IDLE;
+suspend_state_t mem_sleep_default = PM_SUSPEND_MAX;
+suspend_state_t pm_suspend_target_state;
+EXPORT_SYMBOL_GPL(pm_suspend_target_state);
unsigned int pm_suspend_global_flags;
EXPORT_SYMBOL_GPL(pm_suspend_global_flags);
static const struct platform_suspend_ops *suspend_ops;
-static const struct platform_freeze_ops *freeze_ops;
-static DECLARE_WAIT_QUEUE_HEAD(suspend_freeze_wait_head);
+static const struct platform_s2idle_ops *s2idle_ops;
+static DECLARE_WAIT_QUEUE_HEAD(s2idle_wait_head);
-enum freeze_state __read_mostly suspend_freeze_state;
-static DEFINE_SPINLOCK(suspend_freeze_lock);
+enum s2idle_states __read_mostly s2idle_state;
+static DEFINE_SPINLOCK(s2idle_lock);
-void freeze_set_ops(const struct platform_freeze_ops *ops)
+void s2idle_set_ops(const struct platform_s2idle_ops *ops)
{
lock_system_sleep();
- freeze_ops = ops;
+ s2idle_ops = ops;
unlock_system_sleep();
}
-static void freeze_begin(void)
+static void s2idle_begin(void)
{
- suspend_freeze_state = FREEZE_STATE_NONE;
+ s2idle_state = S2IDLE_STATE_NONE;
}
-static void freeze_enter(void)
+static void s2idle_enter(void)
{
- trace_suspend_resume(TPS("machine_suspend"), PM_SUSPEND_FREEZE, true);
+ trace_suspend_resume(TPS("machine_suspend"), PM_SUSPEND_TO_IDLE, true);
- spin_lock_irq(&suspend_freeze_lock);
+ spin_lock_irq(&s2idle_lock);
if (pm_wakeup_pending())
goto out;
- suspend_freeze_state = FREEZE_STATE_ENTER;
- spin_unlock_irq(&suspend_freeze_lock);
+ s2idle_state = S2IDLE_STATE_ENTER;
+ spin_unlock_irq(&s2idle_lock);
get_online_cpus();
cpuidle_resume();
@@ -87,56 +91,75 @@ static void freeze_enter(void)
/* Push all the CPUs into the idle loop. */
wake_up_all_idle_cpus();
/* Make the current CPU wait so it can enter the idle loop too. */
- wait_event(suspend_freeze_wait_head,
- suspend_freeze_state == FREEZE_STATE_WAKE);
+ wait_event(s2idle_wait_head,
+ s2idle_state == S2IDLE_STATE_WAKE);
cpuidle_pause();
put_online_cpus();
- spin_lock_irq(&suspend_freeze_lock);
+ spin_lock_irq(&s2idle_lock);
out:
- suspend_freeze_state = FREEZE_STATE_NONE;
- spin_unlock_irq(&suspend_freeze_lock);
+ s2idle_state = S2IDLE_STATE_NONE;
+ spin_unlock_irq(&s2idle_lock);
- trace_suspend_resume(TPS("machine_suspend"), PM_SUSPEND_FREEZE, false);
+ trace_suspend_resume(TPS("machine_suspend"), PM_SUSPEND_TO_IDLE, false);
}
static void s2idle_loop(void)
{
- pr_debug("PM: suspend-to-idle\n");
+ pm_pr_dbg("suspend-to-idle\n");
+
+ for (;;) {
+ int error;
+
+ dpm_noirq_begin();
+
+ /*
+ * Suspend-to-idle equals
+ * frozen processes + suspended devices + idle processors.
+ * Thus s2idle_enter() should be called right after
+ * all devices have been suspended.
+ */
+ error = dpm_noirq_suspend_devices(PMSG_SUSPEND);
+ if (!error)
+ s2idle_enter();
+
+ dpm_noirq_resume_devices(PMSG_RESUME);
+ if (error && (error != -EBUSY || !pm_wakeup_pending())) {
+ dpm_noirq_end();
+ break;
+ }
- do {
- freeze_enter();
+ if (s2idle_ops && s2idle_ops->wake)
+ s2idle_ops->wake();
- if (freeze_ops && freeze_ops->wake)
- freeze_ops->wake();
+ dpm_noirq_end();
- dpm_resume_noirq(PMSG_RESUME);
- if (freeze_ops && freeze_ops->sync)
- freeze_ops->sync();
+ if (s2idle_ops && s2idle_ops->sync)
+ s2idle_ops->sync();
if (pm_wakeup_pending())
break;
pm_wakeup_clear(false);
- } while (!dpm_suspend_noirq(PMSG_SUSPEND));
+ }
- pr_debug("PM: resume from suspend-to-idle\n");
+ pm_pr_dbg("resume from suspend-to-idle\n");
}
-void freeze_wake(void)
+void s2idle_wake(void)
{
unsigned long flags;
- spin_lock_irqsave(&suspend_freeze_lock, flags);
- if (suspend_freeze_state > FREEZE_STATE_NONE) {
- suspend_freeze_state = FREEZE_STATE_WAKE;
- wake_up(&suspend_freeze_wait_head);
+ spin_lock_irqsave(&s2idle_lock, flags);
+ if (s2idle_state > S2IDLE_STATE_NONE) {
+ s2idle_state = S2IDLE_STATE_WAKE;
+ wake_up(&s2idle_wait_head);
}
- spin_unlock_irqrestore(&suspend_freeze_lock, flags);
+ spin_unlock_irqrestore(&s2idle_lock, flags);
}
-EXPORT_SYMBOL_GPL(freeze_wake);
+EXPORT_SYMBOL_GPL(s2idle_wake);
static bool valid_state(suspend_state_t state)
{
@@ -152,19 +175,19 @@ void __init pm_states_init(void)
{
/* "mem" and "freeze" are always present in /sys/power/state. */
pm_states[PM_SUSPEND_MEM] = pm_labels[PM_SUSPEND_MEM];
- pm_states[PM_SUSPEND_FREEZE] = pm_labels[PM_SUSPEND_FREEZE];
+ pm_states[PM_SUSPEND_TO_IDLE] = pm_labels[PM_SUSPEND_TO_IDLE];
/*
* Suspend-to-idle should be supported even without any suspend_ops,
* initialize mem_sleep_states[] accordingly here.
*/
- mem_sleep_states[PM_SUSPEND_FREEZE] = mem_sleep_labels[PM_SUSPEND_FREEZE];
+ mem_sleep_states[PM_SUSPEND_TO_IDLE] = mem_sleep_labels[PM_SUSPEND_TO_IDLE];
}
static int __init mem_sleep_default_setup(char *str)
{
suspend_state_t state;
- for (state = PM_SUSPEND_FREEZE; state <= PM_SUSPEND_MEM; state++)
+ for (state = PM_SUSPEND_TO_IDLE; state <= PM_SUSPEND_MEM; state++)
if (mem_sleep_labels[state] &&
!strcmp(str, mem_sleep_labels[state])) {
mem_sleep_default = state;
@@ -193,7 +216,7 @@ void suspend_set_ops(const struct platform_suspend_ops *ops)
}
if (valid_state(PM_SUSPEND_MEM)) {
mem_sleep_states[PM_SUSPEND_MEM] = mem_sleep_labels[PM_SUSPEND_MEM];
- if (mem_sleep_default == PM_SUSPEND_MEM)
+ if (mem_sleep_default >= PM_SUSPEND_MEM)
mem_sleep_current = PM_SUSPEND_MEM;
}
@@ -216,49 +239,49 @@ EXPORT_SYMBOL_GPL(suspend_valid_only_mem);
static bool sleep_state_supported(suspend_state_t state)
{
- return state == PM_SUSPEND_FREEZE || (suspend_ops && suspend_ops->enter);
+ return state == PM_SUSPEND_TO_IDLE || (suspend_ops && suspend_ops->enter);
}
static int platform_suspend_prepare(suspend_state_t state)
{
- return state != PM_SUSPEND_FREEZE && suspend_ops->prepare ?
+ return state != PM_SUSPEND_TO_IDLE && suspend_ops->prepare ?
suspend_ops->prepare() : 0;
}
static int platform_suspend_prepare_late(suspend_state_t state)
{
- return state == PM_SUSPEND_FREEZE && freeze_ops && freeze_ops->prepare ?
- freeze_ops->prepare() : 0;
+ return state == PM_SUSPEND_TO_IDLE && s2idle_ops && s2idle_ops->prepare ?
+ s2idle_ops->prepare() : 0;
}
static int platform_suspend_prepare_noirq(suspend_state_t state)
{
- return state != PM_SUSPEND_FREEZE && suspend_ops->prepare_late ?
+ return state != PM_SUSPEND_TO_IDLE && suspend_ops->prepare_late ?
suspend_ops->prepare_late() : 0;
}
static void platform_resume_noirq(suspend_state_t state)
{
- if (state != PM_SUSPEND_FREEZE && suspend_ops->wake)
+ if (state != PM_SUSPEND_TO_IDLE && suspend_ops->wake)
suspend_ops->wake();
}
static void platform_resume_early(suspend_state_t state)
{
- if (state == PM_SUSPEND_FREEZE && freeze_ops && freeze_ops->restore)
- freeze_ops->restore();
+ if (state == PM_SUSPEND_TO_IDLE && s2idle_ops && s2idle_ops->restore)
+ s2idle_ops->restore();
}
static void platform_resume_finish(suspend_state_t state)
{
- if (state != PM_SUSPEND_FREEZE && suspend_ops->finish)
+ if (state != PM_SUSPEND_TO_IDLE && suspend_ops->finish)
suspend_ops->finish();
}
static int platform_suspend_begin(suspend_state_t state)
{
- if (state == PM_SUSPEND_FREEZE && freeze_ops && freeze_ops->begin)
- return freeze_ops->begin();
+ if (state == PM_SUSPEND_TO_IDLE && s2idle_ops && s2idle_ops->begin)
+ return s2idle_ops->begin();
else if (suspend_ops && suspend_ops->begin)
return suspend_ops->begin(state);
else
@@ -267,21 +290,21 @@ static int platform_suspend_begin(suspend_state_t state)
static void platform_resume_end(suspend_state_t state)
{
- if (state == PM_SUSPEND_FREEZE && freeze_ops && freeze_ops->end)
- freeze_ops->end();
+ if (state == PM_SUSPEND_TO_IDLE && s2idle_ops && s2idle_ops->end)
+ s2idle_ops->end();
else if (suspend_ops && suspend_ops->end)
suspend_ops->end();
}
static void platform_recover(suspend_state_t state)
{
- if (state != PM_SUSPEND_FREEZE && suspend_ops->recover)
+ if (state != PM_SUSPEND_TO_IDLE && suspend_ops->recover)
suspend_ops->recover();
}
static bool platform_suspend_again(suspend_state_t state)
{
- return state != PM_SUSPEND_FREEZE && suspend_ops->suspend_again ?
+ return state != PM_SUSPEND_TO_IDLE && suspend_ops->suspend_again ?
suspend_ops->suspend_again() : false;
}
@@ -370,16 +393,21 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
error = dpm_suspend_late(PMSG_SUSPEND);
if (error) {
- pr_err("PM: late suspend of devices failed\n");
+ pr_err("late suspend of devices failed\n");
goto Platform_finish;
}
error = platform_suspend_prepare_late(state);
if (error)
goto Devices_early_resume;
+ if (state == PM_SUSPEND_TO_IDLE && pm_test_level != TEST_PLATFORM) {
+ s2idle_loop();
+ goto Platform_early_resume;
+ }
+
error = dpm_suspend_noirq(PMSG_SUSPEND);
if (error) {
- pr_err("PM: noirq suspend of devices failed\n");
+ pr_err("noirq suspend of devices failed\n");
goto Platform_early_resume;
}
error = platform_suspend_prepare_noirq(state);
@@ -389,17 +417,6 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
if (suspend_test(TEST_PLATFORM))
goto Platform_wake;
- /*
- * PM_SUSPEND_FREEZE equals
- * frozen processes + suspended devices + idle processors.
- * Thus we should invoke freeze_enter() soon after
- * all the devices are suspended.
- */
- if (state == PM_SUSPEND_FREEZE) {
- s2idle_loop();
- goto Platform_early_resume;
- }
-
error = disable_nonboot_cpus();
if (error || suspend_test(TEST_CPUS))
goto Enable_cpus;
@@ -456,6 +473,8 @@ int suspend_devices_and_enter(suspend_state_t state)
if (!sleep_state_supported(state))
return -ENOSYS;
+ pm_suspend_target_state = state;
+
error = platform_suspend_begin(state);
if (error)
goto Close;
@@ -464,7 +483,7 @@ int suspend_devices_and_enter(suspend_state_t state)
suspend_test_start();
error = dpm_suspend_start(PMSG_SUSPEND);
if (error) {
- pr_err("PM: Some devices failed to suspend, or early wake event detected\n");
+ pr_err("Some devices failed to suspend, or early wake event detected\n");
goto Recover_platform;
}
suspend_test_finish("suspend devices");
@@ -485,6 +504,7 @@ int suspend_devices_and_enter(suspend_state_t state)
Close:
platform_resume_end(state);
+ pm_suspend_target_state = PM_SUSPEND_ON;
return error;
Recover_platform:
@@ -518,10 +538,10 @@ static int enter_state(suspend_state_t state)
int error;
trace_suspend_resume(TPS("suspend_enter"), state, true);
- if (state == PM_SUSPEND_FREEZE) {
+ if (state == PM_SUSPEND_TO_IDLE) {
#ifdef CONFIG_PM_DEBUG
if (pm_test_level != TEST_NONE && pm_test_level <= TEST_CPUS) {
- pr_warn("PM: Unsupported test mode for suspend to idle, please choose none/freezer/devices/platform.\n");
+ pr_warn("Unsupported test mode for suspend to idle, please choose none/freezer/devices/platform.\n");
return -EAGAIN;
}
#endif
@@ -531,18 +551,18 @@ static int enter_state(suspend_state_t state)
if (!mutex_trylock(&pm_mutex))
return -EBUSY;
- if (state == PM_SUSPEND_FREEZE)
- freeze_begin();
+ if (state == PM_SUSPEND_TO_IDLE)
+ s2idle_begin();
#ifndef CONFIG_SUSPEND_SKIP_SYNC
trace_suspend_resume(TPS("sync_filesystems"), 0, true);
- pr_info("PM: Syncing filesystems ... ");
+ pr_info("Syncing filesystems ... ");
sys_sync();
pr_cont("done.\n");
trace_suspend_resume(TPS("sync_filesystems"), 0, false);
#endif
- pr_debug("PM: Preparing system for sleep (%s)\n", pm_states[state]);
+ pm_pr_dbg("Preparing system for sleep (%s)\n", mem_sleep_labels[state]);
pm_suspend_clear_flags();
error = suspend_prepare(state);
if (error)
@@ -552,13 +572,13 @@ static int enter_state(suspend_state_t state)
goto Finish;
trace_suspend_resume(TPS("suspend_enter"), state, false);
- pr_debug("PM: Suspending system (%s)\n", pm_states[state]);
+ pm_pr_dbg("Suspending system (%s)\n", mem_sleep_labels[state]);
pm_restrict_gfp_mask();
error = suspend_devices_and_enter(state);
pm_restore_gfp_mask();
Finish:
- pr_debug("PM: Finishing wakeup.\n");
+ pm_pr_dbg("Finishing wakeup.\n");
suspend_finish();
Unlock:
mutex_unlock(&pm_mutex);
@@ -579,6 +599,7 @@ int pm_suspend(suspend_state_t state)
if (state <= PM_SUSPEND_ON || state >= PM_SUSPEND_MAX)
return -EINVAL;
+ pr_info("suspend entry (%s)\n", mem_sleep_labels[state]);
error = enter_state(state);
if (error) {
suspend_stats.fail++;
@@ -586,6 +607,7 @@ int pm_suspend(suspend_state_t state)
} else {
suspend_stats.success++;
}
+ pr_info("suspend exit\n");
return error;
}
EXPORT_SYMBOL(pm_suspend);
diff --git a/kernel/power/suspend_test.c b/kernel/power/suspend_test.c
index 5db217051232..6a897e8b2a88 100644
--- a/kernel/power/suspend_test.c
+++ b/kernel/power/suspend_test.c
@@ -104,9 +104,9 @@ repeat:
printk(info_test, pm_states[state]);
status = pm_suspend(state);
if (status < 0)
- state = PM_SUSPEND_FREEZE;
+ state = PM_SUSPEND_TO_IDLE;
}
- if (state == PM_SUSPEND_FREEZE) {
+ if (state == PM_SUSPEND_TO_IDLE) {
printk(info_test, pm_states[state]);
status = pm_suspend(state);
}
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 57d22571f306..d7cdc426ee38 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -242,8 +242,7 @@ static void hib_end_io(struct bio *bio)
if (bio->bi_status) {
printk(KERN_ALERT "Read-error on swap-device (%u:%u:%Lu)\n",
- imajor(bio->bi_bdev->bd_inode),
- iminor(bio->bi_bdev->bd_inode),
+ MAJOR(bio_dev(bio)), MINOR(bio_dev(bio)),
(unsigned long long)bio->bi_iter.bi_sector);
}
@@ -270,7 +269,7 @@ static int hib_submit_io(int op, int op_flags, pgoff_t page_off, void *addr,
bio = bio_alloc(__GFP_RECLAIM | __GFP_HIGH, 1);
bio->bi_iter.bi_sector = page_off * (PAGE_SIZE >> 9);
- bio->bi_bdev = hib_resume_bdev;
+ bio_set_dev(bio, hib_resume_bdev);
bio_set_op_attrs(bio, op, op_flags);
if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) {
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index fc47863f629c..512f7c2baedd 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -649,7 +649,7 @@ static int syslog_action_restricted(int type)
type != SYSLOG_ACTION_SIZE_BUFFER;
}
-int check_syslog_permissions(int type, int source)
+static int check_syslog_permissions(int type, int source)
{
/*
* If this is from /proc/kmsg and we've already opened it, then we've
@@ -677,7 +677,6 @@ int check_syslog_permissions(int type, int source)
ok:
return security_syslog(type);
}
-EXPORT_SYMBOL_GPL(check_syslog_permissions);
static void append_char(char **pp, char *e, char c)
{
@@ -1435,7 +1434,7 @@ int do_syslog(int type, char __user *buf, int len, int source)
error = check_syslog_permissions(type, source);
if (error)
- goto out;
+ return error;
switch (type) {
case SYSLOG_ACTION_CLOSE: /* Close log */
@@ -1443,20 +1442,16 @@ int do_syslog(int type, char __user *buf, int len, int source)
case SYSLOG_ACTION_OPEN: /* Open log */
break;
case SYSLOG_ACTION_READ: /* Read from log */
- error = -EINVAL;
if (!buf || len < 0)
- goto out;
- error = 0;
+ return -EINVAL;
if (!len)
- goto out;
- if (!access_ok(VERIFY_WRITE, buf, len)) {
- error = -EFAULT;
- goto out;
- }
+ return 0;
+ if (!access_ok(VERIFY_WRITE, buf, len))
+ return -EFAULT;
error = wait_event_interruptible(log_wait,
syslog_seq != log_next_seq);
if (error)
- goto out;
+ return error;
error = syslog_print(buf, len);
break;
/* Read/clear last kernel messages */
@@ -1465,16 +1460,12 @@ int do_syslog(int type, char __user *buf, int len, int source)
/* FALL THRU */
/* Read last kernel messages */
case SYSLOG_ACTION_READ_ALL:
- error = -EINVAL;
if (!buf || len < 0)
- goto out;
- error = 0;
+ return -EINVAL;
if (!len)
- goto out;
- if (!access_ok(VERIFY_WRITE, buf, len)) {
- error = -EFAULT;
- goto out;
- }
+ return 0;
+ if (!access_ok(VERIFY_WRITE, buf, len))
+ return -EFAULT;
error = syslog_print_all(buf, len, clear);
break;
/* Clear ring buffer */
@@ -1496,15 +1487,13 @@ int do_syslog(int type, char __user *buf, int len, int source)
break;
/* Set level of messages printed to console */
case SYSLOG_ACTION_CONSOLE_LEVEL:
- error = -EINVAL;
if (len < 1 || len > 8)
- goto out;
+ return -EINVAL;
if (len < minimum_console_loglevel)
len = minimum_console_loglevel;
console_loglevel = len;
/* Implicitly re-enable logging to console */
saved_console_loglevel = LOGLEVEL_DEFAULT;
- error = 0;
break;
/* Number of chars in the log buffer */
case SYSLOG_ACTION_SIZE_UNREAD:
@@ -1526,7 +1515,6 @@ int do_syslog(int type, char __user *buf, int len, int source)
u64 seq = syslog_seq;
u32 idx = syslog_idx;
- error = 0;
while (seq < log_next_seq) {
struct printk_log *msg = log_from_idx(idx);
@@ -1546,7 +1534,7 @@ int do_syslog(int type, char __user *buf, int len, int source)
error = -EINVAL;
break;
}
-out:
+
return error;
}
@@ -1698,10 +1686,10 @@ asmlinkage int vprintk_emit(int facility, int level,
{
static char textbuf[LOG_LINE_MAX];
char *text = textbuf;
- size_t text_len = 0;
+ size_t text_len;
enum log_flags lflags = 0;
unsigned long flags;
- int printed_len = 0;
+ int printed_len;
bool in_sched = false;
if (level == LOGLEVEL_SCHED) {
@@ -1754,7 +1742,7 @@ asmlinkage int vprintk_emit(int facility, int level,
if (dict)
lflags |= LOG_PREFIX|LOG_NEWLINE;
- printed_len += log_output(facility, level, lflags, dict, dictlen, text, text_len);
+ printed_len = log_output(facility, level, lflags, dict, dictlen, text, text_len);
logbuf_unlock_irqrestore(flags);
@@ -2650,9 +2638,8 @@ void __init console_init(void)
* makes it difficult to diagnose problems that occur during this time.
*
* To mitigate this problem somewhat, only unregister consoles whose memory
- * intersects with the init section. Note that code exists elsewhere to get
- * rid of the boot console as soon as the proper console shows up, so there
- * won't be side-effects from postponing the removal.
+ * intersects with the init section. Note that all other boot consoles will
+ * get unregistered when the real preferred console is registered.
*/
static int __init printk_late_init(void)
{
@@ -2660,16 +2647,23 @@ static int __init printk_late_init(void)
int ret;
for_each_console(con) {
- if (!keep_bootcon && con->flags & CON_BOOT) {
+ if (!(con->flags & CON_BOOT))
+ continue;
+
+ /* Check addresses that might be used for enabled consoles. */
+ if (init_section_intersects(con, sizeof(*con)) ||
+ init_section_contains(con->write, 0) ||
+ init_section_contains(con->read, 0) ||
+ init_section_contains(con->device, 0) ||
+ init_section_contains(con->unblank, 0) ||
+ init_section_contains(con->data, 0)) {
/*
- * Make sure to unregister boot consoles whose data
- * resides in the init section before the init section
- * is discarded. Boot consoles whose data will stick
- * around will automatically be unregistered when the
- * proper console replaces them.
+ * Please, consider moving the reported consoles out
+ * of the init section.
*/
- if (init_section_intersects(con, sizeof(*con)))
- unregister_console(con);
+ pr_warn("bootconsole [%s%d] uses init memory and must be disabled even before the real one is ready\n",
+ con->name, con->index);
+ unregister_console(con);
}
}
ret = cpuhp_setup_state_nocalls(CPUHP_PRINTK_DEAD, "printk:dead", NULL,
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 60f356d91060..84b1367935e4 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -728,8 +728,7 @@ static int ptrace_peek_siginfo(struct task_struct *child,
if (unlikely(in_compat_syscall())) {
compat_siginfo_t __user *uinfo = compat_ptr(data);
- if (copy_siginfo_to_user32(uinfo, &info) ||
- __put_user(info.si_code, &uinfo->si_code)) {
+ if (copy_siginfo_to_user32(uinfo, &info)) {
ret = -EFAULT;
break;
}
@@ -739,8 +738,7 @@ static int ptrace_peek_siginfo(struct task_struct *child,
{
siginfo_t __user *uinfo = (siginfo_t __user *) data;
- if (copy_siginfo_to_user(uinfo, &info) ||
- __put_user(info.si_code, &uinfo->si_code)) {
+ if (copy_siginfo_to_user(uinfo, &info)) {
ret = -EFAULT;
break;
}
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 84fe96641b2e..b0ad62b0e7b8 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -882,6 +882,11 @@ void rcu_irq_exit(void)
RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_irq_exit() invoked with irqs enabled!!!");
rdtp = this_cpu_ptr(&rcu_dynticks);
+
+ /* Page faults can happen in NMI handlers, so check... */
+ if (rdtp->dynticks_nmi_nesting)
+ return;
+
WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
rdtp->dynticks_nesting < 1);
if (rdtp->dynticks_nesting <= 1) {
@@ -1015,6 +1020,11 @@ void rcu_irq_enter(void)
RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_irq_enter() invoked with irqs enabled!!!");
rdtp = this_cpu_ptr(&rcu_dynticks);
+
+ /* Page faults can happen in NMI handlers, so check... */
+ if (rdtp->dynticks_nmi_nesting)
+ return;
+
oldval = rdtp->dynticks_nesting;
rdtp->dynticks_nesting++;
WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
@@ -4091,7 +4101,7 @@ static void __init rcu_init_geometry(void)
if (rcu_fanout_leaf == RCU_FANOUT_LEAF &&
nr_cpu_ids == NR_CPUS)
return;
- pr_info("RCU: Adjusting geometry for rcu_fanout_leaf=%d, nr_cpu_ids=%d\n",
+ pr_info("RCU: Adjusting geometry for rcu_fanout_leaf=%d, nr_cpu_ids=%u\n",
rcu_fanout_leaf, nr_cpu_ids);
/*
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 55bde94b9572..e012b9be777e 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -89,7 +89,7 @@ static void __init rcu_bootup_announce_oddness(void)
if (rcu_fanout_leaf != RCU_FANOUT_LEAF)
pr_info("\tBoot-time adjustment of leaf fanout to %d.\n", rcu_fanout_leaf);
if (nr_cpu_ids != NR_CPUS)
- pr_info("\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%d.\n", NR_CPUS, nr_cpu_ids);
+ pr_info("\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%u.\n", NR_CPUS, nr_cpu_ids);
#ifdef CONFIG_RCU_BOOST
pr_info("\tRCU priority boosting: priority %d delay %d ms.\n", kthread_prio, CONFIG_RCU_BOOST_DELAY);
#endif
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 6d2c7ff9ba98..d17c5da523a0 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1173,6 +1173,10 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) ||
lockdep_is_held(&task_rq(p)->lock)));
#endif
+ /*
+ * Clearly, migrating tasks to offline CPUs is a fairly daft thing.
+ */
+ WARN_ON_ONCE(!cpu_online(new_cpu));
#endif
trace_sched_migrate_task(p, new_cpu);
@@ -5162,6 +5166,28 @@ void sched_show_task(struct task_struct *p)
put_task_stack(p);
}
+static inline bool
+state_filter_match(unsigned long state_filter, struct task_struct *p)
+{
+ /* no filter, everything matches */
+ if (!state_filter)
+ return true;
+
+ /* filter, but doesn't match */
+ if (!(p->state & state_filter))
+ return false;
+
+ /*
+ * When looking for TASK_UNINTERRUPTIBLE skip TASK_IDLE (allows
+ * TASK_KILLABLE).
+ */
+ if (state_filter == TASK_UNINTERRUPTIBLE && p->state == TASK_IDLE)
+ return false;
+
+ return true;
+}
+
+
void show_state_filter(unsigned long state_filter)
{
struct task_struct *g, *p;
@@ -5184,7 +5210,7 @@ void show_state_filter(unsigned long state_filter)
*/
touch_nmi_watchdog();
touch_all_softlockup_watchdogs();
- if (!state_filter || (p->state & state_filter))
+ if (state_filter_match(state_filter, p))
sched_show_task(p);
}
@@ -5556,16 +5582,15 @@ static void cpuset_cpu_active(void)
* operation in the resume sequence, just build a single sched
* domain, ignoring cpusets.
*/
- num_cpus_frozen--;
- if (likely(num_cpus_frozen)) {
- partition_sched_domains(1, NULL, NULL);
+ partition_sched_domains(1, NULL, NULL);
+ if (--num_cpus_frozen)
return;
- }
/*
* This is the last CPU online operation. So fall through and
* restore the original sched domains by considering the
* cpuset configurations.
*/
+ cpuset_force_rebuild();
}
cpuset_update_active_cpus();
}
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 29a397067ffa..9209d83ecdcf 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -52,9 +52,11 @@ struct sugov_policy {
struct sugov_cpu {
struct update_util_data update_util;
struct sugov_policy *sg_policy;
+ unsigned int cpu;
- unsigned long iowait_boost;
- unsigned long iowait_boost_max;
+ bool iowait_boost_pending;
+ unsigned int iowait_boost;
+ unsigned int iowait_boost_max;
u64 last_update;
/* The fields below are only needed when sharing a policy. */
@@ -76,6 +78,26 @@ static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time)
{
s64 delta_ns;
+ /*
+ * Since cpufreq_update_util() is called with rq->lock held for
+ * the @target_cpu, our per-cpu data is fully serialized.
+ *
+ * However, drivers cannot in general deal with cross-cpu
+ * requests, so while get_next_freq() will work, our
+ * sugov_update_commit() call may not for the fast switching platforms.
+ *
+ * Hence stop here for remote requests if they aren't supported
+ * by the hardware, as calculating the frequency is pointless if
+ * we cannot in fact act on it.
+ *
+ * For the slow switching platforms, the kthread is always scheduled on
+ * the right set of CPUs and any CPU can find the next frequency and
+ * schedule the kthread.
+ */
+ if (sg_policy->policy->fast_switch_enabled &&
+ !cpufreq_can_do_remote_dvfs(sg_policy->policy))
+ return false;
+
if (sg_policy->work_in_progress)
return false;
@@ -106,7 +128,7 @@ static void sugov_update_commit(struct sugov_policy *sg_policy, u64 time,
if (policy->fast_switch_enabled) {
next_freq = cpufreq_driver_fast_switch(policy, next_freq);
- if (next_freq == CPUFREQ_ENTRY_INVALID)
+ if (!next_freq)
return;
policy->cur = next_freq;
@@ -154,12 +176,12 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy,
return cpufreq_driver_resolve_freq(policy, freq);
}
-static void sugov_get_util(unsigned long *util, unsigned long *max)
+static void sugov_get_util(unsigned long *util, unsigned long *max, int cpu)
{
- struct rq *rq = this_rq();
+ struct rq *rq = cpu_rq(cpu);
unsigned long cfs_max;
- cfs_max = arch_scale_cpu_capacity(NULL, smp_processor_id());
+ cfs_max = arch_scale_cpu_capacity(NULL, cpu);
*util = min(rq->cfs.avg.util_avg, cfs_max);
*max = cfs_max;
@@ -169,30 +191,54 @@ static void sugov_set_iowait_boost(struct sugov_cpu *sg_cpu, u64 time,
unsigned int flags)
{
if (flags & SCHED_CPUFREQ_IOWAIT) {
- sg_cpu->iowait_boost = sg_cpu->iowait_boost_max;
+ if (sg_cpu->iowait_boost_pending)
+ return;
+
+ sg_cpu->iowait_boost_pending = true;
+
+ if (sg_cpu->iowait_boost) {
+ sg_cpu->iowait_boost <<= 1;
+ if (sg_cpu->iowait_boost > sg_cpu->iowait_boost_max)
+ sg_cpu->iowait_boost = sg_cpu->iowait_boost_max;
+ } else {
+ sg_cpu->iowait_boost = sg_cpu->sg_policy->policy->min;
+ }
} else if (sg_cpu->iowait_boost) {
s64 delta_ns = time - sg_cpu->last_update;
/* Clear iowait_boost if the CPU apprears to have been idle. */
- if (delta_ns > TICK_NSEC)
+ if (delta_ns > TICK_NSEC) {
sg_cpu->iowait_boost = 0;
+ sg_cpu->iowait_boost_pending = false;
+ }
}
}
static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, unsigned long *util,
unsigned long *max)
{
- unsigned long boost_util = sg_cpu->iowait_boost;
- unsigned long boost_max = sg_cpu->iowait_boost_max;
+ unsigned int boost_util, boost_max;
- if (!boost_util)
+ if (!sg_cpu->iowait_boost)
return;
+ if (sg_cpu->iowait_boost_pending) {
+ sg_cpu->iowait_boost_pending = false;
+ } else {
+ sg_cpu->iowait_boost >>= 1;
+ if (sg_cpu->iowait_boost < sg_cpu->sg_policy->policy->min) {
+ sg_cpu->iowait_boost = 0;
+ return;
+ }
+ }
+
+ boost_util = sg_cpu->iowait_boost;
+ boost_max = sg_cpu->iowait_boost_max;
+
if (*util * boost_max < *max * boost_util) {
*util = boost_util;
*max = boost_max;
}
- sg_cpu->iowait_boost >>= 1;
}
#ifdef CONFIG_NO_HZ_COMMON
@@ -229,7 +275,7 @@ static void sugov_update_single(struct update_util_data *hook, u64 time,
if (flags & SCHED_CPUFREQ_RT_DL) {
next_f = policy->cpuinfo.max_freq;
} else {
- sugov_get_util(&util, &max);
+ sugov_get_util(&util, &max, sg_cpu->cpu);
sugov_iowait_boost(sg_cpu, &util, &max);
next_f = get_next_freq(sg_policy, util, max);
/*
@@ -264,6 +310,7 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time)
delta_ns = time - j_sg_cpu->last_update;
if (delta_ns > TICK_NSEC) {
j_sg_cpu->iowait_boost = 0;
+ j_sg_cpu->iowait_boost_pending = false;
continue;
}
if (j_sg_cpu->flags & SCHED_CPUFREQ_RT_DL)
@@ -290,7 +337,7 @@ static void sugov_update_shared(struct update_util_data *hook, u64 time,
unsigned long util, max;
unsigned int next_f;
- sugov_get_util(&util, &max);
+ sugov_get_util(&util, &max, sg_cpu->cpu);
raw_spin_lock(&sg_policy->update_lock);
@@ -445,7 +492,11 @@ static int sugov_kthread_create(struct sugov_policy *sg_policy)
}
sg_policy->thread = thread;
- kthread_bind_mask(thread, policy->related_cpus);
+
+ /* Kthread is bound to all CPUs by default */
+ if (!policy->dvfs_possible_from_any_cpu)
+ kthread_bind_mask(thread, policy->related_cpus);
+
init_irq_work(&sg_policy->irq_work, sugov_irq_work);
mutex_init(&sg_policy->work_lock);
@@ -528,16 +579,7 @@ static int sugov_init(struct cpufreq_policy *policy)
goto stop_kthread;
}
- if (policy->transition_delay_us) {
- tunables->rate_limit_us = policy->transition_delay_us;
- } else {
- unsigned int lat;
-
- tunables->rate_limit_us = LATENCY_MULTIPLIER;
- lat = policy->cpuinfo.transition_latency / NSEC_PER_USEC;
- if (lat)
- tunables->rate_limit_us *= lat;
- }
+ tunables->rate_limit_us = cpufreq_policy_transition_delay_us(policy);
policy->governor_data = sg_policy;
sg_policy->tunables = tunables;
@@ -655,6 +697,7 @@ static void sugov_limits(struct cpufreq_policy *policy)
static struct cpufreq_governor schedutil_gov = {
.name = "schedutil",
.owner = THIS_MODULE,
+ .dynamic_switching = true,
.init = sugov_init,
.exit = sugov_exit,
.start = sugov_start,
@@ -671,6 +714,11 @@ struct cpufreq_governor *cpufreq_default_governor(void)
static int __init sugov_register(void)
{
+ int cpu;
+
+ for_each_possible_cpu(cpu)
+ per_cpu(sugov_cpu, cpu).cpu = cpu;
+
return cpufreq_register_governor(&schedutil_gov);
}
fs_initcall(sugov_register);
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index d05bd9457a40..0191ec7667c3 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -296,7 +296,7 @@ static inline int is_leftmost(struct task_struct *p, struct dl_rq *dl_rq)
{
struct sched_dl_entity *dl_se = &p->dl;
- return dl_rq->rb_leftmost == &dl_se->rb_node;
+ return dl_rq->root.rb_leftmost == &dl_se->rb_node;
}
void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime)
@@ -320,7 +320,7 @@ void init_dl_bw(struct dl_bw *dl_b)
void init_dl_rq(struct dl_rq *dl_rq)
{
- dl_rq->rb_root = RB_ROOT;
+ dl_rq->root = RB_ROOT_CACHED;
#ifdef CONFIG_SMP
/* zero means no -deadline tasks */
@@ -328,7 +328,7 @@ void init_dl_rq(struct dl_rq *dl_rq)
dl_rq->dl_nr_migratory = 0;
dl_rq->overloaded = 0;
- dl_rq->pushable_dl_tasks_root = RB_ROOT;
+ dl_rq->pushable_dl_tasks_root = RB_ROOT_CACHED;
#else
init_dl_bw(&dl_rq->dl_bw);
#endif
@@ -410,10 +410,10 @@ static void dec_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
static void enqueue_pushable_dl_task(struct rq *rq, struct task_struct *p)
{
struct dl_rq *dl_rq = &rq->dl;
- struct rb_node **link = &dl_rq->pushable_dl_tasks_root.rb_node;
+ struct rb_node **link = &dl_rq->pushable_dl_tasks_root.rb_root.rb_node;
struct rb_node *parent = NULL;
struct task_struct *entry;
- int leftmost = 1;
+ bool leftmost = true;
BUG_ON(!RB_EMPTY_NODE(&p->pushable_dl_tasks));
@@ -425,17 +425,16 @@ static void enqueue_pushable_dl_task(struct rq *rq, struct task_struct *p)
link = &parent->rb_left;
else {
link = &parent->rb_right;
- leftmost = 0;
+ leftmost = false;
}
}
- if (leftmost) {
- dl_rq->pushable_dl_tasks_leftmost = &p->pushable_dl_tasks;
+ if (leftmost)
dl_rq->earliest_dl.next = p->dl.deadline;
- }
rb_link_node(&p->pushable_dl_tasks, parent, link);
- rb_insert_color(&p->pushable_dl_tasks, &dl_rq->pushable_dl_tasks_root);
+ rb_insert_color_cached(&p->pushable_dl_tasks,
+ &dl_rq->pushable_dl_tasks_root, leftmost);
}
static void dequeue_pushable_dl_task(struct rq *rq, struct task_struct *p)
@@ -445,24 +444,23 @@ static void dequeue_pushable_dl_task(struct rq *rq, struct task_struct *p)
if (RB_EMPTY_NODE(&p->pushable_dl_tasks))
return;
- if (dl_rq->pushable_dl_tasks_leftmost == &p->pushable_dl_tasks) {
+ if (dl_rq->pushable_dl_tasks_root.rb_leftmost == &p->pushable_dl_tasks) {
struct rb_node *next_node;
next_node = rb_next(&p->pushable_dl_tasks);
- dl_rq->pushable_dl_tasks_leftmost = next_node;
if (next_node) {
dl_rq->earliest_dl.next = rb_entry(next_node,
struct task_struct, pushable_dl_tasks)->dl.deadline;
}
}
- rb_erase(&p->pushable_dl_tasks, &dl_rq->pushable_dl_tasks_root);
+ rb_erase_cached(&p->pushable_dl_tasks, &dl_rq->pushable_dl_tasks_root);
RB_CLEAR_NODE(&p->pushable_dl_tasks);
}
static inline int has_pushable_dl_tasks(struct rq *rq)
{
- return !RB_EMPTY_ROOT(&rq->dl.pushable_dl_tasks_root);
+ return !RB_EMPTY_ROOT(&rq->dl.pushable_dl_tasks_root.rb_root);
}
static int push_dl_task(struct rq *rq);
@@ -1136,7 +1134,7 @@ static void update_curr_dl(struct rq *rq)
}
/* kick cpufreq (see the comment in kernel/sched/sched.h). */
- cpufreq_update_this_cpu(rq, SCHED_CPUFREQ_DL);
+ cpufreq_update_util(rq, SCHED_CPUFREQ_DL);
schedstat_set(curr->se.statistics.exec_max,
max(curr->se.statistics.exec_max, delta_exec));
@@ -1266,7 +1264,7 @@ static void dec_dl_deadline(struct dl_rq *dl_rq, u64 deadline)
dl_rq->earliest_dl.next = 0;
cpudl_clear(&rq->rd->cpudl, rq->cpu);
} else {
- struct rb_node *leftmost = dl_rq->rb_leftmost;
+ struct rb_node *leftmost = dl_rq->root.rb_leftmost;
struct sched_dl_entity *entry;
entry = rb_entry(leftmost, struct sched_dl_entity, rb_node);
@@ -1313,7 +1311,7 @@ void dec_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
static void __enqueue_dl_entity(struct sched_dl_entity *dl_se)
{
struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
- struct rb_node **link = &dl_rq->rb_root.rb_node;
+ struct rb_node **link = &dl_rq->root.rb_root.rb_node;
struct rb_node *parent = NULL;
struct sched_dl_entity *entry;
int leftmost = 1;
@@ -1331,11 +1329,8 @@ static void __enqueue_dl_entity(struct sched_dl_entity *dl_se)
}
}
- if (leftmost)
- dl_rq->rb_leftmost = &dl_se->rb_node;
-
rb_link_node(&dl_se->rb_node, parent, link);
- rb_insert_color(&dl_se->rb_node, &dl_rq->rb_root);
+ rb_insert_color_cached(&dl_se->rb_node, &dl_rq->root, leftmost);
inc_dl_tasks(dl_se, dl_rq);
}
@@ -1347,14 +1342,7 @@ static void __dequeue_dl_entity(struct sched_dl_entity *dl_se)
if (RB_EMPTY_NODE(&dl_se->rb_node))
return;
- if (dl_rq->rb_leftmost == &dl_se->rb_node) {
- struct rb_node *next_node;
-
- next_node = rb_next(&dl_se->rb_node);
- dl_rq->rb_leftmost = next_node;
- }
-
- rb_erase(&dl_se->rb_node, &dl_rq->rb_root);
+ rb_erase_cached(&dl_se->rb_node, &dl_rq->root);
RB_CLEAR_NODE(&dl_se->rb_node);
dec_dl_tasks(dl_se, dl_rq);
@@ -1647,7 +1635,7 @@ static void start_hrtick_dl(struct rq *rq, struct task_struct *p)
static struct sched_dl_entity *pick_next_dl_entity(struct rq *rq,
struct dl_rq *dl_rq)
{
- struct rb_node *left = dl_rq->rb_leftmost;
+ struct rb_node *left = rb_first_cached(&dl_rq->root);
if (!left)
return NULL;
@@ -1771,7 +1759,7 @@ static int pick_dl_task(struct rq *rq, struct task_struct *p, int cpu)
*/
static struct task_struct *pick_earliest_pushable_dl_task(struct rq *rq, int cpu)
{
- struct rb_node *next_node = rq->dl.pushable_dl_tasks_leftmost;
+ struct rb_node *next_node = rq->dl.pushable_dl_tasks_root.rb_leftmost;
struct task_struct *p = NULL;
if (!has_pushable_dl_tasks(rq))
@@ -1945,7 +1933,7 @@ static struct task_struct *pick_next_pushable_dl_task(struct rq *rq)
if (!has_pushable_dl_tasks(rq))
return NULL;
- p = rb_entry(rq->dl.pushable_dl_tasks_leftmost,
+ p = rb_entry(rq->dl.pushable_dl_tasks_root.rb_leftmost,
struct task_struct, pushable_dl_tasks);
BUG_ON(rq->cpu != task_cpu(p));
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 4a23bbc3111b..2f93e4a2d9f6 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -181,11 +181,16 @@ static const struct file_operations sched_feat_fops = {
.release = single_release,
};
+__read_mostly bool sched_debug_enabled;
+
static __init int sched_init_debug(void)
{
debugfs_create_file("sched_features", 0644, NULL, NULL,
&sched_feat_fops);
+ debugfs_create_bool("sched_debug", 0644, NULL,
+ &sched_debug_enabled);
+
return 0;
}
late_initcall(sched_init_debug);
@@ -461,8 +466,6 @@ static char *task_group_path(struct task_group *tg)
}
#endif
-static const char stat_nam[] = TASK_STATE_TO_CHAR_STR;
-
static void
print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
{
@@ -530,7 +533,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
SPLIT_NS(cfs_rq->exec_clock));
raw_spin_lock_irqsave(&rq->lock, flags);
- if (cfs_rq->rb_leftmost)
+ if (rb_first_cached(&cfs_rq->tasks_timeline))
MIN_vruntime = (__pick_first_entity(cfs_rq))->vruntime;
last = __pick_last_entity(cfs_rq);
if (last)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 8d5868771cb3..70ba32e08a23 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -513,6 +513,7 @@ static inline int entity_before(struct sched_entity *a,
static void update_min_vruntime(struct cfs_rq *cfs_rq)
{
struct sched_entity *curr = cfs_rq->curr;
+ struct rb_node *leftmost = rb_first_cached(&cfs_rq->tasks_timeline);
u64 vruntime = cfs_rq->min_vruntime;
@@ -523,10 +524,9 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq)
curr = NULL;
}
- if (cfs_rq->rb_leftmost) {
- struct sched_entity *se = rb_entry(cfs_rq->rb_leftmost,
- struct sched_entity,
- run_node);
+ if (leftmost) { /* non-empty tree */
+ struct sched_entity *se;
+ se = rb_entry(leftmost, struct sched_entity, run_node);
if (!curr)
vruntime = se->vruntime;
@@ -547,10 +547,10 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq)
*/
static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- struct rb_node **link = &cfs_rq->tasks_timeline.rb_node;
+ struct rb_node **link = &cfs_rq->tasks_timeline.rb_root.rb_node;
struct rb_node *parent = NULL;
struct sched_entity *entry;
- int leftmost = 1;
+ bool leftmost = true;
/*
* Find the right place in the rbtree:
@@ -566,36 +566,23 @@ static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
link = &parent->rb_left;
} else {
link = &parent->rb_right;
- leftmost = 0;
+ leftmost = false;
}
}
- /*
- * Maintain a cache of leftmost tree entries (it is frequently
- * used):
- */
- if (leftmost)
- cfs_rq->rb_leftmost = &se->run_node;
-
rb_link_node(&se->run_node, parent, link);
- rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline);
+ rb_insert_color_cached(&se->run_node,
+ &cfs_rq->tasks_timeline, leftmost);
}
static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- if (cfs_rq->rb_leftmost == &se->run_node) {
- struct rb_node *next_node;
-
- next_node = rb_next(&se->run_node);
- cfs_rq->rb_leftmost = next_node;
- }
-
- rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
+ rb_erase_cached(&se->run_node, &cfs_rq->tasks_timeline);
}
struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
{
- struct rb_node *left = cfs_rq->rb_leftmost;
+ struct rb_node *left = rb_first_cached(&cfs_rq->tasks_timeline);
if (!left)
return NULL;
@@ -616,7 +603,7 @@ static struct sched_entity *__pick_next_entity(struct sched_entity *se)
#ifdef CONFIG_SCHED_DEBUG
struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
{
- struct rb_node *last = rb_last(&cfs_rq->tasks_timeline);
+ struct rb_node *last = rb_last(&cfs_rq->tasks_timeline.rb_root);
if (!last)
return NULL;
@@ -2803,7 +2790,9 @@ static inline void update_cfs_shares(struct sched_entity *se)
static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
{
- if (&this_rq()->cfs == cfs_rq) {
+ struct rq *rq = rq_of(cfs_rq);
+
+ if (&rq->cfs == cfs_rq) {
/*
* There are a few boundary cases this might miss but it should
* get called often enough that that should (hopefully) not be
@@ -2820,7 +2809,7 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
*
* See cpu_util().
*/
- cpufreq_update_util(rq_of(cfs_rq), 0);
+ cpufreq_update_util(rq, 0);
}
}
@@ -4897,7 +4886,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
* passed.
*/
if (p->in_iowait)
- cpufreq_update_this_cpu(rq, SCHED_CPUFREQ_IOWAIT);
+ cpufreq_update_util(rq, SCHED_CPUFREQ_IOWAIT);
for_each_sched_entity(se) {
if (se->on_rq)
@@ -5435,7 +5424,7 @@ wake_affine_llc(struct sched_domain *sd, struct task_struct *p,
return false;
/* if this cache has capacity, come here */
- if (this_stats.has_capacity && this_stats.nr_running < prev_stats.nr_running+1)
+ if (this_stats.has_capacity && this_stats.nr_running+1 < prev_stats.nr_running)
return true;
/*
@@ -7719,7 +7708,7 @@ next_group:
* number.
*
* Return: 1 when packing is required and a task should be moved to
- * this CPU. The amount of the imbalance is returned in *imbalance.
+ * this CPU. The amount of the imbalance is returned in env->imbalance.
*
* @env: The load balancing environment.
* @sds: Statistics of the sched_domain which is to be packed
@@ -8448,6 +8437,12 @@ static int idle_balance(struct rq *this_rq, struct rq_flags *rf)
this_rq->idle_stamp = rq_clock(this_rq);
/*
+ * Do not pull tasks towards !active CPUs...
+ */
+ if (!cpu_active(this_cpu))
+ return 0;
+
+ /*
* This is OK, because current is on_cpu, which avoids it being picked
* for load-balance and preemption/IRQs are still disabled avoiding
* further scheduler activity on it and we're being very careful to
@@ -8554,6 +8549,13 @@ static int active_load_balance_cpu_stop(void *data)
struct rq_flags rf;
rq_lock_irq(busiest_rq, &rf);
+ /*
+ * Between queueing the stop-work and running it is a hole in which
+ * CPUs can become inactive. We should not move tasks from or to
+ * inactive CPUs.
+ */
+ if (!cpu_active(busiest_cpu) || !cpu_active(target_cpu))
+ goto out_unlock;
/* make sure the requested cpu hasn't gone down in the meantime */
if (unlikely(busiest_cpu != smp_processor_id() ||
@@ -9310,7 +9312,7 @@ static void set_curr_task_fair(struct rq *rq)
void init_cfs_rq(struct cfs_rq *cfs_rq)
{
- cfs_rq->tasks_timeline = RB_ROOT;
+ cfs_rq->tasks_timeline = RB_ROOT_CACHED;
cfs_rq->min_vruntime = (u64)(-(1LL << 20));
#ifndef CONFIG_64BIT
cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 6c23e30c0e5c..257f4f0b4532 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -158,7 +158,7 @@ static void cpuidle_idle_call(void)
}
/*
- * Suspend-to-idle ("freeze") is a system state in which all user space
+ * Suspend-to-idle ("s2idle") is a system state in which all user space
* has been frozen, all I/O devices have been suspended and the only
* activity happens here and in iterrupts (if any). In that case bypass
* the cpuidle governor and go stratight for the deepest idle state
@@ -167,9 +167,9 @@ static void cpuidle_idle_call(void)
* until a proper wakeup interrupt happens.
*/
- if (idle_should_freeze() || dev->use_deepest_state) {
- if (idle_should_freeze()) {
- entered_state = cpuidle_enter_freeze(drv, dev);
+ if (idle_should_enter_s2idle() || dev->use_deepest_state) {
+ if (idle_should_enter_s2idle()) {
+ entered_state = cpuidle_enter_s2idle(drv, dev);
if (entered_state > 0) {
local_irq_enable();
goto exit_idle;
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 45caf937ef90..0af5ca9e3e3f 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -970,7 +970,7 @@ static void update_curr_rt(struct rq *rq)
return;
/* Kick cpufreq (see the comment in kernel/sched/sched.h). */
- cpufreq_update_this_cpu(rq, SCHED_CPUFREQ_RT);
+ cpufreq_update_util(rq, SCHED_CPUFREQ_RT);
schedstat_set(curr->se.statistics.exec_max,
max(curr->se.statistics.exec_max, delta_exec));
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index ab1c7f5409a0..14db76cd496f 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -426,8 +426,7 @@ struct cfs_rq {
u64 min_vruntime_copy;
#endif
- struct rb_root tasks_timeline;
- struct rb_node *rb_leftmost;
+ struct rb_root_cached tasks_timeline;
/*
* 'curr' points to currently running entity on this cfs_rq.
@@ -550,8 +549,7 @@ struct rt_rq {
/* Deadline class' related fields in a runqueue */
struct dl_rq {
/* runqueue is an rbtree, ordered by deadline */
- struct rb_root rb_root;
- struct rb_node *rb_leftmost;
+ struct rb_root_cached root;
unsigned long dl_nr_running;
@@ -575,8 +573,7 @@ struct dl_rq {
* an rb-tree, ordered by tasks' deadlines, with caching
* of the leftmost (earliest deadline) element.
*/
- struct rb_root pushable_dl_tasks_root;
- struct rb_node *pushable_dl_tasks_leftmost;
+ struct rb_root_cached pushable_dl_tasks_root;
#else
struct dl_bw dl_bw;
#endif
@@ -1954,6 +1951,8 @@ extern struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq);
extern struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq);
#ifdef CONFIG_SCHED_DEBUG
+extern bool sched_debug_enabled;
+
extern void print_cfs_stats(struct seq_file *m, int cpu);
extern void print_rt_stats(struct seq_file *m, int cpu);
extern void print_dl_stats(struct seq_file *m, int cpu);
@@ -2074,19 +2073,13 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags)
{
struct update_util_data *data;
- data = rcu_dereference_sched(*this_cpu_ptr(&cpufreq_update_util_data));
+ data = rcu_dereference_sched(*per_cpu_ptr(&cpufreq_update_util_data,
+ cpu_of(rq)));
if (data)
data->func(data, rq_clock(rq), flags);
}
-
-static inline void cpufreq_update_this_cpu(struct rq *rq, unsigned int flags)
-{
- if (cpu_of(rq) == smp_processor_id())
- cpufreq_update_util(rq, flags);
-}
#else
static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {}
-static inline void cpufreq_update_this_cpu(struct rq *rq, unsigned int flags) {}
#endif /* CONFIG_CPU_FREQ */
#ifdef arch_scale_freq_capacity
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 6f7b43982f73..f1cf4f306a82 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -14,11 +14,9 @@ cpumask_var_t sched_domains_tmpmask2;
#ifdef CONFIG_SCHED_DEBUG
-static __read_mostly int sched_debug_enabled;
-
static int __init sched_debug_setup(char *str)
{
- sched_debug_enabled = 1;
+ sched_debug_enabled = true;
return 0;
}
@@ -473,7 +471,7 @@ static int __init isolated_cpu_setup(char *str)
alloc_bootmem_cpumask_var(&cpu_isolated_map);
ret = cpulist_parse(str, cpu_isolated_map);
if (ret) {
- pr_err("sched: Error, all isolcpus= values must be between 0 and %d\n", nr_cpu_ids);
+ pr_err("sched: Error, all isolcpus= values must be between 0 and %u\n", nr_cpu_ids);
return 0;
}
return 1;
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index d6afed6d0752..98feab7933c7 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -53,6 +53,12 @@ void remove_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry
}
EXPORT_SYMBOL(remove_wait_queue);
+/*
+ * Scan threshold to break wait queue walk.
+ * This allows a waker to take a break from holding the
+ * wait queue lock during the wait queue walk.
+ */
+#define WAITQUEUE_WALK_BREAK_CNT 64
/*
* The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just
@@ -63,18 +69,67 @@ EXPORT_SYMBOL(remove_wait_queue);
* started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
* zero in this (rare) case, and we handle it by continuing to scan the queue.
*/
-static void __wake_up_common(struct wait_queue_head *wq_head, unsigned int mode,
- int nr_exclusive, int wake_flags, void *key)
+static int __wake_up_common(struct wait_queue_head *wq_head, unsigned int mode,
+ int nr_exclusive, int wake_flags, void *key,
+ wait_queue_entry_t *bookmark)
{
wait_queue_entry_t *curr, *next;
+ int cnt = 0;
+
+ if (bookmark && (bookmark->flags & WQ_FLAG_BOOKMARK)) {
+ curr = list_next_entry(bookmark, entry);
+
+ list_del(&bookmark->entry);
+ bookmark->flags = 0;
+ } else
+ curr = list_first_entry(&wq_head->head, wait_queue_entry_t, entry);
- list_for_each_entry_safe(curr, next, &wq_head->head, entry) {
+ if (&curr->entry == &wq_head->head)
+ return nr_exclusive;
+
+ list_for_each_entry_safe_from(curr, next, &wq_head->head, entry) {
unsigned flags = curr->flags;
- int ret = curr->func(curr, mode, wake_flags, key);
+ int ret;
+
+ if (flags & WQ_FLAG_BOOKMARK)
+ continue;
+
+ ret = curr->func(curr, mode, wake_flags, key);
if (ret < 0)
break;
if (ret && (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
break;
+
+ if (bookmark && (++cnt > WAITQUEUE_WALK_BREAK_CNT) &&
+ (&next->entry != &wq_head->head)) {
+ bookmark->flags = WQ_FLAG_BOOKMARK;
+ list_add_tail(&bookmark->entry, &next->entry);
+ break;
+ }
+ }
+ return nr_exclusive;
+}
+
+static void __wake_up_common_lock(struct wait_queue_head *wq_head, unsigned int mode,
+ int nr_exclusive, int wake_flags, void *key)
+{
+ unsigned long flags;
+ wait_queue_entry_t bookmark;
+
+ bookmark.flags = 0;
+ bookmark.private = NULL;
+ bookmark.func = NULL;
+ INIT_LIST_HEAD(&bookmark.entry);
+
+ spin_lock_irqsave(&wq_head->lock, flags);
+ nr_exclusive = __wake_up_common(wq_head, mode, nr_exclusive, wake_flags, key, &bookmark);
+ spin_unlock_irqrestore(&wq_head->lock, flags);
+
+ while (bookmark.flags & WQ_FLAG_BOOKMARK) {
+ spin_lock_irqsave(&wq_head->lock, flags);
+ nr_exclusive = __wake_up_common(wq_head, mode, nr_exclusive,
+ wake_flags, key, &bookmark);
+ spin_unlock_irqrestore(&wq_head->lock, flags);
}
}
@@ -91,11 +146,7 @@ static void __wake_up_common(struct wait_queue_head *wq_head, unsigned int mode,
void __wake_up(struct wait_queue_head *wq_head, unsigned int mode,
int nr_exclusive, void *key)
{
- unsigned long flags;
-
- spin_lock_irqsave(&wq_head->lock, flags);
- __wake_up_common(wq_head, mode, nr_exclusive, 0, key);
- spin_unlock_irqrestore(&wq_head->lock, flags);
+ __wake_up_common_lock(wq_head, mode, nr_exclusive, 0, key);
}
EXPORT_SYMBOL(__wake_up);
@@ -104,16 +155,23 @@ EXPORT_SYMBOL(__wake_up);
*/
void __wake_up_locked(struct wait_queue_head *wq_head, unsigned int mode, int nr)
{
- __wake_up_common(wq_head, mode, nr, 0, NULL);
+ __wake_up_common(wq_head, mode, nr, 0, NULL, NULL);
}
EXPORT_SYMBOL_GPL(__wake_up_locked);
void __wake_up_locked_key(struct wait_queue_head *wq_head, unsigned int mode, void *key)
{
- __wake_up_common(wq_head, mode, 1, 0, key);
+ __wake_up_common(wq_head, mode, 1, 0, key, NULL);
}
EXPORT_SYMBOL_GPL(__wake_up_locked_key);
+void __wake_up_locked_key_bookmark(struct wait_queue_head *wq_head, /* Like __wake_up_locked_key(), but forwards a caller-supplied bookmark entry. */
+ unsigned int mode, void *key, wait_queue_entry_t *bookmark)
+{
+ __wake_up_common(wq_head, mode, 1, 0, key, bookmark); /* nr_exclusive = 1, wake_flags = 0; no locking is done here */
+}
+EXPORT_SYMBOL_GPL(__wake_up_locked_key_bookmark);
+
/**
* __wake_up_sync_key - wake up threads blocked on a waitqueue.
* @wq_head: the waitqueue
@@ -134,7 +192,6 @@ EXPORT_SYMBOL_GPL(__wake_up_locked_key);
void __wake_up_sync_key(struct wait_queue_head *wq_head, unsigned int mode,
int nr_exclusive, void *key)
{
- unsigned long flags;
int wake_flags = 1; /* XXX WF_SYNC */
if (unlikely(!wq_head))
@@ -143,9 +200,7 @@ void __wake_up_sync_key(struct wait_queue_head *wq_head, unsigned int mode,
if (unlikely(nr_exclusive != 1))
wake_flags = 0;
- spin_lock_irqsave(&wq_head->lock, flags);
- __wake_up_common(wq_head, mode, nr_exclusive, wake_flags, key);
- spin_unlock_irqrestore(&wq_head->lock, flags);
+ __wake_up_common_lock(wq_head, mode, nr_exclusive, wake_flags, key);
}
EXPORT_SYMBOL_GPL(__wake_up_sync_key);
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 98b59b5db90b..bb3a38005b9c 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -17,11 +17,13 @@
#include <linux/audit.h>
#include <linux/compat.h>
#include <linux/coredump.h>
+#include <linux/kmemleak.h>
#include <linux/sched.h>
#include <linux/sched/task_stack.h>
#include <linux/seccomp.h>
#include <linux/slab.h>
#include <linux/syscalls.h>
+#include <linux/sysctl.h>
#ifdef CONFIG_HAVE_ARCH_SECCOMP_FILTER
#include <asm/syscall.h>
@@ -42,6 +44,7 @@
* get/put helpers should be used when accessing an instance
* outside of a lifetime-guarded section. In general, this
* is only needed for handling filters shared across tasks.
+ * @log: true if all actions except for SECCOMP_RET_ALLOW should be logged
* @prev: points to a previously installed, or inherited, filter
* @prog: the BPF program to evaluate
*
@@ -57,6 +60,7 @@
*/
struct seccomp_filter {
refcount_t usage;
+ bool log;
struct seccomp_filter *prev;
struct bpf_prog *prog;
};
@@ -171,10 +175,15 @@ static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen)
/**
* seccomp_run_filters - evaluates all seccomp filters against @sd
* @sd: optional seccomp data to be passed to filters
+ * @match: stores struct seccomp_filter that resulted in the return value,
+ * unless filter returned SECCOMP_RET_ALLOW, in which case it will
+ * be unchanged.
*
* Returns valid seccomp BPF response codes.
*/
-static u32 seccomp_run_filters(const struct seccomp_data *sd)
+#define ACTION_ONLY(ret) ((s32)((ret) & (SECCOMP_RET_ACTION_FULL)))
+static u32 seccomp_run_filters(const struct seccomp_data *sd,
+ struct seccomp_filter **match)
{
struct seccomp_data sd_local;
u32 ret = SECCOMP_RET_ALLOW;
@@ -184,7 +193,7 @@ static u32 seccomp_run_filters(const struct seccomp_data *sd)
/* Ensure unexpected behavior doesn't result in failing open. */
if (unlikely(WARN_ON(f == NULL)))
- return SECCOMP_RET_KILL;
+ return SECCOMP_RET_KILL_PROCESS;
if (!sd) {
populate_seccomp_data(&sd_local);
@@ -198,8 +207,10 @@ static u32 seccomp_run_filters(const struct seccomp_data *sd)
for (; f; f = f->prev) {
u32 cur_ret = BPF_PROG_RUN(f->prog, sd);
- if ((cur_ret & SECCOMP_RET_ACTION) < (ret & SECCOMP_RET_ACTION))
+ if (ACTION_ONLY(cur_ret) < ACTION_ONLY(ret)) {
ret = cur_ret;
+ *match = f;
+ }
}
return ret;
}
@@ -444,6 +455,10 @@ static long seccomp_attach_filter(unsigned int flags,
return ret;
}
+ /* Set log flag, if present. */
+ if (flags & SECCOMP_FILTER_FLAG_LOG)
+ filter->log = true;
+
/*
* If there is an existing filter, make it the prev and don't drop its
* task reference.
@@ -458,14 +473,19 @@ static long seccomp_attach_filter(unsigned int flags,
return 0;
}
+void __get_seccomp_filter(struct seccomp_filter *filter)
+{
+ /* Reference count is bounded by the number of total processes. */
+ refcount_inc(&filter->usage);
+}
+
/* get_seccomp_filter - increments the reference count of the filter on @tsk */
void get_seccomp_filter(struct task_struct *tsk)
{
struct seccomp_filter *orig = tsk->seccomp.filter;
if (!orig)
return;
- /* Reference count is bounded by the number of total processes. */
- refcount_inc(&orig->usage);
+ __get_seccomp_filter(orig);
}
static inline void seccomp_filter_free(struct seccomp_filter *filter)
@@ -476,10 +496,8 @@ static inline void seccomp_filter_free(struct seccomp_filter *filter)
}
}
-/* put_seccomp_filter - decrements the ref count of tsk->seccomp.filter */
-void put_seccomp_filter(struct task_struct *tsk)
+static void __put_seccomp_filter(struct seccomp_filter *orig)
{
- struct seccomp_filter *orig = tsk->seccomp.filter;
/* Clean up single-reference branches iteratively. */
while (orig && refcount_dec_and_test(&orig->usage)) {
struct seccomp_filter *freeme = orig;
@@ -488,6 +506,12 @@ void put_seccomp_filter(struct task_struct *tsk)
}
}
+/* put_seccomp_filter - decrements the ref count of tsk->seccomp.filter */
+void put_seccomp_filter(struct task_struct *tsk)
+{
+ __put_seccomp_filter(tsk->seccomp.filter);
+}
+
static void seccomp_init_siginfo(siginfo_t *info, int syscall, int reason)
{
memset(info, 0, sizeof(*info));
@@ -514,6 +538,65 @@ static void seccomp_send_sigsys(int syscall, int reason)
}
#endif /* CONFIG_SECCOMP_FILTER */
+/* For use with seccomp_actions_logged */
+#define SECCOMP_LOG_KILL_PROCESS (1 << 0)
+#define SECCOMP_LOG_KILL_THREAD (1 << 1)
+#define SECCOMP_LOG_TRAP (1 << 2)
+#define SECCOMP_LOG_ERRNO (1 << 3)
+#define SECCOMP_LOG_TRACE (1 << 4)
+#define SECCOMP_LOG_LOG (1 << 5)
+#define SECCOMP_LOG_ALLOW (1 << 6)
+
+static u32 seccomp_actions_logged = SECCOMP_LOG_KILL_PROCESS |
+ SECCOMP_LOG_KILL_THREAD |
+ SECCOMP_LOG_TRAP |
+ SECCOMP_LOG_ERRNO |
+ SECCOMP_LOG_TRACE |
+ SECCOMP_LOG_LOG;
+
+static inline void seccomp_log(unsigned long syscall, long signr, u32 action, /* Report a seccomp action to audit, forcing the record when the action's bit is set in seccomp_actions_logged. */
+ bool requested) /* @requested only gates the TRAP, ERRNO and TRACE cases below */
+{
+ bool log = false;
+
+ switch (action) {
+ case SECCOMP_RET_ALLOW:
+ break; /* never forced: log stays false */
+ case SECCOMP_RET_TRAP:
+ log = requested && seccomp_actions_logged & SECCOMP_LOG_TRAP; /* '&' binds tighter than '&&': requested AND the TRAP bit is set */
+ break;
+ case SECCOMP_RET_ERRNO:
+ log = requested && seccomp_actions_logged & SECCOMP_LOG_ERRNO;
+ break;
+ case SECCOMP_RET_TRACE:
+ log = requested && seccomp_actions_logged & SECCOMP_LOG_TRACE;
+ break;
+ case SECCOMP_RET_LOG:
+ log = seccomp_actions_logged & SECCOMP_LOG_LOG; /* @requested is not consulted for RET_LOG */
+ break;
+ case SECCOMP_RET_KILL_THREAD:
+ log = seccomp_actions_logged & SECCOMP_LOG_KILL_THREAD; /* @requested is not consulted for kills */
+ break;
+ case SECCOMP_RET_KILL_PROCESS:
+ default: /* any unrecognised action value is handled like KILL_PROCESS */
+ log = seccomp_actions_logged & SECCOMP_LOG_KILL_PROCESS;
+ }
+
+ /*
+ * Force an audit message to be emitted when the action is RET_KILL_*,
+ * RET_LOG, or the FILTER_FLAG_LOG bit was set and the action is
+ * allowed to be logged by the admin.
+ */
+ if (log)
+ return __audit_seccomp(syscall, signr, action);
+
+ /*
+ * Let the audit subsystem decide if the action should be audited based
+ * on whether the current task itself is being audited.
+ */
+ return audit_seccomp(syscall, signr, action);
+}
+
/*
* Secure computing mode 1 allows only read/write/exit/sigreturn.
* To be fully secure this must be combined with rlimit
@@ -539,7 +622,7 @@ static void __secure_computing_strict(int this_syscall)
#ifdef SECCOMP_DEBUG
dump_stack();
#endif
- audit_seccomp(this_syscall, SIGKILL, SECCOMP_RET_KILL);
+ seccomp_log(this_syscall, SIGKILL, SECCOMP_RET_KILL_THREAD, true);
do_exit(SIGKILL);
}
@@ -566,6 +649,7 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd,
const bool recheck_after_trace)
{
u32 filter_ret, action;
+ struct seccomp_filter *match = NULL;
int data;
/*
@@ -574,9 +658,9 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd,
*/
rmb();
- filter_ret = seccomp_run_filters(sd);
+ filter_ret = seccomp_run_filters(sd, &match);
data = filter_ret & SECCOMP_RET_DATA;
- action = filter_ret & SECCOMP_RET_ACTION;
+ action = filter_ret & SECCOMP_RET_ACTION_FULL;
switch (action) {
case SECCOMP_RET_ERRNO:
@@ -637,14 +721,25 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd,
return 0;
+ case SECCOMP_RET_LOG:
+ seccomp_log(this_syscall, 0, action, true);
+ return 0;
+
case SECCOMP_RET_ALLOW:
+ /*
+ * Note that the "match" filter will always be NULL for
+ * this action since SECCOMP_RET_ALLOW is the starting
+ * state in seccomp_run_filters().
+ */
return 0;
- case SECCOMP_RET_KILL:
+ case SECCOMP_RET_KILL_THREAD:
+ case SECCOMP_RET_KILL_PROCESS:
default:
- audit_seccomp(this_syscall, SIGSYS, action);
+ seccomp_log(this_syscall, SIGSYS, action, true);
/* Dump core only if this is the last remaining thread. */
- if (get_nr_threads(current) == 1) {
+ if (action == SECCOMP_RET_KILL_PROCESS ||
+ get_nr_threads(current) == 1) {
siginfo_t info;
/* Show the original registers in the dump. */
@@ -653,13 +748,16 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd,
seccomp_init_siginfo(&info, this_syscall, data);
do_coredump(&info);
}
- do_exit(SIGSYS);
+ if (action == SECCOMP_RET_KILL_PROCESS)
+ do_group_exit(SIGSYS);
+ else
+ do_exit(SIGSYS);
}
unreachable();
skip:
- audit_seccomp(this_syscall, 0, action);
+ seccomp_log(this_syscall, 0, action, match ? match->log : false);
return -1;
}
#else
@@ -794,6 +892,29 @@ static inline long seccomp_set_mode_filter(unsigned int flags,
}
#endif
+static long seccomp_get_action_avail(const char __user *uaction) /* Tell the caller whether the u32 action value at @uaction is one of the actions listed below. */
+{
+ u32 action;
+
+ if (copy_from_user(&action, uaction, sizeof(action)))
+ return -EFAULT; /* the value could not be copied from user space */
+
+ switch (action) {
+ case SECCOMP_RET_KILL_PROCESS:
+ case SECCOMP_RET_KILL_THREAD:
+ case SECCOMP_RET_TRAP:
+ case SECCOMP_RET_ERRNO:
+ case SECCOMP_RET_TRACE:
+ case SECCOMP_RET_LOG:
+ case SECCOMP_RET_ALLOW:
+ break; /* listed action: fall out of the switch and return 0 */
+ default:
+ return -EOPNOTSUPP; /* any other value, compared as a whole u32, is reported as unsupported */
+ }
+
+ return 0;
+}
+
/* Common entry point for both prctl and syscall. */
static long do_seccomp(unsigned int op, unsigned int flags,
const char __user *uargs)
@@ -805,6 +926,11 @@ static long do_seccomp(unsigned int op, unsigned int flags,
return seccomp_set_mode_strict();
case SECCOMP_SET_MODE_FILTER:
return seccomp_set_mode_filter(flags, uargs);
+ case SECCOMP_GET_ACTION_AVAIL:
+ if (flags != 0)
+ return -EINVAL;
+
+ return seccomp_get_action_avail(uargs);
default:
return -EINVAL;
}
@@ -908,13 +1034,13 @@ long seccomp_get_filter(struct task_struct *task, unsigned long filter_off,
if (!data)
goto out;
- get_seccomp_filter(task);
+ __get_seccomp_filter(filter);
spin_unlock_irq(&task->sighand->siglock);
if (copy_to_user(data, fprog->filter, bpf_classic_proglen(fprog)))
ret = -EFAULT;
- put_seccomp_filter(task);
+ __put_seccomp_filter(filter);
return ret;
out:
@@ -922,3 +1048,185 @@ out:
return ret;
}
#endif
+
+#ifdef CONFIG_SYSCTL
+
+/* Human readable action names for friendly sysctl interaction */
+#define SECCOMP_RET_KILL_PROCESS_NAME "kill_process"
+#define SECCOMP_RET_KILL_THREAD_NAME "kill_thread"
+#define SECCOMP_RET_TRAP_NAME "trap"
+#define SECCOMP_RET_ERRNO_NAME "errno"
+#define SECCOMP_RET_TRACE_NAME "trace"
+#define SECCOMP_RET_LOG_NAME "log"
+#define SECCOMP_RET_ALLOW_NAME "allow"
+
+static const char seccomp_actions_avail[] =
+ SECCOMP_RET_KILL_PROCESS_NAME " "
+ SECCOMP_RET_KILL_THREAD_NAME " "
+ SECCOMP_RET_TRAP_NAME " "
+ SECCOMP_RET_ERRNO_NAME " "
+ SECCOMP_RET_TRACE_NAME " "
+ SECCOMP_RET_LOG_NAME " "
+ SECCOMP_RET_ALLOW_NAME;
+
+struct seccomp_log_name {
+ u32 log;
+ const char *name;
+};
+
+static const struct seccomp_log_name seccomp_log_names[] = {
+ { SECCOMP_LOG_KILL_PROCESS, SECCOMP_RET_KILL_PROCESS_NAME },
+ { SECCOMP_LOG_KILL_THREAD, SECCOMP_RET_KILL_THREAD_NAME },
+ { SECCOMP_LOG_TRAP, SECCOMP_RET_TRAP_NAME },
+ { SECCOMP_LOG_ERRNO, SECCOMP_RET_ERRNO_NAME },
+ { SECCOMP_LOG_TRACE, SECCOMP_RET_TRACE_NAME },
+ { SECCOMP_LOG_LOG, SECCOMP_RET_LOG_NAME },
+ { SECCOMP_LOG_ALLOW, SECCOMP_RET_ALLOW_NAME },
+ { }
+};
+
+static bool seccomp_names_from_actions_logged(char *names, size_t size, /* Write the names of the bits set in @actions_logged into @names, separated by single spaces. */
+ u32 actions_logged) /* returns false as soon as strscpy() returns a negative value, true otherwise */
+{
+ const struct seccomp_log_name *cur;
+ bool append_space = false; /* false until the first name has been written */
+
+ for (cur = seccomp_log_names; cur->name && size; cur++) { /* stops at the table's empty terminator or when no space is left */
+ ssize_t ret;
+
+ if (!(actions_logged & cur->log))
+ continue;
+
+ if (append_space) {
+ ret = strscpy(names, " ", size);
+ if (ret < 0)
+ return false;
+
+ names += ret; /* advance past what was just written */
+ size -= ret;
+ } else
+ append_space = true;
+
+ ret = strscpy(names, cur->name, size);
+ if (ret < 0)
+ return false;
+
+ names += ret;
+ size -= ret;
+ }
+
+ return true;
+}
+
+static bool seccomp_action_logged_from_name(u32 *action_logged, /* Look @name up in seccomp_log_names and store the matching bit in *@action_logged. */
+ const char *name)
+{
+ const struct seccomp_log_name *cur;
+
+ for (cur = seccomp_log_names; cur->name; cur++) {
+ if (!strcmp(cur->name, name)) { /* exact, case-sensitive match */
+ *action_logged = cur->log;
+ return true;
+ }
+ }
+
+ return false; /* no table entry matched; *@action_logged is left untouched */
+}
+
+static bool seccomp_actions_logged_from_names(u32 *actions_logged, char *names) /* Parse a space-separated list of action names into a bitmask; @names is modified in place by strsep(). */
+{
+ char *name;
+
+ *actions_logged = 0;
+ while ((name = strsep(&names, " ")) && *name) { /* the loop also ends at the first empty token */
+ u32 action_logged = 0;
+
+ if (!seccomp_action_logged_from_name(&action_logged, name))
+ return false; /* unknown name; *@actions_logged keeps the bits gathered so far */
+
+ *actions_logged |= action_logged;
+ }
+
+ return true;
+}
+
+static int seccomp_actions_logged_handler(struct ctl_table *ro_table, int write, /* sysctl handler that exposes seccomp_actions_logged as a string of action names. */
+ void __user *buffer, size_t *lenp,
+ loff_t *ppos)
+{
+ char names[sizeof(seccomp_actions_avail)]; /* sized to hold every known name plus separators */
+ struct ctl_table table;
+ int ret;
+
+ if (write && !capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ memset(names, 0, sizeof(names));
+
+ if (!write) {
+ if (!seccomp_names_from_actions_logged(names, sizeof(names), /* render the current mask for proc_dostring() to hand out */
+ seccomp_actions_logged))
+ return -EINVAL;
+ }
+
+ table = *ro_table; /* work on a copy so the registered table is not modified */
+ table.data = names;
+ table.maxlen = sizeof(names);
+ ret = proc_dostring(&table, write, buffer, lenp, ppos);
+ if (ret)
+ return ret;
+
+ if (write) {
+ u32 actions_logged;
+
+ if (!seccomp_actions_logged_from_names(&actions_logged,
+ table.data))
+ return -EINVAL;
+
+ if (actions_logged & SECCOMP_LOG_ALLOW) /* a write that names "allow" is rejected */
+ return -EINVAL;
+
+ seccomp_actions_logged = actions_logged; /* only updated after the whole string validated */
+ }
+
+ return 0;
+}
+
+static struct ctl_path seccomp_sysctl_path[] = {
+ { .procname = "kernel", },
+ { .procname = "seccomp", },
+ { }
+};
+
+static struct ctl_table seccomp_sysctl_table[] = {
+ {
+ .procname = "actions_avail",
+ .data = (void *) &seccomp_actions_avail,
+ .maxlen = sizeof(seccomp_actions_avail),
+ .mode = 0444,
+ .proc_handler = proc_dostring,
+ },
+ {
+ .procname = "actions_logged",
+ .mode = 0644,
+ .proc_handler = seccomp_actions_logged_handler,
+ },
+ { }
+};
+
+static int __init seccomp_sysctl_init(void) /* Register seccomp_sysctl_table under the "kernel"/"seccomp" path. */
+{
+ struct ctl_table_header *hdr;
+
+ hdr = register_sysctl_paths(seccomp_sysctl_path, seccomp_sysctl_table);
+ if (!hdr)
+ pr_warn("seccomp: sysctl registration failed\n");
+ else
+ kmemleak_not_leak(hdr); /* hdr is not stored anywhere in this function */
+
+ return 0; /* returns 0 even when registration failed; the failure is only warned about */
+}
+
+device_initcall(seccomp_sysctl_init)
+
+#endif /* CONFIG_SYSCTL */
diff --git a/kernel/signal.c b/kernel/signal.c
index ed804a470dcd..800a18f77732 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2686,6 +2686,51 @@ COMPAT_SYSCALL_DEFINE2(rt_sigpending, compat_sigset_t __user *, uset,
}
#endif
+enum siginfo_layout siginfo_layout(int sig, int si_code)	/* Map a (signal, si_code) pair to the siginfo layout to use for it. */
+{
+	enum siginfo_layout layout = SIL_KILL;	/* result when no rule below applies */
+	if ((si_code > SI_USER) && (si_code < SI_KERNEL)) {
+		static const struct {
+			unsigned char limit, layout;	/* layout applies while si_code <= limit */
+		} filter[] = {
+			[SIGILL]  = { NSIGILL,  SIL_FAULT },
+			[SIGFPE]  = { NSIGFPE,  SIL_FAULT },
+			[SIGSEGV] = { NSIGSEGV, SIL_FAULT },
+			[SIGBUS]  = { NSIGBUS,  SIL_FAULT },
+			[SIGTRAP] = { NSIGTRAP, SIL_FAULT },
+#if defined(SIGEMT) && defined(NSIGEMT)	/* the guard must test SIGEMT, the macro the entry below uses */
+			[SIGEMT]  = { NSIGEMT,  SIL_FAULT },
+#endif
+			[SIGCHLD] = { NSIGCHLD, SIL_CHLD },
+			[SIGPOLL] = { NSIGPOLL, SIL_POLL },
+#ifdef __ARCH_SIGSYS
+			[SIGSYS]  = { NSIGSYS,  SIL_SYS },
+#endif
+		};
+		if ((sig < ARRAY_SIZE(filter)) && (si_code <= filter[sig].limit))	/* signals without an entry have limit 0 and never match here */
+			layout = filter[sig].layout;
+		else if (si_code <= NSIGPOLL)
+			layout = SIL_POLL;
+	} else {
+		if (si_code == SI_TIMER)
+			layout = SIL_TIMER;
+		else if (si_code == SI_SIGIO)
+			layout = SIL_POLL;
+		else if (si_code < 0)
+			layout = SIL_RT;
+		/* Tests to support buggy kernel ABIs */
+#ifdef TRAP_FIXME
+		if ((sig == SIGTRAP) && (si_code == TRAP_FIXME))
+			layout = SIL_FAULT;
+#endif
+#ifdef FPE_FIXME
+		if ((sig == SIGFPE) && (si_code == FPE_FIXME))
+			layout = SIL_FAULT;
+#endif
+	}
+	return layout;
+}
+
#ifndef HAVE_ARCH_COPY_SIGINFO_TO_USER
int copy_siginfo_to_user(siginfo_t __user *to, const siginfo_t *from)
@@ -2708,22 +2753,20 @@ int copy_siginfo_to_user(siginfo_t __user *to, const siginfo_t *from)
*/
err = __put_user(from->si_signo, &to->si_signo);
err |= __put_user(from->si_errno, &to->si_errno);
- err |= __put_user((short)from->si_code, &to->si_code);
- switch (from->si_code & __SI_MASK) {
- case __SI_KILL:
+ err |= __put_user(from->si_code, &to->si_code);
+ switch (siginfo_layout(from->si_signo, from->si_code)) {
+ case SIL_KILL:
err |= __put_user(from->si_pid, &to->si_pid);
err |= __put_user(from->si_uid, &to->si_uid);
break;
- case __SI_TIMER:
- err |= __put_user(from->si_tid, &to->si_tid);
- err |= __put_user(from->si_overrun, &to->si_overrun);
- err |= __put_user(from->si_ptr, &to->si_ptr);
+ case SIL_TIMER:
+ /* Unreached SI_TIMER is negative */
break;
- case __SI_POLL:
+ case SIL_POLL:
err |= __put_user(from->si_band, &to->si_band);
err |= __put_user(from->si_fd, &to->si_fd);
break;
- case __SI_FAULT:
+ case SIL_FAULT:
err |= __put_user(from->si_addr, &to->si_addr);
#ifdef __ARCH_SI_TRAPNO
err |= __put_user(from->si_trapno, &to->si_trapno);
@@ -2748,30 +2791,25 @@ int copy_siginfo_to_user(siginfo_t __user *to, const siginfo_t *from)
err |= __put_user(from->si_pkey, &to->si_pkey);
#endif
break;
- case __SI_CHLD:
+ case SIL_CHLD:
err |= __put_user(from->si_pid, &to->si_pid);
err |= __put_user(from->si_uid, &to->si_uid);
err |= __put_user(from->si_status, &to->si_status);
err |= __put_user(from->si_utime, &to->si_utime);
err |= __put_user(from->si_stime, &to->si_stime);
break;
- case __SI_RT: /* This is not generated by the kernel as of now. */
- case __SI_MESGQ: /* But this is */
+ case SIL_RT:
err |= __put_user(from->si_pid, &to->si_pid);
err |= __put_user(from->si_uid, &to->si_uid);
err |= __put_user(from->si_ptr, &to->si_ptr);
break;
#ifdef __ARCH_SIGSYS
- case __SI_SYS:
+ case SIL_SYS:
err |= __put_user(from->si_call_addr, &to->si_call_addr);
err |= __put_user(from->si_syscall, &to->si_syscall);
err |= __put_user(from->si_arch, &to->si_arch);
break;
#endif
- default: /* this is just in case for now ... */
- err |= __put_user(from->si_pid, &to->si_pid);
- err |= __put_user(from->si_uid, &to->si_uid);
- break;
}
return err;
}
diff --git a/kernel/smp.c b/kernel/smp.c
index 81cfca9b4cc3..c94dd85c8d41 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -550,7 +550,7 @@ static int __init maxcpus(char *str)
early_param("maxcpus", maxcpus);
/* Setup number of possible processor ids */
-int nr_cpu_ids __read_mostly = NR_CPUS;
+unsigned int nr_cpu_ids __read_mostly = NR_CPUS;
EXPORT_SYMBOL(nr_cpu_ids);
/* An arch may set nr_cpu_ids earlier if needed, so this would be redundant */
diff --git a/kernel/sys.c b/kernel/sys.c
index 2855ee73acd0..9aebc2935013 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1896,15 +1896,11 @@ static int validate_prctl_map(struct prctl_mm_map *prctl_map)
/*
* Finally, make sure the caller has the rights to
- * change /proc/pid/exe link: only local root should
+ * change /proc/pid/exe link: only local sys admin should
* be allowed to.
*/
if (prctl_map->exe_fd != (u32)-1) {
- struct user_namespace *ns = current_user_ns();
- const struct cred *cred = current_cred();
-
- if (!uid_eq(cred->uid, make_kuid(ns, 0)) ||
- !gid_eq(cred->gid, make_kgid(ns, 0)))
+ if (!ns_capable(current_user_ns(), CAP_SYS_ADMIN))
goto out;
}
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 6648fbbb8157..4da9e622471f 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -367,7 +367,8 @@ static struct ctl_table kern_table[] = {
.data = &sysctl_sched_time_avg,
.maxlen = sizeof(unsigned int),
.mode = 0644,
- .proc_handler = proc_dointvec,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &one,
},
#ifdef CONFIG_SCHEDSTATS
{
@@ -2187,8 +2188,6 @@ static int do_proc_douintvec_conv(unsigned long *lvalp,
if (write) {
if (*lvalp > UINT_MAX)
return -EINVAL;
- if (*lvalp > UINT_MAX)
- return -EINVAL;
*valp = *lvalp;
} else {
unsigned int val = *valp;
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c
index 02e1859f2ca8..58ea8c03662e 100644
--- a/kernel/sysctl_binary.c
+++ b/kernel/sysctl_binary.c
@@ -986,8 +986,9 @@ static ssize_t bin_intvec(struct file *file,
size_t length = oldlen / sizeof(*vec);
char *str, *end;
int i;
+ loff_t pos = 0;
- result = kernel_read(file, 0, buffer, BUFSZ - 1);
+ result = kernel_read(file, buffer, BUFSZ - 1, &pos);
if (result < 0)
goto out_kfree;
@@ -1016,6 +1017,7 @@ static ssize_t bin_intvec(struct file *file,
size_t length = newlen / sizeof(*vec);
char *str, *end;
int i;
+ loff_t pos = 0;
str = buffer;
end = str + BUFSZ;
@@ -1029,7 +1031,7 @@ static ssize_t bin_intvec(struct file *file,
str += scnprintf(str, end - str, "%lu\t", value);
}
- result = kernel_write(file, buffer, str - buffer, 0);
+ result = kernel_write(file, buffer, str - buffer, &pos);
if (result < 0)
goto out_kfree;
}
@@ -1057,8 +1059,9 @@ static ssize_t bin_ulongvec(struct file *file,
size_t length = oldlen / sizeof(*vec);
char *str, *end;
int i;
+ loff_t pos = 0;
- result = kernel_read(file, 0, buffer, BUFSZ - 1);
+ result = kernel_read(file, buffer, BUFSZ - 1, &pos);
if (result < 0)
goto out_kfree;
@@ -1087,6 +1090,7 @@ static ssize_t bin_ulongvec(struct file *file,
size_t length = newlen / sizeof(*vec);
char *str, *end;
int i;
+ loff_t pos = 0;
str = buffer;
end = str + BUFSZ;
@@ -1100,7 +1104,7 @@ static ssize_t bin_ulongvec(struct file *file,
str += scnprintf(str, end - str, "%lu\t", value);
}
- result = kernel_write(file, buffer, str - buffer, 0);
+ result = kernel_write(file, buffer, str - buffer, &pos);
if (result < 0)
goto out_kfree;
}
@@ -1120,8 +1124,9 @@ static ssize_t bin_uuid(struct file *file,
if (oldval && oldlen) {
char buf[UUID_STRING_LEN + 1];
uuid_t uuid;
+ loff_t pos = 0;
- result = kernel_read(file, 0, buf, sizeof(buf) - 1);
+ result = kernel_read(file, buf, sizeof(buf) - 1, &pos);
if (result < 0)
goto out;
@@ -1154,8 +1159,9 @@ static ssize_t bin_dn_node_address(struct file *file,
char buf[15], *nodep;
unsigned long area, node;
__le16 dnaddr;
+ loff_t pos = 0;
- result = kernel_read(file, 0, buf, sizeof(buf) - 1);
+ result = kernel_read(file, buf, sizeof(buf) - 1, &pos);
if (result < 0)
goto out;
@@ -1188,6 +1194,7 @@ static ssize_t bin_dn_node_address(struct file *file,
__le16 dnaddr;
char buf[15];
int len;
+ loff_t pos = 0;
result = -EINVAL;
if (newlen != sizeof(dnaddr))
@@ -1201,7 +1208,7 @@ static ssize_t bin_dn_node_address(struct file *file,
le16_to_cpu(dnaddr) >> 10,
le16_to_cpu(dnaddr) & 0x3ff);
- result = kernel_write(file, buf, len, 0);
+ result = kernel_write(file, buf, len, &pos);
if (result < 0)
goto out;
}
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 8ea4fb315719..2cafb49aa65e 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -2316,7 +2316,7 @@ void hardpps(const struct timespec64 *phase_ts, const struct timespec64 *raw_ts)
raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
}
EXPORT_SYMBOL(hardpps);
-#endif
+#endif /* CONFIG_NTP_PPS */
/**
* xtime_update() - advances the timekeeping infrastructure
diff --git a/kernel/time/timekeeping_debug.c b/kernel/time/timekeeping_debug.c
index 38bc4d2208e8..0754cadfa9e6 100644
--- a/kernel/time/timekeeping_debug.c
+++ b/kernel/time/timekeeping_debug.c
@@ -19,6 +19,7 @@
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/seq_file.h>
+#include <linux/suspend.h>
#include <linux/time.h>
#include "timekeeping_internal.h"
@@ -75,7 +76,7 @@ void tk_debug_account_sleep_time(struct timespec64 *t)
int bin = min(fls(t->tv_sec), NUM_BINS-1);
sleep_time_bin[bin]++;
- printk_deferred(KERN_INFO "Suspended for %lld.%03lu seconds\n",
- (s64)t->tv_sec, t->tv_nsec / NSEC_PER_MSEC);
+ pm_deferred_pr_dbg("Timekeeping suspended for %lld.%03lu seconds\n",
+ (s64)t->tv_sec, t->tv_nsec / NSEC_PER_MSEC);
}
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index bc364f86100a..45a3928544ce 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -27,6 +27,7 @@
#include <linux/time.h>
#include <linux/uaccess.h>
#include <linux/list.h>
+#include <linux/blk-cgroup.h>
#include "../../block/blk.h"
@@ -46,10 +47,16 @@ static __cacheline_aligned_in_smp DEFINE_SPINLOCK(running_trace_lock);
/* Select an alternative, minimalistic output than the original one */
#define TRACE_BLK_OPT_CLASSIC 0x1
+#define TRACE_BLK_OPT_CGROUP 0x2
+#define TRACE_BLK_OPT_CGNAME 0x4
static struct tracer_opt blk_tracer_opts[] = {
/* Default disable the minimalistic output */
{ TRACER_OPT(blk_classic, TRACE_BLK_OPT_CLASSIC) },
+#ifdef CONFIG_BLK_CGROUP
+ { TRACER_OPT(blk_cgroup, TRACE_BLK_OPT_CGROUP) },
+ { TRACER_OPT(blk_cgname, TRACE_BLK_OPT_CGNAME) },
+#endif
{ }
};
@@ -68,7 +75,8 @@ static void blk_unregister_tracepoints(void);
* Send out a notify message.
*/
static void trace_note(struct blk_trace *bt, pid_t pid, int action,
- const void *data, size_t len)
+ const void *data, size_t len,
+ union kernfs_node_id *cgid)
{
struct blk_io_trace *t;
struct ring_buffer_event *event = NULL;
@@ -76,12 +84,13 @@ static void trace_note(struct blk_trace *bt, pid_t pid, int action,
int pc = 0;
int cpu = smp_processor_id();
bool blk_tracer = blk_tracer_enabled;
+ ssize_t cgid_len = cgid ? sizeof(*cgid) : 0;
if (blk_tracer) {
buffer = blk_tr->trace_buffer.buffer;
pc = preempt_count();
event = trace_buffer_lock_reserve(buffer, TRACE_BLK,
- sizeof(*t) + len,
+ sizeof(*t) + len + cgid_len,
0, pc);
if (!event)
return;
@@ -92,17 +101,19 @@ static void trace_note(struct blk_trace *bt, pid_t pid, int action,
if (!bt->rchan)
return;
- t = relay_reserve(bt->rchan, sizeof(*t) + len);
+ t = relay_reserve(bt->rchan, sizeof(*t) + len + cgid_len);
if (t) {
t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION;
t->time = ktime_to_ns(ktime_get());
record_it:
t->device = bt->dev;
- t->action = action;
+ t->action = action | (cgid ? __BLK_TN_CGROUP : 0);
t->pid = pid;
t->cpu = cpu;
- t->pdu_len = len;
- memcpy((void *) t + sizeof(*t), data, len);
+ t->pdu_len = len + cgid_len;
+ if (cgid)
+ memcpy((void *)t + sizeof(*t), cgid, cgid_len);
+ memcpy((void *) t + sizeof(*t) + cgid_len, data, len);
if (blk_tracer)
trace_buffer_unlock_commit(blk_tr, buffer, event, 0, pc);
@@ -122,7 +133,7 @@ static void trace_note_tsk(struct task_struct *tsk)
spin_lock_irqsave(&running_trace_lock, flags);
list_for_each_entry(bt, &running_trace_list, running_list) {
trace_note(bt, tsk->pid, BLK_TN_PROCESS, tsk->comm,
- sizeof(tsk->comm));
+ sizeof(tsk->comm), NULL);
}
spin_unlock_irqrestore(&running_trace_lock, flags);
}
@@ -139,11 +150,12 @@ static void trace_note_time(struct blk_trace *bt)
words[1] = now.tv_nsec;
local_irq_save(flags);
- trace_note(bt, 0, BLK_TN_TIMESTAMP, words, sizeof(words));
+ trace_note(bt, 0, BLK_TN_TIMESTAMP, words, sizeof(words), NULL);
local_irq_restore(flags);
}
-void __trace_note_message(struct blk_trace *bt, const char *fmt, ...)
+void __trace_note_message(struct blk_trace *bt, struct blkcg *blkcg,
+ const char *fmt, ...)
{
int n;
va_list args;
@@ -167,7 +179,14 @@ void __trace_note_message(struct blk_trace *bt, const char *fmt, ...)
n = vscnprintf(buf, BLK_TN_MAX_MSG, fmt, args);
va_end(args);
- trace_note(bt, 0, BLK_TN_MESSAGE, buf, n);
+ if (!(blk_tracer_flags.val & TRACE_BLK_OPT_CGROUP))
+ blkcg = NULL;
+#ifdef CONFIG_BLK_CGROUP
+ trace_note(bt, 0, BLK_TN_MESSAGE, buf, n,
+ blkcg ? cgroup_get_kernfs_id(blkcg->css.cgroup) : NULL);
+#else
+ trace_note(bt, 0, BLK_TN_MESSAGE, buf, n, NULL);
+#endif
local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(__trace_note_message);
@@ -204,7 +223,7 @@ static const u32 ddir_act[2] = { BLK_TC_ACT(BLK_TC_READ),
*/
static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
int op, int op_flags, u32 what, int error, int pdu_len,
- void *pdu_data)
+ void *pdu_data, union kernfs_node_id *cgid)
{
struct task_struct *tsk = current;
struct ring_buffer_event *event = NULL;
@@ -215,6 +234,7 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
pid_t pid;
int cpu, pc = 0;
bool blk_tracer = blk_tracer_enabled;
+ ssize_t cgid_len = cgid ? sizeof(*cgid) : 0;
if (unlikely(bt->trace_state != Blktrace_running && !blk_tracer))
return;
@@ -229,6 +249,8 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
what |= BLK_TC_ACT(BLK_TC_DISCARD);
if (op == REQ_OP_FLUSH)
what |= BLK_TC_ACT(BLK_TC_FLUSH);
+ if (cgid)
+ what |= __BLK_TA_CGROUP;
pid = tsk->pid;
if (act_log_check(bt, what, sector, pid))
@@ -241,7 +263,7 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
buffer = blk_tr->trace_buffer.buffer;
pc = preempt_count();
event = trace_buffer_lock_reserve(buffer, TRACE_BLK,
- sizeof(*t) + pdu_len,
+ sizeof(*t) + pdu_len + cgid_len,
0, pc);
if (!event)
return;
@@ -258,7 +280,7 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
* from coming in and stepping on our toes.
*/
local_irq_save(flags);
- t = relay_reserve(bt->rchan, sizeof(*t) + pdu_len);
+ t = relay_reserve(bt->rchan, sizeof(*t) + pdu_len + cgid_len);
if (t) {
sequence = per_cpu_ptr(bt->sequence, cpu);
@@ -280,10 +302,12 @@ record_it:
t->action = what;
t->device = bt->dev;
t->error = error;
- t->pdu_len = pdu_len;
+ t->pdu_len = pdu_len + cgid_len;
+ if (cgid_len)
+ memcpy((void *)t + sizeof(*t), cgid, cgid_len);
if (pdu_len)
- memcpy((void *) t + sizeof(*t), pdu_data, pdu_len);
+ memcpy((void *)t + sizeof(*t) + cgid_len, pdu_data, pdu_len);
if (blk_tracer) {
trace_buffer_unlock_commit(blk_tr, buffer, event, 0, pc);
@@ -359,7 +383,7 @@ static ssize_t blk_msg_write(struct file *filp, const char __user *buffer,
return PTR_ERR(msg);
bt = filp->private_data;
- __trace_note_message(bt, "%s", msg);
+ __trace_note_message(bt, NULL, "%s", msg);
kfree(msg);
return count;
@@ -624,6 +648,12 @@ int blk_trace_startstop(struct request_queue *q, int start)
}
EXPORT_SYMBOL_GPL(blk_trace_startstop);
+/*
+ * When reading or writing the blktrace sysfs files, the references to the
+ * opened sysfs or device files should prevent the underlying block device
+ * from being removed. So no further delete protection is really needed.
+ */
+
/**
* blk_trace_ioctl: - handle the ioctls associated with tracing
* @bdev: the block device
@@ -641,7 +671,7 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg)
if (!q)
return -ENXIO;
- mutex_lock(&bdev->bd_mutex);
+ mutex_lock(&q->blk_trace_mutex);
switch (cmd) {
case BLKTRACESETUP:
@@ -667,7 +697,7 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg)
break;
}
- mutex_unlock(&bdev->bd_mutex);
+ mutex_unlock(&q->blk_trace_mutex);
return ret;
}
@@ -684,6 +714,36 @@ void blk_trace_shutdown(struct request_queue *q)
}
}
+#ifdef CONFIG_BLK_CGROUP
+static union kernfs_node_id *
+blk_trace_bio_get_cgid(struct request_queue *q, struct bio *bio)
+{
+ struct blk_trace *bt = q->blk_trace;
+
+ if (!bt || !(blk_tracer_flags.val & TRACE_BLK_OPT_CGROUP))
+ return NULL;
+
+ if (!bio->bi_css)
+ return NULL;
+ return cgroup_get_kernfs_id(bio->bi_css->cgroup);
+}
+#else
+static union kernfs_node_id *
+blk_trace_bio_get_cgid(struct request_queue *q, struct bio *bio)
+{
+ return NULL;
+}
+#endif
+
+static union kernfs_node_id * /* Return the cgroup id for a request, taken from its first bio. */
+blk_trace_request_get_cgid(struct request_queue *q, struct request *rq)
+{
+ if (!rq->bio)
+ return NULL; /* a request with no bio has nothing to look the cgroup up from */
+ /* Use the first bio */
+ return blk_trace_bio_get_cgid(q, rq->bio);
+}
+
/*
* blktrace probes
*/
@@ -694,13 +754,15 @@ void blk_trace_shutdown(struct request_queue *q)
* @error: return status to log
* @nr_bytes: number of completed bytes
* @what: the action
+ * @cgid: the cgroup info
*
* Description:
* Records an action against a request. Will log the bio offset + size.
*
**/
static void blk_add_trace_rq(struct request *rq, int error,
- unsigned int nr_bytes, u32 what)
+ unsigned int nr_bytes, u32 what,
+ union kernfs_node_id *cgid)
{
struct blk_trace *bt = rq->q->blk_trace;
@@ -713,32 +775,36 @@ static void blk_add_trace_rq(struct request *rq, int error,
what |= BLK_TC_ACT(BLK_TC_FS);
__blk_add_trace(bt, blk_rq_trace_sector(rq), nr_bytes, req_op(rq),
- rq->cmd_flags, what, error, 0, NULL);
+ rq->cmd_flags, what, error, 0, NULL, cgid);
}
static void blk_add_trace_rq_insert(void *ignore,
struct request_queue *q, struct request *rq)
{
- blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_INSERT);
+ blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_INSERT,
+ blk_trace_request_get_cgid(q, rq));
}
static void blk_add_trace_rq_issue(void *ignore,
struct request_queue *q, struct request *rq)
{
- blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_ISSUE);
+ blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_ISSUE,
+ blk_trace_request_get_cgid(q, rq));
}
static void blk_add_trace_rq_requeue(void *ignore,
struct request_queue *q,
struct request *rq)
{
- blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_REQUEUE);
+ blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_REQUEUE,
+ blk_trace_request_get_cgid(q, rq));
}
static void blk_add_trace_rq_complete(void *ignore, struct request *rq,
int error, unsigned int nr_bytes)
{
- blk_add_trace_rq(rq, error, nr_bytes, BLK_TA_COMPLETE);
+ blk_add_trace_rq(rq, error, nr_bytes, BLK_TA_COMPLETE,
+ blk_trace_request_get_cgid(rq->q, rq));
}
/**
@@ -753,7 +819,7 @@ static void blk_add_trace_rq_complete(void *ignore, struct request *rq,
*
**/
static void blk_add_trace_bio(struct request_queue *q, struct bio *bio,
- u32 what, int error)
+ u32 what, int error, union kernfs_node_id *cgid)
{
struct blk_trace *bt = q->blk_trace;
@@ -761,20 +827,22 @@ static void blk_add_trace_bio(struct request_queue *q, struct bio *bio,
return;
__blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size,
- bio_op(bio), bio->bi_opf, what, error, 0, NULL);
+ bio_op(bio), bio->bi_opf, what, error, 0, NULL, cgid);
}
static void blk_add_trace_bio_bounce(void *ignore,
struct request_queue *q, struct bio *bio)
{
- blk_add_trace_bio(q, bio, BLK_TA_BOUNCE, 0);
+ blk_add_trace_bio(q, bio, BLK_TA_BOUNCE, 0,
+ blk_trace_bio_get_cgid(q, bio));
}
static void blk_add_trace_bio_complete(void *ignore,
struct request_queue *q, struct bio *bio,
int error)
{
- blk_add_trace_bio(q, bio, BLK_TA_COMPLETE, error);
+ blk_add_trace_bio(q, bio, BLK_TA_COMPLETE, error,
+ blk_trace_bio_get_cgid(q, bio));
}
static void blk_add_trace_bio_backmerge(void *ignore,
@@ -782,7 +850,8 @@ static void blk_add_trace_bio_backmerge(void *ignore,
struct request *rq,
struct bio *bio)
{
- blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE, 0);
+ blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE, 0,
+ blk_trace_bio_get_cgid(q, bio));
}
static void blk_add_trace_bio_frontmerge(void *ignore,
@@ -790,13 +859,15 @@ static void blk_add_trace_bio_frontmerge(void *ignore,
struct request *rq,
struct bio *bio)
{
- blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE, 0);
+ blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE, 0,
+ blk_trace_bio_get_cgid(q, bio));
}
static void blk_add_trace_bio_queue(void *ignore,
struct request_queue *q, struct bio *bio)
{
- blk_add_trace_bio(q, bio, BLK_TA_QUEUE, 0);
+ blk_add_trace_bio(q, bio, BLK_TA_QUEUE, 0,
+ blk_trace_bio_get_cgid(q, bio));
}
static void blk_add_trace_getrq(void *ignore,
@@ -804,13 +875,14 @@ static void blk_add_trace_getrq(void *ignore,
struct bio *bio, int rw)
{
if (bio)
- blk_add_trace_bio(q, bio, BLK_TA_GETRQ, 0);
+ blk_add_trace_bio(q, bio, BLK_TA_GETRQ, 0,
+ blk_trace_bio_get_cgid(q, bio));
else {
struct blk_trace *bt = q->blk_trace;
if (bt)
__blk_add_trace(bt, 0, 0, rw, 0, BLK_TA_GETRQ, 0, 0,
- NULL);
+ NULL, NULL);
}
}
@@ -820,13 +892,14 @@ static void blk_add_trace_sleeprq(void *ignore,
struct bio *bio, int rw)
{
if (bio)
- blk_add_trace_bio(q, bio, BLK_TA_SLEEPRQ, 0);
+ blk_add_trace_bio(q, bio, BLK_TA_SLEEPRQ, 0,
+ blk_trace_bio_get_cgid(q, bio));
else {
struct blk_trace *bt = q->blk_trace;
if (bt)
__blk_add_trace(bt, 0, 0, rw, 0, BLK_TA_SLEEPRQ,
- 0, 0, NULL);
+ 0, 0, NULL, NULL);
}
}
@@ -835,7 +908,7 @@ static void blk_add_trace_plug(void *ignore, struct request_queue *q)
struct blk_trace *bt = q->blk_trace;
if (bt)
- __blk_add_trace(bt, 0, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL);
+ __blk_add_trace(bt, 0, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL, NULL);
}
static void blk_add_trace_unplug(void *ignore, struct request_queue *q,
@@ -852,7 +925,7 @@ static void blk_add_trace_unplug(void *ignore, struct request_queue *q,
else
what = BLK_TA_UNPLUG_TIMER;
- __blk_add_trace(bt, 0, 0, 0, 0, what, 0, sizeof(rpdu), &rpdu);
+ __blk_add_trace(bt, 0, 0, 0, 0, what, 0, sizeof(rpdu), &rpdu, NULL);
}
}
@@ -868,7 +941,7 @@ static void blk_add_trace_split(void *ignore,
__blk_add_trace(bt, bio->bi_iter.bi_sector,
bio->bi_iter.bi_size, bio_op(bio), bio->bi_opf,
BLK_TA_SPLIT, bio->bi_status, sizeof(rpdu),
- &rpdu);
+ &rpdu, blk_trace_bio_get_cgid(q, bio));
}
}
@@ -896,12 +969,12 @@ static void blk_add_trace_bio_remap(void *ignore,
return;
r.device_from = cpu_to_be32(dev);
- r.device_to = cpu_to_be32(bio->bi_bdev->bd_dev);
+ r.device_to = cpu_to_be32(bio_dev(bio));
r.sector_from = cpu_to_be64(from);
__blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size,
bio_op(bio), bio->bi_opf, BLK_TA_REMAP, bio->bi_status,
- sizeof(r), &r);
+ sizeof(r), &r, blk_trace_bio_get_cgid(q, bio));
}
/**
@@ -934,7 +1007,7 @@ static void blk_add_trace_rq_remap(void *ignore,
__blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq),
rq_data_dir(rq), 0, BLK_TA_REMAP, 0,
- sizeof(r), &r);
+ sizeof(r), &r, blk_trace_request_get_cgid(q, rq));
}
/**
@@ -958,7 +1031,8 @@ void blk_add_driver_data(struct request_queue *q,
return;
__blk_add_trace(bt, blk_rq_trace_sector(rq), blk_rq_bytes(rq), 0, 0,
- BLK_TA_DRV_DATA, 0, len, data);
+ BLK_TA_DRV_DATA, 0, len, data,
+ blk_trace_request_get_cgid(q, rq));
}
EXPORT_SYMBOL_GPL(blk_add_driver_data);
@@ -1031,7 +1105,7 @@ static void fill_rwbs(char *rwbs, const struct blk_io_trace *t)
int i = 0;
int tc = t->action >> BLK_TC_SHIFT;
- if (t->action == BLK_TN_MESSAGE) {
+ if ((t->action & ~__BLK_TN_CGROUP) == BLK_TN_MESSAGE) {
rwbs[i++] = 'N';
goto out;
}
@@ -1066,9 +1140,21 @@ const struct blk_io_trace *te_blk_io_trace(const struct trace_entry *ent)
return (const struct blk_io_trace *)ent;
}
-static inline const void *pdu_start(const struct trace_entry *ent)
+static inline const void *pdu_start(const struct trace_entry *ent, bool has_cg)
+{
+ return (void *)(te_blk_io_trace(ent) + 1) +
+ (has_cg ? sizeof(union kernfs_node_id) : 0);
+}
+
+static inline const void *cgid_start(const struct trace_entry *ent)
{
- return te_blk_io_trace(ent) + 1;
+ return (void *)(te_blk_io_trace(ent) + 1);
+}
+
+static inline int pdu_real_len(const struct trace_entry *ent, bool has_cg)
+{
+ return te_blk_io_trace(ent)->pdu_len -
+ (has_cg ? sizeof(union kernfs_node_id) : 0);
}
static inline u32 t_action(const struct trace_entry *ent)
@@ -1096,16 +1182,16 @@ static inline __u16 t_error(const struct trace_entry *ent)
return te_blk_io_trace(ent)->error;
}
-static __u64 get_pdu_int(const struct trace_entry *ent)
+static __u64 get_pdu_int(const struct trace_entry *ent, bool has_cg)
{
- const __u64 *val = pdu_start(ent);
+ const __u64 *val = pdu_start(ent, has_cg);
return be64_to_cpu(*val);
}
static void get_pdu_remap(const struct trace_entry *ent,
- struct blk_io_trace_remap *r)
+ struct blk_io_trace_remap *r, bool has_cg)
{
- const struct blk_io_trace_remap *__r = pdu_start(ent);
+ const struct blk_io_trace_remap *__r = pdu_start(ent, has_cg);
__u64 sector_from = __r->sector_from;
r->device_from = be32_to_cpu(__r->device_from);
@@ -1113,9 +1199,11 @@ static void get_pdu_remap(const struct trace_entry *ent,
r->sector_from = be64_to_cpu(sector_from);
}
-typedef void (blk_log_action_t) (struct trace_iterator *iter, const char *act);
+typedef void (blk_log_action_t) (struct trace_iterator *iter, const char *act,
+ bool has_cg);
-static void blk_log_action_classic(struct trace_iterator *iter, const char *act)
+static void blk_log_action_classic(struct trace_iterator *iter, const char *act,
+ bool has_cg)
{
char rwbs[RWBS_LEN];
unsigned long long ts = iter->ts;
@@ -1131,24 +1219,43 @@ static void blk_log_action_classic(struct trace_iterator *iter, const char *act)
secs, nsec_rem, iter->ent->pid, act, rwbs);
}
-static void blk_log_action(struct trace_iterator *iter, const char *act)
+static void blk_log_action(struct trace_iterator *iter, const char *act,
+ bool has_cg)
{
char rwbs[RWBS_LEN];
const struct blk_io_trace *t = te_blk_io_trace(iter->ent);
fill_rwbs(rwbs, t);
- trace_seq_printf(&iter->seq, "%3d,%-3d %2s %3s ",
- MAJOR(t->device), MINOR(t->device), act, rwbs);
-}
-
-static void blk_log_dump_pdu(struct trace_seq *s, const struct trace_entry *ent)
+ if (has_cg) {
+ const union kernfs_node_id *id = cgid_start(iter->ent);
+
+ if (blk_tracer_flags.val & TRACE_BLK_OPT_CGNAME) {
+ char blkcg_name_buf[NAME_MAX + 1] = "<...>";
+
+ cgroup_path_from_kernfs_id(id, blkcg_name_buf,
+ sizeof(blkcg_name_buf));
+ trace_seq_printf(&iter->seq, "%3d,%-3d %s %2s %3s ",
+ MAJOR(t->device), MINOR(t->device),
+ blkcg_name_buf, act, rwbs);
+ } else
+ trace_seq_printf(&iter->seq,
+ "%3d,%-3d %x,%-x %2s %3s ",
+ MAJOR(t->device), MINOR(t->device),
+ id->ino, id->generation, act, rwbs);
+ } else
+ trace_seq_printf(&iter->seq, "%3d,%-3d %2s %3s ",
+ MAJOR(t->device), MINOR(t->device), act, rwbs);
+}
+
+static void blk_log_dump_pdu(struct trace_seq *s,
+ const struct trace_entry *ent, bool has_cg)
{
const unsigned char *pdu_buf;
int pdu_len;
int i, end;
- pdu_buf = pdu_start(ent);
- pdu_len = te_blk_io_trace(ent)->pdu_len;
+ pdu_buf = pdu_start(ent, has_cg);
+ pdu_len = pdu_real_len(ent, has_cg);
if (!pdu_len)
return;
@@ -1179,7 +1286,7 @@ static void blk_log_dump_pdu(struct trace_seq *s, const struct trace_entry *ent)
trace_seq_puts(s, ") ");
}
-static void blk_log_generic(struct trace_seq *s, const struct trace_entry *ent)
+static void blk_log_generic(struct trace_seq *s, const struct trace_entry *ent, bool has_cg)
{
char cmd[TASK_COMM_LEN];
@@ -1187,7 +1294,7 @@ static void blk_log_generic(struct trace_seq *s, const struct trace_entry *ent)
if (t_action(ent) & BLK_TC_ACT(BLK_TC_PC)) {
trace_seq_printf(s, "%u ", t_bytes(ent));
- blk_log_dump_pdu(s, ent);
+ blk_log_dump_pdu(s, ent, has_cg);
trace_seq_printf(s, "[%s]\n", cmd);
} else {
if (t_sec(ent))
@@ -1199,10 +1306,10 @@ static void blk_log_generic(struct trace_seq *s, const struct trace_entry *ent)
}
static void blk_log_with_error(struct trace_seq *s,
- const struct trace_entry *ent)
+ const struct trace_entry *ent, bool has_cg)
{
if (t_action(ent) & BLK_TC_ACT(BLK_TC_PC)) {
- blk_log_dump_pdu(s, ent);
+ blk_log_dump_pdu(s, ent, has_cg);
trace_seq_printf(s, "[%d]\n", t_error(ent));
} else {
if (t_sec(ent))
@@ -1215,18 +1322,18 @@ static void blk_log_with_error(struct trace_seq *s,
}
}
-static void blk_log_remap(struct trace_seq *s, const struct trace_entry *ent)
+static void blk_log_remap(struct trace_seq *s, const struct trace_entry *ent, bool has_cg)
{
struct blk_io_trace_remap r = { .device_from = 0, };
- get_pdu_remap(ent, &r);
+ get_pdu_remap(ent, &r, has_cg);
trace_seq_printf(s, "%llu + %u <- (%d,%d) %llu\n",
t_sector(ent), t_sec(ent),
MAJOR(r.device_from), MINOR(r.device_from),
(unsigned long long)r.sector_from);
}
-static void blk_log_plug(struct trace_seq *s, const struct trace_entry *ent)
+static void blk_log_plug(struct trace_seq *s, const struct trace_entry *ent, bool has_cg)
{
char cmd[TASK_COMM_LEN];
@@ -1235,30 +1342,31 @@ static void blk_log_plug(struct trace_seq *s, const struct trace_entry *ent)
trace_seq_printf(s, "[%s]\n", cmd);
}
-static void blk_log_unplug(struct trace_seq *s, const struct trace_entry *ent)
+static void blk_log_unplug(struct trace_seq *s, const struct trace_entry *ent, bool has_cg)
{
char cmd[TASK_COMM_LEN];
trace_find_cmdline(ent->pid, cmd);
- trace_seq_printf(s, "[%s] %llu\n", cmd, get_pdu_int(ent));
+ trace_seq_printf(s, "[%s] %llu\n", cmd, get_pdu_int(ent, has_cg));
}
-static void blk_log_split(struct trace_seq *s, const struct trace_entry *ent)
+static void blk_log_split(struct trace_seq *s, const struct trace_entry *ent, bool has_cg)
{
char cmd[TASK_COMM_LEN];
trace_find_cmdline(ent->pid, cmd);
trace_seq_printf(s, "%llu / %llu [%s]\n", t_sector(ent),
- get_pdu_int(ent), cmd);
+ get_pdu_int(ent, has_cg), cmd);
}
-static void blk_log_msg(struct trace_seq *s, const struct trace_entry *ent)
+static void blk_log_msg(struct trace_seq *s, const struct trace_entry *ent,
+ bool has_cg)
{
- const struct blk_io_trace *t = te_blk_io_trace(ent);
- trace_seq_putmem(s, t + 1, t->pdu_len);
+ trace_seq_putmem(s, pdu_start(ent, has_cg),
+ pdu_real_len(ent, has_cg));
trace_seq_putc(s, '\n');
}
@@ -1298,7 +1406,8 @@ static void blk_tracer_reset(struct trace_array *tr)
static const struct {
const char *act[2];
- void (*print)(struct trace_seq *s, const struct trace_entry *ent);
+ void (*print)(struct trace_seq *s, const struct trace_entry *ent,
+ bool has_cg);
} what2act[] = {
[__BLK_TA_QUEUE] = {{ "Q", "queue" }, blk_log_generic },
[__BLK_TA_BACKMERGE] = {{ "M", "backmerge" }, blk_log_generic },
@@ -1326,23 +1435,25 @@ static enum print_line_t print_one_line(struct trace_iterator *iter,
u16 what;
bool long_act;
blk_log_action_t *log_action;
+ bool has_cg;
t = te_blk_io_trace(iter->ent);
- what = t->action & ((1 << BLK_TC_SHIFT) - 1);
+ what = (t->action & ((1 << BLK_TC_SHIFT) - 1)) & ~__BLK_TA_CGROUP;
long_act = !!(tr->trace_flags & TRACE_ITER_VERBOSE);
log_action = classic ? &blk_log_action_classic : &blk_log_action;
+ has_cg = t->action & __BLK_TA_CGROUP;
- if (t->action == BLK_TN_MESSAGE) {
- log_action(iter, long_act ? "message" : "m");
- blk_log_msg(s, iter->ent);
+ if ((t->action & ~__BLK_TN_CGROUP) == BLK_TN_MESSAGE) {
+ log_action(iter, long_act ? "message" : "m", has_cg);
+ blk_log_msg(s, iter->ent, has_cg);
return trace_handle_return(s);
}
if (unlikely(what == 0 || what >= ARRAY_SIZE(what2act)))
trace_seq_printf(s, "Unknown action %x\n", what);
else {
- log_action(iter, what2act[what].act[long_act]);
- what2act[what].print(s, iter->ent);
+ log_action(iter, what2act[what].act[long_act], has_cg);
+ what2act[what].print(s, iter->ent, has_cg);
}
return trace_handle_return(s);
@@ -1622,7 +1733,7 @@ static ssize_t sysfs_blk_trace_attr_show(struct device *dev,
if (q == NULL)
goto out_bdput;
- mutex_lock(&bdev->bd_mutex);
+ mutex_lock(&q->blk_trace_mutex);
if (attr == &dev_attr_enable) {
ret = sprintf(buf, "%u\n", !!q->blk_trace);
@@ -1641,7 +1752,7 @@ static ssize_t sysfs_blk_trace_attr_show(struct device *dev,
ret = sprintf(buf, "%llu\n", q->blk_trace->end_lba);
out_unlock_bdev:
- mutex_unlock(&bdev->bd_mutex);
+ mutex_unlock(&q->blk_trace_mutex);
out_bdput:
bdput(bdev);
out:
@@ -1683,7 +1794,7 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev,
if (q == NULL)
goto out_bdput;
- mutex_lock(&bdev->bd_mutex);
+ mutex_lock(&q->blk_trace_mutex);
if (attr == &dev_attr_enable) {
if (value)
@@ -1709,7 +1820,7 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev,
}
out_unlock_bdev:
- mutex_unlock(&bdev->bd_mutex);
+ mutex_unlock(&q->blk_trace_mutex);
out_bdput:
bdput(bdev);
out:
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 96cea88fa00f..8319e09e15b9 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -2828,13 +2828,14 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command)
if (!command || !ftrace_enabled) {
/*
- * If these are per_cpu ops, they still need their
- * per_cpu field freed. Since, function tracing is
+ * If these are dynamic or per_cpu ops, they still
+ * need their data freed. Since function tracing is
* not currently active, we can just free them
* without synchronizing all CPUs.
*/
- if (ops->flags & FTRACE_OPS_FL_PER_CPU)
- per_cpu_ops_free(ops);
+ if (ops->flags & (FTRACE_OPS_FL_DYNAMIC | FTRACE_OPS_FL_PER_CPU))
+ goto free_ops;
+
return 0;
}
@@ -2900,6 +2901,7 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command)
if (IS_ENABLED(CONFIG_PREEMPT))
synchronize_rcu_tasks();
+ free_ops:
arch_ftrace_trampoline_free(ops);
if (ops->flags & FTRACE_OPS_FL_PER_CPU)
@@ -4952,9 +4954,6 @@ static char ftrace_graph_buf[FTRACE_FILTER_SIZE] __initdata;
static char ftrace_graph_notrace_buf[FTRACE_FILTER_SIZE] __initdata;
static int ftrace_graph_set_hash(struct ftrace_hash *hash, char *buffer);
-static unsigned long save_global_trampoline;
-static unsigned long save_global_flags;
-
static int __init set_graph_function(char *str)
{
strlcpy(ftrace_graph_buf, str, FTRACE_FILTER_SIZE);
@@ -5690,10 +5689,51 @@ static int referenced_filters(struct dyn_ftrace *rec)
return cnt;
}
+static void
+clear_mod_from_hash(struct ftrace_page *pg, struct ftrace_hash *hash)
+{
+ struct ftrace_func_entry *entry;
+ struct dyn_ftrace *rec;
+ int i;
+
+ if (ftrace_hash_empty(hash))
+ return;
+
+ for (i = 0; i < pg->index; i++) {
+ rec = &pg->records[i];
+ entry = __ftrace_lookup_ip(hash, rec->ip);
+ /*
+ * Do not allow this rec to match again.
+ * Yeah, it may waste some memory, but will be removed
+ * if/when the hash is modified again.
+ */
+ if (entry)
+ entry->ip = 0;
+ }
+}
+
+/* Clear pg's records from the filter and notrace hashes of all trace arrays */
+static void clear_mod_from_hashes(struct ftrace_page *pg)
+{
+ struct trace_array *tr;
+
+ mutex_lock(&trace_types_lock);
+ list_for_each_entry(tr, &ftrace_trace_arrays, list) {
+ if (!tr->ops || !tr->ops->func_hash)
+ continue;
+ mutex_lock(&tr->ops->func_hash->regex_lock);
+ clear_mod_from_hash(pg, tr->ops->func_hash->filter_hash);
+ clear_mod_from_hash(pg, tr->ops->func_hash->notrace_hash);
+ mutex_unlock(&tr->ops->func_hash->regex_lock);
+ }
+ mutex_unlock(&trace_types_lock);
+}
+
void ftrace_release_mod(struct module *mod)
{
struct dyn_ftrace *rec;
struct ftrace_page **last_pg;
+ struct ftrace_page *tmp_page = NULL;
struct ftrace_page *pg;
int order;
@@ -5723,14 +5763,25 @@ void ftrace_release_mod(struct module *mod)
ftrace_update_tot_cnt -= pg->index;
*last_pg = pg->next;
- order = get_count_order(pg->size / ENTRIES_PER_PAGE);
- free_pages((unsigned long)pg->records, order);
- kfree(pg);
+
+ pg->next = tmp_page;
+ tmp_page = pg;
} else
last_pg = &pg->next;
}
out_unlock:
mutex_unlock(&ftrace_lock);
+
+ for (pg = tmp_page; pg; pg = tmp_page) {
+
+ /* Needs to be called outside of ftrace_lock */
+ clear_mod_from_hashes(pg);
+
+ order = get_count_order(pg->size / ENTRIES_PER_PAGE);
+ free_pages((unsigned long)pg->records, order);
+ tmp_page = pg->next;
+ kfree(pg);
+ }
}
void ftrace_module_enable(struct module *mod)
@@ -6754,17 +6805,6 @@ void unregister_ftrace_graph(void)
unregister_pm_notifier(&ftrace_suspend_notifier);
unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL);
-#ifdef CONFIG_DYNAMIC_FTRACE
- /*
- * Function graph does not allocate the trampoline, but
- * other global_ops do. We need to reset the ALLOC_TRAMP flag
- * if one was used.
- */
- global_ops.trampoline = save_global_trampoline;
- if (save_global_flags & FTRACE_OPS_FL_ALLOC_TRAMP)
- global_ops.flags |= FTRACE_OPS_FL_ALLOC_TRAMP;
-#endif
-
out:
mutex_unlock(&ftrace_lock);
}
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 44004d8aa3b3..752e5daf0896 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1702,6 +1702,9 @@ void tracing_reset_all_online_cpus(void)
struct trace_array *tr;
list_for_each_entry(tr, &ftrace_trace_arrays, list) {
+ if (!tr->clear_trace)
+ continue;
+ tr->clear_trace = false;
tracing_reset_online_cpus(&tr->trace_buffer);
#ifdef CONFIG_TRACER_MAX_TRACE
tracing_reset_online_cpus(&tr->max_buffer);
@@ -2799,11 +2802,17 @@ static char *get_trace_buf(void)
if (!buffer || buffer->nesting >= 4)
return NULL;
- return &buffer->buffer[buffer->nesting++][0];
+ buffer->nesting++;
+
+ /* Interrupts must see nesting incremented before we use the buffer */
+ barrier();
+ return &buffer->buffer[buffer->nesting - 1][0]; /* slot claimed above */
}
static void put_trace_buf(void)
{
+ /* Don't let the decrement of nesting leak before this */
+ barrier();
this_cpu_dec(trace_percpu_buffer->nesting);
}
@@ -4011,11 +4020,17 @@ static int tracing_open(struct inode *inode, struct file *file)
/* If this file was open for write, then erase contents */
if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) {
int cpu = tracing_get_cpu(inode);
+ struct trace_buffer *trace_buf = &tr->trace_buffer;
+
+#ifdef CONFIG_TRACER_MAX_TRACE
+ if (tr->current_trace->print_max)
+ trace_buf = &tr->max_buffer;
+#endif
if (cpu == RING_BUFFER_ALL_CPUS)
- tracing_reset_online_cpus(&tr->trace_buffer);
+ tracing_reset_online_cpus(trace_buf);
else
- tracing_reset(&tr->trace_buffer, cpu);
+ tracing_reset(trace_buf, cpu);
}
if (file->f_mode & FMODE_READ) {
@@ -5349,6 +5364,13 @@ static int tracing_set_tracer(struct trace_array *tr, const char *buf)
if (t == tr->current_trace)
goto out;
+ /* Some tracers won't work on kernel command line */
+ if (system_state < SYSTEM_RUNNING && t->noboot) {
+ pr_warn("Tracer '%s' is not allowed on command line, ignored\n",
+ t->name);
+ goto out;
+ }
+
/* Some tracers are only allowed for the top level buffer */
if (!trace_ok_for_array(t, tr)) {
ret = -EINVAL;
@@ -5658,7 +5680,7 @@ static int tracing_wait_pipe(struct file *filp)
*
* iter->pos will be 0 if we haven't read anything.
*/
- if (!tracing_is_on() && iter->pos)
+ if (!tracer_tracing_is_on(iter->tr) && iter->pos)
break;
mutex_unlock(&iter->mutex);
@@ -6220,7 +6242,7 @@ static int tracing_set_clock(struct trace_array *tr, const char *clockstr)
tracing_reset_online_cpus(&tr->trace_buffer);
#ifdef CONFIG_TRACER_MAX_TRACE
- if (tr->flags & TRACE_ARRAY_FL_GLOBAL && tr->max_buffer.buffer)
+ if (tr->max_buffer.buffer)
ring_buffer_set_clock(tr->max_buffer.buffer, trace_clocks[i].func);
tracing_reset_online_cpus(&tr->max_buffer);
#endif
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 490ba229931d..652c682707cd 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -245,6 +245,7 @@ struct trace_array {
int stop_count;
int clock_id;
int nr_topts;
+ bool clear_trace;
struct tracer *current_trace;
unsigned int trace_flags;
unsigned char trace_flags_index[TRACE_FLAGS_MAX_SIZE];
@@ -443,6 +444,8 @@ struct tracer {
#ifdef CONFIG_TRACER_MAX_TRACE
bool use_max_tr;
#endif
+ /* True if the tracer cannot be enabled from the kernel command line */
+ bool noboot;
};
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 36132f9280e6..87468398b9ed 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -406,7 +406,7 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file,
if (file->flags & EVENT_FILE_FL_RECORDED_TGID) {
tracing_stop_tgid_record();
- clear_bit(EVENT_FILE_FL_RECORDED_CMD_BIT, &file->flags);
+ clear_bit(EVENT_FILE_FL_RECORDED_TGID_BIT, &file->flags);
}
call->class->reg(call, TRACE_REG_UNREGISTER, file);
@@ -466,7 +466,7 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file,
set_bit(EVENT_FILE_FL_ENABLED_BIT, &file->flags);
/* WAS_ENABLED gets set but never cleared. */
- call->flags |= TRACE_EVENT_FL_WAS_ENABLED;
+ set_bit(EVENT_FILE_FL_WAS_ENABLED_BIT, &file->flags);
}
break;
}
@@ -2058,6 +2058,10 @@ static void event_remove(struct trace_event_call *call)
do_for_each_event_file(tr, file) {
if (file->event_call != call)
continue;
+
+ if (file->flags & EVENT_FILE_FL_WAS_ENABLED)
+ tr->clear_trace = true;
+
ftrace_event_enable_disable(file, 0);
/*
* The do_for_each_event_file() is
@@ -2396,15 +2400,11 @@ static void trace_module_add_events(struct module *mod)
static void trace_module_remove_events(struct module *mod)
{
struct trace_event_call *call, *p;
- bool clear_trace = false;
down_write(&trace_event_sem);
list_for_each_entry_safe(call, p, &ftrace_events, list) {
- if (call->mod == mod) {
- if (call->flags & TRACE_EVENT_FL_WAS_ENABLED)
- clear_trace = true;
+ if (call->mod == mod)
__trace_remove_event_call(call);
- }
}
up_write(&trace_event_sem);
@@ -2416,8 +2416,7 @@ static void trace_module_remove_events(struct module *mod)
* over from this module may be passed to the new module events and
* unexpected results may occur.
*/
- if (clear_trace)
- tracing_reset_all_online_cpus();
+ tracing_reset_all_online_cpus();
}
static int trace_module_notify(struct notifier_block *self,
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 181e139a8057..61e7f0678d33 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -702,7 +702,7 @@ static void append_filter_err(struct filter_parse_state *ps,
int pos = ps->lasterr_pos;
char *buf, *pbuf;
- buf = (char *)__get_free_page(GFP_TEMPORARY);
+ buf = (char *)__get_free_page(GFP_KERNEL);
if (!buf)
return;
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index d56123cdcc89..b8f1f54731af 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -1543,7 +1543,7 @@ fs_initcall(init_graph_tracefs);
static __init int init_graph_trace(void)
{
- max_bytes_for_cpu = snprintf(NULL, 0, "%d", nr_cpu_ids - 1);
+ max_bytes_for_cpu = snprintf(NULL, 0, "%u", nr_cpu_ids - 1);
if (!register_trace_event(&graph_trace_entry_event)) {
pr_warn("Warning: could not register graph trace events\n");
diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c
index cd7480d0a201..dca78fc48439 100644
--- a/kernel/trace/trace_mmiotrace.c
+++ b/kernel/trace/trace_mmiotrace.c
@@ -282,6 +282,7 @@ static struct tracer mmio_tracer __read_mostly =
.close = mmio_close,
.read = mmio_read,
.print_line = mmio_print_line,
+ .noboot = true,
};
__init static int init_mmio_trace(void)
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index bac629af2285..c738e764e2a5 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -656,15 +656,6 @@ int trace_print_lat_context(struct trace_iterator *iter)
return !trace_seq_has_overflowed(s);
}
-static const char state_to_char[] = TASK_STATE_TO_CHAR_STR;
-
-static int task_state_char(unsigned long state)
-{
- int bit = state ? __ffs(state) + 1 : 0;
-
- return bit < sizeof(state_to_char) - 1 ? state_to_char[bit] : '?';
-}
-
/**
* ftrace_find_event - find a registered event
* @type: the type of event to look for
@@ -930,8 +921,8 @@ static enum print_line_t trace_ctxwake_print(struct trace_iterator *iter,
trace_assign_type(field, iter->ent);
- T = task_state_char(field->next_state);
- S = task_state_char(field->prev_state);
+ T = __task_state_to_char(field->next_state);
+ S = __task_state_to_char(field->prev_state);
trace_find_cmdline(field->next_pid, comm);
trace_seq_printf(&iter->seq,
" %5d:%3d:%c %s [%03d] %5d:%3d:%c %s\n",
@@ -966,8 +957,8 @@ static int trace_ctxwake_raw(struct trace_iterator *iter, char S)
trace_assign_type(field, iter->ent);
if (!S)
- S = task_state_char(field->prev_state);
- T = task_state_char(field->next_state);
+ S = __task_state_to_char(field->prev_state);
+ T = __task_state_to_char(field->next_state);
trace_seq_printf(&iter->seq, "%d %d %c %d %d %d %c\n",
field->prev_pid,
field->prev_prio,
@@ -1002,8 +993,8 @@ static int trace_ctxwake_hex(struct trace_iterator *iter, char S)
trace_assign_type(field, iter->ent);
if (!S)
- S = task_state_char(field->prev_state);
- T = task_state_char(field->next_state);
+ S = __task_state_to_char(field->prev_state);
+ T = __task_state_to_char(field->next_state);
SEQ_PUT_HEX_FIELD(s, field->prev_pid);
SEQ_PUT_HEX_FIELD(s, field->prev_prio);
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index ddec53b67646..0c331978b1a6 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -397,10 +397,10 @@ tracing_sched_switch_trace(struct trace_array *tr,
entry = ring_buffer_event_data(event);
entry->prev_pid = prev->pid;
entry->prev_prio = prev->prio;
- entry->prev_state = prev->state;
+ entry->prev_state = __get_task_state(prev);
entry->next_pid = next->pid;
entry->next_prio = next->prio;
- entry->next_state = next->state;
+ entry->next_state = __get_task_state(next);
entry->next_cpu = task_cpu(next);
if (!call_filter_check_discard(call, entry, buffer, event))
@@ -425,10 +425,10 @@ tracing_sched_wakeup_trace(struct trace_array *tr,
entry = ring_buffer_event_data(event);
entry->prev_pid = curr->pid;
entry->prev_prio = curr->prio;
- entry->prev_state = curr->state;
+ entry->prev_state = __get_task_state(curr);
entry->next_pid = wakee->pid;
entry->next_prio = wakee->prio;
- entry->next_state = wakee->state;
+ entry->next_state = __get_task_state(wakee);
entry->next_cpu = task_cpu(wakee);
if (!call_filter_check_discard(call, entry, buffer, event))
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index cb917cebae29..b17ec642793b 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -273,7 +273,7 @@ static int trace_selftest_ops(struct trace_array *tr, int cnt)
goto out_free;
if (cnt > 1) {
if (trace_selftest_test_global_cnt == 0)
- goto out;
+ goto out_free;
}
if (trace_selftest_test_dyn_cnt == 0)
goto out_free;
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index a4df67cbc711..49cb41412eec 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -96,23 +96,9 @@ check_stack(unsigned long ip, unsigned long *stack)
if (in_nmi())
return;
- /*
- * There's a slight chance that we are tracing inside the
- * RCU infrastructure, and rcu_irq_enter() will not work
- * as expected.
- */
- if (unlikely(rcu_irq_enter_disabled()))
- return;
-
local_irq_save(flags);
arch_spin_lock(&stack_trace_max_lock);
- /*
- * RCU may not be watching, make it see us.
- * The stack trace code uses rcu_sched.
- */
- rcu_irq_enter();
-
/* In case another CPU set the tracer_frame on us */
if (unlikely(!frame_size))
this_size -= tracer_frame;
@@ -205,7 +191,6 @@ check_stack(unsigned long ip, unsigned long *stack)
}
out:
- rcu_irq_exit();
arch_spin_unlock(&stack_trace_max_lock);
local_irq_restore(flags);
}
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 74d9a86eccc0..696afe72d3b1 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -559,11 +559,29 @@ static DECLARE_BITMAP(enabled_perf_exit_syscalls, NR_syscalls);
static int sys_perf_refcount_enter;
static int sys_perf_refcount_exit;
+static int perf_call_bpf_enter(struct bpf_prog *prog, struct pt_regs *regs,
+ struct syscall_metadata *sys_data,
+ struct syscall_trace_enter *rec) {
+ struct syscall_tp_t {
+ unsigned long long regs;
+ unsigned long syscall_nr;
+ unsigned long args[SYSCALL_DEFINE_MAXARGS];
+ } param;
+ int i;
+
+ *(struct pt_regs **)&param = regs;
+ param.syscall_nr = rec->nr;
+ for (i = 0; i < sys_data->nb_args; i++)
+ param.args[i] = rec->args[i];
+ return trace_call_bpf(prog, &param);
+}
+
static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
{
struct syscall_metadata *sys_data;
struct syscall_trace_enter *rec;
struct hlist_head *head;
+ struct bpf_prog *prog;
int syscall_nr;
int rctx;
int size;
@@ -578,8 +596,9 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
if (!sys_data)
return;
+ prog = READ_ONCE(sys_data->enter_event->prog);
head = this_cpu_ptr(sys_data->enter_event->perf_events);
- if (hlist_empty(head))
+ if (!prog && hlist_empty(head))
return;
/* get the size after alignment with the u32 buffer size field */
@@ -594,6 +613,13 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
rec->nr = syscall_nr;
syscall_get_arguments(current, regs, 0, sys_data->nb_args,
(unsigned long *)&rec->args);
+
+ if ((prog && !perf_call_bpf_enter(prog, regs, sys_data, rec)) ||
+ hlist_empty(head)) {
+ perf_swevent_put_recursion_context(rctx);
+ return;
+ }
+
perf_trace_buf_submit(rec, size, rctx,
sys_data->enter_event->event.type, 1, regs,
head, NULL, NULL);
@@ -633,11 +659,26 @@ static void perf_sysenter_disable(struct trace_event_call *call)
mutex_unlock(&syscall_trace_lock);
}
+static int perf_call_bpf_exit(struct bpf_prog *prog, struct pt_regs *regs,
+ struct syscall_trace_exit *rec) {
+ struct syscall_tp_t {
+ unsigned long long regs;
+ unsigned long syscall_nr;
+ unsigned long ret;
+ } param;
+
+ *(struct pt_regs **)&param = regs;
+ param.syscall_nr = rec->nr;
+ param.ret = rec->ret;
+ return trace_call_bpf(prog, &param);
+}
+
static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
{
struct syscall_metadata *sys_data;
struct syscall_trace_exit *rec;
struct hlist_head *head;
+ struct bpf_prog *prog;
int syscall_nr;
int rctx;
int size;
@@ -652,8 +693,9 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
if (!sys_data)
return;
+ prog = READ_ONCE(sys_data->exit_event->prog);
head = this_cpu_ptr(sys_data->exit_event->perf_events);
- if (hlist_empty(head))
+ if (!prog && hlist_empty(head))
return;
/* We can probably do that at build time */
@@ -666,6 +708,13 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
rec->nr = syscall_nr;
rec->ret = syscall_get_return_value(current, regs);
+
+ if ((prog && !perf_call_bpf_exit(prog, regs, rec)) ||
+ hlist_empty(head)) {
+ perf_swevent_put_recursion_context(rctx);
+ return;
+ }
+
perf_trace_buf_submit(rec, size, rctx, sys_data->exit_event->event.type,
1, regs, head, NULL, NULL);
}
diff --git a/kernel/umh.c b/kernel/umh.c
new file mode 100644
index 000000000000..6ff9905250ff
--- /dev/null
+++ b/kernel/umh.c
@@ -0,0 +1,568 @@
+/*
+ * umh - the kernel usermode helper
+ */
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/sched/task.h>
+#include <linux/binfmts.h>
+#include <linux/syscalls.h>
+#include <linux/unistd.h>
+#include <linux/kmod.h>
+#include <linux/slab.h>
+#include <linux/completion.h>
+#include <linux/cred.h>
+#include <linux/file.h>
+#include <linux/fdtable.h>
+#include <linux/workqueue.h>
+#include <linux/security.h>
+#include <linux/mount.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/resource.h>
+#include <linux/notifier.h>
+#include <linux/suspend.h>
+#include <linux/rwsem.h>
+#include <linux/ptrace.h>
+#include <linux/async.h>
+#include <linux/uaccess.h>
+
+#include <trace/events/module.h>
+
+#define CAP_BSET (void *)1
+#define CAP_PI (void *)2
+
+static kernel_cap_t usermodehelper_bset = CAP_FULL_SET;
+static kernel_cap_t usermodehelper_inheritable = CAP_FULL_SET;
+static DEFINE_SPINLOCK(umh_sysctl_lock);
+static DECLARE_RWSEM(umhelper_sem);
+
+static void call_usermodehelper_freeinfo(struct subprocess_info *info)
+{
+ if (info->cleanup)
+ (*info->cleanup)(info);
+ kfree(info);
+}
+
+static void umh_complete(struct subprocess_info *sub_info)
+{
+ struct completion *comp = xchg(&sub_info->complete, NULL);
+ /*
+ * See call_usermodehelper_exec(). If xchg() returns NULL
+ * we own sub_info, the UMH_KILLABLE caller has gone away
+ * or the caller used UMH_NO_WAIT.
+ */
+ if (comp)
+ complete(comp);
+ else
+ call_usermodehelper_freeinfo(sub_info);
+}
+
+/*
+ * This is the task which runs the usermode application
+ */
+static int call_usermodehelper_exec_async(void *data)
+{
+ struct subprocess_info *sub_info = data;
+ struct cred *new;
+ int retval;
+
+ spin_lock_irq(&current->sighand->siglock);
+ flush_signal_handlers(current, 1);
+ spin_unlock_irq(&current->sighand->siglock);
+
+ /*
+ * Our parent (unbound workqueue) runs with elevated scheduling
+ * priority. Avoid propagating that into the userspace child.
+ */
+ set_user_nice(current, 0);
+
+ retval = -ENOMEM;
+ new = prepare_kernel_cred(current);
+ if (!new)
+ goto out;
+
+ spin_lock(&umh_sysctl_lock);
+ new->cap_bset = cap_intersect(usermodehelper_bset, new->cap_bset);
+ new->cap_inheritable = cap_intersect(usermodehelper_inheritable,
+ new->cap_inheritable);
+ spin_unlock(&umh_sysctl_lock);
+
+ if (sub_info->init) {
+ retval = sub_info->init(sub_info, new);
+ if (retval) {
+ abort_creds(new);
+ goto out;
+ }
+ }
+
+ commit_creds(new);
+
+ retval = do_execve(getname_kernel(sub_info->path),
+ (const char __user *const __user *)sub_info->argv,
+ (const char __user *const __user *)sub_info->envp);
+out:
+ sub_info->retval = retval;
+ /*
+ * call_usermodehelper_exec_sync() will call umh_complete
+ * if UMH_WAIT_PROC.
+ */
+ if (!(sub_info->wait & UMH_WAIT_PROC))
+ umh_complete(sub_info);
+ if (!retval)
+ return 0;
+ do_exit(0);
+}
+
+/* Handles UMH_WAIT_PROC. */
+static void call_usermodehelper_exec_sync(struct subprocess_info *sub_info)
+{
+ pid_t pid;
+
+ /* If SIGCHLD is ignored sys_wait4 won't populate the status. */
+ kernel_sigaction(SIGCHLD, SIG_DFL);
+ pid = kernel_thread(call_usermodehelper_exec_async, sub_info, SIGCHLD);
+ if (pid < 0) {
+ sub_info->retval = pid;
+ } else {
+ int ret = -ECHILD;
+ /*
+ * Normally it is bogus to call wait4() from in-kernel because
+ * wait4() wants to write the exit code to a userspace address.
+ * But call_usermodehelper_exec_sync() always runs as kernel
+ * thread (workqueue) and put_user() to a kernel address works
+ * OK for kernel threads, due to their having an mm_segment_t
+ * which spans the entire address space.
+ *
+ * Thus the __user pointer cast is valid here.
+ */
+ sys_wait4(pid, (int __user *)&ret, 0, NULL);
+
+ /*
+ * If ret is 0, either call_usermodehelper_exec_async failed and
+ * the real error code is already in sub_info->retval or
+ * sub_info->retval is 0 anyway, so don't mess with it then.
+ */
+ if (ret)
+ sub_info->retval = ret;
+ }
+
+ /* Restore default kernel sig handler */
+ kernel_sigaction(SIGCHLD, SIG_IGN);
+
+ umh_complete(sub_info);
+}
+
+/*
+ * We need to create the usermodehelper kernel thread from a task that is affine
+ * to an optimized set of CPUs (or nohz housekeeping ones) such that they
+ * inherit the widest affinity irrespective of call_usermodehelper() callers with
+ * possibly reduced affinity (eg: per-cpu workqueues). We don't want
+ * usermodehelper targets to contend a busy CPU.
+ *
+ * Unbound workqueues provide such wide affinity and allow to block on
+ * UMH_WAIT_PROC requests without blocking pending requests (up to some limit).
+ *
+ * Besides, workqueues provide the privilege level that caller might not have
+ * to perform the usermodehelper request.
+ *
+ */
+static void call_usermodehelper_exec_work(struct work_struct *work)
+{
+ struct subprocess_info *sub_info =
+ container_of(work, struct subprocess_info, work);
+
+ if (sub_info->wait & UMH_WAIT_PROC) {
+ call_usermodehelper_exec_sync(sub_info);
+ } else {
+ pid_t pid;
+ /*
+ * Use CLONE_PARENT to reparent it to kthreadd; we do not
+ * want to pollute current->children, and we need a parent
+ * that always ignores SIGCHLD to ensure auto-reaping.
+ */
+ pid = kernel_thread(call_usermodehelper_exec_async, sub_info,
+ CLONE_PARENT | SIGCHLD);
+ if (pid < 0) {
+ sub_info->retval = pid;
+ umh_complete(sub_info);
+ }
+ }
+}
+
+/*
+ * If set, call_usermodehelper_exec() will exit immediately returning -EBUSY
+ * (used for preventing user land processes from being created after the user
+ * land has been frozen during a system-wide hibernation or suspend operation).
+ * Should always be manipulated under umhelper_sem acquired for write.
+ */
+static enum umh_disable_depth usermodehelper_disabled = UMH_DISABLED;
+
+/* Number of helpers running */
+static atomic_t running_helpers = ATOMIC_INIT(0);
+
+/*
+ * Wait queue head used by usermodehelper_disable() to wait for all running
+ * helpers to finish.
+ */
+static DECLARE_WAIT_QUEUE_HEAD(running_helpers_waitq);
+
+/*
+ * Used by usermodehelper_read_lock_wait() to wait for usermodehelper_disabled
+ * to become 'false'.
+ */
+static DECLARE_WAIT_QUEUE_HEAD(usermodehelper_disabled_waitq);
+
+/*
+ * Time to wait for running_helpers to become zero before the setting of
+ * usermodehelper_disabled in usermodehelper_disable() fails
+ */
+#define RUNNING_HELPERS_TIMEOUT (5 * HZ)
+
+int usermodehelper_read_trylock(void)
+{
+ DEFINE_WAIT(wait);
+ int ret = 0;
+
+ down_read(&umhelper_sem);
+ for (;;) {
+ prepare_to_wait(&usermodehelper_disabled_waitq, &wait,
+ TASK_INTERRUPTIBLE);
+ if (!usermodehelper_disabled)
+ break;
+
+ if (usermodehelper_disabled == UMH_DISABLED)
+ ret = -EAGAIN;
+
+ up_read(&umhelper_sem);
+
+ if (ret)
+ break;
+
+ schedule();
+ try_to_freeze();
+
+ down_read(&umhelper_sem);
+ }
+ finish_wait(&usermodehelper_disabled_waitq, &wait);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(usermodehelper_read_trylock);
+
+long usermodehelper_read_lock_wait(long timeout)
+{
+ DEFINE_WAIT(wait);
+
+ if (timeout < 0)
+ return -EINVAL;
+
+ down_read(&umhelper_sem);
+ for (;;) {
+ prepare_to_wait(&usermodehelper_disabled_waitq, &wait,
+ TASK_UNINTERRUPTIBLE);
+ if (!usermodehelper_disabled)
+ break;
+
+ up_read(&umhelper_sem);
+
+ timeout = schedule_timeout(timeout);
+ if (!timeout)
+ break;
+
+ down_read(&umhelper_sem);
+ }
+ finish_wait(&usermodehelper_disabled_waitq, &wait);
+ return timeout;
+}
+EXPORT_SYMBOL_GPL(usermodehelper_read_lock_wait);
+
+void usermodehelper_read_unlock(void)
+{
+ up_read(&umhelper_sem);
+}
+EXPORT_SYMBOL_GPL(usermodehelper_read_unlock);
+
+/**
+ * __usermodehelper_set_disable_depth - Modify usermodehelper_disabled.
+ * @depth: New value to assign to usermodehelper_disabled.
+ *
+ * Change the value of usermodehelper_disabled (under umhelper_sem locked for
+ * writing) and wakeup tasks waiting for it to change.
+ */
+void __usermodehelper_set_disable_depth(enum umh_disable_depth depth)
+{
+ down_write(&umhelper_sem);
+ usermodehelper_disabled = depth;
+ wake_up(&usermodehelper_disabled_waitq);
+ up_write(&umhelper_sem);
+}
+
+/**
+ * __usermodehelper_disable - Prevent new helpers from being started.
+ * @depth: New value to assign to usermodehelper_disabled.
+ *
+ * Set usermodehelper_disabled to @depth and wait for running helpers to exit.
+ */
+int __usermodehelper_disable(enum umh_disable_depth depth)
+{
+ long retval;
+
+ if (!depth)
+ return -EINVAL;
+
+ down_write(&umhelper_sem);
+ usermodehelper_disabled = depth;
+ up_write(&umhelper_sem);
+
+ /*
+ * From now on call_usermodehelper_exec() won't start any new
+ * helpers, so it is sufficient if running_helpers turns out to
+ * be zero at one point (it may be increased later, but that
+ * doesn't matter).
+ */
+ retval = wait_event_timeout(running_helpers_waitq,
+ atomic_read(&running_helpers) == 0,
+ RUNNING_HELPERS_TIMEOUT);
+ if (retval)
+ return 0;
+
+ __usermodehelper_set_disable_depth(UMH_ENABLED);
+ return -EAGAIN;
+}
+
+static void helper_lock(void)
+{
+ atomic_inc(&running_helpers);
+ smp_mb__after_atomic();
+}
+
+static void helper_unlock(void)
+{
+ if (atomic_dec_and_test(&running_helpers))
+ wake_up(&running_helpers_waitq);
+}
+
+/**
+ * call_usermodehelper_setup - prepare to call a usermode helper
+ * @path: path to usermode executable
+ * @argv: arg vector for process
+ * @envp: environment for process
+ * @gfp_mask: gfp mask for memory allocation
+ * @cleanup: a cleanup function
+ * @init: an init function
+ * @data: arbitrary context sensitive data
+ *
+ * Returns either %NULL on allocation failure, or a subprocess_info
+ * structure. This should be passed to call_usermodehelper_exec to
+ * exec the process and free the structure.
+ *
+ * The init function is used to customize the helper process prior to
+ * exec. A non-zero return code causes the process to error out, exit,
+ * and return the failure to the calling process
+ *
+ * The cleanup function is called just before the subprocess_info is
+ * freed. This can be used for freeing the argv and envp. The
+ * function must be runnable in either a process context or the
+ * context in which call_usermodehelper_exec is called.
+ */
+struct subprocess_info *call_usermodehelper_setup(const char *path, char **argv,
+ char **envp, gfp_t gfp_mask,
+ int (*init)(struct subprocess_info *info, struct cred *new),
+ void (*cleanup)(struct subprocess_info *info),
+ void *data)
+{
+ struct subprocess_info *sub_info;
+ sub_info = kzalloc(sizeof(struct subprocess_info), gfp_mask);
+ if (!sub_info)
+ goto out;
+
+ INIT_WORK(&sub_info->work, call_usermodehelper_exec_work);
+
+#ifdef CONFIG_STATIC_USERMODEHELPER
+ sub_info->path = CONFIG_STATIC_USERMODEHELPER_PATH;
+#else
+ sub_info->path = path;
+#endif
+ sub_info->argv = argv;
+ sub_info->envp = envp;
+
+ sub_info->cleanup = cleanup;
+ sub_info->init = init;
+ sub_info->data = data;
+ out:
+ return sub_info;
+}
+EXPORT_SYMBOL(call_usermodehelper_setup);
+
+/**
+ * call_usermodehelper_exec - start a usermode application
+ * @sub_info: information about the subprocess
+ * @wait: wait for the application to finish and return status.
+ * when UMH_NO_WAIT don't wait at all, but you get no useful error back
+ * when the program couldn't be exec'ed. This makes it safe to call
+ * from interrupt context.
+ *
+ * Runs a user-space application. The application is started
+ * asynchronously if wait is not set, and runs as a child of system workqueues
+ * (i.e. it runs with full root capabilities and optimized affinity).
+ */
+int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait)
+{
+ DECLARE_COMPLETION_ONSTACK(done);
+ int retval = 0;
+
+ if (!sub_info->path) {
+ call_usermodehelper_freeinfo(sub_info);
+ return -EINVAL;
+ }
+ helper_lock();
+ if (usermodehelper_disabled) {
+ retval = -EBUSY;
+ goto out;
+ }
+
+ /*
+ * If there is no binary for us to call, then just return and get out of
+ * here. This allows us to set STATIC_USERMODEHELPER_PATH to "" and
+ * disable all call_usermodehelper() calls.
+ */
+ if (strlen(sub_info->path) == 0)
+ goto out;
+
+ /*
+ * Set the completion pointer only if there is a waiter.
+ * This makes it possible to use umh_complete to free
+ * the data structure in case of UMH_NO_WAIT.
+ */
+ sub_info->complete = (wait == UMH_NO_WAIT) ? NULL : &done;
+ sub_info->wait = wait;
+
+ queue_work(system_unbound_wq, &sub_info->work);
+ if (wait == UMH_NO_WAIT) /* task has freed sub_info */
+ goto unlock;
+
+ if (wait & UMH_KILLABLE) {
+ retval = wait_for_completion_killable(&done);
+ if (!retval)
+ goto wait_done;
+
+ /* umh_complete() will see NULL and free sub_info */
+ if (xchg(&sub_info->complete, NULL))
+ goto unlock;
+ /* fallthrough, umh_complete() was already called */
+ }
+
+ wait_for_completion(&done);
+wait_done:
+ retval = sub_info->retval;
+out:
+ call_usermodehelper_freeinfo(sub_info);
+unlock:
+ helper_unlock();
+ return retval;
+}
+EXPORT_SYMBOL(call_usermodehelper_exec);
+
+/**
+ * call_usermodehelper() - prepare and start a usermode application
+ * @path: path to usermode executable
+ * @argv: arg vector for process
+ * @envp: environment for process
+ * @wait: wait for the application to finish and return status.
+ * when UMH_NO_WAIT don't wait at all, but you get no useful error back
+ * when the program couldn't be exec'ed. This makes it safe to call
+ * from interrupt context.
+ *
+ * This function is the equivalent to use call_usermodehelper_setup() and
+ * call_usermodehelper_exec().
+ */
+int call_usermodehelper(const char *path, char **argv, char **envp, int wait)
+{
+ struct subprocess_info *info;
+ gfp_t gfp_mask = (wait == UMH_NO_WAIT) ? GFP_ATOMIC : GFP_KERNEL;
+
+ info = call_usermodehelper_setup(path, argv, envp, gfp_mask,
+ NULL, NULL, NULL);
+ if (info == NULL)
+ return -ENOMEM;
+
+ return call_usermodehelper_exec(info, wait);
+}
+EXPORT_SYMBOL(call_usermodehelper);
+
+static int proc_cap_handler(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ struct ctl_table t;
+ unsigned long cap_array[_KERNEL_CAPABILITY_U32S];
+ kernel_cap_t new_cap;
+ int err, i;
+
+ if (write && (!capable(CAP_SETPCAP) ||
+ !capable(CAP_SYS_MODULE)))
+ return -EPERM;
+
+ /*
+ * convert from the global kernel_cap_t to the ulong array to print to
+ * userspace if this is a read.
+ */
+ spin_lock(&umh_sysctl_lock);
+ for (i = 0; i < _KERNEL_CAPABILITY_U32S; i++) {
+ if (table->data == CAP_BSET)
+ cap_array[i] = usermodehelper_bset.cap[i];
+ else if (table->data == CAP_PI)
+ cap_array[i] = usermodehelper_inheritable.cap[i];
+ else
+ BUG();
+ }
+ spin_unlock(&umh_sysctl_lock);
+
+ t = *table;
+ t.data = &cap_array;
+
+ /*
+ * actually read or write an array of ulongs from userspace. Remember
+ * these are least significant 32 bits first
+ */
+ err = proc_doulongvec_minmax(&t, write, buffer, lenp, ppos);
+ if (err < 0)
+ return err;
+
+ /*
+ * convert from the sysctl array of ulongs to the kernel_cap_t
+ * internal representation
+ */
+ for (i = 0; i < _KERNEL_CAPABILITY_U32S; i++)
+ new_cap.cap[i] = cap_array[i];
+
+ /*
+ * Drop everything not in the new_cap (but don't add things)
+ */
+ spin_lock(&umh_sysctl_lock);
+ if (write) {
+ if (table->data == CAP_BSET)
+ usermodehelper_bset = cap_intersect(usermodehelper_bset, new_cap);
+ if (table->data == CAP_PI)
+ usermodehelper_inheritable = cap_intersect(usermodehelper_inheritable, new_cap);
+ }
+ spin_unlock(&umh_sysctl_lock);
+
+ return 0;
+}
+
+struct ctl_table usermodehelper_table[] = {
+ {
+ .procname = "bset",
+ .data = CAP_BSET,
+ .maxlen = _KERNEL_CAPABILITY_U32S * sizeof(unsigned long),
+ .mode = 0600,
+ .proc_handler = proc_cap_handler,
+ },
+ {
+ .procname = "inheritable",
+ .data = CAP_PI,
+ .maxlen = _KERNEL_CAPABILITY_U32S * sizeof(unsigned long),
+ .mode = 0600,
+ .proc_handler = proc_cap_handler,
+ },
+ { }
+};
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 2f735cbe05e8..c490f1e4313b 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -986,17 +986,21 @@ bool userns_may_setgroups(const struct user_namespace *ns)
}
/*
- * Returns true if @ns is the same namespace as or a descendant of
- * @target_ns.
+ * Returns true if @child is the same namespace as or a descendant of
+ * @ancestor.
*/
+bool in_userns(const struct user_namespace *ancestor,
+ const struct user_namespace *child)
+{
+ const struct user_namespace *ns;
+ for (ns = child; ns->level > ancestor->level; ns = ns->parent)
+ ;
+ return (ns == ancestor);
+}
+
bool current_in_userns(const struct user_namespace *target_ns)
{
- struct user_namespace *ns;
- for (ns = current_user_ns(); ns; ns = ns->parent) {
- if (ns == target_ns)
- return true;
- }
- return false;
+ return in_userns(target_ns, current_user_ns());
}
static inline struct user_namespace *to_user_ns(struct ns_common *ns)
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index ab3c0dc8c7ed..64d0edf428f8 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -21,7 +21,7 @@
* pools for workqueues which are not bound to any specific CPU - the
* number of these backing pools is dynamic.
*
- * Please read Documentation/workqueue.txt for details.
+ * Please read Documentation/core-api/workqueue.rst for details.
*/
#include <linux/export.h>
@@ -2269,7 +2269,7 @@ sleep:
* event.
*/
worker_enter_idle(worker);
- __set_current_state(TASK_INTERRUPTIBLE);
+ __set_current_state(TASK_IDLE);
spin_unlock_irq(&pool->lock);
schedule();
goto woke_up;
@@ -2311,7 +2311,7 @@ static int rescuer_thread(void *__rescuer)
*/
rescuer->task->flags |= PF_WQ_WORKER;
repeat:
- set_current_state(TASK_INTERRUPTIBLE);
+ set_current_state(TASK_IDLE);
/*
* By the time the rescuer is requested to stop, the workqueue
OpenPOWER on IntegriCloud