From 6283fa38dc8744dc7c2bd2a03bb0478fe42f79fa Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Fri, 20 Jul 2018 17:38:37 -0700 Subject: bpf: btf: Ensure the member->offset is in the right order This patch ensures the member->offset of a struct is in the correct order (i.e the later member's offset cannot go backward). The current "pahole -J" BTF encoder does not generate something like this. However, checking this can ensure future encoder will not violate this. Fixes: 69b693f0aefa ("bpf: btf: Introduce BPF Type Format (BTF)") Signed-off-by: Martin KaFai Lau Acked-by: Yonghong Song Signed-off-by: Daniel Borkmann --- kernel/bpf/btf.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 9704934252b3..2590700237c1 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -1519,9 +1519,9 @@ static s32 btf_struct_check_meta(struct btf_verifier_env *env, { bool is_union = BTF_INFO_KIND(t->info) == BTF_KIND_UNION; const struct btf_member *member; + u32 meta_needed, last_offset; struct btf *btf = env->btf; u32 struct_size = t->size; - u32 meta_needed; u16 i; meta_needed = btf_type_vlen(t) * sizeof(*member); @@ -1534,6 +1534,7 @@ static s32 btf_struct_check_meta(struct btf_verifier_env *env, btf_verifier_log_type(env, t, NULL); + last_offset = 0; for_each_member(i, t, member) { if (!btf_name_offset_valid(btf, member->name_off)) { btf_verifier_log_member(env, t, member, @@ -1555,6 +1556,16 @@ static s32 btf_struct_check_meta(struct btf_verifier_env *env, return -EINVAL; } + /* + * ">" instead of ">=" because the last member could be + * "char a[0];" + */ + if (last_offset > member->offset) { + btf_verifier_log_member(env, t, member, + "Invalid member bits_offset"); + return -EINVAL; + } + if (BITS_ROUNDUP_BYTES(member->offset) > struct_size) { btf_verifier_log_member(env, t, member, "Memmber bits_offset exceeds its struct size"); @@ -1562,6 +1573,7 @@ static s32 btf_struct_check_meta(struct btf_verifier_env *env, } btf_verifier_log_member(env, t, member, NULL); + last_offset = member->offset; } return meta_needed; -- cgit v1.2.1 From 62cedf3e60af03e47849fe2bd6a03ec179422a8a Mon Sep 17 00:00:00 2001 From: Peter Rosin Date: Fri, 20 Jul 2018 10:39:13 +0200 Subject: locking/rtmutex: Allow specifying a subclass for nested locking Needed for annotating rt_mutex locks. Tested-by: John Sperbeck Signed-off-by: Peter Rosin Signed-off-by: Peter Zijlstra (Intel) Cc: Davidlohr Bueso Cc: Deepa Dinamani Cc: Greg Kroah-Hartman Cc: Linus Torvalds Cc: Peter Chang Cc: Peter Zijlstra Cc: Philippe Ombredanne Cc: Thomas Gleixner Cc: Will Deacon Cc: Wolfram Sang Link: http://lkml.kernel.org/r/20180720083914.1950-2-peda@axentia.se Signed-off-by: Ingo Molnar --- kernel/locking/rtmutex.c | 29 +++++++++++++++++++++++++---- 1 file changed, 25 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 4f014be7a4b8..2823d4163a37 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -1465,6 +1465,29 @@ rt_mutex_fastunlock(struct rt_mutex *lock, rt_mutex_postunlock(&wake_q); } +static inline void __rt_mutex_lock(struct rt_mutex *lock, unsigned int subclass) +{ + might_sleep(); + + mutex_acquire(&lock->dep_map, subclass, 0, _RET_IP_); + rt_mutex_fastlock(lock, TASK_UNINTERRUPTIBLE, rt_mutex_slowlock); +} + +#ifdef CONFIG_DEBUG_LOCK_ALLOC +/** + * rt_mutex_lock_nested - lock a rt_mutex + * + * @lock: the rt_mutex to be locked + * @subclass: the lockdep subclass + */ +void __sched rt_mutex_lock_nested(struct rt_mutex *lock, unsigned int subclass) +{ + __rt_mutex_lock(lock, subclass); +} +EXPORT_SYMBOL_GPL(rt_mutex_lock_nested); +#endif + +#ifndef CONFIG_DEBUG_LOCK_ALLOC /** * rt_mutex_lock - lock a rt_mutex * @@ -1472,12 +1495,10 @@ rt_mutex_fastunlock(struct rt_mutex *lock, */ void __sched rt_mutex_lock(struct rt_mutex *lock) { - might_sleep(); - - mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_); - rt_mutex_fastlock(lock, TASK_UNINTERRUPTIBLE, rt_mutex_slowlock); + __rt_mutex_lock(lock, 0); } EXPORT_SYMBOL_GPL(rt_mutex_lock); +#endif /** * rt_mutex_lock_interruptible - lock a rt_mutex interruptible -- cgit v1.2.1 From 6cd0c583b04b2bd9415e07b51b63ab799949dd66 Mon Sep 17 00:00:00 2001 From: Yi Wang Date: Mon, 23 Jul 2018 12:19:07 +0800 Subject: sched/topology: Check variable group before dereferencing it The 'group' variable in sched_domain_debug_one() is not checked when firstly used in cpumask_test_cpu(cpu, sched_group_span(group)), but it might be NULL (it is checked later in the following while loop) and may cause NULL pointer dereference. We need to check it before using to avoid NULL dereference. Signed-off-by: Yi Wang Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Jiang Biao Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: zhong.weidong@zte.com.cn Link: http://lkml.kernel.org/r/1532319547-33335-1-git-send-email-wang.yi59@zte.com.cn Signed-off-by: Ingo Molnar --- kernel/sched/topology.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 05a831427bc7..56a0fed30c0a 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -47,7 +47,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) { printk(KERN_ERR "ERROR: domain->span does not contain CPU%d\n", cpu); } - if (!cpumask_test_cpu(cpu, sched_group_span(group))) { + if (group && !cpumask_test_cpu(cpu, sched_group_span(group))) { printk(KERN_ERR "ERROR: domain->groups does not contain CPU%d\n", cpu); } -- cgit v1.2.1 From 2610e88946632afb78aa58e61f11368ac4c0af7b Mon Sep 17 00:00:00 2001 From: "Isaac J. Manjarres" Date: Tue, 17 Jul 2018 12:35:29 -0700 Subject: stop_machine: Disable preemption after queueing stopper threads This commit: 9fb8d5dc4b64 ("stop_machine, Disable preemption when waking two stopper threads") does not fully address the race condition that can occur as follows: On one CPU, call it CPU 3, thread 1 invokes cpu_stop_queue_two_works(2, 3,...), and the execution is such that thread 1 queues the works for migration/2 and migration/3, and is preempted after releasing the locks for migration/2 and migration/3, but before waking the threads. Then, On CPU 2, a kworker, call it thread 2, is running, and it invokes cpu_stop_queue_two_works(1, 2,...), such that thread 2 queues the works for migration/1 and migration/2. Meanwhile, on CPU 3, thread 1 resumes execution, and wakes migration/2 and migration/3. This means that when CPU 2 releases the locks for migration/1 and migration/2, but before it wakes those threads, it can be preempted by migration/2. If thread 2 is preempted by migration/2, then migration/2 will execute the first work item successfully, since migration/3 was woken up by CPU 3, but when it goes to execute the second work item, it disables preemption, calls multi_cpu_stop(), and thus, CPU 2 will wait forever for migration/1, which should have been woken up by thread 2. However migration/1 cannot be woken up by thread 2, since it is a kworker, so it is affine to CPU 2, but CPU 2 is running migration/2 with preemption disabled, so thread 2 will never run. Disable preemption after queueing works for stopper threads to ensure that the operation of queueing the works and waking the stopper threads is atomic. Co-Developed-by: Prasad Sodagudi Co-Developed-by: Pavankumar Kondeti Signed-off-by: Isaac J. Manjarres Signed-off-by: Prasad Sodagudi Signed-off-by: Pavankumar Kondeti Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: bigeasy@linutronix.de Cc: gregkh@linuxfoundation.org Cc: matt@codeblueprint.co.uk Fixes: 9fb8d5dc4b64 ("stop_machine, Disable preemption when waking two stopper threads") Link: http://lkml.kernel.org/r/1531856129-9871-1-git-send-email-isaacm@codeaurora.org Signed-off-by: Ingo Molnar --- kernel/stop_machine.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 1ff523dae6e2..e190d1ef3a23 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -260,6 +260,15 @@ retry: err = 0; __cpu_stop_queue_work(stopper1, work1, &wakeq); __cpu_stop_queue_work(stopper2, work2, &wakeq); + /* + * The waking up of stopper threads has to happen + * in the same scheduling context as the queueing. + * Otherwise, there is a possibility of one of the + * above stoppers being woken up by another CPU, + * and preempting us. This will cause us to n ot + * wake up the other stopper forever. + */ + preempt_disable(); unlock: raw_spin_unlock(&stopper2->lock); raw_spin_unlock_irq(&stopper1->lock); @@ -271,7 +280,6 @@ unlock: } if (!err) { - preempt_disable(); wake_up_q(&wakeq); preempt_enable(); } -- cgit v1.2.1 From 840d719604b0925ca23dde95f1767e4528668369 Mon Sep 17 00:00:00 2001 From: Daniel Bristot de Oliveira Date: Fri, 20 Jul 2018 11:16:30 +0200 Subject: sched/deadline: Update rq_clock of later_rq when pushing a task Daniel Casini got this warn while running a DL task here at RetisLab: [ 461.137582] ------------[ cut here ]------------ [ 461.137583] rq->clock_update_flags < RQCF_ACT_SKIP [ 461.137599] WARNING: CPU: 4 PID: 2354 at kernel/sched/sched.h:967 assert_clock_updated.isra.32.part.33+0x17/0x20 [a ton of modules] [ 461.137646] CPU: 4 PID: 2354 Comm: label_image Not tainted 4.18.0-rc4+ #3 [ 461.137647] Hardware name: ASUS All Series/Z87-K, BIOS 0801 09/02/2013 [ 461.137649] RIP: 0010:assert_clock_updated.isra.32.part.33+0x17/0x20 [ 461.137649] Code: ff 48 89 83 08 09 00 00 eb c6 66 0f 1f 84 00 00 00 00 00 55 48 c7 c7 98 7a 6c a5 c6 05 bc 0d 54 01 01 48 89 e5 e8 a9 84 fb ff <0f> 0b 5d c3 0f 1f 44 00 00 0f 1f 44 00 00 83 7e 60 01 74 0a 48 3b [ 461.137673] RSP: 0018:ffffa77e08cafc68 EFLAGS: 00010082 [ 461.137674] RAX: 0000000000000000 RBX: ffff8b3fc1702d80 RCX: 0000000000000006 [ 461.137674] RDX: 0000000000000007 RSI: 0000000000000096 RDI: ffff8b3fded164b0 [ 461.137675] RBP: ffffa77e08cafc68 R08: 0000000000000026 R09: 0000000000000339 [ 461.137676] R10: ffff8b3fd060d410 R11: 0000000000000026 R12: ffffffffa4e14e20 [ 461.137677] R13: ffff8b3fdec22940 R14: ffff8b3fc1702da0 R15: ffff8b3fdec22940 [ 461.137678] FS: 00007efe43ee5700(0000) GS:ffff8b3fded00000(0000) knlGS:0000000000000000 [ 461.137679] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 461.137680] CR2: 00007efe30000010 CR3: 0000000301744003 CR4: 00000000001606e0 [ 461.137680] Call Trace: [ 461.137684] push_dl_task.part.46+0x3bc/0x460 [ 461.137686] task_woken_dl+0x60/0x80 [ 461.137689] ttwu_do_wakeup+0x4f/0x150 [ 461.137690] ttwu_do_activate+0x77/0x80 [ 461.137692] try_to_wake_up+0x1d6/0x4c0 [ 461.137693] wake_up_q+0x32/0x70 [ 461.137696] do_futex+0x7e7/0xb50 [ 461.137698] __x64_sys_futex+0x8b/0x180 [ 461.137701] do_syscall_64+0x5a/0x110 [ 461.137703] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [ 461.137705] RIP: 0033:0x7efe4918ca26 [ 461.137705] Code: 00 00 00 74 17 49 8b 48 20 44 8b 59 10 41 83 e3 30 41 83 fb 20 74 1e be 85 00 00 00 41 ba 01 00 00 00 41 b9 01 00 00 04 0f 05 <48> 3d 01 f0 ff ff 73 1f 31 c0 c3 be 8c 00 00 00 49 89 c8 4d 31 d2 [ 461.137738] RSP: 002b:00007efe43ee4928 EFLAGS: 00000283 ORIG_RAX: 00000000000000ca [ 461.137739] RAX: ffffffffffffffda RBX: 0000000005094df0 RCX: 00007efe4918ca26 [ 461.137740] RDX: 0000000000000001 RSI: 0000000000000085 RDI: 0000000005094e24 [ 461.137741] RBP: 00007efe43ee49c0 R08: 0000000005094e20 R09: 0000000004000001 [ 461.137741] R10: 0000000000000001 R11: 0000000000000283 R12: 0000000000000000 [ 461.137742] R13: 0000000005094df8 R14: 0000000000000001 R15: 0000000000448a10 [ 461.137743] ---[ end trace 187df4cad2bf7649 ]--- This warning happened in the push_dl_task(), because __add_running_bw()->cpufreq_update_util() is getting the rq_clock of the later_rq before its update, which takes place at activate_task(). The fix then is to update the rq_clock before calling add_running_bw(). To avoid double rq_clock_update() call, we set ENQUEUE_NOCLOCK flag to activate_task(). Reported-by: Daniel Casini Signed-off-by: Daniel Bristot de Oliveira Signed-off-by: Peter Zijlstra (Intel) Acked-by: Juri Lelli Cc: Clark Williams Cc: Linus Torvalds Cc: Luca Abeni Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Tommaso Cucinotta Fixes: e0367b12674b sched/deadline: Move CPU frequency selection triggering points Link: http://lkml.kernel.org/r/ca31d073a4788acf0684a8b255f14fea775ccf20.1532077269.git.bristot@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/deadline.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 10c7b51c0d1f..b5fbdde6afa9 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -2090,8 +2090,14 @@ retry: sub_rq_bw(&next_task->dl, &rq->dl); set_task_cpu(next_task, later_rq->cpu); add_rq_bw(&next_task->dl, &later_rq->dl); + + /* + * Update the later_rq clock here, because the clock is used + * by the cpufreq_update_util() inside __add_running_bw(). + */ + update_rq_clock(later_rq); add_running_bw(&next_task->dl, &later_rq->dl); - activate_task(later_rq, next_task, 0); + activate_task(later_rq, next_task, ENQUEUE_NOCLOCK); ret = 1; resched_curr(later_rq); -- cgit v1.2.1 From f3d133ee0a17d5694c6f21873eec9863e11fa423 Mon Sep 17 00:00:00 2001 From: Hailong Liu Date: Wed, 18 Jul 2018 08:46:55 +0800 Subject: sched/rt: Restore rt_runtime after disabling RT_RUNTIME_SHARE NO_RT_RUNTIME_SHARE feature is used to prevent a CPU borrow enough runtime with a spin-rt-task. However, if RT_RUNTIME_SHARE feature is enabled and rt_rq has borrowd enough rt_runtime at the beginning, rt_runtime can't be restored to its initial bandwidth rt_runtime after we disable RT_RUNTIME_SHARE. E.g. on my PC with 4 cores, procedure to reproduce: 1) Make sure RT_RUNTIME_SHARE is enabled cat /sys/kernel/debug/sched_features GENTLE_FAIR_SLEEPERS START_DEBIT NO_NEXT_BUDDY LAST_BUDDY CACHE_HOT_BUDDY WAKEUP_PREEMPTION NO_HRTICK NO_DOUBLE_TICK LB_BIAS NONTASK_CAPACITY TTWU_QUEUE NO_SIS_AVG_CPU SIS_PROP NO_WARN_DOUBLE_CLOCK RT_PUSH_IPI RT_RUNTIME_SHARE NO_LB_MIN ATTACH_AGE_LOAD WA_IDLE WA_WEIGHT WA_BIAS 2) Start a spin-rt-task ./loop_rr & 3) set affinity to the last cpu taskset -p 8 $pid_of_loop_rr 4) Observe that last cpu have borrowed enough runtime. cat /proc/sched_debug | grep rt_runtime .rt_runtime : 950.000000 .rt_runtime : 900.000000 .rt_runtime : 950.000000 .rt_runtime : 1000.000000 5) Disable RT_RUNTIME_SHARE echo NO_RT_RUNTIME_SHARE > /sys/kernel/debug/sched_features 6) Observe that rt_runtime can not been restored cat /proc/sched_debug | grep rt_runtime .rt_runtime : 950.000000 .rt_runtime : 900.000000 .rt_runtime : 950.000000 .rt_runtime : 1000.000000 This patch help to restore rt_runtime after we disable RT_RUNTIME_SHARE. Signed-off-by: Hailong Liu Signed-off-by: Jiang Biao Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: zhong.weidong@zte.com.cn Link: http://lkml.kernel.org/r/1531874815-39357-1-git-send-email-liu.hailong6@zte.com.cn Signed-off-by: Ingo Molnar --- kernel/sched/rt.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 572567078b60..eaaec8364f96 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -836,6 +836,8 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) * can be time-consuming. Try to avoid it when possible. */ raw_spin_lock(&rt_rq->rt_runtime_lock); + if (!sched_feat(RT_RUNTIME_SHARE) && rt_rq->rt_runtime != RUNTIME_INF) + rt_rq->rt_runtime = rt_b->rt_runtime; skip = !rt_rq->rt_time && !rt_rq->rt_nr_running; raw_spin_unlock(&rt_rq->rt_runtime_lock); if (skip) -- cgit v1.2.1 From 6cbc304f2f360f25cc8607817239d6f4a2fd3dc5 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 10 May 2018 15:48:41 +0200 Subject: perf/x86/intel: Fix unwind errors from PEBS entries (mk-II) Vince reported the perf_fuzzer giving various unwinder warnings and Josh reported: > Deja vu. Most of these are related to perf PEBS, similar to the > following issue: > > b8000586c90b ("perf/x86/intel: Cure bogus unwind from PEBS entries") > > This is basically the ORC version of that. setup_pebs_sample_data() is > assembling a franken-pt_regs which ORC isn't happy about. RIP is > inconsistent with some of the other registers (like RSP and RBP). And where the previous unwinder only needed BP,SP ORC also requires IP. But we cannot spoof IP because then the sample will get displaced, entirely negating the point of PEBS. So cure the whole thing differently by doing the unwind early; this does however require a means to communicate we did the unwind early. We (ab)use an unused sample_type bit for this, which we set on events that fill out the data->callchain before the normal perf_prepare_sample(). Debugged-by: Josh Poimboeuf Reported-by: Vince Weaver Tested-by: Josh Poimboeuf Tested-by: Prashant Bhole Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- kernel/events/core.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 8f0434a9951a..cdb32cf8e33c 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6343,7 +6343,7 @@ static u64 perf_virt_to_phys(u64 virt) static struct perf_callchain_entry __empty_callchain = { .nr = 0, }; -static struct perf_callchain_entry * +struct perf_callchain_entry * perf_callchain(struct perf_event *event, struct pt_regs *regs) { bool kernel = !event->attr.exclude_callchain_kernel; @@ -6382,7 +6382,9 @@ void perf_prepare_sample(struct perf_event_header *header, if (sample_type & PERF_SAMPLE_CALLCHAIN) { int size = 1; - data->callchain = perf_callchain(event, regs); + if (!(sample_type & __PERF_SAMPLE_CALLCHAIN_EARLY)) + data->callchain = perf_callchain(event, regs); + size += data->callchain->nr; header->size += size * sizeof(u64); -- cgit v1.2.1 From 7f635ff187ab6be0b350b3ec06791e376af238ab Mon Sep 17 00:00:00 2001 From: Mathieu Poirier Date: Mon, 16 Jul 2018 17:13:51 -0600 Subject: perf/core: Fix crash when using HW tracing kernel filters In function perf_event_parse_addr_filter(), the path::dentry of each struct perf_addr_filter is left unassigned (as it should be) when the pattern being parsed is related to kernel space. But in function perf_addr_filter_match() the same dentries are given to d_inode() where the value is not expected to be NULL, resulting in the following splat: Unable to handle kernel NULL pointer dereference at virtual address 0000000000000058 pc : perf_event_mmap+0x2fc/0x5a0 lr : perf_event_mmap+0x2c8/0x5a0 Process uname (pid: 2860, stack limit = 0x000000001cbcca37) Call trace: perf_event_mmap+0x2fc/0x5a0 mmap_region+0x124/0x570 do_mmap+0x344/0x4f8 vm_mmap_pgoff+0xe4/0x110 vm_mmap+0x2c/0x40 elf_map+0x60/0x108 load_elf_binary+0x450/0x12c4 search_binary_handler+0x90/0x290 __do_execve_file.isra.13+0x6e4/0x858 sys_execve+0x3c/0x50 el0_svc_naked+0x30/0x34 This patch is fixing the problem by introducing a new check in function perf_addr_filter_match() to see if the filter's dentry is NULL. Signed-off-by: Mathieu Poirier Signed-off-by: Peter Zijlstra (Intel) Acked-by: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: acme@kernel.org Cc: miklos@szeredi.hu Cc: namhyung@kernel.org Cc: songliubraving@fb.com Fixes: 9511bce9fe8e ("perf/core: Fix bad use of igrab()") Link: http://lkml.kernel.org/r/1531782831-1186-1-git-send-email-mathieu.poirier@linaro.org Signed-off-by: Ingo Molnar --- kernel/events/core.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index cdb32cf8e33c..eec2d5fb676b 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7337,6 +7337,10 @@ static bool perf_addr_filter_match(struct perf_addr_filter *filter, struct file *file, unsigned long offset, unsigned long size) { + /* d_inode(NULL) won't be equal to any mapped user-space file */ + if (!filter->path.dentry) + return false; + if (d_inode(filter->path.dentry) != file_inode(file)) return false; -- cgit v1.2.1 From 1863c387259b629e4ebfb255495f67cd06aa229b Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Tue, 24 Jul 2018 19:13:31 -0400 Subject: tracing: Fix double free of event_trigger_data Running the following: # cd /sys/kernel/debug/tracing # echo 500000 > buffer_size_kb [ Or some other number that takes up most of memory ] # echo snapshot > events/sched/sched_switch/trigger Triggers the following bug: ------------[ cut here ]------------ kernel BUG at mm/slub.c:296! invalid opcode: 0000 [#1] SMP DEBUG_PAGEALLOC PTI CPU: 6 PID: 6878 Comm: bash Not tainted 4.18.0-rc6-test+ #1066 Hardware name: Hewlett-Packard HP Compaq Pro 6300 SFF/339A, BIOS K01 v03.03 07/14/2016 RIP: 0010:kfree+0x16c/0x180 Code: 05 41 0f b6 72 51 5b 5d 41 5c 4c 89 d7 e9 ac b3 f8 ff 48 89 d9 48 89 da 41 b8 01 00 00 00 5b 5d 41 5c 4c 89 d6 e9 f4 f3 ff ff <0f> 0b 0f 0b 48 8b 3d d9 d8 f9 00 e9 c1 fe ff ff 0f 1f 40 00 0f 1f RSP: 0018:ffffb654436d3d88 EFLAGS: 00010246 RAX: ffff91a9d50f3d80 RBX: ffff91a9d50f3d80 RCX: ffff91a9d50f3d80 RDX: 00000000000006a4 RSI: ffff91a9de5a60e0 RDI: ffff91a9d9803500 RBP: ffffffff8d267c80 R08: 00000000000260e0 R09: ffffffff8c1a56be R10: fffff0d404543cc0 R11: 0000000000000389 R12: ffffffff8c1a56be R13: ffff91a9d9930e18 R14: ffff91a98c0c2890 R15: ffffffff8d267d00 FS: 00007f363ea64700(0000) GS:ffff91a9de580000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 000055c1cacc8e10 CR3: 00000000d9b46003 CR4: 00000000001606e0 Call Trace: event_trigger_callback+0xee/0x1d0 event_trigger_write+0xfc/0x1a0 __vfs_write+0x33/0x190 ? handle_mm_fault+0x115/0x230 ? _cond_resched+0x16/0x40 vfs_write+0xb0/0x190 ksys_write+0x52/0xc0 do_syscall_64+0x5a/0x160 entry_SYSCALL_64_after_hwframe+0x49/0xbe RIP: 0033:0x7f363e16ab50 Code: 73 01 c3 48 8b 0d 38 83 2c 00 f7 d8 64 89 01 48 83 c8 ff c3 66 0f 1f 44 00 00 83 3d 79 db 2c 00 00 75 10 b8 01 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 31 c3 48 83 ec 08 e8 1e e3 01 00 48 89 04 24 RSP: 002b:00007fff9a4c6378 EFLAGS: 00000246 ORIG_RAX: 0000000000000001 RAX: ffffffffffffffda RBX: 0000000000000009 RCX: 00007f363e16ab50 RDX: 0000000000000009 RSI: 000055c1cacc8e10 RDI: 0000000000000001 RBP: 000055c1cacc8e10 R08: 00007f363e435740 R09: 00007f363ea64700 R10: 0000000000000073 R11: 0000000000000246 R12: 0000000000000009 R13: 0000000000000001 R14: 00007f363e4345e0 R15: 00007f363e4303c0 Modules linked in: ip6table_filter ip6_tables snd_hda_codec_hdmi snd_hda_codec_realtek snd_hda_codec_generic snd_hda_intel snd_hda_codec snd_hwdep snd_hda_core snd_seq snd_seq_device i915 snd_pcm snd_timer i2c_i801 snd soundcore i2c_algo_bit drm_kms_helper 86_pkg_temp_thermal video kvm_intel kvm irqbypass wmi e1000e ---[ end trace d301afa879ddfa25 ]--- The cause is because the register_snapshot_trigger() call failed to allocate the snapshot buffer, and then called unregister_trigger() which freed the data that was passed to it. Then on return to the function that called register_snapshot_trigger(), as it sees it failed to register, it frees the trigger_data again and causes a double free. By calling event_trigger_init() on the trigger_data (which only ups the reference counter for it), and then event_trigger_free() afterward, the trigger_data would not get freed by the registering trigger function as it would only up and lower the ref count for it. If the register trigger function fails, then the event_trigger_free() called after it will free the trigger data normally. Link: http://lkml.kernel.org/r/20180724191331.738eb819@gandalf.local.home Cc: stable@vger.kerne.org Fixes: 93e31ffbf417 ("tracing: Add 'snapshot' event trigger command") Reported-by: Masami Hiramatsu Reviewed-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_trigger.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index d18249683682..d18ec0e58be2 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -679,6 +679,8 @@ event_trigger_callback(struct event_command *cmd_ops, goto out_free; out_reg: + /* Up the trigger_data count to make sure reg doesn't free it on failure */ + event_trigger_init(trigger_ops, trigger_data); ret = cmd_ops->reg(glob, trigger_ops, trigger_data, file); /* * The above returns on success the # of functions enabled, @@ -686,11 +688,13 @@ event_trigger_callback(struct event_command *cmd_ops, * Consider no functions a failure too. */ if (!ret) { + cmd_ops->unreg(glob, trigger_ops, trigger_data, file); ret = -ENOENT; - goto out_free; - } else if (ret < 0) - goto out_free; - ret = 0; + } else if (ret > 0) + ret = 0; + + /* Down the counter of trigger_data or free it if not used anymore */ + event_trigger_free(trigger_ops, trigger_data); out: return ret; -- cgit v1.2.1 From 73c8d8945505acdcbae137c2e00a1232e0be709f Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Sat, 14 Jul 2018 01:28:15 +0900 Subject: ring_buffer: tracing: Inherit the tracing setting to next ring buffer Maintain the tracing on/off setting of the ring_buffer when switching to the trace buffer snapshot. Taking a snapshot is done by swapping the backup ring buffer (max_tr_buffer). But since the tracing on/off setting is defined by the ring buffer, when swapping it, the tracing on/off setting can also be changed. This causes a strange result like below: /sys/kernel/debug/tracing # cat tracing_on 1 /sys/kernel/debug/tracing # echo 0 > tracing_on /sys/kernel/debug/tracing # cat tracing_on 0 /sys/kernel/debug/tracing # echo 1 > snapshot /sys/kernel/debug/tracing # cat tracing_on 1 /sys/kernel/debug/tracing # echo 1 > snapshot /sys/kernel/debug/tracing # cat tracing_on 0 We don't touch tracing_on, but snapshot changes tracing_on setting each time. This is an anomaly, because user doesn't know that each "ring_buffer" stores its own tracing-enable state and the snapshot is done by swapping ring buffers. Link: http://lkml.kernel.org/r/153149929558.11274.11730609978254724394.stgit@devbox Cc: Ingo Molnar Cc: Shuah Khan Cc: Tom Zanussi Cc: Hiraku Toyooka Cc: stable@vger.kernel.org Fixes: debdd57f5145 ("tracing: Make a snapshot feature available from userspace") Signed-off-by: Masami Hiramatsu [ Updated commit log and comment in the code ] Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ring_buffer.c | 16 ++++++++++++++++ kernel/trace/trace.c | 6 ++++++ 2 files changed, 22 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 6a46af21765c..0b0b688ea166 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -3226,6 +3226,22 @@ int ring_buffer_record_is_on(struct ring_buffer *buffer) return !atomic_read(&buffer->record_disabled); } +/** + * ring_buffer_record_is_set_on - return true if the ring buffer is set writable + * @buffer: The ring buffer to see if write is set enabled + * + * Returns true if the ring buffer is set writable by ring_buffer_record_on(). + * Note that this does NOT mean it is in a writable state. + * + * It may return true when the ring buffer has been disabled by + * ring_buffer_record_disable(), as that is a temporary disabling of + * the ring buffer. + */ +int ring_buffer_record_is_set_on(struct ring_buffer *buffer) +{ + return !(atomic_read(&buffer->record_disabled) & RB_BUFFER_OFF); +} + /** * ring_buffer_record_disable_cpu - stop all writes into the cpu_buffer * @buffer: The ring buffer to stop writes to. diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 87cf25171fb8..823687997b01 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1373,6 +1373,12 @@ update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) arch_spin_lock(&tr->max_lock); + /* Inherit the recordable setting from trace_buffer */ + if (ring_buffer_record_is_set_on(tr->trace_buffer.buffer)) + ring_buffer_record_on(tr->max_buffer.buffer); + else + ring_buffer_record_off(tr->max_buffer.buffer); + swap(tr->trace_buffer.buffer, tr->max_buffer.buffer); __update_max_tr(tr, tsk, cpu); -- cgit v1.2.1 From 57ea2a34adf40f3a6e88409aafcf803b8945619a Mon Sep 17 00:00:00 2001 From: Artem Savkov Date: Wed, 25 Jul 2018 16:20:38 +0200 Subject: tracing/kprobes: Fix trace_probe flags on enable_trace_kprobe() failure If enable_trace_kprobe fails to enable the probe in enable_k(ret)probe it returns an error, but does not unset the tp flags it set previously. This results in a probe being considered enabled and failures like being unable to remove the probe through kprobe_events file since probes_open() expects every probe to be disabled. Link: http://lkml.kernel.org/r/20180725102826.8300-1-asavkov@redhat.com Link: http://lkml.kernel.org/r/20180725142038.4765-1-asavkov@redhat.com Cc: Ingo Molnar Cc: stable@vger.kernel.org Fixes: 41a7dd420c57 ("tracing/kprobes: Support ftrace_event_file base multibuffer") Acked-by: Masami Hiramatsu Reviewed-by: Josh Poimboeuf Signed-off-by: Artem Savkov Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_kprobe.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 21f718472942..27ace4513c43 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -400,11 +400,10 @@ static struct trace_kprobe *find_trace_kprobe(const char *event, static int enable_trace_kprobe(struct trace_kprobe *tk, struct trace_event_file *file) { + struct event_file_link *link; int ret = 0; if (file) { - struct event_file_link *link; - link = kmalloc(sizeof(*link), GFP_KERNEL); if (!link) { ret = -ENOMEM; @@ -424,6 +423,16 @@ enable_trace_kprobe(struct trace_kprobe *tk, struct trace_event_file *file) else ret = enable_kprobe(&tk->rp.kp); } + + if (ret) { + if (file) { + list_del_rcu(&link->list); + kfree(link); + tk->tp.flags &= ~TP_FLAG_TRACE; + } else { + tk->tp.flags &= ~TP_FLAG_PROFILE; + } + } out: return ret; } -- cgit v1.2.1 From 15cc78644d0075e76d59476a4467e7143860f660 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Wed, 25 Jul 2018 16:02:06 -0400 Subject: tracing: Fix possible double free in event_enable_trigger_func() There was a case that triggered a double free in event_trigger_callback() due to the called reg() function freeing the trigger_data and then it getting freed again by the error return by the caller. The solution there was to up the trigger_data ref count. Code inspection found that event_enable_trigger_func() has the same issue, but is not as easy to trigger (requires harder to trigger failures). It needs to be solved slightly different as it needs more to clean up when the reg() function fails. Link: http://lkml.kernel.org/r/20180725124008.7008e586@gandalf.local.home Cc: stable@vger.kernel.org Fixes: 7862ad1846e99 ("tracing: Add 'enable_event' and 'disable_event' event trigger commands") Reivewed-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_trigger.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index d18ec0e58be2..5dea177cef53 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -1420,6 +1420,9 @@ int event_enable_trigger_func(struct event_command *cmd_ops, goto out; } + /* Up the trigger_data count to make sure nothing frees it on failure */ + event_trigger_init(trigger_ops, trigger_data); + if (trigger) { number = strsep(&trigger, ":"); @@ -1470,6 +1473,7 @@ int event_enable_trigger_func(struct event_command *cmd_ops, goto out_disable; /* Just return zero, not the number of enabled functions */ ret = 0; + event_trigger_free(trigger_ops, trigger_data); out: return ret; @@ -1480,7 +1484,7 @@ int event_enable_trigger_func(struct event_command *cmd_ops, out_free: if (cmd_ops->set_filter) cmd_ops->set_filter(NULL, trigger_data, NULL); - kfree(trigger_data); + event_trigger_free(trigger_ops, trigger_data); kfree(enable_data); goto out; } -- cgit v1.2.1 From 2519c1bbe38d7acacc9aacba303ca6f97482ed53 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Wed, 25 Jul 2018 22:28:56 -0400 Subject: tracing: Quiet gcc warning about maybe unused link variable Commit 57ea2a34adf4 ("tracing/kprobes: Fix trace_probe flags on enable_trace_kprobe() failure") added an if statement that depends on another if statement that gcc doesn't see will initialize the "link" variable and gives the warning: "warning: 'link' may be used uninitialized in this function" It is really a false positive, but to quiet the warning, and also to make sure that it never actually is used uninitialized, initialize the "link" variable to NULL and add an if (!WARN_ON_ONCE(!link)) where the compiler thinks it could be used uninitialized. Cc: stable@vger.kernel.org Fixes: 57ea2a34adf4 ("tracing/kprobes: Fix trace_probe flags on enable_trace_kprobe() failure") Reported-by: kbuild test robot Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_kprobe.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 27ace4513c43..6b71860f3998 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -400,7 +400,7 @@ static struct trace_kprobe *find_trace_kprobe(const char *event, static int enable_trace_kprobe(struct trace_kprobe *tk, struct trace_event_file *file) { - struct event_file_link *link; + struct event_file_link *link = NULL; int ret = 0; if (file) { @@ -426,7 +426,9 @@ enable_trace_kprobe(struct trace_kprobe *tk, struct trace_event_file *file) if (ret) { if (file) { - list_del_rcu(&link->list); + /* Notice the if is true on not WARN() */ + if (!WARN_ON_ONCE(!link)) + list_del_rcu(&link->list); kfree(link); tk->tp.flags &= ~TP_FLAG_TRACE; } else { -- cgit v1.2.1 From 3e536e222f2930534c252c1cc7ae799c725c5ff9 Mon Sep 17 00:00:00 2001 From: Snild Dolkow Date: Thu, 26 Jul 2018 09:15:39 +0200 Subject: kthread, tracing: Don't expose half-written comm when creating kthreads There is a window for racing when printing directly to task->comm, allowing other threads to see a non-terminated string. The vsnprintf function fills the buffer, counts the truncated chars, then finally writes the \0 at the end. creator other vsnprintf: fill (not terminated) count the rest trace_sched_waking(p): ... memcpy(comm, p->comm, TASK_COMM_LEN) write \0 The consequences depend on how 'other' uses the string. In our case, it was copied into the tracing system's saved cmdlines, a buffer of adjacent TASK_COMM_LEN-byte buffers (note the 'n' where 0 should be): crash-arm64> x/1024s savedcmd->saved_cmdlines | grep 'evenk' 0xffffffd5b3818640: "irq/497-pwr_evenkworker/u16:12" ...and a strcpy out of there would cause stack corruption: [224761.522292] Kernel panic - not syncing: stack-protector: Kernel stack is corrupted in: ffffff9bf9783c78 crash-arm64> kbt | grep 'comm\|trace_print_context' #6 0xffffff9bf9783c78 in trace_print_context+0x18c(+396) comm (char [16]) = "irq/497-pwr_even" crash-arm64> rd 0xffffffd4d0e17d14 8 ffffffd4d0e17d14: 2f71726900000000 5f7277702d373934 ....irq/497-pwr_ ffffffd4d0e17d24: 726f776b6e657665 3a3631752f72656b evenkworker/u16: ffffffd4d0e17d34: f9780248ff003231 cede60e0ffffff9b 12..H.x......`.. ffffffd4d0e17d44: cede60c8ffffffd4 00000fffffffffd4 .....`.......... The workaround in e09e28671 (use strlcpy in __trace_find_cmdline) was likely needed because of this same bug. Solved by vsnprintf:ing to a local buffer, then using set_task_comm(). This way, there won't be a window where comm is not terminated. Link: http://lkml.kernel.org/r/20180726071539.188015-1-snild@sony.com Cc: stable@vger.kernel.org Fixes: bc0c38d139ec7 ("ftrace: latency tracer infrastructure") Reviewed-by: Steven Rostedt (VMware) Signed-off-by: Snild Dolkow Signed-off-by: Steven Rostedt (VMware) --- kernel/kthread.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kthread.c b/kernel/kthread.c index 750cb8082694..486dedbd9af5 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -325,8 +325,14 @@ struct task_struct *__kthread_create_on_node(int (*threadfn)(void *data), task = create->result; if (!IS_ERR(task)) { static const struct sched_param param = { .sched_priority = 0 }; + char name[TASK_COMM_LEN]; - vsnprintf(task->comm, sizeof(task->comm), namefmt, args); + /* + * task is already visible to other tasks, so updating + * COMM must be protected. + */ + vsnprintf(name, sizeof(name), namefmt, args); + set_task_comm(task, name); /* * root may have changed our (kthreadd's) priority or CPU mask. * The kernel thread should not inherit these properties. -- cgit v1.2.1 From 5f300e8004cb80182a24c0fa488218a4a43e6aac Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 26 Jul 2018 09:57:59 -0700 Subject: bpf: btf: Use exact btf value_size match in map_check_btf() The current map_check_btf() in BPF_MAP_TYPE_ARRAY rejects '> map->value_size' to ensure map_seq_show_elem() will not access things beyond an array element. Yonghong suggested that using '!=' is a more correct check. The 8 bytes round_up on value_size is stored in array->elem_size. Hence, using '!=' on map->value_size is a proper check. This patch also adds new tests to check the btf array key type and value type. Two of these new tests verify the btf's value_size (the change in this patch). It also fixes two existing tests that wrongly encoded a btf's type size (pprint_test) and the value_type_id (in one of the raw_tests[]). However, that do not affect these two BTF verification tests before or after this test changes. These two tests mainly failed at array creation time after this patch. Fixes: a26ca7c982cb ("bpf: btf: Add pretty print support to the basic arraymap") Suggested-by: Yonghong Song Acked-by: Yonghong Song Signed-off-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann --- kernel/bpf/arraymap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 544e58f5f642..2aa55d030c77 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -378,7 +378,7 @@ static int array_map_check_btf(const struct bpf_map *map, const struct btf *btf, return -EINVAL; value_type = btf_type_id_size(btf, &btf_value_id, &value_size); - if (!value_type || value_size > map->value_size) + if (!value_type || value_size != map->value_size) return -EINVAL; return 0; -- cgit v1.2.1 From 15d36fecd0bdc7510b70a0e5ec6671140b3fce0c Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Thu, 26 Jul 2018 16:37:15 -0700 Subject: mm: disallow mappings that conflict for devm_memremap_pages() When pmem namespaces created are smaller than section size, this can cause an issue during removal and gpf was observed: general protection fault: 0000 1 SMP PTI CPU: 36 PID: 3941 Comm: ndctl Tainted: G W 4.14.28-1.el7uek.x86_64 #2 task: ffff88acda150000 task.stack: ffffc900233a4000 RIP: 0010:__put_page+0x56/0x79 Call Trace: devm_memremap_pages_release+0x155/0x23a release_nodes+0x21e/0x260 devres_release_all+0x3c/0x48 device_release_driver_internal+0x15c/0x207 device_release_driver+0x12/0x14 unbind_store+0xba/0xd8 drv_attr_store+0x27/0x31 sysfs_kf_write+0x3f/0x46 kernfs_fop_write+0x10f/0x18b __vfs_write+0x3a/0x16d vfs_write+0xb2/0x1a1 SyS_write+0x55/0xb9 do_syscall_64+0x79/0x1ae entry_SYSCALL_64_after_hwframe+0x3d/0x0 Add code to check whether we have a mapping already in the same section and prevent additional mappings from being created if that is the case. Link: http://lkml.kernel.org/r/152909478401.50143.312364396244072931.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Dave Jiang Cc: Dan Williams Cc: Robert Elliott Cc: Jeff Moyer Cc: Matthew Wilcox Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/memremap.c | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/memremap.c b/kernel/memremap.c index 5857267a4af5..a734b1747466 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -176,10 +176,27 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) unsigned long pfn, pgoff, order; pgprot_t pgprot = PAGE_KERNEL; int error, nid, is_ram; + struct dev_pagemap *conflict_pgmap; align_start = res->start & ~(SECTION_SIZE - 1); align_size = ALIGN(res->start + resource_size(res), SECTION_SIZE) - align_start; + align_end = align_start + align_size - 1; + + conflict_pgmap = get_dev_pagemap(PHYS_PFN(align_start), NULL); + if (conflict_pgmap) { + dev_WARN(dev, "Conflicting mapping in same section\n"); + put_dev_pagemap(conflict_pgmap); + return ERR_PTR(-ENOMEM); + } + + conflict_pgmap = get_dev_pagemap(PHYS_PFN(align_end), NULL); + if (conflict_pgmap) { + dev_WARN(dev, "Conflicting mapping in same section\n"); + put_dev_pagemap(conflict_pgmap); + return ERR_PTR(-ENOMEM); + } + is_ram = region_intersects(align_start, align_size, IORESOURCE_SYSTEM_RAM, IORES_DESC_NONE); @@ -199,7 +216,6 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) mutex_lock(&pgmap_lock); error = 0; - align_end = align_start + align_size - 1; foreach_order_pgoff(res, order, pgoff) { error = __radix_tree_insert(&pgmap_radix, -- cgit v1.2.1 From 31c5bda3a656089f01963d290a40ccda181f816e Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Thu, 26 Jul 2018 16:37:22 -0700 Subject: mm: fix exports that inadvertently make put_page() EXPORT_SYMBOL_GPL MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit e76384884344 ("mm: introduce MEMORY_DEVICE_FS_DAX and CONFIG_DEV_PAGEMAP_OPS") added two EXPORT_SYMBOL_GPL() symbols, but these symbols are required by the inlined put_page(), thus accidentally making put_page() a GPL export only. This breaks OpenAFS (at least). Mark them EXPORT_SYMBOL() instead. Link: http://lkml.kernel.org/r/153128611970.2928.11310692420711601254.stgit@dwillia2-desk3.amr.corp.intel.com Fixes: e76384884344 ("mm: introduce MEMORY_DEVICE_FS_DAX and CONFIG_DEV_PAGEMAP_OPS") Signed-off-by: Dan Williams Reported-by: Joe Gorse Reported-by: John Hubbard Tested-by: Joe Gorse Tested-by: John Hubbard Cc: Jérôme Glisse Cc: Mark Vitale Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/memremap.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/memremap.c b/kernel/memremap.c index a734b1747466..38283363da06 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -321,7 +321,7 @@ EXPORT_SYMBOL_GPL(get_dev_pagemap); #ifdef CONFIG_DEV_PAGEMAP_OPS DEFINE_STATIC_KEY_FALSE(devmap_managed_key); -EXPORT_SYMBOL_GPL(devmap_managed_key); +EXPORT_SYMBOL(devmap_managed_key); static atomic_t devmap_enable; /* @@ -362,5 +362,5 @@ void __put_devmap_managed_page(struct page *page) } else if (!count) __put_page(page); } -EXPORT_SYMBOL_GPL(__put_devmap_managed_page); +EXPORT_SYMBOL(__put_devmap_managed_page); #endif /* CONFIG_DEV_PAGEMAP_OPS */ -- cgit v1.2.1 From 027232da7c7c1c7f04383f93bd798e475dde5285 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Thu, 26 Jul 2018 16:37:25 -0700 Subject: mm: introduce vma_init() Not all VMAs allocated with vm_area_alloc(). Some of them allocated on stack or in data segment. The new helper can be use to initialize VMA properly regardless where it was allocated. Link: http://lkml.kernel.org/r/20180724121139.62570-2-kirill.shutemov@linux.intel.com Signed-off-by: Kirill A. Shutemov Acked-by: Linus Torvalds Reviewed-by: Andrew Morton Cc: Dmitry Vyukov Cc: Oleg Nesterov Cc: Andrea Arcangeli Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index a191c05e757d..1b27babc4c78 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -312,10 +312,8 @@ struct vm_area_struct *vm_area_alloc(struct mm_struct *mm) { struct vm_area_struct *vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); - if (vma) { - vma->vm_mm = mm; - INIT_LIST_HEAD(&vma->anon_vma_chain); - } + if (vma) + vma_init(vma, mm); return vma; } -- cgit v1.2.1 From b305f7ed0f4f494ad6f3ef5667501535d5a8fa31 Mon Sep 17 00:00:00 2001 From: Yi Wang Date: Wed, 25 Jul 2018 10:26:19 +0800 Subject: audit: fix potential null dereference 'context->module.name' The variable 'context->module.name' may be null pointer when kmalloc return null, so it's better to check it before using to avoid null dereference. Another one more thing this patch does is using kstrdup instead of (kmalloc + strcpy), and signal a lost record via audit_log_lost. Cc: stable@vger.kernel.org # 4.11 Signed-off-by: Yi Wang Reviewed-by: Jiang Biao Reviewed-by: Richard Guy Briggs Signed-off-by: Paul Moore --- kernel/auditsc.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index ceb1c4596c51..80d672a11088 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1279,8 +1279,12 @@ static void show_special(struct audit_context *context, int *call_panic) break; case AUDIT_KERN_MODULE: audit_log_format(ab, "name="); - audit_log_untrustedstring(ab, context->module.name); - kfree(context->module.name); + if (context->module.name) { + audit_log_untrustedstring(ab, context->module.name); + kfree(context->module.name); + } else + audit_log_format(ab, "(null)"); + break; } audit_log_end(ab); @@ -2411,8 +2415,9 @@ void __audit_log_kern_module(char *name) { struct audit_context *context = audit_context(); - context->module.name = kmalloc(strlen(name) + 1, GFP_KERNEL); - strcpy(context->module.name, name); + context->module.name = kstrdup(name, GFP_KERNEL); + if (!context->module.name) + audit_log_lost("out of memory in __audit_log_kern_module"); context->type = AUDIT_KERN_MODULE; } -- cgit v1.2.1