From 720abae3d68ae966044497dd9ee5ccf3b16e700c Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Thu, 5 Nov 2015 18:44:18 -0800 Subject: rcu: force alignment on struct callback_head/rcu_head

Make struct callback_head aligned to the size of a pointer. On most architectures this happens naturally due to ABI requirements, but some architectures (like CRIS) have a weird ABI, so we need to ask for it explicitly.

The alignment is required to guarantee that bits 0 and 1 of @next will be clear under normal conditions -- as long as we use call_rcu(), call_rcu_bh(), call_rcu_sched(), or call_srcu() to queue the callback. This guarantee is important for a few reasons:

- the future call_rcu_lazy() will make use of the lower bits in the pointer;
- the structure shares storage space in struct page with @compound_head, which encodes PageTail() in bit 0; the guarantee is needed to avoid a false-positive PageTail().

A false-positive PageTail() caused a crash on crisv32 [1]. It happened due to a misaligned task_struct->rcu, which was byte-aligned.

[1] http://lkml.kernel.org/r/55FAEA67.9000102@roeck-us.net

Signed-off-by: Kirill A. Shutemov Reported-by: Guenter Roeck Tested-by: Guenter Roeck Acked-by: Paul E. McKenney Cc: Mikael Starvik Cc: Jesper Nilsson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/types.h | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/types.h b/include/linux/types.h index c314989d9158..70d8500bddf1 100644 --- a/include/linux/types.h +++ b/include/linux/types.h @@ -205,11 +205,25 @@ struct ustat { * struct callback_head - callback structure for use with RCU and task_work * @next: next update requests in a list * @func: actual update function to call after the grace period. + * + * The struct is aligned to size of pointer. On most architectures it happens + * naturally due ABI requirements, but some architectures (like CRIS) have + * weird ABI and we need to ask it explicitly. + * + * The alignment is required to guarantee that bits 0 and 1 of @next will be + * clear under normal conditions -- as long as we use call_rcu(), + * call_rcu_bh(), call_rcu_sched(), or call_srcu() to queue callback. + * + * This guarantee is important for few reasons: + * - future call_rcu_lazy() will make use of lower bits in the pointer; + * - the structure shares storage spacer in struct page with @compound_head, + * which encode PageTail() in bit 0. The guarantee is needed to avoid + * false-positive PageTail(). */ struct callback_head { struct callback_head *next; void (*func)(struct callback_head *head); -}; +} __attribute__((aligned(sizeof(void *)))); #define rcu_head callback_head typedef void (*rcu_callback_t)(struct rcu_head *head); -- cgit v1.2.1

From 55537871ef666b4153fd1ef8782e4a13fee142cc Mon Sep 17 00:00:00 2001 From: Jiri Kosina Date: Thu, 5 Nov 2015 18:44:41 -0800 Subject: kernel/watchdog.c: perform all-CPU backtrace in case of hard lockup

In many hardlockup reports it's actually not possible to know why the lockup triggered, because the CPU that got stuck is usually waiting on a resource (with IRQs disabled) that some other CPU is holding. IOW, we are often looking at the stacktrace of the victim rather than that of the actual offender.

Introduce a sysctl / cmdline parameter that makes it possible to have the hardlockup detector perform an all-CPU backtrace.
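The new sysctl_hardlockup_all_cpu_backtrace flag declared below would then be wired into the sysctl table roughly as follows -- a minimal sketch modeled on the existing softlockup_all_cpu_backtrace entry (the actual table lives in kernel/sysctl.c, outside the include/ hunk shown here):

    /* hypothetical sysctl table entry for the new knob (0 = off, 1 = on) */
    static int zero;
    static int one = 1;

    {
        .procname     = "hardlockup_all_cpu_backtrace",
        .data         = &sysctl_hardlockup_all_cpu_backtrace,
        .maxlen       = sizeof(int),
        .mode         = 0644,
        .proc_handler = proc_dointvec_minmax,
        .extra1       = &zero,   /* clamp the value to the 0..1 range */
        .extra2       = &one,
    },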
Signed-off-by: Jiri Kosina Reviewed-by: Aaron Tomlin Cc: Ulrich Obergfell Acked-by: Don Zickus Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/nmi.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/nmi.h b/include/linux/nmi.h index 78488e099ce7..7ec5b86735f3 100644 --- a/include/linux/nmi.h +++ b/include/linux/nmi.h @@ -73,6 +73,7 @@ extern int watchdog_user_enabled; extern int watchdog_thresh; extern unsigned long *watchdog_cpumask_bits; extern int sysctl_softlockup_all_cpu_backtrace; +extern int sysctl_hardlockup_all_cpu_backtrace; struct ctl_table; extern int proc_watchdog(struct ctl_table *, int , void __user *, size_t *, loff_t *); -- cgit v1.2.1

From ac1f591249d95372f3a5ab3828d4af5dfbf5efd3 Mon Sep 17 00:00:00 2001 From: Don Zickus Date: Thu, 5 Nov 2015 18:44:44 -0800 Subject: kernel/watchdog.c: add sysctl knob hardlockup_panic

The only way to make a hardlockup panic the machine is to set 'nmi_watchdog=panic' on the kernel command line. This makes it awkward for end users and for folks who want to run automated tests (like myself). Mimic the softlockup_panic knob and create a /proc/sys/kernel/hardlockup_panic knob.

Signed-off-by: Don Zickus Cc: Ulrich Obergfell Acked-by: Jiri Kosina Reviewed-by: Aaron Tomlin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index c115d617739d..5423b9c82fee 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -384,6 +384,7 @@ extern int proc_dowatchdog_thresh(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); extern unsigned int softlockup_panic; +extern unsigned int hardlockup_panic; void lockup_detector_init(void); #else static inline void touch_softlockup_watchdog(void) -- cgit v1.2.1

From fda901241fb89449244537db4fb27b06e491b74f Mon Sep 17 00:00:00 2001 From: Denis Kirjanov Date: Thu, 5 Nov 2015 18:44:59 -0800 Subject: slab: convert slab_is_available() to boolean

slab_is_available() only ever reports success or failure, so it is a good candidate for returning a boolean result.

Signed-off-by: Denis Kirjanov Cc: Christoph Lameter Reviewed-by: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/slab.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/slab.h b/include/linux/slab.h index 7e37d448ed91..7c82e3b307a3 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -111,7 +111,7 @@ struct mem_cgroup; * struct kmem_cache related prototypes */ void __init kmem_cache_init(void); -int slab_is_available(void); +bool slab_is_available(void); struct kmem_cache *kmem_cache_create(const char *, size_t, size_t, unsigned long, -- cgit v1.2.1

From a744fd17b5233360681ce03e43804406745b680b Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Thu, 5 Nov 2015 18:45:02 -0800 Subject: compiler.h: add support for function attribute assume_aligned

gcc 4.9 added the function attribute assume_aligned, indicating to the caller that the returned pointer may be assumed to have a certain minimal alignment. This is useful if, for example, the return value is passed to memset(). Add a shorthand macro for that.
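To illustrate the intended use -- a hypothetical allocator wrapper (the function names here are invented for the example):

    /* callers may assume the returned pointer is 64-byte aligned */
    void *alloc_block(size_t size) __assume_aligned(64);

    /* optional second argument: the pointer is 4 modulo 64 */
    void *alloc_offset_block(size_t size) __assume_aligned(64, 4);

With such annotations, gcc 4.9+ can, for instance, vectorize a memset() of the returned buffer without emitting runtime alignment checks.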
Signed-off-by: Rasmus Villemoes Cc: Christoph Lameter Cc: David Rientjes Cc: Pekka Enberg Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/compiler-gcc.h | 17 +++++++++++++++++ include/linux/compiler.h | 8 ++++++++ 2 files changed, 25 insertions(+) (limited to 'include') diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h index 8efb40e61d6e..02fa6176120d 100644 --- a/include/linux/compiler-gcc.h +++ b/include/linux/compiler-gcc.h @@ -210,6 +210,23 @@ #define __visible __attribute__((externally_visible)) #endif + +#if GCC_VERSION >= 40900 +/* + * __assume_aligned(n, k): Tell the optimizer that the returned + * pointer can be assumed to be k modulo n. The second argument is + * optional (default 0), so we use a variadic macro to make the + * shorthand. + * + * Beware: Do not apply this to functions which may return + * ERR_PTRs. Also, it is probably unwise to apply it to functions + * returning extra information in the low bits (but in that case the + * compiler should see some alignment anyway, when the return value is + * massaged by 'flags = ptr & 3; ptr &= ~3;'). + */ +#define __assume_aligned(a, ...) __attribute__((__assume_aligned__(a, ## __VA_ARGS__))) +#endif + /* * GCC 'asm goto' miscompiles certain code sequences: * diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 52a459ff75f4..4dac1036594f 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -417,6 +417,14 @@ static __always_inline void __write_once_size(volatile void *p, void *res, int s #define __visible #endif +/* + * Assume alignment of return value. + */ +#ifndef __assume_aligned +#define __assume_aligned(a, ...) +#endif + + /* Are two types/vars the same type (ignoring qualifiers)? */ #ifndef __same_type # define __same_type(a, b) __builtin_types_compatible_p(typeof(a), typeof(b)) -- cgit v1.2.1 From 8748dd5c98f7fe506ccb31a094833401ed120915 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Thu, 5 Nov 2015 18:45:05 -0800 Subject: include/linux/compiler-gcc.h: hide assume_aligned attribute from sparse The patch "slab.h: sprinkle __assume_aligned attributes" causes *tons* of whinges if you do 'make C=2' with sparse 0.5.0: CHECK drivers/media/usb/pwc/pwc-if.c include/linux/slab.h:307:43: error: attribute '__assume_aligned__': unknown attribute include/linux/slab.h:308:58: error: attribute '__assume_aligned__': unknown attribute include/linux/slab.h:337:73: error: attribute '__assume_aligned__': unknown attribute include/linux/slab.h:375:74: error: attribute '__assume_aligned__': unknown attribute include/linux/slab.h:378:80: error: attribute '__assume_aligned__': unknown attribute sparse apparently pretends to be gcc >= 4.9, yet isn't prepared to handle all the function attributes supported by those gccs and complains loudly. So hide the definition of __assume_aligned from it (so that the generic one in compiler.h gets used). 
Signed-off-by: Rasmus Villemoes Reported-by: Valdis Kletnieks Tested-By: Valdis Kletnieks Cc: Christopher Li Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/compiler-gcc.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h index 02fa6176120d..0e3110a0b771 100644 --- a/include/linux/compiler-gcc.h +++ b/include/linux/compiler-gcc.h @@ -211,7 +211,7 @@ #endif -#if GCC_VERSION >= 40900 +#if GCC_VERSION >= 40900 && !defined(__CHECKER__) /* * __assume_aligned(n, k): Tell the optimizer that the returned * pointer can be assumed to be k modulo n. The second argument is -- cgit v1.2.1 From 0ab32b6f1b88444524e52429fab334ff96683a3f Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Thu, 5 Nov 2015 18:46:03 -0800 Subject: uaccess: reimplement probe_kernel_address() using probe_kernel_read() probe_kernel_address() is basically the same as the (later added) probe_kernel_read(). The return value on EFAULT is a bit different: probe_kernel_address() returns number-of-bytes-not-copied whereas probe_kernel_read() returns -EFAULT. All callers have been checked, none cared. probe_kernel_read() can be overridden by the architecture whereas probe_kernel_address() cannot. parisc, blackfin and um do this, to insert additional checking. Hence this patch possibly fixes obscure bugs, although there are only two probe_kernel_address() callsites outside arch/. My first attempt involved removing probe_kernel_address() entirely and converting all callsites to use probe_kernel_read() directly, but that got tiresome. This patch shrinks mm/slab_common.o by 218 bytes. For a single probe_kernel_address() callsite. Cc: Steven Miao Cc: Jeff Dike Cc: Richard Weinberger Cc: "James E.J. Bottomley" Cc: Helge Deller Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/uaccess.h | 40 ++++++++++------------------------------ 1 file changed, 10 insertions(+), 30 deletions(-) (limited to 'include') diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h index d6f2c2c5b043..558129af828a 100644 --- a/include/linux/uaccess.h +++ b/include/linux/uaccess.h @@ -75,36 +75,6 @@ static inline unsigned long __copy_from_user_nocache(void *to, #endif /* ARCH_HAS_NOCACHE_UACCESS */ -/** - * probe_kernel_address(): safely attempt to read from a location - * @addr: address to read from - its type is type typeof(retval)* - * @retval: read into this variable - * - * Safely read from address @addr into variable @revtal. If a kernel fault - * happens, handle that and return -EFAULT. - * We ensure that the __get_user() is executed in atomic context so that - * do_page_fault() doesn't attempt to take mmap_sem. This makes - * probe_kernel_address() suitable for use within regions where the caller - * already holds mmap_sem, or other locks which nest inside mmap_sem. - * This must be a macro because __get_user() needs to know the types of the - * args. - * - * We don't include enough header files to be able to do the set_fs(). We - * require that the probe_kernel_address() caller will do that. 
- */ -#define probe_kernel_address(addr, retval) \ - ({ \ - long ret; \ - mm_segment_t old_fs = get_fs(); \ - \ - set_fs(KERNEL_DS); \ - pagefault_disable(); \ - ret = __copy_from_user_inatomic(&(retval), (__force typeof(retval) __user *)(addr), sizeof(retval)); \ - pagefault_enable(); \ - set_fs(old_fs); \ - ret; \ - }) - /* * probe_kernel_read(): safely attempt to read from a location * @dst: pointer to the buffer that shall take the data @@ -131,4 +101,14 @@ extern long notrace __probe_kernel_write(void *dst, const void *src, size_t size extern long strncpy_from_unsafe(char *dst, const void *unsafe_addr, long count); +/** + * probe_kernel_address(): safely attempt to read from a location + * @addr: address to read from + * @retval: read into this variable + * + * Returns 0 on success, or -EFAULT. + */ +#define probe_kernel_address(addr, retval) \ + probe_kernel_read(&retval, addr, sizeof(retval)) + #endif /* __LINUX_UACCESS_H__ */ -- cgit v1.2.1 From 626ebc4100285be56fe3546f29b6afeb36b6871a Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 5 Nov 2015 18:46:09 -0800 Subject: memcg: flatten task_struct->memcg_oom task_struct->memcg_oom is a sub-struct containing fields which are used for async memcg oom handling. Most task_struct fields aren't packaged this way and it can lead to unnecessary alignment paddings. This patch flattens it. * task.memcg_oom.memcg -> task.memcg_in_oom * task.memcg_oom.gfp_mask -> task.memcg_oom_gfp_mask * task.memcg_oom.order -> task.memcg_oom_order * task.memcg_oom.may_oom -> task.memcg_may_oom In addition, task.memcg_may_oom is relocated to where other bitfields are which reduces the size of task_struct. Signed-off-by: Tejun Heo Acked-by: Michal Hocko Reviewed-by: Vladimir Davydov Acked-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 10 +++++----- include/linux/sched.h | 13 ++++++------- 2 files changed, 11 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 3e3318ddfc0e..56174c7199ee 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -406,19 +406,19 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, static inline void mem_cgroup_oom_enable(void) { - WARN_ON(current->memcg_oom.may_oom); - current->memcg_oom.may_oom = 1; + WARN_ON(current->memcg_may_oom); + current->memcg_may_oom = 1; } static inline void mem_cgroup_oom_disable(void) { - WARN_ON(!current->memcg_oom.may_oom); - current->memcg_oom.may_oom = 0; + WARN_ON(!current->memcg_may_oom); + current->memcg_may_oom = 0; } static inline bool task_in_memcg_oom(struct task_struct *p) { - return p->memcg_oom.memcg; + return p->memcg_in_oom; } bool mem_cgroup_oom_synchronize(bool wait); diff --git a/include/linux/sched.h b/include/linux/sched.h index 5423b9c82fee..17bf8b845aa0 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1473,7 +1473,9 @@ struct task_struct { unsigned sched_reset_on_fork:1; unsigned sched_contributes_to_load:1; unsigned sched_migrated:1; - +#ifdef CONFIG_MEMCG + unsigned memcg_may_oom:1; +#endif #ifdef CONFIG_MEMCG_KMEM unsigned memcg_kmem_skip_account:1; #endif @@ -1804,12 +1806,9 @@ struct task_struct { unsigned long trace_recursion; #endif /* CONFIG_TRACING */ #ifdef CONFIG_MEMCG - struct memcg_oom_info { - struct mem_cgroup *memcg; - gfp_t gfp_mask; - int order; - unsigned int may_oom:1; - } memcg_oom; + struct mem_cgroup *memcg_in_oom; + gfp_t memcg_oom_gfp_mask; + int memcg_oom_order; #endif 
#ifdef CONFIG_UPROBES struct uprobe_task *utask; -- cgit v1.2.1 From b23afb93d317c65cef553b804f08dec8a7a0f7e1 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 5 Nov 2015 18:46:11 -0800 Subject: memcg: punt high overage reclaim to return-to-userland path Currently, try_charge() tries to reclaim memory synchronously when the high limit is breached; however, if the allocation doesn't have __GFP_WAIT, synchronous reclaim is skipped. If a process performs only speculative allocations, it can blow way past the high limit. This is actually easily reproducible by simply doing "find /". slab/slub allocator tries speculative allocations first, so as long as there's memory which can be consumed without blocking, it can keep allocating memory regardless of the high limit. This patch makes try_charge() always punt the over-high reclaim to the return-to-userland path. If try_charge() detects that high limit is breached, it adds the overage to current->memcg_nr_pages_over_high and schedules execution of mem_cgroup_handle_over_high() which performs synchronous reclaim from the return-to-userland path. As long as kernel doesn't have a run-away allocation spree, this should provide enough protection while making kmemcg behave more consistently. It also has the following benefits. - All over-high reclaims can use GFP_KERNEL regardless of the specific gfp mask in use, e.g. GFP_NOFS, when the limit was breached. - It copes with prio inversion. Previously, a low-prio task with small memory.high might perform over-high reclaim with a bunch of locks held. If a higher prio task needed any of these locks, it would have to wait until the low prio task finished reclaim and released the locks. By handing over-high reclaim to the task exit path this issue can be avoided. Signed-off-by: Tejun Heo Acked-by: Michal Hocko Reviewed-by: Vladimir Davydov Acked-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 6 ++++++ include/linux/sched.h | 3 +++ include/linux/tracehook.h | 3 +++ 3 files changed, 12 insertions(+) (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 56174c7199ee..77bf42966200 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -401,6 +401,8 @@ static inline int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec) return inactive * inactive_ratio < active; } +void mem_cgroup_handle_over_high(void); + void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p); @@ -620,6 +622,10 @@ static inline void mem_cgroup_end_page_stat(struct mem_cgroup *memcg) { } +static inline void mem_cgroup_handle_over_high(void) +{ +} + static inline void mem_cgroup_oom_enable(void) { } diff --git a/include/linux/sched.h b/include/linux/sched.h index 17bf8b845aa0..055f2ee3b0f0 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1809,6 +1809,9 @@ struct task_struct { struct mem_cgroup *memcg_in_oom; gfp_t memcg_oom_gfp_mask; int memcg_oom_order; + + /* number of pages to reclaim on returning to userland */ + unsigned int memcg_nr_pages_over_high; #endif #ifdef CONFIG_UPROBES struct uprobe_task *utask; diff --git a/include/linux/tracehook.h b/include/linux/tracehook.h index 84d497297c5f..26c152122a42 100644 --- a/include/linux/tracehook.h +++ b/include/linux/tracehook.h @@ -50,6 +50,7 @@ #include #include #include +#include struct linux_binprm; /* @@ -188,6 +189,8 @@ static inline void tracehook_notify_resume(struct pt_regs *regs) smp_mb__after_atomic(); if 
(unlikely(current->task_works)) task_work_run(); + + mem_cgroup_handle_over_high(); } #endif /* */ -- cgit v1.2.1 From cbfb479809c1b8d871cb9a31832e065e900a24c1 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 5 Nov 2015 18:46:14 -0800 Subject: memcg: collect kmem bypass conditions into __memcg_kmem_bypass() memcg_kmem_newpage_charge() and memcg_kmem_get_cache() are testing the same series of conditions to decide whether to bypass kmem accounting. Collect the tests into __memcg_kmem_bypass(). This is pure refactoring. Signed-off-by: Tejun Heo Reviewed-by: Vladimir Davydov Acked-by: Michal Hocko Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 46 ++++++++++++++++++++++------------------------ 1 file changed, 22 insertions(+), 24 deletions(-) (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 77bf42966200..d8174e8d6ab4 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -777,20 +777,7 @@ int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, unsigned long nr_pages); void memcg_uncharge_kmem(struct mem_cgroup *memcg, unsigned long nr_pages); -/** - * memcg_kmem_newpage_charge: verify if a new kmem allocation is allowed. - * @gfp: the gfp allocation flags. - * @memcg: a pointer to the memcg this was charged against. - * @order: allocation order. - * - * returns true if the memcg where the current task belongs can hold this - * allocation. - * - * We return true automatically if this allocation is not to be accounted to - * any memcg. - */ -static inline bool -memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **memcg, int order) +static inline bool __memcg_kmem_bypass(gfp_t gfp) { if (!memcg_kmem_enabled()) return true; @@ -812,6 +799,26 @@ memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **memcg, int order) if (unlikely(fatal_signal_pending(current))) return true; + return false; +} + +/** + * memcg_kmem_newpage_charge: verify if a new kmem allocation is allowed. + * @gfp: the gfp allocation flags. + * @memcg: a pointer to the memcg this was charged against. + * @order: allocation order. + * + * returns true if the memcg where the current task belongs can hold this + * allocation. + * + * We return true automatically if this allocation is not to be accounted to + * any memcg. + */ +static inline bool +memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **memcg, int order) +{ + if (__memcg_kmem_bypass(gfp)) + return true; return __memcg_kmem_newpage_charge(gfp, memcg, order); } @@ -854,17 +861,8 @@ memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg, int order) static __always_inline struct kmem_cache * memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp) { - if (!memcg_kmem_enabled()) - return cachep; - if (gfp & __GFP_NOACCOUNT) - return cachep; - if (gfp & __GFP_NOFAIL) + if (__memcg_kmem_bypass(gfp)) return cachep; - if (in_interrupt() || (!current->mm) || (current->flags & PF_KTHREAD)) - return cachep; - if (unlikely(fatal_signal_pending(current))) - return cachep; - return __memcg_kmem_get_cache(cachep); } -- cgit v1.2.1 From 7f822c24c2b32a9feafb33e3b9a9e23ef4278c2c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 5 Nov 2015 18:46:20 -0800 Subject: memcg: drop unnecessary cold-path tests from __memcg_kmem_bypass() __memcg_kmem_bypass() decides whether a kmem allocation should be bypassed to the root memcg. 
Some conditions that it tests are valid criteria regarding who should be held accountable; however, there are a couple of unnecessary tests for cold-path conditions - __GFP_NOFAIL and fatal_signal_pending(). The previous patch updated try_charge() to handle both __GFP_NOFAIL and dying tasks correctly, and the only thing these two tests are doing is making accounting less accurate and sprinkling tests for cold-path conditions into the hot paths. There's nothing meaningful gained by these extra tests. This patch removes the two unnecessary tests from __memcg_kmem_bypass().

Signed-off-by: Tejun Heo Reviewed-by: Vladimir Davydov Acked-by: Michal Hocko Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 14 -------------- 1 file changed, 14 deletions(-) (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index d8174e8d6ab4..641bb60dd7db 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -781,24 +781,10 @@ static inline bool __memcg_kmem_bypass(gfp_t gfp) { if (!memcg_kmem_enabled()) return true; - if (gfp & __GFP_NOACCOUNT) return true; - /* - * __GFP_NOFAIL allocations will move on even if charging is not - * possible. Therefore we don't even try, and have this allocation - * unaccounted. We could in theory charge it forcibly, but we hope - * those allocations are rare, and won't be worth the trouble. - */ - if (gfp & __GFP_NOFAIL) - return true; if (in_interrupt() || (!current->mm) || (current->flags & PF_KTHREAD)) return true; - - /* If the test is dying, just let it go. */ - if (unlikely(fatal_signal_pending(current))) - return true; - return false; } -- cgit v1.2.1

From 35bd16a227534cb6ffc9b26a33061c2dcf91934b Mon Sep 17 00:00:00 2001 From: Alexander Kuleshov Date: Thu, 5 Nov 2015 18:47:00 -0800 Subject: mm/memblock: make memblock_remove_range() static

memblock_remove_range() is only used in mm/memblock.c, so we can make it static.

Signed-off-by: Alexander Kuleshov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memblock.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include') diff --git a/include/linux/memblock.h b/include/linux/memblock.h index c518eb589260..24daf8fc4d7c 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -89,10 +89,6 @@ int memblock_add_range(struct memblock_type *type, phys_addr_t base, phys_addr_t size, int nid, unsigned long flags); -int memblock_remove_range(struct memblock_type *type, - phys_addr_t base, - phys_addr_t size); - void __next_mem_range(u64 *idx, int nid, ulong flags, struct memblock_type *type_a, struct memblock_type *type_b, phys_addr_t *out_start, -- cgit v1.2.1

From b171e4093017d4d6e411f5e97823e5e4a21266a2 Mon Sep 17 00:00:00 2001 From: Yaowei Bai Date: Thu, 5 Nov 2015 18:47:06 -0800 Subject: mm/page_alloc: remove unused parameter in init_currently_empty_zone()

Commit a2f3aa025766 ("[PATCH] Fix sparsemem on Cell") fixed an oops experienced on the Cell architecture when init-time functions, early_*(), are called at runtime, by introducing an 'enum memmap_context' parameter to memmap_init_zone() and init_currently_empty_zone(). The parameter is intended to tell whether the call to these two functions is being made on behalf of a hotplug event or is happening at boot time. However, init_currently_empty_zone() does not use this parameter at all, so remove it.
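With the context argument gone, boot-time and hotplug callers invoke the function identically -- a minimal usage sketch:

    /* both boot and hotplug paths now pass just the zone, start pfn and size */
    ret = init_currently_empty_zone(zone, zone_start_pfn, size);
    if (ret)
        return ret;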
Signed-off-by: Yaowei Bai Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index d94347737292..2d7e660cdefe 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -823,8 +823,7 @@ enum memmap_context { MEMMAP_HOTPLUG, }; extern int init_currently_empty_zone(struct zone *zone, unsigned long start_pfn, - unsigned long size, - enum memmap_context context); + unsigned long size); extern void lruvec_init(struct lruvec *lruvec); -- cgit v1.2.1

From 600e19afc5f8a6c18ea49cee9511c5797db02391 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Thu, 5 Nov 2015 18:47:08 -0800 Subject: mm: use only per-device readahead limit

The maximal readahead size is currently limited by two values: 1) a global 2Mb constant (MAX_READAHEAD in max_sane_readahead()); 2) a configurable per-device value (bdi->ra_pages). There are devices which require a custom readahead limit. For instance, for RAIDs it's calculated as the number of devices multiplied by the chunk size times 2. The readahead size can never be larger than bdi->ra_pages * 2 (POSIX_FADV_SEQUENTIAL doubles the readahead size). If so, why do we need two limits? I suggest completely removing the max_sane_readahead() stuff and using the per-device readahead limit everywhere.

Also, using the right readahead size for RAID disks can significantly increase I/O performance:

before:
dd if=/dev/md2 of=/dev/null bs=100M count=100
100+0 records in
100+0 records out
10485760000 bytes (10 GB) copied, 12.9741 s, 808 MB/s

after:
$ dd if=/dev/md2 of=/dev/null bs=100M count=100
100+0 records in
100+0 records out
10485760000 bytes (10 GB) copied, 8.91317 s, 1.2 GB/s

(This is an 8-disk RAID5 array.)

This patch doesn't change the sys_readahead and madvise(MADV_WILLNEED) behavior introduced by 6d2be915e589b58 ("mm/readahead.c: fix readahead failure for memoryless NUMA nodes and limit readahead pages").

Signed-off-by: Roman Gushchin Cc: Raghavendra K T Cc: Jan Kara Cc: Wu Fengguang Cc: David Rientjes Cc: Konstantin Khlebnikov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 80001de019ba..fa08f3cf0f22 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2036,8 +2036,6 @@ void page_cache_async_readahead(struct address_space *mapping, pgoff_t offset, unsigned long size); -unsigned long max_sane_readahead(unsigned long nr); - /* Generic expand stack which grows the stack according to GROWS{UP,DOWN} */ extern int expand_stack(struct vm_area_struct *vma, unsigned long address); -- cgit v1.2.1

From 5d317b2b6536592a9b51fe65faed43d65ca9158e Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Thu, 5 Nov 2015 18:47:14 -0800 Subject: mm: hugetlb: proc: add HugetlbPages field to /proc/PID/status

Currently there's no easy way to get per-process usage of hugetlb pages, which is inconvenient because userspace applications that use hugetlb typically want to control their processes on the basis of how much memory (including hugetlb) they use. So this patch simply provides easy access to the info via /proc/PID/status.
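A quick userspace check of the new field might look like this (a sketch; only the field name "HugetlbPages:" is taken from this patch):

    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
        char line[256];
        FILE *f = fopen("/proc/self/status", "r");

        if (!f)
            return 1;
        /* print only the hugetlb usage line, e.g. "HugetlbPages:  2048 kB" */
        while (fgets(line, sizeof(line), f))
            if (!strncmp(line, "HugetlbPages:", 13))
                fputs(line, stdout);
        fclose(f);
        return 0;
    }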
Signed-off-by: Naoya Horiguchi Acked-by: Joern Engel Acked-by: David Rientjes Acked-by: Michal Hocko Cc: Mike Kravetz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 19 +++++++++++++++++++ include/linux/mm_types.h | 3 +++ 2 files changed, 22 insertions(+) (limited to 'include') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 5e35379f58a5..685c262e0be8 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -483,6 +483,17 @@ static inline spinlock_t *huge_pte_lockptr(struct hstate *h, #define hugepages_supported() (HPAGE_SHIFT != 0) #endif +void hugetlb_report_usage(struct seq_file *m, struct mm_struct *mm); + +static inline void hugetlb_count_add(long l, struct mm_struct *mm) +{ + atomic_long_add(l, &mm->hugetlb_usage); +} + +static inline void hugetlb_count_sub(long l, struct mm_struct *mm) +{ + atomic_long_sub(l, &mm->hugetlb_usage); +} #else /* CONFIG_HUGETLB_PAGE */ struct hstate {}; #define alloc_huge_page(v, a, r) NULL @@ -519,6 +530,14 @@ static inline spinlock_t *huge_pte_lockptr(struct hstate *h, { return &mm->page_table_lock; } + +static inline void hugetlb_report_usage(struct seq_file *f, struct mm_struct *m) +{ +} + +static inline void hugetlb_count_sub(long l, struct mm_struct *mm) +{ +} #endif /* CONFIG_HUGETLB_PAGE */ static inline spinlock_t *huge_pte_lock(struct hstate *h, diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 3d6baa7d4534..0a85da25a822 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -486,6 +486,9 @@ struct mm_struct { /* address of the bounds directory */ void __user *bd_addr; #endif +#ifdef CONFIG_HUGETLB_PAGE + atomic_long_t hugetlb_usage; +#endif }; static inline void mm_init_cpumask(struct mm_struct *mm) -- cgit v1.2.1

From aa750fd71c242dba02ee2034e15fbd7d0cdb2461 Mon Sep 17 00:00:00 2001 From: Junichi Nomura Date: Thu, 5 Nov 2015 18:47:23 -0800 Subject: mm/filemap.c: make global sync not clear error status of individual inodes

filemap_fdatawait() is a function that waits for ongoing writeback to complete, but it also consumes and clears the error status that the mapping set during writeback. The latter functionality is critical for applications to detect writeback errors with system calls like fsync(2)/fdatasync(2).

However, filemap_fdatawait() is also used by sync(2) and the FIFREEZE ioctl, which don't check the error status of individual mappings. As a result, fsync() may not be able to detect a writeback error if events happen in the following order:

    Application                 System admin
    ----------------------------------------------------------
    write data on page cache
                                Run sync command
                                writeback completes with error
                                filemap_fdatawait() clears error
    fsync returns success
    (but the data is not on disk)

This patch adds filemap_fdatawait_keep_errors() for call sites where writeback errors are not handled, so that they don't clear the error status.
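A sketch of how a caller chooses between the two variants (hypothetical helper; the real conversions are the sync(2)/FIFREEZE call sites in fs/):

    static int wait_on_mapping(struct address_space *mapping, bool for_fsync)
    {
        if (for_fsync)
            /* consumer path: report AS_EIO/AS_ENOSPC and clear them */
            return filemap_fdatawait(mapping);
        /* global sync path: wait, but leave the error flags for fsync() */
        filemap_fdatawait_keep_errors(mapping);
        return 0;
    }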
Signed-off-by: Jun'ichi Nomura Acked-by: Andi Kleen Reviewed-by: Tejun Heo Cc: Fengguang Wu Cc: Dave Chinner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/fs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/fs.h b/include/linux/fs.h index 72d8a844c692..9355f377fd46 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2422,6 +2422,7 @@ extern int write_inode_now(struct inode *, int); extern int filemap_fdatawrite(struct address_space *); extern int filemap_flush(struct address_space *); extern int filemap_fdatawait(struct address_space *); +extern void filemap_fdatawait_keep_errors(struct address_space *); extern int filemap_fdatawait_range(struct address_space *, loff_t lstart, loff_t lend); extern int filemap_write_and_wait(struct address_space *mapping); -- cgit v1.2.1 From 13308ca9ef9acb325b52bd8562639b5844f3cbf2 Mon Sep 17 00:00:00 2001 From: Yaowei Bai Date: Thu, 5 Nov 2015 18:47:40 -0800 Subject: mm/memcontrol: make mem_cgroup_inactive_anon_is_low() return bool Make mem_cgroup_inactive_anon_is_low return bool due to this particular function only using either one or zero as its return value. No functional change. Signed-off-by: Yaowei Bai Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 641bb60dd7db..4142f94822ea 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -382,7 +382,7 @@ unsigned long mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru) return mz->lru_size[lru]; } -static inline int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec) +static inline bool mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec) { unsigned long inactive_ratio; unsigned long inactive; @@ -585,10 +585,10 @@ static inline bool mem_cgroup_disabled(void) return true; } -static inline int +static inline bool mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec) { - return 1; + return true; } static inline bool mem_cgroup_lruvec_online(struct lruvec *lruvec) -- cgit v1.2.1 From fa6c7b46aaa0cc00846703e8c0ec1e1636ff25ba Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Thu, 5 Nov 2015 18:47:56 -0800 Subject: mm, compaction: export tracepoints status strings to userspace Some compaction tracepoints convert the integer return values to strings using the compaction_status_string array. This works for in-kernel printing, but not userspace trace printing of raw captured trace such as via trace-cmd report. This patch converts the private array to appropriate tracepoint macros that result in proper userspace support. 
trace-cmd output before: transhuge-stres-4235 [000] 453.149280: mm_compaction_finished: node=0 zone=ffffffff81815d7a order=9 ret= after: transhuge-stres-4235 [000] 453.149280: mm_compaction_finished: node=0 zone=ffffffff81815d7a order=9 ret=partial Signed-off-by: Vlastimil Babka Reviewed-by: Steven Rostedt Cc: Joonsoo Kim Cc: Ingo Molnar Cc: Mel Gorman Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/compaction.h | 2 +- include/trace/events/compaction.h | 33 +++++++++++++++++++++++++++++++-- 2 files changed, 32 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/compaction.h b/include/linux/compaction.h index aa8f61cf3a19..f14ba989092f 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h @@ -15,7 +15,7 @@ /* For more detailed tracepoint output */ #define COMPACT_NO_SUITABLE_PAGE 5 #define COMPACT_NOT_SUITABLE_ZONE 6 -/* When adding new state, please change compaction_status_string, too */ +/* When adding new states, please adjust include/trace/events/compaction.h */ /* Used to signal whether compaction detected need_sched() or lock contention */ /* No contention detected */ diff --git a/include/trace/events/compaction.h b/include/trace/events/compaction.h index 9a6a3fe0fb51..1275a555199a 100644 --- a/include/trace/events/compaction.h +++ b/include/trace/events/compaction.h @@ -9,6 +9,35 @@ #include #include +#define COMPACTION_STATUS \ + EM( COMPACT_DEFERRED, "deferred") \ + EM( COMPACT_SKIPPED, "skipped") \ + EM( COMPACT_CONTINUE, "continue") \ + EM( COMPACT_PARTIAL, "partial") \ + EM( COMPACT_COMPLETE, "complete") \ + EM( COMPACT_NO_SUITABLE_PAGE, "no_suitable_page") \ + EMe(COMPACT_NOT_SUITABLE_ZONE, "not_suitable_zone") + +/* + * First define the enums in the above macros to be exported to userspace + * via TRACE_DEFINE_ENUM(). + */ +#undef EM +#undef EMe +#define EM(a, b) TRACE_DEFINE_ENUM(a); +#define EMe(a, b) TRACE_DEFINE_ENUM(a); + +COMPACTION_STATUS + +/* + * Now redefine the EM() and EMe() macros to map the enums to the strings + * that will be printed in the output. + */ +#undef EM +#undef EMe +#define EM(a, b) {a, b}, +#define EMe(a, b) {a, b} + DECLARE_EVENT_CLASS(mm_compaction_isolate_template, TP_PROTO( @@ -161,7 +190,7 @@ TRACE_EVENT(mm_compaction_end, __entry->free_pfn, __entry->zone_end, __entry->sync ? "sync" : "async", - compaction_status_string[__entry->status]) + __print_symbolic(__entry->status, COMPACTION_STATUS)) ); TRACE_EVENT(mm_compaction_try_to_compact_pages, @@ -217,7 +246,7 @@ DECLARE_EVENT_CLASS(mm_compaction_suitable_template, __entry->nid, __entry->name, __entry->order, - compaction_status_string[__entry->ret]) + __print_symbolic(__entry->ret, COMPACTION_STATUS)) ); DEFINE_EVENT(mm_compaction_suitable_template, mm_compaction_finished, -- cgit v1.2.1 From 1743d0506003f7db0602d120ecf63e2747af0d72 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Thu, 5 Nov 2015 18:47:59 -0800 Subject: mm, compaction: export tracepoints zone names to userspace Some compaction tracepoints use zone->name to print which zone is being compacted. This works for in-kernel printing, but not userspace trace printing of raw captured trace such as via trace-cmd report. This patch uses zone_idx() instead of zone->name as the raw value, and when printing, converts the zone_type to string using the appropriate EM() macros and some ugly tricks to overcome the problem that half the values depend on CONFIG_ options and one does not simply use #ifdef inside of #define. 
trace-cmd output before: transhuge-stres-4235 [000] 453.149280: mm_compaction_finished: node=0 zone=ffffffff81815d7a order=9 ret=partial after: transhuge-stres-4235 [000] 453.149280: mm_compaction_finished: node=0 zone=Normal order=9 ret=partial Signed-off-by: Vlastimil Babka Reviewed-by: Steven Rostedt Cc: Joonsoo Kim Cc: Ingo Molnar Cc: Mel Gorman Cc: David Rientjes Cc: Valentin Rothberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/trace/events/compaction.h | 38 ++++++++++++++++++++++++++++++++------ 1 file changed, 32 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/trace/events/compaction.h b/include/trace/events/compaction.h index 1275a555199a..a43000d76688 100644 --- a/include/trace/events/compaction.h +++ b/include/trace/events/compaction.h @@ -18,6 +18,31 @@ EM( COMPACT_NO_SUITABLE_PAGE, "no_suitable_page") \ EMe(COMPACT_NOT_SUITABLE_ZONE, "not_suitable_zone") +#ifdef CONFIG_ZONE_DMA +#define IFDEF_ZONE_DMA(X) X +#else +#define IFDEF_ZONE_DMA(X) +#endif + +#ifdef CONFIG_ZONE_DMA32 +#define IFDEF_ZONE_DMA32(X) X +#else +#define IFDEF_ZONE_DMA32(X) +#endif + +#ifdef CONFIG_HIGHMEM +#define IFDEF_ZONE_HIGHMEM(X) X +#else +#define IFDEF_ZONE_HIGHMEM(X) +#endif + +#define ZONE_TYPE \ + IFDEF_ZONE_DMA( EM (ZONE_DMA, "DMA")) \ + IFDEF_ZONE_DMA32( EM (ZONE_DMA32, "DMA32")) \ + EM (ZONE_NORMAL, "Normal") \ + IFDEF_ZONE_HIGHMEM( EM (ZONE_HIGHMEM,"HighMem")) \ + EMe(ZONE_MOVABLE,"Movable") + /* * First define the enums in the above macros to be exported to userspace * via TRACE_DEFINE_ENUM(). @@ -28,6 +53,7 @@ #define EMe(a, b) TRACE_DEFINE_ENUM(a); COMPACTION_STATUS +ZONE_TYPE /* * Now redefine the EM() and EMe() macros to map the enums to the strings @@ -230,21 +256,21 @@ DECLARE_EVENT_CLASS(mm_compaction_suitable_template, TP_STRUCT__entry( __field(int, nid) - __field(char *, name) + __field(enum zone_type, idx) __field(int, order) __field(int, ret) ), TP_fast_assign( __entry->nid = zone_to_nid(zone); - __entry->name = (char *)zone->name; + __entry->idx = zone_idx(zone); __entry->order = order; __entry->ret = ret; ), TP_printk("node=%d zone=%-8s order=%d ret=%s", __entry->nid, - __entry->name, + __print_symbolic(__entry->idx, ZONE_TYPE), __entry->order, __print_symbolic(__entry->ret, COMPACTION_STATUS)) ); @@ -276,7 +302,7 @@ DECLARE_EVENT_CLASS(mm_compaction_defer_template, TP_STRUCT__entry( __field(int, nid) - __field(char *, name) + __field(enum zone_type, idx) __field(int, order) __field(unsigned int, considered) __field(unsigned int, defer_shift) @@ -285,7 +311,7 @@ DECLARE_EVENT_CLASS(mm_compaction_defer_template, TP_fast_assign( __entry->nid = zone_to_nid(zone); - __entry->name = (char *)zone->name; + __entry->idx = zone_idx(zone); __entry->order = order; __entry->considered = zone->compact_considered; __entry->defer_shift = zone->compact_defer_shift; @@ -294,7 +320,7 @@ DECLARE_EVENT_CLASS(mm_compaction_defer_template, TP_printk("node=%d zone=%-8s order=%d order_failed=%d consider=%u limit=%lu", __entry->nid, - __entry->name, + __print_symbolic(__entry->idx, ZONE_TYPE), __entry->order, __entry->order_failed, __entry->considered, -- cgit v1.2.1 From 2d1e10412c2388ff9b6afc60536eaa195a419289 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Thu, 5 Nov 2015 18:48:02 -0800 Subject: mm, compaction: distinguish contended status in tracepoints Compaction returns prematurely with COMPACT_PARTIAL when contended or has fatal signal pending. 
This is ok for the callers, but might be misleading in the traces, as the usual reason to return COMPACT_PARTIAL is that we think the allocation should succeed. After this patch we distinguish the premature ending condition in the mm_compaction_finished and mm_compaction_end tracepoints. The contended status covers the following reasons: - lock contention or need_resched() detected in async compaction - fatal signal pending - too many pages isolated in the zone (only for async compaction) Further distinguishing the exact reason seems unnecessary for now. Signed-off-by: Vlastimil Babka Cc: Joonsoo Kim Cc: Mel Gorman Cc: David Rientjes Cc: Steven Rostedt Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/compaction.h | 1 + include/trace/events/compaction.h | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/compaction.h b/include/linux/compaction.h index f14ba989092f..4cd4ddf64cc7 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h @@ -15,6 +15,7 @@ /* For more detailed tracepoint output */ #define COMPACT_NO_SUITABLE_PAGE 5 #define COMPACT_NOT_SUITABLE_ZONE 6 +#define COMPACT_CONTENDED 7 /* When adding new states, please adjust include/trace/events/compaction.h */ /* Used to signal whether compaction detected need_sched() or lock contention */ diff --git a/include/trace/events/compaction.h b/include/trace/events/compaction.h index a43000d76688..c92d1e1cbad9 100644 --- a/include/trace/events/compaction.h +++ b/include/trace/events/compaction.h @@ -16,7 +16,8 @@ EM( COMPACT_PARTIAL, "partial") \ EM( COMPACT_COMPLETE, "complete") \ EM( COMPACT_NO_SUITABLE_PAGE, "no_suitable_page") \ - EMe(COMPACT_NOT_SUITABLE_ZONE, "not_suitable_zone") + EM( COMPACT_NOT_SUITABLE_ZONE, "not_suitable_zone") \ + EMe(COMPACT_CONTENDED, "contended") #ifdef CONFIG_ZONE_DMA #define IFDEF_ZONE_DMA(X) X -- cgit v1.2.1 From da39da3a54fed88e29024f2f1f6cd7357cd03a44 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Thu, 5 Nov 2015 18:48:05 -0800 Subject: mm, oom: remove task_lock protecting comm printing The oom killer takes task_lock() in a couple of places solely to protect printing the task's comm. A process's comm, including current's comm, may change due to /proc/pid/comm or PR_SET_NAME. The comm will always be NULL-terminated, so the worst race scenario would only be during update. We can tolerate a comm being printed that is in the middle of an update to avoid taking the lock. Other locations in the kernel have already dropped task_lock() when printing comm, so this is consistent. 
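The resulting pattern is simply the bare access (a sketch; the message text is illustrative):

    /*
     * No task_lock() needed: p->comm is always NUL-terminated, and the
     * worst case during a concurrent PR_SET_NAME or /proc/pid/comm
     * update is printing a mixture of the old and new name.
     */
    pr_info("%s invoked oom-killer: pid=%d\n", p->comm, task_pid_nr(p));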
Signed-off-by: David Rientjes Suggested-by: Oleg Nesterov Cc: Michal Hocko Cc: Vladimir Davydov Cc: Sergey Senozhatsky Acked-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cpuset.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h index 1b357997cac5..5a1311942358 100644 --- a/include/linux/cpuset.h +++ b/include/linux/cpuset.h @@ -93,7 +93,7 @@ extern int current_cpuset_is_being_rebound(void); extern void rebuild_sched_domains(void); -extern void cpuset_print_task_mems_allowed(struct task_struct *p); +extern void cpuset_print_current_mems_allowed(void); /* * read_mems_allowed_begin is required when making decisions involving @@ -219,7 +219,7 @@ static inline void rebuild_sched_domains(void) partition_sched_domains(1, NULL, NULL); } -static inline void cpuset_print_task_mems_allowed(struct task_struct *p) +static inline void cpuset_print_current_mems_allowed(void) { } -- cgit v1.2.1

From 3ca65c19ddbb45f504edf92fe7126ecc94d56e36 Mon Sep 17 00:00:00 2001 From: Vineet Gupta Date: Thu, 5 Nov 2015 18:48:29 -0800 Subject: mm: optimize PageHighMem() check

This came up when implementing HIGHMEM/PAE40 for ARC. The kmap() / kmap_atomic() generated code seemed needlessly bloated due to the way the PageHighMem() macro is implemented. It derives the exact zone for the page and then does pointer subtraction against the node's first zone to infer the zone_type, and that pointer arithmetic in turn generates the code bloat:

    PageHighMem(page)
      is_highmem(page_zone(page))
        zone_off = (char *)zone - (char *)zone->zone_pgdat->node_zones

Instead, use is_highmem_idx() to work on the zone_type available in the page flags.

----- Before -----
80756348: mov_s r13,r0
8075634a: ld_s r2,[r13,0]
8075634c: lsr_s r2,r2,30
8075634e: mpy r2,r2,0x2a4
80756352: add_s r2,r2,0x80aef880
80756358: ld_s r3,[r2,28]
8075635a: sub_s r2,r2,r3
8075635c: breq r2,0x2a4,80756378
80756364: breq r2,0x548,80756378

----- After -----
80756330: mov_s r13,r0
80756332: ld_s r2,[r13,0]
80756334: lsr_s r2,r2,30
80756336: sub_s r2,r2,1
80756338: brlo r2,2,80756348

For an x86 defconfig build (32 bit only) it saves around 900 bytes. For an ARC defconfig with HIGHMEM, it saves around 2K bytes.

---->8-------
./scripts/bloat-o-meter x86/vmlinux-defconfig-pre x86/vmlinux-defconfig-post
add/remove: 0/0 grow/shrink: 0/36 up/down: 0/-934 (-934)
function old new delta
saveable_page 162 154 -8
saveable_highmem_page 154 146 -8
skb_gro_reset_offset 147 131 -16
... ...
__change_page_attr_set_clr 1715 1678 -37
setup_data_read 434 394 -40
mon_bin_event 1967 1927 -40
swsusp_save 1148 1105 -43
_set_pages_array 549 493 -56
---->8-------

e.g. For ARC kmap()

Signed-off-by: Vineet Gupta Acked-by: Michal Hocko Cc: Naoya Horiguchi Cc: Hugh Dickins Cc: "Kirill A. Shutemov" Cc: Michal Hocko Cc: Jennifer Herbert Cc: Konstantin Khlebnikov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page-flags.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 416509e26d6d..a525e5067484 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -256,7 +256,7 @@ PAGEFLAG(Readahead, reclaim) TESTCLEARFLAG(Readahead, reclaim) * Must use a macro here due to header dependency issues. page_zone() is not * available at this point.
*/ -#define PageHighMem(__p) is_highmem(page_zone(__p)) +#define PageHighMem(__p) is_highmem_idx(page_zonenum(__p)) #else PAGEFLAG_FALSE(HighMem) #endif -- cgit v1.2.1 From c2d42c16ad83006a706d83e51a7268db04af733a Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Thu, 5 Nov 2015 18:48:43 -0800 Subject: mm/vmstat.c: uninline node_page_state() With x86_64 (config http://ozlabs.org/~akpm/config-akpm2.txt) and old gcc (4.4.4), drivers/base/node.c:node_read_meminfo() is using 2344 bytes of stack. Uninlining node_page_state() reduces this to 440 bytes. The stack consumption issue is fixed by newer gcc (4.8.4) however with that compiler this patch reduces the node.o text size from 7314 bytes to 4578. Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/vmstat.h | 24 +----------------------- 1 file changed, 1 insertion(+), 23 deletions(-) (limited to 'include') diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index 82e7db7f7100..49dfe40b3673 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -161,30 +161,8 @@ static inline unsigned long zone_page_state_snapshot(struct zone *zone, } #ifdef CONFIG_NUMA -/* - * Determine the per node value of a stat item. This function - * is called frequently in a NUMA machine, so try to be as - * frugal as possible. - */ -static inline unsigned long node_page_state(int node, - enum zone_stat_item item) -{ - struct zone *zones = NODE_DATA(node)->node_zones; - - return -#ifdef CONFIG_ZONE_DMA - zone_page_state(&zones[ZONE_DMA], item) + -#endif -#ifdef CONFIG_ZONE_DMA32 - zone_page_state(&zones[ZONE_DMA32], item) + -#endif -#ifdef CONFIG_HIGHMEM - zone_page_state(&zones[ZONE_HIGHMEM], item) + -#endif - zone_page_state(&zones[ZONE_NORMAL], item) + - zone_page_state(&zones[ZONE_MOVABLE], item); -} +extern unsigned long node_page_state(int node, enum zone_stat_item item); extern void zone_statistics(struct zone *, struct zone *, gfp_t gfp); #else -- cgit v1.2.1 From f7ae3a95eab4e6766768cef664d2d429f53bd5f4 Mon Sep 17 00:00:00 2001 From: yalin wang Date: Thu, 5 Nov 2015 18:48:50 -0800 Subject: include/linux/vm_event_item.h: change HIGHMEM_ZONE macro definition Change HIGHMEM_ZONE to be the same as the DMA_ZONE macro. Signed-off-by: yalin wang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/vm_event_item.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index 9246d32dc973..e623d392db0c 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h @@ -14,12 +14,12 @@ #endif #ifdef CONFIG_HIGHMEM -#define HIGHMEM_ZONE(xx) , xx##_HIGH +#define HIGHMEM_ZONE(xx) xx##_HIGH, #else #define HIGHMEM_ZONE(xx) #endif -#define FOR_ALL_ZONES(xx) DMA_ZONE(xx) DMA32_ZONE(xx) xx##_NORMAL HIGHMEM_ZONE(xx) , xx##_MOVABLE +#define FOR_ALL_ZONES(xx) DMA_ZONE(xx) DMA32_ZONE(xx) xx##_NORMAL, HIGHMEM_ZONE(xx) xx##_MOVABLE enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, FOR_ALL_ZONES(PGALLOC), -- cgit v1.2.1 From d05e83a6f861ad02c2fcba75d4c4cfe49e3bc90f Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Thu, 5 Nov 2015 18:48:59 -0800 Subject: memcg: simplify charging kmem pages Charging kmem pages proceeds in two steps. First, we try to charge the allocation size to the memcg the current task belongs to, then we allocate a page and "commit" the charge storing the pointer to the memcg in the page struct. 
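In rough pseudo-kernel-C, the old convention looked like this (a simplified sketch built from the description above and the API comments removed below; error handling trimmed):

    struct mem_cgroup *memcg;
    struct page *page;

    /* step 1: charge the allocation size before allocating anything */
    if (!memcg_kmem_newpage_charge(gfp, &memcg, order))
        return NULL;
    page = alloc_pages(gfp, order);
    /* step 2: commit the charge to the page; reverted internally if !page */
    memcg_kmem_commit_charge(page, memcg, order);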
Such a design looks overcomplicated, because there is not much sense in trying to charge the allocation before actually allocating a page: we won't be able to consume much memory over the limit even if we charge after doing the actual allocation; besides, we already charge user pages post factum, so being pedantic with kmem pages just looks pointless.

So this patch simplifies the design by merging the "charge" and the "commit" steps into the same function, which takes the allocated page. Also, rename the charge and uncharge methods to memcg_kmem_charge and memcg_kmem_uncharge and make the charge method return an error code instead of bool to conform to mem_cgroup_try_charge.

Signed-off-by: Vladimir Davydov Acked-by: Michal Hocko Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 69 +++++++++++++--------------------------------- 1 file changed, 19 insertions(+), 50 deletions(-) (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 4142f94822ea..c95246627f87 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -752,11 +752,8 @@ static inline bool memcg_kmem_is_active(struct mem_cgroup *memcg) * conditions, but because they are pretty simple, they are expected to be * fast. */ -bool __memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **memcg, - int order); -void __memcg_kmem_commit_charge(struct page *page, - struct mem_cgroup *memcg, int order); -void __memcg_kmem_uncharge_pages(struct page *page, int order); +int __memcg_kmem_charge(struct page *page, gfp_t gfp, int order); +void __memcg_kmem_uncharge(struct page *page, int order); /* * helper for acessing a memcg's index. It will be used as an index in the @@ -789,52 +786,30 @@ static inline bool __memcg_kmem_bypass(gfp_t gfp) } /** - * memcg_kmem_newpage_charge: verify if a new kmem allocation is allowed. - * @gfp: the gfp allocation flags. - * @memcg: a pointer to the memcg this was charged against. - * @order: allocation order. + * memcg_kmem_charge: charge a kmem page + * @page: page to charge + * @gfp: reclaim mode + * @order: allocation order * - * returns true if the memcg where the current task belongs can hold this - * allocation. - * - * We return true automatically if this allocation is not to be accounted to - * any memcg. + * Returns 0 on success, an error code on failure. */ -static inline bool -memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **memcg, int order) +static __always_inline int memcg_kmem_charge(struct page *page, + gfp_t gfp, int order) { if (__memcg_kmem_bypass(gfp)) - return true; - return __memcg_kmem_newpage_charge(gfp, memcg, order); + return 0; + return __memcg_kmem_charge(page, gfp, order); } /** - * memcg_kmem_uncharge_pages: uncharge pages from memcg - * @page: pointer to struct page being freed - * @order: allocation order. + * memcg_kmem_uncharge: uncharge a kmem page + * @page: page to uncharge + * @order: allocation order */ -static inline void -memcg_kmem_uncharge_pages(struct page *page, int order) +static __always_inline void memcg_kmem_uncharge(struct page *page, int order) { if (memcg_kmem_enabled()) - __memcg_kmem_uncharge_pages(page, order); -} - -/** - * memcg_kmem_commit_charge: embeds correct memcg in a page - * @page: pointer to struct page recently allocated - * @memcg: the memcg structure we charged against - * @order: allocation order. - * - * Needs to be called after memcg_kmem_newpage_charge, regardless of success or - * failure of the allocation.
if @page is NULL, this function will revert the - * charges. Otherwise, it will commit @page to @memcg. - */ -static inline void -memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg, int order) -{ - if (memcg_kmem_enabled() && memcg) - __memcg_kmem_commit_charge(page, memcg, order); + __memcg_kmem_uncharge(page, order); } /** @@ -878,18 +853,12 @@ static inline bool memcg_kmem_is_active(struct mem_cgroup *memcg) return false; } -static inline bool -memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **memcg, int order) -{ - return true; -} - -static inline void memcg_kmem_uncharge_pages(struct page *page, int order) +static inline int memcg_kmem_charge(struct page *page, gfp_t gfp, int order) { + return 0; } -static inline void -memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg, int order) +static inline void memcg_kmem_uncharge(struct page *page, int order) { } -- cgit v1.2.1 From f3ccb2c42297757d2e9b820ad37960462df7b7c1 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Thu, 5 Nov 2015 18:49:01 -0800 Subject: memcg: unify slab and other kmem pages charging We have memcg_kmem_charge and memcg_kmem_uncharge methods for charging and uncharging kmem pages to memcg, but currently they are not used for charging slab pages (i.e. they are only used for charging pages allocated with alloc_kmem_pages). The only reason why the slab subsystem uses special helpers, memcg_charge_slab and memcg_uncharge_slab, is that it needs to charge to the memcg of kmem cache while memcg_charge_kmem charges to the memcg that the current task belongs to. To remove this diversity, this patch adds an extra argument to __memcg_kmem_charge that can be a pointer to a memcg or NULL. If it is not NULL, the function tries to charge to the memcg it points to, otherwise it charge to the current context. Next, it makes the slab subsystem use this function to charge slab pages. Since memcg_charge_kmem and memcg_uncharge_kmem helpers are now used only in __memcg_kmem_charge and __memcg_kmem_uncharge, they are inlined. Since __memcg_kmem_charge stores a pointer to the memcg in the page struct, we don't need memcg_uncharge_slab anymore and can use free_kmem_pages. Besides, one can now detect which memcg a slab page belongs to by reading /proc/kpagecgroup. Note, this patch switches slab to charge-after-alloc design. Since this design is already used for all other memcg charges, it should not make any difference. [hannes@cmpxchg.org: better to have an outer function than a magic parameter for the memcg lookup] Signed-off-by: Vladimir Davydov Acked-by: Michal Hocko Signed-off-by: Johannes Weiner Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index c95246627f87..0ba4c86d3b40 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -752,6 +752,8 @@ static inline bool memcg_kmem_is_active(struct mem_cgroup *memcg) * conditions, but because they are pretty simple, they are expected to be * fast. 
From df4065516b0dbfa35ac0e9b8124d441221c0a285 Mon Sep 17 00:00:00 2001
From: Vladimir Davydov
Date: Thu, 5 Nov 2015 18:49:04 -0800
Subject: memcg: simplify and inline __mem_cgroup_from_kmem

Before the previous patch ("memcg: unify slab and other kmem pages
charging"), __mem_cgroup_from_kmem had to handle two types of kmem - slab
pages, whose memcg is reached via the kmem cache, and pages allocated
with alloc_kmem_pages, which store the memcg right in the page struct.
Now we can unify it. Since this makes the function tiny, we can fold it
into mem_cgroup_from_kmem.

[hughd@google.com: move mem_cgroup_from_kmem into list_lru.c]
Signed-off-by: Vladimir Davydov
Acked-by: Michal Hocko
Cc: Johannes Weiner
Cc: Hugh Dickins
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/memcontrol.h | 15 ---------------
 1 file changed, 15 deletions(-)

(limited to 'include')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 0ba4c86d3b40..998311e0c70c 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -770,8 +770,6 @@ static inline int memcg_cache_id(struct mem_cgroup *memcg)
 struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep);
 void __memcg_kmem_put_cache(struct kmem_cache *cachep);
 
-struct mem_cgroup *__mem_cgroup_from_kmem(void *ptr);
-
 static inline bool __memcg_kmem_bypass(gfp_t gfp)
 {
 	if (!memcg_kmem_enabled())
@@ -830,13 +828,6 @@ static __always_inline void memcg_kmem_put_cache(struct kmem_cache *cachep)
 	if (memcg_kmem_enabled())
 		__memcg_kmem_put_cache(cachep);
 }
-
-static __always_inline struct mem_cgroup *mem_cgroup_from_kmem(void *ptr)
-{
-	if (!memcg_kmem_enabled())
-		return NULL;
-	return __mem_cgroup_from_kmem(ptr);
-}
 #else
 #define for_each_memcg_cache_index(_idx)	\
 	for (; NULL; )
@@ -882,11 +873,5 @@ memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp)
 static inline void memcg_kmem_put_cache(struct kmem_cache *cachep)
 {
 }
-
-static inline struct mem_cgroup *mem_cgroup_from_kmem(void *ptr)
-{
-	return NULL;
-}
 #endif /* CONFIG_MEMCG_KMEM */
 #endif /* _LINUX_MEMCONTROL_H */
-- cgit v1.2.1

From 706874e9096e9d468eed9c2b03c8374e806535f3 Mon Sep 17 00:00:00 2001
From: Vladimir Davydov
Date: Thu, 5 Nov 2015 18:49:27 -0800
Subject: mm: do not inc NR_PAGETABLE if ptlock_init failed

If ALLOC_SPLIT_PTLOCKS is defined, ptlock_init may fail, in which case we
shouldn't increment NR_PAGETABLE. Since small allocations, such as
ptlock, normally do not fail (currently they can fail if kmemcg is used
though), this patch does not really fix anything and should be considered
a code cleanup.

Signed-off-by: Vladimir Davydov
Acked-by: Kirill A. Shutemov
Acked-by: Michal Hocko
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/mm.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index fa08f3cf0f22..3c258f8eb9ae 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1606,8 +1606,10 @@ static inline void pgtable_init(void)
 
 static inline bool pgtable_page_ctor(struct page *page)
 {
+	if (!ptlock_init(page))
+		return false;
 	inc_zone_page_state(page, NR_PAGETABLE);
-	return ptlock_init(page);
+	return true;
 }
 
 static inline void pgtable_page_dtor(struct page *page)
-- cgit v1.2.1
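The point of the reordering is that NR_PAGETABLE is now bumped only for
pages whose constructor fully succeeded, so it stays balanced against the
decrement in pgtable_page_dtor(). A hedged sketch of a typical caller,
loosely modeled on an architecture's pte_alloc_one() and assuming
pgtable_t is struct page * on that architecture:

        static pgtable_t example_pte_alloc_one(struct mm_struct *mm)
        {
                struct page *page = alloc_page(GFP_KERNEL | __GFP_ZERO);

                if (!page)
                        return NULL;
                if (!pgtable_page_ctor(page)) {
                        /*
                         * ptlock_init() failed; NR_PAGETABLE was never
                         * incremented, so a plain free is sufficient.
                         */
                        __free_page(page);
                        return NULL;
                }
                return page;
        }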
From 45637bab30d6e7651737f51aa99417baef4d114a Mon Sep 17 00:00:00 2001
From: Hugh Dickins
Date: Thu, 5 Nov 2015 18:49:40 -0800
Subject: mm: rename mem_cgroup_migrate to mem_cgroup_replace_page

After v4.3's commit 0610c25daa3e ("memcg: fix dirty page migration")
mem_cgroup_migrate() doesn't have much to offer in page migration:
convert migrate_misplaced_transhuge_page() to set_page_memcg() instead.

Then rename mem_cgroup_migrate() to mem_cgroup_replace_page(), since its
remaining callers are replace_page_cache_page() and shmem_replace_page():
both of which passed lrucare true, so just eliminate that argument.

Signed-off-by: Hugh Dickins
Cc: Christoph Lameter
Cc: "Kirill A. Shutemov"
Cc: Rik van Riel
Cc: Vlastimil Babka
Cc: Davidlohr Bueso
Cc: Oleg Nesterov
Cc: Sasha Levin
Cc: Dmitry Vyukov
Cc: KOSAKI Motohiro
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/memcontrol.h | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 998311e0c70c..c06c70b4404e 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -297,8 +297,7 @@ void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg);
 void mem_cgroup_uncharge(struct page *page);
 void mem_cgroup_uncharge_list(struct list_head *page_list);
 
-void mem_cgroup_migrate(struct page *oldpage, struct page *newpage,
-			bool lrucare);
+void mem_cgroup_replace_page(struct page *oldpage, struct page *newpage);
 
 struct lruvec *mem_cgroup_zone_lruvec(struct zone *, struct mem_cgroup *);
 struct lruvec *mem_cgroup_page_lruvec(struct page *, struct zone *);
@@ -537,9 +536,7 @@ static inline void mem_cgroup_uncharge_list(struct list_head *page_list)
 {
 }
 
-static inline void mem_cgroup_migrate(struct page *oldpage,
-				      struct page *newpage,
-				      bool lrucare)
+static inline void mem_cgroup_replace_page(struct page *old, struct page *new)
 {
 }
-- cgit v1.2.1

From 6071ca5201066f4b2a61cfb693dd186d6bc6e9f3 Mon Sep 17 00:00:00 2001
From: Johannes Weiner
Date: Thu, 5 Nov 2015 18:50:26 -0800
Subject: mm: page_counter: let page_counter_try_charge() return bool

page_counter_try_charge() currently returns 0 on success and -ENOMEM on
failure, which is surprising behavior given the function name. Make it
follow the expected pattern of try_stuff() functions that return a
boolean true to indicate success, or false for failure.

Signed-off-by: Johannes Weiner
Acked-by: Michal Hocko
Cc: Vladimir Davydov
Signed-off-by: Linus Torvalds
---
 include/linux/page_counter.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/page_counter.h b/include/linux/page_counter.h
index 17fa4f8de3a6..7e62920a3a94 100644
--- a/include/linux/page_counter.h
+++ b/include/linux/page_counter.h
@@ -36,9 +36,9 @@ static inline unsigned long page_counter_read(struct page_counter *counter)
 
 void page_counter_cancel(struct page_counter *counter, unsigned long nr_pages);
 void page_counter_charge(struct page_counter *counter, unsigned long nr_pages);
-int page_counter_try_charge(struct page_counter *counter,
-			    unsigned long nr_pages,
-			    struct page_counter **fail);
+bool page_counter_try_charge(struct page_counter *counter,
+			     unsigned long nr_pages,
+			     struct page_counter **fail);
 void page_counter_uncharge(struct page_counter *counter, unsigned long nr_pages);
 int page_counter_limit(struct page_counter *counter, unsigned long limit);
 int page_counter_memparse(const char *buf, const char *max,
-- cgit v1.2.1
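With the boolean convention, call sites read like other try_ helpers. A
sketch of the resulting caller pattern (the counter and helper names are
illustrative):

        /*
         * Sketch: charge nr_pages against a counter; on failure, 'fail'
         * points at the counter in the hierarchy that hit its limit.
         */
        static int example_charge(struct page_counter *counter, unsigned long nr_pages)
        {
                struct page_counter *fail;

                if (!page_counter_try_charge(counter, nr_pages, &fail))
                        return -ENOMEM; /* false now means the charge was rejected */

                /* ... the charge is held here; roll it back when done ... */
                page_counter_uncharge(counter, nr_pages);
                return 0;
        }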
From 5ba97bf9d8754bd984e81c1061fcda682681939e Mon Sep 17 00:00:00 2001
From: Tetsuo Handa
Date: Thu, 5 Nov 2015 18:50:40 -0800
Subject: mm: remove refresh_cpu_vm_stats() definition for !SMP kernel

refresh_cpu_vm_stats(int cpu) has not been referenced by the !SMP kernel
since Linux 3.12.

Signed-off-by: Tetsuo Handa
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/vmstat.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index 49dfe40b3673..5dbc8b0ee567 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -247,7 +247,6 @@ static inline void __dec_zone_page_state(struct page *page,
 
 #define set_pgdat_percpu_threshold(pgdat, callback) { }
 
-static inline void refresh_cpu_vm_stats(int cpu) { }
 static inline void refresh_zone_stat_thresholds(void) { }
 static inline void cpu_vm_stats_fold(int cpu) { }
 
-- cgit v1.2.1

From a8ca5d0ecbdde5cc3d7accacbd69968b0c98764e Mon Sep 17 00:00:00 2001
From: Eric B Munson
Date: Thu, 5 Nov 2015 18:51:33 -0800
Subject: mm: mlock: add new mlock system call

With the refactored mlock code, introduce a new system call for mlock.
The new call will allow the user to specify what lock states are being
added. mlock2 is trivial at the moment, but a follow-on patch will add a
new mlock state making it useful.

Signed-off-by: Eric B Munson
Acked-by: Michal Hocko
Acked-by: Vlastimil Babka
Cc: Heiko Carstens
Cc: Geert Uytterhoeven
Cc: Catalin Marinas
Cc: Stephen Rothwell
Cc: Guenter Roeck
Cc: Jonathan Corbet
Cc: Kirill A. Shutemov
Cc: Michael Kerrisk
Cc: Ralf Baechle
Cc: Shuah Khan
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/syscalls.h          | 2 ++
 include/uapi/asm-generic/unistd.h | 4 +++-
 2 files changed, 5 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index a460e2ef2843..a156b82dd14c 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -887,4 +887,6 @@ asmlinkage long sys_execveat(int dfd, const char __user *filename,
 
 asmlinkage long sys_membarrier(int cmd, int flags);
 
+asmlinkage long sys_mlock2(unsigned long start, size_t len, int flags);
+
 #endif
diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
index ee124009e12a..1324b0292ec2 100644
--- a/include/uapi/asm-generic/unistd.h
+++ b/include/uapi/asm-generic/unistd.h
@@ -713,9 +713,11 @@ __SC_COMP(__NR_execveat, sys_execveat, compat_sys_execveat)
 __SYSCALL(__NR_userfaultfd, sys_userfaultfd)
 #define __NR_membarrier 283
 __SYSCALL(__NR_membarrier, sys_membarrier)
+#define __NR_mlock2 284
+__SYSCALL(__NR_mlock2, sys_mlock2)
 
 #undef __NR_syscalls
-#define __NR_syscalls 284
+#define __NR_syscalls 285
 
 /*
  * All syscalls below here should go away really,
-- cgit v1.2.1
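Since libc wrappers for a brand-new syscall usually lag behind the
kernel, userspace would initially reach mlock2 through syscall(2). A
hedged example; note that 284 is the asm-generic number wired up above,
and other architectures assign their own numbers in follow-up patches:

        #include <stdio.h>
        #include <sys/syscall.h>
        #include <unistd.h>

        #ifndef __NR_mlock2
        #define __NR_mlock2 284 /* asm-generic value from this patch */
        #endif

        static long mlock2_syscall(void *start, size_t len, int flags)
        {
                return syscall(__NR_mlock2, start, len, flags);
        }

        int main(void)
        {
                static char buf[4 * 4096];

                /* flags == 0 behaves like plain mlock() until the
                 * follow-on patch adds MLOCK_ONFAULT. */
                if (mlock2_syscall(buf, sizeof(buf), 0) != 0)
                        perror("mlock2");
                return 0;
        }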
From de60f5f10c58d4f34b68622442c0e04180367f3f Mon Sep 17 00:00:00 2001
From: Eric B Munson
Date: Thu, 5 Nov 2015 18:51:36 -0800
Subject: mm: introduce VM_LOCKONFAULT

The cost of faulting in all memory to be locked can be very high when
working with large mappings. If only portions of the mapping will be
used, this can incur a high penalty for locking.

For the example of a large file, this is the usage pattern for a large
statistical language model (probably applies to other statistical or
graphical models as well). For the security example, consider any
application transacting in data that cannot be swapped out (credit card
data, medical records, etc.).

This patch introduces the ability to request that pages are not
pre-faulted, but are placed on the unevictable LRU when they are finally
faulted in. The VM_LOCKONFAULT flag will be used together with VM_LOCKED
and has no effect when set without VM_LOCKED. Setting the VM_LOCKONFAULT
flag for a VMA will cause pages faulted into that VMA to be added to the
unevictable LRU when they are faulted or if they are already present, but
will not cause any missing pages to be faulted in.

Exposing this new lock state means that we cannot overload the meaning of
the FOLL_POPULATE flag any longer. Prior to this patch it was used to
mean that the VMA for a fault was locked. This means we need the new
FOLL_MLOCK flag to communicate the locked state of a VMA. FOLL_POPULATE
will now only control whether the VMA should be populated, and in the
VM_LOCKONFAULT case it will not be set.

Signed-off-by: Eric B Munson
Acked-by: Kirill A. Shutemov
Acked-by: Vlastimil Babka
Cc: Michal Hocko
Cc: Jonathan Corbet
Cc: Catalin Marinas
Cc: Geert Uytterhoeven
Cc: Guenter Roeck
Cc: Heiko Carstens
Cc: Michael Kerrisk
Cc: Ralf Baechle
Cc: Shuah Khan
Cc: Stephen Rothwell
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/mm.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 3c258f8eb9ae..906c46a05707 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -139,6 +139,7 @@ extern unsigned int kobjsize(const void *objp);
 
 #define VM_DONTCOPY	0x00020000	/* Do not copy this vma on fork */
 #define VM_DONTEXPAND	0x00040000	/* Cannot expand with mremap() */
+#define VM_LOCKONFAULT	0x00080000	/* Lock the pages covered when they are faulted in */
 #define VM_ACCOUNT	0x00100000	/* Is a VM accounted object */
 #define VM_NORESERVE	0x00200000	/* should the VM suppress accounting */
 #define VM_HUGETLB	0x00400000	/* Huge TLB Page VM */
@@ -202,6 +203,9 @@ extern unsigned int kobjsize(const void *objp);
 /* This mask defines which mm->def_flags a process can inherit its parent */
 #define VM_INIT_DEF_MASK	VM_NOHUGEPAGE
 
+/* This mask is used to clear all the VMA flags used by mlock */
+#define VM_LOCKED_CLEAR_MASK	(~(VM_LOCKED | VM_LOCKONFAULT))
+
 /*
  * mapping from the currently active vm_flags protection bits (the
  * low four bits) to a page protection mask..
@@ -2137,6 +2141,7 @@ static inline struct page *follow_page(struct vm_area_struct *vma,
 #define FOLL_NUMA	0x200	/* force NUMA hinting page fault */
 #define FOLL_MIGRATION	0x400	/* wait for page to replace migration entry */
 #define FOLL_TRIED	0x800	/* a retry, previous pass started an IO */
+#define FOLL_MLOCK	0x1000	/* lock present pages */
 
 typedef int (*pte_fn_t)(pte_t *pte, pgtable_t token, unsigned long addr,
 			void *data);
-- cgit v1.2.1
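The decision the new flag encodes can be summarized in a hedged sketch
(an illustrative helper, not a function from this patch): a range is
prefaulted only when VM_LOCKED is set without VM_LOCKONFAULT, while pages
of a VM_LOCKED | VM_LOCKONFAULT VMA still land on the unevictable LRU
once something touches them:

        /* Sketch: should an mlock-time operation populate the range now? */
        static bool example_should_populate(unsigned long vm_flags)
        {
                return (vm_flags & (VM_LOCKED | VM_LOCKONFAULT)) == VM_LOCKED;
        }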
From b0f205c2a3082dd9081f9a94e50658c5fa906ff1 Mon Sep 17 00:00:00 2001
From: Eric B Munson
Date: Thu, 5 Nov 2015 18:51:39 -0800
Subject: mm: mlock: add mlock flags to enable VM_LOCKONFAULT usage

The previous patch introduced a flag that specified pages in a VMA should
be placed on the unevictable LRU, but they should not be made present
when the area is created. This patch adds the ability to set this state
via the new mlock system calls. We add MLOCK_ONFAULT for mlock2 and
MCL_ONFAULT for mlockall.

MLOCK_ONFAULT will set the VM_LOCKONFAULT modifier for VM_LOCKED.
MCL_ONFAULT should be used as a modifier to the two other mlockall flags.
When used with MCL_CURRENT, all current mappings will be marked with
VM_LOCKED | VM_LOCKONFAULT. When used with MCL_FUTURE, the mm->def_flags
will be marked with VM_LOCKED | VM_LOCKONFAULT. When used with both
MCL_CURRENT and MCL_FUTURE, all current mappings and mm->def_flags will
be marked with VM_LOCKED | VM_LOCKONFAULT.

Prior to this patch, mlockall() would unconditionally clear the
mm->def_flags any time it was called without MCL_FUTURE. This behavior
is maintained after adding MCL_ONFAULT. If a call to mlockall(MCL_FUTURE)
is followed by mlockall(MCL_CURRENT), the mm->def_flags will be cleared
and new VMAs will be unlocked. This remains true with or without
MCL_ONFAULT in either mlockall() invocation.

munlock() will unconditionally clear both vma flags. munlockall()
unconditionally clears both VMA flags on all VMAs and in the
mm->def_flags field.

Signed-off-by: Eric B Munson
Acked-by: Michal Hocko
Acked-by: Vlastimil Babka
Cc: Jonathan Corbet
Cc: Catalin Marinas
Cc: Geert Uytterhoeven
Cc: Guenter Roeck
Cc: Heiko Carstens
Cc: Kirill A. Shutemov
Cc: Michael Kerrisk
Cc: Ralf Baechle
Cc: Shuah Khan
Cc: Stephen Rothwell
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/uapi/asm-generic/mman-common.h | 5 +++++
 include/uapi/asm-generic/mman.h        | 1 +
 2 files changed, 6 insertions(+)

(limited to 'include')

diff --git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h
index ddc3b36f1046..a74dd84bbb6d 100644
--- a/include/uapi/asm-generic/mman-common.h
+++ b/include/uapi/asm-generic/mman-common.h
@@ -25,6 +25,11 @@
 # define MAP_UNINITIALIZED 0x0	/* Don't support this flag */
 #endif
 
+/*
+ * Flags for mlock
+ */
+#define MLOCK_ONFAULT	0x01	/* Lock pages in range after they are faulted in, do not prefault */
+
 #define MS_ASYNC	1	/* sync memory asynchronously */
 #define MS_INVALIDATE	2	/* invalidate the caches */
 #define MS_SYNC		4	/* synchronous memory sync */
diff --git a/include/uapi/asm-generic/mman.h b/include/uapi/asm-generic/mman.h
index e9fe6fd2a074..7162cd4cca73 100644
--- a/include/uapi/asm-generic/mman.h
+++ b/include/uapi/asm-generic/mman.h
@@ -17,5 +17,6 @@
 
 #define MCL_CURRENT	1	/* lock all current mappings */
 #define MCL_FUTURE	2	/* lock all future mappings */
+#define MCL_ONFAULT	4	/* lock all pages that are faulted in */
 
 #endif /* __ASM_GENERIC_MMAN_H */
-- cgit v1.2.1
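Putting the last three patches together, a hedged userspace example of
the on-fault behavior; the mlock2_syscall() wrapper from the earlier
example is assumed, the fallback defines mirror the uapi values added
above, and 'model'/'model_len' are illustrative names:

        #include <sys/mman.h>

        #ifndef MLOCK_ONFAULT
        #define MLOCK_ONFAULT 0x01      /* value from mman-common.h above */
        #endif
        #ifndef MCL_ONFAULT
        #define MCL_ONFAULT 4           /* value from mman.h above */
        #endif

        static int example_lock_on_fault(void *model, size_t model_len)
        {
                /* Pin a large, sparsely used mapping without prefaulting:
                 * pages are locked only once they are actually touched. */
                if (mlock2_syscall(model, model_len, MLOCK_ONFAULT) != 0)
                        return -1;

                /* Or apply the same policy process-wide, to current and
                 * future mappings. */
                return mlockall(MCL_CURRENT | MCL_FUTURE | MCL_ONFAULT);
        }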