From f1782c9bc547754f4bd3043fe8cfda53db85f13f Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Tue, 10 Apr 2018 16:27:44 -0700 Subject: dcache: account external names as indirectly reclaimable memory I received a report about suspicious growth of unreclaimable slabs on some machines. I've found that it happens on machines with low memory pressure, and these unreclaimable slabs are external names attached to dentries. External names are allocated using generic kmalloc() function, so they are accounted as unreclaimable. But they are held by dentries, which are reclaimable, and they will be reclaimed under the memory pressure. In particular, this breaks MemAvailable calculation, as it doesn't take unreclaimable slabs into account. This leads to a silly situation, when a machine is almost idle, has no memory pressure and therefore has a big dentry cache. And the resulting MemAvailable is too low to start a new workload. To address the issue, the NR_INDIRECTLY_RECLAIMABLE_BYTES counter is used to track the amount of memory, consumed by external names. The counter is increased in the dentry allocation path, if an external name structure is allocated; and it's decreased in the dentry freeing path. To reproduce the problem I've used the following Python script: import os for iter in range (0, 10000000): try: name = ("/some_long_name_%d" % iter) + "_" * 220 os.stat(name) except Exception: pass Without this patch: $ cat /proc/meminfo | grep MemAvailable MemAvailable: 7811688 kB $ python indirect.py $ cat /proc/meminfo | grep MemAvailable MemAvailable: 2753052 kB With the patch: $ cat /proc/meminfo | grep MemAvailable MemAvailable: 7809516 kB $ python indirect.py $ cat /proc/meminfo | grep MemAvailable MemAvailable: 7749144 kB [guro@fb.com: fix indirectly reclaimable memory accounting for CONFIG_SLOB] Link: http://lkml.kernel.org/r/20180312194140.19517-1-guro@fb.com [guro@fb.com: fix indirectly reclaimable memory accounting] Link: http://lkml.kernel.org/r/20180313125701.7955-1-guro@fb.com Link: http://lkml.kernel.org/r/20180305133743.12746-5-guro@fb.com Signed-off-by: Roman Gushchin Reviewed-by: Andrew Morton Cc: Alexander Viro Cc: Michal Hocko Cc: Johannes Weiner Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/dcache.c | 39 ++++++++++++++++++++++++++++++--------- 1 file changed, 30 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/dcache.c b/fs/dcache.c index 593079176123..915816e90049 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -257,11 +257,25 @@ static void __d_free(struct rcu_head *head) kmem_cache_free(dentry_cache, dentry); } +static void __d_free_external_name(struct rcu_head *head) +{ + struct external_name *name = container_of(head, struct external_name, + u.head); + + mod_node_page_state(page_pgdat(virt_to_page(name)), + NR_INDIRECTLY_RECLAIMABLE_BYTES, + -ksize(name)); + + kfree(name); +} + static void __d_free_external(struct rcu_head *head) { struct dentry *dentry = container_of(head, struct dentry, d_u.d_rcu); - kfree(external_name(dentry)); - kmem_cache_free(dentry_cache, dentry); + + __d_free_external_name(&external_name(dentry)->u.head); + + kmem_cache_free(dentry_cache, dentry); } static inline int dname_external(const struct dentry *dentry) @@ -291,7 +305,7 @@ void release_dentry_name_snapshot(struct name_snapshot *name) struct external_name *p; p = container_of(name->name, struct external_name, name[0]); if (unlikely(atomic_dec_and_test(&p->u.count))) - kfree_rcu(p, u.head); + call_rcu(&p->u.head, __d_free_external_name); } } 
EXPORT_SYMBOL(release_dentry_name_snapshot); @@ -1617,6 +1631,7 @@ EXPORT_SYMBOL(d_invalidate); struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name) { + struct external_name *ext = NULL; struct dentry *dentry; char *dname; int err; @@ -1637,14 +1652,14 @@ struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name) dname = dentry->d_iname; } else if (name->len > DNAME_INLINE_LEN-1) { size_t size = offsetof(struct external_name, name[1]); - struct external_name *p = kmalloc(size + name->len, - GFP_KERNEL_ACCOUNT); - if (!p) { + + ext = kmalloc(size + name->len, GFP_KERNEL_ACCOUNT); + if (!ext) { kmem_cache_free(dentry_cache, dentry); return NULL; } - atomic_set(&p->u.count, 1); - dname = p->name; + atomic_set(&ext->u.count, 1); + dname = ext->name; } else { dname = dentry->d_iname; } @@ -1683,6 +1698,12 @@ struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name) } } + if (unlikely(ext)) { + pg_data_t *pgdat = page_pgdat(virt_to_page(ext)); + mod_node_page_state(pgdat, NR_INDIRECTLY_RECLAIMABLE_BYTES, + ksize(ext)); + } + this_cpu_inc(nr_dentry); return dentry; @@ -2770,7 +2791,7 @@ static void copy_name(struct dentry *dentry, struct dentry *target) dentry->d_name.hash_len = target->d_name.hash_len; } if (old_name && likely(atomic_dec_and_test(&old_name->u.count))) - kfree_rcu(old_name, u.head); + call_rcu(&old_name->u.head, __d_free_external_name); } /* -- cgit v1.2.1 From 0e3dc019143104a6e676287b1e453cccd7add404 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Tue, 10 Apr 2018 16:30:44 -0700 Subject: procfs: add seq_put_hex_ll to speed up /proc/pid/maps seq_put_hex_ll() prints a number in hexadecimal notation and works faster than seq_printf(). == test.py num = 0 with open("/proc/1/maps") as f: while num < 10000 : data = f.read() f.seek(0, 0) num = num + 1 == == Before patch == $ time python test.py real 0m1.561s user 0m0.257s sys 0m1.302s == After patch == $ time python test.py real 0m0.986s user 0m0.279s sys 0m0.707s $ perf -g record python test.py: == Before patch == - 67.42% 2.82% python [kernel.kallsyms] [k] show_map_vma.isra.22 - 64.60% show_map_vma.isra.22 - 44.98% seq_printf - seq_vprintf - vsnprintf + 14.85% number + 12.22% format_decode 5.56% memcpy_erms + 15.06% seq_path + 4.42% seq_pad + 2.45% __GI___libc_read == After patch == - 47.35% 3.38% python [kernel.kallsyms] [k] show_map_vma.isra.23 - 43.97% show_map_vma.isra.23 + 20.84% seq_path - 15.73% show_vma_header_prefix 10.55% seq_put_hex_ll + 2.65% seq_put_decimal_ull 0.95% seq_putc + 6.96% seq_pad + 2.94% __GI___libc_read [avagin@openvz.org: use unsigned int instead of int where it is suitable] Link: http://lkml.kernel.org/r/20180214025619.4005-1-avagin@openvz.org [avagin@openvz.org: v2] Link: http://lkml.kernel.org/r/20180117082050.25406-1-avagin@openvz.org Link: http://lkml.kernel.org/r/20180112185812.7710-1-avagin@openvz.org Signed-off-by: Andrei Vagin Cc: Alexey Dobriyan Cc: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 21 ++++++++++++--------- fs/seq_file.c | 46 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 58 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index ec6d2983a5cb..b66fc8de7d34 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -287,15 +287,18 @@ static void show_vma_header_prefix(struct seq_file *m, dev_t dev, unsigned long ino) { seq_setwidth(m, 25 + sizeof(void *) * 6 - 1); - seq_printf(m, "%08lx-%08lx %c%c%c%c %08llx 
%02x:%02x %lu ", - start, - end, - flags & VM_READ ? 'r' : '-', - flags & VM_WRITE ? 'w' : '-', - flags & VM_EXEC ? 'x' : '-', - flags & VM_MAYSHARE ? 's' : 'p', - pgoff, - MAJOR(dev), MINOR(dev), ino); + seq_put_hex_ll(m, NULL, start, 8); + seq_put_hex_ll(m, "-", end, 8); + seq_putc(m, ' '); + seq_putc(m, flags & VM_READ ? 'r' : '-'); + seq_putc(m, flags & VM_WRITE ? 'w' : '-'); + seq_putc(m, flags & VM_EXEC ? 'x' : '-'); + seq_putc(m, flags & VM_MAYSHARE ? 's' : 'p'); + seq_put_hex_ll(m, " ", pgoff, 8); + seq_put_hex_ll(m, " ", MAJOR(dev), 2); + seq_put_hex_ll(m, ":", MINOR(dev), 2); + seq_put_decimal_ull(m, " ", ino); + seq_putc(m, ' '); } static void diff --git a/fs/seq_file.c b/fs/seq_file.c index eea09f6d8830..cde1bdbf7801 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c @@ -715,6 +715,52 @@ overflow: } EXPORT_SYMBOL(seq_put_decimal_ull); +/** + * seq_put_hex_ll - put a number in hexadecimal notation + * @m: seq_file identifying the buffer to which data should be written + * @delimiter: a string which is printed before the number + * @v: the number + * @width: a minimum field width + * + * seq_put_hex_ll(m, "", v, 8) is equal to seq_printf(m, "%08llx", v) + * + * This routine is very quick when you show lots of numbers. + * In usual cases, it will be better to use seq_printf(). It's easier to read. + */ +void seq_put_hex_ll(struct seq_file *m, const char *delimiter, + unsigned long long v, unsigned int width) +{ + unsigned int len; + int i; + + if (delimiter && delimiter[0]) { + if (delimiter[1] == 0) + seq_putc(m, delimiter[0]); + else + seq_puts(m, delimiter); + } + + /* If x is 0, the result of __builtin_clzll is undefined */ + if (v == 0) + len = 1; + else + len = (sizeof(v) * 8 - __builtin_clzll(v) + 3) / 4; + + if (len < width) + len = width; + + if (m->count + len > m->size) { + seq_set_overflow(m); + return; + } + + for (i = len - 1; i >= 0; i--) { + m->buf[m->count + i] = hex_asc[0xf & v]; + v = v >> 4; + } + m->count += len; +} + void seq_put_decimal_ll(struct seq_file *m, const char *delimiter, long long num) { int len; -- cgit v1.2.1 From 8cfa67b4d9a9d9a6061f3cfd0e0ed16e66e45984 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Tue, 10 Apr 2018 16:30:47 -0700 Subject: procfs: optimize seq_pad() to speed up /proc/pid/maps seq_printf() is slow and it can be replaced by memset() in this case. 
== test.py num = 0 with open("/proc/1/maps") as f: while num < 10000 : data = f.read() f.seek(0, 0) num = num + 1 == == Before patch == $ time python test.py real 0m0.986s user 0m0.279s sys 0m0.707s == After patch == $ time python test.py real 0m0.932s user 0m0.261s sys 0m0.669s $ perf record -g python test.py == Before patch == - 47.35% 3.38% python [kernel.kallsyms] [k] show_map_vma.isra.23 - 43.97% show_map_vma.isra.23 + 20.84% seq_path - 15.73% show_vma_header_prefix + 6.96% seq_pad + 2.94% __GI___libc_read == After patch == - 44.01% 0.34% python [kernel.kallsyms] [k] show_pid_map - 43.67% show_pid_map - 42.91% show_map_vma.isra.23 + 21.55% seq_path - 15.68% show_vma_header_prefix + 2.08% seq_pad 0.55% seq_putc Link: http://lkml.kernel.org/r/20180112185812.7710-2-avagin@openvz.org Signed-off-by: Andrei Vagin Cc: Alexey Dobriyan Cc: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/seq_file.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/seq_file.c b/fs/seq_file.c index cde1bdbf7801..3714ae1d5e1c 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c @@ -828,8 +828,14 @@ EXPORT_SYMBOL(seq_write); void seq_pad(struct seq_file *m, char c) { int size = m->pad_until - m->count; - if (size > 0) - seq_printf(m, "%*s", size, ""); + if (size > 0) { + if (size + m->count > m->size) { + seq_set_overflow(m); + return; + } + memset(m->buf + m->count, ' ', size); + m->count += size; + } if (c) seq_putc(m, c); } -- cgit v1.2.1 From 68c3411ff4a4cee53cc854c11ed191eaaf1956ba Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Tue, 10 Apr 2018 16:30:51 -0700 Subject: proc: get rid of task lock/unlock pair to read umask for the "status" file get_task_umask locks/unlocks the task on its own. The only caller does the same thing immediately after. Utilize the fact the task has to be locked anyway and just do it once. Since there are no other users and the code is short, fold it in. 
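[editor's note: the locking change below generalizes — a helper that takes and drops a lock its caller immediately re-takes can be folded into the caller's critical section; a minimal user-space sketch with invented names]

== fold_lock.c
#include <pthread.h>
#include <stdio.h>

struct task { pthread_mutex_t lock; int umask; int max_fds; };

/* one critical section now covers both reads instead of locking twice */
static void read_state(struct task *t, int *umask, int *max_fds)
{
	pthread_mutex_lock(&t->lock);
	*umask = t->umask;
	*max_fds = t->max_fds;
	pthread_mutex_unlock(&t->lock);
}

int main(void)
{
	struct task t = { PTHREAD_MUTEX_INITIALIZER, 022, 1024 };
	int umask, max_fds;

	read_state(&t, &umask, &max_fds);
	printf("Umask: %#04o, max_fds: %d\n", umask, max_fds);
	return 0;
}
==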
Link: http://lkml.kernel.org/r/1517995608-23683-1-git-send-email-mguzik@redhat.com Signed-off-by: Mateusz Guzik Reviewed-by: Alexey Dobriyan Cc: Konstantin Khlebnikov Cc: Jerome Marchand Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/array.c | 23 +++++------------------ 1 file changed, 5 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/proc/array.c b/fs/proc/array.c index 598803576e4c..851ec0915e4c 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -141,25 +141,12 @@ static inline const char *get_task_state(struct task_struct *tsk) return task_state_array[task_state_index(tsk)]; } -static inline int get_task_umask(struct task_struct *tsk) -{ - struct fs_struct *fs; - int umask = -ENOENT; - - task_lock(tsk); - fs = tsk->fs; - if (fs) - umask = fs->umask; - task_unlock(tsk); - return umask; -} - static inline void task_state(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *p) { struct user_namespace *user_ns = seq_user_ns(m); struct group_info *group_info; - int g, umask; + int g, umask = -1; struct task_struct *tracer; const struct cred *cred; pid_t ppid, tpid = 0, tgid, ngid; @@ -177,16 +164,16 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns, ngid = task_numa_group_id(p); cred = get_task_cred(p); - umask = get_task_umask(p); - if (umask >= 0) - seq_printf(m, "Umask:\t%#04o\n", umask); - task_lock(p); + if (p->fs) + umask = p->fs->umask; if (p->files) max_fds = files_fdtable(p->files)->max_fds; task_unlock(p); rcu_read_unlock(); + if (umask >= 0) + seq_printf(m, "Umask:\t%#04o\n", umask); seq_printf(m, "State:\t%s", get_task_state(p)); seq_put_decimal_ull(m, "\nTgid:\t", tgid); -- cgit v1.2.1 From 2f8974243507d9e5b0f214d7668a59a66b93f36c Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 10 Apr 2018 16:30:54 -0700 Subject: proc: do less stuff under ->pde_unload_lock Commit ca469f35a8e9ef ("deal with races between remove_proc_entry() and proc_reg_release()") moved too much stuff under ->pde_unload_lock, making a problem described in the series "[PATCH v5] procfs: Improve Scaling in proc" worse. While RCU is being figured out, move kfree() out of ->pde_unload_lock. On my potato, the difference is only a 0.5% speedup with concurrent open+read+close of /proc/cmdline, but the effect should be more noticeable on more capable machines.
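[editor's note: the close_pdeo() rework below follows a common pattern — unlink and snapshot under the spinlock, then free and signal after dropping it; a hedged user-space sketch, names invented]

== free_outside_lock.c
#include <pthread.h>
#include <stdlib.h>

struct node { struct node *next; void (*on_close)(void); };

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static struct node *head;

static void close_one(void)
{
	pthread_mutex_lock(&lock);
	struct node *n = head;			/* list_del() under the lock */
	head = n->next;
	void (*cb)(void) = n->on_close;		/* snapshot, like 'c = pdeo->c' */
	pthread_mutex_unlock(&lock);

	if (cb)					/* complete() outside the lock */
		cb();
	free(n);				/* kfree() outside the lock */
}

int main(void)
{
	head = calloc(1, sizeof(*head));
	close_one();
	return 0;
}
==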
$ perf stat -r 16 -- ./proc-j 16 Performance counter stats for './proc-j 16' (16 runs): 130569.502377 task-clock (msec) # 15.872 CPUs utilized ( +- 0.05% ) 19,169 context-switches # 0.147 K/sec ( +- 0.18% ) 15 cpu-migrations # 0.000 K/sec ( +- 3.27% ) 437 page-faults # 0.003 K/sec ( +- 1.25% ) 300,172,097,675 cycles # 2.299 GHz ( +- 0.05% ) 96,793,267,308 instructions # 0.32 insn per cycle ( +- 0.04% ) 22,798,342,298 branches # 174.607 M/sec ( +- 0.04% ) 111,764,687 branch-misses # 0.49% of all branches ( +- 0.47% ) 8.226574400 seconds time elapsed ( +- 0.05% ) ^^^^^^^^^^^ $ perf stat -r 16 -- ./proc-j 16 Performance counter stats for './proc-j 16' (16 runs): 129866.777392 task-clock (msec) # 15.869 CPUs utilized ( +- 0.04% ) 19,154 context-switches # 0.147 K/sec ( +- 0.66% ) 14 cpu-migrations # 0.000 K/sec ( +- 1.73% ) 431 page-faults # 0.003 K/sec ( +- 1.09% ) 298,556,520,546 cycles # 2.299 GHz ( +- 0.04% ) 96,525,366,833 instructions # 0.32 insn per cycle ( +- 0.04% ) 22,730,194,043 branches # 175.027 M/sec ( +- 0.04% ) 111,506,074 branch-misses # 0.49% of all branches ( +- 0.18% ) 8.183629778 seconds time elapsed ( +- 0.04% ) ^^^^^^^^^^^ Link: http://lkml.kernel.org/r/20180213132911.GA24298@avx2 Signed-off-by: Alexey Dobriyan Reviewed-by: Andrew Morton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/inode.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 6e8724958116..8118ce5df5c6 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -138,7 +138,7 @@ static void unuse_pde(struct proc_dir_entry *pde) complete(pde->pde_unload_completion); } -/* pde is locked */ +/* pde is locked on entry, unlocked on exit */ static void close_pdeo(struct proc_dir_entry *pde, struct pde_opener *pdeo) { /* @@ -157,9 +157,10 @@ static void close_pdeo(struct proc_dir_entry *pde, struct pde_opener *pdeo) pdeo->c = &c; spin_unlock(&pde->pde_unload_lock); wait_for_completion(&c); - spin_lock(&pde->pde_unload_lock); } else { struct file *file; + struct completion *c; + pdeo->closing = true; spin_unlock(&pde->pde_unload_lock); file = pdeo->file; @@ -167,8 +168,10 @@ static void close_pdeo(struct proc_dir_entry *pde, struct pde_opener *pdeo) spin_lock(&pde->pde_unload_lock); /* After ->release. */ list_del(&pdeo->lh); - if (unlikely(pdeo->c)) - complete(pdeo->c); + c = pdeo->c; + spin_unlock(&pde->pde_unload_lock); + if (unlikely(c)) + complete(c); kfree(pdeo); } } @@ -188,6 +191,7 @@ void proc_entry_rundown(struct proc_dir_entry *de) struct pde_opener *pdeo; pdeo = list_first_entry(&de->pde_openers, struct pde_opener, lh); close_pdeo(de, pdeo); + spin_lock(&de->pde_unload_lock); } spin_unlock(&de->pde_unload_lock); } @@ -375,7 +379,7 @@ static int proc_reg_release(struct inode *inode, struct file *file) list_for_each_entry(pdeo, &pde->pde_openers, lh) { if (pdeo->file == file) { close_pdeo(pde, pdeo); - break; + return 0; } } spin_unlock(&pde->pde_unload_lock); -- cgit v1.2.1 From e74a0effffbbea75fe2b6770948f84fcb0917cdd Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 10 Apr 2018 16:30:58 -0700 Subject: proc: move /proc/sysvipc creation to where it belongs Move the proc_mkdir() call within the sysvipc subsystem such that we avoid polluting proc_root_init() with petty cpp. 
[dave@stgolabs.net: contributed changelog] Link: http://lkml.kernel.org/r/20180216161732.GA10297@avx2 Signed-off-by: Alexey Dobriyan Reviewed-by: Andrew Morton Acked-by: Davidlohr Bueso Cc: Manfred Spraul Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/root.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'fs') diff --git a/fs/proc/root.c b/fs/proc/root.c index ede8e64974be..4a19e02c7ed0 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -136,10 +136,6 @@ void __init proc_root_init(void) proc_symlink("mounts", NULL, "self/mounts"); proc_net_init(); - -#ifdef CONFIG_SYSVIPC - proc_mkdir("sysvipc", NULL); -#endif proc_mkdir("fs", NULL); proc_mkdir("driver", NULL); proc_create_mount_point("fs/nfsd"); /* somewhere for the nfsd filesystem to be mounted */ -- cgit v1.2.1 From e7a6e291e30a00061c356bbcba0d9380943a1671 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 10 Apr 2018 16:31:01 -0700 Subject: proc: faster open/close of files without ->release hook The whole point of code in fs/proc/inode.c is to make sure ->release hook is called either at close() or at rmmod time. All of it is unnecessary if there is no ->release hook. Save allocation+list manipulations under spinlock in that case. Link: http://lkml.kernel.org/r/20180214063033.GA15579@avx2 Signed-off-by: Alexey Dobriyan Cc: Al Viro Cc: Kees Cook Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/inode.c | 41 +++++++++++++++++++++++------------------ 1 file changed, 23 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 8118ce5df5c6..0331ddbee4f6 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -342,31 +342,36 @@ static int proc_reg_open(struct inode *inode, struct file *file) * * Save every "struct file" with custom ->release hook. */ - pdeo = kmalloc(sizeof(struct pde_opener), GFP_KERNEL); - if (!pdeo) - return -ENOMEM; - - if (!use_pde(pde)) { - kfree(pdeo); + if (!use_pde(pde)) return -ENOENT; - } - open = pde->proc_fops->open; + release = pde->proc_fops->release; + if (release) { + pdeo = kmalloc(sizeof(struct pde_opener), GFP_KERNEL); + if (!pdeo) { + rv = -ENOMEM; + goto out_unuse; + } + } + open = pde->proc_fops->open; if (open) rv = open(inode, file); - if (rv == 0 && release) { - /* To know what to release. */ - pdeo->file = file; - pdeo->closing = false; - pdeo->c = NULL; - spin_lock(&pde->pde_unload_lock); - list_add(&pdeo->lh, &pde->pde_openers); - spin_unlock(&pde->pde_unload_lock); - } else - kfree(pdeo); + if (release) { + if (rv == 0) { + /* To know what to release. */ + pdeo->file = file; + pdeo->closing = false; + pdeo->c = NULL; + spin_lock(&pde->pde_unload_lock); + list_add(&pdeo->lh, &pde->pde_openers); + spin_unlock(&pde->pde_unload_lock); + } else + kfree(pdeo); + } +out_unuse: unuse_pde(pde); return rv; } -- cgit v1.2.1 From a9fabc3df4c68e05f023c6a5189f0104e200beca Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 10 Apr 2018 16:31:05 -0700 Subject: proc: randomize "struct pde_opener" The more the merrier.
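[editor's note: for context, __randomize_layout is a marker consumed by the GCC randstruct plugin; roughly, paraphrased from memory of the kernel's compiler headers — details may vary by version]

== randomize_layout.h
/* with the plugin enabled, field order of the tagged struct is shuffled
 * at build time; without it the marker expands to (almost) nothing */
#ifdef RANDSTRUCT_PLUGIN
#define __randomize_layout __attribute__((randomize_layout))
#else
#define __randomize_layout
#endif
==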
Link: http://lkml.kernel.org/r/20180214081935.GA17157@avx2 Signed-off-by: Alexey Dobriyan Cc: Al Viro Cc: Kees Cook Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/internal.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/proc/internal.h b/fs/proc/internal.h index d697c8ab0a14..713d5dfe3f05 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -177,7 +177,7 @@ struct pde_opener { struct list_head lh; bool closing; struct completion *c; -}; +} __randomize_layout; extern const struct inode_operations proc_link_inode_operations; extern const struct inode_operations proc_pid_link_inode_operations; -- cgit v1.2.1 From 195b8cf0689554db764f459730c81f741887aa5f Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 10 Apr 2018 16:31:09 -0700 Subject: proc: move "struct pde_opener" to kmem cache "struct pde_opener" is fixed size and we can have more granular approach to debugging. For those who don't know, per cache SLUB poisoning and red zoning don't work if there is at least one object allocated which is hopeless in case of kmalloc-64 but not in case of standalone cache. Although systemd opens 2 files from the get go, so it is hopeless after all. Link: http://lkml.kernel.org/r/20180214082306.GB17157@avx2 Signed-off-by: Alexey Dobriyan Cc: Al Viro Cc: Kees Cook Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/inode.c | 12 ++++++++---- fs/proc/internal.h | 2 +- fs/proc/root.c | 2 +- 3 files changed, 10 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 0331ddbee4f6..5349eb07ac29 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -54,6 +54,7 @@ static void proc_evict_inode(struct inode *inode) } static struct kmem_cache *proc_inode_cachep __ro_after_init; +static struct kmem_cache *pde_opener_cache __ro_after_init; static struct inode *proc_alloc_inode(struct super_block *sb) { @@ -92,7 +93,7 @@ static void init_once(void *foo) inode_init_once(&ei->vfs_inode); } -void __init proc_init_inodecache(void) +void __init proc_init_kmemcache(void) { proc_inode_cachep = kmem_cache_create("proc_inode_cache", sizeof(struct proc_inode), @@ -100,6 +101,9 @@ void __init proc_init_inodecache(void) SLAB_MEM_SPREAD|SLAB_ACCOUNT| SLAB_PANIC), init_once); + pde_opener_cache = + kmem_cache_create("pde_opener", sizeof(struct pde_opener), 0, + SLAB_PANIC, NULL); } static int proc_show_options(struct seq_file *seq, struct dentry *root) @@ -172,7 +176,7 @@ static void close_pdeo(struct proc_dir_entry *pde, struct pde_opener *pdeo) spin_unlock(&pde->pde_unload_lock); if (unlikely(c)) complete(c); - kfree(pdeo); + kmem_cache_free(pde_opener_cache, pdeo); } } @@ -347,7 +351,7 @@ static int proc_reg_open(struct inode *inode, struct file *file) release = pde->proc_fops->release; if (release) { - pdeo = kmalloc(sizeof(struct pde_opener), GFP_KERNEL); + pdeo = kmem_cache_alloc(pde_opener_cache, GFP_KERNEL); if (!pdeo) { rv = -ENOMEM; goto out_unuse; @@ -368,7 +372,7 @@ static int proc_reg_open(struct inode *inode, struct file *file) list_add(&pdeo->lh, &pde->pde_openers); spin_unlock(&pde->pde_unload_lock); } else - kfree(pdeo); + kmem_cache_free(pde_opener_cache, pdeo); } out_unuse: diff --git a/fs/proc/internal.h b/fs/proc/internal.h index 713d5dfe3f05..dc00ef8538cb 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -182,7 +182,7 @@ extern const struct inode_operations proc_link_inode_operations; extern const struct inode_operations proc_pid_link_inode_operations; -extern 
void proc_init_inodecache(void); +void proc_init_kmemcache(void); void set_proc_pid_nlink(void); extern struct inode *proc_get_inode(struct super_block *, struct proc_dir_entry *); extern int proc_fill_super(struct super_block *, void *data, int flags); diff --git a/fs/proc/root.c b/fs/proc/root.c index 4a19e02c7ed0..98797b762a71 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -125,7 +125,7 @@ void __init proc_root_init(void) { int err; - proc_init_inodecache(); + proc_init_kmemcache(); set_proc_pid_nlink(); err = register_filesystem(&proc_fs_type); if (err) -- cgit v1.2.1 From 2acddbe8168967adebf4623923242c9a4f9e1aee Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 10 Apr 2018 16:31:12 -0700 Subject: proc: account "struct pde_opener" The allocation is persistent, in fact, as any fool can open a file in /proc and sit on it. Link: http://lkml.kernel.org/r/20180214082409.GC17157@avx2 Signed-off-by: Alexey Dobriyan Cc: Al Viro Cc: Kees Cook Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 5349eb07ac29..89618836887d 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -103,7 +103,7 @@ void __init proc_init_kmemcache(void) init_once); pde_opener_cache = kmem_cache_create("pde_opener", sizeof(struct pde_opener), 0, - SLAB_PANIC, NULL); + SLAB_ACCOUNT|SLAB_PANIC, NULL); } static int proc_show_options(struct seq_file *seq, struct dentry *root) -- cgit v1.2.1 From d1be35cb6f96975d792a1535d3fe9b75239065ee Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Tue, 10 Apr 2018 16:31:16 -0700 Subject: proc: add seq_put_decimal_ull_width to speed up /proc/pid/smaps seq_put_decimal_ull_width(m, str, val, width) prints a decimal number with a specified minimal field width. It is the equivalent of seq_printf(m, "%s%*d", str, width, val), but it works much faster.
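[editor's note: a user-space model of the width handling described above — right-align the digits in a minimum field width without going through printf(); the helper name is invented]

== dec_width.c
#include <stdio.h>

static int put_dec_width(char *buf, unsigned long long v, unsigned int width)
{
	char tmp[20];
	int len = 0, out = 0, pad;

	do {				/* emit digits least-significant first */
		tmp[len++] = '0' + v % 10;
		v /= 10;
	} while (v);
	for (pad = (int)width - len; pad > 0; pad--)
		buf[out++] = ' ';	/* space-pad to the minimum width */
	while (len)
		buf[out++] = tmp[--len];
	return out;
}

int main(void)
{
	char buf[32];
	int n = put_dec_width(buf, 4096, 8);

	printf("[%.*s]\n", n, buf);	/* prints [    4096], like %8llu */
	return 0;
}
==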
== test_smaps.py num = 0 with open("/proc/1/smaps") as f: for x in xrange(10000): data = f.read() f.seek(0, 0) == == Before patch == $ time python test_smaps.py real 0m4.593s user 0m0.398s sys 0m4.158s == After patch == $ time python test_smaps.py real 0m3.828s user 0m0.413s sys 0m3.408s $ perf -g record python test_smaps.py == Before patch == - 79.01% 3.36% python [kernel.kallsyms] [k] show_smap.isra.33 - 75.65% show_smap.isra.33 + 48.85% seq_printf + 15.75% __walk_page_range + 9.70% show_map_vma.isra.23 0.61% seq_puts == After patch == - 75.51% 4.62% python [kernel.kallsyms] [k] show_smap.isra.33 - 70.88% show_smap.isra.33 + 24.82% seq_put_decimal_ull_w + 19.78% __walk_page_range + 12.74% seq_printf + 11.08% show_map_vma.isra.23 + 1.68% seq_puts [akpm@linux-foundation.org: fix drivers/of/unittest.c build] Link: http://lkml.kernel.org/r/20180212074931.7227-1-avagin@openvz.org Signed-off-by: Andrei Vagin Cc: Alexey Dobriyan Cc: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/meminfo.c | 15 +------ fs/proc/task_mmu.c | 127 ++++++++++++++++++++++------------------------------- fs/seq_file.c | 28 +++++++++--- 3 files changed, 74 insertions(+), 96 deletions(-) (limited to 'fs') diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index 6bb20f864259..65a72ab57471 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -26,20 +26,7 @@ void __attribute__((weak)) arch_report_meminfo(struct seq_file *m) static void show_val_kb(struct seq_file *m, const char *s, unsigned long num) { - char v[32]; - static const char blanks[7] = {' ', ' ', ' ', ' ',' ', ' ', ' '}; - int len; - - len = num_to_str(v, sizeof(v), num << (PAGE_SHIFT - 10)); - - seq_write(m, s, 16); - - if (len > 0) { - if (len < 8) - seq_write(m, blanks, 8 - len); - - seq_write(m, v, len); - } + seq_put_decimal_ull_width(m, s, num << (PAGE_SHIFT - 10), 8); seq_write(m, " kB\n", 4); } diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index b66fc8de7d34..3026feda0432 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -24,6 +24,8 @@ #include #include "internal.h" +#define SEQ_PUT_DEC(str, val) \ + seq_put_decimal_ull_width(m, str, (val) << (PAGE_SHIFT-10), 8) void task_mem(struct seq_file *m, struct mm_struct *mm) { unsigned long text, lib, swap, anon, file, shmem; @@ -53,39 +55,28 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) lib = (mm->exec_vm << PAGE_SHIFT) - text; swap = get_mm_counter(mm, MM_SWAPENTS); - seq_printf(m, - "VmPeak:\t%8lu kB\n" - "VmSize:\t%8lu kB\n" - "VmLck:\t%8lu kB\n" - "VmPin:\t%8lu kB\n" - "VmHWM:\t%8lu kB\n" - "VmRSS:\t%8lu kB\n" - "RssAnon:\t%8lu kB\n" - "RssFile:\t%8lu kB\n" - "RssShmem:\t%8lu kB\n" - "VmData:\t%8lu kB\n" - "VmStk:\t%8lu kB\n" - "VmExe:\t%8lu kB\n" - "VmLib:\t%8lu kB\n" - "VmPTE:\t%8lu kB\n" - "VmSwap:\t%8lu kB\n", - hiwater_vm << (PAGE_SHIFT-10), - total_vm << (PAGE_SHIFT-10), - mm->locked_vm << (PAGE_SHIFT-10), - mm->pinned_vm << (PAGE_SHIFT-10), - hiwater_rss << (PAGE_SHIFT-10), - total_rss << (PAGE_SHIFT-10), - anon << (PAGE_SHIFT-10), - file << (PAGE_SHIFT-10), - shmem << (PAGE_SHIFT-10), - mm->data_vm << (PAGE_SHIFT-10), - mm->stack_vm << (PAGE_SHIFT-10), - text >> 10, - lib >> 10, - mm_pgtables_bytes(mm) >> 10, - swap << (PAGE_SHIFT-10)); + SEQ_PUT_DEC("VmPeak:\t", hiwater_vm); + SEQ_PUT_DEC(" kB\nVmSize:\t", total_vm); + SEQ_PUT_DEC(" kB\nVmLck:\t", mm->locked_vm); + SEQ_PUT_DEC(" kB\nVmPin:\t", mm->pinned_vm); + SEQ_PUT_DEC(" kB\nVmHWM:\t", hiwater_rss); + SEQ_PUT_DEC(" kB\nVmRSS:\t", total_rss); + SEQ_PUT_DEC(" 
kB\nRssAnon:\t", anon); + SEQ_PUT_DEC(" kB\nRssFile:\t", file); + SEQ_PUT_DEC(" kB\nRssShmem:\t", shmem); + SEQ_PUT_DEC(" kB\nVmData:\t", mm->data_vm); + SEQ_PUT_DEC(" kB\nVmStk:\t", mm->stack_vm); + seq_put_decimal_ull_width(m, + " kB\nVmExe:\t", text >> 10, 8); + seq_put_decimal_ull_width(m, + " kB\nVmLib:\t", lib >> 10, 8); + seq_put_decimal_ull_width(m, + " kB\nVmPTE:\t", mm_pgtables_bytes(mm) >> 10, 8); + SEQ_PUT_DEC(" kB\nVmSwap:\t", swap); + seq_puts(m, " kB\n"); hugetlb_report_usage(m, mm); } +#undef SEQ_PUT_DEC unsigned long task_vsize(struct mm_struct *mm) { @@ -739,6 +730,8 @@ void __weak arch_show_smap(struct seq_file *m, struct vm_area_struct *vma) { } +#define SEQ_PUT_DEC(str, val) \ + seq_put_decimal_ull_width(m, str, (val) >> 10, 8) static int show_smap(struct seq_file *m, void *v, int is_pid) { struct proc_maps_private *priv = m->private; @@ -812,51 +805,34 @@ static int show_smap(struct seq_file *m, void *v, int is_pid) ret = SEQ_SKIP; } - if (!rollup_mode) - seq_printf(m, - "Size: %8lu kB\n" - "KernelPageSize: %8lu kB\n" - "MMUPageSize: %8lu kB\n", - (vma->vm_end - vma->vm_start) >> 10, - vma_kernel_pagesize(vma) >> 10, - vma_mmu_pagesize(vma) >> 10); - - - if (!rollup_mode || last_vma) - seq_printf(m, - "Rss: %8lu kB\n" - "Pss: %8lu kB\n" - "Shared_Clean: %8lu kB\n" - "Shared_Dirty: %8lu kB\n" - "Private_Clean: %8lu kB\n" - "Private_Dirty: %8lu kB\n" - "Referenced: %8lu kB\n" - "Anonymous: %8lu kB\n" - "LazyFree: %8lu kB\n" - "AnonHugePages: %8lu kB\n" - "ShmemPmdMapped: %8lu kB\n" - "Shared_Hugetlb: %8lu kB\n" - "Private_Hugetlb: %7lu kB\n" - "Swap: %8lu kB\n" - "SwapPss: %8lu kB\n" - "Locked: %8lu kB\n", - mss->resident >> 10, - (unsigned long)(mss->pss >> (10 + PSS_SHIFT)), - mss->shared_clean >> 10, - mss->shared_dirty >> 10, - mss->private_clean >> 10, - mss->private_dirty >> 10, - mss->referenced >> 10, - mss->anonymous >> 10, - mss->lazyfree >> 10, - mss->anonymous_thp >> 10, - mss->shmem_thp >> 10, - mss->shared_hugetlb >> 10, - mss->private_hugetlb >> 10, - mss->swap >> 10, - (unsigned long)(mss->swap_pss >> (10 + PSS_SHIFT)), - (unsigned long)(mss->pss >> (10 + PSS_SHIFT))); + if (!rollup_mode) { + SEQ_PUT_DEC("Size: ", vma->vm_end - vma->vm_start); + SEQ_PUT_DEC(" kB\nKernelPageSize: ", vma_kernel_pagesize(vma)); + SEQ_PUT_DEC(" kB\nMMUPageSize: ", vma_mmu_pagesize(vma)); + seq_puts(m, " kB\n"); + } + if (!rollup_mode || last_vma) { + SEQ_PUT_DEC("Rss: ", mss->resident); + SEQ_PUT_DEC(" kB\nPss: ", mss->pss >> PSS_SHIFT); + SEQ_PUT_DEC(" kB\nShared_Clean: ", mss->shared_clean); + SEQ_PUT_DEC(" kB\nShared_Dirty: ", mss->shared_dirty); + SEQ_PUT_DEC(" kB\nPrivate_Clean: ", mss->private_clean); + SEQ_PUT_DEC(" kB\nPrivate_Dirty: ", mss->private_dirty); + SEQ_PUT_DEC(" kB\nReferenced: ", mss->referenced); + SEQ_PUT_DEC(" kB\nAnonymous: ", mss->anonymous); + SEQ_PUT_DEC(" kB\nLazyFree: ", mss->lazyfree); + SEQ_PUT_DEC(" kB\nAnonHugePages: ", mss->anonymous_thp); + SEQ_PUT_DEC(" kB\nShmemPmdMapped: ", mss->shmem_thp); + SEQ_PUT_DEC(" kB\nShared_Hugetlb: ", mss->shared_hugetlb); + seq_put_decimal_ull_width(m, " kB\nPrivate_Hugetlb: ", + mss->private_hugetlb >> 10, 7); + SEQ_PUT_DEC(" kB\nSwap: ", mss->swap); + SEQ_PUT_DEC(" kB\nSwapPss: ", + mss->swap_pss >> PSS_SHIFT); + SEQ_PUT_DEC(" kB\nLocked: ", mss->pss >> PSS_SHIFT); + seq_puts(m, " kB\n"); + } if (!rollup_mode) { arch_show_smap(m, vma); show_smap_vma_flags(m, vma); @@ -864,6 +840,7 @@ static int show_smap(struct seq_file *m, void *v, int is_pid) m_cache_vma(m, vma); return ret; } +#undef SEQ_PUT_DEC static 
int show_pid_smap(struct seq_file *m, void *v) { diff --git a/fs/seq_file.c b/fs/seq_file.c index 3714ae1d5e1c..84650ad3b1bf 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c @@ -673,15 +673,20 @@ void seq_puts(struct seq_file *m, const char *s) } EXPORT_SYMBOL(seq_puts); -/* +/** * A helper routine for putting decimal numbers without rich format of printf(). * only 'unsigned long long' is supported. - * This routine will put strlen(delimiter) + number into seq_file. + * @m: seq_file identifying the buffer to which data should be written + * @delimiter: a string which is printed before the number + * @num: the number + * @width: a minimum field width + * + * This routine will put strlen(delimiter) + number into seq_filed. * This routine is very quick when you show lots of numbers. * In usual cases, it will be better to use seq_printf(). It's easier to read. */ -void seq_put_decimal_ull(struct seq_file *m, const char *delimiter, - unsigned long long num) +void seq_put_decimal_ull_width(struct seq_file *m, const char *delimiter, + unsigned long long num, unsigned int width) { int len; @@ -695,7 +700,10 @@ void seq_put_decimal_ull(struct seq_file *m, const char *delimiter, memcpy(m->buf + m->count, delimiter, len); m->count += len; - if (m->count + 1 >= m->size) + if (!width) + width = 1; + + if (m->count + width >= m->size) goto overflow; if (num < 10) { @@ -703,7 +711,7 @@ void seq_put_decimal_ull(struct seq_file *m, const char *delimiter, return; } - len = num_to_str(m->buf + m->count, m->size - m->count, num); + len = num_to_str(m->buf + m->count, m->size - m->count, num, width); if (!len) goto overflow; @@ -713,6 +721,12 @@ void seq_put_decimal_ull(struct seq_file *m, const char *delimiter, overflow: seq_set_overflow(m); } + +void seq_put_decimal_ull(struct seq_file *m, const char *delimiter, + unsigned long long num) +{ + return seq_put_decimal_ull_width(m, delimiter, num, 0); +} EXPORT_SYMBOL(seq_put_decimal_ull); /** @@ -788,7 +802,7 @@ void seq_put_decimal_ll(struct seq_file *m, const char *delimiter, long long num return; } - len = num_to_str(m->buf + m->count, m->size - m->count, num); + len = num_to_str(m->buf + m->count, m->size - m->count, num, 0); if (!len) goto overflow; -- cgit v1.2.1 From f66406638fffe874c56e7e41106167c5235f251e Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Tue, 10 Apr 2018 16:31:19 -0700 Subject: proc: replace seq_printf on seq_putc to speed up /proc/pid/smaps seq_putc() works much faster than seq_printf() == Before patch == $ time python test_smaps.py real 0m3.828s user 0m0.413s sys 0m3.408s == After patch == $ time python test_smaps.py real 0m3.405s user 0m0.401s sys 0m3.003s == Before patch == - 75.51% 4.62% python [kernel.kallsyms] [k] show_smap.isra.33 - 70.88% show_smap.isra.33 + 24.82% seq_put_decimal_ull_aligned + 19.78% __walk_page_range + 12.74% seq_printf + 11.08% show_map_vma.isra.23 + 1.68% seq_puts == After patch == - 69.16% 5.70% python [kernel.kallsyms] [k] show_smap.isra.33 - 63.46% show_smap.isra.33 + 25.98% seq_put_decimal_ull_aligned + 20.90% __walk_page_range + 12.60% show_map_vma.isra.23 1.56% seq_putc + 1.55% seq_puts Link: http://lkml.kernel.org/r/20180212074931.7227-2-avagin@openvz.org Signed-off-by: Andrei Vagin Reviewed-by: Alexey Dobriyan Cc: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 3026feda0432..65ae54659833 100644 --- 
a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -688,8 +688,9 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma) if (!mnemonics[i][0]) continue; if (vma->vm_flags & (1UL << i)) { - seq_printf(m, "%c%c ", - mnemonics[i][0], mnemonics[i][1]); + seq_putc(m, mnemonics[i][0]); + seq_putc(m, mnemonics[i][1]); + seq_putc(m, ' '); } } seq_putc(m, '\n'); -- cgit v1.2.1 From 48dffbf82d2f17bc6dd3c2b7fd733738ea567914 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Tue, 10 Apr 2018 16:31:23 -0700 Subject: proc: optimize single-symbol delimiters to speed up seq_put_decimal_ull A delimiter is a string which is printed before a number. A single-symbol delimiter can be printed by seq_putc(), and this works faster than printing by seq_puts(). == test_proc.c int main(int argc, char **argv) { int n, i, fd; char buf[16384]; n = atoi(argv[1]); for (i = 0; i < n; i++) { fd = open(argv[2], O_RDONLY); if (fd < 0) return 1; if (read(fd, buf, sizeof(buf)) <= 0) return 1; close(fd); } return 0; } == $ time ./test_proc 1000000 /proc/1/stat == Before patch == real 0m3.820s user 0m0.337s sys 0m3.394s == After patch == real 0m3.110s user 0m0.324s sys 0m2.700s Link: http://lkml.kernel.org/r/20180212074931.7227-3-avagin@openvz.org Signed-off-by: Andrei Vagin Cc: Alexey Dobriyan Cc: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/seq_file.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/seq_file.c b/fs/seq_file.c index 84650ad3b1bf..0677e89f3c6f 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c @@ -693,12 +693,12 @@ void seq_put_decimal_ull_width(struct seq_file *m, const char *delimiter, if (m->count + 2 >= m->size) /* we'll write 2 bytes at least */ goto overflow; - len = strlen(delimiter); - if (m->count + len >= m->size) - goto overflow; - - memcpy(m->buf + m->count, delimiter, len); - m->count += len; + if (delimiter && delimiter[0]) { + if (delimiter[1] == 0) + seq_putc(m, delimiter[0]); + else + seq_puts(m, delimiter); + } if (!width) width = 1; @@ -782,12 +782,12 @@ void seq_put_decimal_ll(struct seq_file *m, const char *delimiter, long long num if (m->count + 3 >= m->size) /* we'll write 2 bytes at least */ goto overflow; - len = strlen(delimiter); - if (m->count + len >= m->size) - goto overflow; - - memcpy(m->buf + m->count, delimiter, len); - m->count += len; + if (delimiter && delimiter[0]) { + if (delimiter[1] == 0) + seq_putc(m, delimiter[0]); + else + seq_puts(m, delimiter); + } if (m->count + 2 >= m->size) goto overflow; -- cgit v1.2.1 From d0f02231222b313d1b49278cd2e3c7e7406fea6d Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Tue, 10 Apr 2018 16:31:26 -0700 Subject: proc: replace seq_printf by seq_put_smth to speed up /proc/pid/status seq_printf() works slower than seq_puts(), seq_putc(), etc.
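[editor's note: a crude user-space comparison of printf-style formatting against direct character stores, mirroring the seq_printf() -> seq_putc()/seq_puts() conversions in these patches; absolute numbers will vary]

== putc_vs_printf.c
#include <stdio.h>
#include <time.h>

static double secs(struct timespec a, struct timespec b)
{
	return (b.tv_sec - a.tv_sec) + (b.tv_nsec - a.tv_nsec) / 1e9;
}

int main(void)
{
	char buf[8];
	volatile char sink;
	struct timespec a, b;
	long i, n = 10000000;

	clock_gettime(CLOCK_MONOTONIC, &a);
	for (i = 0; i < n; i++) {
		snprintf(buf, sizeof(buf), "%c%c ", 'r', 'w');
		sink = buf[0];	/* keep the loop from being elided */
	}
	clock_gettime(CLOCK_MONOTONIC, &b);
	printf("snprintf: %.2fs\n", secs(a, b));

	clock_gettime(CLOCK_MONOTONIC, &a);
	for (i = 0; i < n; i++) {
		buf[0] = 'r'; buf[1] = 'w'; buf[2] = ' ';
		sink = buf[0];
	}
	clock_gettime(CLOCK_MONOTONIC, &b);
	printf("stores:   %.2fs\n", secs(a, b));
	(void)sink;
	return 0;
}
==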
== test_proc.c int main(int argc, char **argv) { int n, i, fd; char buf[16384]; n = atoi(argv[1]); for (i = 0; i < n; i++) { fd = open(argv[2], O_RDONLY); if (fd < 0) return 1; if (read(fd, buf, sizeof(buf)) <= 0) return 1; close(fd); } return 0; } == $ time ./test_proc 1000000 /proc/1/status == Before patch == real 0m5.171s user 0m0.328s sys 0m4.783s == After patch == real 0m4.761s user 0m0.334s sys 0m4.366s Link: http://lkml.kernel.org/r/20180212074931.7227-4-avagin@openvz.org Signed-off-by: Andrei Vagin Cc: Alexey Dobriyan Cc: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/array.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/proc/array.c b/fs/proc/array.c index 851ec0915e4c..ae2c807fd719 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -174,7 +174,8 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns, if (umask >= 0) seq_printf(m, "Umask:\t%#04o\n", umask); - seq_printf(m, "State:\t%s", get_task_state(p)); + seq_puts(m, "State:\t"); + seq_puts(m, get_task_state(p)); seq_put_decimal_ull(m, "\nTgid:\t", tgid); seq_put_decimal_ull(m, "\nNgid:\t", ngid); @@ -300,8 +301,8 @@ static void render_cap_t(struct seq_file *m, const char *header, seq_puts(m, header); CAP_FOR_EACH_U32(__capi) { - seq_printf(m, "%08x", - a->cap[CAP_LAST_U32 - __capi]); + seq_put_hex_ll(m, NULL, + a->cap[CAP_LAST_U32 - __capi], 8); } seq_putc(m, '\n'); } @@ -355,7 +356,8 @@ static void task_cpus_allowed(struct seq_file *m, struct task_struct *task) static inline void task_core_dumping(struct seq_file *m, struct mm_struct *mm) { - seq_printf(m, "CoreDumping:\t%d\n", !!mm->core_state); + seq_put_decimal_ull(m, "CoreDumping:\t", !!mm->core_state); + seq_putc(m, '\n'); } int proc_pid_status(struct seq_file *m, struct pid_namespace *ns, @@ -491,7 +493,11 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, /* convert nsec -> ticks */ start_time = nsec_to_clock_t(task->real_start_time); - seq_printf(m, "%d (%s) %c", pid_nr_ns(pid, ns), tcomm, state); + seq_put_decimal_ull(m, "", pid_nr_ns(pid, ns)); + seq_puts(m, " ("); + seq_puts(m, tcomm); + seq_puts(m, ") "); + seq_putc(m, state); seq_put_decimal_ll(m, " ", ppid); seq_put_decimal_ll(m, " ", pgid); seq_put_decimal_ll(m, " ", sid); -- cgit v1.2.1 From 24b2ec21192c963c17a1b687b6171e95e8b59c06 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 10 Apr 2018 16:31:30 -0700 Subject: proc: check permissions earlier for /proc/*/wchan get_wchan() accesses the stack page before permissions are checked; let's not play this game.
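[editor's note: the reordering below amounts to "authorize, then touch" — an unauthorized reader must not trigger the stack walk at all; a minimal user-space model, names invented]

== check_then_touch.c
#include <stdbool.h>
#include <stdio.h>

static unsigned long get_wchan_model(void)
{
	puts("  (walking target stack)");	/* the unsafe part */
	return 0xc0ffee;
}

static void show_wchan_model(bool may_access)
{
	if (!may_access) {
		puts("0");	/* deny without looking at the stack */
		return;
	}
	printf("%#lx\n", get_wchan_model());
}

int main(void)
{
	show_wchan_model(false);	/* stack is never touched */
	show_wchan_model(true);
	return 0;
}
==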
Link: http://lkml.kernel.org/r/20180217071923.GA16074@avx2 Signed-off-by: Alexey Dobriyan Reviewed-by: Andrew Morton Cc: Andy Shevchenko Cc: Rasmus Villemoes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/base.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/proc/base.c b/fs/proc/base.c index d53246863cfb..d8b5a1653444 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -388,14 +388,17 @@ static int proc_pid_wchan(struct seq_file *m, struct pid_namespace *ns, unsigned long wchan; char symname[KSYM_NAME_LEN]; - wchan = get_wchan(task); + if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) + goto print0; - if (wchan && ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS) && !lookup_symbol_name(wchan, symname)) + wchan = get_wchan(task); + if (wchan && !lookup_symbol_name(wchan, symname)) { seq_printf(m, "%s", symname); - else - seq_putc(m, '0'); + return 0; + } +print0: + seq_putc(m, '0'); return 0; } #endif /* CONFIG_KALLSYMS */ -- cgit v1.2.1 From 21dae0ad07e6c4d3fa1bd9a91a8b51be316a5111 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 10 Apr 2018 16:31:34 -0700 Subject: proc: use seq_puts() at /proc/*/wchan Link: http://lkml.kernel.org/r/20180217072011.GB16074@avx2 Signed-off-by: Alexey Dobriyan Reviewed-by: Andrew Morton Cc: Andy Shevchenko Cc: Rasmus Villemoes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/base.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/proc/base.c b/fs/proc/base.c index d8b5a1653444..e9e7652b77da 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -393,7 +393,7 @@ static int proc_pid_wchan(struct seq_file *m, struct pid_namespace *ns, wchan = get_wchan(task); if (wchan && !lookup_symbol_name(wchan, symname)) { - seq_printf(m, "%s", symname); + seq_puts(m, symname); return 0; } -- cgit v1.2.1 From a0b0d1c345d0317efe594df268feb5ccc99f651e Mon Sep 17 00:00:00 2001 From: Danilo Krummrich Date: Tue, 10 Apr 2018 16:31:38 -0700 Subject: fs/proc/proc_sysctl.c: fix potential page fault while unregistering sysctl table proc_sys_link_fill_cache() does not take currently unregistering sysctl tables into account, which might result in a page fault in sysctl_follow_link() - add a check to fix it. This bug has been present since v3.4. Link: http://lkml.kernel.org/r/20180228013506.4915-1-danilokrummrich@dk-develop.de Fixes: 0e47c99d7fe25 ("sysctl: Replace root_list with links between sysctl_table_sets") Signed-off-by: Danilo Krummrich Acked-by: Kees Cook Reviewed-by: Andrew Morton Cc: "Luis R . Rodriguez" Cc: "Eric W.
Biederman" Cc: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/proc_sysctl.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index c41ab261397d..7da10e595297 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -707,7 +707,10 @@ static bool proc_sys_link_fill_cache(struct file *file, struct ctl_table *table) { bool ret = true; + head = sysctl_head_grab(head); + if (IS_ERR(head)) + return false; if (S_ISLNK(table->mode)) { /* It is not an error if we can not follow the link ignore it */ -- cgit v1.2.1 From 835b94e05c92e6e8df48112770e624cee192a057 Mon Sep 17 00:00:00 2001 From: Danilo Krummrich Date: Tue, 10 Apr 2018 16:31:41 -0700 Subject: fs/proc/proc_sysctl.c: remove redundant link check in proc_sys_link_fill_cache() proc_sys_link_fill_cache() does not need to check whether we're called for a link - it's already done by scan(). Link: http://lkml.kernel.org/r/20180228013506.4915-2-danilokrummrich@dk-develop.de Signed-off-by: Danilo Krummrich Acked-by: Kees Cook Reviewed-by: Andrew Morton Cc: Alexey Dobriyan Cc: "Eric W. Biederman" Cc: "Luis R . Rodriguez" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/proc_sysctl.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 7da10e595297..4654fc3c246f 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -712,12 +712,9 @@ static bool proc_sys_link_fill_cache(struct file *file, if (IS_ERR(head)) return false; - if (S_ISLNK(table->mode)) { - /* It is not an error if we can not follow the link ignore it */ - int err = sysctl_follow_link(&head, &table); - if (err) - goto out; - } + /* It is not an error if we can not follow the link ignore it */ + if (sysctl_follow_link(&head, &table)) + goto out; ret = proc_sys_fill_cache(file, ctx, head, table); out: -- cgit v1.2.1 From b4884f23331ae31e9ecb617956986c3b76ab9a91 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 10 Apr 2018 16:31:52 -0700 Subject: proc: move "struct proc_dir_entry" into kmem cache "struct proc_dir_entry" is variable sized because of 0-length trailing array for name, however, because of SLAB padding allocations it is possible to make "struct proc_dir_entry" fixed sized and allocate same amount of memory. It buys fine-grained debugging with poisoning and usercopy protection which is not possible with kmalloc-* caches. Currently, on 32-bit 91+ byte allocations go into kmalloc-128 and on 64-bit 147+ byte allocations go to kmalloc-192 anyway. Additional memory is allocated only for 38/46+ byte long names which are rare or may not even exist in the wild. 
Link: http://lkml.kernel.org/r/20180223205504.GA17139@avx2 Signed-off-by: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/generic.c | 50 ++++++++++++++++++++++++++++++++------------------ fs/proc/inode.c | 4 ++++ fs/proc/internal.h | 11 ++++++++++- fs/proc/proc_net.c | 7 ++++--- fs/proc/root.c | 3 ++- 5 files changed, 52 insertions(+), 23 deletions(-) (limited to 'fs') diff --git a/fs/proc/generic.c b/fs/proc/generic.c index 5d709fa8f3a2..800247a256c9 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -8,6 +8,7 @@ * Copyright (C) 1997 Theodore Ts'o */ +#include #include #include #include @@ -28,6 +29,17 @@ static DEFINE_RWLOCK(proc_subdir_lock); +struct kmem_cache *proc_dir_entry_cache __ro_after_init; + +void pde_free(struct proc_dir_entry *pde) +{ + if (S_ISLNK(pde->mode)) + kfree(pde->data); + if (pde->name != pde->inline_name) + kfree(pde->name); + kmem_cache_free(proc_dir_entry_cache, pde); +} + static int proc_match(const char *name, struct proc_dir_entry *de, unsigned int len) { if (len < de->namelen) @@ -363,10 +375,20 @@ static struct proc_dir_entry *__proc_create(struct proc_dir_entry **parent, return NULL; } - ent = kzalloc(sizeof(struct proc_dir_entry) + qstr.len + 1, GFP_KERNEL); + ent = kmem_cache_zalloc(proc_dir_entry_cache, GFP_KERNEL); if (!ent) goto out; + if (qstr.len + 1 <= sizeof(ent->inline_name)) { + ent->name = ent->inline_name; + } else { + ent->name = kmalloc(qstr.len + 1, GFP_KERNEL); + if (!ent->name) { + pde_free(ent); + return NULL; + } + } + memcpy(ent->name, fn, qstr.len + 1); ent->namelen = qstr.len; ent->mode = mode; @@ -395,12 +417,11 @@ struct proc_dir_entry *proc_symlink(const char *name, strcpy((char*)ent->data,dest); ent->proc_iops = &proc_link_inode_operations; if (proc_register(parent, ent) < 0) { - kfree(ent->data); - kfree(ent); + pde_free(ent); ent = NULL; } } else { - kfree(ent); + pde_free(ent); ent = NULL; } } @@ -423,7 +444,7 @@ struct proc_dir_entry *proc_mkdir_data(const char *name, umode_t mode, ent->proc_iops = &proc_dir_inode_operations; parent->nlink++; if (proc_register(parent, ent) < 0) { - kfree(ent); + pde_free(ent); parent->nlink--; ent = NULL; } @@ -458,7 +479,7 @@ struct proc_dir_entry *proc_create_mount_point(const char *name) ent->proc_iops = NULL; parent->nlink++; if (proc_register(parent, ent) < 0) { - kfree(ent); + pde_free(ent); parent->nlink--; ent = NULL; } @@ -495,7 +516,7 @@ struct proc_dir_entry *proc_create_data(const char *name, umode_t mode, goto out_free; return pde; out_free: - kfree(pde); + pde_free(pde); out: return NULL; } @@ -522,19 +543,12 @@ void proc_set_user(struct proc_dir_entry *de, kuid_t uid, kgid_t gid) } EXPORT_SYMBOL(proc_set_user); -static void free_proc_entry(struct proc_dir_entry *de) -{ - proc_free_inum(de->low_ino); - - if (S_ISLNK(de->mode)) - kfree(de->data); - kfree(de); -} - void pde_put(struct proc_dir_entry *pde) { - if (atomic_dec_and_test(&pde->count)) - free_proc_entry(pde); + if (atomic_dec_and_test(&pde->count)) { + proc_free_inum(pde->low_ino); + pde_free(pde); + } } /* diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 89618836887d..2cf3b74391ca 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -104,6 +104,10 @@ void __init proc_init_kmemcache(void) pde_opener_cache = kmem_cache_create("pde_opener", sizeof(struct pde_opener), 0, SLAB_ACCOUNT|SLAB_PANIC, NULL); + proc_dir_entry_cache = kmem_cache_create_usercopy( + "proc_dir_entry", sizeof(struct proc_dir_entry), 0, SLAB_PANIC, + offsetof(struct proc_dir_entry, inline_name), + 
sizeof_field(struct proc_dir_entry, inline_name), NULL); } static int proc_show_options(struct seq_file *seq, struct dentry *root) diff --git a/fs/proc/internal.h b/fs/proc/internal.h index dc00ef8538cb..0ead00771384 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -52,11 +52,20 @@ struct proc_dir_entry { struct proc_dir_entry *parent; struct rb_root_cached subdir; struct rb_node subdir_node; + char *name; umode_t mode; u8 namelen; - char name[]; +#ifdef CONFIG_64BIT +#define SIZEOF_PDE_INLINE_NAME (192-147) +#else +#define SIZEOF_PDE_INLINE_NAME (128-91) +#endif + char inline_name[SIZEOF_PDE_INLINE_NAME]; } __randomize_layout; +extern struct kmem_cache *proc_dir_entry_cache; +void pde_free(struct proc_dir_entry *pde); + union proc_op { int (*proc_get_link)(struct dentry *, struct path *); int (*proc_show)(struct seq_file *m, diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c index 68c06ae7888c..e5fe3d400737 100644 --- a/fs/proc/proc_net.c +++ b/fs/proc/proc_net.c @@ -192,7 +192,7 @@ static __net_init int proc_net_ns_init(struct net *net) int err; err = -ENOMEM; - netd = kzalloc(sizeof(*netd) + 4, GFP_KERNEL); + netd = kmem_cache_zalloc(proc_dir_entry_cache, GFP_KERNEL); if (!netd) goto out; @@ -201,6 +201,7 @@ static __net_init int proc_net_ns_init(struct net *net) netd->nlink = 2; netd->namelen = 3; netd->parent = &proc_root; + netd->name = netd->inline_name; memcpy(netd->name, "net", 4); uid = make_kuid(net->user_ns, 0); @@ -223,7 +224,7 @@ static __net_init int proc_net_ns_init(struct net *net) return 0; free_net: - kfree(netd); + pde_free(netd); out: return err; } @@ -231,7 +232,7 @@ out: static __net_exit void proc_net_ns_exit(struct net *net) { remove_proc_entry("stat", net->proc_net); - kfree(net->proc_net); + pde_free(net->proc_net); } static struct pernet_operations __net_initdata proc_net_ns_ops = { diff --git a/fs/proc/root.c b/fs/proc/root.c index 98797b762a71..cd45abfbb6cc 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -208,7 +208,8 @@ struct proc_dir_entry proc_root = { .proc_fops = &proc_root_operations, .parent = &proc_root, .subdir = RB_ROOT_CACHED, - .name = "/proc", + .name = proc_root.inline_name, + .inline_name = "/proc", }; int pid_ns_prepare_proc(struct pid_namespace *ns) -- cgit v1.2.1 From 35318db566e18ee3ada7e2d62192e5e87b1b5e4b Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 10 Apr 2018 16:41:14 -0700 Subject: proc: fix /proc/*/map_files lookup some more I totally forgot that _parse_integer() accepts arbitrary amount of leading zeroes leading to the following lookups: OK # readlink /proc/1/map_files/56427ecba000-56427eddc000 /lib/systemd/systemd bogus # readlink /proc/1/map_files/00000000000056427ecba000-56427eddc000 /lib/systemd/systemd # readlink /proc/1/map_files/56427ecba000-00000000000056427eddc000 /lib/systemd/systemd Link: http://lkml.kernel.org/r/20180303215130.GA23480@avx2 Signed-off-by: Alexey Dobriyan Reviewed-by: Cyrill Gorcunov Reviewed-by: Andrew Morton Cc: Pavel Emelyanov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/base.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/proc/base.c b/fs/proc/base.c index e9e7652b77da..d413a138dc30 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1913,6 +1913,8 @@ static int dname_to_vma_addr(struct dentry *dentry, unsigned long long sval, eval; unsigned int len; + if (str[0] == '0' && str[1] != '-') + return -EINVAL; len = _parse_integer(str, 16, &sval); if (len & KSTRTOX_OVERFLOW) return -EINVAL; @@ -1924,6 +1926,8 @@ static int 
dname_to_vma_addr(struct dentry *dentry, return -EINVAL; str++; + if (str[0] == '0' && str[1]) + return -EINVAL; len = _parse_integer(str, 16, &eval); if (len & KSTRTOX_OVERFLOW) return -EINVAL; -- cgit v1.2.1 From 1539d584e488538451526da039fa554fdeea1177 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 10 Apr 2018 16:31:57 -0700 Subject: proc: register filesystem last As soon as register_filesystem() exits, the filesystem can be mounted. It is better to present a fully operational /proc. Of course it doesn't matter because /proc is not modular, but do it anyway. Drop the error check; it should be handled by panicking. Link: http://lkml.kernel.org/r/20180309222709.GA3843@avx2 Signed-off-by: Alexey Dobriyan Reviewed-by: Andrew Morton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/root.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/proc/root.c b/fs/proc/root.c index cd45abfbb6cc..9e99204a0704 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -123,14 +123,8 @@ static struct file_system_type proc_fs_type = { void __init proc_root_init(void) { - int err; - proc_init_kmemcache(); set_proc_pid_nlink(); - err = register_filesystem(&proc_fs_type); - if (err) - return; - proc_self_init(); proc_thread_self_init(); proc_symlink("mounts", NULL, "self/mounts"); @@ -146,6 +140,8 @@ void __init proc_root_init(void) proc_tty_init(); proc_mkdir("bus", NULL); proc_sys_init(); + + register_filesystem(&proc_fs_type); } static int proc_root_getattr(const struct path *path, struct kstat *stat, -- cgit v1.2.1 From 58c501aab3e54b99eac632a2f5ab5f53e0c27948 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 10 Apr 2018 16:32:01 -0700 Subject: proc: faster /proc/cmdline Use seq_puts() and skip format string processing. Link: http://lkml.kernel.org/r/20180309222948.GB3843@avx2 Signed-off-by: Alexey Dobriyan Reviewed-by: Andrew Morton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/cmdline.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/proc/cmdline.c b/fs/proc/cmdline.c index 403cbb12a6e9..8233e7af9389 100644 --- a/fs/proc/cmdline.c +++ b/fs/proc/cmdline.c @@ -6,7 +6,8 @@ static int cmdline_proc_show(struct seq_file *m, void *v) { - seq_printf(m, "%s\n", saved_command_line); + seq_puts(m, saved_command_line); + seq_putc(m, '\n'); return 0; } -- cgit v1.2.1 From fe079a5e102cc59b6c2b66a41e39c624ce284519 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 10 Apr 2018 16:32:05 -0700 Subject: proc: do mmput ASAP for /proc/*/map_files The mm_struct is not needed while printing, as all the data was already extracted.
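[editor's note: the mmput() move below follows a "snapshot, drop, then format" rule — copy out what you need while holding the reference, release it, and only then do the slow printing; a hedged user-space model with invented names]

== drop_ref_early.c
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct mm { int users; char data[32]; };

static void mmput_model(struct mm *mm)
{
	if (--mm->users == 0)
		free(mm);
}

int main(void)
{
	struct mm *mm = calloc(1, sizeof(*mm));
	char snapshot[32];

	mm->users = 1;
	strcpy(mm->data, "56427ecba000-56427eddc000");

	strcpy(snapshot, mm->data);	/* extract everything needed ... */
	mmput_model(mm);		/* ... then drop the reference ASAP */

	printf("%s\n", snapshot);	/* slow printing no longer pins mm */
	return 0;
}
==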
Link: http://lkml.kernel.org/r/20180309223120.GC3843@avx2 Signed-off-by: Alexey Dobriyan Reviewed-by: Andrew Morton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/base.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/proc/base.c b/fs/proc/base.c index d413a138dc30..eafa39a3a88c 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -2211,6 +2211,7 @@ proc_map_files_readdir(struct file *file, struct dir_context *ctx) } } up_read(&mm->mmap_sem); + mmput(mm); for (i = 0; i < nr_files; i++) { char buf[4 * sizeof(long) + 2]; /* max: %lx-%lx\0 */ @@ -2228,7 +2229,6 @@ proc_map_files_readdir(struct file *file, struct dir_context *ctx) } if (fa) flex_array_free(fa); - mmput(mm); out_put_task: put_task_struct(task); -- cgit v1.2.1 From b77d70db659ad3aa662c80cff4475e773a531fbe Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 10 Apr 2018 16:32:11 -0700 Subject: proc: reject "." and ".." as filenames Various subsystems can create files and directories in /proc with names directly controlled by userspace. Which means "/", "." and ".." are no-no. "/" split is already taken care of, do the other 2 prohibited names. Link: http://lkml.kernel.org/r/20180310001223.GB12443@avx2 Signed-off-by: Alexey Dobriyan Acked-by: Florian Westphal Cc: Eric Dumazet Cc: Cong Wang Cc: Pavel Machek Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/generic.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'fs') diff --git a/fs/proc/generic.c b/fs/proc/generic.c index 800247a256c9..5dad2e89007b 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -366,6 +366,14 @@ static struct proc_dir_entry *__proc_create(struct proc_dir_entry **parent, WARN(1, "name len %u\n", qstr.len); return NULL; } + if (qstr.len == 1 && fn[0] == '.') { + WARN(1, "name '.'\n"); + return NULL; + } + if (qstr.len == 2 && fn[0] == '.' && fn[1] == '.') { + WARN(1, "name '..'\n"); + return NULL; + } if (*parent == &proc_root && name_to_int(&qstr) != ~0U) { WARN(1, "create '/proc/%s' by hand\n", qstr.name); return NULL; -- cgit v1.2.1 From 9cdd83e3100651af41631fb66838adcd24032f2a Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 10 Apr 2018 16:32:14 -0700 Subject: proc: switch struct proc_dir_entry::count to refcount ->count is honest reference count unlike ->in_use. 
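For reference, the conversion pattern in miniature (a sketch on an invented struct obj, not the proc code itself; refcount_t warns and saturates on overflow and underflow where atomic_t silently wraps):

    #include <linux/refcount.h>
    #include <linux/slab.h>

    struct obj {
            refcount_t refcnt;
    };

    static struct obj *obj_get(struct obj *o)
    {
            refcount_inc(&o->refcnt);       /* saturates instead of wrapping */
            return o;
    }

    static void obj_put(struct obj *o)
    {
            if (refcount_dec_and_test(&o->refcnt))
                    kfree(o);
    }

A fresh object starts with refcount_set(&o->refcnt, 1); a statically initialized one uses REFCOUNT_INIT(1), exactly as the diff below does for proc_root.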
Link: http://lkml.kernel.org/r/20180313174550.GA4332@avx2 Signed-off-by: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/generic.c | 4 ++-- fs/proc/internal.h | 5 +++-- fs/proc/root.c | 2 +- 3 files changed, 6 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/proc/generic.c b/fs/proc/generic.c index 5dad2e89007b..fc0333fd5676 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -402,7 +402,7 @@ static struct proc_dir_entry *__proc_create(struct proc_dir_entry **parent, ent->mode = mode; ent->nlink = nlink; ent->subdir = RB_ROOT_CACHED; - atomic_set(&ent->count, 1); + refcount_set(&ent->refcnt, 1); spin_lock_init(&ent->pde_unload_lock); INIT_LIST_HEAD(&ent->pde_openers); proc_set_user(ent, (*parent)->uid, (*parent)->gid); @@ -553,7 +553,7 @@ EXPORT_SYMBOL(proc_set_user); void pde_put(struct proc_dir_entry *pde) { - if (atomic_dec_and_test(&pde->count)) { + if (refcount_dec_and_test(&pde->refcnt)) { proc_free_inum(pde->low_ino); pde_free(pde); } diff --git a/fs/proc/internal.h b/fs/proc/internal.h index 0ead00771384..b7024f174778 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -11,6 +11,7 @@ #include #include +#include <linux/refcount.h> #include #include #include @@ -36,7 +37,7 @@ struct proc_dir_entry { * negative -> it's going away RSN */ atomic_t in_use; - atomic_t count; /* use count */ + refcount_t refcnt; struct list_head pde_openers; /* who did ->open, but not ->release */ /* protects ->pde_openers and all struct pde_opener instances */ spinlock_t pde_unload_lock; @@ -168,7 +169,7 @@ int proc_readdir_de(struct file *, struct dir_context *, struct proc_dir_entry * static inline struct proc_dir_entry *pde_get(struct proc_dir_entry *pde) { - atomic_inc(&pde->count); + refcount_inc(&pde->refcnt); return pde; } extern void pde_put(struct proc_dir_entry *); diff --git a/fs/proc/root.c b/fs/proc/root.c index 9e99204a0704..76c996457ff9 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -199,7 +199,7 @@ struct proc_dir_entry proc_root = { .namelen = 5, .mode = S_IFDIR | S_IRUGO | S_IXUGO, .nlink = 2, - .count = ATOMIC_INIT(1), + .refcnt = REFCOUNT_INIT(1), .proc_iops = &proc_root_inode_operations, .proc_fops = &proc_root_operations, .parent = &proc_root, -- cgit v1.2.1 From 4f1134370a29a5f2d0f4b4be4c5e2fddd38f0f9d Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 10 Apr 2018 16:32:20 -0700 Subject: proc: use slower rb_first() In a typical /proc "open+read+close" use case, a dentry is looked up successfully on open only to be killed in dput() on close. In fact dentries which aren't /proc/*/... and /proc/sys/* were almost NEVER CACHED. Simple printk in proc_lookup_de() shows that. Now that ->delete hook intelligently picks which dentries should live in dcache and which should not, rbtree caching is not necessary as dcache does its job, at last! As a side effect, struct proc_dir_entry shrinks by one pointer which can go into inline name. 
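What the non-cached iteration looks like, sketched against the pde_subdir_*() helpers in the diff below (walk_subdirs() is an invented name):

    #include <linux/rbtree.h>

    static void walk_subdirs(struct proc_dir_entry *dir)
    {
            struct rb_node *node;

            /*
             * rb_first() descends to the leftmost node in O(log n) on
             * every call; rb_first_cached() avoided that by keeping a
             * leftmost pointer in struct rb_root_cached, at the price
             * of one extra pointer per root.
             */
            for (node = rb_first(&dir->subdir); node; node = rb_next(node)) {
                    struct proc_dir_entry *de = rb_entry(node,
                                    struct proc_dir_entry, subdir_node);
                    /* visit de */
            }
    }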
Link: http://lkml.kernel.org/r/20180314231032.GA15854@avx2 Signed-off-by: Alexey Dobriyan Acked-by: Davidlohr Bueso Cc: Peter Zijlstra Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/generic.c | 26 ++++++++++++-------------- fs/proc/internal.h | 6 +++--- fs/proc/proc_net.c | 2 +- fs/proc/root.c | 2 +- 4 files changed, 17 insertions(+), 19 deletions(-) (limited to 'fs') diff --git a/fs/proc/generic.c b/fs/proc/generic.c index fc0333fd5676..04c4804cbdef 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -52,8 +52,8 @@ static int proc_match(const char *name, struct proc_dir_entry *de, unsigned int static struct proc_dir_entry *pde_subdir_first(struct proc_dir_entry *dir) { - return rb_entry_safe(rb_first_cached(&dir->subdir), - struct proc_dir_entry, subdir_node); + return rb_entry_safe(rb_first(&dir->subdir), struct proc_dir_entry, + subdir_node); } static struct proc_dir_entry *pde_subdir_next(struct proc_dir_entry *dir) @@ -66,7 +66,7 @@ static struct proc_dir_entry *pde_subdir_find(struct proc_dir_entry *dir, const char *name, unsigned int len) { - struct rb_node *node = dir->subdir.rb_root.rb_node; + struct rb_node *node = dir->subdir.rb_node; while (node) { struct proc_dir_entry *de = rb_entry(node, @@ -87,9 +87,8 @@ static struct proc_dir_entry *pde_subdir_find(struct proc_dir_entry *dir, static bool pde_subdir_insert(struct proc_dir_entry *dir, struct proc_dir_entry *de) { - struct rb_root_cached *root = &dir->subdir; - struct rb_node **new = &root->rb_root.rb_node, *parent = NULL; - bool leftmost = true; + struct rb_root *root = &dir->subdir; + struct rb_node **new = &root->rb_node, *parent = NULL; /* Figure out where to put new node */ while (*new) { @@ -101,16 +100,15 @@ static bool pde_subdir_insert(struct proc_dir_entry *dir, parent = *new; if (result < 0) new = &(*new)->rb_left; - else if (result > 0) { + else if (result > 0) new = &(*new)->rb_right; - leftmost = false; - } else + else return false; } /* Add new node and rebalance tree. 
*/ rb_link_node(&de->subdir_node, parent, new); - rb_insert_color_cached(&de->subdir_node, root, leftmost); + rb_insert_color(&de->subdir_node, root); return true; } @@ -401,7 +399,7 @@ static struct proc_dir_entry *__proc_create(struct proc_dir_entry **parent, ent->namelen = qstr.len; ent->mode = mode; ent->nlink = nlink; - ent->subdir = RB_ROOT_CACHED; + ent->subdir = RB_ROOT; refcount_set(&ent->refcnt, 1); spin_lock_init(&ent->pde_unload_lock); INIT_LIST_HEAD(&ent->pde_openers); @@ -577,7 +575,7 @@ void remove_proc_entry(const char *name, struct proc_dir_entry *parent) de = pde_subdir_find(parent, fn, len); if (de) - rb_erase_cached(&de->subdir_node, &parent->subdir); + rb_erase(&de->subdir_node, &parent->subdir); write_unlock(&proc_subdir_lock); if (!de) { WARN(1, "name '%s'\n", name); @@ -614,13 +612,13 @@ int remove_proc_subtree(const char *name, struct proc_dir_entry *parent) write_unlock(&proc_subdir_lock); return -ENOENT; } - rb_erase_cached(&root->subdir_node, &parent->subdir); + rb_erase(&root->subdir_node, &parent->subdir); de = root; while (1) { next = pde_subdir_first(de); if (next) { - rb_erase_cached(&next->subdir_node, &de->subdir); + rb_erase(&next->subdir_node, &de->subdir); de = next; continue; } diff --git a/fs/proc/internal.h b/fs/proc/internal.h index b7024f174778..0f1692e63cb6 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -51,15 +51,15 @@ struct proc_dir_entry { kgid_t gid; loff_t size; struct proc_dir_entry *parent; - struct rb_root_cached subdir; + struct rb_root subdir; struct rb_node subdir_node; char *name; umode_t mode; u8 namelen; #ifdef CONFIG_64BIT -#define SIZEOF_PDE_INLINE_NAME (192-147) +#define SIZEOF_PDE_INLINE_NAME (192-139) #else -#define SIZEOF_PDE_INLINE_NAME (128-91) +#define SIZEOF_PDE_INLINE_NAME (128-87) #endif char inline_name[SIZEOF_PDE_INLINE_NAME]; } __randomize_layout; diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c index e5fe3d400737..1763f370489d 100644 --- a/fs/proc/proc_net.c +++ b/fs/proc/proc_net.c @@ -196,7 +196,7 @@ static __net_init int proc_net_ns_init(struct net *net) if (!netd) goto out; - netd->subdir = RB_ROOT_CACHED; + netd->subdir = RB_ROOT; netd->data = net; netd->nlink = 2; netd->namelen = 3; diff --git a/fs/proc/root.c b/fs/proc/root.c index 76c996457ff9..61b7340b357a 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -203,7 +203,7 @@ struct proc_dir_entry proc_root = { .proc_iops = &proc_root_inode_operations, .proc_fops = &proc_root_operations, .parent = &proc_root, - .subdir = RB_ROOT_CACHED, + .subdir = RB_ROOT, .name = proc_root.inline_name, .inline_name = "/proc", }; -- cgit v1.2.1 From ad12c3a6ef1c78d0d0dbbe48dfcd416583f515ad Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Tue, 10 Apr 2018 16:34:37 -0700 Subject: autofs4: use wait_event_killable This playing with signals to allow only fatal signals appears to predate the introduction of wait_event_killable(), and I'm fairly sure that wait_event_killable is what was meant to happen here. 
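The two idioms in miniature (a sketch with an invented wait queue and condition; wait_event_killable() sleeps in TASK_KILLABLE, so only fatal signals end the wait early, roughly what the hand-rolled SHUTDOWN_SIGS masking removed below emulated):

    #include <linux/wait.h>

    static DECLARE_WAIT_QUEUE_HEAD(wq);
    static bool done;

    static int waiter(void)
    {
            /* returns 0, or -ERESTARTSYS if a fatal signal arrived */
            return wait_event_killable(wq, done);
    }

    static void waker(void)
    {
            done = true;
            wake_up(&wq);   /* plain wake_up(): the sleep is not interruptible */
    }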
[avagin@openvz.org: use wake_up() instead of wake_up_interruptible] Link: http://lkml.kernel.org/r/20180331022839.21277-1-avagin@openvz.org Link: http://lkml.kernel.org/r/20180319191609.23880-1-willy@infradead.org Signed-off-by: Matthew Wilcox Acked-by: Ian Kent Cc: Matthew Wilcox Cc: Stephen Rothwell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/autofs4/waitq.c | 29 ++--------------------------- 1 file changed, 2 insertions(+), 27 deletions(-) (limited to 'fs') diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c index a0c57c37fa21..be9c3dc048ab 100644 --- a/fs/autofs4/waitq.c +++ b/fs/autofs4/waitq.c @@ -19,9 +19,6 @@ */ static autofs_wqt_t autofs4_next_wait_queue = 1; -/* These are the signals we allow interrupting a pending mount */ -#define SHUTDOWN_SIGS (sigmask(SIGKILL) | sigmask(SIGINT) | sigmask(SIGQUIT)) - void autofs4_catatonic_mode(struct autofs_sb_info *sbi) { struct autofs_wait_queue *wq, *nwq; @@ -486,29 +483,7 @@ int autofs4_wait(struct autofs_sb_info *sbi, * wq->name.name is NULL iff the lock is already released * or the mount has been made catatonic. */ - if (wq->name.name) { - /* Block all but "shutdown" signals while waiting */ - unsigned long shutdown_sigs_mask; - unsigned long irqflags; - sigset_t oldset; - - spin_lock_irqsave(&current->sighand->siglock, irqflags); - oldset = current->blocked; - shutdown_sigs_mask = SHUTDOWN_SIGS & ~oldset.sig[0]; - siginitsetinv(&current->blocked, shutdown_sigs_mask); - recalc_sigpending(); - spin_unlock_irqrestore(&current->sighand->siglock, irqflags); - - wait_event_interruptible(wq->queue, wq->name.name == NULL); - - spin_lock_irqsave(&current->sighand->siglock, irqflags); - current->blocked = oldset; - recalc_sigpending(); - spin_unlock_irqrestore(&current->sighand->siglock, irqflags); - } else { - pr_debug("skipped sleeping\n"); - } - + wait_event_killable(wq->queue, wq->name.name == NULL); status = wq->status; /* @@ -574,7 +549,7 @@ int autofs4_wait_release(struct autofs_sb_info *sbi, autofs_wqt_t wait_queue_tok kfree(wq->name.name); wq->name.name = NULL; /* Do not wait on this queue */ wq->status = status; - wake_up_interruptible(&wq->queue); + wake_up(&wq->queue); if (!--wq->wait_ctr) kfree(wq); mutex_unlock(&sbi->wq_mutex); -- cgit v1.2.1 From 9ad553abe66f8be3f4755e9fa0a6ba137ce76341 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 10 Apr 2018 16:34:41 -0700 Subject: fs/reiserfs/journal.c: add missing reiserfs_warning() arg One use of the reiserfs_warning() macro in journal_init_dev() is missing a parameter, causing the following warning: REISERFS warning (device loop0): journal_init_dev: Cannot open '%s': %i journal_init_dev: This also causes a WARN_ONCE() warning in the vsprintf code, and then a panic if panic_on_warn is set. Please remove unsupported %/ in format string WARNING: CPU: 1 PID: 4480 at lib/vsprintf.c:2138 format_decode+0x77f/0x830 lib/vsprintf.c:2138 Kernel panic - not syncing: panic_on_warn set ... Just add another string argument to the macro invocation. 
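A userspace analogy of the failure (this is not the reiserfs macro itself, just an illustration of the argument shift): when a variadic helper expects an ID string ahead of the format, omitting the ID puts the format string into the ID slot and promotes the first real argument to the format.

    #include <stdio.h>
    #include <stdarg.h>

    static void warn(const char *id, const char *fmt, ...)
    {
            va_list ap;

            printf("warning (%s): ", id);
            va_start(ap, fmt);
            vprintf(fmt, ap);
            va_end(ap);
            putchar('\n');
    }

    int main(void)
    {
            /* intended: warn("sh-457", "Cannot open '%s': %i", name, err); */
            warn("Cannot open '%s': %i", "/dev/loop0");
            /* prints: warning (Cannot open '%s': %i): /dev/loop0 */
            return 0;
    }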
Addresses https://syzkaller.appspot.com/bug?id=0627d4551fdc39bf1ef5d82cd9eef587047f7718 Link: http://lkml.kernel.org/r/d678ebe1-6f54-8090-df4c-b9affad62293@infradead.org Signed-off-by: Randy Dunlap Reported-by: Tested-by: Randy Dunlap Acked-by: Jeff Mahoney Cc: Alexander Viro Cc: Jan Kara Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/reiserfs/journal.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index 70057359fbaf..23148c3ed675 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -2643,7 +2643,7 @@ static int journal_init_dev(struct super_block *super, if (IS_ERR(journal->j_dev_bd)) { result = PTR_ERR(journal->j_dev_bd); journal->j_dev_bd = NULL; - reiserfs_warning(super, + reiserfs_warning(super, "sh-457", "journal_init_dev: Cannot open '%s': %i", jdev_name, result); return result; -- cgit v1.2.1 From 0965232035cfa59a64d197cf8a8ee0bc407bb3e4 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 10 Apr 2018 16:34:45 -0700 Subject: seq_file: allocate seq_file from kmem_cache For fine-grained debugging and usercopy protection. Link: http://lkml.kernel.org/r/20180310085027.GA17121@avx2 Signed-off-by: Alexey Dobriyan Reviewed-by: Andrew Morton Cc: Al Viro Cc: Glauber Costa Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/seq_file.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/seq_file.c b/fs/seq_file.c index 0677e89f3c6f..3cb340583074 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c @@ -6,6 +6,7 @@ * initial implementation -- AV, Oct 2001. */ +#include #include #include #include @@ -19,6 +20,8 @@ #include #include +static struct kmem_cache *seq_file_cache __ro_after_init; + static void seq_set_overflow(struct seq_file *m) { m->count = m->size; @@ -51,7 +54,7 @@ int seq_open(struct file *file, const struct seq_operations *op) WARN_ON(file->private_data); - p = kzalloc(sizeof(*p), GFP_KERNEL); + p = kmem_cache_zalloc(seq_file_cache, GFP_KERNEL); if (!p) return -ENOMEM; @@ -366,7 +369,7 @@ int seq_release(struct inode *inode, struct file *file) { struct seq_file *m = file->private_data; kvfree(m->buf); - kfree(m); + kmem_cache_free(seq_file_cache, m); return 0; } EXPORT_SYMBOL(seq_release); @@ -1106,3 +1109,8 @@ seq_hlist_next_percpu(void *v, struct hlist_head __percpu *head, return NULL; } EXPORT_SYMBOL(seq_hlist_next_percpu); + +void __init seq_file_init(void) +{ + seq_file_cache = KMEM_CACHE(seq_file, SLAB_PANIC); +} -- cgit v1.2.1 From d64d01a155f84850f7dc9795f464e3df9a5ddb10 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 10 Apr 2018 16:34:49 -0700 Subject: seq_file: account everything to kmemcg All it takes to open a file and read 1 byte from it. seq_file will be allocated along with any private allocations, and more importantly seq file buffer which is 1 page by default. 
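The two ways an allocation gets charged, sketched (struct session and the function names are invented; the real patch applies this to seq_file's cache and buffers):

    #include <linux/cache.h>
    #include <linux/mm.h>
    #include <linux/slab.h>

    struct session {
            int id;
    };

    static struct kmem_cache *session_cache __ro_after_init;

    void __init session_init(void)
    {
            /* per-cache: every object is charged to the allocating cgroup */
            session_cache = KMEM_CACHE(session, SLAB_ACCOUNT | SLAB_PANIC);
    }

    void *session_buf_alloc(size_t size)
    {
            /* per-call: a one-off buffer is charged via the gfp mask */
            return kvmalloc(size, GFP_KERNEL_ACCOUNT);
    }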
Link: http://lkml.kernel.org/r/20180310085252.GB17121@avx2 Signed-off-by: Alexey Dobriyan Reviewed-by: Andrew Morton Acked-by: Michal Hocko Cc: Al Viro Cc: Glauber Costa Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/seq_file.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/seq_file.c b/fs/seq_file.c index 3cb340583074..c6c27f1f9c98 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c @@ -29,7 +29,7 @@ static void seq_set_overflow(struct seq_file *m) static void *seq_buf_alloc(unsigned long size) { - return kvmalloc(size, GFP_KERNEL); + return kvmalloc(size, GFP_KERNEL_ACCOUNT); } /** @@ -566,7 +566,7 @@ static void single_stop(struct seq_file *p, void *v) int single_open(struct file *file, int (*show)(struct seq_file *, void *), void *data) { - struct seq_operations *op = kmalloc(sizeof(*op), GFP_KERNEL); + struct seq_operations *op = kmalloc(sizeof(*op), GFP_KERNEL_ACCOUNT); int res = -ENOMEM; if (op) { @@ -628,7 +628,7 @@ void *__seq_open_private(struct file *f, const struct seq_operations *ops, void *private; struct seq_file *seq; - private = kzalloc(psize, GFP_KERNEL); + private = kzalloc(psize, GFP_KERNEL_ACCOUNT); if (private == NULL) goto out; @@ -1112,5 +1112,5 @@ EXPORT_SYMBOL(seq_hlist_next_percpu); void __init seq_file_init(void) { - seq_file_cache = KMEM_CACHE(seq_file, SLAB_PANIC); + seq_file_cache = KMEM_CACHE(seq_file, SLAB_ACCOUNT|SLAB_PANIC); } -- cgit v1.2.1 From 8f2af155b513583e8b149a384551f13e1ac5dc72 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 10 Apr 2018 16:34:53 -0700 Subject: exec: pass stack rlimit into mm layout functions Patch series "exec: Pin stack limit during exec". Attempts to solve problems with the stack limit changing during exec continue to be frustrated[1][2]. In addition to the specific issues around the Stack Clash family of flaws, Andy Lutomirski pointed out[3] other places during exec where the stack limit is used and is assumed to be unchanging. Given the many places it gets used and the fact that it can be manipulated/raced via setrlimit() and prlimit(), I think the only way to handle this is to move away from the "current" view of the stack limit and instead attach it to the bprm, and plumb this down into the functions that need to know the stack limits. This series implements the approach. [1] 04e35f4495dd ("exec: avoid RLIMIT_STACK races with prlimit()") [2] 779f4e1c6c7c ("Revert "exec: avoid RLIMIT_STACK races with prlimit()"") [3] to security@kernel.org, "Subject: existing rlimit races?" This patch (of 3): Since it is possible that the stack rlimit can change externally during exec (either via another thread calling setrlimit() or another process calling prlimit()), provide a way to pass the rlimit down into the per-architecture mm layout functions so that the rlimit can stay in the bprm structure instead of sitting in the signal structure until exec is finalized. Link: http://lkml.kernel.org/r/1518638796-20819-2-git-send-email-keescook@chromium.org Signed-off-by: Kees Cook Cc: Michal Hocko Cc: Ben Hutchings Cc: Willy Tarreau Cc: Hugh Dickins Cc: Oleg Nesterov Cc: "Jason A. 
Donenfeld" Cc: Rik van Riel Cc: Laura Abbott Cc: Greg KH Cc: Andy Lutomirski Cc: Ben Hutchings Cc: Brad Spengler Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/exec.c b/fs/exec.c index a919a827d181..f4469ab88c7a 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1323,6 +1323,8 @@ EXPORT_SYMBOL(would_dump); void setup_new_exec(struct linux_binprm * bprm) { + struct rlimit rlim_stack; + /* * Once here, prepare_binrpm() will not be called any more, so * the final state of setuid/setgid/fscaps can be merged into the @@ -1345,7 +1347,11 @@ void setup_new_exec(struct linux_binprm * bprm) current->signal->rlim[RLIMIT_STACK].rlim_cur = _STK_LIM; } - arch_pick_mmap_layout(current->mm); + task_lock(current->group_leader); + rlim_stack = current->signal->rlim[RLIMIT_STACK]; + task_unlock(current->group_leader); + + arch_pick_mmap_layout(current->mm, &rlim_stack); current->sas_ss_sp = current->sas_ss_size = 0; -- cgit v1.2.1 From b83838313386f617d6bd8201be7f5b532059bba1 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 10 Apr 2018 16:34:57 -0700 Subject: exec: introduce finalize_exec() before start_thread() Provide a final callback into fs/exec.c before start_thread() takes over, to handle any last-minute changes, like the coming restoration of the stack limit. Link: http://lkml.kernel.org/r/1518638796-20819-3-git-send-email-keescook@chromium.org Signed-off-by: Kees Cook Cc: Andy Lutomirski Cc: Ben Hutchings Cc: Ben Hutchings Cc: Brad Spengler Cc: Greg KH Cc: Hugh Dickins Cc: "Jason A. Donenfeld" Cc: Laura Abbott Cc: Michal Hocko Cc: Oleg Nesterov Cc: Rik van Riel Cc: Willy Tarreau Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/binfmt_aout.c | 1 + fs/binfmt_elf.c | 1 + fs/binfmt_elf_fdpic.c | 1 + fs/binfmt_flat.c | 1 + fs/exec.c | 6 ++++++ 5 files changed, 10 insertions(+) (limited to 'fs') diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c index ce1824f47ba6..c3deb2e35f20 100644 --- a/fs/binfmt_aout.c +++ b/fs/binfmt_aout.c @@ -330,6 +330,7 @@ beyond_if: #ifdef __alpha__ regs->gp = ex.a_gpvalue; #endif + finalize_exec(bprm); start_thread(regs, ex.a_entry, current->mm->start_stack); return 0; } diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index bdb201230bae..3edca6cb9a33 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -1155,6 +1155,7 @@ static int load_elf_binary(struct linux_binprm *bprm) ELF_PLAT_INIT(regs, reloc_func_desc); #endif + finalize_exec(bprm); start_thread(regs, elf_entry, bprm->p); retval = 0; out: diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index 429326b6e2e7..d90993adeffa 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -463,6 +463,7 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm) dynaddr); #endif + finalize_exec(bprm); /* everything is now ready... 
get the userspace context ready to roll */ entryaddr = interp_params.entry_addr ?: exec_params.entry_addr; start_thread(regs, entryaddr, current->mm->start_stack); diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c index 5d6b94475f27..82a48e830018 100644 --- a/fs/binfmt_flat.c +++ b/fs/binfmt_flat.c @@ -994,6 +994,7 @@ static int load_flat_binary(struct linux_binprm *bprm) FLAT_PLAT_INIT(regs); #endif + finalize_exec(bprm); pr_debug("start_thread(regs=0x%p, entry=0x%lx, start_stack=0x%lx)\n", regs, start_addr, current->mm->start_stack); start_thread(regs, start_addr, current->mm->start_stack); diff --git a/fs/exec.c b/fs/exec.c index f4469ab88c7a..422ad79a7a03 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1384,6 +1384,12 @@ void setup_new_exec(struct linux_binprm * bprm) } EXPORT_SYMBOL(setup_new_exec); +/* Runs immediately before start_thread() takes over. */ +void finalize_exec(struct linux_binprm *bprm) +{ +} +EXPORT_SYMBOL(finalize_exec); + /* * Prepare credentials and lock ->cred_guard_mutex. * install_exec_creds() commits the new creds and drops the lock. -- cgit v1.2.1 From c31dbb146dd44af44bc60780ce8fa7a9f5f746df Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 10 Apr 2018 16:35:01 -0700 Subject: exec: pin stack limit during exec Since the stack rlimit is used in multiple places during exec and it can be changed via other threads (via setrlimit()) or processes (via prlimit()), the assumption that the value doesn't change cannot be made. This leads to races with mm layout selection and argument size calculations. This changes the exec path to use the rlimit stored in bprm instead of in current. Before starting the thread, the bprm stack rlimit is stored back to current. Link: http://lkml.kernel.org/r/1518638796-20819-4-git-send-email-keescook@chromium.org Fixes: 64701dee4178e ("exec: Use sane stack rlimit under secureexec") Signed-off-by: Kees Cook Reported-by: Ben Hutchings Reported-by: Andy Lutomirski Reported-by: Brad Spengler Acked-by: Michal Hocko Cc: Ben Hutchings Cc: Greg KH Cc: Hugh Dickins Cc: "Jason A. Donenfeld" Cc: Laura Abbott Cc: Oleg Nesterov Cc: Rik van Riel Cc: Willy Tarreau Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 27 +++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/exec.c b/fs/exec.c index 422ad79a7a03..183059c427b9 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -257,7 +257,7 @@ static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos, * to work from. */ limit = _STK_LIM / 4 * 3; - limit = min(limit, rlimit(RLIMIT_STACK) / 4); + limit = min(limit, bprm->rlim_stack.rlim_cur / 4); if (size > limit) goto fail; } @@ -411,6 +411,11 @@ static int bprm_mm_init(struct linux_binprm *bprm) if (!mm) goto err; + /* Save current stack limit for all calculations made during exec. */ + task_lock(current->group_leader); + bprm->rlim_stack = current->signal->rlim[RLIMIT_STACK]; + task_unlock(current->group_leader); + err = __bprm_mm_init(bprm); if (err) goto err; @@ -697,7 +702,7 @@ int setup_arg_pages(struct linux_binprm *bprm, #ifdef CONFIG_STACK_GROWSUP /* Limit stack size */ - stack_base = rlimit_max(RLIMIT_STACK); + stack_base = bprm->rlim_stack.rlim_max; if (stack_base > STACK_SIZE_MAX) stack_base = STACK_SIZE_MAX; @@ -770,7 +775,7 @@ int setup_arg_pages(struct linux_binprm *bprm, * Align this down to a page boundary as expand_stack * will align it up. 
*/ - rlim_stack = rlimit(RLIMIT_STACK) & PAGE_MASK; #ifdef CONFIG_STACK_GROWSUP if (stack_size + stack_expand > rlim_stack) stack_base = vma->vm_start + rlim_stack; @@ -1323,8 +1328,6 @@ EXPORT_SYMBOL(would_dump); void setup_new_exec(struct linux_binprm * bprm) { - struct rlimit rlim_stack; - /* * Once here, prepare_binrpm() will not be called any more, so * the final state of setuid/setgid/fscaps can be merged into the @@ -1343,15 +1346,11 @@ void setup_new_exec(struct linux_binprm * bprm) * RLIMIT_STACK, but after the point of no return to avoid * needing to clean up the change on failure. */ - if (current->signal->rlim[RLIMIT_STACK].rlim_cur > _STK_LIM) - current->signal->rlim[RLIMIT_STACK].rlim_cur = _STK_LIM; + if (bprm->rlim_stack.rlim_cur > _STK_LIM) + bprm->rlim_stack.rlim_cur = _STK_LIM; } - task_lock(current->group_leader); - rlim_stack = current->signal->rlim[RLIMIT_STACK]; - task_unlock(current->group_leader); - - arch_pick_mmap_layout(current->mm, &rlim_stack); + arch_pick_mmap_layout(current->mm, &bprm->rlim_stack); current->sas_ss_sp = current->sas_ss_size = 0; @@ -1387,6 +1386,10 @@ EXPORT_SYMBOL(setup_new_exec); /* Runs immediately before start_thread() takes over. */ void finalize_exec(struct linux_binprm *bprm) { + /* Store any stack rlimit changes before starting thread. */ + task_lock(current->group_leader); + current->signal->rlim[RLIMIT_STACK] = bprm->rlim_stack; + task_unlock(current->group_leader); } EXPORT_SYMBOL(finalize_exec); -- cgit v1.2.1 From 64a11f3dc20b45fdc8c058296b4f6449e4b9f24c Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Tue, 10 Apr 2018 16:35:35 -0700 Subject: fs/proc/proc_sysctl.c: fix typo in sysctl_check_table_array() Patch series "ipc: Clamp *mni to the real IPCMNI limit", v3. The sysctl parameters msgmni, shmmni and semmni have an inherent limit of IPC_MNI (32k). However, users may not be aware of that because they can write a value much higher than that without getting any error or notification. Reading the parameters back will show the newly written values which are not real. Enforcing the limit by failing sysctl parameter write, however, can break existing user applications. To address this dilemma, a new flags field is introduced into the ctl_table. The value CTL_FLAGS_CLAMP_RANGE can be added to any ctl_table entries to enable a looser range clamping without returning any error. For example, .flags = CTL_FLAGS_CLAMP_RANGE, This flags value is now used for the range checking of shmmni, msgmni and semmni without breaking existing applications. If any out of range value is written to those sysctl parameters, the following warning will be printed instead. Kernel parameter "shmmni" was set out of range [0, 32768], clamped to 32768. Reading the values back will show 32768 instead of some fake values. This patch (of 6): Fix a typo. Link: http://lkml.kernel.org/r/1519926220-7453-2-git-send-email-longman@redhat.com Signed-off-by: Waiman Long Reviewed-by: Andrew Morton Acked-by: Luis R. Rodriguez Cc: Davidlohr Bueso Cc: Manfred Spraul Cc: Kees Cook Cc: Al Viro Cc: Matthew Wilcox Cc: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/proc_sysctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 4654fc3c246f..8989936f2995 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -1086,7 +1086,7 @@ static int sysctl_check_table_array(const char *path, struct ctl_table *table) if ((table->proc_handler == proc_douintvec) || (table->proc_handler == proc_douintvec_minmax)) { if (table->maxlen != sizeof(unsigned int)) - err |= sysctl_err(path, table, "array now allowed"); + err |= sysctl_err(path, table, "array not allowed"); } return err; -- cgit v1.2.1 From 32785c0539b7e96f77a14a4f4ab225712665a5a4 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Tue, 10 Apr 2018 16:35:49 -0700 Subject: fs/dcache.c: add cond_resched() in shrink_dentry_list() As previously reported (https://patchwork.kernel.org/patch/8642031/) it's possible to call shrink_dentry_list with a large number of dentries (> 10000). This, in turn, could trigger the softlockup detector and possibly trigger a panic. In addition to the unmount path being vulnerable to this scenario, at SuSE we've observed a similar situation happening during process exit on processes that touch a lot of dentries. Here is an excerpt from a crash dump. The numbers after the colon are the number of dentries on the list passed to shrink_dentry_list: PID 99760: 10722 PID 107530: 215 PID 108809: 24134 PID 108877: 21331 PID 141708: 16487 So we want to kill between 15k-25k dentries without yielding. And one possible call stack looks like: 4 [ffff8839ece41db0] _raw_spin_lock at ffffffff8152a5f8 5 [ffff8839ece41db0] evict at ffffffff811c3026 6 [ffff8839ece41dd0] __dentry_kill at ffffffff811bf258 7 [ffff8839ece41df0] shrink_dentry_list at ffffffff811bf593 8 [ffff8839ece41e18] shrink_dcache_parent at ffffffff811bf830 9 [ffff8839ece41e50] proc_flush_task at ffffffff8120dd61 10 [ffff8839ece41ec0] release_task at ffffffff81059ebd 11 [ffff8839ece41f08] do_exit at ffffffff8105b8ce 12 [ffff8839ece41f78] sys_exit at ffffffff8105bd53 13 [ffff8839ece41f80] system_call_fastpath at ffffffff81532909 While some of the callers of shrink_dentry_list do use cond_resched, this is not sufficient to prevent softlockups. So just move cond_resched into shrink_dentry_list from its callers. David said: I've found hundreds of occurrences of warnings that we emit when need_resched stays set for a prolonged period of time with the stack trace that is included in the change log. 
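The resulting loop shape, as a sketch (struct item and reap_list() stand in for the dentry bookkeeping in the diff below):

    #include <linux/sched.h>
    #include <linux/list.h>

    struct item {
            struct list_head lru;
    };

    static void reap_list(struct list_head *list)
    {
            while (!list_empty(list)) {
                    struct item *it;

                    cond_resched();         /* nothing may be locked here */

                    it = list_entry(list->prev, struct item, lru);
                    /* lock, detach and free *it as before ... */
            }
    }

Yielding once per iteration bounds how long need_resched can stay set, no matter how many thousand entries the caller hands in.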
Link: http://lkml.kernel.org/r/1521718946-31521-1-git-send-email-nborisov@suse.com Signed-off-by: Nikolay Borisov Reviewed-by: Andrew Morton Acked-by: David Rientjes Cc: Alexander Viro Cc: Goldwyn Rodrigues Cc: Jeff Mahoney Cc: Davidlohr Bueso Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/dcache.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/dcache.c b/fs/dcache.c index 915816e90049..86d2de63461e 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -1052,6 +1052,8 @@ static void shrink_dentry_list(struct list_head *list) while (!list_empty(list)) { struct dentry *dentry, *parent; + cond_resched(); + dentry = list_entry(list->prev, struct dentry, d_lru); spin_lock(&dentry->d_lock); rcu_read_lock(); @@ -1205,7 +1207,6 @@ void shrink_dcache_sb(struct super_block *sb) this_cpu_sub(nr_dentry_unused, freed); shrink_dentry_list(&dispose); - cond_resched(); } while (list_lru_count(&sb->s_dentry_lru) > 0); } EXPORT_SYMBOL(shrink_dcache_sb); @@ -1487,7 +1488,6 @@ void shrink_dcache_parent(struct dentry *parent) break; shrink_dentry_list(&data.dispose); - cond_resched(); } } EXPORT_SYMBOL(shrink_dcache_parent); @@ -1614,7 +1614,6 @@ void d_invalidate(struct dentry *dentry) detach_mounts(data.mountpoint); dput(data.mountpoint); } - cond_resched(); } } EXPORT_SYMBOL(d_invalidate); -- cgit v1.2.1 From 4ed28639519c7bad5f518e70b3284c6e0763e650 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Tue, 10 Apr 2018 16:36:01 -0700 Subject: fs, elf: drop MAP_FIXED usage from elf_map Both load_elf_interp and load_elf_binary rely on elf_map to map segments on a controlled address and they use MAP_FIXED to enforce that. This is however a dangerous thing prone to silent data corruption which can be even exploitable. Let's take CVE-2017-1000253 as an example. At the time (before commit eab09532d400: "binfmt_elf: use ELF_ET_DYN_BASE only for PIE") ELF_ET_DYN_BASE was at TASK_SIZE / 3 * 2 which is not that far away from the stack top on 32b (legacy) memory layout (only 1GB away). Therefore we could end up mapping over the existing stack with some luck. The issue has been fixed since then (a87938b2e246: "fs/binfmt_elf.c: fix bug in loading of PIE binaries"), ELF_ET_DYN_BASE moved much further from the stack (eab09532d400 and later by c715b72c1ba4: "mm: revert x86_64 and arm64 ELF_ET_DYN_BASE base changes") and excessive stack consumption early during execve fully stopped by da029c11e6b1 ("exec: Limit arg stack to at most 75% of _STK_LIM"). So we should be safe and any attack should be impractical. On the other hand this is just too subtle an assumption, so it can break quite easily and is hard to spot. I believe that the MAP_FIXED usage in load_elf_binary (et. al) is still fundamentally dangerous. Moreover it shouldn't even be needed. We are at the early process stage and so there shouldn't be unrelated mappings (except for stack and loader) existing so mmap for a given address should succeed even without MAP_FIXED. Something is terribly wrong if this is not the case and we should rather fail than silently corrupt the underlying mapping. Address this issue by changing MAP_FIXED to the newly added MAP_FIXED_NOREPLACE. This will mean that mmap will fail if there is an existing mapping clashing with the requested one without clobbering it. 
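The new flag's contract as seen from userspace (a sketch; the fallback #define carries the value this series assigns on most architectures and is an assumption here):

    #include <sys/mman.h>
    #include <stdio.h>
    #include <errno.h>

    #ifndef MAP_FIXED_NOREPLACE
    #define MAP_FIXED_NOREPLACE 0x100000    /* assumed value, see series */
    #endif

    int main(void)
    {
            size_t len = 4096;
            void *a = mmap(NULL, len, PROT_READ | PROT_WRITE,
                           MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

            /* map over an existing mapping: must fail, not clobber */
            void *b = mmap(a, len, PROT_READ,
                           MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED_NOREPLACE,
                           -1, 0);
            if (b == MAP_FAILED && errno == EEXIST)
                    printf("clash detected, existing mapping left intact\n");
            return 0;
    }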
[mhocko@suse.com: fix build] [akpm@linux-foundation.org: coding-style fixes] [avagin@openvz.org: don't use the same value for MAP_FIXED_NOREPLACE and MAP_SYNC] Link: http://lkml.kernel.org/r/20171218184916.24445-1-avagin@openvz.org Link: http://lkml.kernel.org/r/20171213092550.2774-3-mhocko@kernel.org Signed-off-by: Michal Hocko Signed-off-by: Andrei Vagin Signed-off-by: Michal Hocko Reviewed-by: Khalid Aziz Acked-by: Michael Ellerman Acked-by: Kees Cook Cc: Abdul Haleem Cc: Joel Stanley Cc: Anshuman Khandual Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/binfmt_elf.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 3edca6cb9a33..46f0438088d3 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -377,6 +377,11 @@ static unsigned long elf_map(struct file *filep, unsigned long addr, } else map_addr = vm_mmap(filep, addr, size, prot, type, off); + if ((type & MAP_FIXED_NOREPLACE) && BAD_ADDR(map_addr)) + pr_info("%d (%s): Uhuuh, elf segment at %p requested but the memory is mapped already\n", + task_pid_nr(current), current->comm, + (void *)addr); + return(map_addr); } @@ -575,7 +580,7 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex, elf_prot |= PROT_EXEC; vaddr = eppnt->p_vaddr; if (interp_elf_ex->e_type == ET_EXEC || load_addr_set) - elf_type |= MAP_FIXED; + elf_type |= MAP_FIXED_NOREPLACE; else if (no_base && interp_elf_ex->e_type == ET_DYN) load_addr = -vaddr; @@ -939,7 +944,7 @@ static int load_elf_binary(struct linux_binprm *bprm) * the ET_DYN load_addr calculations, proceed normally. */ if (loc->elf_ex.e_type == ET_EXEC || load_addr_set) { - elf_flags |= MAP_FIXED; + elf_flags |= MAP_FIXED_NOREPLACE; } else if (loc->elf_ex.e_type == ET_DYN) { /* * This logic is run once for the first LOAD Program @@ -975,7 +980,7 @@ static int load_elf_binary(struct linux_binprm *bprm) load_bias = ELF_ET_DYN_BASE; if (current->flags & PF_RANDOMIZE) load_bias += arch_mmap_rnd(); - elf_flags |= MAP_FIXED; + elf_flags |= MAP_FIXED_NOREPLACE; } else load_bias = 0; @@ -1235,7 +1240,7 @@ static int load_elf_library(struct file *file) (eppnt->p_filesz + ELF_PAGEOFFSET(eppnt->p_vaddr)), PROT_READ | PROT_WRITE | PROT_EXEC, - MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE, + MAP_FIXED_NOREPLACE | MAP_PRIVATE | MAP_DENYWRITE, (eppnt->p_offset - ELF_PAGEOFFSET(eppnt->p_vaddr))); if (error != ELF_PAGESTART(eppnt->p_vaddr)) -- cgit v1.2.1 From ad55eac74f2016c6dc132b9502f794156858a3d1 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Tue, 10 Apr 2018 16:36:05 -0700 Subject: elf: enforce MAP_FIXED on overlaying elf segments Anshuman has reported that with "fs, elf: drop MAP_FIXED usage from elf_map" applied, some ELF binaries in his environment fail to start with [ 23.423642] 9148 (sed): Uhuuh, elf segment at 0000000010030000 requested but the memory is mapped already [ 23.423706] requested [10030000, 10040000] mapped [10030000, 10040000] 100073 anon The reason is that the above binary has overlapping elf segments: LOAD 0x0000000000000000 0x0000000010000000 0x0000000010000000 0x0000000000013a8c 0x0000000000013a8c R E 10000 LOAD 0x000000000001fd40 0x000000001002fd40 0x000000001002fd40 0x00000000000002c0 0x00000000000005e8 RW 10000 LOAD 0x0000000000020328 0x0000000010030328 0x0000000010030328 0x0000000000000384 0x00000000000094a0 RW 10000 That binary has two RW LOAD segments, the first crosses a page border into the second 0x1002fd40 (LOAD2-vaddr) + 0x5e8 (LOAD2-memlen) == 0x10030328 (LOAD3-vaddr) 
Handle this situation by enforcing MAP_FIXED when we establish a temporary brk VMA to handle overlapping segments. All other mappings will still use MAP_FIXED_NOREPLACE. Link: http://lkml.kernel.org/r/20180213100440.GM3443@dhcp22.suse.cz Signed-off-by: Michal Hocko Reported-by: Anshuman Khandual Reviewed-by: Khalid Aziz Cc: Andrei Vagin Cc: Michael Ellerman Cc: Kees Cook Cc: Abdul Haleem Cc: Joel Stanley Cc: Stephen Rothwell Cc: Mark Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/binfmt_elf.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 46f0438088d3..41e04183e4ce 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -895,7 +895,7 @@ static int load_elf_binary(struct linux_binprm *bprm) the correct location in memory. */ for(i = 0, elf_ppnt = elf_phdata; i < loc->elf_ex.e_phnum; i++, elf_ppnt++) { - int elf_prot = 0, elf_flags; + int elf_prot = 0, elf_flags, elf_fixed = MAP_FIXED_NOREPLACE; unsigned long k, vaddr; unsigned long total_size = 0; @@ -927,6 +927,13 @@ static int load_elf_binary(struct linux_binprm *bprm) */ } } + + /* + * Some binaries have overlapping elf segments and then + * we have to forcefully map over an existing mapping + * e.g. over this newly established brk mapping. + */ + elf_fixed = MAP_FIXED; } if (elf_ppnt->p_flags & PF_R) @@ -944,7 +951,7 @@ static int load_elf_binary(struct linux_binprm *bprm) * the ET_DYN load_addr calculations, proceed normally. */ if (loc->elf_ex.e_type == ET_EXEC || load_addr_set) { - elf_flags |= MAP_FIXED_NOREPLACE; + elf_flags |= elf_fixed; } else if (loc->elf_ex.e_type == ET_DYN) { /* * This logic is run once for the first LOAD Program @@ -980,7 +987,7 @@ static int load_elf_binary(struct linux_binprm *bprm) load_bias = ELF_ET_DYN_BASE; if (current->flags & PF_RANDOMIZE) load_bias += arch_mmap_rnd(); - elf_flags |= MAP_FIXED_NOREPLACE; + elf_flags |= elf_fixed; } else load_bias = 0; -- cgit v1.2.1 From f82b376413298ddd39a2391e38260c15cdebf380 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Tue, 10 Apr 2018 16:36:44 -0700 Subject: export __set_page_dirty XFS currently contains a copy-and-paste of __set_page_dirty(). Export it from buffer.c instead. Link: http://lkml.kernel.org/r/20180313132639.17387-6-willy@infradead.org Signed-off-by: Matthew Wilcox Acked-by: Jeff Layton Reviewed-by: Darrick J. Wong Cc: Ryusuke Konishi Cc: Dave Chinner Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/buffer.c | 3 ++- fs/xfs/xfs_aops.c | 15 ++------------- 2 files changed, 4 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/buffer.c b/fs/buffer.c index ec5dd39071e6..64b1e2065b6b 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -594,7 +594,7 @@ EXPORT_SYMBOL(mark_buffer_dirty_inode); * * The caller must hold lock_page_memcg(). */ -static void __set_page_dirty(struct page *page, struct address_space *mapping, +void __set_page_dirty(struct page *page, struct address_space *mapping, int warn) { unsigned long flags; @@ -608,6 +608,7 @@ static void __set_page_dirty(struct page *page, struct address_space *mapping, } spin_unlock_irqrestore(&mapping->tree_lock, flags); } +EXPORT_SYMBOL_GPL(__set_page_dirty); /* * Add a page to the dirty page list. 
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 436a1de3fcdf..0ab824f574ed 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -1467,19 +1467,8 @@ xfs_vm_set_page_dirty( newly_dirty = !TestSetPageDirty(page); spin_unlock(&mapping->private_lock); - if (newly_dirty) { - /* sigh - __set_page_dirty() is static, so copy it here, too */ - unsigned long flags; - - spin_lock_irqsave(&mapping->tree_lock, flags); - if (page->mapping) { /* Race with truncate? */ - WARN_ON_ONCE(!PageUptodate(page)); - account_page_dirtied(page, mapping); - radix_tree_tag_set(&mapping->page_tree, - page_index(page), PAGECACHE_TAG_DIRTY); - } - spin_unlock_irqrestore(&mapping->tree_lock, flags); - } + if (newly_dirty) + __set_page_dirty(page, mapping, 1); unlock_page_memcg(page); if (newly_dirty) __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); -- cgit v1.2.1 From e5a955419642e0842fd26e1ada6ab3328018ca16 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Tue, 10 Apr 2018 16:36:48 -0700 Subject: fscache: use appropriate radix tree accessors Don't open-code accesses to data structure internals. Link: http://lkml.kernel.org/r/20180313132639.17387-7-willy@infradead.org Signed-off-by: Matthew Wilcox Reviewed-by: Jeff Layton Cc: Darrick J. Wong Cc: Dave Chinner Cc: Ryusuke Konishi Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fscache/cookie.c | 2 +- fs/fscache/object.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/fscache/cookie.c b/fs/fscache/cookie.c index 7dc55b93a830..97137d7ec5ee 100644 --- a/fs/fscache/cookie.c +++ b/fs/fscache/cookie.c @@ -832,7 +832,7 @@ void __fscache_relinquish_cookie(struct fscache_cookie *cookie, /* Clear pointers back to the netfs */ cookie->netfs_data = NULL; cookie->def = NULL; - BUG_ON(cookie->stores.rnode); + BUG_ON(!radix_tree_empty(&cookie->stores)); if (cookie->parent) { ASSERTCMP(atomic_read(&cookie->parent->usage), >, 0); diff --git a/fs/fscache/object.c b/fs/fscache/object.c index 1085ca12e25c..20e0d0a4dc8c 100644 --- a/fs/fscache/object.c +++ b/fs/fscache/object.c @@ -973,7 +973,7 @@ static const struct fscache_state *_fscache_invalidate_object(struct fscache_obj * retire the object instead. */ if (!fscache_use_cookie(object)) { - ASSERT(object->cookie->stores.rnode == NULL); + ASSERT(radix_tree_empty(&object->cookie->stores)); set_bit(FSCACHE_OBJECT_RETIRED, &object->flags); _leave(" [no cookie]"); return transit_to(KILL_OBJECT); -- cgit v1.2.1 From f6bb2a2c0b81c47282ddb7883f92e65a063c27dd Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Tue, 10 Apr 2018 16:36:52 -0700 Subject: xarray: add the xa_lock to the radix_tree_root This results in no change in structure size on 64-bit machines as it fits in the padding between the gfp_t and the void *. 32-bit machines will grow the structure from 8 to 12 bytes. Almost all radix trees are protected with (at least) a spinlock, so as they are converted from radix trees to xarrays, the data structures will shrink again. Initialising the spinlock requires a name for the benefit of lockdep, so RADIX_TREE_INIT() now needs to know the name of the radix tree it's initialising, and so do IDR_INIT() and IDA_INIT(). Also add the xa_lock() and xa_unlock() family of wrappers to make it easier to use the lock. If we could rely on -fplan9-extensions in the compiler, we could avoid all of this syntactic sugar, but that wasn't added until gcc 4.6. 
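A sketch of what the wrappers buy (invented names; lookups may still go through RCU as before):

    #include <linux/radix-tree.h>

    static RADIX_TREE(pins, GFP_ATOMIC);    /* the root now embeds xa_lock */

    static int pin_store(unsigned long index, void *item)
    {
            int err;

            xa_lock(&pins);                 /* was: a separate spinlock field */
            err = radix_tree_insert(&pins, index, item);
            xa_unlock(&pins);
            return err;
    }

    static void *pin_find(unsigned long index)
    {
            void *item;

            rcu_read_lock();
            item = radix_tree_lookup(&pins, index);
            rcu_read_unlock();
            return item;
    }

The irq-disabling variants (xa_lock_irq(), xa_lock_irqsave()) follow the usual spinlock naming, as the page-cache conversion below uses on &mapping->i_pages.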
Link: http://lkml.kernel.org/r/20180313132639.17387-8-willy@infradead.org Signed-off-by: Matthew Wilcox Reviewed-by: Jeff Layton Cc: Darrick J. Wong Cc: Dave Chinner Cc: Ryusuke Konishi Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/f2fs/gc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index bfb7a4a3a929..9327411fd93b 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -1015,7 +1015,7 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, unsigned int init_segno = segno; struct gc_inode_list gc_list = { .ilist = LIST_HEAD_INIT(gc_list.ilist), - .iroot = RADIX_TREE_INIT(GFP_NOFS), + .iroot = RADIX_TREE_INIT(gc_list.iroot, GFP_NOFS), }; trace_f2fs_gc_begin(sbi->sb, sync, background, -- cgit v1.2.1 From b93b016313b3ba8003c3b8bb71f569af91f19fc7 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Tue, 10 Apr 2018 16:36:56 -0700 Subject: page cache: use xa_lock Remove the address_space ->tree_lock and use the xa_lock newly added to the radix_tree_root. Rename the address_space ->page_tree to ->i_pages, since we don't really care that it's a tree. [willy@infradead.org: fix nds32, fs/dax.c] Link: http://lkml.kernel.org/r/20180406145415.GB20605@bombadil.infradead.orgLink: http://lkml.kernel.org/r/20180313132639.17387-9-willy@infradead.org Signed-off-by: Matthew Wilcox Acked-by: Jeff Layton Cc: Darrick J. Wong Cc: Dave Chinner Cc: Ryusuke Konishi Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/afs/write.c | 9 ++-- fs/btrfs/compression.c | 2 +- fs/btrfs/extent_io.c | 16 +++---- fs/buffer.c | 13 +++--- fs/cifs/file.c | 9 ++-- fs/dax.c | 124 ++++++++++++++++++++++++------------------------- fs/f2fs/data.c | 6 +-- fs/f2fs/dir.c | 6 +-- fs/f2fs/inline.c | 6 +-- fs/f2fs/node.c | 8 ++-- fs/fs-writeback.c | 22 ++++----- fs/inode.c | 11 ++--- fs/nilfs2/btnode.c | 20 ++++---- fs/nilfs2/page.c | 22 ++++----- 14 files changed, 134 insertions(+), 140 deletions(-) (limited to 'fs') diff --git a/fs/afs/write.c b/fs/afs/write.c index 9370e2feb999..dbc3c0b0142d 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -570,10 +570,11 @@ static int afs_writepages_region(struct address_space *mapping, _debug("wback %lx", page->index); - /* at this point we hold neither mapping->tree_lock nor lock on - * the page itself: the page may be truncated or invalidated - * (changing page->mapping to NULL), or even swizzled back from - * swapper_space to tmpfs file mapping + /* + * at this point we hold neither the i_pages lock nor the + * page lock: the page may be truncated or invalidated + * (changing page->mapping to NULL), or even swizzled + * back from swapper_space to tmpfs file mapping */ ret = lock_page_killable(page); if (ret < 0) { diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 562c3e633403..578181cd96b5 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -458,7 +458,7 @@ static noinline int add_ra_bio_pages(struct inode *inode, break; rcu_read_lock(); - page = radix_tree_lookup(&mapping->page_tree, pg_index); + page = radix_tree_lookup(&mapping->i_pages, pg_index); rcu_read_unlock(); if (page && !radix_tree_exceptional_entry(page)) { misses++; diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 47a8fe9d22e8..cf87976e389d 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3963,11 +3963,11 @@ retry: done_index = page->index; /* - * At this point we hold neither mapping->tree_lock nor - * lock on the page itself: the page may be 
truncated or - * invalidated (changing page->mapping to NULL), or even - * swizzled back from swapper_space to tmpfs file - * mapping + * At this point we hold neither the i_pages lock nor + * the page lock: the page may be truncated or + * invalidated (changing page->mapping to NULL), + * or even swizzled back from swapper_space to + * tmpfs file mapping */ if (!trylock_page(page)) { flush_write_bio(epd); @@ -5174,13 +5174,13 @@ void clear_extent_buffer_dirty(struct extent_buffer *eb) WARN_ON(!PagePrivate(page)); clear_page_dirty_for_io(page); - spin_lock_irq(&page->mapping->tree_lock); + xa_lock_irq(&page->mapping->i_pages); if (!PageDirty(page)) { - radix_tree_tag_clear(&page->mapping->page_tree, + radix_tree_tag_clear(&page->mapping->i_pages, page_index(page), PAGECACHE_TAG_DIRTY); } - spin_unlock_irq(&page->mapping->tree_lock); + xa_unlock_irq(&page->mapping->i_pages); ClearPageError(page); unlock_page(page); } diff --git a/fs/buffer.c b/fs/buffer.c index 64b1e2065b6b..f3491074b035 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -185,10 +185,9 @@ EXPORT_SYMBOL(end_buffer_write_sync); * we get exclusion from try_to_free_buffers with the blockdev mapping's * private_lock. * - * Hack idea: for the blockdev mapping, i_bufferlist_lock contention + * Hack idea: for the blockdev mapping, private_lock contention * may be quite high. This code could TryLock the page, and if that - * succeeds, there is no need to take private_lock. (But if - * private_lock is contended then so is mapping->tree_lock). + * succeeds, there is no need to take private_lock. */ static struct buffer_head * __find_get_block_slow(struct block_device *bdev, sector_t block) @@ -599,14 +598,14 @@ void __set_page_dirty(struct page *page, struct address_space *mapping, { unsigned long flags; - spin_lock_irqsave(&mapping->tree_lock, flags); + xa_lock_irqsave(&mapping->i_pages, flags); if (page->mapping) { /* Race with truncate? */ WARN_ON_ONCE(warn && !PageUptodate(page)); account_page_dirtied(page, mapping); - radix_tree_tag_set(&mapping->page_tree, + radix_tree_tag_set(&mapping->i_pages, page_index(page), PAGECACHE_TAG_DIRTY); } - spin_unlock_irqrestore(&mapping->tree_lock, flags); + xa_unlock_irqrestore(&mapping->i_pages, flags); } EXPORT_SYMBOL_GPL(__set_page_dirty); @@ -1096,7 +1095,7 @@ __getblk_slow(struct block_device *bdev, sector_t block, * inode list. * * mark_buffer_dirty() is atomic. It takes bh->b_page->mapping->private_lock, - * mapping->tree_lock and mapping->host->i_lock. + * i_pages lock and mapping->host->i_lock. 
*/ void mark_buffer_dirty(struct buffer_head *bh) { diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 7cee97b93a61..4bcd4e838b47 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -1987,11 +1987,10 @@ wdata_prepare_pages(struct cifs_writedata *wdata, unsigned int found_pages, for (i = 0; i < found_pages; i++) { page = wdata->pages[i]; /* - * At this point we hold neither mapping->tree_lock nor - * lock on the page itself: the page may be truncated or - * invalidated (changing page->mapping to NULL), or even - * swizzled back from swapper_space to tmpfs file - * mapping + * At this point we hold neither the i_pages lock nor the + * page lock: the page may be truncated or invalidated + * (changing page->mapping to NULL), or even swizzled + * back from swapper_space to tmpfs file mapping */ if (nr_pages == 0) diff --git a/fs/dax.c b/fs/dax.c index a77394fe586e..aaec72ded1b6 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -158,11 +158,9 @@ static int wake_exceptional_entry_func(wait_queue_entry_t *wait, unsigned int mo } /* - * We do not necessarily hold the mapping->tree_lock when we call this - * function so it is possible that 'entry' is no longer a valid item in the - * radix tree. This is okay because all we really need to do is to find the - * correct waitqueue where tasks might be waiting for that old 'entry' and - * wake them. + * @entry may no longer be the entry at the index in the mapping. + * The important information it's conveying is whether the entry at + * this index used to be a PMD entry. */ static void dax_wake_mapping_entry_waiter(struct address_space *mapping, pgoff_t index, void *entry, bool wake_all) @@ -174,7 +172,7 @@ static void dax_wake_mapping_entry_waiter(struct address_space *mapping, /* * Checking for locked entry and prepare_to_wait_exclusive() happens - * under mapping->tree_lock, ditto for entry handling in our callers. + * under the i_pages lock, ditto for entry handling in our callers. * So at this point all tasks that could have seen our entry locked * must be in the waitqueue and the following check will see them. */ @@ -183,41 +181,39 @@ static void dax_wake_mapping_entry_waiter(struct address_space *mapping, } /* - * Check whether the given slot is locked. The function must be called with - * mapping->tree_lock held + * Check whether the given slot is locked. Must be called with the i_pages + * lock held. */ static inline int slot_locked(struct address_space *mapping, void **slot) { unsigned long entry = (unsigned long) - radix_tree_deref_slot_protected(slot, &mapping->tree_lock); + radix_tree_deref_slot_protected(slot, &mapping->i_pages.xa_lock); return entry & RADIX_DAX_ENTRY_LOCK; } /* - * Mark the given slot is locked. The function must be called with - * mapping->tree_lock held + * Mark the given slot as locked. Must be called with the i_pages lock held. */ static inline void *lock_slot(struct address_space *mapping, void **slot) { unsigned long entry = (unsigned long) - radix_tree_deref_slot_protected(slot, &mapping->tree_lock); + radix_tree_deref_slot_protected(slot, &mapping->i_pages.xa_lock); entry |= RADIX_DAX_ENTRY_LOCK; - radix_tree_replace_slot(&mapping->page_tree, slot, (void *)entry); + radix_tree_replace_slot(&mapping->i_pages, slot, (void *)entry); return (void *)entry; } /* - * Mark the given slot is unlocked. The function must be called with - * mapping->tree_lock held + * Mark the given slot as unlocked. Must be called with the i_pages lock held. 
*/ static inline void *unlock_slot(struct address_space *mapping, void **slot) { unsigned long entry = (unsigned long) - radix_tree_deref_slot_protected(slot, &mapping->tree_lock); + radix_tree_deref_slot_protected(slot, &mapping->i_pages.xa_lock); entry &= ~(unsigned long)RADIX_DAX_ENTRY_LOCK; - radix_tree_replace_slot(&mapping->page_tree, slot, (void *)entry); + radix_tree_replace_slot(&mapping->i_pages, slot, (void *)entry); return (void *)entry; } @@ -228,7 +224,7 @@ static inline void *unlock_slot(struct address_space *mapping, void **slot) * put_locked_mapping_entry() when he locked the entry and now wants to * unlock it. * - * The function must be called with mapping->tree_lock held. + * Must be called with the i_pages lock held. */ static void *get_unlocked_mapping_entry(struct address_space *mapping, pgoff_t index, void ***slotp) @@ -241,7 +237,7 @@ static void *get_unlocked_mapping_entry(struct address_space *mapping, ewait.wait.func = wake_exceptional_entry_func; for (;;) { - entry = __radix_tree_lookup(&mapping->page_tree, index, NULL, + entry = __radix_tree_lookup(&mapping->i_pages, index, NULL, &slot); if (!entry || WARN_ON_ONCE(!radix_tree_exceptional_entry(entry)) || @@ -254,10 +250,10 @@ static void *get_unlocked_mapping_entry(struct address_space *mapping, wq = dax_entry_waitqueue(mapping, index, entry, &ewait.key); prepare_to_wait_exclusive(wq, &ewait.wait, TASK_UNINTERRUPTIBLE); - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(&mapping->i_pages); schedule(); finish_wait(wq, &ewait.wait); - spin_lock_irq(&mapping->tree_lock); + xa_lock_irq(&mapping->i_pages); } } @@ -266,15 +262,15 @@ static void dax_unlock_mapping_entry(struct address_space *mapping, { void *entry, **slot; - spin_lock_irq(&mapping->tree_lock); - entry = __radix_tree_lookup(&mapping->page_tree, index, NULL, &slot); + xa_lock_irq(&mapping->i_pages); + entry = __radix_tree_lookup(&mapping->i_pages, index, NULL, &slot); if (WARN_ON_ONCE(!entry || !radix_tree_exceptional_entry(entry) || !slot_locked(mapping, slot))) { - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(&mapping->i_pages); return; } unlock_slot(mapping, slot); - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(&mapping->i_pages); dax_wake_mapping_entry_waiter(mapping, index, entry, false); } @@ -388,7 +384,7 @@ static void *grab_mapping_entry(struct address_space *mapping, pgoff_t index, void *entry, **slot; restart: - spin_lock_irq(&mapping->tree_lock); + xa_lock_irq(&mapping->i_pages); entry = get_unlocked_mapping_entry(mapping, index, &slot); if (WARN_ON_ONCE(entry && !radix_tree_exceptional_entry(entry))) { @@ -420,12 +416,12 @@ restart: if (pmd_downgrade) { /* * Make sure 'entry' remains valid while we drop - * mapping->tree_lock. + * the i_pages lock. */ entry = lock_slot(mapping, slot); } - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(&mapping->i_pages); /* * Besides huge zero pages the only other thing that gets * downgraded are empty entries which don't need to be @@ -442,27 +438,27 @@ restart: put_locked_mapping_entry(mapping, index); return ERR_PTR(err); } - spin_lock_irq(&mapping->tree_lock); + xa_lock_irq(&mapping->i_pages); if (!entry) { /* - * We needed to drop the page_tree lock while calling + * We needed to drop the i_pages lock while calling * radix_tree_preload() and we didn't have an entry to * lock. See if another thread inserted an entry at * our index during this time. 
*/ - entry = __radix_tree_lookup(&mapping->page_tree, index, + entry = __radix_tree_lookup(&mapping->i_pages, index, NULL, &slot); if (entry) { radix_tree_preload_end(); - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(&mapping->i_pages); goto restart; } } if (pmd_downgrade) { dax_disassociate_entry(entry, mapping, false); - radix_tree_delete(&mapping->page_tree, index); + radix_tree_delete(&mapping->i_pages, index); mapping->nrexceptional--; dax_wake_mapping_entry_waiter(mapping, index, entry, true); @@ -470,11 +466,11 @@ restart: entry = dax_radix_locked_entry(0, size_flag | RADIX_DAX_EMPTY); - err = __radix_tree_insert(&mapping->page_tree, index, + err = __radix_tree_insert(&mapping->i_pages, index, dax_radix_order(entry), entry); radix_tree_preload_end(); if (err) { - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(&mapping->i_pages); /* * Our insertion of a DAX entry failed, most likely * because we were inserting a PMD entry and it @@ -487,12 +483,12 @@ restart: } /* Good, we have inserted empty locked entry into the tree. */ mapping->nrexceptional++; - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(&mapping->i_pages); return entry; } entry = lock_slot(mapping, slot); out_unlock: - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(&mapping->i_pages); return entry; } @@ -501,23 +497,23 @@ static int __dax_invalidate_mapping_entry(struct address_space *mapping, { int ret = 0; void *entry; - struct radix_tree_root *page_tree = &mapping->page_tree; + struct radix_tree_root *pages = &mapping->i_pages; - spin_lock_irq(&mapping->tree_lock); + xa_lock_irq(pages); entry = get_unlocked_mapping_entry(mapping, index, NULL); if (!entry || WARN_ON_ONCE(!radix_tree_exceptional_entry(entry))) goto out; if (!trunc && - (radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_DIRTY) || - radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE))) + (radix_tree_tag_get(pages, index, PAGECACHE_TAG_DIRTY) || + radix_tree_tag_get(pages, index, PAGECACHE_TAG_TOWRITE))) goto out; dax_disassociate_entry(entry, mapping, trunc); - radix_tree_delete(page_tree, index); + radix_tree_delete(pages, index); mapping->nrexceptional--; ret = 1; out: put_unlocked_mapping_entry(mapping, index, entry); - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(pages); return ret; } /* @@ -587,7 +583,7 @@ static void *dax_insert_mapping_entry(struct address_space *mapping, void *entry, pfn_t pfn_t, unsigned long flags, bool dirty) { - struct radix_tree_root *page_tree = &mapping->page_tree; + struct radix_tree_root *pages = &mapping->i_pages; unsigned long pfn = pfn_t_to_pfn(pfn_t); pgoff_t index = vmf->pgoff; void *new_entry; @@ -604,7 +600,7 @@ static void *dax_insert_mapping_entry(struct address_space *mapping, unmap_mapping_pages(mapping, vmf->pgoff, 1, false); } - spin_lock_irq(&mapping->tree_lock); + xa_lock_irq(pages); new_entry = dax_radix_locked_entry(pfn, flags); if (dax_entry_size(entry) != dax_entry_size(new_entry)) { dax_disassociate_entry(entry, mapping, false); @@ -624,17 +620,17 @@ static void *dax_insert_mapping_entry(struct address_space *mapping, void **slot; void *ret; - ret = __radix_tree_lookup(page_tree, index, &node, &slot); + ret = __radix_tree_lookup(pages, index, &node, &slot); WARN_ON_ONCE(ret != entry); - __radix_tree_replace(page_tree, node, slot, + __radix_tree_replace(pages, node, slot, new_entry, NULL); entry = new_entry; } if (dirty) - radix_tree_tag_set(page_tree, index, PAGECACHE_TAG_DIRTY); + radix_tree_tag_set(pages, index, PAGECACHE_TAG_DIRTY); - 
spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(pages); return entry; } @@ -723,7 +719,7 @@ unlock_pte: static int dax_writeback_one(struct dax_device *dax_dev, struct address_space *mapping, pgoff_t index, void *entry) { - struct radix_tree_root *page_tree = &mapping->page_tree; + struct radix_tree_root *pages = &mapping->i_pages; void *entry2, **slot; unsigned long pfn; long ret = 0; @@ -736,7 +732,7 @@ static int dax_writeback_one(struct dax_device *dax_dev, if (WARN_ON(!radix_tree_exceptional_entry(entry))) return -EIO; - spin_lock_irq(&mapping->tree_lock); + xa_lock_irq(pages); entry2 = get_unlocked_mapping_entry(mapping, index, &slot); /* Entry got punched out / reallocated? */ if (!entry2 || WARN_ON_ONCE(!radix_tree_exceptional_entry(entry2))) @@ -755,7 +751,7 @@ static int dax_writeback_one(struct dax_device *dax_dev, } /* Another fsync thread may have already written back this entry */ - if (!radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE)) + if (!radix_tree_tag_get(pages, index, PAGECACHE_TAG_TOWRITE)) goto put_unlocked; /* Lock the entry to serialize with page faults */ entry = lock_slot(mapping, slot); @@ -763,11 +759,11 @@ static int dax_writeback_one(struct dax_device *dax_dev, * We can clear the tag now but we have to be careful so that concurrent * dax_writeback_one() calls for the same index cannot finish before we * actually flush the caches. This is achieved as the calls will look - * at the entry only under tree_lock and once they do that they will - * see the entry locked and wait for it to unlock. + * at the entry only under the i_pages lock and once they do that + * they will see the entry locked and wait for it to unlock. */ - radix_tree_tag_clear(page_tree, index, PAGECACHE_TAG_TOWRITE); - spin_unlock_irq(&mapping->tree_lock); + radix_tree_tag_clear(pages, index, PAGECACHE_TAG_TOWRITE); + xa_unlock_irq(pages); /* * Even if dax_writeback_mapping_range() was given a wbc->range_start @@ -787,16 +783,16 @@ static int dax_writeback_one(struct dax_device *dax_dev, * the pfn mappings are writeprotected and fault waits for mapping * entry lock. */ - spin_lock_irq(&mapping->tree_lock); - radix_tree_tag_clear(page_tree, index, PAGECACHE_TAG_DIRTY); - spin_unlock_irq(&mapping->tree_lock); + xa_lock_irq(pages); + radix_tree_tag_clear(pages, index, PAGECACHE_TAG_DIRTY); + xa_unlock_irq(pages); trace_dax_writeback_one(mapping->host, index, size >> PAGE_SHIFT); put_locked_mapping_entry(mapping, index); return ret; put_unlocked: put_unlocked_mapping_entry(mapping, index, entry2); - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(pages); return ret; } @@ -1566,21 +1562,21 @@ static int dax_insert_pfn_mkwrite(struct vm_fault *vmf, pgoff_t index = vmf->pgoff; int vmf_ret, error; - spin_lock_irq(&mapping->tree_lock); + xa_lock_irq(&mapping->i_pages); entry = get_unlocked_mapping_entry(mapping, index, &slot); /* Did we race with someone splitting entry or so? 
*/ if (!entry || (pe_size == PE_SIZE_PTE && !dax_is_pte_entry(entry)) || (pe_size == PE_SIZE_PMD && !dax_is_pmd_entry(entry))) { put_unlocked_mapping_entry(mapping, index, entry); - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(&mapping->i_pages); trace_dax_insert_pfn_mkwrite_no_entry(mapping->host, vmf, VM_FAULT_NOPAGE); return VM_FAULT_NOPAGE; } - radix_tree_tag_set(&mapping->page_tree, index, PAGECACHE_TAG_DIRTY); + radix_tree_tag_set(&mapping->i_pages, index, PAGECACHE_TAG_DIRTY); entry = lock_slot(mapping, slot); - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(&mapping->i_pages); switch (pe_size) { case PE_SIZE_PTE: error = vm_insert_mixed_mkwrite(vmf->vma, vmf->address, pfn); diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index db50686f5096..02237d4d91f5 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -2424,12 +2424,12 @@ void f2fs_set_page_dirty_nobuffers(struct page *page) SetPageDirty(page); spin_unlock(&mapping->private_lock); - spin_lock_irqsave(&mapping->tree_lock, flags); + xa_lock_irqsave(&mapping->i_pages, flags); WARN_ON_ONCE(!PageUptodate(page)); account_page_dirtied(page, mapping); - radix_tree_tag_set(&mapping->page_tree, + radix_tree_tag_set(&mapping->i_pages, page_index(page), PAGECACHE_TAG_DIRTY); - spin_unlock_irqrestore(&mapping->tree_lock, flags); + xa_unlock_irqrestore(&mapping->i_pages, flags); unlock_page_memcg(page); __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index fe661274ff10..8c9c2f31b253 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -732,10 +732,10 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, if (bit_pos == NR_DENTRY_IN_BLOCK && !truncate_hole(dir, page->index, page->index + 1)) { - spin_lock_irqsave(&mapping->tree_lock, flags); - radix_tree_tag_clear(&mapping->page_tree, page_index(page), + xa_lock_irqsave(&mapping->i_pages, flags); + radix_tree_tag_clear(&mapping->i_pages, page_index(page), PAGECACHE_TAG_DIRTY); - spin_unlock_irqrestore(&mapping->tree_lock, flags); + xa_unlock_irqrestore(&mapping->i_pages, flags); clear_page_dirty_for_io(page); ClearPagePrivate(page); diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index 3b77d6421218..265da200daa8 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -226,10 +226,10 @@ int f2fs_write_inline_data(struct inode *inode, struct page *page) kunmap_atomic(src_addr); set_page_dirty(dn.inode_page); - spin_lock_irqsave(&mapping->tree_lock, flags); - radix_tree_tag_clear(&mapping->page_tree, page_index(page), + xa_lock_irqsave(&mapping->i_pages, flags); + radix_tree_tag_clear(&mapping->i_pages, page_index(page), PAGECACHE_TAG_DIRTY); - spin_unlock_irqrestore(&mapping->tree_lock, flags); + xa_unlock_irqrestore(&mapping->i_pages, flags); set_inode_flag(inode, FI_APPEND_WRITE); set_inode_flag(inode, FI_DATA_EXIST); diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 9a99243054ba..f202398e20ea 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -91,11 +91,11 @@ static void clear_node_page_dirty(struct page *page) unsigned int long flags; if (PageDirty(page)) { - spin_lock_irqsave(&mapping->tree_lock, flags); - radix_tree_tag_clear(&mapping->page_tree, + xa_lock_irqsave(&mapping->i_pages, flags); + radix_tree_tag_clear(&mapping->i_pages, page_index(page), PAGECACHE_TAG_DIRTY); - spin_unlock_irqrestore(&mapping->tree_lock, flags); + xa_unlock_irqrestore(&mapping->i_pages, flags); clear_page_dirty_for_io(page); dec_page_count(F2FS_M_SB(mapping), F2FS_DIRTY_NODES); @@ -1161,7 +1161,7 @@ void ra_node_page(struct 
f2fs_sb_info *sbi, nid_t nid) f2fs_bug_on(sbi, check_nid_range(sbi, nid)); rcu_read_lock(); - apage = radix_tree_lookup(&NODE_MAPPING(sbi)->page_tree, nid); + apage = radix_tree_lookup(&NODE_MAPPING(sbi)->i_pages, nid); rcu_read_unlock(); if (apage) return; diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 1280f915079b..4b12ba70a895 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -347,9 +347,9 @@ static void inode_switch_wbs_work_fn(struct work_struct *work) * By the time control reaches here, RCU grace period has passed * since I_WB_SWITCH assertion and all wb stat update transactions * between unlocked_inode_to_wb_begin/end() are guaranteed to be - * synchronizing against mapping->tree_lock. + * synchronizing against the i_pages lock. * - * Grabbing old_wb->list_lock, inode->i_lock and mapping->tree_lock + * Grabbing old_wb->list_lock, inode->i_lock and the i_pages lock * gives us exclusion against all wb related operations on @inode * including IO list manipulations and stat updates. */ @@ -361,7 +361,7 @@ static void inode_switch_wbs_work_fn(struct work_struct *work) spin_lock_nested(&old_wb->list_lock, SINGLE_DEPTH_NESTING); } spin_lock(&inode->i_lock); - spin_lock_irq(&mapping->tree_lock); + xa_lock_irq(&mapping->i_pages); /* * Once I_FREEING is visible under i_lock, the eviction path owns @@ -373,22 +373,22 @@ static void inode_switch_wbs_work_fn(struct work_struct *work) /* * Count and transfer stats. Note that PAGECACHE_TAG_DIRTY points * to possibly dirty pages while PAGECACHE_TAG_WRITEBACK points to - * pages actually under underwriteback. + * pages actually under writeback. */ - radix_tree_for_each_tagged(slot, &mapping->page_tree, &iter, 0, + radix_tree_for_each_tagged(slot, &mapping->i_pages, &iter, 0, PAGECACHE_TAG_DIRTY) { struct page *page = radix_tree_deref_slot_protected(slot, - &mapping->tree_lock); + &mapping->i_pages.xa_lock); if (likely(page) && PageDirty(page)) { dec_wb_stat(old_wb, WB_RECLAIMABLE); inc_wb_stat(new_wb, WB_RECLAIMABLE); } } - radix_tree_for_each_tagged(slot, &mapping->page_tree, &iter, 0, + radix_tree_for_each_tagged(slot, &mapping->i_pages, &iter, 0, PAGECACHE_TAG_WRITEBACK) { struct page *page = radix_tree_deref_slot_protected(slot, - &mapping->tree_lock); + &mapping->i_pages.xa_lock); if (likely(page)) { WARN_ON_ONCE(!PageWriteback(page)); dec_wb_stat(old_wb, WB_WRITEBACK); @@ -430,7 +430,7 @@ skip_switch: */ smp_store_release(&inode->i_state, inode->i_state & ~I_WB_SWITCH); - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(&mapping->i_pages); spin_unlock(&inode->i_lock); spin_unlock(&new_wb->list_lock); spin_unlock(&old_wb->list_lock); @@ -506,8 +506,8 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id) /* * In addition to synchronizing among switchers, I_WB_SWITCH tells - * the RCU protected stat update paths to grab the mapping's - * tree_lock so that stat transfer can synchronize against them. + * the RCU protected stat update paths to grab the i_pages + * lock so that stat transfer can synchronize against them. * Let's continue after I_WB_SWITCH is guaranteed to be visible.
*/ call_rcu(&isw->rcu_head, inode_switch_wbs_rcu_fn); diff --git a/fs/inode.c b/fs/inode.c index b153aeaa61ea..13ceb98c3bd3 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -348,8 +348,7 @@ EXPORT_SYMBOL(inc_nlink); static void __address_space_init_once(struct address_space *mapping) { - INIT_RADIX_TREE(&mapping->page_tree, GFP_ATOMIC | __GFP_ACCOUNT); - spin_lock_init(&mapping->tree_lock); + INIT_RADIX_TREE(&mapping->i_pages, GFP_ATOMIC | __GFP_ACCOUNT); init_rwsem(&mapping->i_mmap_rwsem); INIT_LIST_HEAD(&mapping->private_list); spin_lock_init(&mapping->private_lock); @@ -504,14 +503,14 @@ EXPORT_SYMBOL(__remove_inode_hash); void clear_inode(struct inode *inode) { /* - * We have to cycle tree_lock here because reclaim can be still in the + * We have to cycle the i_pages lock here because reclaim can be in the * process of removing the last page (in __delete_from_page_cache()) - * and we must not free mapping under it. + * and we must not free the mapping under it. */ - spin_lock_irq(&inode->i_data.tree_lock); + xa_lock_irq(&inode->i_data.i_pages); BUG_ON(inode->i_data.nrpages); BUG_ON(inode->i_data.nrexceptional); - spin_unlock_irq(&inode->i_data.tree_lock); + xa_unlock_irq(&inode->i_data.i_pages); BUG_ON(!list_empty(&inode->i_data.private_list)); BUG_ON(!(inode->i_state & I_FREEING)); BUG_ON(inode->i_state & I_CLEAR); diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c index c21e0b4454a6..dec98cab729d 100644 --- a/fs/nilfs2/btnode.c +++ b/fs/nilfs2/btnode.c @@ -193,9 +193,9 @@ retry: (unsigned long long)oldkey, (unsigned long long)newkey); - spin_lock_irq(&btnc->tree_lock); - err = radix_tree_insert(&btnc->page_tree, newkey, obh->b_page); - spin_unlock_irq(&btnc->tree_lock); + xa_lock_irq(&btnc->i_pages); + err = radix_tree_insert(&btnc->i_pages, newkey, obh->b_page); + xa_unlock_irq(&btnc->i_pages); /* * Note: page->index will not change to newkey until * nilfs_btnode_commit_change_key() will be called. 
@@ -251,11 +251,11 @@ void nilfs_btnode_commit_change_key(struct address_space *btnc, (unsigned long long)newkey); mark_buffer_dirty(obh); - spin_lock_irq(&btnc->tree_lock); - radix_tree_delete(&btnc->page_tree, oldkey); - radix_tree_tag_set(&btnc->page_tree, newkey, + xa_lock_irq(&btnc->i_pages); + radix_tree_delete(&btnc->i_pages, oldkey); + radix_tree_tag_set(&btnc->i_pages, newkey, PAGECACHE_TAG_DIRTY); - spin_unlock_irq(&btnc->tree_lock); + xa_unlock_irq(&btnc->i_pages); opage->index = obh->b_blocknr = newkey; unlock_page(opage); @@ -283,9 +283,9 @@ void nilfs_btnode_abort_change_key(struct address_space *btnc, return; if (nbh == NULL) { /* blocksize == pagesize */ - spin_lock_irq(&btnc->tree_lock); - radix_tree_delete(&btnc->page_tree, newkey); - spin_unlock_irq(&btnc->tree_lock); + xa_lock_irq(&btnc->i_pages); + radix_tree_delete(&btnc->i_pages, newkey); + xa_unlock_irq(&btnc->i_pages); unlock_page(ctxt->bh->b_page); } else brelse(nbh); diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c index 68241512d7c1..4cb850a6f1c2 100644 --- a/fs/nilfs2/page.c +++ b/fs/nilfs2/page.c @@ -331,15 +331,15 @@ repeat: struct page *page2; /* move the page to the destination cache */ - spin_lock_irq(&smap->tree_lock); - page2 = radix_tree_delete(&smap->page_tree, offset); + xa_lock_irq(&smap->i_pages); + page2 = radix_tree_delete(&smap->i_pages, offset); WARN_ON(page2 != page); smap->nrpages--; - spin_unlock_irq(&smap->tree_lock); + xa_unlock_irq(&smap->i_pages); - spin_lock_irq(&dmap->tree_lock); - err = radix_tree_insert(&dmap->page_tree, offset, page); + xa_lock_irq(&dmap->i_pages); + err = radix_tree_insert(&dmap->i_pages, offset, page); if (unlikely(err < 0)) { WARN_ON(err == -EEXIST); page->mapping = NULL; @@ -348,11 +348,11 @@ repeat: page->mapping = dmap; dmap->nrpages++; if (PageDirty(page)) - radix_tree_tag_set(&dmap->page_tree, + radix_tree_tag_set(&dmap->i_pages, offset, PAGECACHE_TAG_DIRTY); } - spin_unlock_irq(&dmap->tree_lock); + xa_unlock_irq(&dmap->i_pages); } unlock_page(page); } @@ -474,15 +474,15 @@ int __nilfs_clear_page_dirty(struct page *page) struct address_space *mapping = page->mapping; if (mapping) { - spin_lock_irq(&mapping->tree_lock); + xa_lock_irq(&mapping->i_pages); if (test_bit(PG_dirty, &page->flags)) { - radix_tree_tag_clear(&mapping->page_tree, + radix_tree_tag_clear(&mapping->i_pages, page_index(page), PAGECACHE_TAG_DIRTY); - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(&mapping->i_pages); return clear_page_dirty_for_io(page); } - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(&mapping->i_pages); return 0; } return TestClearPageDirty(page); -- cgit v1.2.1
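The conversion above is mechanical: every spin_lock_irq()/spin_lock_irqsave() on the old mapping->tree_lock becomes the matching xa_lock_irq()/xa_lock_irqsave() on mapping->i_pages, and every radix tree call takes &mapping->i_pages in place of &mapping->page_tree; where an API wants the spinlock itself (radix_tree_deref_slot_protected()), the embedded lock is named directly as &mapping->i_pages.xa_lock. A minimal sketch of the post-conversion idiom, assuming kernel context at this commit (example_clear_dirty() is a hypothetical helper, not part of the patch; the calls inside it are taken from the hunks above):

#include <linux/fs.h>
#include <linux/pagemap.h>
#include <linux/radix-tree.h>
#include <linux/xarray.h>

/*
 * Hypothetical helper: clear the dirty tag on one page-cache index.
 * xa_lock_irq(&mapping->i_pages) is the drop-in replacement for
 * spin_lock_irq(&mapping->tree_lock); callers that must save and
 * restore the interrupt state use xa_lock_irqsave()/xa_unlock_irqrestore()
 * instead, as the f2fs hunks above do.
 */
static void example_clear_dirty(struct address_space *mapping, pgoff_t index)
{
	xa_lock_irq(&mapping->i_pages);
	radix_tree_tag_clear(&mapping->i_pages, index, PAGECACHE_TAG_DIRTY);
	xa_unlock_irq(&mapping->i_pages);
}

The xa_ naming anticipates the XArray rework: since callers now reference only i_pages and its embedded lock, the underlying radix tree implementation can presumably be swapped out later without touching these call sites again.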