From 28b83c5193e7ab951e402252278f2cc79dc4d298 Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Mon, 21 Sep 2009 17:03:13 -0700 Subject: oom: move oom_adj value from task_struct to signal_struct Currently, OOM logic callflow is here. __out_of_memory() select_bad_process() for each task badness() calculate badness of one task oom_kill_process() search child oom_kill_task() kill target task and mm shared tasks with it example, process-A have two thread, thread-A and thread-B and it have very fat memory and each thread have following oom_adj and oom_score. thread-A: oom_adj = OOM_DISABLE, oom_score = 0 thread-B: oom_adj = 0, oom_score = very-high Then, select_bad_process() select thread-B, but oom_kill_task() refuse kill the task because thread-A have OOM_DISABLE. Thus __out_of_memory() call select_bad_process() again. but select_bad_process() select the same task. It mean kernel fall in livelock. The fact is, select_bad_process() must select killable task. otherwise OOM logic go into livelock. And root cause is, oom_adj shouldn't be per-thread value. it should be per-process value because OOM-killer kill a process, not thread. Thus This patch moves oomkilladj (now more appropriately named oom_adj) from struct task_struct to struct signal_struct. it naturally prevent select_bad_process() choose wrong task. Signed-off-by: KOSAKI Motohiro Cc: Paul Menage Cc: David Rientjes Cc: KAMEZAWA Hiroyuki Cc: Rik van Riel Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/base.c | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) (limited to 'fs/proc/base.c') diff --git a/fs/proc/base.c b/fs/proc/base.c index 6f742f6658a9..81cfff82875b 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -999,11 +999,17 @@ static ssize_t oom_adjust_read(struct file *file, char __user *buf, struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode); char buffer[PROC_NUMBUF]; size_t len; - int oom_adjust; + int oom_adjust = OOM_DISABLE; + unsigned long flags; if (!task) return -ESRCH; - oom_adjust = task->oomkilladj; + + if (lock_task_sighand(task, &flags)) { + oom_adjust = task->signal->oom_adj; + unlock_task_sighand(task, &flags); + } + put_task_struct(task); len = snprintf(buffer, sizeof(buffer), "%i\n", oom_adjust); @@ -1017,6 +1023,7 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf, struct task_struct *task; char buffer[PROC_NUMBUF], *end; int oom_adjust; + unsigned long flags; memset(buffer, 0, sizeof(buffer)); if (count > sizeof(buffer) - 1) @@ -1032,11 +1039,20 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf, task = get_proc_task(file->f_path.dentry->d_inode); if (!task) return -ESRCH; - if (oom_adjust < task->oomkilladj && !capable(CAP_SYS_RESOURCE)) { + if (!lock_task_sighand(task, &flags)) { + put_task_struct(task); + return -ESRCH; + } + + if (oom_adjust < task->signal->oom_adj && !capable(CAP_SYS_RESOURCE)) { + unlock_task_sighand(task, &flags); put_task_struct(task); return -EACCES; } - task->oomkilladj = oom_adjust; + + task->signal->oom_adj = oom_adjust; + + unlock_task_sighand(task, &flags); put_task_struct(task); if (end - buffer == 0) return -EIO; -- cgit v1.2.1 From 495789a51a91cb8c015d8d77fecbac1caf20b186 Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Mon, 21 Sep 2009 17:03:14 -0700 Subject: oom: make oom_score to per-process value oom-killer kills a process, not task. Then oom_score should be calculated as per-process too. it makes consistency more and makes speed up select_bad_process(). Signed-off-by: KOSAKI Motohiro Cc: Paul Menage Cc: David Rientjes Cc: KAMEZAWA Hiroyuki Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/base.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/proc/base.c') diff --git a/fs/proc/base.c b/fs/proc/base.c index 81cfff82875b..71a34253dcbb 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -447,7 +447,7 @@ static int proc_oom_score(struct task_struct *task, char *buffer) do_posix_clock_monotonic_gettime(&uptime); read_lock(&tasklist_lock); - points = badness(task, uptime.tv_sec); + points = badness(task->group_leader, uptime.tv_sec); read_unlock(&tasklist_lock); return sprintf(buffer, "%lu\n", points); } -- cgit v1.2.1 From 5d863b89688e5811cd9e5bd0082cb38abe03adf3 Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Mon, 21 Sep 2009 17:03:16 -0700 Subject: oom: fix oom_adjust_write() input sanity check Andrew Morton pointed out oom_adjust_write() has very strange EIO and new line handling. this patch fixes it. Signed-off-by: KOSAKI Motohiro Cc: Paul Menage Cc: David Rientjes Cc: KAMEZAWA Hiroyuki Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/base.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) (limited to 'fs/proc/base.c') diff --git a/fs/proc/base.c b/fs/proc/base.c index 71a34253dcbb..55c4c805a756 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1021,21 +1021,24 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { struct task_struct *task; - char buffer[PROC_NUMBUF], *end; - int oom_adjust; + char buffer[PROC_NUMBUF]; + long oom_adjust; unsigned long flags; + int err; memset(buffer, 0, sizeof(buffer)); if (count > sizeof(buffer) - 1) count = sizeof(buffer) - 1; if (copy_from_user(buffer, buf, count)) return -EFAULT; - oom_adjust = simple_strtol(buffer, &end, 0); + + err = strict_strtol(strstrip(buffer), 0, &oom_adjust); + if (err) + return -EINVAL; if ((oom_adjust < OOM_ADJUST_MIN || oom_adjust > OOM_ADJUST_MAX) && oom_adjust != OOM_DISABLE) return -EINVAL; - if (*end == '\n') - end++; + task = get_proc_task(file->f_path.dentry->d_inode); if (!task) return -ESRCH; @@ -1054,9 +1057,8 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf, unlock_task_sighand(task, &flags); put_task_struct(task); - if (end - buffer == 0) - return -EIO; - return end - buffer; + + return count; } static const struct file_operations proc_oom_adjust_operations = { -- cgit v1.2.1 From cff4edb591c153a779a27a3fd8e7bc1217f2f6b8 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 22 Sep 2009 16:45:32 -0700 Subject: proc: fix reported unit for RLIMIT_CPU /proc/$pid/limits should show RLIMIT_CPU as seconds, which is the unit used in kernel/posix-cpu-timers.c: unsigned long psecs = cputime_to_secs(ptime); ... if (psecs >= sig->rlim[RLIMIT_CPU].rlim_max) { ... __group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk); Signed-off-by: Kees Cook Acked-by: WANG Cong Acked-by: Neil Horman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/base.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/proc/base.c') diff --git a/fs/proc/base.c b/fs/proc/base.c index 55c4c805a756..69bb70351b9b 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -458,7 +458,7 @@ struct limit_names { }; static const struct limit_names lnames[RLIM_NLIMITS] = { - [RLIMIT_CPU] = {"Max cpu time", "ms"}, + [RLIMIT_CPU] = {"Max cpu time", "seconds"}, [RLIMIT_FSIZE] = {"Max file size", "bytes"}, [RLIMIT_DATA] = {"Max data size", "bytes"}, [RLIMIT_STACK] = {"Max stack size", "bytes"}, -- cgit v1.2.1 From 9b4d1cbef8f41aff2b3e4ca31f566c071fe601de Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 22 Sep 2009 16:45:34 -0700 Subject: proc_flush_task: flush /proc/tid/task/pid when a sub-thread exits The exiting sub-thread flushes /proc/pid only, but this doesn't buy too much: ps and friends mostly use /proc/tid/task/pid. Remove "if (thread_group_leader())" checks from proc_flush_task() path, this means we always remove /proc/tid/task/pid dentry on exit, and this actually matches the comment above proc_flush_task(). The test-case: static void* tfunc(void *arg) { char name[256]; sprintf(name, "/proc/%d/task/%ld/status", getpid(), gettid()); close(open(name, O_RDONLY)); return NULL; } int main(void) { pthread_t t; for (;;) { if (!pthread_create(&t, NULL, &tfunc, NULL)) pthread_join(t, NULL); } } slabtop shows that pid/proc_inode_cache/etc grow quickly and "indefinitely" until the task is killed or shrink_slab() is called, not good. And the main thread needs a lot of time to exit. The same can happen if something like "ps -efL" runs continuously, while some application spawns short-living threads. Reported-by: "James M. Leddy" Signed-off-by: Oleg Nesterov Cc: Alexey Dobriyan Cc: "Eric W. Biederman" Cc: Dominic Duval Cc: Frank Hirtz Cc: "Fuller, Johnray" Cc: Larry Woodman Cc: Paul Batkowski Cc: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/base.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) (limited to 'fs/proc/base.c') diff --git a/fs/proc/base.c b/fs/proc/base.c index 69bb70351b9b..5bc587049b37 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -2604,9 +2604,6 @@ static void proc_flush_task_mnt(struct vfsmount *mnt, pid_t pid, pid_t tgid) dput(dentry); } - if (tgid == 0) - goto out; - name.name = buf; name.len = snprintf(buf, sizeof(buf), "%d", tgid); leader = d_hash_and_lookup(mnt->mnt_root, &name); @@ -2663,17 +2660,16 @@ out: void proc_flush_task(struct task_struct *task) { int i; - struct pid *pid, *tgid = NULL; + struct pid *pid, *tgid; struct upid *upid; pid = task_pid(task); - if (thread_group_leader(task)) - tgid = task_tgid(task); + tgid = task_tgid(task); for (i = 0; i <= pid->level; i++) { upid = &pid->numbers[i]; proc_flush_task_mnt(upid->ns->proc_mnt, upid->nr, - tgid ? tgid->numbers[i].nr : 0); + tgid->numbers[i].nr); } upid = &pid->numbers[pid->level]; -- cgit v1.2.1 From cba8aafe1e07dfc8bae5ba78be8e02883bd34d31 Mon Sep 17 00:00:00 2001 From: Vincent Li Date: Tue, 22 Sep 2009 16:45:38 -0700 Subject: fs/proc/base.c: fix proc_fault_inject_write() input sanity check Remove obfuscated zero-length input check and return -EINVAL instead of -EIO error to make the error message clear to user. Add whitespace stripping. No functionality changes. The old code: echo 1 > /proc/pid/make-it-fail (ok) echo 1foo > /proc/pid/make-it-fail (-bash: echo: write error: Input/output error) The new code: echo 1 > /proc/pid/make-it-fail (ok) echo 1foo > /proc/pid/make-it-fail (-bash: echo: write error: Invalid argument) This patch is conservative in changes to not breaking existing scripts/applications. Signed-off-by: Vincent Li Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/base.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'fs/proc/base.c') diff --git a/fs/proc/base.c b/fs/proc/base.c index 5bc587049b37..837469a96598 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1187,17 +1187,16 @@ static ssize_t proc_fault_inject_write(struct file * file, count = sizeof(buffer) - 1; if (copy_from_user(buffer, buf, count)) return -EFAULT; - make_it_fail = simple_strtol(buffer, &end, 0); - if (*end == '\n') - end++; + make_it_fail = simple_strtol(strstrip(buffer), &end, 0); + if (*end) + return -EINVAL; task = get_proc_task(file->f_dentry->d_inode); if (!task) return -ESRCH; task->make_it_fail = make_it_fail; put_task_struct(task); - if (end - buffer == 0) - return -EIO; - return end - buffer; + + return count; } static const struct file_operations proc_fault_inject_operations = { -- cgit v1.2.1