From a433658c30974fc87ba3ff52d7e4e6299762aa3d Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Wed, 15 Jun 2011 15:08:13 -0700 Subject: vmscan,memcg: memcg aware swap token Currently, memcg reclaim can disable swap token even if the swap token mm doesn't belong in its memory cgroup. It's slightly risky. If an admin creates very small mem-cgroup and silly guy runs contentious heavy memory pressure workload, every tasks are going to lose swap token and then system may become unresponsive. That's bad. This patch adds 'memcg' parameter into disable_swap_token(). and if the parameter doesn't match swap token, VM doesn't disable it. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: KOSAKI Motohiro Reviewed-by: KAMEZAWA Hiroyuki Reviewed-by: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index bd9052a5d3ad..e37c44d88d46 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -735,7 +735,7 @@ struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p) struct mem_cgroup, css); } -static struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm) +struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm) { struct mem_cgroup *mem = NULL; @@ -5414,18 +5414,16 @@ static void mem_cgroup_move_task(struct cgroup_subsys *ss, struct cgroup *old_cont, struct task_struct *p) { - struct mm_struct *mm; + struct mm_struct *mm = get_task_mm(p); - if (!mc.to) - /* no need to move charge */ - return; - - mm = get_task_mm(p); if (mm) { - mem_cgroup_move_charge(mm); + if (mc.to) + mem_cgroup_move_charge(mm); + put_swap_token(mm); mmput(mm); } - mem_cgroup_clear_mc(); + if (mc.to) + mem_cgroup_clear_mc(); } #else /* !CONFIG_MMU */ static int mem_cgroup_can_attach(struct cgroup_subsys *ss, -- cgit v1.2.3 From 8957712710e045044e3c44375c6a87d7ffa17d51 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Wed, 15 Jun 2011 15:08:41 -0700 Subject: mm: memory.numa_stat: fix file permission Commit 406eb0c9ba76 ("memcg: add memory.numastat api for numa statistics") adds memory.numa_stat file for memory cgroup. But the file permissions are wrong. [kamezawa@bluextal linux-2.6]$ ls -l /cgroup/memory/A/memory.numa_stat ---------- 1 root root 0 Jun 9 18:36 /cgroup/memory/A/memory.numa_stat This patch fixes the permission as [root@bluextal kamezawa]# ls -l /cgroup/memory/A/memory.numa_stat -r--r--r-- 1 root root 0 Jun 10 16:49 /cgroup/memory/A/memory.numa_stat Signed-off-by: KAMEZAWA Hiroyuki Acked-by: Ying Han Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 1 + 1 file changed, 1 insertion(+) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index e37c44d88d46..02a7947608ad 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -4640,6 +4640,7 @@ static struct cftype mem_cgroup_files[] = { { .name = "numa_stat", .open = mem_control_numa_stat_open, + .mode = S_IRUGO, }, #endif }; -- cgit v1.2.3 From 7ae534d074e01e54d5cfbc9734b73fdfc855501f Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Wed, 15 Jun 2011 15:08:44 -0700 Subject: memcg: fix wrong check of noswap with softlimit Hierarchical reclaim doesn't swap out if memsw and resource limits are thye same (memsw_is_minimum == true) because we would hit mem+swap limit anyway (during hard limit reclaim). If it comes to the soft limit we shouldn't consider memsw_is_minimum at all because it doesn't make much sense. Either the soft limit is bellow the hard limit and then we cannot hit mem+swap limit or the direct reclaim takes a precedence. Signed-off-by: KAMEZAWA Hiroyuki Reviewed-by: Michal Hocko Acked-by: Daisuke Nishimura Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 02a7947608ad..0b1a32cbd74d 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1663,7 +1663,7 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, excess = res_counter_soft_limit_excess(&root_mem->res) >> PAGE_SHIFT; /* If memsw_is_minimum==1, swap-out is of-no-use. */ - if (root_mem->memsw_is_minimum) + if (!check_soft && root_mem->memsw_is_minimum) noswap = true; while (1) { -- cgit v1.2.3 From 26fe616844491a41a1abc02e29f7a9d1ec2f8ddb Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Wed, 15 Jun 2011 15:08:45 -0700 Subject: memcg: fix percpu cached charge draining frequency For performance, memory cgroup caches some "charge" from res_counter into per cpu cache. This works well but because it's cache, it needs to be flushed in some cases. Typical cases are 1. when someone hit limit. 2. when rmdir() is called and need to charges to be 0. But "1" has problem. Recently, with large SMP machines, we see many kworker runs because of flushing memcg's cache. Bad things in implementation are that even if a cpu contains a cache for memcg not related to a memcg which hits limit, drain code is called. This patch does A) check percpu cache contains a useful data or not. B) check other asynchronous percpu draining doesn't run. C) don't call local cpu callback. (*)This patch avoid changing the calling condition with hard-limit. When I run "cat 1Gfile > /dev/null" under 300M limit memcg, [Before] 13767 kamezawa 20 0 98.6m 424 416 D 10.0 0.0 0:00.61 cat 58 root 20 0 0 0 0 S 0.6 0.0 0:00.09 kworker/2:1 60 root 20 0 0 0 0 S 0.6 0.0 0:00.08 kworker/4:1 4 root 20 0 0 0 0 S 0.3 0.0 0:00.02 kworker/0:0 57 root 20 0 0 0 0 S 0.3 0.0 0:00.05 kworker/1:1 61 root 20 0 0 0 0 S 0.3 0.0 0:00.05 kworker/5:1 62 root 20 0 0 0 0 S 0.3 0.0 0:00.05 kworker/6:1 63 root 20 0 0 0 0 S 0.3 0.0 0:00.05 kworker/7:1 [After] 2676 root 20 0 98.6m 416 416 D 9.3 0.0 0:00.87 cat 2626 kamezawa 20 0 15192 1312 920 R 0.3 0.0 0:00.28 top 1 root 20 0 19384 1496 1204 S 0.0 0.0 0:00.66 init 2 root 20 0 0 0 0 S 0.0 0.0 0:00.00 kthreadd 3 root 20 0 0 0 0 S 0.0 0.0 0:00.00 ksoftirqd/0 4 root 20 0 0 0 0 S 0.0 0.0 0:00.00 kworker/0:0 [akpm@linux-foundation.org: make percpu_charge_mutex static, tweak comments] Signed-off-by: KAMEZAWA Hiroyuki Acked-by: Daisuke Nishimura Reviewed-by: Michal Hocko Tested-by: Ying Han Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 54 ++++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 38 insertions(+), 16 deletions(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 0b1a32cbd74d..c39a177bb641 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -359,7 +359,7 @@ enum charge_type { static void mem_cgroup_get(struct mem_cgroup *mem); static void mem_cgroup_put(struct mem_cgroup *mem); static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem); -static void drain_all_stock_async(void); +static void drain_all_stock_async(struct mem_cgroup *mem); static struct mem_cgroup_per_zone * mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid) @@ -1671,7 +1671,7 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, if (victim == root_mem) { loop++; if (loop >= 1) - drain_all_stock_async(); + drain_all_stock_async(root_mem); if (loop >= 2) { /* * If we have not been able to reclaim @@ -1934,9 +1934,11 @@ struct memcg_stock_pcp { struct mem_cgroup *cached; /* this never be root cgroup */ unsigned int nr_pages; struct work_struct work; + unsigned long flags; +#define FLUSHING_CACHED_CHARGE (0) }; static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock); -static atomic_t memcg_drain_count; +static DEFINE_MUTEX(percpu_charge_mutex); /* * Try to consume stocked charge on this cpu. If success, one page is consumed @@ -1984,6 +1986,7 @@ static void drain_local_stock(struct work_struct *dummy) { struct memcg_stock_pcp *stock = &__get_cpu_var(memcg_stock); drain_stock(stock); + clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags); } /* @@ -2008,26 +2011,45 @@ static void refill_stock(struct mem_cgroup *mem, unsigned int nr_pages) * expects some charges will be back to res_counter later but cannot wait for * it. */ -static void drain_all_stock_async(void) +static void drain_all_stock_async(struct mem_cgroup *root_mem) { - int cpu; - /* This function is for scheduling "drain" in asynchronous way. - * The result of "drain" is not directly handled by callers. Then, - * if someone is calling drain, we don't have to call drain more. - * Anyway, WORK_STRUCT_PENDING check in queue_work_on() will catch if - * there is a race. We just do loose check here. + int cpu, curcpu; + /* + * If someone calls draining, avoid adding more kworker runs. */ - if (atomic_read(&memcg_drain_count)) + if (!mutex_trylock(&percpu_charge_mutex)) return; /* Notify other cpus that system-wide "drain" is running */ - atomic_inc(&memcg_drain_count); get_online_cpus(); + /* + * Get a hint for avoiding draining charges on the current cpu, + * which must be exhausted by our charging. It is not required that + * this be a precise check, so we use raw_smp_processor_id() instead of + * getcpu()/putcpu(). + */ + curcpu = raw_smp_processor_id(); for_each_online_cpu(cpu) { struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu); - schedule_work_on(cpu, &stock->work); + struct mem_cgroup *mem; + + if (cpu == curcpu) + continue; + + mem = stock->cached; + if (!mem) + continue; + if (mem != root_mem) { + if (!root_mem->use_hierarchy) + continue; + /* check whether "mem" is under tree of "root_mem" */ + if (!css_is_ancestor(&mem->css, &root_mem->css)) + continue; + } + if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) + schedule_work_on(cpu, &stock->work); } put_online_cpus(); - atomic_dec(&memcg_drain_count); + mutex_unlock(&percpu_charge_mutex); /* We don't wait for flush_work */ } @@ -2035,9 +2057,9 @@ static void drain_all_stock_async(void) static void drain_all_stock_sync(void) { /* called when force_empty is called */ - atomic_inc(&memcg_drain_count); + mutex_lock(&percpu_charge_mutex); schedule_on_each_cpu(drain_local_stock); - atomic_dec(&memcg_drain_count); + mutex_unlock(&percpu_charge_mutex); } /* -- cgit v1.2.3 From fbc29a25e484be073e7d762c9f7f1d4bf8aecc48 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Wed, 15 Jun 2011 15:08:46 -0700 Subject: memcg: avoid percpu cached charge draining at softlimit Based on Michal Hocko's comment. We are not draining per cpu cached charges during soft limit reclaim because background reclaim doesn't care about charges. It tries to free some memory and charges will not give any. Cached charges might influence only selection of the biggest soft limit offender but as the call is done only after the selection has been already done it makes no change. Signed-off-by: KAMEZAWA Hiroyuki Cc: Daisuke Nishimura Reviewed-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index c39a177bb641..cf7d027a8844 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1670,7 +1670,13 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, victim = mem_cgroup_select_victim(root_mem); if (victim == root_mem) { loop++; - if (loop >= 1) + /* + * We are not draining per cpu cached charges during + * soft limit reclaim because global reclaim doesn't + * care about charges. It tries to free some memory and + * charges will not give any. + */ + if (!check_soft && loop >= 1) drain_all_stock_async(root_mem); if (loop >= 2) { /* -- cgit v1.2.3