From 8f89140ae41ccd9c63344e6823faa862aa7435e3 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 28 Jun 2013 16:24:10 -0700 Subject: cgroup: minor updates around cgroup_clear_directory() * Rename it to cgroup_clear_dir() and make it take the pointer to the target cgroup instead of the dentry. This makes the function consistent with its counterpart - cgroup_populate_dir(). * Move cgroup_clear_directory() invocation from cgroup_d_remove_dir() to cgroup_destroy_locked() so that the function doesn't have to determine the cgroup pointer back from the dentry. cgroup_d_remove_dir() now only deals with vfs, which is slightly cleaner. This patch doesn't introduce any functional differences. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index e5583d10a325..09bfa870e698 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -957,15 +957,14 @@ static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft) } /** - * cgroup_clear_directory - selective removal of base and subsystem files - * @dir: directory containing the files + * cgroup_clear_dir - selective removal of base and subsystem files + * @cgrp: target cgroup * @base_files: true if the base files should be removed * @subsys_mask: mask of the subsystem ids whose files should be removed */ -static void cgroup_clear_directory(struct dentry *dir, bool base_files, - unsigned long subsys_mask) +static void cgroup_clear_dir(struct cgroup *cgrp, bool base_files, + unsigned long subsys_mask) { - struct cgroup *cgrp = __d_cgrp(dir); struct cgroup_subsys *ss; for_each_root_subsys(cgrp->root, ss) { @@ -987,9 +986,6 @@ static void cgroup_clear_directory(struct dentry *dir, bool base_files, static void cgroup_d_remove_dir(struct dentry *dentry) { struct dentry *parent; - struct cgroupfs_root *root = dentry->d_sb->s_fs_info; - - cgroup_clear_directory(dentry, 
true, root->subsys_mask); parent = dentry->d_parent; spin_lock(&parent->d_lock); @@ -1376,7 +1372,7 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data) * this before rebind_subsystems, since rebind_subsystems may * change this hierarchy's subsys_list. */ - cgroup_clear_directory(cgrp->dentry, false, removed_mask); + cgroup_clear_dir(cgrp, false, removed_mask); ret = rebind_subsystems(root, added_mask, removed_mask); if (ret) { @@ -4541,9 +4537,10 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) raw_spin_unlock(&release_list_lock); /* - * Remove @cgrp directory. The removal puts the base ref but we - * aren't quite done with @cgrp yet, so hold onto it. + * Clear and remove @cgrp directory. The removal puts the base ref + * but we aren't quite done with @cgrp yet, so hold onto it. */ + cgroup_clear_dir(cgrp, true, cgrp->root->subsys_mask); dget(d); cgroup_d_remove_dir(d); -- cgit v1.2.1 From b1f28d3109349899e87377e89f9d8ab5bc95ec57 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 28 Jun 2013 16:24:10 -0700 Subject: cgroup: fix error path of cgroup_addrm_files() cgroup_addrm_files() mishandled error return value from cgroup_add_file() and returns error iff the last file fails to create. As we're in the process of cleaning up file add/rm error handling and will reliably propagate file creation failures, there's no point in keeping adding files after a failure. Replace the broken error collection logic with immediate error return. While at it, add lockdep assertions and function comment. 
Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 28 ++++++++++++++++++++++------ 1 file changed, 22 insertions(+), 6 deletions(-) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 09bfa870e698..9b16d75bec63 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2780,11 +2780,26 @@ out: return error; } +/** + * cgroup_addrm_files - add or remove files to a cgroup directory + * @cgrp: the target cgroup + * @subsys: the subsystem of files to be added + * @cfts: array of cftypes to be added + * @is_add: whether to add or remove + * + * Depending on @is_add, add or remove files defined by @cfts on @cgrp. + * All @cfts should belong to @subsys. For removals, this function never + * fails. If addition fails, this function doesn't remove files already + * added. The caller is responsible for cleaning up. + */ static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys, struct cftype cfts[], bool is_add) { struct cftype *cft; - int err, ret = 0; + int ret; + + lockdep_assert_held(&cgrp->dentry->d_inode->i_mutex); + lockdep_assert_held(&cgroup_mutex); for (cft = cfts; cft->name[0] != '\0'; cft++) { /* does cft->flags tell us to skip this file on @cgrp? 
*/ @@ -2796,16 +2811,17 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys, continue; if (is_add) { - err = cgroup_add_file(cgrp, subsys, cft); - if (err) + ret = cgroup_add_file(cgrp, subsys, cft); + if (ret) { pr_warn("cgroup_addrm_files: failed to add %s, err=%d\n", - cft->name, err); - ret = err; + cft->name, ret); + return ret; + } } else { cgroup_rm_file(cgrp, cft); } } - return ret; + return 0; } static void cgroup_cfts_prepare(void) -- cgit v1.2.1 From 9ccece80ae19ed42439fc0ced76858f189cd41e8 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 28 Jun 2013 16:24:11 -0700 Subject: cgroup: fix cgroup_add_cftypes() error handling cgroup_add_cftypes() uses cgroup_cfts_commit() to actually create the files; however, both functions ignore actual file creation errors and just assume success. This can lead to, for example, blkio hierarchy with some of the cgroups with only subset of interface files populated after cfq-iosched is loaded under heavy memory pressure, which is nasty. This patch updates cgroup_cfts_commit() and cgroup_add_cftypes() to guarantee that all files are created on success and no file is created on failure. 
Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 9b16d75bec63..36c0ccc921f4 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2836,8 +2836,8 @@ static void cgroup_cfts_prepare(void) mutex_lock(&cgroup_mutex); } -static void cgroup_cfts_commit(struct cgroup_subsys *ss, - struct cftype *cfts, bool is_add) +static int cgroup_cfts_commit(struct cgroup_subsys *ss, + struct cftype *cfts, bool is_add) __releases(&cgroup_mutex) { LIST_HEAD(pending); @@ -2846,12 +2846,13 @@ static void cgroup_cfts_commit(struct cgroup_subsys *ss, struct dentry *prev = NULL; struct inode *inode; u64 update_before; + int ret = 0; /* %NULL @cfts indicates abort and don't bother if @ss isn't attached */ if (!cfts || ss->root == &cgroup_dummy_root || !atomic_inc_not_zero(&sb->s_active)) { mutex_unlock(&cgroup_mutex); - return; + return 0; } /* @@ -2867,10 +2868,13 @@ static void cgroup_cfts_commit(struct cgroup_subsys *ss, inode = root->dentry->d_inode; mutex_lock(&inode->i_mutex); mutex_lock(&cgroup_mutex); - cgroup_addrm_files(root, ss, cfts, is_add); + ret = cgroup_addrm_files(root, ss, cfts, is_add); mutex_unlock(&cgroup_mutex); mutex_unlock(&inode->i_mutex); + if (ret) + goto out_deact; + /* add/rm files for all cgroups created before */ rcu_read_lock(); cgroup_for_each_descendant_pre(cgrp, root) { @@ -2887,15 +2891,19 @@ static void cgroup_cfts_commit(struct cgroup_subsys *ss, mutex_lock(&inode->i_mutex); mutex_lock(&cgroup_mutex); if (cgrp->serial_nr < update_before && !cgroup_is_dead(cgrp)) - cgroup_addrm_files(cgrp, ss, cfts, is_add); + ret = cgroup_addrm_files(cgrp, ss, cfts, is_add); mutex_unlock(&cgroup_mutex); mutex_unlock(&inode->i_mutex); rcu_read_lock(); + if (ret) + break; } rcu_read_unlock(); dput(prev); +out_deact: deactivate_super(sb); + return ret; } /** @@ -2915,6 +2923,7 @@ static void 
cgroup_cfts_commit(struct cgroup_subsys *ss, int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) { struct cftype_set *set; + int ret; set = kzalloc(sizeof(*set), GFP_KERNEL); if (!set) @@ -2923,9 +2932,10 @@ int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) cgroup_cfts_prepare(); set->cfts = cfts; list_add_tail(&set->node, &ss->cftsets); - cgroup_cfts_commit(ss, cfts, true); - - return 0; + ret = cgroup_cfts_commit(ss, cfts, true); + if (ret) + cgroup_rm_cftypes(ss, cfts); + return ret; } EXPORT_SYMBOL_GPL(cgroup_add_cftypes); -- cgit v1.2.1 From 628f7cd47ab758cae0353d1a6decf3d1459dca24 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 28 Jun 2013 16:24:11 -0700 Subject: cgroup: separate out cgroup_base_files[] handling out of cgroup_populate/clear_dir() cgroup_populate/clear_dir() currently take @base_files and adds and removes, respectively, cgroup_base_files[] to the directory. File additions and removals are being reorganized for proper error handling and more dynamic handling for the unified hierarchy, and mixing base and subsys file handling into the same functions gets a bit confusing. This patch moves base file handling out of cgroup_populate/clear_dir() into their users - cgroup_mount(), cgroup_create() and cgroup_destroy_locked(). Note that this changes the behavior of base file removal. If @base_files is %true, cgroup_clear_dir() used to delete files regardless of cftype until there's no files left. Now, only files with matching cfts are removed. As files can only be created by the base or registered cftypes, this shouldn't result in any behavior difference. 
Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 46 +++++++++++++++++++--------------------------- 1 file changed, 19 insertions(+), 27 deletions(-) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 36c0ccc921f4..9835a097f3c0 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -215,6 +215,8 @@ static u64 cgroup_serial_nr_next = 1; */ static int need_forkexit_callback __read_mostly; +static struct cftype cgroup_base_files[]; + static void cgroup_offline_fn(struct work_struct *work); static int cgroup_destroy_locked(struct cgroup *cgrp); static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys, @@ -804,8 +806,7 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task, static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode); static struct dentry *cgroup_lookup(struct inode *, struct dentry *, unsigned int); static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry); -static int cgroup_populate_dir(struct cgroup *cgrp, bool base_files, - unsigned long subsys_mask); +static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask); static const struct inode_operations cgroup_dir_inode_operations; static const struct file_operations proc_cgroupstats_operations; @@ -957,13 +958,11 @@ static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft) } /** - * cgroup_clear_dir - selective removal of base and subsystem files + * cgroup_clear_dir - remove subsys files in a cgroup directory * @cgrp: target cgroup - * @base_files: true if the base files should be removed * @subsys_mask: mask of the subsystem ids whose files should be removed */ -static void cgroup_clear_dir(struct cgroup *cgrp, bool base_files, - unsigned long subsys_mask) +static void cgroup_clear_dir(struct cgroup *cgrp, unsigned long subsys_mask) { struct cgroup_subsys *ss; @@ -974,10 +973,6 @@ static void cgroup_clear_dir(struct cgroup *cgrp, bool base_files, 
list_for_each_entry(set, &ss->cftsets, node) cgroup_addrm_files(cgrp, NULL, set->cfts, false); } - if (base_files) { - while (!list_empty(&cgrp->files)) - cgroup_rm_file(cgrp, NULL); - } } /* @@ -1372,17 +1367,17 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data) * this before rebind_subsystems, since rebind_subsystems may * change this hierarchy's subsys_list. */ - cgroup_clear_dir(cgrp, false, removed_mask); + cgroup_clear_dir(cgrp, removed_mask); ret = rebind_subsystems(root, added_mask, removed_mask); if (ret) { /* rebind_subsystems failed, re-populate the removed files */ - cgroup_populate_dir(cgrp, false, removed_mask); + cgroup_populate_dir(cgrp, removed_mask); goto out_unlock; } /* re-populate subsystem files */ - cgroup_populate_dir(cgrp, false, added_mask); + cgroup_populate_dir(cgrp, added_mask); if (opts.release_agent) strcpy(root->release_agent_path, opts.release_agent); @@ -1687,7 +1682,8 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, BUG_ON(root->number_of_cgroups != 1); cred = override_creds(&init_cred); - cgroup_populate_dir(root_cgrp, true, root->subsys_mask); + cgroup_addrm_files(root_cgrp, NULL, cgroup_base_files, true); + cgroup_populate_dir(root_cgrp, root->subsys_mask); revert_creds(cred); mutex_unlock(&cgroup_root_mutex); mutex_unlock(&cgroup_mutex); @@ -4172,23 +4168,14 @@ static struct cftype cgroup_base_files[] = { }; /** - * cgroup_populate_dir - selectively creation of files in a directory + * cgroup_populate_dir - create subsys files in a cgroup directory * @cgrp: target cgroup - * @base_files: true if the base files should be added * @subsys_mask: mask of the subsystem ids whose files should be added */ -static int cgroup_populate_dir(struct cgroup *cgrp, bool base_files, - unsigned long subsys_mask) +static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask) { - int err; struct cgroup_subsys *ss; - if (base_files) { - err = cgroup_addrm_files(cgrp, NULL, 
cgroup_base_files, true); - if (err < 0) - return err; - } - /* process cftsets of each subsystem */ for_each_root_subsys(cgrp->root, ss) { struct cftype_set *set; @@ -4410,7 +4397,11 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, } } - err = cgroup_populate_dir(cgrp, true, root->subsys_mask); + err = cgroup_addrm_files(cgrp, NULL, cgroup_base_files, true); + if (err) + goto err_destroy; + + err = cgroup_populate_dir(cgrp, root->subsys_mask); if (err) goto err_destroy; @@ -4566,7 +4557,8 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) * Clear and remove @cgrp directory. The removal puts the base ref * but we aren't quite done with @cgrp yet, so hold onto it. */ - cgroup_clear_dir(cgrp, true, cgrp->root->subsys_mask); + cgroup_clear_dir(cgrp, cgrp->root->subsys_mask); + cgroup_addrm_files(cgrp, NULL, cgroup_base_files, false); dget(d); cgroup_d_remove_dir(d); -- cgit v1.2.1 From bee550994f6b0c1179bd3ccea58dc5c2c4ccf842 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 28 Jun 2013 16:24:11 -0700 Subject: cgroup: update error handling in cgroup_populate_dir() cgroup_populate_dir() didn't use to check whether the actual file creations were successful and could return success with only subset of the requested files created, which is nasty. This patch updates cgroup_populate_dir() so that it either succeeds with all files or fails with no file. v2: The original patch also converted for_each_root_subsys() usages to for_each_subsys() without explaining why. That part has been moved to a separate patch. 
Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 9835a097f3c0..6b7324431b99 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4171,10 +4171,13 @@ static struct cftype cgroup_base_files[] = { * cgroup_populate_dir - create subsys files in a cgroup directory * @cgrp: target cgroup * @subsys_mask: mask of the subsystem ids whose files should be added + * + * On failure, no file is added. */ static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask) { struct cgroup_subsys *ss; + int ret = 0; /* process cftsets of each subsystem */ for_each_root_subsys(cgrp->root, ss) { @@ -4182,8 +4185,11 @@ static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask) if (!test_bit(ss->subsys_id, &subsys_mask)) continue; - list_for_each_entry(set, &ss->cftsets, node) - cgroup_addrm_files(cgrp, ss, set->cfts, true); + list_for_each_entry(set, &ss->cftsets, node) { + ret = cgroup_addrm_files(cgrp, ss, set->cfts, true); + if (ret < 0) + goto err; + } } /* This cgroup is ready now */ @@ -4201,6 +4207,9 @@ static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask) } return 0; +err: + cgroup_clear_dir(cgrp, subsys_mask); + return ret; } static void css_dput_fn(struct work_struct *work) -- cgit v1.2.1 From b420ba7db15659253d4f286a0ba479d336371999 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 12 Jul 2013 12:34:02 -0700 Subject: cgroup: use for_each_subsys() instead of for_each_root_subsys() in cgroup_populate/clear_dir() rebind_subsystems() will be updated to handle file creations and removals with proper error handling and to do that will need to perform file operations before actually adding the subsystem to the hierarchy. 
To enable such usage, update cgroup_populate/clear_dir() to use for_each_subsys() instead of for_each_root_subsys() so that they operate on all subsystems specified by @subsys_mask whether that subsystem is currently bound to the hierarchy or not. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 6b7324431b99..8f70dc0c0c79 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -965,10 +965,12 @@ static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft) static void cgroup_clear_dir(struct cgroup *cgrp, unsigned long subsys_mask) { struct cgroup_subsys *ss; + int i; - for_each_root_subsys(cgrp->root, ss) { + for_each_subsys(ss, i) { struct cftype_set *set; - if (!test_bit(ss->subsys_id, &subsys_mask)) + + if (!test_bit(i, &subsys_mask)) continue; list_for_each_entry(set, &ss->cftsets, node) cgroup_addrm_files(cgrp, NULL, set->cfts, false); @@ -4177,12 +4179,13 @@ static struct cftype cgroup_base_files[] = { static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask) { struct cgroup_subsys *ss; - int ret = 0; + int i, ret = 0; /* process cftsets of each subsystem */ - for_each_root_subsys(cgrp->root, ss) { + for_each_subsys(ss, i) { struct cftype_set *set; - if (!test_bit(ss->subsys_id, &subsys_mask)) + + if (!test_bit(i, &subsys_mask)) continue; list_for_each_entry(set, &ss->cftsets, node) { -- cgit v1.2.1 From 3126121fb30941552b1a806c7c2e686bde57e270 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 28 Jun 2013 17:07:30 -0700 Subject: cgroup: make rebind_subsystems() handle file additions and removals with proper error handling Currently, creating and removing cgroup files in the root directory are handled separately from the actual subsystem binding and unbinding which happens in rebind_subsystems(). 
Also, rebind_subsystems() users aren't handling file creation errors properly. Let's integrate top_cgroup file handling into rebind_subsystems() so that it's simpler to use and everyone handles file creation errors correctly. * On a successful return, rebind_subsystems() is guaranteed to have created all files of the new subsystems and deleted the ones belonging to the removed subsystems. After a failure, no file is created or removed. * cgroup_remount() no longer needs to make explicit populate/clear calls as it's all handled by rebind_subsystems(), and it gets proper error handling automatically. * cgroup_mount() has been updated such that the root dentry and cgroup are linked before rebind_subsystems(). Also, the init_cred dancing and base file handling are moved right above rebind_subsystems() call and proper error handling for the base files is added. While at it, add a comment explaining what's going on with the cred thing. * cgroup_kill_sb() calls rebind_subsystems() to unbind all subsystems which now implies removing all subsystem files which requires the directory's i_mutex. Grab it. This means that files on the root cgroup are removed earlier - they used to be deleted from generic super_block cleanup from vfs. This doesn't lead to any functional difference and it's cleaner to do the clean up explicitly for all files. Combined with the previous changes, this makes all cgroup file creation errors handled correctly. v2: Added comment on init_cred. v3: Li spotted that cgroup_mount() wasn't freeing tmp_links after base file addition failure. Fix it by adding free_tmp_links error handling label. v4: v3 introduced build bugs which got noticed by Fengguang's awesome kbuild test robot. Fixed, and shame on me. 
Signed-off-by: Tejun Heo Acked-by: Li Zefan Cc: Fengguang Wu --- kernel/cgroup.c | 73 ++++++++++++++++++++++++++++++++------------------------- 1 file changed, 41 insertions(+), 32 deletions(-) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 8f70dc0c0c79..4ec8d2da94d1 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1003,7 +1003,7 @@ static int rebind_subsystems(struct cgroupfs_root *root, { struct cgroup *cgrp = &root->top_cgroup; struct cgroup_subsys *ss; - int i; + int i, ret; BUG_ON(!mutex_is_locked(&cgroup_mutex)); BUG_ON(!mutex_is_locked(&cgroup_root_mutex)); @@ -1028,7 +1028,16 @@ static int rebind_subsystems(struct cgroupfs_root *root, if (root->number_of_cgroups > 1) return -EBUSY; - /* Process each subsystem */ + ret = cgroup_populate_dir(cgrp, added_mask); + if (ret) + return ret; + + /* + * Nothing can fail from this point on. Remove files for the + * removed subsystems and rebind each subsystem. + */ + cgroup_clear_dir(cgrp, removed_mask); + for_each_subsys(ss, i) { unsigned long bit = 1UL << i; @@ -1364,22 +1373,9 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data) goto out_unlock; } - /* - * Clear out the files of subsystems that should be removed, do - * this before rebind_subsystems, since rebind_subsystems may - * change this hierarchy's subsys_list. 
- */ - cgroup_clear_dir(cgrp, removed_mask); - ret = rebind_subsystems(root, added_mask, removed_mask); - if (ret) { - /* rebind_subsystems failed, re-populate the removed files */ - cgroup_populate_dir(cgrp, removed_mask); + if (ret) goto out_unlock; - } - - /* re-populate subsystem files */ - cgroup_populate_dir(cgrp, added_mask); if (opts.release_agent) strcpy(root->release_agent_path, opts.release_agent); @@ -1578,7 +1574,9 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, int ret = 0; struct super_block *sb; struct cgroupfs_root *new_root; + struct list_head tmp_links; struct inode *inode; + const struct cred *cred; /* First find the desired set of subsystems */ mutex_lock(&cgroup_mutex); @@ -1610,10 +1608,8 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, BUG_ON(!root); if (root == opts.new_root) { /* We used the new root structure, so this is a new hierarchy */ - struct list_head tmp_links; struct cgroup *root_cgrp = &root->top_cgroup; struct cgroupfs_root *existing_root; - const struct cred *cred; int i; struct css_set *cset; @@ -1651,26 +1647,37 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, if (ret) goto unlock_drop; + sb->s_root->d_fsdata = root_cgrp; + root_cgrp->dentry = sb->s_root; + + /* + * We're inside get_sb() and will call lookup_one_len() to + * create the root files, which doesn't work if SELinux is + * in use. The following cred dancing somehow works around + * it. See 2ce9738ba ("cgroupfs: use init_cred when + * populating new cgroupfs mount") for more details. 
+ */ + cred = override_creds(&init_cred); + + ret = cgroup_addrm_files(root_cgrp, NULL, cgroup_base_files, true); + if (ret) + goto rm_base_files; + ret = rebind_subsystems(root, root->subsys_mask, 0); - if (ret == -EBUSY) { - free_cgrp_cset_links(&tmp_links); - goto unlock_drop; - } + if (ret) + goto rm_base_files; + + revert_creds(cred); + /* * There must be no failure case after here, since rebinding * takes care of subsystems' refcounts, which are explicitly * dropped in the failure exit path. */ - /* EBUSY should be the only error here */ - BUG_ON(ret); - list_add(&root->root_list, &cgroup_roots); cgroup_root_count++; - sb->s_root->d_fsdata = root_cgrp; - root->top_cgroup.dentry = sb->s_root; - /* Link the top cgroup in this hierarchy into all * the css_set objects */ write_lock(&css_set_lock); @@ -1683,10 +1690,6 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, BUG_ON(!list_empty(&root_cgrp->children)); BUG_ON(root->number_of_cgroups != 1); - cred = override_creds(&init_cred); - cgroup_addrm_files(root_cgrp, NULL, cgroup_base_files, true); - cgroup_populate_dir(root_cgrp, root->subsys_mask); - revert_creds(cred); mutex_unlock(&cgroup_root_mutex); mutex_unlock(&cgroup_mutex); mutex_unlock(&inode->i_mutex); @@ -1715,6 +1718,10 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, kfree(opts.name); return dget(sb->s_root); + rm_base_files: + free_cgrp_cset_links(&tmp_links); + cgroup_addrm_files(&root->top_cgroup, NULL, cgroup_base_files, false); + revert_creds(cred); unlock_drop: cgroup_exit_root_id(root); mutex_unlock(&cgroup_root_mutex); @@ -1741,6 +1748,7 @@ static void cgroup_kill_sb(struct super_block *sb) { BUG_ON(root->number_of_cgroups != 1); BUG_ON(!list_empty(&cgrp->children)); + mutex_lock(&cgrp->dentry->d_inode->i_mutex); mutex_lock(&cgroup_mutex); mutex_lock(&cgroup_root_mutex); @@ -1773,6 +1781,7 @@ static void cgroup_kill_sb(struct super_block *sb) { mutex_unlock(&cgroup_root_mutex); 
mutex_unlock(&cgroup_mutex); + mutex_unlock(&cgrp->dentry->d_inode->i_mutex); simple_xattrs_free(&cgrp->xattrs); -- cgit v1.2.1 From f172e67cf9d842bc646d0f66792e38435a334b1e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 28 Jun 2013 17:07:30 -0700 Subject: cgroup: move number_of_cgroups test out of rebind_subsystems() into cgroup_remount() rebind_subsystems() currently fails if the hierarchy has any !root cgroups; however, on the planned unified hierarchy, rebind_subsystems() will be used while populated. Move the test to cgroup_remount(), which is the only place the test is necessary anyway. As it's impossible for the other two callers of rebind_subsystems() to have populated hierarchy, this doesn't make any behavior changes. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 4ec8d2da94d1..c108d3d1ea30 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1021,13 +1021,6 @@ static int rebind_subsystems(struct cgroupfs_root *root, } } - /* Currently we don't handle adding/removing subsystems when - * any child cgroups exist. 
This is theoretically supportable - * but involves complex error handling, so it's being left until - * later */ - if (root->number_of_cgroups > 1) - return -EBUSY; - ret = cgroup_populate_dir(cgrp, added_mask); if (ret) return ret; @@ -1373,6 +1366,12 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data) goto out_unlock; } + /* remounting is not allowed for populated hierarchies */ + if (root->number_of_cgroups > 1) { + ret = -EBUSY; + goto out_unlock; + } + ret = rebind_subsystems(root, added_mask, removed_mask); if (ret) goto out_unlock; -- cgit v1.2.1 From 1d5be6b287c8efc879fbe578e2b7bc8f7a38f313 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 12 Jul 2013 13:38:17 -0700 Subject: cgroup: move module ref handling into rebind_subsystems() Module ref handling in cgroup is rather weird. parse_cgroupfs_options() grabs all the modules for the specified subsystems. A module ref is kept if the specified subsystem is newly bound to the hierarchy. If not, or the operation fails, the refs are dropped. This scatters module ref handling across multiple functions making it difficult to track. It also makes the function nasty to use for dynamic subsystem binding which is necessary for the planned unified hierarchy. There's nothing which requires the subsystem modules to be pinned between parse_cgroupfs_options() and rebind_subsystems() in both mount and remount paths. parse_cgroupfs_options() can just parse and rebind_subsystems() can handle pinning the subsystems that it wants to bind, which is a natural part of its task - binding - anyway. Move module ref handling into rebind_subsystems() which makes the code a lot simpler - modules are gotten iff it's gonna be bound and put iff unbound or binding fails. v2: Li pointed out that if a controller module is unloaded between parsing and binding, rebind_subsystems() won't notice the missing controller as it only iterates through existing controllers. 
Fix it by updating rebind_subsystems() to compare @added_mask to @pinned and fail with -ENOENT if they don't match. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 93 +++++++++++++++++---------------------------------------- 1 file changed, 28 insertions(+), 65 deletions(-) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index c108d3d1ea30..2a8cf1a7d2f4 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1003,6 +1003,7 @@ static int rebind_subsystems(struct cgroupfs_root *root, { struct cgroup *cgrp = &root->top_cgroup; struct cgroup_subsys *ss; + unsigned long pinned = 0; int i, ret; BUG_ON(!mutex_is_locked(&cgroup_mutex)); @@ -1010,20 +1011,32 @@ static int rebind_subsystems(struct cgroupfs_root *root, /* Check that any added subsystems are currently free */ for_each_subsys(ss, i) { - unsigned long bit = 1UL << i; - - if (!(bit & added_mask)) + if (!(added_mask & (1 << i))) continue; + /* is the subsystem mounted elsewhere? */ if (ss->root != &cgroup_dummy_root) { - /* Subsystem isn't free */ - return -EBUSY; + ret = -EBUSY; + goto out_put; + } + + /* pin the module */ + if (!try_module_get(ss->module)) { + ret = -ENOENT; + goto out_put; } + pinned |= 1 << i; + } + + /* subsys could be missing if unloaded between parsing and here */ + if (added_mask != pinned) { + ret = -ENOENT; + goto out_put; } ret = cgroup_populate_dir(cgrp, added_mask); if (ret) - return ret; + goto out_put; /* * Nothing can fail from this point on. Remove files for the @@ -1067,11 +1080,6 @@ static int rebind_subsystems(struct cgroupfs_root *root, } else if (bit & root->subsys_mask) { /* Subsystem state should already exist */ BUG_ON(!cgrp->subsys[i]); - /* - * a refcount was taken, but we already had one, so - * drop the extra reference. 
- */ - module_put(ss->module); #ifdef CONFIG_MODULE_UNLOAD BUG_ON(ss->module && !module_refcount(ss->module)); #endif @@ -1088,6 +1096,12 @@ static int rebind_subsystems(struct cgroupfs_root *root, root->flags |= CGRP_ROOT_SUBSYS_BOUND; return 0; + +out_put: + for_each_subsys(ss, i) + if (pinned & (1 << i)) + module_put(ss->module); + return ret; } static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry) @@ -1138,7 +1152,6 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) char *token, *o = data; bool all_ss = false, one_ss = false; unsigned long mask = (unsigned long)-1; - bool module_pin_failed = false; struct cgroup_subsys *ss; int i; @@ -1281,52 +1294,9 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) if (!opts->subsys_mask && !opts->name) return -EINVAL; - /* - * Grab references on all the modules we'll need, so the subsystems - * don't dance around before rebind_subsystems attaches them. This may - * take duplicate reference counts on a subsystem that's already used, - * but rebind_subsystems handles this case. - */ - for_each_subsys(ss, i) { - if (!(opts->subsys_mask & (1UL << i))) - continue; - if (!try_module_get(cgroup_subsys[i]->module)) { - module_pin_failed = true; - break; - } - } - if (module_pin_failed) { - /* - * oops, one of the modules was going away. this means that we - * raced with a module_delete call, and to the user this is - * essentially a "subsystem doesn't exist" case. 
- */ - for (i--; i >= 0; i--) { - /* drop refcounts only on the ones we took */ - unsigned long bit = 1UL << i; - - if (!(bit & opts->subsys_mask)) - continue; - module_put(cgroup_subsys[i]->module); - } - return -ENOENT; - } - return 0; } -static void drop_parsed_module_refcounts(unsigned long subsys_mask) -{ - struct cgroup_subsys *ss; - int i; - - mutex_lock(&cgroup_mutex); - for_each_subsys(ss, i) - if (subsys_mask & (1UL << i)) - module_put(cgroup_subsys[i]->module); - mutex_unlock(&cgroup_mutex); -} - static int cgroup_remount(struct super_block *sb, int *flags, char *data) { int ret = 0; @@ -1384,8 +1354,6 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data) mutex_unlock(&cgroup_root_mutex); mutex_unlock(&cgroup_mutex); mutex_unlock(&cgrp->dentry->d_inode->i_mutex); - if (ret) - drop_parsed_module_refcounts(opts.subsys_mask); return ret; } @@ -1591,7 +1559,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, new_root = cgroup_root_from_opts(&opts); if (IS_ERR(new_root)) { ret = PTR_ERR(new_root); - goto drop_modules; + goto out_err; } opts.new_root = new_root; @@ -1600,7 +1568,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, if (IS_ERR(sb)) { ret = PTR_ERR(sb); cgroup_free_root(opts.new_root); - goto drop_modules; + goto out_err; } root = sb->s_fs_info; @@ -1708,9 +1676,6 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, pr_warning("cgroup: new mount options do not match the existing superblock, will be ignored\n"); } } - - /* no subsys rebinding, so refcounts don't change */ - drop_parsed_module_refcounts(opts.subsys_mask); } kfree(opts.release_agent); @@ -1728,8 +1693,6 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, mutex_unlock(&inode->i_mutex); drop_new_super: deactivate_locked_super(sb); - drop_modules: - drop_parsed_module_refcounts(opts.subsys_mask); out_err: kfree(opts.release_agent); kfree(opts.name); @@ -4837,7 +4800,7 @@ void 
cgroup_unload_subsys(struct cgroup_subsys *ss) /* * we shouldn't be called if the subsystem is in use, and the use of - * try_module_get in parse_cgroupfs_options should ensure that it + * try_module_get() in rebind_subsystems() should ensure that it * doesn't start being used while we're killing it off. */ BUG_ON(ss->root != &cgroup_dummy_root); -- cgit v1.2.1 From a698b4488ab98deef6c3beeba3e27fea17650132 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 28 Jun 2013 21:08:27 -0700 Subject: cgroup: remove gratuituous BUG_ON()s from rebind_subsystems() rebind_subsystems() performs sanity checks even on subsystems which aren't specified to be added or removed and the checks aren't all that useful given that these are in a very cold path while the violations they check would trip up in much hotter paths. Let's remove these from rebind_subsystems(). Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 9 --------- 1 file changed, 9 deletions(-) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 2a8cf1a7d2f4..345fac8e4fba 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1077,15 +1077,6 @@ static int rebind_subsystems(struct cgroupfs_root *root, /* subsystem is now free - drop reference on module */ module_put(ss->module); root->subsys_mask &= ~bit; - } else if (bit & root->subsys_mask) { - /* Subsystem state should already exist */ - BUG_ON(!cgrp->subsys[i]); -#ifdef CONFIG_MODULE_UNLOAD - BUG_ON(ss->module && !module_refcount(ss->module)); -#endif - } else { - /* Subsystem state shouldn't exist */ - BUG_ON(cgrp->subsys[i]); } } -- cgit v1.2.1 From 2a4ac63333584b2791986cf2270f5ba9a4b97606 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 31 Jul 2013 16:16:40 +0800 Subject: cgroup: remove sparse tags from offline_css() This should have been removed in commit d7eeac1913ff ("cgroup: hold cgroup_mutex before calling css_offline"). While at it, update the comments.
Signed-off-by: Li Zefan Signed-off-by: Tejun Heo --- kernel/cgroup.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 345fac8e4fba..41b559f51502 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4214,7 +4214,7 @@ static void init_cgroup_css(struct cgroup_subsys_state *css, INIT_WORK(&css->dput_work, css_dput_fn); } -/* invoke ->post_create() on a new CSS and mark it online if successful */ +/* invoke ->css_online() on a new CSS and mark it online if successful */ static int online_css(struct cgroup_subsys *ss, struct cgroup *cgrp) { int ret = 0; @@ -4228,9 +4228,8 @@ static int online_css(struct cgroup_subsys *ss, struct cgroup *cgrp) return ret; } -/* if the CSS is online, invoke ->pre_destory() on it and mark it offline */ +/* if the CSS is online, invoke ->css_offline() on it and mark it offline */ static void offline_css(struct cgroup_subsys *ss, struct cgroup *cgrp) - __releases(&cgroup_mutex) __acquires(&cgroup_mutex) { struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; -- cgit v1.2.1 From e0798ce27346edb8aa369b5b39af5a47fdf2b25c Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 31 Jul 2013 17:36:25 +0800 Subject: cgroup: remove struct cgroup_seqfile_state We can use struct cfent instead. v2: - remove cgroup_seqfile_release(). Signed-off-by: Li Zefan Signed-off-by: Tejun Heo --- kernel/cgroup.c | 45 +++++++++++++-------------------------------- 1 file changed, 13 insertions(+), 32 deletions(-) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 41b559f51502..ed2104304833 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2397,11 +2397,6 @@ static ssize_t cgroup_file_read(struct file *file, char __user *buf, * supports string->u64 maps, but can be extended in future. 
*/ -struct cgroup_seqfile_state { - struct cftype *cft; - struct cgroup *cgroup; -}; - static int cgroup_map_add(struct cgroup_map_cb *cb, const char *key, u64 value) { struct seq_file *sf = cb->state; @@ -2410,59 +2405,45 @@ static int cgroup_map_add(struct cgroup_map_cb *cb, const char *key, u64 value) static int cgroup_seqfile_show(struct seq_file *m, void *arg) { - struct cgroup_seqfile_state *state = m->private; - struct cftype *cft = state->cft; + struct cfent *cfe = m->private; + struct cftype *cft = cfe->type; + struct cgroup *cgrp = __d_cgrp(cfe->dentry->d_parent); + if (cft->read_map) { struct cgroup_map_cb cb = { .fill = cgroup_map_add, .state = m, }; - return cft->read_map(state->cgroup, cft, &cb); + return cft->read_map(cgrp, cft, &cb); } - return cft->read_seq_string(state->cgroup, cft, m); -} - -static int cgroup_seqfile_release(struct inode *inode, struct file *file) -{ - struct seq_file *seq = file->private_data; - kfree(seq->private); - return single_release(inode, file); + return cft->read_seq_string(cgrp, cft, m); } static const struct file_operations cgroup_seqfile_operations = { .read = seq_read, .write = cgroup_file_write, .llseek = seq_lseek, - .release = cgroup_seqfile_release, + .release = single_release, }; static int cgroup_file_open(struct inode *inode, struct file *file) { int err; + struct cfent *cfe; struct cftype *cft; err = generic_file_open(inode, file); if (err) return err; - cft = __d_cft(file->f_dentry); + cfe = __d_cfe(file->f_dentry); + cft = cfe->type; if (cft->read_map || cft->read_seq_string) { - struct cgroup_seqfile_state *state; - - state = kzalloc(sizeof(*state), GFP_USER); - if (!state) - return -ENOMEM; - - state->cft = cft; - state->cgroup = __d_cgrp(file->f_dentry->d_parent); file->f_op = &cgroup_seqfile_operations; - err = single_open(file, cgroup_seqfile_show, state); - if (err < 0) - kfree(state); - } else if (cft->open) + err = single_open(file, cgroup_seqfile_show, cfe); + } else if (cft->open) { err = 
cft->open(inode, file); - else - err = 0; + } return err; } -- cgit v1.2.1 From 6f4b7e632d78c2d91502211c430722cc66428492 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 31 Jul 2013 16:18:36 +0800 Subject: cgroup: more naming cleanups Constantly use @cset for css_set variables and use @cgrp as cgroup variables. Signed-off-by: Li Zefan Signed-off-by: Tejun Heo --- kernel/cgroup.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index ed2104304833..9577bebe2546 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -466,7 +466,7 @@ static inline void put_css_set_taskexit(struct css_set *cset) * @new_cgrp: cgroup that's being entered by the task * @template: desired set of css pointers in css_set (pre-calculated) * - * Returns true if "cg" matches "old_cg" except for the hierarchy + * Returns true if "cset" matches "old_cset" except for the hierarchy * which "new_cgrp" belongs to, for which it should match "new_cgrp". */ static bool compare_css_sets(struct css_set *cset, @@ -1839,7 +1839,7 @@ EXPORT_SYMBOL_GPL(task_cgroup_path_from_hierarchy); struct task_and_cgroup { struct task_struct *task; struct cgroup *cgrp; - struct css_set *cg; + struct css_set *cset; }; struct cgroup_taskset { @@ -2057,8 +2057,8 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk, tc = flex_array_get(group, i); old_cset = task_css_set(tc->task); - tc->cg = find_css_set(old_cset, cgrp); - if (!tc->cg) { + tc->cset = find_css_set(old_cset, cgrp); + if (!tc->cset) { retval = -ENOMEM; goto out_put_css_set_refs; } @@ -2071,7 +2071,7 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk, */ for (i = 0; i < group_size; i++) { tc = flex_array_get(group, i); - cgroup_task_migrate(tc->cgrp, tc->task, tc->cg); + cgroup_task_migrate(tc->cgrp, tc->task, tc->cset); } /* nothing is sensitive to fork() after this point. 
*/ @@ -2091,9 +2091,9 @@ out_put_css_set_refs: if (retval) { for (i = 0; i < group_size; i++) { tc = flex_array_get(group, i); - if (!tc->cg) + if (!tc->cset) break; - put_css_set(tc->cg); + put_css_set(tc->cset); } } out_cancel_attach: @@ -2203,9 +2203,9 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) mutex_lock(&cgroup_mutex); for_each_active_root(root) { - struct cgroup *from_cg = task_cgroup_from_root(from, root); + struct cgroup *from_cgrp = task_cgroup_from_root(from, root); - retval = cgroup_attach_task(from_cg, tsk, false); + retval = cgroup_attach_task(from_cgrp, tsk, false); if (retval) break; } @@ -3305,8 +3305,8 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan) * guarantees forward progress and that we don't miss any tasks. */ heap->size = 0; - cgroup_iter_start(scan->cg, &it); - while ((p = cgroup_iter_next(scan->cg, &it))) { + cgroup_iter_start(scan->cgrp, &it); + while ((p = cgroup_iter_next(scan->cgrp, &it))) { /* * Only affect tasks that qualify per the caller's callback, * if he provided one @@ -3339,7 +3339,7 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan) * the heap and wasn't inserted */ } - cgroup_iter_end(scan->cg, &it); + cgroup_iter_end(scan->cgrp, &it); if (heap->size) { for (i = 0; i < heap->size; i++) { @@ -3385,7 +3385,7 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) { struct cgroup_scanner scan; - scan.cg = from; + scan.cgrp = from; scan.test_task = NULL; /* select all tasks in cgroup */ scan.process_task = cgroup_transfer_one_task; scan.heap = NULL; -- cgit v1.2.1 From 4e96ee8e981b5140a2bcc5fff0d5c0eef39a62ee Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 31 Jul 2013 09:50:50 +0800 Subject: cgroup: convert cgroup_ida to cgroup_idr This enables us to lookup a cgroup by its id. v4: - add a comment for idr_remove() in cgroup_offline_fn(). v3: - on success, idr_alloc() returns the id but not 0, so fix the BUG_ON() in cgroup_init(). 
- pass the right value to idr_alloc() so that the id for dummy cgroup is 0. Signed-off-by: Li Zefan Reviewed-by: Michal Hocko Signed-off-by: Tejun Heo --- kernel/cgroup.c | 33 +++++++++++++++++++++++++++------ 1 file changed, 27 insertions(+), 6 deletions(-) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 9577bebe2546..3f6593333525 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -866,8 +866,6 @@ static void cgroup_free_fn(struct work_struct *work) */ dput(cgrp->parent->dentry); - ida_simple_remove(&cgrp->root->cgroup_ida, cgrp->id); - /* * Drop the active superblock reference that we took when we * created the cgroup. This will free cgrp->root, if we are @@ -1379,6 +1377,7 @@ static void init_cgroup_root(struct cgroupfs_root *root) cgrp->root = root; RCU_INIT_POINTER(cgrp->name, &root_cgroup_name); init_cgroup_housekeeping(cgrp); + idr_init(&root->cgroup_idr); } static int cgroup_init_root_id(struct cgroupfs_root *root, int start, int end) @@ -1451,7 +1450,6 @@ static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts) */ root->subsys_mask = opts->subsys_mask; root->flags = opts->flags; - ida_init(&root->cgroup_ida); if (opts->release_agent) strcpy(root->release_agent_path, opts->release_agent); if (opts->name) @@ -1467,7 +1465,7 @@ static void cgroup_free_root(struct cgroupfs_root *root) /* hierarhcy ID shoulid already have been released */ WARN_ON_ONCE(root->hierarchy_id); - ida_destroy(&root->cgroup_ida); + idr_destroy(&root->cgroup_idr); kfree(root); } } @@ -1582,6 +1580,11 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, mutex_lock(&cgroup_mutex); mutex_lock(&cgroup_root_mutex); + root_cgrp->id = idr_alloc(&root->cgroup_idr, root_cgrp, + 0, 1, GFP_KERNEL); + if (root_cgrp->id < 0) + goto unlock_drop; + /* Check for name clashes with existing mounts */ ret = -EBUSY; if (strlen(root->name)) @@ -4253,7 +4256,11 @@ static long cgroup_create(struct cgroup *parent, struct dentry 
*dentry, goto err_free_cgrp; rcu_assign_pointer(cgrp->name, name); - cgrp->id = ida_simple_get(&root->cgroup_ida, 1, 0, GFP_KERNEL); + /* + * Temporarily set the pointer to NULL, so idr_find() won't return + * a half-baked cgroup. + */ + cgrp->id = idr_alloc(&root->cgroup_idr, NULL, 1, 0, GFP_KERNEL); if (cgrp->id < 0) goto err_free_name; @@ -4351,6 +4358,8 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, } } + idr_replace(&root->cgroup_idr, cgrp, cgrp->id); + err = cgroup_addrm_files(cgrp, NULL, cgroup_base_files, true); if (err) goto err_destroy; @@ -4377,7 +4386,7 @@ err_free_all: /* Release the reference count that we took on the superblock */ deactivate_super(sb); err_free_id: - ida_simple_remove(&root->cgroup_ida, cgrp->id); + idr_remove(&root->cgroup_idr, cgrp->id); err_free_name: kfree(rcu_dereference_raw(cgrp->name)); err_free_cgrp: @@ -4570,6 +4579,14 @@ static void cgroup_offline_fn(struct work_struct *work) /* delete this cgroup from parent->children */ list_del_rcu(&cgrp->sibling); + /* + * We should remove the cgroup object from idr before its grace + * period starts, so we won't be looking up a cgroup while the + * cgroup is being freed. + */ + idr_remove(&cgrp->root->cgroup_idr, cgrp->id); + cgrp->id = -1; + dput(d); set_bit(CGRP_RELEASABLE, &parent->flags); @@ -4895,6 +4912,10 @@ int __init cgroup_init(void) BUG_ON(cgroup_init_root_id(&cgroup_dummy_root, 0, 1)); + err = idr_alloc(&cgroup_dummy_root.cgroup_idr, cgroup_dummy_top, + 0, 1, GFP_KERNEL); + BUG_ON(err < 0); + mutex_unlock(&cgroup_root_mutex); mutex_unlock(&cgroup_mutex); -- cgit v1.2.1 From 876ede8b2b9880615be0de3ec7b8afd0a1786e76 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 1 Aug 2013 09:51:47 +0800 Subject: cgroup: restructure the failure path in cgroup_write_event_control() It uses a single label and checks the validity of each pointer. This is error-prone, and actually we had a bug because one of the checks was insufficient.
Use multiple labels as we do in other places. v2: - drop initializations of local variables. Signed-off-by: Li Zefan Signed-off-by: Tejun Heo --- kernel/cgroup.c | 39 ++++++++++++++++++--------------------- 1 file changed, 18 insertions(+), 21 deletions(-) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 3f6593333525..9f6dab22289e 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -3934,11 +3934,11 @@ static void cgroup_event_ptable_queue_proc(struct file *file, static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft, const char *buffer) { - struct cgroup_event *event = NULL; + struct cgroup_event *event; struct cgroup *cgrp_cfile; unsigned int efd, cfd; - struct file *efile = NULL; - struct file *cfile = NULL; + struct file *efile; + struct file *cfile; char *endp; int ret; @@ -3964,31 +3964,31 @@ static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft, efile = eventfd_fget(efd); if (IS_ERR(efile)) { ret = PTR_ERR(efile); - goto fail; + goto out_kfree; } event->eventfd = eventfd_ctx_fileget(efile); if (IS_ERR(event->eventfd)) { ret = PTR_ERR(event->eventfd); - goto fail; + goto out_put_efile; } cfile = fget(cfd); if (!cfile) { ret = -EBADF; - goto fail; + goto out_put_eventfd; } /* the process need read permission on control file */ /* AV: shouldn't we check that it's been opened for read instead?
*/ ret = inode_permission(file_inode(cfile), MAY_READ); if (ret < 0) - goto fail; + goto out_put_cfile; event->cft = __file_cft(cfile); if (IS_ERR(event->cft)) { ret = PTR_ERR(event->cft); - goto fail; + goto out_put_cfile; } /* @@ -3998,18 +3998,18 @@ static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft, cgrp_cfile = __d_cgrp(cfile->f_dentry->d_parent); if (cgrp_cfile != cgrp) { ret = -EINVAL; - goto fail; + goto out_put_cfile; } if (!event->cft->register_event || !event->cft->unregister_event) { ret = -EINVAL; - goto fail; + goto out_put_cfile; } ret = event->cft->register_event(cgrp, event->cft, event->eventfd, buffer); if (ret) - goto fail; + goto out_put_cfile; efile->f_op->poll(efile, &event->pt); @@ -4029,16 +4029,13 @@ static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft, return 0; -fail: - if (cfile) - fput(cfile); - - if (event && event->eventfd && !IS_ERR(event->eventfd)) - eventfd_ctx_put(event->eventfd); - - if (!IS_ERR_OR_NULL(efile)) - fput(efile); - +out_put_cfile: + fput(cfile); +out_put_eventfd: + eventfd_ctx_put(event->eventfd); +out_put_efile: + fput(efile); +out_kfree: kfree(event); return ret; -- cgit v1.2.1 From b395890a092d8ecbe54f005179e3dec4b6bf752a Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 1 Aug 2013 09:52:15 +0800 Subject: cgroup: rename cgroup_pidlist->mutex It's a rw_semaphore not a mutex. 
Signed-off-by: Li Zefan Signed-off-by: Tejun Heo --- kernel/cgroup.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 9f6dab22289e..9420662df87e 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -3436,7 +3436,7 @@ struct cgroup_pidlist { /* pointer to the cgroup we belong to, for list removal purposes */ struct cgroup *owner; /* protects the other fields */ - struct rw_semaphore mutex; + struct rw_semaphore rwsem; }; /* @@ -3509,7 +3509,7 @@ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp, struct pid_namespace *ns = task_active_pid_ns(current); /* - * We can't drop the pidlist_mutex before taking the l->mutex in case + * We can't drop the pidlist_mutex before taking the l->rwsem in case * the last ref-holder is trying to remove l from the list at the same * time. Holding the pidlist_mutex precludes somebody taking whichever * list we find out from under us - compare release_pid_array(). 
@@ -3518,7 +3518,7 @@ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp, list_for_each_entry(l, &cgrp->pidlists, links) { if (l->key.type == type && l->key.ns == ns) { /* make sure l doesn't vanish out from under us */ - down_write(&l->mutex); + down_write(&l->rwsem); mutex_unlock(&cgrp->pidlist_mutex); return l; } @@ -3529,8 +3529,8 @@ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp, mutex_unlock(&cgrp->pidlist_mutex); return l; } - init_rwsem(&l->mutex); - down_write(&l->mutex); + init_rwsem(&l->rwsem); + down_write(&l->rwsem); l->key.type = type; l->key.ns = get_pid_ns(ns); l->owner = cgrp; @@ -3591,7 +3591,7 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type, l->list = array; l->length = length; l->use_count++; - up_write(&l->mutex); + up_write(&l->rwsem); *lp = l; return 0; } @@ -3669,7 +3669,7 @@ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos) int index = 0, pid = *pos; int *iter; - down_read(&l->mutex); + down_read(&l->rwsem); if (pid) { int end = l->length; @@ -3696,7 +3696,7 @@ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos) static void cgroup_pidlist_stop(struct seq_file *s, void *v) { struct cgroup_pidlist *l = s->private; - up_read(&l->mutex); + up_read(&l->rwsem); } static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos) @@ -3742,7 +3742,7 @@ static void cgroup_release_pid_array(struct cgroup_pidlist *l) * pidlist_mutex, we have to take pidlist_mutex first. 
*/ mutex_lock(&l->owner->pidlist_mutex); - down_write(&l->mutex); + down_write(&l->rwsem); BUG_ON(!l->use_count); if (!--l->use_count) { /* we're the last user if refcount is 0; remove and free */ @@ -3750,12 +3750,12 @@ static void cgroup_release_pid_array(struct cgroup_pidlist *l) mutex_unlock(&l->owner->pidlist_mutex); pidlist_free(l->list); put_pid_ns(l->key.ns); - up_write(&l->mutex); + up_write(&l->rwsem); kfree(l); return; } mutex_unlock(&l->owner->pidlist_mutex); - up_write(&l->mutex); + up_write(&l->rwsem); } static int cgroup_pidlist_release(struct inode *inode, struct file *file) -- cgit v1.2.1 From 8af01f56a03e9cbd91a55d688fce1315021efba8 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 8 Aug 2013 20:11:22 -0400 Subject: cgroup: s/cgroup_subsys_state/cgroup_css/ s/task_subsys_state/task_css/ The names of the two struct cgroup_subsys_state accessors - cgroup_subsys_state() and task_subsys_state() - are somewhat awkward. The former clashes with the type name and the latter doesn't even indicate it's somehow related to cgroup. We're about to revamp large portion of cgroup API, so, let's rename them so that they're less awkward. Most per-controller usages of the accessors are localized in accessor wrappers and given the amount of scheduled changes, this isn't gonna add any noticeable headache. Rename cgroup_subsys_state() to cgroup_css() and task_subsys_state() to task_css(). This patch is pure rename. 
Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index ae4c46834633..0b3caa3220cb 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -81,7 +81,7 @@ */ #ifdef CONFIG_PROVE_RCU DEFINE_MUTEX(cgroup_mutex); -EXPORT_SYMBOL_GPL(cgroup_mutex); /* only for task_subsys_state_check() */ +EXPORT_SYMBOL_GPL(cgroup_mutex); /* only for lockdep */ #else static DEFINE_MUTEX(cgroup_mutex); #endif -- cgit v1.2.1 From 72c97e54e0f043d33b246d7460ae0a36c4b8c643 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 8 Aug 2013 20:11:22 -0400 Subject: cgroup: add subsystem pointer to cgroup_subsys_state Currently, given a cgroup_subsys_state, there's no way to find out which subsystem the css is for, which we'll need to convert the cgroup controller API to primarily use @css instead of @cgroup. This patch adds cgroup_subsys_state->ss which points to the subsystem the @css belongs to. While at it, remove the comment about accessing @css->cgroup to determine the hierarchy. cgroup core will provide API to traverse hierarchy of css'es and we don't want subsystems to directly walk cgroup hierarchies anymore. 
Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 0b3caa3220cb..4234428f1014 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4186,6 +4186,7 @@ static void init_cgroup_css(struct cgroup_subsys_state *css, struct cgroup *cgrp) { css->cgroup = cgrp; + css->ss = ss; css->flags = 0; css->id = NULL; if (cgrp == cgroup_dummy_top) -- cgit v1.2.1 From eb95419b023abacb415e2a18fea899023ce7624d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 8 Aug 2013 20:11:23 -0400 Subject: cgroup: pass around cgroup_subsys_state instead of cgroup in subsystem methods cgroup is currently in the process of transitioning to using struct cgroup_subsys_state * as the primary handle instead of struct cgroup * in subsystem implementations for the following reasons. * With unified hierarchy, subsystems will be dynamically bound and unbound from cgroups and thus css's (cgroup_subsys_state) may be created and destroyed dynamically over the lifetime of a cgroup, which is different from the current state where all css's are allocated and destroyed together with the associated cgroup. This in turn means that cgroup_css() should be synchronized and may return NULL, making it more cumbersome to use. * Differing levels of per-subsystem granularity in the unified hierarchy means that the task and descendant iterators should behave differently depending on the specific subsystem the iteration is being performed for. * In majority of the cases, subsystems only care about its part in the cgroup hierarchy - ie. the hierarchy of css's. Subsystem methods often obtain the matching css pointer from the cgroup and don't bother with the cgroup pointer itself. Passing around css fits much better. This patch converts all cgroup_subsys methods to take @css instead of @cgroup. The conversions are mostly straight-forward. 
A few noteworthy changes are * ->css_alloc() now takes css of the parent cgroup rather than the pointer to the new cgroup as the css for the new cgroup doesn't exist yet. Knowing the parent css is enough for all the existing subsystems. * In kernel/cgroup.c::offline_css(), unnecessary open coded css dereference is replaced with local variable access. This patch shouldn't cause any behavior differences. v2: Unnecessary explicit cgrp->subsys[] deref in css_online() replaced with local variable @css as suggested by Li Zefan. Rebased on top of new for-3.12 which includes for-3.11-fixes so that ->css_free() invocation added by da0a12caff ("cgroup: fix a leak when percpu_ref_init() fails") is converted too. Suggested by Li Zefan. Signed-off-by: Tejun Heo Acked-by: Li Zefan Acked-by: Michal Hocko Acked-by: Vivek Goyal Acked-by: Aristeu Rozanski Acked-by: Daniel Wagner Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Johannes Weiner Cc: Balbir Singh Cc: Matt Helsley Cc: Jens Axboe Cc: Steven Rostedt --- kernel/cgroup.c | 57 ++++++++++++++++++++++++++++++++++----------------------- 1 file changed, 34 insertions(+), 23 deletions(-) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 4234428f1014..271d9a5cde5f 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -853,8 +853,11 @@ static void cgroup_free_fn(struct work_struct *work) /* * Release the subsystem state objects. 
*/ - for_each_root_subsys(cgrp->root, ss) - ss->css_free(cgrp); + for_each_root_subsys(cgrp->root, ss) { + struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; + + ss->css_free(css); + } cgrp->root->number_of_cgroups--; mutex_unlock(&cgroup_mutex); @@ -1056,7 +1059,7 @@ static int rebind_subsystems(struct cgroupfs_root *root, list_move(&ss->sibling, &root->subsys_list); ss->root = root; if (ss->bind) - ss->bind(cgrp); + ss->bind(cgrp->subsys[i]); /* refcount was already taken, and we're keeping it */ root->subsys_mask |= bit; @@ -1066,7 +1069,7 @@ static int rebind_subsystems(struct cgroupfs_root *root, BUG_ON(cgrp->subsys[i]->cgroup != cgrp); if (ss->bind) - ss->bind(cgroup_dummy_top); + ss->bind(cgroup_dummy_top->subsys[i]); cgroup_dummy_top->subsys[i]->cgroup = cgroup_dummy_top; cgrp->subsys[i] = NULL; cgroup_subsys[i]->root = &cgroup_dummy_root; @@ -2049,8 +2052,10 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk, * step 1: check that we can legitimately attach to the cgroup. */ for_each_root_subsys(root, ss) { + struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; + if (ss->can_attach) { - retval = ss->can_attach(cgrp, &tset); + retval = ss->can_attach(css, &tset); if (retval) { failed_ss = ss; goto out_cancel_attach; @@ -2089,8 +2094,10 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk, * step 4: do subsystem attach callbacks. 
*/ for_each_root_subsys(root, ss) { + struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; + if (ss->attach) - ss->attach(cgrp, &tset); + ss->attach(css, &tset); } /* @@ -2109,10 +2116,12 @@ out_put_css_set_refs: out_cancel_attach: if (retval) { for_each_root_subsys(root, ss) { + struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; + if (ss == failed_ss) break; if (ss->cancel_attach) - ss->cancel_attach(cgrp, &tset); + ss->cancel_attach(css, &tset); } } out_free_group_list: @@ -4206,14 +4215,15 @@ static void init_cgroup_css(struct cgroup_subsys_state *css, /* invoke ->css_online() on a new CSS and mark it online if successful */ static int online_css(struct cgroup_subsys *ss, struct cgroup *cgrp) { + struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; int ret = 0; lockdep_assert_held(&cgroup_mutex); if (ss->css_online) - ret = ss->css_online(cgrp); + ret = ss->css_online(css); if (!ret) - cgrp->subsys[ss->subsys_id]->flags |= CSS_ONLINE; + css->flags |= CSS_ONLINE; return ret; } @@ -4228,9 +4238,9 @@ static void offline_css(struct cgroup_subsys *ss, struct cgroup *cgrp) return; if (ss->css_offline) - ss->css_offline(cgrp); + ss->css_offline(css); - cgrp->subsys[ss->subsys_id]->flags &= ~CSS_ONLINE; + css->flags &= ~CSS_ONLINE; } /* @@ -4305,7 +4315,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, for_each_root_subsys(root, ss) { struct cgroup_subsys_state *css; - css = ss->css_alloc(cgrp); + css = ss->css_alloc(parent->subsys[ss->subsys_id]); if (IS_ERR(css)) { err = PTR_ERR(css); goto err_free_all; @@ -4313,7 +4323,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, err = percpu_ref_init(&css->refcnt, css_release); if (err) { - ss->css_free(cgrp); + ss->css_free(css); goto err_free_all; } @@ -4386,7 +4396,7 @@ err_free_all: if (css) { percpu_ref_cancel_init(&css->refcnt); - ss->css_free(cgrp); + ss->css_free(css); } } mutex_unlock(&cgroup_mutex); @@ -4641,7 +4651,7 @@ static void 
__init cgroup_init_subsys(struct cgroup_subsys *ss) /* Create the top cgroup state for this subsystem */ list_add(&ss->sibling, &cgroup_dummy_root.subsys_list); ss->root = &cgroup_dummy_root; - css = ss->css_alloc(cgroup_dummy_top); + css = ss->css_alloc(cgroup_dummy_top->subsys[ss->subsys_id]); /* We don't handle early failures gracefully */ BUG_ON(IS_ERR(css)); init_cgroup_css(css, ss, cgroup_dummy_top); @@ -4720,7 +4730,7 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) * struct, so this can happen first (i.e. before the dummy root * attachment). */ - css = ss->css_alloc(cgroup_dummy_top); + css = ss->css_alloc(cgroup_dummy_top->subsys[ss->subsys_id]); if (IS_ERR(css)) { /* failure case - need to deassign the cgroup_subsys[] slot. */ cgroup_subsys[ss->subsys_id] = NULL; @@ -4836,7 +4846,7 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss) * the cgrp->subsys pointer to find their state. note that this * also takes care of freeing the css_id. */ - ss->css_free(cgroup_dummy_top); + ss->css_free(cgroup_dummy_top->subsys[ss->subsys_id]); cgroup_dummy_top->subsys[ss->subsys_id] = NULL; mutex_unlock(&cgroup_mutex); @@ -5192,10 +5202,10 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks) */ for_each_builtin_subsys(ss, i) { if (ss->exit) { - struct cgroup *old_cgrp = cset->subsys[i]->cgroup; - struct cgroup *cgrp = task_cgroup(tsk, i); + struct cgroup_subsys_state *old_css = cset->subsys[i]; + struct cgroup_subsys_state *css = task_css(tsk, i); - ss->exit(cgrp, old_cgrp, tsk); + ss->exit(css, old_css, tsk); } } } @@ -5529,7 +5539,8 @@ struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id) } #ifdef CONFIG_CGROUP_DEBUG -static struct cgroup_subsys_state *debug_css_alloc(struct cgroup *cgrp) +static struct cgroup_subsys_state * +debug_css_alloc(struct cgroup_subsys_state *parent_css) { struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL); @@ -5539,9 +5550,9 @@ static struct cgroup_subsys_state 
*debug_css_alloc(struct cgroup *cgrp) return css; } -static void debug_css_free(struct cgroup *cgrp) +static void debug_css_free(struct cgroup_subsys_state *css) { - kfree(cgrp->subsys[debug_subsys_id]); + kfree(css); } static u64 debug_taskcount_read(struct cgroup *cgrp, struct cftype *cft) -- cgit v1.2.1 From 2bb566cb68dfafad328af666ebadf0e49accd6ca Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 8 Aug 2013 20:11:23 -0400 Subject: cgroup: add subsys backlink pointer to cftype cgroup is transitioning to using css (cgroup_subsys_state) instead of cgroup as the primary subsystem handle. The cgroupfs file interface will be converted to use css's which requires finding out the subsystem from cftype so that the matching css can be determined from the cgroup. This patch adds cftype->ss which points to the subsystem the file belongs to. The field is initialized while a cftype is being registered. This makes it unnecessary to explicitly specify the subsystem for other cftype handling functions. @ss argument dropped from various cftype handling functions. This patch shouldn't introduce any behavior differences. 
Signed-off-by: Tejun Heo Acked-by: Li Zefan Acked-by: Vivek Goyal Cc: Jens Axboe --- kernel/cgroup.c | 78 +++++++++++++++++++++++++++++++-------------------------- 1 file changed, 43 insertions(+), 35 deletions(-) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 271d9a5cde5f..c4bc8dac3b1d 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -219,8 +219,8 @@ static struct cftype cgroup_base_files[]; static void cgroup_offline_fn(struct work_struct *work); static int cgroup_destroy_locked(struct cgroup *cgrp); -static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys, - struct cftype cfts[], bool is_add); +static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], + bool is_add); /* convenient tests for these bits */ static inline bool cgroup_is_dead(const struct cgroup *cgrp) @@ -974,7 +974,7 @@ static void cgroup_clear_dir(struct cgroup *cgrp, unsigned long subsys_mask) if (!test_bit(i, &subsys_mask)) continue; list_for_each_entry(set, &ss->cftsets, node) - cgroup_addrm_files(cgrp, NULL, set->cfts, false); + cgroup_addrm_files(cgrp, set->cfts, false); } } @@ -1623,7 +1623,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, */ cred = override_creds(&init_cred); - ret = cgroup_addrm_files(root_cgrp, NULL, cgroup_base_files, true); + ret = cgroup_addrm_files(root_cgrp, cgroup_base_files, true); if (ret) goto rm_base_files; @@ -1681,7 +1681,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, rm_base_files: free_cgrp_cset_links(&tmp_links); - cgroup_addrm_files(&root->top_cgroup, NULL, cgroup_base_files, false); + cgroup_addrm_files(&root->top_cgroup, cgroup_base_files, false); revert_creds(cred); unlock_drop: cgroup_exit_root_id(root); @@ -2694,8 +2694,7 @@ static umode_t cgroup_file_mode(const struct cftype *cft) return mode; } -static int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys, - struct cftype *cft) +static int 
cgroup_add_file(struct cgroup *cgrp, struct cftype *cft) { struct dentry *dir = cgrp->dentry; struct cgroup *parent = __d_cgrp(dir); @@ -2705,8 +2704,8 @@ static int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys, umode_t mode; char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 }; - if (subsys && !(cgrp->root->flags & CGRP_ROOT_NOPREFIX)) { - strcpy(name, subsys->name); + if (cft->ss && !(cgrp->root->flags & CGRP_ROOT_NOPREFIX)) { + strcpy(name, cft->ss->name); strcat(name, "."); } strcat(name, cft->name); @@ -2743,17 +2742,16 @@ out: /** * cgroup_addrm_files - add or remove files to a cgroup directory * @cgrp: the target cgroup - * @subsys: the subsystem of files to be added * @cfts: array of cftypes to be added * @is_add: whether to add or remove * * Depending on @is_add, add or remove files defined by @cfts on @cgrp. - * All @cfts should belong to @subsys. For removals, this function never - * fails. If addition fails, this function doesn't remove files already - * added. The caller is responsible for cleaning up. + * For removals, this function never fails. If addition fails, this + * function doesn't remove files already added. The caller is responsible + * for cleaning up. 
*/ -static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys, - struct cftype cfts[], bool is_add) +static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], + bool is_add) { struct cftype *cft; int ret; @@ -2771,7 +2769,7 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys, continue; if (is_add) { - ret = cgroup_add_file(cgrp, subsys, cft); + ret = cgroup_add_file(cgrp, cft); if (ret) { pr_warn("cgroup_addrm_files: failed to add %s, err=%d\n", cft->name, ret); @@ -2796,11 +2794,11 @@ static void cgroup_cfts_prepare(void) mutex_lock(&cgroup_mutex); } -static int cgroup_cfts_commit(struct cgroup_subsys *ss, - struct cftype *cfts, bool is_add) +static int cgroup_cfts_commit(struct cftype *cfts, bool is_add) __releases(&cgroup_mutex) { LIST_HEAD(pending); + struct cgroup_subsys *ss = cfts[0].ss; struct cgroup *cgrp, *root = &ss->root->top_cgroup; struct super_block *sb = ss->root->sb; struct dentry *prev = NULL; @@ -2828,7 +2826,7 @@ static int cgroup_cfts_commit(struct cgroup_subsys *ss, inode = root->dentry->d_inode; mutex_lock(&inode->i_mutex); mutex_lock(&cgroup_mutex); - ret = cgroup_addrm_files(root, ss, cfts, is_add); + ret = cgroup_addrm_files(root, cfts, is_add); mutex_unlock(&cgroup_mutex); mutex_unlock(&inode->i_mutex); @@ -2851,7 +2849,7 @@ static int cgroup_cfts_commit(struct cgroup_subsys *ss, mutex_lock(&inode->i_mutex); mutex_lock(&cgroup_mutex); if (cgrp->serial_nr < update_before && !cgroup_is_dead(cgrp)) - ret = cgroup_addrm_files(cgrp, ss, cfts, is_add); + ret = cgroup_addrm_files(cgrp, cfts, is_add); mutex_unlock(&cgroup_mutex); mutex_unlock(&inode->i_mutex); @@ -2883,51 +2881,56 @@ out_deact: int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) { struct cftype_set *set; + struct cftype *cft; int ret; set = kzalloc(sizeof(*set), GFP_KERNEL); if (!set) return -ENOMEM; + for (cft = cfts; cft->name[0] != '\0'; cft++) + cft->ss = ss; + cgroup_cfts_prepare(); 
set->cfts = cfts; list_add_tail(&set->node, &ss->cftsets); - ret = cgroup_cfts_commit(ss, cfts, true); + ret = cgroup_cfts_commit(cfts, true); if (ret) - cgroup_rm_cftypes(ss, cfts); + cgroup_rm_cftypes(cfts); return ret; } EXPORT_SYMBOL_GPL(cgroup_add_cftypes); /** * cgroup_rm_cftypes - remove an array of cftypes from a subsystem - * @ss: target cgroup subsystem * @cfts: zero-length name terminated array of cftypes * - * Unregister @cfts from @ss. Files described by @cfts are removed from - * all existing cgroups to which @ss is attached and all future cgroups - * won't have them either. This function can be called anytime whether @ss - * is attached or not. + * Unregister @cfts. Files described by @cfts are removed from all + * existing cgroups and all future cgroups won't have them either. This + * function can be called anytime whether @cfts' subsys is attached or not. * * Returns 0 on successful unregistration, -ENOENT if @cfts is not - * registered with @ss. + * registered. */ -int cgroup_rm_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) +int cgroup_rm_cftypes(struct cftype *cfts) { struct cftype_set *set; + if (!cfts || !cfts[0].ss) + return -ENOENT; + cgroup_cfts_prepare(); - list_for_each_entry(set, &ss->cftsets, node) { + list_for_each_entry(set, &cfts[0].ss->cftsets, node) { if (set->cfts == cfts) { list_del(&set->node); kfree(set); - cgroup_cfts_commit(ss, cfts, false); + cgroup_cfts_commit(cfts, false); return 0; } } - cgroup_cfts_commit(ss, NULL, false); + cgroup_cfts_commit(NULL, false); return -ENOENT; } @@ -4148,7 +4151,7 @@ static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask) continue; list_for_each_entry(set, &ss->cftsets, node) { - ret = cgroup_addrm_files(cgrp, ss, set->cfts, true); + ret = cgroup_addrm_files(cgrp, set->cfts, true); if (ret < 0) goto err; } @@ -4377,7 +4380,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, idr_replace(&root->cgroup_idr, cgrp, cgrp->id); - err = 
cgroup_addrm_files(cgrp, NULL, cgroup_base_files, true); + err = cgroup_addrm_files(cgrp, cgroup_base_files, true); if (err) goto err_destroy; @@ -4538,7 +4541,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) * but we aren't quite done with @cgrp yet, so hold onto it. */ cgroup_clear_dir(cgrp, cgrp->root->subsys_mask); - cgroup_addrm_files(cgrp, NULL, cgroup_base_files, false); + cgroup_addrm_files(cgrp, cgroup_base_files, false); dget(d); cgroup_d_remove_dir(d); @@ -4632,6 +4635,11 @@ static void __init_or_module cgroup_init_cftsets(struct cgroup_subsys *ss) * deregistration. */ if (ss->base_cftypes) { + struct cftype *cft; + + for (cft = ss->base_cftypes; cft->name[0] != '\0'; cft++) + cft->ss = ss; + ss->base_cftset.cfts = ss->base_cftypes; list_add_tail(&ss->base_cftset.node, &ss->cftsets); } -- cgit v1.2.1 From f7d58818ba4249f04a83b73aaac135640050bb4f Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 8 Aug 2013 20:11:23 -0400 Subject: cgroup: pin cgroup_subsys_state when opening a cgroupfs file Previously, each file read/write operation relied on the inode reference count pinning the cgroup and simply checked whether the cgroup was marked dead before proceeding to invoke the per-subsystem callback. This was rather silly as it didn't have any synchronization or css pinning around the check and the cgroup may be removed and all css refs drained between the DEAD check and actual method invocation. This patch pins the css between open() and release() so that it is guaranteed to be alive for all file operations and remove the silly DEAD checks from cgroup_file_read/write(). 
Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 43 ++++++++++++++++++++++++++++++++----------- 1 file changed, 32 insertions(+), 11 deletions(-) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index c4bc8dac3b1d..583f8f66a7e1 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2277,6 +2277,17 @@ static int cgroup_sane_behavior_show(struct cgroup *cgrp, struct cftype *cft, return 0; } +/* return the css for the given cgroup file */ +static struct cgroup_subsys_state *cgroup_file_css(struct cfent *cfe) +{ + struct cftype *cft = cfe->type; + struct cgroup *cgrp = __d_cgrp(cfe->dentry->d_parent); + + if (cft->ss) + return cgrp->subsys[cft->ss->subsys_id]; + return NULL; +} + /* A buffer size big enough for numbers or short strings */ #define CGROUP_LOCAL_BUFFER_SIZE 64 @@ -2354,8 +2365,6 @@ static ssize_t cgroup_file_write(struct file *file, const char __user *buf, struct cftype *cft = __d_cft(file->f_dentry); struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); - if (cgroup_is_dead(cgrp)) - return -ENODEV; if (cft->write) return cft->write(cgrp, cft, file, buf, nbytes, ppos); if (cft->write_u64 || cft->write_s64) @@ -2399,9 +2408,6 @@ static ssize_t cgroup_file_read(struct file *file, char __user *buf, struct cftype *cft = __d_cft(file->f_dentry); struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); - if (cgroup_is_dead(cgrp)) - return -ENODEV; - if (cft->read) return cft->read(cgrp, cft, file, buf, nbytes, ppos); if (cft->read_u64) @@ -2447,15 +2453,22 @@ static const struct file_operations cgroup_seqfile_operations = { static int cgroup_file_open(struct inode *inode, struct file *file) { + struct cfent *cfe = __d_cfe(file->f_dentry); + struct cftype *cft = __d_cft(file->f_dentry); + struct cgroup_subsys_state *css = cgroup_file_css(cfe); int err; - struct cfent *cfe; - struct cftype *cft; err = generic_file_open(inode, file); if (err) return err; - cfe = __d_cfe(file->f_dentry); - cft = cfe->type; + + /* + 
* If the file belongs to a subsystem, pin the css. Will be + * unpinned either on open failure or release. This ensures that + * @css stays alive for all file operations. + */ + if (css && !css_tryget(css)) + return -ENODEV; if (cft->read_map || cft->read_seq_string) { file->f_op = &cgroup_seqfile_operations; @@ -2464,15 +2477,23 @@ static int cgroup_file_open(struct inode *inode, struct file *file) err = cft->open(inode, file); } + if (css && err) + css_put(css); return err; } static int cgroup_file_release(struct inode *inode, struct file *file) { + struct cfent *cfe = __d_cfe(file->f_dentry); struct cftype *cft = __d_cft(file->f_dentry); + struct cgroup_subsys_state *css = cgroup_file_css(cfe); + int ret = 0; + if (cft->release) - return cft->release(inode, file); - return 0; + ret = cft->release(inode, file); + if (css) + css_put(css); + return ret; } /* -- cgit v1.2.1 From 67f4c36f83455b253445b2cb28ac9a2c4f85d99a Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 8 Aug 2013 20:11:24 -0400 Subject: cgroup: add cgroup->dummy_css cgroup subsystem API is being converted to use css (cgroup_subsys_state) as the main handle, which makes things a bit awkward for subsystem agnostic core features - the "cgroup.*" interface files and various iterations - as they don't have a css to use. This patch adds cgroup->dummy_css which has NULL ->ss and whose only role is pointing back to the cgroup. This will be used to support subsystem agnostic features on the coming css based API. css_parent() is updated to handle dummy_css's. Note that css will soon grow its own ->parent field and css_parent() will be made trivial.
Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 583f8f66a7e1..c049992f1ffa 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1365,6 +1365,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp) INIT_LIST_HEAD(&cgrp->release_list); INIT_LIST_HEAD(&cgrp->pidlists); mutex_init(&cgrp->pidlist_mutex); + cgrp->dummy_css.cgroup = cgrp; INIT_LIST_HEAD(&cgrp->event_list); spin_lock_init(&cgrp->event_list_lock); simple_xattrs_init(&cgrp->xattrs); @@ -2285,7 +2286,7 @@ static struct cgroup_subsys_state *cgroup_file_css(struct cfent *cfe) if (cft->ss) return cgrp->subsys[cft->ss->subsys_id]; - return NULL; + return &cgrp->dummy_css; } /* A buffer size big enough for numbers or short strings */ @@ -2467,7 +2468,7 @@ static int cgroup_file_open(struct inode *inode, struct file *file) * unpinned either on open failure or release. This ensures that * @css stays alive for all file operations. */ - if (css && !css_tryget(css)) + if (css->ss && !css_tryget(css)) return -ENODEV; if (cft->read_map || cft->read_seq_string) { @@ -2477,7 +2478,7 @@ static int cgroup_file_open(struct inode *inode, struct file *file) err = cft->open(inode, file); } - if (css && err) + if (css->ss && err) css_put(css); return err; } @@ -2491,7 +2492,7 @@ static int cgroup_file_release(struct inode *inode, struct file *file) if (cft->release) ret = cft->release(inode, file); - if (css) + if (css->ss) css_put(css); return ret; } -- cgit v1.2.1 From 182446d087906de40e514573a92a97b203695f71 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 8 Aug 2013 20:11:24 -0400 Subject: cgroup: pass around cgroup_subsys_state instead of cgroup in file methods cgroup is currently in the process of transitioning to using struct cgroup_subsys_state * as the primary handle instead of struct cgroup. 
Please see the previous commit which converts the subsystem methods for rationale. This patch converts all cftype file operations to take @css instead of @cgroup. cftypes for the cgroup core files don't have their subsystem pointer set. These will automatically use the dummy_css added by the previous patch and can be converted the same way. Most subsystem conversions are straightforward but there are some interesting ones. * freezer: update_if_frozen() is also converted to take @css instead of @cgroup for consistency. This will make the code look simpler too once iterators are converted to use css. * memory/vmpressure: mem_cgroup_from_css() needs to be exported to vmpressure while mem_cgroup_from_cont() can be made static. Updated accordingly. * cpu: cgroup_tg() doesn't have any user left. Removed. * cpuacct: cgroup_ca() doesn't have any user left. Removed. * hugetlb: hugetlb_cgroup_from_cgroup() doesn't have any user left. Removed. * net_cls: cgrp_cls_state() doesn't have any user left. Removed.
Signed-off-by: Tejun Heo Acked-by: Li Zefan Acked-by: Michal Hocko Acked-by: Vivek Goyal Acked-by: Aristeu Rozanski Acked-by: Daniel Wagner Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Johannes Weiner Cc: Balbir Singh Cc: Matt Helsley Cc: Jens Axboe Cc: Steven Rostedt --- kernel/cgroup.c | 162 +++++++++++++++++++++++++++++--------------------------- 1 file changed, 83 insertions(+), 79 deletions(-) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index c049992f1ffa..6ee469837fda 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2235,34 +2235,38 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) } EXPORT_SYMBOL_GPL(cgroup_attach_task_all); -static int cgroup_tasks_write(struct cgroup *cgrp, struct cftype *cft, u64 pid) +static int cgroup_tasks_write(struct cgroup_subsys_state *css, + struct cftype *cft, u64 pid) { - return attach_task_by_pid(cgrp, pid, false); + return attach_task_by_pid(css->cgroup, pid, false); } -static int cgroup_procs_write(struct cgroup *cgrp, struct cftype *cft, u64 tgid) +static int cgroup_procs_write(struct cgroup_subsys_state *css, + struct cftype *cft, u64 tgid) { - return attach_task_by_pid(cgrp, tgid, true); + return attach_task_by_pid(css->cgroup, tgid, true); } -static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft, - const char *buffer) +static int cgroup_release_agent_write(struct cgroup_subsys_state *css, + struct cftype *cft, const char *buffer) { - BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX); + BUILD_BUG_ON(sizeof(css->cgroup->root->release_agent_path) < PATH_MAX); if (strlen(buffer) >= PATH_MAX) return -EINVAL; - if (!cgroup_lock_live_group(cgrp)) + if (!cgroup_lock_live_group(css->cgroup)) return -ENODEV; mutex_lock(&cgroup_root_mutex); - strcpy(cgrp->root->release_agent_path, buffer); + strcpy(css->cgroup->root->release_agent_path, buffer); mutex_unlock(&cgroup_root_mutex); mutex_unlock(&cgroup_mutex); return 0; } -static int 
cgroup_release_agent_show(struct cgroup *cgrp, struct cftype *cft, - struct seq_file *seq) +static int cgroup_release_agent_show(struct cgroup_subsys_state *css, + struct cftype *cft, struct seq_file *seq) { + struct cgroup *cgrp = css->cgroup; + if (!cgroup_lock_live_group(cgrp)) return -ENODEV; seq_puts(seq, cgrp->root->release_agent_path); @@ -2271,10 +2275,10 @@ static int cgroup_release_agent_show(struct cgroup *cgrp, struct cftype *cft, return 0; } -static int cgroup_sane_behavior_show(struct cgroup *cgrp, struct cftype *cft, - struct seq_file *seq) +static int cgroup_sane_behavior_show(struct cgroup_subsys_state *css, + struct cftype *cft, struct seq_file *seq) { - seq_printf(seq, "%d\n", cgroup_sane_behavior(cgrp)); + seq_printf(seq, "%d\n", cgroup_sane_behavior(css->cgroup)); return 0; } @@ -2292,10 +2296,10 @@ static struct cgroup_subsys_state *cgroup_file_css(struct cfent *cfe) /* A buffer size big enough for numbers or short strings */ #define CGROUP_LOCAL_BUFFER_SIZE 64 -static ssize_t cgroup_write_X64(struct cgroup *cgrp, struct cftype *cft, - struct file *file, - const char __user *userbuf, - size_t nbytes, loff_t *unused_ppos) +static ssize_t cgroup_write_X64(struct cgroup_subsys_state *css, + struct cftype *cft, struct file *file, + const char __user *userbuf, size_t nbytes, + loff_t *unused_ppos) { char buffer[CGROUP_LOCAL_BUFFER_SIZE]; int retval = 0; @@ -2313,22 +2317,22 @@ static ssize_t cgroup_write_X64(struct cgroup *cgrp, struct cftype *cft, u64 val = simple_strtoull(strstrip(buffer), &end, 0); if (*end) return -EINVAL; - retval = cft->write_u64(cgrp, cft, val); + retval = cft->write_u64(css, cft, val); } else { s64 val = simple_strtoll(strstrip(buffer), &end, 0); if (*end) return -EINVAL; - retval = cft->write_s64(cgrp, cft, val); + retval = cft->write_s64(css, cft, val); } if (!retval) retval = nbytes; return retval; } -static ssize_t cgroup_write_string(struct cgroup *cgrp, struct cftype *cft, - struct file *file, - const char __user 
*userbuf, - size_t nbytes, loff_t *unused_ppos) +static ssize_t cgroup_write_string(struct cgroup_subsys_state *css, + struct cftype *cft, struct file *file, + const char __user *userbuf, size_t nbytes, + loff_t *unused_ppos) { char local_buffer[CGROUP_LOCAL_BUFFER_SIZE]; int retval = 0; @@ -2351,7 +2355,7 @@ static ssize_t cgroup_write_string(struct cgroup *cgrp, struct cftype *cft, } buffer[nbytes] = 0; /* nul-terminate */ - retval = cft->write_string(cgrp, cft, strstrip(buffer)); + retval = cft->write_string(css, cft, strstrip(buffer)); if (!retval) retval = nbytes; out: @@ -2361,60 +2365,60 @@ out: } static ssize_t cgroup_file_write(struct file *file, const char __user *buf, - size_t nbytes, loff_t *ppos) + size_t nbytes, loff_t *ppos) { + struct cfent *cfe = __d_cfe(file->f_dentry); struct cftype *cft = __d_cft(file->f_dentry); - struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); + struct cgroup_subsys_state *css = cgroup_file_css(cfe); if (cft->write) - return cft->write(cgrp, cft, file, buf, nbytes, ppos); + return cft->write(css, cft, file, buf, nbytes, ppos); if (cft->write_u64 || cft->write_s64) - return cgroup_write_X64(cgrp, cft, file, buf, nbytes, ppos); + return cgroup_write_X64(css, cft, file, buf, nbytes, ppos); if (cft->write_string) - return cgroup_write_string(cgrp, cft, file, buf, nbytes, ppos); + return cgroup_write_string(css, cft, file, buf, nbytes, ppos); if (cft->trigger) { - int ret = cft->trigger(cgrp, (unsigned int)cft->private); + int ret = cft->trigger(css, (unsigned int)cft->private); return ret ? 
ret : nbytes; } return -EINVAL; } -static ssize_t cgroup_read_u64(struct cgroup *cgrp, struct cftype *cft, - struct file *file, - char __user *buf, size_t nbytes, - loff_t *ppos) +static ssize_t cgroup_read_u64(struct cgroup_subsys_state *css, + struct cftype *cft, struct file *file, + char __user *buf, size_t nbytes, loff_t *ppos) { char tmp[CGROUP_LOCAL_BUFFER_SIZE]; - u64 val = cft->read_u64(cgrp, cft); + u64 val = cft->read_u64(css, cft); int len = sprintf(tmp, "%llu\n", (unsigned long long) val); return simple_read_from_buffer(buf, nbytes, ppos, tmp, len); } -static ssize_t cgroup_read_s64(struct cgroup *cgrp, struct cftype *cft, - struct file *file, - char __user *buf, size_t nbytes, - loff_t *ppos) +static ssize_t cgroup_read_s64(struct cgroup_subsys_state *css, + struct cftype *cft, struct file *file, + char __user *buf, size_t nbytes, loff_t *ppos) { char tmp[CGROUP_LOCAL_BUFFER_SIZE]; - s64 val = cft->read_s64(cgrp, cft); + s64 val = cft->read_s64(css, cft); int len = sprintf(tmp, "%lld\n", (long long) val); return simple_read_from_buffer(buf, nbytes, ppos, tmp, len); } static ssize_t cgroup_file_read(struct file *file, char __user *buf, - size_t nbytes, loff_t *ppos) + size_t nbytes, loff_t *ppos) { + struct cfent *cfe = __d_cfe(file->f_dentry); struct cftype *cft = __d_cft(file->f_dentry); - struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); + struct cgroup_subsys_state *css = cgroup_file_css(cfe); if (cft->read) - return cft->read(cgrp, cft, file, buf, nbytes, ppos); + return cft->read(css, cft, file, buf, nbytes, ppos); if (cft->read_u64) - return cgroup_read_u64(cgrp, cft, file, buf, nbytes, ppos); + return cgroup_read_u64(css, cft, file, buf, nbytes, ppos); if (cft->read_s64) - return cgroup_read_s64(cgrp, cft, file, buf, nbytes, ppos); + return cgroup_read_s64(css, cft, file, buf, nbytes, ppos); return -EINVAL; } @@ -2433,16 +2437,16 @@ static int cgroup_seqfile_show(struct seq_file *m, void *arg) { struct cfent *cfe = m->private; struct 
cftype *cft = cfe->type; - struct cgroup *cgrp = __d_cgrp(cfe->dentry->d_parent); + struct cgroup_subsys_state *css = cgroup_file_css(cfe); if (cft->read_map) { struct cgroup_map_cb cb = { .fill = cgroup_map_add, .state = m, }; - return cft->read_map(cgrp, cft, &cb); + return cft->read_map(css, cft, &cb); } - return cft->read_seq_string(cgrp, cft, m); + return cft->read_seq_string(css, cft, m); } static const struct file_operations cgroup_seqfile_operations = { @@ -3860,21 +3864,20 @@ static int cgroup_procs_open(struct inode *unused, struct file *file) return cgroup_pidlist_open(file, CGROUP_FILE_PROCS); } -static u64 cgroup_read_notify_on_release(struct cgroup *cgrp, - struct cftype *cft) +static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css, + struct cftype *cft) { - return notify_on_release(cgrp); + return notify_on_release(css->cgroup); } -static int cgroup_write_notify_on_release(struct cgroup *cgrp, - struct cftype *cft, - u64 val) +static int cgroup_write_notify_on_release(struct cgroup_subsys_state *css, + struct cftype *cft, u64 val) { - clear_bit(CGRP_RELEASABLE, &cgrp->flags); + clear_bit(CGRP_RELEASABLE, &css->cgroup->flags); if (val) - set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); + set_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags); else - clear_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); + clear_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags); return 0; } @@ -3972,9 +3975,10 @@ static void cgroup_event_ptable_queue_proc(struct file *file, * Input must be in format '<event_fd> <control_fd> <args>'. * Interpretation of args is defined by control file implementation.
*/ -static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft, - const char *buffer) +static int cgroup_write_event_control(struct cgroup_subsys_state *css, + struct cftype *cft, const char *buffer) { + struct cgroup *cgrp = css->cgroup; struct cgroup_event *event; struct cgroup *cgrp_cfile; unsigned int efd, cfd; @@ -4082,20 +4086,19 @@ out_kfree: return ret; } -static u64 cgroup_clone_children_read(struct cgroup *cgrp, - struct cftype *cft) +static u64 cgroup_clone_children_read(struct cgroup_subsys_state *css, + struct cftype *cft) { - return test_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags); + return test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags); } -static int cgroup_clone_children_write(struct cgroup *cgrp, - struct cftype *cft, - u64 val) +static int cgroup_clone_children_write(struct cgroup_subsys_state *css, + struct cftype *cft, u64 val) { if (val) - set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags); + set_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags); else - clear_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags); + clear_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags); return 0; } @@ -5585,17 +5588,19 @@ static void debug_css_free(struct cgroup_subsys_state *css) kfree(css); } -static u64 debug_taskcount_read(struct cgroup *cgrp, struct cftype *cft) +static u64 debug_taskcount_read(struct cgroup_subsys_state *css, + struct cftype *cft) { - return cgroup_task_count(cgrp); + return cgroup_task_count(css->cgroup); } -static u64 current_css_set_read(struct cgroup *cgrp, struct cftype *cft) +static u64 current_css_set_read(struct cgroup_subsys_state *css, + struct cftype *cft) { return (u64)(unsigned long)current->cgroups; } -static u64 current_css_set_refcount_read(struct cgroup *cgrp, +static u64 current_css_set_refcount_read(struct cgroup_subsys_state *css, struct cftype *cft) { u64 count; @@ -5606,7 +5611,7 @@ static u64 current_css_set_refcount_read(struct cgroup *cgrp, return count; } -static int 
current_css_set_cg_links_read(struct cgroup *cgrp, +static int current_css_set_cg_links_read(struct cgroup_subsys_state *css, struct cftype *cft, struct seq_file *seq) { @@ -5633,14 +5638,13 @@ static int current_css_set_cg_links_read(struct cgroup *cgrp, } #define MAX_TASKS_SHOWN_PER_CSS 25 -static int cgroup_css_links_read(struct cgroup *cgrp, - struct cftype *cft, - struct seq_file *seq) +static int cgroup_css_links_read(struct cgroup_subsys_state *css, + struct cftype *cft, struct seq_file *seq) { struct cgrp_cset_link *link; read_lock(&css_set_lock); - list_for_each_entry(link, &cgrp->cset_links, cset_link) { + list_for_each_entry(link, &css->cgroup->cset_links, cset_link) { struct css_set *cset = link->cset; struct task_struct *task; int count = 0; @@ -5659,9 +5663,9 @@ static int cgroup_css_links_read(struct cgroup *cgrp, return 0; } -static u64 releasable_read(struct cgroup *cgrp, struct cftype *cft) +static u64 releasable_read(struct cgroup_subsys_state *css, struct cftype *cft) { - return test_bit(CGRP_RELEASABLE, &cgrp->flags); + return test_bit(CGRP_RELEASABLE, &css->cgroup->flags); } static struct cftype debug_files[] = { -- cgit v1.2.1 From 3b287a505ef4024634beb12a93773254909d5dae Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 8 Aug 2013 20:11:24 -0400 Subject: cgroup: convert cgroup_next_sibling() to cgroup_next_child() cgroup is transitioning to using css (cgroup_subsys_state) as the main subsys interface handle instead of cgroup and the iterators will be updated to use css too. The iterators need to walk the cgroup hierarchy and return the css's matching the origin css, which is a bit cumbersome to open code. This patch converts cgroup_next_sibling() to cgroup_next_child() so that it can handle all steps of direct child iteration. This will be used to update iterators to take @css instead of @cgrp. 
In addition to the new iteration init handling, cgroup_next_child() is restructured so that the different branches share the end of iteration condition check. This patch doesn't change any behavior. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 59 +++++++++++++++++++++++++++++---------------------------- 1 file changed, 30 insertions(+), 29 deletions(-) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 6ee469837fda..dd55244952bd 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -3037,15 +3037,16 @@ static void cgroup_enable_task_cg_lists(void) } /** - * cgroup_next_sibling - find the next sibling of a given cgroup - * @pos: the current cgroup + * cgroup_next_child - find the next child of a given cgroup + * @pos: the current position (%NULL to initiate traversal) + * @cgrp: cgroup whose descendants to walk * - * This function returns the next sibling of @pos and should be called - * under RCU read lock. The only requirement is that @pos is accessible. - * The next sibling is guaranteed to be returned regardless of @pos's - * state. + * This function returns the next child of @cgrp and should be called under + * RCU read lock. The only requirement is that @cgrp and @pos are + * accessible. The next sibling is guaranteed to be returned regardless of + * their states. */ -struct cgroup *cgroup_next_sibling(struct cgroup *pos) +struct cgroup *cgroup_next_child(struct cgroup *pos, struct cgroup *cgrp) { struct cgroup *next; @@ -3061,30 +3062,30 @@ struct cgroup *cgroup_next_sibling(struct cgroup *pos) * safe to dereference from this RCU critical section. If * ->sibling.next is inaccessible, cgroup_is_dead() is guaranteed * to be visible as %true here. 
+ * + * If @pos is dead, its next pointer can't be dereferenced; + * however, as each cgroup is given a monotonically increasing + * unique serial number and always appended to the sibling list, + * the next one can be found by walking the parent's children until + * we see a cgroup with higher serial number than @pos's. While + * this path can be slower, it's taken only when either the current + * cgroup is removed or iteration and removal race. */ - if (likely(!cgroup_is_dead(pos))) { + if (!pos) { + next = list_entry_rcu(cgrp->children.next, struct cgroup, sibling); + } else if (likely(!cgroup_is_dead(pos))) { next = list_entry_rcu(pos->sibling.next, struct cgroup, sibling); - if (&next->sibling != &pos->parent->children) - return next; - return NULL; + } else { + list_for_each_entry_rcu(next, &cgrp->children, sibling) + if (next->serial_nr > pos->serial_nr) + break; } - /* - * Can't dereference the next pointer. Each cgroup is given a - * monotonically increasing unique serial number and always - * appended to the sibling list, so the next one can be found by - * walking the parent's children until we see a cgroup with higher - * serial number than @pos's. - * - * While this path can be slow, it's taken only when either the - * current cgroup is removed or iteration and removal race. 
- */ - list_for_each_entry_rcu(next, &pos->parent->children, sibling) - if (next->serial_nr > pos->serial_nr) - return next; + if (&next->sibling != &cgrp->children) + return next; return NULL; } -EXPORT_SYMBOL_GPL(cgroup_next_sibling); +EXPORT_SYMBOL_GPL(cgroup_next_child); /** * cgroup_next_descendant_pre - find the next descendant for pre-order walk @@ -3117,7 +3118,7 @@ struct cgroup *cgroup_next_descendant_pre(struct cgroup *pos, /* no child, visit my or the closest ancestor's next sibling */ while (pos != cgroup) { - next = cgroup_next_sibling(pos); + next = cgroup_next_child(pos, pos->parent); if (next) return next; pos = pos->parent; @@ -3198,7 +3199,7 @@ struct cgroup *cgroup_next_descendant_post(struct cgroup *pos, } /* if there's an unvisited sibling, visit its leftmost descendant */ - next = cgroup_next_sibling(pos); + next = cgroup_next_child(pos, pos->parent); if (next) return cgroup_leftmost_descendant(next); @@ -4549,9 +4550,9 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) /* * Mark @cgrp dead. This prevents further task migration and child * creation by disabling cgroup_lock_live_group(). Note that - * CGRP_DEAD assertion is depended upon by cgroup_next_sibling() to + * CGRP_DEAD assertion is depended upon by cgroup_next_child() to * resume iteration after dropping RCU read lock. See - * cgroup_next_sibling() for details. + * cgroup_next_child() for details. */ set_bit(CGRP_DEAD, &cgrp->flags); -- cgit v1.2.1 From f48e3924dca268c677c4e338e5d91ad9e6fe6b9e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 8 Aug 2013 20:11:24 -0400 Subject: cgroup: always use cgroup_next_child() to walk the children list There are several places where the children list is accessed directly. This patch converts those places to use cgroup_next_child(). This will help updating the hierarchy iterators to use @css instead of @cgrp. While cgroup_next_child() can be heavy in pathological cases - e.g. 
a lot of dead children, this shouldn't cause any noticeable behavior differences. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index dd55244952bd..2b7354faaca7 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -3112,7 +3112,7 @@ struct cgroup *cgroup_next_descendant_pre(struct cgroup *pos, pos = cgroup; /* visit the first child if exists */ - next = list_first_or_null_rcu(&pos->children, struct cgroup, sibling); + next = cgroup_next_child(NULL, pos); if (next) return next; @@ -3151,7 +3151,7 @@ struct cgroup *cgroup_rightmost_descendant(struct cgroup *pos) last = pos; /* ->prev isn't RCU safe, walk ->next till the end */ pos = NULL; - list_for_each_entry_rcu(tmp, &last->children, sibling) + cgroup_for_each_child(tmp, last) pos = tmp; } while (pos); @@ -3165,8 +3165,7 @@ static struct cgroup *cgroup_leftmost_descendant(struct cgroup *pos) do { last = pos; - pos = list_first_or_null_rcu(&pos->children, struct cgroup, - sibling); + pos = cgroup_next_child(NULL, pos); } while (pos); return last; -- cgit v1.2.1 From 492eb21b98f88e411a8bb43d6edcd7d7022add10 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 8 Aug 2013 20:11:25 -0400 Subject: cgroup: make hierarchy iterators deal with cgroup_subsys_state instead of cgroup cgroup is currently in the process of transitioning to using css (cgroup_subsys_state) as the primary handle instead of cgroup in subsystem API. For hierarchy iterators, this is beneficial because * In most cases, css is the only thing subsystems care about anyway. * On the planned unified hierarchy, iterations for different subsystems will need to skip over different subtrees of the hierarchy depending on which subsystems are enabled on each cgroup. 
Passing around css makes it unnecessary to explicitly specify the subsystem in question as css is intersection between cgroup and subsystem * For the planned unified hierarchy, css's would need to be created and destroyed dynamically independent from cgroup hierarchy. Having cgroup core manage css iteration makes enforcing deref rules a lot easier. Most subsystem conversions are straight-forward. Noteworthy changes are * blkio: cgroup_to_blkcg() is no longer used. Removed. * freezer: cgroup_freezer() is no longer used. Removed. * devices: cgroup_to_devcgroup() is no longer used. Removed. Signed-off-by: Tejun Heo Acked-by: Li Zefan Acked-by: Michal Hocko Acked-by: Vivek Goyal Acked-by: Aristeu Rozanski Cc: Johannes Weiner Cc: Balbir Singh Cc: Matt Helsley Cc: Jens Axboe --- kernel/cgroup.c | 131 +++++++++++++++++++++++++++++++------------------------- 1 file changed, 73 insertions(+), 58 deletions(-) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 2b7354faaca7..91eac33fac86 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2814,8 +2814,8 @@ static void cgroup_cfts_prepare(void) /* * Thanks to the entanglement with vfs inode locking, we can't walk * the existing cgroups under cgroup_mutex and create files. - * Instead, we use cgroup_for_each_descendant_pre() and drop RCU - * read lock before calling cgroup_addrm_files(). + * Instead, we use css_for_each_descendant_pre() and drop RCU read + * lock before calling cgroup_addrm_files(). 
*/ mutex_lock(&cgroup_mutex); } @@ -2825,10 +2825,11 @@ static int cgroup_cfts_commit(struct cftype *cfts, bool is_add) { LIST_HEAD(pending); struct cgroup_subsys *ss = cfts[0].ss; - struct cgroup *cgrp, *root = &ss->root->top_cgroup; + struct cgroup *root = &ss->root->top_cgroup; struct super_block *sb = ss->root->sb; struct dentry *prev = NULL; struct inode *inode; + struct cgroup_subsys_state *css; u64 update_before; int ret = 0; @@ -2861,7 +2862,9 @@ static int cgroup_cfts_commit(struct cftype *cfts, bool is_add) /* add/rm files for all cgroups created before */ rcu_read_lock(); - cgroup_for_each_descendant_pre(cgrp, root) { + css_for_each_descendant_pre(css, cgroup_css(root, ss->subsys_id)) { + struct cgroup *cgrp = css->cgroup; + if (cgroup_is_dead(cgrp)) continue; @@ -3037,17 +3040,21 @@ static void cgroup_enable_task_cg_lists(void) } /** - * cgroup_next_child - find the next child of a given cgroup - * @pos: the current position (%NULL to initiate traversal) - * @cgrp: cgroup whose descendants to walk + * css_next_child - find the next child of a given css + * @pos_css: the current position (%NULL to initiate traversal) + * @parent_css: css whose children to walk * - * This function returns the next child of @cgrp and should be called under - * RCU read lock. The only requirement is that @cgrp and @pos are - * accessible. The next sibling is guaranteed to be returned regardless of - * their states. + * This function returns the next child of @parent_css and should be called + * under RCU read lock. The only requirement is that @parent_css and + * @pos_css are accessible. The next sibling is guaranteed to be returned + * regardless of their states. */ -struct cgroup *cgroup_next_child(struct cgroup *pos, struct cgroup *cgrp) +struct cgroup_subsys_state * +css_next_child(struct cgroup_subsys_state *pos_css, + struct cgroup_subsys_state *parent_css) { + struct cgroup *pos = pos_css ? 
pos_css->cgroup : NULL; + struct cgroup *cgrp = parent_css->cgroup; struct cgroup *next; WARN_ON_ONCE(!rcu_read_lock_held()); @@ -3081,59 +3088,64 @@ struct cgroup *cgroup_next_child(struct cgroup *pos, struct cgroup *cgrp) break; } - if (&next->sibling != &cgrp->children) - return next; - return NULL; + if (&next->sibling == &cgrp->children) + return NULL; + + if (parent_css->ss) + return cgroup_css(next, parent_css->ss->subsys_id); + else + return &next->dummy_css; } -EXPORT_SYMBOL_GPL(cgroup_next_child); +EXPORT_SYMBOL_GPL(css_next_child); /** - * cgroup_next_descendant_pre - find the next descendant for pre-order walk + * css_next_descendant_pre - find the next descendant for pre-order walk * @pos: the current position (%NULL to initiate traversal) - * @cgroup: cgroup whose descendants to walk + * @root: css whose descendants to walk * - * To be used by cgroup_for_each_descendant_pre(). Find the next - * descendant to visit for pre-order traversal of @cgroup's descendants. + * To be used by css_for_each_descendant_pre(). Find the next descendant + * to visit for pre-order traversal of @root's descendants. * * While this function requires RCU read locking, it doesn't require the * whole traversal to be contained in a single RCU critical section. This * function will return the correct next descendant as long as both @pos - * and @cgroup are accessible and @pos is a descendant of @cgroup. + * and @root are accessible and @pos is a descendant of @root. 
*/ -struct cgroup *cgroup_next_descendant_pre(struct cgroup *pos, - struct cgroup *cgroup) +struct cgroup_subsys_state * +css_next_descendant_pre(struct cgroup_subsys_state *pos, + struct cgroup_subsys_state *root) { - struct cgroup *next; + struct cgroup_subsys_state *next; WARN_ON_ONCE(!rcu_read_lock_held()); - /* if first iteration, pretend we just visited @cgroup */ + /* if first iteration, pretend we just visited @root */ if (!pos) - pos = cgroup; + pos = root; /* visit the first child if exists */ - next = cgroup_next_child(NULL, pos); + next = css_next_child(NULL, pos); if (next) return next; /* no child, visit my or the closest ancestor's next sibling */ - while (pos != cgroup) { - next = cgroup_next_child(pos, pos->parent); + while (pos != root) { + next = css_next_child(pos, css_parent(pos)); if (next) return next; - pos = pos->parent; + pos = css_parent(pos); } return NULL; } -EXPORT_SYMBOL_GPL(cgroup_next_descendant_pre); +EXPORT_SYMBOL_GPL(css_next_descendant_pre); /** - * cgroup_rightmost_descendant - return the rightmost descendant of a cgroup - * @pos: cgroup of interest + * css_rightmost_descendant - return the rightmost descendant of a css + * @pos: css of interest * - * Return the rightmost descendant of @pos. If there's no descendant, - * @pos is returned. This can be used during pre-order traversal to skip + * Return the rightmost descendant of @pos. If there's no descendant, @pos + * is returned. This can be used during pre-order traversal to skip * subtree of @pos. * * While this function requires RCU read locking, it doesn't require the @@ -3141,9 +3153,10 @@ EXPORT_SYMBOL_GPL(cgroup_next_descendant_pre); * function will return the correct rightmost descendant as long as @pos is * accessible. 
*/ -struct cgroup *cgroup_rightmost_descendant(struct cgroup *pos) +struct cgroup_subsys_state * +css_rightmost_descendant(struct cgroup_subsys_state *pos) { - struct cgroup *last, *tmp; + struct cgroup_subsys_state *last, *tmp; WARN_ON_ONCE(!rcu_read_lock_held()); @@ -3151,62 +3164,64 @@ struct cgroup *cgroup_rightmost_descendant(struct cgroup *pos) last = pos; /* ->prev isn't RCU safe, walk ->next till the end */ pos = NULL; - cgroup_for_each_child(tmp, last) + css_for_each_child(tmp, last) pos = tmp; } while (pos); return last; } -EXPORT_SYMBOL_GPL(cgroup_rightmost_descendant); +EXPORT_SYMBOL_GPL(css_rightmost_descendant); -static struct cgroup *cgroup_leftmost_descendant(struct cgroup *pos) +static struct cgroup_subsys_state * +css_leftmost_descendant(struct cgroup_subsys_state *pos) { - struct cgroup *last; + struct cgroup_subsys_state *last; do { last = pos; - pos = cgroup_next_child(NULL, pos); + pos = css_next_child(NULL, pos); } while (pos); return last; } /** - * cgroup_next_descendant_post - find the next descendant for post-order walk + * css_next_descendant_post - find the next descendant for post-order walk * @pos: the current position (%NULL to initiate traversal) - * @cgroup: cgroup whose descendants to walk + * @root: css whose descendants to walk * - * To be used by cgroup_for_each_descendant_post(). Find the next - * descendant to visit for post-order traversal of @cgroup's descendants. + * To be used by css_for_each_descendant_post(). Find the next descendant + * to visit for post-order traversal of @root's descendants. * * While this function requires RCU read locking, it doesn't require the * whole traversal to be contained in a single RCU critical section. This * function will return the correct next descendant as long as both @pos * and @cgroup are accessible and @pos is a descendant of @cgroup. 
*/ -struct cgroup *cgroup_next_descendant_post(struct cgroup *pos, - struct cgroup *cgroup) +struct cgroup_subsys_state * +css_next_descendant_post(struct cgroup_subsys_state *pos, + struct cgroup_subsys_state *root) { - struct cgroup *next; + struct cgroup_subsys_state *next; WARN_ON_ONCE(!rcu_read_lock_held()); /* if first iteration, visit the leftmost descendant */ if (!pos) { - next = cgroup_leftmost_descendant(cgroup); - return next != cgroup ? next : NULL; + next = css_leftmost_descendant(root); + return next != root ? next : NULL; } /* if there's an unvisited sibling, visit its leftmost descendant */ - next = cgroup_next_child(pos, pos->parent); + next = css_next_child(pos, css_parent(pos)); if (next) - return cgroup_leftmost_descendant(next); + return css_leftmost_descendant(next); /* no sibling left, visit parent */ - next = pos->parent; - return next != cgroup ? next : NULL; + next = css_parent(pos); + return next != root ? next : NULL; } -EXPORT_SYMBOL_GPL(cgroup_next_descendant_post); +EXPORT_SYMBOL_GPL(css_next_descendant_post); void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it) __acquires(css_set_lock) @@ -4549,9 +4564,9 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) /* * Mark @cgrp dead. This prevents further task migration and child * creation by disabling cgroup_lock_live_group(). Note that - * CGRP_DEAD assertion is depended upon by cgroup_next_child() to + * CGRP_DEAD assertion is depended upon by css_next_child() to * resume iteration after dropping RCU read lock. See - * cgroup_next_child() for details. + * css_next_child() for details. */ set_bit(CGRP_DEAD, &cgrp->flags); -- cgit v1.2.1 From d515876e9d951d8cf7fc7c90db2967664bdc89ee Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 8 Aug 2013 20:11:26 -0400 Subject: cgroup: relocate cgroup_advance_iter() For some reason, cgroup_advance_iter() is standing lonely all away from its iter comrades. Relocate it. This is cosmetic. 
Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 48 ++++++++++++++++++++++++------------------------ 1 file changed, 24 insertions(+), 24 deletions(-) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 91eac33fac86..d56d9363d4b3 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2981,30 +2981,6 @@ int cgroup_task_count(const struct cgroup *cgrp) return count; } -/* - * Advance a list_head iterator. The iterator should be positioned at - * the start of a css_set - */ -static void cgroup_advance_iter(struct cgroup *cgrp, struct cgroup_iter *it) -{ - struct list_head *l = it->cset_link; - struct cgrp_cset_link *link; - struct css_set *cset; - - /* Advance to the next non-empty css_set */ - do { - l = l->next; - if (l == &cgrp->cset_links) { - it->cset_link = NULL; - return; - } - link = list_entry(l, struct cgrp_cset_link, cset_link); - cset = link->cset; - } while (list_empty(&cset->tasks)); - it->cset_link = l; - it->task = cset->tasks.next; -} - /* * To reduce the fork() overhead for systems that are not actually * using their cgroups capability, we don't maintain the lists running @@ -3223,6 +3199,30 @@ css_next_descendant_post(struct cgroup_subsys_state *pos, } EXPORT_SYMBOL_GPL(css_next_descendant_post); +/* + * Advance a list_head iterator. 
The iterator should be positioned at + * the start of a css_set + */ +static void cgroup_advance_iter(struct cgroup *cgrp, struct cgroup_iter *it) +{ + struct list_head *l = it->cset_link; + struct cgrp_cset_link *link; + struct css_set *cset; + + /* Advance to the next non-empty css_set */ + do { + l = l->next; + if (l == &cgrp->cset_links) { + it->cset_link = NULL; + return; + } + link = list_entry(l, struct cgrp_cset_link, cset_link); + cset = link->cset; + } while (list_empty(&cset->tasks)); + it->cset_link = l; + it->task = cset->tasks.next; +} + void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it) __acquires(css_set_lock) { -- cgit v1.2.1 From 0942eeeef68f9493c1bcb1a52baf612b73fcf9fb Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 8 Aug 2013 20:11:26 -0400 Subject: cgroup: rename cgroup_iter to cgroup_task_iter cgroup now has multiple iterators and it's quite confusing to have something which walks over tasks of a single cgroup named cgroup_iter. Let's rename it to cgroup_task_iter. While at it, reformat / update comments and replace the overview comment above the interface function decls with proper function comments. Such overview can be useful but function comments should be more than enough here. This is pure rename and doesn't introduce any functional changes. Signed-off-by: Tejun Heo Acked-by: Li Zefan Acked-by: Michal Hocko Cc: Matt Helsley Cc: Johannes Weiner Cc: Balbir Singh --- kernel/cgroup.c | 114 ++++++++++++++++++++++++++++++++++++++------------------ 1 file changed, 77 insertions(+), 37 deletions(-) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index d56d9363d4b3..15c93f9c9e57 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -367,9 +367,11 @@ static struct cgrp_cset_link init_cgrp_cset_link; static int cgroup_init_idr(struct cgroup_subsys *ss, struct cgroup_subsys_state *css); -/* css_set_lock protects the list of css_set objects, and the - * chain of tasks off each css_set. 
Nests outside task->alloc_lock - * due to cgroup_iter_start() */ +/* + * css_set_lock protects the list of css_set objects, and the chain of + * tasks off each css_set. Nests outside task->alloc_lock due to + * cgroup_task_iter_start(). + */ static DEFINE_RWLOCK(css_set_lock); static int css_set_count; @@ -394,10 +396,12 @@ static unsigned long css_set_hash(struct cgroup_subsys_state *css[]) return key; } -/* We don't maintain the lists running through each css_set to its - * task until after the first call to cgroup_iter_start(). This - * reduces the fork()/exit() overhead for people who have cgroups - * compiled into their kernel but not actually in use */ +/* + * We don't maintain the lists running through each css_set to its task + * until after the first call to cgroup_task_iter_start(). This reduces + * the fork()/exit() overhead for people who have cgroups compiled into + * their kernel but not actually in use. + */ static int use_task_css_set_links __read_mostly; static void __put_css_set(struct css_set *cset, int taskexit) @@ -2982,10 +2986,10 @@ int cgroup_task_count(const struct cgroup *cgrp) } /* - * To reduce the fork() overhead for systems that are not actually - * using their cgroups capability, we don't maintain the lists running - * through each css_set to its tasks until we see the list actually - * used - in other words after the first call to cgroup_iter_start(). + * To reduce the fork() overhead for systems that are not actually using + * their cgroups capability, we don't maintain the lists running through + * each css_set to its tasks until we see the list actually used - in other + * words after the first call to cgroup_task_iter_start(). */ static void cgroup_enable_task_cg_lists(void) { @@ -3199,11 +3203,15 @@ css_next_descendant_post(struct cgroup_subsys_state *pos, } EXPORT_SYMBOL_GPL(css_next_descendant_post); -/* - * Advance a list_head iterator. 
The iterator should be positioned at - * the start of a css_set +/** + * cgroup_advance_task_iter - advance a task itererator to the next css_set + * @cgrp: the cgroup to walk tasks of + * @it: the iterator to advance + * + * Advance @it to the next css_set to walk. */ -static void cgroup_advance_iter(struct cgroup *cgrp, struct cgroup_iter *it) +static void cgroup_advance_task_iter(struct cgroup *cgrp, + struct cgroup_task_iter *it) { struct list_head *l = it->cset_link; struct cgrp_cset_link *link; @@ -3223,7 +3231,21 @@ static void cgroup_advance_iter(struct cgroup *cgrp, struct cgroup_iter *it) it->task = cset->tasks.next; } -void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it) +/** + * cgroup_task_iter_start - initiate task iteration + * @cgrp: the cgroup to walk tasks of + * @it: the task iterator to use + * + * Initiate iteration through the tasks of @cgrp. The caller can call + * cgroup_task_iter_next() to walk through the tasks until the function + * returns NULL. On completion of iteration, cgroup_task_iter_end() must + * be called. + * + * Note that this function acquires a lock which is released when the + * iteration finishes. The caller can't sleep while iteration is in + * progress. + */ +void cgroup_task_iter_start(struct cgroup *cgrp, struct cgroup_task_iter *it) __acquires(css_set_lock) { /* @@ -3236,11 +3258,20 @@ void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it) read_lock(&css_set_lock); it->cset_link = &cgrp->cset_links; - cgroup_advance_iter(cgrp, it); + cgroup_advance_task_iter(cgrp, it); } -struct task_struct *cgroup_iter_next(struct cgroup *cgrp, - struct cgroup_iter *it) +/** + * cgroup_task_iter_next - return the next task for the iterator + * @cgrp: the cgroup to walk tasks of + * @it: the task iterator being iterated + * + * The "next" function for task iteration. @it should have been + * initialized via cgroup_task_iter_start(). Returns NULL when the + * iteration reaches the end. 
+ */ +struct task_struct *cgroup_task_iter_next(struct cgroup *cgrp, + struct cgroup_task_iter *it) { struct task_struct *res; struct list_head *l = it->task; @@ -3254,16 +3285,25 @@ struct task_struct *cgroup_iter_next(struct cgroup *cgrp, l = l->next; link = list_entry(it->cset_link, struct cgrp_cset_link, cset_link); if (l == &link->cset->tasks) { - /* We reached the end of this task list - move on to - * the next cg_cgroup_link */ - cgroup_advance_iter(cgrp, it); + /* + * We reached the end of this task list - move on to the + * next cgrp_cset_link. + */ + cgroup_advance_task_iter(cgrp, it); } else { it->task = l; } return res; } -void cgroup_iter_end(struct cgroup *cgrp, struct cgroup_iter *it) +/** + * cgroup_task_iter_end - finish task iteration + * @cgrp: the cgroup to walk tasks of + * @it: the task iterator to finish + * + * Finish task iteration started by cgroup_task_iter_start(). + */ +void cgroup_task_iter_end(struct cgroup *cgrp, struct cgroup_task_iter *it) __releases(css_set_lock) { read_unlock(&css_set_lock); @@ -3312,7 +3352,7 @@ static inline int started_after(void *p1, void *p2) * Iterate through all the tasks in a cgroup, calling test_task() for each, * and if it returns true, call process_task() for it also. * The test_task pointer may be NULL, meaning always true (select all tasks). - * Effectively duplicates cgroup_iter_{start,next,end}() + * Effectively duplicates cgroup_task_iter_{start,next,end}() * but does not lock css_set_lock for the call to process_task(). * The struct cgroup_scanner may be embedded in any structure of the caller's * creation. 
@@ -3333,7 +3373,7 @@ static inline int started_after(void *p1, void *p2) int cgroup_scan_tasks(struct cgroup_scanner *scan) { int retval, i; - struct cgroup_iter it; + struct cgroup_task_iter it; struct task_struct *p, *dropped; /* Never dereference latest_task, since it's not refcounted */ struct task_struct *latest_task = NULL; @@ -3368,8 +3408,8 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan) * guarantees forward progress and that we don't miss any tasks. */ heap->size = 0; - cgroup_iter_start(scan->cgrp, &it); - while ((p = cgroup_iter_next(scan->cgrp, &it))) { + cgroup_task_iter_start(scan->cgrp, &it); + while ((p = cgroup_task_iter_next(scan->cgrp, &it))) { /* * Only affect tasks that qualify per the caller's callback, * if he provided one @@ -3402,7 +3442,7 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan) * the heap and wasn't inserted */ } - cgroup_iter_end(scan->cgrp, &it); + cgroup_task_iter_end(scan->cgrp, &it); if (heap->size) { for (i = 0; i < heap->size; i++) { @@ -3608,7 +3648,7 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type, pid_t *array; int length; int pid, n = 0; /* used for populating the array */ - struct cgroup_iter it; + struct cgroup_task_iter it; struct task_struct *tsk; struct cgroup_pidlist *l; @@ -3623,8 +3663,8 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type, if (!array) return -ENOMEM; /* now, populate the array */ - cgroup_iter_start(cgrp, &it); - while ((tsk = cgroup_iter_next(cgrp, &it))) { + cgroup_task_iter_start(cgrp, &it); + while ((tsk = cgroup_task_iter_next(cgrp, &it))) { if (unlikely(n == length)) break; /* get tgid or pid for procs or tasks file respectively */ @@ -3635,7 +3675,7 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type, if (pid > 0) /* make sure to only use valid results */ array[n++] = pid; } - cgroup_iter_end(cgrp, &it); + cgroup_task_iter_end(cgrp, &it); length = n; /* now sort & (if procs) strip out 
duplicates */ sort(array, length, sizeof(pid_t), cmppid, NULL); @@ -3669,7 +3709,7 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) { int ret = -EINVAL; struct cgroup *cgrp; - struct cgroup_iter it; + struct cgroup_task_iter it; struct task_struct *tsk; /* @@ -3683,8 +3723,8 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) ret = 0; cgrp = dentry->d_fsdata; - cgroup_iter_start(cgrp, &it); - while ((tsk = cgroup_iter_next(cgrp, &it))) { + cgroup_task_iter_start(cgrp, &it); + while ((tsk = cgroup_task_iter_next(cgrp, &it))) { switch (tsk->state) { case TASK_RUNNING: stats->nr_running++; @@ -3704,7 +3744,7 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) break; } } - cgroup_iter_end(cgrp, &it); + cgroup_task_iter_end(cgrp, &it); err: return ret; @@ -5137,7 +5177,7 @@ void cgroup_fork(struct task_struct *child) * Adds the task to the list running through its css_set if necessary and * call the subsystem fork() callbacks. Has to be after the task is * visible on the task list in case we race with the first call to - * cgroup_iter_start() - to guarantee that the new task ends up on its + * cgroup_task_iter_start() - to guarantee that the new task ends up on its * list. */ void cgroup_post_fork(struct task_struct *child) -- cgit v1.2.1 From c59cd3d840b1b0a8f996cbbd9132128dcaabbeb9 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 8 Aug 2013 20:11:26 -0400 Subject: cgroup: make cgroup_task_iter remember the cgroup being iterated Currently all cgroup_task_iter functions require @cgrp to be passed in, which is superfluous and increases the chance of usage error. Make cgroup_task_iter remember the cgroup being iterated and drop the @cgrp argument from next and end functions. This patch doesn't introduce any behavior differences.
Signed-off-by: Tejun Heo Acked-by: Li Zefan Acked-by: Michal Hocko Cc: Matt Helsley Cc: Johannes Weiner Cc: Balbir Singh --- kernel/cgroup.c | 32 +++++++++++++++----------------- 1 file changed, 15 insertions(+), 17 deletions(-) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 15c93f9c9e57..abc62ea1303c 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -3205,13 +3205,11 @@ EXPORT_SYMBOL_GPL(css_next_descendant_post); /** * cgroup_advance_task_iter - advance a task itererator to the next css_set - * @cgrp: the cgroup to walk tasks of * @it: the iterator to advance * * Advance @it to the next css_set to walk. */ -static void cgroup_advance_task_iter(struct cgroup *cgrp, - struct cgroup_task_iter *it) +static void cgroup_advance_task_iter(struct cgroup_task_iter *it) { struct list_head *l = it->cset_link; struct cgrp_cset_link *link; @@ -3220,7 +3218,7 @@ static void cgroup_advance_task_iter(struct cgroup *cgrp, /* Advance to the next non-empty css_set */ do { l = l->next; - if (l == &cgrp->cset_links) { + if (l == &it->origin_cgrp->cset_links) { it->cset_link = NULL; return; } @@ -3257,21 +3255,22 @@ void cgroup_task_iter_start(struct cgroup *cgrp, struct cgroup_task_iter *it) cgroup_enable_task_cg_lists(); read_lock(&css_set_lock); + + it->origin_cgrp = cgrp; it->cset_link = &cgrp->cset_links; - cgroup_advance_task_iter(cgrp, it); + + cgroup_advance_task_iter(it); } /** * cgroup_task_iter_next - return the next task for the iterator - * @cgrp: the cgroup to walk tasks of * @it: the task iterator being iterated * * The "next" function for task iteration. @it should have been * initialized via cgroup_task_iter_start(). Returns NULL when the * iteration reaches the end. 
*/ -struct task_struct *cgroup_task_iter_next(struct cgroup *cgrp, - struct cgroup_task_iter *it) +struct task_struct *cgroup_task_iter_next(struct cgroup_task_iter *it) { struct task_struct *res; struct list_head *l = it->task; @@ -3289,7 +3288,7 @@ struct task_struct *cgroup_task_iter_next(struct cgroup *cgrp, * We reached the end of this task list - move on to the * next cgrp_cset_link. */ - cgroup_advance_task_iter(cgrp, it); + cgroup_advance_task_iter(it); } else { it->task = l; } @@ -3298,12 +3297,11 @@ struct task_struct *cgroup_task_iter_next(struct cgroup *cgrp, /** * cgroup_task_iter_end - finish task iteration - * @cgrp: the cgroup to walk tasks of * @it: the task iterator to finish * * Finish task iteration started by cgroup_task_iter_start(). */ -void cgroup_task_iter_end(struct cgroup *cgrp, struct cgroup_task_iter *it) +void cgroup_task_iter_end(struct cgroup_task_iter *it) __releases(css_set_lock) { read_unlock(&css_set_lock); @@ -3409,7 +3407,7 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan) */ heap->size = 0; cgroup_task_iter_start(scan->cgrp, &it); - while ((p = cgroup_task_iter_next(scan->cgrp, &it))) { + while ((p = cgroup_task_iter_next(&it))) { /* * Only affect tasks that qualify per the caller's callback, * if he provided one @@ -3442,7 +3440,7 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan) * the heap and wasn't inserted */ } - cgroup_task_iter_end(scan->cgrp, &it); + cgroup_task_iter_end(&it); if (heap->size) { for (i = 0; i < heap->size; i++) { @@ -3664,7 +3662,7 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type, return -ENOMEM; /* now, populate the array */ cgroup_task_iter_start(cgrp, &it); - while ((tsk = cgroup_task_iter_next(cgrp, &it))) { + while ((tsk = cgroup_task_iter_next(&it))) { if (unlikely(n == length)) break; /* get tgid or pid for procs or tasks file respectively */ @@ -3675,7 +3673,7 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type, if (pid > 0) /* 
make sure to only use valid results */ array[n++] = pid; } - cgroup_task_iter_end(cgrp, &it); + cgroup_task_iter_end(&it); length = n; /* now sort & (if procs) strip out duplicates */ sort(array, length, sizeof(pid_t), cmppid, NULL); @@ -3724,7 +3722,7 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) cgrp = dentry->d_fsdata; cgroup_task_iter_start(cgrp, &it); - while ((tsk = cgroup_task_iter_next(cgrp, &it))) { + while ((tsk = cgroup_task_iter_next(&it))) { switch (tsk->state) { case TASK_RUNNING: stats->nr_running++; @@ -3744,7 +3742,7 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) break; } } - cgroup_task_iter_end(cgrp, &it); + cgroup_task_iter_end(&it); err: return ret; -- cgit v1.2.1 From e535837b1dae17b5a2d76ea1bc22ac1a79354624 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 8 Aug 2013 20:11:26 -0400 Subject: cgroup: remove struct cgroup_scanner cgroup_scan_tasks() takes a pointer to struct cgroup_scanner as its sole argument and the only function of that struct is packing the arguments of the function call, which consist of five fields. It's not too unusual to pack parameters into a struct when the number of arguments gets excessive or the whole set needs to be passed around a lot, but neither holds here, making it just weird. Drop struct cgroup_scanner and pass the params directly to cgroup_scan_tasks(). Note that struct cpuset_change_nodemask_arg was added to cpuset.c to pass both ->cs and ->newmems pointers to cpuset_change_nodemask() using a single data pointer. This doesn't make any functional differences.
Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 93 ++++++++++++++++++++++++++------------------------------- 1 file changed, 43 insertions(+), 50 deletions(-) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index abc62ea1303c..7b16ddb2569b 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -3343,32 +3343,37 @@ static inline int started_after(void *p1, void *p2) /** * cgroup_scan_tasks - iterate though all the tasks in a cgroup - * @scan: struct cgroup_scanner containing arguments for the scan + * @cgrp: the cgroup to iterate tasks of + * @test: optional test callback + * @process: process callback + * @data: data passed to @test and @process + * @heap: optional pre-allocated heap used for task iteration * - * Arguments include pointers to callback functions test_task() and - * process_task(). - * Iterate through all the tasks in a cgroup, calling test_task() for each, - * and if it returns true, call process_task() for it also. - * The test_task pointer may be NULL, meaning always true (select all tasks). - * Effectively duplicates cgroup_task_iter_{start,next,end}() - * but does not lock css_set_lock for the call to process_task(). - * The struct cgroup_scanner may be embedded in any structure of the caller's - * creation. - * It is guaranteed that process_task() will act on every task that - * is a member of the cgroup for the duration of this call. This - * function may or may not call process_task() for tasks that exit - * or move to a different cgroup during the call, or are forked or - * move into the cgroup during the call. + * Iterate through all the tasks in a cgroup, calling @test for each, and + * if it returns %true, call @process for it also. * - * Note that test_task() may be called with locks held, and may in some - * situations be called multiple times for the same task, so it should - * be cheap. 
- * If the heap pointer in the struct cgroup_scanner is non-NULL, a heap has been - * pre-allocated and will be used for heap operations (and its "gt" member will - * be overwritten), else a temporary heap will be used (allocation of which - * may cause this function to fail). + * @test may be NULL, meaning always true (select all tasks), which + * effectively duplicates cgroup_task_iter_{start,next,end}() but does not + * lock css_set_lock for the call to @process. + * + * It is guaranteed that @process will act on every task that is a member + * of @cgrp for the duration of this call. This function may or may not + * call @process for tasks that exit or move to a different cgroup during + * the call, or are forked or move into the cgroup during the call. + * + * Note that @test may be called with locks held, and may in some + * situations be called multiple times for the same task, so it should be + * cheap. + * + * If @heap is non-NULL, a heap has been pre-allocated and will be used for + * heap operations (and its "gt" member will be overwritten), else a + * temporary heap will be used (allocation of which may cause this function + * to fail). 
*/ -int cgroup_scan_tasks(struct cgroup_scanner *scan) +int cgroup_scan_tasks(struct cgroup *cgrp, + bool (*test)(struct task_struct *, void *), + void (*process)(struct task_struct *, void *), + void *data, struct ptr_heap *heap) { int retval, i; struct cgroup_task_iter it; @@ -3376,12 +3381,10 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan) /* Never dereference latest_task, since it's not refcounted */ struct task_struct *latest_task = NULL; struct ptr_heap tmp_heap; - struct ptr_heap *heap; struct timespec latest_time = { 0, 0 }; - if (scan->heap) { + if (heap) { /* The caller supplied our heap and pre-allocated its memory */ - heap = scan->heap; heap->gt = &started_after; } else { /* We need to allocate our own heap memory */ @@ -3394,25 +3397,24 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan) again: /* - * Scan tasks in the cgroup, using the scanner's "test_task" callback - * to determine which are of interest, and using the scanner's - * "process_task" callback to process any of them that need an update. - * Since we don't want to hold any locks during the task updates, - * gather tasks to be processed in a heap structure. - * The heap is sorted by descending task start time. - * If the statically-sized heap fills up, we overflow tasks that - * started later, and in future iterations only consider tasks that - * started after the latest task in the previous pass. This + * Scan tasks in the cgroup, using the @test callback to determine + * which are of interest, and invoking @process callback on the + * ones which need an update. Since we don't want to hold any + * locks during the task updates, gather tasks to be processed in a + * heap structure. The heap is sorted by descending task start + * time. If the statically-sized heap fills up, we overflow tasks + * that started later, and in future iterations only consider tasks + * that started after the latest task in the previous pass. 
This * guarantees forward progress and that we don't miss any tasks. */ heap->size = 0; - cgroup_task_iter_start(scan->cgrp, &it); + cgroup_task_iter_start(cgrp, &it); while ((p = cgroup_task_iter_next(&it))) { /* * Only affect tasks that qualify per the caller's callback, * if he provided one */ - if (scan->test_task && !scan->test_task(p, scan)) + if (test && !test(p, data)) continue; /* * Only process tasks that started after the last task @@ -3450,7 +3452,7 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan) latest_task = q; } /* Process the task per the caller's callback */ - scan->process_task(q, scan); + process(q, data); put_task_struct(q); } /* @@ -3467,10 +3469,9 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan) return 0; } -static void cgroup_transfer_one_task(struct task_struct *task, - struct cgroup_scanner *scan) +static void cgroup_transfer_one_task(struct task_struct *task, void *data) { - struct cgroup *new_cgroup = scan->data; + struct cgroup *new_cgroup = data; mutex_lock(&cgroup_mutex); cgroup_attach_task(new_cgroup, task, false); @@ -3484,15 +3485,7 @@ static void cgroup_transfer_one_task(struct task_struct *task, */ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) { - struct cgroup_scanner scan; - - scan.cgrp = from; - scan.test_task = NULL; /* select all tasks in cgroup */ - scan.process_task = cgroup_transfer_one_task; - scan.heap = NULL; - scan.data = to; - - return cgroup_scan_tasks(&scan); + return cgroup_scan_tasks(from, NULL, cgroup_transfer_one_task, to, NULL); } /* -- cgit v1.2.1 From 72ec7029937f0518eff21b8762743c31591684f5 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 8 Aug 2013 20:11:26 -0400 Subject: cgroup: make task iterators deal with cgroup_subsys_state instead of cgroup cgroup is in the process of converting to css (cgroup_subsys_state) from cgroup as the principal subsystem interface handle. 
This is mostly to prepare for the unified hierarchy support where css's will be created and destroyed dynamically but also helps cleaning up subsystem implementations as css is usually what they are interested in anyway. This patch converts task iterators to deal with css instead of cgroup. Note that under unified hierarchy, different sets of tasks will be considered belonging to a given cgroup depending on the subsystem in question and making the iterators deal with css instead of cgroup provides them with enough information about the iteration. While at it, fix several function comment formats in cpuset.c. This patch doesn't introduce any behavior differences. Signed-off-by: Tejun Heo Acked-by: Li Zefan Acked-by: Michal Hocko Cc: Johannes Weiner Cc: Balbir Singh Cc: Matt Helsley --- kernel/cgroup.c | 112 ++++++++++++++++++++++++++++---------------------------- 1 file changed, 57 insertions(+), 55 deletions(-) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 7b16ddb2569b..8c57301d0561 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -370,7 +370,7 @@ static int cgroup_init_idr(struct cgroup_subsys *ss, /* * css_set_lock protects the list of css_set objects, and the chain of * tasks off each css_set. Nests outside task->alloc_lock due to - * cgroup_task_iter_start(). + * css_task_iter_start(). */ static DEFINE_RWLOCK(css_set_lock); static int css_set_count; @@ -398,9 +398,9 @@ static unsigned long css_set_hash(struct cgroup_subsys_state *css[]) /* * We don't maintain the lists running through each css_set to its task - * until after the first call to cgroup_task_iter_start(). This reduces - * the fork()/exit() overhead for people who have cgroups compiled into - * their kernel but not actually in use. + * until after the first call to css_task_iter_start(). This reduces the + * fork()/exit() overhead for people who have cgroups compiled into their + * kernel but not actually in use. 
*/ static int use_task_css_set_links __read_mostly; @@ -2989,7 +2989,7 @@ int cgroup_task_count(const struct cgroup *cgrp) * To reduce the fork() overhead for systems that are not actually using * their cgroups capability, we don't maintain the lists running through * each css_set to its tasks until we see the list actually used - in other - * words after the first call to cgroup_task_iter_start(). + * words after the first call to css_task_iter_start(). */ static void cgroup_enable_task_cg_lists(void) { @@ -3204,12 +3204,12 @@ css_next_descendant_post(struct cgroup_subsys_state *pos, EXPORT_SYMBOL_GPL(css_next_descendant_post); /** - * cgroup_advance_task_iter - advance a task itererator to the next css_set + * css_advance_task_iter - advance a task itererator to the next css_set * @it: the iterator to advance * * Advance @it to the next css_set to walk. */ -static void cgroup_advance_task_iter(struct cgroup_task_iter *it) +static void css_advance_task_iter(struct css_task_iter *it) { struct list_head *l = it->cset_link; struct cgrp_cset_link *link; @@ -3218,7 +3218,7 @@ static void cgroup_advance_task_iter(struct cgroup_task_iter *it) /* Advance to the next non-empty css_set */ do { l = l->next; - if (l == &it->origin_cgrp->cset_links) { + if (l == &it->origin_css->cgroup->cset_links) { it->cset_link = NULL; return; } @@ -3230,47 +3230,48 @@ static void cgroup_advance_task_iter(struct cgroup_task_iter *it) } /** - * cgroup_task_iter_start - initiate task iteration - * @cgrp: the cgroup to walk tasks of + * css_task_iter_start - initiate task iteration + * @css: the css to walk tasks of * @it: the task iterator to use * - * Initiate iteration through the tasks of @cgrp. The caller can call - * cgroup_task_iter_next() to walk through the tasks until the function - * returns NULL. On completion of iteration, cgroup_task_iter_end() must - * be called. + * Initiate iteration through the tasks of @css. 
The caller can call + * css_task_iter_next() to walk through the tasks until the function + * returns NULL. On completion of iteration, css_task_iter_end() must be + * called. * * Note that this function acquires a lock which is released when the * iteration finishes. The caller can't sleep while iteration is in * progress. */ -void cgroup_task_iter_start(struct cgroup *cgrp, struct cgroup_task_iter *it) +void css_task_iter_start(struct cgroup_subsys_state *css, + struct css_task_iter *it) __acquires(css_set_lock) { /* - * The first time anyone tries to iterate across a cgroup, - * we need to enable the list linking each css_set to its - * tasks, and fix up all existing tasks. + * The first time anyone tries to iterate across a css, we need to + * enable the list linking each css_set to its tasks, and fix up + * all existing tasks. */ if (!use_task_css_set_links) cgroup_enable_task_cg_lists(); read_lock(&css_set_lock); - it->origin_cgrp = cgrp; - it->cset_link = &cgrp->cset_links; + it->origin_css = css; + it->cset_link = &css->cgroup->cset_links; - cgroup_advance_task_iter(it); + css_advance_task_iter(it); } /** - * cgroup_task_iter_next - return the next task for the iterator + * css_task_iter_next - return the next task for the iterator * @it: the task iterator being iterated * * The "next" function for task iteration. @it should have been - * initialized via cgroup_task_iter_start(). Returns NULL when the - * iteration reaches the end. + * initialized via css_task_iter_start(). Returns NULL when the iteration + * reaches the end. */ -struct task_struct *cgroup_task_iter_next(struct cgroup_task_iter *it) +struct task_struct *css_task_iter_next(struct css_task_iter *it) { struct task_struct *res; struct list_head *l = it->task; @@ -3288,7 +3289,7 @@ struct task_struct *cgroup_task_iter_next(struct cgroup_task_iter *it) * We reached the end of this task list - move on to the * next cgrp_cset_link. 
*/ - cgroup_advance_task_iter(it); + css_advance_task_iter(it); } else { it->task = l; } @@ -3296,12 +3297,12 @@ struct task_struct *cgroup_task_iter_next(struct cgroup_task_iter *it) } /** - * cgroup_task_iter_end - finish task iteration + * css_task_iter_end - finish task iteration * @it: the task iterator to finish * - * Finish task iteration started by cgroup_task_iter_start(). + * Finish task iteration started by css_task_iter_start(). */ -void cgroup_task_iter_end(struct cgroup_task_iter *it) +void css_task_iter_end(struct css_task_iter *it) __releases(css_set_lock) { read_unlock(&css_set_lock); @@ -3342,24 +3343,24 @@ static inline int started_after(void *p1, void *p2) } /** - * cgroup_scan_tasks - iterate though all the tasks in a cgroup - * @cgrp: the cgroup to iterate tasks of + * css_scan_tasks - iterate though all the tasks in a css + * @css: the css to iterate tasks of * @test: optional test callback * @process: process callback * @data: data passed to @test and @process * @heap: optional pre-allocated heap used for task iteration * - * Iterate through all the tasks in a cgroup, calling @test for each, and - * if it returns %true, call @process for it also. + * Iterate through all the tasks in @css, calling @test for each, and if it + * returns %true, call @process for it also. * * @test may be NULL, meaning always true (select all tasks), which - * effectively duplicates cgroup_task_iter_{start,next,end}() but does not + * effectively duplicates css_task_iter_{start,next,end}() but does not * lock css_set_lock for the call to @process. * * It is guaranteed that @process will act on every task that is a member - * of @cgrp for the duration of this call. This function may or may not - * call @process for tasks that exit or move to a different cgroup during - * the call, or are forked or move into the cgroup during the call. + * of @css for the duration of this call. 
This function may or may not + * call @process for tasks that exit or move to a different css during the + * call, or are forked or move into the css during the call. * * Note that @test may be called with locks held, and may in some * situations be called multiple times for the same task, so it should be @@ -3370,13 +3371,13 @@ static inline int started_after(void *p1, void *p2) * temporary heap will be used (allocation of which may cause this function * to fail). */ -int cgroup_scan_tasks(struct cgroup *cgrp, - bool (*test)(struct task_struct *, void *), - void (*process)(struct task_struct *, void *), - void *data, struct ptr_heap *heap) +int css_scan_tasks(struct cgroup_subsys_state *css, + bool (*test)(struct task_struct *, void *), + void (*process)(struct task_struct *, void *), + void *data, struct ptr_heap *heap) { int retval, i; - struct cgroup_task_iter it; + struct css_task_iter it; struct task_struct *p, *dropped; /* Never dereference latest_task, since it's not refcounted */ struct task_struct *latest_task = NULL; @@ -3397,7 +3398,7 @@ int cgroup_scan_tasks(struct cgroup *cgrp, again: /* - * Scan tasks in the cgroup, using the @test callback to determine + * Scan tasks in the css, using the @test callback to determine * which are of interest, and invoking @process callback on the * ones which need an update. Since we don't want to hold any * locks during the task updates, gather tasks to be processed in a @@ -3408,8 +3409,8 @@ int cgroup_scan_tasks(struct cgroup *cgrp, * guarantees forward progress and that we don't miss any tasks. 
*/ heap->size = 0; - cgroup_task_iter_start(cgrp, &it); - while ((p = cgroup_task_iter_next(&it))) { + css_task_iter_start(css, &it); + while ((p = css_task_iter_next(&it))) { /* * Only affect tasks that qualify per the caller's callback, * if he provided one @@ -3442,7 +3443,7 @@ int cgroup_scan_tasks(struct cgroup *cgrp, * the heap and wasn't inserted */ } - cgroup_task_iter_end(&it); + css_task_iter_end(&it); if (heap->size) { for (i = 0; i < heap->size; i++) { @@ -3485,7 +3486,8 @@ static void cgroup_transfer_one_task(struct task_struct *task, void *data) */ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) { - return cgroup_scan_tasks(from, NULL, cgroup_transfer_one_task, to, NULL); + return css_scan_tasks(&from->dummy_css, NULL, cgroup_transfer_one_task, + to, NULL); } /* @@ -3639,7 +3641,7 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type, pid_t *array; int length; int pid, n = 0; /* used for populating the array */ - struct cgroup_task_iter it; + struct css_task_iter it; struct task_struct *tsk; struct cgroup_pidlist *l; @@ -3654,8 +3656,8 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type, if (!array) return -ENOMEM; /* now, populate the array */ - cgroup_task_iter_start(cgrp, &it); - while ((tsk = cgroup_task_iter_next(&it))) { + css_task_iter_start(&cgrp->dummy_css, &it); + while ((tsk = css_task_iter_next(&it))) { if (unlikely(n == length)) break; /* get tgid or pid for procs or tasks file respectively */ @@ -3666,7 +3668,7 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type, if (pid > 0) /* make sure to only use valid results */ array[n++] = pid; } - cgroup_task_iter_end(&it); + css_task_iter_end(&it); length = n; /* now sort & (if procs) strip out duplicates */ sort(array, length, sizeof(pid_t), cmppid, NULL); @@ -3700,7 +3702,7 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) { int ret = -EINVAL; struct cgroup *cgrp; - 
struct cgroup_task_iter it; + struct css_task_iter it; struct task_struct *tsk; /* @@ -3714,8 +3716,8 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) ret = 0; cgrp = dentry->d_fsdata; - cgroup_task_iter_start(cgrp, &it); - while ((tsk = cgroup_task_iter_next(&it))) { + css_task_iter_start(&cgrp->dummy_css, &it); + while ((tsk = css_task_iter_next(&it))) { switch (tsk->state) { case TASK_RUNNING: stats->nr_running++; @@ -3735,7 +3737,7 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) break; } } - cgroup_task_iter_end(&it); + css_task_iter_end(&it); err: return ret; -- cgit v1.2.1 From 81eeaf0411204f52af8ef78ff107cfca2fcfec1d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 8 Aug 2013 20:11:26 -0400 Subject: cgroup: make cftype->[un]register_event() deal with cgroup_subsys_state instead of cgroup cgroup is in the process of converting to css (cgroup_subsys_state) from cgroup as the principal subsystem interface handle. This is mostly to prepare for the unified hierarchy support where css's will be created and destroyed dynamically but also helps cleaning up subsystem implementations as css is usually what they are interested in anyway. cftype->[un]register_event() is among the remaining couple interfaces which still use struct cgroup. Convert it to cgroup_subsys_state. The conversion is mostly mechanical and removes the last users of mem_cgroup_from_cont() and cg_to_vmpressure(), which are removed. v2: indentation update as suggested by Li Zefan. Signed-off-by: Tejun Heo Acked-by: Li Zefan Acked-by: Michal Hocko Cc: Johannes Weiner Cc: Balbir Singh --- kernel/cgroup.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 8c57301d0561..a71f2e0f9711 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -159,9 +159,9 @@ struct css_id { */ struct cgroup_event { /* - * Cgroup which the event belongs to. 
+ * css which the event belongs to. */ - struct cgroup *cgrp; + struct cgroup_subsys_state *css; /* * Control file which the event associated. */ @@ -3955,11 +3955,12 @@ static void cgroup_event_remove(struct work_struct *work) { struct cgroup_event *event = container_of(work, struct cgroup_event, remove); - struct cgroup *cgrp = event->cgrp; + struct cgroup_subsys_state *css = event->css; + struct cgroup *cgrp = css->cgroup; remove_wait_queue(event->wqh, &event->wait); - event->cft->unregister_event(cgrp, event->cft, event->eventfd); + event->cft->unregister_event(css, event->cft, event->eventfd); /* Notify userspace the event is going away. */ eventfd_signal(event->eventfd, 1); @@ -3979,7 +3980,7 @@ static int cgroup_event_wake(wait_queue_t *wait, unsigned mode, { struct cgroup_event *event = container_of(wait, struct cgroup_event, wait); - struct cgroup *cgrp = event->cgrp; + struct cgroup *cgrp = event->css->cgroup; unsigned long flags = (unsigned long)key; if (flags & POLLHUP) { @@ -4048,7 +4049,7 @@ static int cgroup_write_event_control(struct cgroup_subsys_state *css, event = kzalloc(sizeof(*event), GFP_KERNEL); if (!event) return -ENOMEM; - event->cgrp = cgrp; + event->css = css; INIT_LIST_HEAD(&event->list); init_poll_funcptr(&event->pt, cgroup_event_ptable_queue_proc); init_waitqueue_func_entry(&event->wait, cgroup_event_wake); @@ -4099,7 +4100,7 @@ static int cgroup_write_event_control(struct cgroup_subsys_state *css, goto out_put_cfile; } - ret = event->cft->register_event(cgrp, event->cft, + ret = event->cft->register_event(css, event->cft, event->eventfd, buffer); if (ret) goto out_put_cfile; -- cgit v1.2.1 From d99c8727e7bbc01b70e2c57e6127bfab26b868fd Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 8 Aug 2013 20:11:27 -0400 Subject: cgroup: make cgroup_taskset deal with cgroup_subsys_state instead of cgroup cgroup is in the process of converting to css (cgroup_subsys_state) from cgroup as the principal subsystem interface handle. 
This is mostly to prepare for the unified hierarchy support where css's will be created and destroyed dynamically but also helps cleaning up subsystem implementations as css is usually what they are interested in anyway. cgroup_taskset which is used by the subsystem attach methods is the last cgroup subsystem API which isn't using css as the handle. Update cgroup_taskset_cur_cgroup() to cgroup_taskset_cur_css() and cgroup_taskset_for_each() to take @skip_css instead of @skip_cgrp. The conversions are pretty mechanical. One exception is cpuset::cgroup_cs(), which lost its last user and got removed. This patch shouldn't introduce any functional changes. Signed-off-by: Tejun Heo Acked-by: Li Zefan Acked-by: Daniel Wagner Cc: Ingo Molnar Cc: Matt Helsley Cc: Steven Rostedt --- kernel/cgroup.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index a71f2e0f9711..e5bfb2a81dcb 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1907,18 +1907,20 @@ struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset) EXPORT_SYMBOL_GPL(cgroup_taskset_next); /** - * cgroup_taskset_cur_cgroup - return the matching cgroup for the current task + * cgroup_taskset_cur_css - return the matching css for the current task * @tset: taskset of interest + * @subsys_id: the ID of the target subsystem * - * Return the cgroup for the current (last returned) task of @tset. This - * function must be preceded by either cgroup_taskset_first() or - * cgroup_taskset_next(). + * Return the css for the current (last returned) task of @tset for + * subsystem specified by @subsys_id. This function must be preceded by + * either cgroup_taskset_first() or cgroup_taskset_next(). 
*/ -struct cgroup *cgroup_taskset_cur_cgroup(struct cgroup_taskset *tset) +struct cgroup_subsys_state *cgroup_taskset_cur_css(struct cgroup_taskset *tset, + int subsys_id) { - return tset->cur_cgrp; + return cgroup_css(tset->cur_cgrp, subsys_id); } -EXPORT_SYMBOL_GPL(cgroup_taskset_cur_cgroup); +EXPORT_SYMBOL_GPL(cgroup_taskset_cur_css); /** * cgroup_taskset_size - return the number of tasks in taskset -- cgit v1.2.1 From 95109b627ba6a043c181fa5fa45d1c754dd44fbc Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 8 Aug 2013 20:11:27 -0400 Subject: cgroup: unexport cgroup_css() cgroup_css() no longer has any user left outside cgroup.c proper and we don't want subsystems to grow new usages of the function. cgroup core should always provide the css to use to the subsystems, which will make dynamic creation and destruction of css's across the lifetime of a cgroup much more manageable than exposing the cgroup directly to subsystems and let them dereference css's from it. Make cgroup_css() a static function in cgroup.c. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index e5bfb2a81dcb..c02a288a4e3d 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -222,6 +222,19 @@ static int cgroup_destroy_locked(struct cgroup *cgrp); static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], bool is_add); +/** + * cgroup_css - obtain a cgroup's css for the specified subsystem + * @cgrp: the cgroup of interest + * @subsys_id: the subsystem of interest + * + * Return @cgrp's css (cgroup_subsys_state) associated with @subsys_id. 
+ */ +static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp, + int subsys_id) +{ + return cgrp->subsys[subsys_id]; +} + /* convenient tests for these bits */ static inline bool cgroup_is_dead(const struct cgroup *cgrp) { -- cgit v1.2.1 From bd8815a6d802fc16a7a106e170593aa05dc17e72 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 8 Aug 2013 20:11:27 -0400 Subject: cgroup: make css_for_each_descendant() and friends include the origin css in the iteration Previously, all css descendant iterators didn't include the origin (root of subtree) css in the iteration. The reasons were maintaining consistency with css_for_each_child() and that at the time of introduction more use cases needed skipping the origin anyway; however, given that css_is_descendant() considers self to be a descendant, omitting the origin css has become more confusing and looking at the accumulated use cases rather clearly indicates that including origin would result in simpler code overall. While this is a change which can easily lead to subtle bugs, cgroup API including the iterators has recently gone through major restructuring and no out-of-tree changes will be applicable without adjustments making this a relatively acceptable opportunity for this type of change. The conversions are mostly straight-forward. If the iteration block had explicit origin handling before or after, it's moved inside the iteration. If not, if (pos == origin) continue; is added. Some conversions add extra reference get/put around origin handling by consolidating origin handling and the rest. While the extra ref operations aren't strictly necessary, this shouldn't cause any noticeable difference. 
Signed-off-by: Tejun Heo Acked-by: Li Zefan Acked-by: Vivek Goyal Acked-by: Aristeu Rozanski Acked-by: Michal Hocko Cc: Jens Axboe Cc: Matt Helsley Cc: Johannes Weiner Cc: Balbir Singh --- kernel/cgroup.c | 29 +++++++++++------------------ 1 file changed, 11 insertions(+), 18 deletions(-) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index c02a288a4e3d..52f0498db946 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2868,17 +2868,6 @@ static int cgroup_cfts_commit(struct cftype *cfts, bool is_add) mutex_unlock(&cgroup_mutex); - /* @root always needs to be updated */ - inode = root->dentry->d_inode; - mutex_lock(&inode->i_mutex); - mutex_lock(&cgroup_mutex); - ret = cgroup_addrm_files(root, cfts, is_add); - mutex_unlock(&cgroup_mutex); - mutex_unlock(&inode->i_mutex); - - if (ret) - goto out_deact; - /* add/rm files for all cgroups created before */ rcu_read_lock(); css_for_each_descendant_pre(css, cgroup_css(root, ss->subsys_id)) { @@ -2907,7 +2896,6 @@ static int cgroup_cfts_commit(struct cftype *cfts, bool is_add) } rcu_read_unlock(); dput(prev); -out_deact: deactivate_super(sb); return ret; } @@ -3099,7 +3087,8 @@ EXPORT_SYMBOL_GPL(css_next_child); * @root: css whose descendants to walk * * To be used by css_for_each_descendant_pre(). Find the next descendant - * to visit for pre-order traversal of @root's descendants. + * to visit for pre-order traversal of @root's descendants. @root is + * included in the iteration and the first node to be visited. * * While this function requires RCU read locking, it doesn't require the * whole traversal to be contained in a single RCU critical section. 
This @@ -3114,9 +3103,9 @@ css_next_descendant_pre(struct cgroup_subsys_state *pos, WARN_ON_ONCE(!rcu_read_lock_held()); - /* if first iteration, pretend we just visited @root */ + /* if first iteration, visit @root */ if (!pos) - pos = root; + return root; /* visit the first child if exists */ next = css_next_child(NULL, pos); @@ -3186,7 +3175,8 @@ css_leftmost_descendant(struct cgroup_subsys_state *pos) * @root: css whose descendants to walk * * To be used by css_for_each_descendant_post(). Find the next descendant - * to visit for post-order traversal of @root's descendants. + * to visit for post-order traversal of @root's descendants. @root is + * included in the iteration and the last node to be visited. * * While this function requires RCU read locking, it doesn't require the * whole traversal to be contained in a single RCU critical section. This @@ -3207,14 +3197,17 @@ css_next_descendant_post(struct cgroup_subsys_state *pos, return next != root ? next : NULL; } + /* if we visited @root, we're done */ + if (pos == root) + return NULL; + /* if there's an unvisited sibling, visit its leftmost descendant */ next = css_next_child(pos, css_parent(pos)); if (next) return css_leftmost_descendant(next); /* no sibling left, visit parent */ - next = css_parent(pos); - return next != root ? next : NULL; + return css_parent(pos); } EXPORT_SYMBOL_GPL(css_next_descendant_post); -- cgit v1.2.1 From 40e93b39cd5b6a347333a95152ce37deef37bbd0 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 13 Aug 2013 11:01:53 -0400 Subject: cgroup: always use cgroup_css() cgroup_css() is the accessor for cgroup->subsys[] but is not used consistently. cgroup->subsys[] will become RCU protected and cgroup_css() will grow synchronization sanity checks. In preparation, make all cgroup->subsys[] dereferences use cgroup_css() consistently. This patch doesn't introduce any functional difference. 
Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 58 ++++++++++++++++++++++++++++----------------------------- 1 file changed, 29 insertions(+), 29 deletions(-) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 52f0498db946..49ad96ee08e1 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -574,7 +574,7 @@ static struct css_set *find_existing_css_set(struct css_set *old_cset, /* Subsystem is in this hierarchy. So we want * the subsystem state from the new * cgroup */ - template[i] = cgrp->subsys[i]; + template[i] = cgroup_css(cgrp, i); } else { /* Subsystem is not in this hierarchy, so we * don't want to change the subsystem state */ @@ -871,7 +871,7 @@ static void cgroup_free_fn(struct work_struct *work) * Release the subsystem state objects. */ for_each_root_subsys(cgrp->root, ss) { - struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; + struct cgroup_subsys_state *css = cgroup_css(cgrp, ss->subsys_id); ss->css_free(css); } @@ -1067,27 +1067,27 @@ static int rebind_subsystems(struct cgroupfs_root *root, if (bit & added_mask) { /* We're binding this subsystem to this hierarchy */ - BUG_ON(cgrp->subsys[i]); - BUG_ON(!cgroup_dummy_top->subsys[i]); - BUG_ON(cgroup_dummy_top->subsys[i]->cgroup != cgroup_dummy_top); + BUG_ON(cgroup_css(cgrp, i)); + BUG_ON(!cgroup_css(cgroup_dummy_top, i)); + BUG_ON(cgroup_css(cgroup_dummy_top, i)->cgroup != cgroup_dummy_top); cgrp->subsys[i] = cgroup_dummy_top->subsys[i]; - cgrp->subsys[i]->cgroup = cgrp; + cgroup_css(cgrp, i)->cgroup = cgrp; list_move(&ss->sibling, &root->subsys_list); ss->root = root; if (ss->bind) - ss->bind(cgrp->subsys[i]); + ss->bind(cgroup_css(cgrp, i)); /* refcount was already taken, and we're keeping it */ root->subsys_mask |= bit; } else if (bit & removed_mask) { /* We're removing this subsystem */ - BUG_ON(cgrp->subsys[i] != cgroup_dummy_top->subsys[i]); - BUG_ON(cgrp->subsys[i]->cgroup != cgrp); + BUG_ON(cgroup_css(cgrp, i) != 
cgroup_css(cgroup_dummy_top, i)); + BUG_ON(cgroup_css(cgrp, i)->cgroup != cgrp); if (ss->bind) - ss->bind(cgroup_dummy_top->subsys[i]); - cgroup_dummy_top->subsys[i]->cgroup = cgroup_dummy_top; + ss->bind(cgroup_css(cgroup_dummy_top, i)); + cgroup_css(cgroup_dummy_top, i)->cgroup = cgroup_dummy_top; cgrp->subsys[i] = NULL; cgroup_subsys[i]->root = &cgroup_dummy_root; list_move(&ss->sibling, &cgroup_dummy_root.subsys_list); @@ -2072,7 +2072,7 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk, * step 1: check that we can legitimately attach to the cgroup. */ for_each_root_subsys(root, ss) { - struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; + struct cgroup_subsys_state *css = cgroup_css(cgrp, ss->subsys_id); if (ss->can_attach) { retval = ss->can_attach(css, &tset); @@ -2114,7 +2114,7 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk, * step 4: do subsystem attach callbacks. */ for_each_root_subsys(root, ss) { - struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; + struct cgroup_subsys_state *css = cgroup_css(cgrp, ss->subsys_id); if (ss->attach) ss->attach(css, &tset); @@ -2136,7 +2136,7 @@ out_put_css_set_refs: out_cancel_attach: if (retval) { for_each_root_subsys(root, ss) { - struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; + struct cgroup_subsys_state *css = cgroup_css(cgrp, ss->subsys_id); if (ss == failed_ss) break; @@ -2308,7 +2308,7 @@ static struct cgroup_subsys_state *cgroup_file_css(struct cfent *cfe) struct cgroup *cgrp = __d_cgrp(cfe->dentry->d_parent); if (cft->ss) - return cgrp->subsys[cft->ss->subsys_id]; + return cgroup_css(cgrp, cft->ss->subsys_id); return &cgrp->dummy_css; } @@ -4241,7 +4241,7 @@ static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask) /* This cgroup is ready now */ for_each_root_subsys(cgrp->root, ss) { - struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; + struct cgroup_subsys_state *css = 
cgroup_css(cgrp, ss->subsys_id); struct css_id *id = rcu_dereference_protected(css->id, true); /* @@ -4285,7 +4285,7 @@ static void init_cgroup_css(struct cgroup_subsys_state *css, css->id = NULL; if (cgrp == cgroup_dummy_top) css->flags |= CSS_ROOT; - BUG_ON(cgrp->subsys[ss->subsys_id]); + BUG_ON(cgroup_css(cgrp, ss->subsys_id)); cgrp->subsys[ss->subsys_id] = css; /* @@ -4300,7 +4300,7 @@ static void init_cgroup_css(struct cgroup_subsys_state *css, /* invoke ->css_online() on a new CSS and mark it online if successful */ static int online_css(struct cgroup_subsys *ss, struct cgroup *cgrp) { - struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; + struct cgroup_subsys_state *css = cgroup_css(cgrp, ss->subsys_id); int ret = 0; lockdep_assert_held(&cgroup_mutex); @@ -4315,7 +4315,7 @@ static int online_css(struct cgroup_subsys *ss, struct cgroup *cgrp) /* if the CSS is online, invoke ->css_offline() on it and mark it offline */ static void offline_css(struct cgroup_subsys *ss, struct cgroup *cgrp) { - struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; + struct cgroup_subsys_state *css = cgroup_css(cgrp, ss->subsys_id); lockdep_assert_held(&cgroup_mutex); @@ -4400,7 +4400,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, for_each_root_subsys(root, ss) { struct cgroup_subsys_state *css; - css = ss->css_alloc(parent->subsys[ss->subsys_id]); + css = ss->css_alloc(cgroup_css(parent, ss->subsys_id)); if (IS_ERR(css)) { err = PTR_ERR(css); goto err_free_all; @@ -4477,7 +4477,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, err_free_all: for_each_root_subsys(root, ss) { - struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; + struct cgroup_subsys_state *css = cgroup_css(cgrp, ss->subsys_id); if (css) { percpu_ref_cancel_init(&css->refcnt); @@ -4590,7 +4590,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) */ atomic_set(&cgrp->css_kill_cnt, 1); for_each_root_subsys(cgrp->root, ss) 
{ - struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; + struct cgroup_subsys_state *css = cgroup_css(cgrp, ss->subsys_id); /* * Killing would put the base ref, but we need to keep it @@ -4676,7 +4676,7 @@ static void cgroup_offline_fn(struct work_struct *work) * destruction happens only after all css's are released. */ for_each_root_subsys(cgrp->root, ss) - css_put(cgrp->subsys[ss->subsys_id]); + css_put(cgroup_css(cgrp, ss->subsys_id)); /* delete this cgroup from parent->children */ list_del_rcu(&cgrp->sibling); @@ -4741,7 +4741,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) /* Create the top cgroup state for this subsystem */ list_add(&ss->sibling, &cgroup_dummy_root.subsys_list); ss->root = &cgroup_dummy_root; - css = ss->css_alloc(cgroup_dummy_top->subsys[ss->subsys_id]); + css = ss->css_alloc(cgroup_css(cgroup_dummy_top, ss->subsys_id)); /* We don't handle early failures gracefully */ BUG_ON(IS_ERR(css)); init_cgroup_css(css, ss, cgroup_dummy_top); @@ -4820,7 +4820,7 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) * struct, so this can happen first (i.e. before the dummy root * attachment). */ - css = ss->css_alloc(cgroup_dummy_top->subsys[ss->subsys_id]); + css = ss->css_alloc(cgroup_css(cgroup_dummy_top, ss->subsys_id)); if (IS_ERR(css)) { /* failure case - need to deassign the cgroup_subsys[] slot. */ cgroup_subsys[ss->subsys_id] = NULL; @@ -4936,7 +4936,7 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss) * the cgrp->subsys pointer to find their state. note that this * also takes care of freeing the css_id. 
*/ - ss->css_free(cgroup_dummy_top->subsys[ss->subsys_id]); + ss->css_free(cgroup_css(cgroup_dummy_top, ss->subsys_id)); cgroup_dummy_top->subsys[ss->subsys_id] = NULL; mutex_unlock(&cgroup_mutex); @@ -5562,8 +5562,8 @@ static int alloc_css_id(struct cgroup_subsys *ss, struct cgroup *parent, struct css_id *child_id, *parent_id; subsys_id = ss->subsys_id; - parent_css = parent->subsys[subsys_id]; - child_css = child->subsys[subsys_id]; + parent_css = cgroup_css(parent, subsys_id); + child_css = cgroup_css(child, subsys_id); parent_id = rcu_dereference_protected(parent_css->id, true); depth = parent_id->depth + 1; @@ -5624,7 +5624,7 @@ struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id) /* get cgroup */ cgrp = __d_cgrp(f->f_dentry); - css = cgrp->subsys[id]; + css = cgroup_css(cgrp, id); return css ? css : ERR_PTR(-ENOENT); } -- cgit v1.2.1 From 35ef10da65d43211f4cd7e7822cbb3becdfc0ae1 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 13 Aug 2013 11:01:54 -0400 Subject: cgroup: rename cgroup_subsys_state->dput_work and its callback function css (cgroup_subsys_state) will become RCU protected and there will be two stages which require punting to work item during release. To prepare for using the work item for multiple times, rename css->dput_work to css->destroy_work and css_dput_fn() to css_free_work_fn() and move work item initialization from css init to right before the actual usage. This reorganization doesn't introduce any behavior change. 
Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 49ad96ee08e1..0b280978f097 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4259,10 +4259,10 @@ err: return ret; } -static void css_dput_fn(struct work_struct *work) +static void css_free_work_fn(struct work_struct *work) { struct cgroup_subsys_state *css = - container_of(work, struct cgroup_subsys_state, dput_work); + container_of(work, struct cgroup_subsys_state, destroy_work); cgroup_dput(css->cgroup); } @@ -4272,7 +4272,14 @@ static void css_release(struct percpu_ref *ref) struct cgroup_subsys_state *css = container_of(ref, struct cgroup_subsys_state, refcnt); - schedule_work(&css->dput_work); + /* + * css holds an extra ref to @cgrp->dentry which is put on the last + * css_put(). dput() requires process context, which css_put() may + * be called without. @css->destroy_work will be used to invoke + * dput() asynchronously from css_put(). + */ + INIT_WORK(&css->destroy_work, css_free_work_fn); + schedule_work(&css->destroy_work); } static void init_cgroup_css(struct cgroup_subsys_state *css, @@ -4287,14 +4294,6 @@ static void init_cgroup_css(struct cgroup_subsys_state *css, css->flags |= CSS_ROOT; BUG_ON(cgroup_css(cgrp, ss->subsys_id)); cgrp->subsys[ss->subsys_id] = css; - - /* - * css holds an extra ref to @cgrp->dentry which is put on the last - * css_put(). dput() requires process context, which css_put() may - * be called without. @css->dput_work will be used to invoke - * dput() asynchronously from css_put(). 
- */ - INIT_WORK(&css->dput_work, css_dput_fn); } /* invoke ->css_online() on a new CSS and mark it online if successful */ -- cgit v1.2.1 From 0ae78e0bf10ac38ab53548e18383afc9997eca22 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 13 Aug 2013 11:01:54 -0400 Subject: cgroup: add cgroup_subsys_state->parent With the planned unified hierarchy, css's (cgroup_subsys_state) will be RCU protected and allowed to be attached and detached dynamically over the course of a cgroup's lifetime. This means that css's will stay accessible after being detached from its cgroup - the matching pointer in cgroup->subsys[] cleared - for ref draining and RCU grace period. cgroup core still wants to guarantee that the parent css is never destroyed before its children and css_parent() always returns the parent regardless of the state of the child css as long as it's accessible. This patch makes css's hold onto their parents and adds css->parent so that the parent css is never destroyed before its children and can be determined without consulting the cgroups. cgroup->dummy_css is also updated to point to the parent dummy_css; however, it doesn't need to worry about object lifetime as the parent cgroup is already pinned by the child.
Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 0b280978f097..5c6dd7ed26a7 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4264,6 +4264,9 @@ static void css_free_work_fn(struct work_struct *work) struct cgroup_subsys_state *css = container_of(work, struct cgroup_subsys_state, destroy_work); + if (css->parent) + css_put(css->parent); + cgroup_dput(css->cgroup); } @@ -4290,8 +4293,12 @@ static void init_cgroup_css(struct cgroup_subsys_state *css, css->ss = ss; css->flags = 0; css->id = NULL; - if (cgrp == cgroup_dummy_top) + + if (cgrp->parent) + css->parent = cgroup_css(cgrp->parent, ss->subsys_id); + else css->flags |= CSS_ROOT; + BUG_ON(cgroup_css(cgrp, ss->subsys_id)); cgrp->subsys[ss->subsys_id] = css; } @@ -4388,6 +4395,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, cgrp->dentry = dentry; cgrp->parent = parent; + cgrp->dummy_css.parent = &parent->dummy_css; cgrp->root = parent->root; if (notify_on_release(parent)) @@ -4436,9 +4444,13 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children); root->number_of_cgroups++; - /* each css holds a ref to the cgroup's dentry */ - for_each_root_subsys(root, ss) + /* each css holds a ref to the cgroup's dentry and the parent css */ + for_each_root_subsys(root, ss) { + struct cgroup_subsys_state *css = cgroup_css(cgrp, ss->subsys_id); + dget(dentry); + percpu_ref_get(&css->parent->refcnt); + } /* hold a ref to the parent's dentry */ dget(parent->dentry); -- cgit v1.2.1 From b77d7b6088377998ebf65eaea5e51008c2d75e94 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 13 Aug 2013 11:01:54 -0400 Subject: cgroup: cgroup_css_from_dir() now should be called with RCU read locked cgroup->subsys[] will become RCU protected and thus all cgroup_css() 
usages should either be under RCU read lock or cgroup_mutex. This patch updates cgroup_css_from_dir() which returns the matching cgroup_subsys_state given a directory file and subsys_id so that it requires RCU read lock and updates its sole user perf_cgroup_connect(). Signed-off-by: Tejun Heo Acked-by: Li Zefan Cc: Steven Rostedt Cc: Frederic Weisbecker Cc: Ingo Molnar --- kernel/cgroup.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 5c6dd7ed26a7..cbb6314f1836 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -5616,8 +5616,14 @@ struct cgroup_subsys_state *css_lookup(struct cgroup_subsys *ss, int id) } EXPORT_SYMBOL_GPL(css_lookup); -/* - * get corresponding css from file open on cgroupfs directory +/** + * cgroup_css_from_dir - get corresponding css from file open on cgroup dir + * @f: directory file of interest + * @id: subsystem id of interest + * + * Must be called under RCU read lock. The caller is responsible for + * pinning the returned css if it needs to be accessed outside the RCU + * critical section. */ struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id) { @@ -5625,6 +5631,8 @@ struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id) struct inode *inode; struct cgroup_subsys_state *css; + WARN_ON_ONCE(!rcu_read_lock_held()); + inode = file_inode(f); /* check in cgroup filesystem dir */ if (inode->i_op != &cgroup_dir_inode_operations) -- cgit v1.2.1 From 105347ba5da3e87facce2337c50cd5df93cc6bec Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 13 Aug 2013 11:01:55 -0400 Subject: cgroup: make cgroup_file_open() rcu_read_lock() around cgroup_css() and add cfent->css For the planned unified hierarchy, each css (cgroup_subsys_state) will be RCU protected so that it can be created and destroyed individually while allowing RCU accesses, and cgroup_css() will soon require either holding cgroup_mutex or RCU read lock. 
This patch updates cgroup_file_open() such that it acquires the associated css under rcu_read_lock(). While cgroup_file_css() usages in other file operations are safe due to the reference from open, cgroup_css() wouldn't know that and will still trigger warnings. It'd be cleanest to store the acquired css in file->private_data for further file operations but that's already used by seqfile. This patch instead adds cfent->css to cache the associated css. Note that while this field is initialized during cfe init, it should only be considered valid while the file is open. This patch doesn't change visible behavior. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 48 +++++++++++++++++++++++++++++++----------------- 1 file changed, 31 insertions(+), 17 deletions(-) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index cbb6314f1836..d63beffd41e1 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -117,6 +117,7 @@ struct cfent { struct list_head node; struct dentry *dentry; struct cftype *type; + struct cgroup_subsys_state *css; /* file xattrs */ struct simple_xattrs xattrs; @@ -2301,17 +2302,6 @@ static int cgroup_sane_behavior_show(struct cgroup_subsys_state *css, return 0; } -/* return the css for the given cgroup file */ -static struct cgroup_subsys_state *cgroup_file_css(struct cfent *cfe) -{ - struct cftype *cft = cfe->type; - struct cgroup *cgrp = __d_cgrp(cfe->dentry->d_parent); - - if (cft->ss) - return cgroup_css(cgrp, cft->ss->subsys_id); - return &cgrp->dummy_css; -} - /* A buffer size big enough for numbers or short strings */ #define CGROUP_LOCAL_BUFFER_SIZE 64 @@ -2388,7 +2378,7 @@ static ssize_t cgroup_file_write(struct file *file, const char __user *buf, { struct cfent *cfe = __d_cfe(file->f_dentry); struct cftype *cft = __d_cft(file->f_dentry); - struct cgroup_subsys_state *css = cgroup_file_css(cfe); + struct cgroup_subsys_state *css = cfe->css; if (cft->write) return cft->write(css, cft, file, buf, 
nbytes, ppos); @@ -2430,7 +2420,7 @@ static ssize_t cgroup_file_read(struct file *file, char __user *buf, { struct cfent *cfe = __d_cfe(file->f_dentry); struct cftype *cft = __d_cft(file->f_dentry); - struct cgroup_subsys_state *css = cgroup_file_css(cfe); + struct cgroup_subsys_state *css = cfe->css; if (cft->read) return cft->read(css, cft, file, buf, nbytes, ppos); @@ -2456,7 +2446,7 @@ static int cgroup_seqfile_show(struct seq_file *m, void *arg) { struct cfent *cfe = m->private; struct cftype *cft = cfe->type; - struct cgroup_subsys_state *css = cgroup_file_css(cfe); + struct cgroup_subsys_state *css = cfe->css; if (cft->read_map) { struct cgroup_map_cb cb = { @@ -2479,7 +2469,8 @@ static int cgroup_file_open(struct inode *inode, struct file *file) { struct cfent *cfe = __d_cfe(file->f_dentry); struct cftype *cft = __d_cft(file->f_dentry); - struct cgroup_subsys_state *css = cgroup_file_css(cfe); + struct cgroup *cgrp = __d_cgrp(cfe->dentry->d_parent); + struct cgroup_subsys_state *css; int err; err = generic_file_open(inode, file); @@ -2491,7 +2482,18 @@ static int cgroup_file_open(struct inode *inode, struct file *file) * unpinned either on open failure or release. This ensures that * @css stays alive for all file operations. 
*/ - if (css->ss && !css_tryget(css)) + rcu_read_lock(); + if (cft->ss) { + css = cgroup_css(cgrp, cft->ss->subsys_id); + if (!css_tryget(css)) + css = NULL; + } else { + css = &cgrp->dummy_css; + } + rcu_read_unlock(); + + /* css should match @cfe->css, see cgroup_add_file() for details */ + if (!css || WARN_ON_ONCE(css != cfe->css)) return -ENODEV; if (cft->read_map || cft->read_seq_string) { @@ -2510,7 +2512,7 @@ static int cgroup_file_release(struct inode *inode, struct file *file) { struct cfent *cfe = __d_cfe(file->f_dentry); struct cftype *cft = __d_cft(file->f_dentry); - struct cgroup_subsys_state *css = cgroup_file_css(cfe); + struct cgroup_subsys_state *css = cfe->css; int ret = 0; if (cft->release) @@ -2772,6 +2774,18 @@ static int cgroup_add_file(struct cgroup *cgrp, struct cftype *cft) dentry->d_fsdata = cfe; simple_xattrs_init(&cfe->xattrs); + /* + * cfe->css is used by read/write/close to determine the associated + * css. file->private_data would be a better place but that's + * already used by seqfile. Note that open will use the usual + * cgroup_css() and css_tryget() to acquire the css and this + * caching doesn't affect css lifetime management. + */ + if (cft->ss) + cfe->css = cgroup_css(cgrp, cft->ss->subsys_id); + else + cfe->css = &cgrp->dummy_css; + mode = cgroup_file_mode(cft); error = cgroup_create_file(dentry, mode | S_IFREG, cgrp->root->sb); if (!error) { -- cgit v1.2.1 From 73e80ed8007fc48a6deeb295ba37159fad274bd2 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 13 Aug 2013 11:01:55 -0400 Subject: cgroup: add __rcu modifier to cgroup->subsys[] For the planned unified hierarchy, each css (cgroup_subsys_state) will be RCU protected so that it can be created and destroyed individually while allowing RCU accesses. Previous changes ensured that all cgroup->subsys[] accesses use the cgroup_css() accessor. 
This patch adds __rcu modifier to cgroup->subsys[], add matching RCU dereference in cgroup_css() and convert all assignments to either rcu_assign_pointer() or RCU_INIT_POINTER(). This change prepares for the actual RCUfication of css's and doesn't introduce any visible behavior change. The conversion is verified with sparse and all accesses are properly RCU annotated. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index d63beffd41e1..c27101622567 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -229,11 +229,16 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], * @subsys_id: the subsystem of interest * * Return @cgrp's css (cgroup_subsys_state) associated with @subsys_id. + * This function must be called either under cgroup_mutex or + * rcu_read_lock() and the caller is responsible for pinning the returned + * css if it wants to keep accessing it outside the said locks. This + * function may return %NULL if @cgrp doesn't have @subsys_id enabled. 
*/ static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp, int subsys_id) { - return cgrp->subsys[subsys_id]; + return rcu_dereference_check(cgrp->subsys[subsys_id], + lockdep_is_held(&cgroup_mutex)); } /* convenient tests for these bits */ @@ -1072,8 +1077,10 @@ static int rebind_subsystems(struct cgroupfs_root *root, BUG_ON(!cgroup_css(cgroup_dummy_top, i)); BUG_ON(cgroup_css(cgroup_dummy_top, i)->cgroup != cgroup_dummy_top); - cgrp->subsys[i] = cgroup_dummy_top->subsys[i]; + rcu_assign_pointer(cgrp->subsys[i], + cgroup_css(cgroup_dummy_top, i)); cgroup_css(cgrp, i)->cgroup = cgrp; + list_move(&ss->sibling, &root->subsys_list); ss->root = root; if (ss->bind) @@ -1088,8 +1095,10 @@ static int rebind_subsystems(struct cgroupfs_root *root, if (ss->bind) ss->bind(cgroup_css(cgroup_dummy_top, i)); + cgroup_css(cgroup_dummy_top, i)->cgroup = cgroup_dummy_top; - cgrp->subsys[i] = NULL; + RCU_INIT_POINTER(cgrp->subsys[i], NULL); + cgroup_subsys[i]->root = &cgroup_dummy_root; list_move(&ss->sibling, &cgroup_dummy_root.subsys_list); @@ -4314,7 +4323,7 @@ static void init_cgroup_css(struct cgroup_subsys_state *css, css->flags |= CSS_ROOT; BUG_ON(cgroup_css(cgrp, ss->subsys_id)); - cgrp->subsys[ss->subsys_id] = css; + rcu_assign_pointer(cgrp->subsys[ss->subsys_id], css); } /* invoke ->css_online() on a new CSS and mark it online if successful */ @@ -4962,7 +4971,7 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss) * also takes care of freeing the css_id. */ ss->css_free(cgroup_css(cgroup_dummy_top, ss->subsys_id)); - cgroup_dummy_top->subsys[ss->subsys_id] = NULL; + RCU_INIT_POINTER(cgroup_dummy_top->subsys[ss->subsys_id], NULL); mutex_unlock(&cgroup_mutex); } -- cgit v1.2.1 From 623f926b050e12b0f5e3a2f4d11c36e4ddd63541 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 13 Aug 2013 11:01:55 -0400 Subject: cgroup: reorganize css init / exit paths css (cgroup_subsys_state) lifetime management is about to be restructured. 
In preparation, make the following mostly trivial changes. * init_cgroup_css() is renamed to init_css() so that it's consistent with other css handling functions. * alloc_css_id(), online_css() and offline_css() updated to take @css instead of cgroups and subsys IDs. This patch doesn't make any functional changes. v2: v1 merged two for_each_root_subsys() loops in cgroup_create() but Li Zefan pointed out that it breaks error path. Dropped. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 50 +++++++++++++++++++++++--------------------------- 1 file changed, 23 insertions(+), 27 deletions(-) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index c27101622567..a1ebc445f350 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -838,8 +838,7 @@ static struct backing_dev_info cgroup_backing_dev_info = { .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, }; -static int alloc_css_id(struct cgroup_subsys *ss, - struct cgroup *parent, struct cgroup *child); +static int alloc_css_id(struct cgroup_subsys_state *child_css); static struct inode *cgroup_new_inode(umode_t mode, struct super_block *sb) { @@ -4308,9 +4307,8 @@ static void css_release(struct percpu_ref *ref) schedule_work(&css->destroy_work); } -static void init_cgroup_css(struct cgroup_subsys_state *css, - struct cgroup_subsys *ss, - struct cgroup *cgrp) +static void init_css(struct cgroup_subsys_state *css, struct cgroup_subsys *ss, + struct cgroup *cgrp) { css->cgroup = cgrp; css->ss = ss; @@ -4327,9 +4325,9 @@ static void init_cgroup_css(struct cgroup_subsys_state *css, } /* invoke ->css_online() on a new CSS and mark it online if successful */ -static int online_css(struct cgroup_subsys *ss, struct cgroup *cgrp) +static int online_css(struct cgroup_subsys_state *css) { - struct cgroup_subsys_state *css = cgroup_css(cgrp, ss->subsys_id); + struct cgroup_subsys *ss = css->ss; int ret = 0; lockdep_assert_held(&cgroup_mutex); @@ -4342,9 +4340,9 @@ static int online_css(struct 
cgroup_subsys *ss, struct cgroup *cgrp) } /* if the CSS is online, invoke ->css_offline() on it and mark it offline */ -static void offline_css(struct cgroup_subsys *ss, struct cgroup *cgrp) +static void offline_css(struct cgroup_subsys_state *css) { - struct cgroup_subsys_state *css = cgroup_css(cgrp, ss->subsys_id); + struct cgroup_subsys *ss = css->ss; lockdep_assert_held(&cgroup_mutex); @@ -4442,10 +4440,10 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, goto err_free_all; } - init_cgroup_css(css, ss, cgrp); + init_css(css, ss, cgrp); if (ss->use_id) { - err = alloc_css_id(ss, parent, cgrp); + err = alloc_css_id(css); if (err) goto err_free_all; } @@ -4480,7 +4478,9 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, /* creation succeeded, notify subsystems */ for_each_root_subsys(root, ss) { - err = online_css(ss, cgrp); + struct cgroup_subsys_state *css = cgroup_css(cgrp, ss->subsys_id); + + err = online_css(css); if (err) goto err_destroy; @@ -4700,7 +4700,7 @@ static void cgroup_offline_fn(struct work_struct *work) * initate destruction. */ for_each_root_subsys(cgrp->root, ss) - offline_css(ss, cgrp); + offline_css(cgroup_css(cgrp, ss->subsys_id)); /* * Put the css refs from cgroup_destroy_locked(). Each css holds @@ -4778,7 +4778,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) css = ss->css_alloc(cgroup_css(cgroup_dummy_top, ss->subsys_id)); /* We don't handle early failures gracefully */ BUG_ON(IS_ERR(css)); - init_cgroup_css(css, ss, cgroup_dummy_top); + init_css(css, ss, cgroup_dummy_top); /* Update the init_css_set to contain a subsys * pointer to this state - since the subsystem is @@ -4793,7 +4793,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) * need to invoke fork callbacks here. 
*/ BUG_ON(!list_empty(&init_task.tasks)); - BUG_ON(online_css(ss, cgroup_dummy_top)); + BUG_ON(online_css(cgroup_css(cgroup_dummy_top, ss->subsys_id))); mutex_unlock(&cgroup_mutex); @@ -4866,8 +4866,8 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) ss->root = &cgroup_dummy_root; /* our new subsystem will be attached to the dummy hierarchy. */ - init_cgroup_css(css, ss, cgroup_dummy_top); - /* init_idr must be after init_cgroup_css because it sets css->id. */ + init_css(css, ss, cgroup_dummy_top); + /* init_idr must be after init_css() because it sets css->id. */ if (ss->use_id) { ret = cgroup_init_idr(ss, css); if (ret) @@ -4897,7 +4897,7 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) } write_unlock(&css_set_lock); - ret = online_css(ss, cgroup_dummy_top); + ret = online_css(cgroup_css(cgroup_dummy_top, ss->subsys_id)); if (ret) goto err_unload; @@ -4936,7 +4936,7 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss) mutex_lock(&cgroup_mutex); - offline_css(ss, cgroup_dummy_top); + offline_css(cgroup_css(cgroup_dummy_top, ss->subsys_id)); if (ss->use_id) idr_destroy(&ss->idr); @@ -5588,20 +5588,16 @@ static int __init_or_module cgroup_init_idr(struct cgroup_subsys *ss, return 0; } -static int alloc_css_id(struct cgroup_subsys *ss, struct cgroup *parent, - struct cgroup *child) +static int alloc_css_id(struct cgroup_subsys_state *child_css) { - int subsys_id, i, depth = 0; - struct cgroup_subsys_state *parent_css, *child_css; + struct cgroup_subsys_state *parent_css = css_parent(child_css); struct css_id *child_id, *parent_id; + int i, depth; - subsys_id = ss->subsys_id; - parent_css = cgroup_css(parent, subsys_id); - child_css = cgroup_css(child, subsys_id); parent_id = rcu_dereference_protected(parent_css->id, true); depth = parent_id->depth + 1; - child_id = get_new_cssid(ss, depth); + child_id = get_new_cssid(child_css->ss, depth); if (IS_ERR(child_id)) return PTR_ERR(child_id); -- cgit v1.2.1 From 
ae7f164a09408bf21ab3c82a9e80a3ff37aa9e3e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 13 Aug 2013 20:22:50 -0400 Subject: cgroup: move cgroup->subsys[] assignment to online_css() Currently, css (cgroup_subsys_state) lifetime is tied to that of the associated cgroup. With the planned unified hierarchy, css's will be dynamically created and destroyed within the lifetime of a cgroup. To enable such usages, css's will be individually RCU protected instead of being tied to the cgroup. In preparation, this patch moves cgroup->subsys[] assignment from init_css() to online_css(). As this means that a newly initialized css should be remembered separately and that cgroup_css() returns NULL between init and online, cgroup_create() is updated so that it stores newly created css's in a local array css_ar[] and cgroup_init/load_subsys() are updated to use local variable @css instead of using cgroup_css(). This change also slightly simplifies error path of cgroup_create(). While this patch changes when cgroup->subsys[] is initialized, this change isn't visible to subsystems or userland. v2: This patch wasn't updated accordingly after the previous "cgroup: reorganize css init / exit paths" was updated leading to missing a css_ar[] conversion in cgroup_create() and thus boot failure. Fix it. 
Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index a1ebc445f350..b9f736c3b36d 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4321,7 +4321,6 @@ static void init_css(struct cgroup_subsys_state *css, struct cgroup_subsys *ss, css->flags |= CSS_ROOT; BUG_ON(cgroup_css(cgrp, ss->subsys_id)); - rcu_assign_pointer(cgrp->subsys[ss->subsys_id], css); } /* invoke ->css_online() on a new CSS and mark it online if successful */ @@ -4334,8 +4333,10 @@ static int online_css(struct cgroup_subsys_state *css) if (ss->css_online) ret = ss->css_online(css); - if (!ret) + if (!ret) { css->flags |= CSS_ONLINE; + rcu_assign_pointer(css->cgroup->subsys[ss->subsys_id], css); + } return ret; } @@ -4366,6 +4367,7 @@ static void offline_css(struct cgroup_subsys_state *css) static long cgroup_create(struct cgroup *parent, struct dentry *dentry, umode_t mode) { + struct cgroup_subsys_state *css_ar[CGROUP_SUBSYS_COUNT] = { }; struct cgroup *cgrp; struct cgroup_name *name; struct cgroupfs_root *root = parent->root; @@ -4433,12 +4435,11 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, err = PTR_ERR(css); goto err_free_all; } + css_ar[ss->subsys_id] = css; err = percpu_ref_init(&css->refcnt, css_release); - if (err) { - ss->css_free(css); + if (err) goto err_free_all; - } init_css(css, ss, cgrp); @@ -4467,7 +4468,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, /* each css holds a ref to the cgroup's dentry and the parent css */ for_each_root_subsys(root, ss) { - struct cgroup_subsys_state *css = cgroup_css(cgrp, ss->subsys_id); + struct cgroup_subsys_state *css = css_ar[ss->subsys_id]; dget(dentry); percpu_ref_get(&css->parent->refcnt); @@ -4478,7 +4479,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, /* creation succeeded, notify 
subsystems */ for_each_root_subsys(root, ss) { - struct cgroup_subsys_state *css = cgroup_css(cgrp, ss->subsys_id); + struct cgroup_subsys_state *css = css_ar[ss->subsys_id]; err = online_css(css); if (err) @@ -4511,7 +4512,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, err_free_all: for_each_root_subsys(root, ss) { - struct cgroup_subsys_state *css = cgroup_css(cgrp, ss->subsys_id); + struct cgroup_subsys_state *css = css_ar[ss->subsys_id]; if (css) { percpu_ref_cancel_init(&css->refcnt); @@ -4793,7 +4794,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) * need to invoke fork callbacks here. */ BUG_ON(!list_empty(&init_task.tasks)); - BUG_ON(online_css(cgroup_css(cgroup_dummy_top, ss->subsys_id))); + BUG_ON(online_css(css)); mutex_unlock(&cgroup_mutex); @@ -4897,7 +4898,7 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) } write_unlock(&css_set_lock); - ret = online_css(cgroup_css(cgroup_dummy_top, ss->subsys_id)); + ret = online_css(css); if (ret) goto err_unload; -- cgit v1.2.1 From 223dbc38d2a8745a93749dc75ed909e274ce075d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 13 Aug 2013 20:22:50 -0400 Subject: cgroup: bounce cgroup_subsys_state ref kill confirmation to a work item css (cgroup_subsys_state) offlining, which requires process context, will be moved to ref kill confirmation. In preparation, bounce css_killed handling through css->destroy_work. css_ref_killed_fn() is renamed to css_killed_ref_fn() so that it's consistent with the new css_killed_work_fn(). This patch adds an additional work item bouncing but doesn't change the actual logic. 
Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index b9f736c3b36d..398ffbbee32f 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4555,12 +4555,27 @@ static void cgroup_css_killed(struct cgroup *cgrp) schedule_work(&cgrp->destroy_work); } -static void css_ref_killed_fn(struct percpu_ref *ref) +/* + * This is called when the refcnt of a css is confirmed to be killed. + * css_tryget() is now guaranteed to fail. + */ +static void css_killed_work_fn(struct work_struct *work) +{ + struct cgroup_subsys_state *css = + container_of(work, struct cgroup_subsys_state, destroy_work); + struct cgroup *cgrp = css->cgroup; + + cgroup_css_killed(cgrp); +} + +/* css kill confirmation processing requires process context, bounce */ +static void css_killed_ref_fn(struct percpu_ref *ref) { struct cgroup_subsys_state *css = container_of(ref, struct cgroup_subsys_state, refcnt); - cgroup_css_killed(css->cgroup); + INIT_WORK(&css->destroy_work, css_killed_work_fn); + schedule_work(&css->destroy_work); } /** @@ -4634,7 +4649,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) percpu_ref_get(&css->refcnt); atomic_inc(&cgrp->css_kill_cnt); - percpu_ref_kill_and_confirm(&css->refcnt, css_ref_killed_fn); + percpu_ref_kill_and_confirm(&css->refcnt, css_killed_ref_fn); } cgroup_css_killed(cgrp); -- cgit v1.2.1 From f20104de55a212a9742d8df1807f1f29dc95b748 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 13 Aug 2013 20:22:50 -0400 Subject: cgroup: replace cgroup->css_kill_cnt with ->nr_css Currently, css (cgroup_subsys_state) lifetime is tied to that of the associated cgroup. With the planned unified hierarchy, css's will be dynamically created and destroyed within the lifetime of a cgroup. To enable such usages, css's will be individually RCU protected instead of being tied to the cgroup. 
cgroup->css_kill_cnt is used during cgroup destruction to wait for css reference count disable; however, this model doesn't work once css's lifetimes are managed separately from cgroup's. This patch replaces it with cgroup->nr_css which is a cgroup_mutex-protected integer counting the number of attached css's. The count is incremented from online_css() and decremented after refcnt kill is confirmed. If the count reaches zero and the cgroup is marked dead, the second stage of cgroup destruction is kicked off. If a cgroup doesn't have any css attached at the time of rmdir, cgroup_destroy_locked() now invokes the second stage directly as no css kill confirmation would happen. cgroup_offline_fn() - the second step of cgroup destruction - is renamed to cgroup_destroy_css_killed() and now expects to be called with cgroup_mutex held. While this patch changes how css destruction is punted to work items, it shouldn't change any visible behavior. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 52 ++++++++++++++++++++++++++++------------------------ 1 file changed, 28 insertions(+), 24 deletions(-) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 398ffbbee32f..174f4c3d72ef 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -218,7 +218,7 @@ static int need_forkexit_callback __read_mostly; static struct cftype cgroup_base_files[]; -static void cgroup_offline_fn(struct work_struct *work); +static void cgroup_destroy_css_killed(struct cgroup *cgrp); static int cgroup_destroy_locked(struct cgroup *cgrp); static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], bool is_add); @@ -4335,6 +4335,7 @@ static int online_css(struct cgroup_subsys_state *css) ret = ss->css_online(css); if (!ret) { css->flags |= CSS_ONLINE; + css->cgroup->nr_css++; rcu_assign_pointer(css->cgroup->subsys[ss->subsys_id], css); } return ret; @@ -4545,16 +4546,6 @@ static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t
mode) return cgroup_create(c_parent, dentry, mode | S_IFDIR); } -static void cgroup_css_killed(struct cgroup *cgrp) -{ - if (!atomic_dec_and_test(&cgrp->css_kill_cnt)) - return; - - /* percpu ref's of all css's are killed, kick off the next step */ - INIT_WORK(&cgrp->destroy_work, cgroup_offline_fn); - schedule_work(&cgrp->destroy_work); -} - /* * This is called when the refcnt of a css is confirmed to be killed. * css_tryget() is now guaranteed to fail. @@ -4565,7 +4556,17 @@ static void css_killed_work_fn(struct work_struct *work) container_of(work, struct cgroup_subsys_state, destroy_work); struct cgroup *cgrp = css->cgroup; - cgroup_css_killed(cgrp); + mutex_lock(&cgroup_mutex); + + /* + * If @cgrp is marked dead, it's waiting for refs of all css's to + * be disabled before proceeding to the second phase of cgroup + * destruction. If we are the last one, kick it off. + */ + if (!--cgrp->nr_css && cgroup_is_dead(cgrp)) + cgroup_destroy_css_killed(cgrp); + + mutex_unlock(&cgroup_mutex); } /* css kill confirmation processing requires process context, bounce */ @@ -4634,11 +4635,10 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) * Use percpu_ref_kill_and_confirm() to get notifications as each * css is confirmed to be seen as killed on all CPUs. The * notification callback keeps track of the number of css's to be - * killed and schedules cgroup_offline_fn() to perform the rest of - * destruction once the percpu refs of all css's are confirmed to - * be killed. + * killed and invokes cgroup_destroy_css_killed() to perform the + * rest of destruction once the percpu refs of all css's are + * confirmed to be killed. 
*/ - atomic_set(&cgrp->css_kill_cnt, 1); for_each_root_subsys(cgrp->root, ss) { struct cgroup_subsys_state *css = cgroup_css(cgrp, ss->subsys_id); @@ -4648,10 +4648,8 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) */ percpu_ref_get(&css->refcnt); - atomic_inc(&cgrp->css_kill_cnt); percpu_ref_kill_and_confirm(&css->refcnt, css_killed_ref_fn); } - cgroup_css_killed(cgrp); /* * Mark @cgrp dead. This prevents further task migration and child @@ -4668,6 +4666,15 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) list_del_init(&cgrp->release_list); raw_spin_unlock(&release_list_lock); + /* + * If @cgrp has css's attached, the second stage of cgroup + * destruction is kicked off from css_killed_work_fn() after the + * refs of all attached css's are killed. If @cgrp doesn't have + * any css, we kick it off here. + */ + if (!cgrp->nr_css) + cgroup_destroy_css_killed(cgrp); + /* * Clear and remove @cgrp directory. The removal puts the base ref * but we aren't quite done with @cgrp yet, so hold onto it. @@ -4693,7 +4700,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) }; /** - * cgroup_offline_fn - the second step of cgroup destruction + * cgroup_destroy_css_killed - the second step of cgroup destruction * @work: cgroup->destroy_free_work * * This function is invoked from a work item for a cgroup which is being @@ -4702,14 +4709,13 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) * is the second step of destruction described in the comment above * cgroup_destroy_locked(). */ -static void cgroup_offline_fn(struct work_struct *work) +static void cgroup_destroy_css_killed(struct cgroup *cgrp) { - struct cgroup *cgrp = container_of(work, struct cgroup, destroy_work); struct cgroup *parent = cgrp->parent; struct dentry *d = cgrp->dentry; struct cgroup_subsys *ss; - mutex_lock(&cgroup_mutex); + lockdep_assert_held(&cgroup_mutex); /* * css_tryget() is guaranteed to fail now. 
Tell subsystems to @@ -4743,8 +4749,6 @@ static void cgroup_offline_fn(struct work_struct *work) set_bit(CGRP_RELEASABLE, &parent->flags); check_for_release(parent); - - mutex_unlock(&cgroup_mutex); } static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) -- cgit v1.2.1 From 09a503ea3a816b285b0b402b7f785eaec0c7a7e1 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 13 Aug 2013 20:22:50 -0400 Subject: cgroup: decouple cgroup_subsys_state destruction from cgroup destruction Currently, css (cgroup_subsys_state) lifetime is tied to that of the associated cgroup. css's are created when the associated cgroup is created and destroyed when it gets destroyed. Also, individual css's aren't RCU protected but the whole cgroup is. With the planned unified hierarchy, css's will need to be dynamically created and destroyed within the lifetime of a cgroup. To enable such usages, this patch decouples css destruction from cgroup destruction - offline_css() invocation and the final css_put() are moved from cgroup_destroy_css_killed() to css_killed_work_fn(). Now each css is individually offlined and put as its reference count is killed instead of waiting for all css's attached to the cgroup to finish refcnt killing and then proceeding to offlining and putting them together. While this changes the order of destruction operations, the changes shouldn't be noticeable to cgroup subsystems or userland. 
Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 52 ++++++++++++++++++++++++---------------------------- 1 file changed, 24 insertions(+), 28 deletions(-) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 174f4c3d72ef..3c4c4b01ffe5 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4355,6 +4355,7 @@ static void offline_css(struct cgroup_subsys_state *css) ss->css_offline(css); css->flags &= ~CSS_ONLINE; + css->cgroup->nr_css--; } /* @@ -4558,15 +4559,30 @@ static void css_killed_work_fn(struct work_struct *work) mutex_lock(&cgroup_mutex); + /* + * css_tryget() is guaranteed to fail now. Tell subsystems to + * initate destruction. + */ + offline_css(css); + /* * If @cgrp is marked dead, it's waiting for refs of all css's to * be disabled before proceeding to the second phase of cgroup * destruction. If we are the last one, kick it off. */ - if (!--cgrp->nr_css && cgroup_is_dead(cgrp)) + if (!cgrp->nr_css && cgroup_is_dead(cgrp)) cgroup_destroy_css_killed(cgrp); mutex_unlock(&cgroup_mutex); + + /* + * Put the css refs from kill_css(). Each css holds an extra + * reference to the cgroup's dentry and cgroup removal proceeds + * regardless of css refs. On the last put of each css, whenever + * that may be, the extra dentry ref is put so that dentry + * destruction happens only after all css's are released. + */ + css_put(css); } /* css kill confirmation processing requires process context, bounce */ @@ -4633,11 +4649,10 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) * as killed on all CPUs on return. * * Use percpu_ref_kill_and_confirm() to get notifications as each - * css is confirmed to be seen as killed on all CPUs. The - * notification callback keeps track of the number of css's to be - * killed and invokes cgroup_destroy_css_killed() to perform the - * rest of destruction once the percpu refs of all css's are - * confirmed to be killed. + * css is confirmed to be seen as killed on all CPUs. 
+ * cgroup_destroy_css_killed() will be invoked to perform the rest + * of destruction once the percpu refs of all css's are confirmed + * to be killed. */ for_each_root_subsys(cgrp->root, ss) { struct cgroup_subsys_state *css = cgroup_css(cgrp, ss->subsys_id); @@ -4704,36 +4719,17 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) * @work: cgroup->destroy_free_work * * This function is invoked from a work item for a cgroup which is being - * destroyed after the percpu refcnts of all css's are guaranteed to be - * seen as killed on all CPUs, and performs the rest of destruction. This - * is the second step of destruction described in the comment above - * cgroup_destroy_locked(). + * destroyed after all css's are offlined and performs the rest of + * destruction. This is the second step of destruction described in the + * comment above cgroup_destroy_locked(). */ static void cgroup_destroy_css_killed(struct cgroup *cgrp) { struct cgroup *parent = cgrp->parent; struct dentry *d = cgrp->dentry; - struct cgroup_subsys *ss; lockdep_assert_held(&cgroup_mutex); - /* - * css_tryget() is guaranteed to fail now. Tell subsystems to - * initate destruction. - */ - for_each_root_subsys(cgrp->root, ss) - offline_css(cgroup_css(cgrp, ss->subsys_id)); - - /* - * Put the css refs from cgroup_destroy_locked(). Each css holds - * an extra reference to the cgroup's dentry and cgroup removal - * proceeds regardless of css refs. On the last put of each css, - * whenever that may be, the extra dentry ref is put so that dentry - * destruction happens only after all css's are released. 
- */ - for_each_root_subsys(cgrp->root, ss) - css_put(cgroup_css(cgrp, ss->subsys_id)); - /* delete this cgroup from parent->children */ list_del_rcu(&cgrp->sibling); -- cgit v1.2.1 From edae0c3358947f8be5ca99f762d89e0c38e1f5d5 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 13 Aug 2013 20:22:51 -0400 Subject: cgroup: factor out kill_css() Factor out css ref killing from cgroup_destroy_locked() into kill_css(). We're gonna add more to the path and the factored out function will eventually be called from other places too. While at it, replace open coded percpu_ref_get() with css_get() for consistency. This shouldn't cause any functional difference as the function is not used for root cgroups. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 58 ++++++++++++++++++++++++++++++++++----------------------- 1 file changed, 35 insertions(+), 23 deletions(-) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 3c4c4b01ffe5..7b7575f3119c 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4595,6 +4595,36 @@ static void css_killed_ref_fn(struct percpu_ref *ref) schedule_work(&css->destroy_work); } +/** + * kill_css - destroy a css + * @css: css to destroy + * + * This function initiates destruction of @css by putting its base + * reference. ->css_offline() will be invoked asynchronously once + * css_tryget() is guaranteed to fail and when the reference count reaches + * zero, @css will be released. + */ +static void kill_css(struct cgroup_subsys_state *css) +{ + /* + * Killing would put the base ref, but we need to keep it alive + * until after ->css_offline(). + */ + css_get(css); + + /* + * cgroup core guarantees that, by the time ->css_offline() is + * invoked, no new css reference will be given out via + * css_tryget(). We can't simply call percpu_ref_kill() and + * proceed to offlining css's because percpu_ref_kill() doesn't + * guarantee that the ref is seen as killed on all CPUs on return. 
+ * + * Use percpu_ref_kill_and_confirm() to get notifications as each + * css is confirmed to be seen as killed on all CPUs. + */ + percpu_ref_kill_and_confirm(&css->refcnt, css_killed_ref_fn); +} + /** * cgroup_destroy_locked - the first stage of cgroup destruction * @cgrp: cgroup to be destroyed @@ -4641,30 +4671,12 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) return -EBUSY; /* - * Block new css_tryget() by killing css refcnts. cgroup core - * guarantees that, by the time ->css_offline() is invoked, no new - * css reference will be given out via css_tryget(). We can't - * simply call percpu_ref_kill() and proceed to offlining css's - * because percpu_ref_kill() doesn't guarantee that the ref is seen - * as killed on all CPUs on return. - * - * Use percpu_ref_kill_and_confirm() to get notifications as each - * css is confirmed to be seen as killed on all CPUs. - * cgroup_destroy_css_killed() will be invoked to perform the rest - * of destruction once the percpu refs of all css's are confirmed - * to be killed. + * Initiate massacre of all css's. cgroup_destroy_css_killed() + * will be invoked to perform the rest of destruction once the + * percpu refs of all css's are confirmed to be killed. */ - for_each_root_subsys(cgrp->root, ss) { - struct cgroup_subsys_state *css = cgroup_css(cgrp, ss->subsys_id); - - /* - * Killing would put the base ref, but we need to keep it - * alive until after ->css_offline. - */ - percpu_ref_get(&css->refcnt); - - percpu_ref_kill_and_confirm(&css->refcnt, css_killed_ref_fn); - } + for_each_root_subsys(cgrp->root, ss) + kill_css(cgroup_css(cgrp, ss->subsys_id)); /* * Mark @cgrp dead. 
This prevents further task migration and child -- cgit v1.2.1 From 3c14f8b44fafaa60519440bea1591e495b928327 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 13 Aug 2013 20:22:51 -0400 Subject: cgroup: move subsys file removal to kill_css() With the planned unified hierarchy, individual css's will be created and destroyed dynamically across the lifetime of a cgroup. To enable such usages, css destruction is being decoupled from cgroup destruction. This patch moves subsys file removal from cgroup_destroy_locked() to kill_css(). While this changes the order of destruction operations, the changes shouldn't be noticeable to cgroup subsystems or userland. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 7b7575f3119c..3137e38995b0 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4599,13 +4599,15 @@ static void css_killed_ref_fn(struct percpu_ref *ref) * kill_css - destroy a css * @css: css to destroy * - * This function initiates destruction of @css by putting its base - * reference. ->css_offline() will be invoked asynchronously once - * css_tryget() is guaranteed to fail and when the reference count reaches - * zero, @css will be released. + * This function initiates destruction of @css by removing cgroup interface + * files and putting its base reference. ->css_offline() will be invoked + * asynchronously once css_tryget() is guaranteed to fail and when the + * reference count reaches zero, @css will be released. */ static void kill_css(struct cgroup_subsys_state *css) { + cgroup_clear_dir(css->cgroup, 1 << css->ss->subsys_id); + /* * Killing would put the base ref, but we need to keep it alive * until after ->css_offline(). @@ -4703,10 +4705,10 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) cgroup_destroy_css_killed(cgrp); /* - * Clear and remove @cgrp directory. 
The removal puts the base ref - * but we aren't quite done with @cgrp yet, so hold onto it. + * Clear the base files and remove @cgrp directory. The removal + * puts the base ref but we aren't quite done with @cgrp yet, so + * hold onto it. */ - cgroup_clear_dir(cgrp, cgrp->root->subsys_mask); cgroup_addrm_files(cgrp, cgroup_base_files, false); dget(d); cgroup_d_remove_dir(d); -- cgit v1.2.1 From 0c21ead136a900c36f1ab74fd7d09a306dc31324 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 13 Aug 2013 20:22:51 -0400 Subject: cgroup: RCU protect each cgroup_subsys_state release With the planned unified hierarchy, individual css's will be created and destroyed dynamically across the lifetime of a cgroup. To enable such usages, css destruction is being decoupled from cgroup destruction. Most of the destruction path has been decoupled but the actual free of css still depends on cgroup free path. When all css refs are drained, css_release() kicks off css_free_work_fn() which puts the cgroup. When the cgroup refcnt reaches zero, cgroup_diput() is invoked which in turn schedules RCU free of the cgroup. After a grace period, all css's are freed along with the cgroup itself. This patch moves the RCU grace period and css freeing from cgroup release path to css release path. css_release(), instead of kicking off css_free_work_fn() directly, schedules RCU callback css_free_rcu_fn() which in turn kicks off css_free_work_fn() after a RCU grace period. css_free_work_fn() is updated to free the css directly. The five-way punting - percpu ref kill confirmation, a work item, percpu ref release, RCU grace period, and again a work item - is quite hairy but the work items are there only to provide process context and the actual sequence is kill confirm -> release -> RCU free, which isn't simple but not too crazy. 
This removes cgroup_css() usage after offline_css() allowing clearing cgroup->subsys[] from offline_css(), which makes it consistent with online_css() and brings it closer to proper lifetime management for individual css's. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 53 +++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 37 insertions(+), 16 deletions(-) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 3137e38995b0..66d01078eebe 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -869,18 +869,8 @@ static struct cgroup_name *cgroup_alloc_name(struct dentry *dentry) static void cgroup_free_fn(struct work_struct *work) { struct cgroup *cgrp = container_of(work, struct cgroup, destroy_work); - struct cgroup_subsys *ss; mutex_lock(&cgroup_mutex); - /* - * Release the subsystem state objects. - */ - for_each_root_subsys(cgrp->root, ss) { - struct cgroup_subsys_state *css = cgroup_css(cgrp, ss->subsys_id); - - ss->css_free(css); - } - cgrp->root->number_of_cgroups--; mutex_unlock(&cgroup_mutex); @@ -4281,32 +4271,62 @@ err: return ret; } +/* + * css destruction is four-stage process. + * + * 1. Destruction starts. Killing of the percpu_ref is initiated. + * Implemented in kill_css(). + * + * 2. When the percpu_ref is confirmed to be visible as killed on all CPUs + * and thus css_tryget() is guaranteed to fail, the css can be offlined + * by invoking offline_css(). After offlining, the base ref is put. + * Implemented in css_killed_work_fn(). + * + * 3. When the percpu_ref reaches zero, the only possible remaining + * accessors are inside RCU read sections. css_release() schedules the + * RCU callback. + * + * 4. After the grace period, the css can be freed. Implemented in + * css_free_work_fn(). + * + * It is actually hairier because both step 2 and 4 require process context + * and thus involve punting to css->destroy_work adding two additional + * steps to the already complex sequence. 
+ */ static void css_free_work_fn(struct work_struct *work) { struct cgroup_subsys_state *css = container_of(work, struct cgroup_subsys_state, destroy_work); + struct cgroup *cgrp = css->cgroup; if (css->parent) css_put(css->parent); - cgroup_dput(css->cgroup); + css->ss->css_free(css); + cgroup_dput(cgrp); } -static void css_release(struct percpu_ref *ref) +static void css_free_rcu_fn(struct rcu_head *rcu_head) { struct cgroup_subsys_state *css = - container_of(ref, struct cgroup_subsys_state, refcnt); + container_of(rcu_head, struct cgroup_subsys_state, rcu_head); /* * css holds an extra ref to @cgrp->dentry which is put on the last - * css_put(). dput() requires process context, which css_put() may - * be called without. @css->destroy_work will be used to invoke - * dput() asynchronously from css_put(). + * css_put(). dput() requires process context which we don't have. */ INIT_WORK(&css->destroy_work, css_free_work_fn); schedule_work(&css->destroy_work); } +static void css_release(struct percpu_ref *ref) +{ + struct cgroup_subsys_state *css = + container_of(ref, struct cgroup_subsys_state, refcnt); + + call_rcu(&css->rcu_head, css_free_rcu_fn); +} + static void init_css(struct cgroup_subsys_state *css, struct cgroup_subsys *ss, struct cgroup *cgrp) { @@ -4356,6 +4376,7 @@ static void offline_css(struct cgroup_subsys_state *css) css->flags &= ~CSS_ONLINE; css->cgroup->nr_css--; + RCU_INIT_POINTER(css->cgroup->subsys[ss->subsys_id], css); } /* -- cgit v1.2.1 From 930913a31289202d232186b82854b26d7fb7cf4d Mon Sep 17 00:00:00 2001 From: Li Zhong Date: Fri, 16 Aug 2013 17:57:14 +0800 Subject: cgroup: use css_get() in cgroup_create() to check CSS_ROOT It seems that the root css doesn't have refcnt allocated(not needed?), and would cause the booting error attached. This patch tries to use css_get() to not increase the refcnt if parent is root. 
BUG: unable to handle kernel NULL pointer dereference at (null) IP: [] cgroup_mkdir+0x37c/0x740 PGD 0 Oops: 0002 [#1] Modules linked in: CPU: 0 PID: 1 Comm: systemd Not tainted 3.11.0-rc5-next-20130815+ #1 Hardware name: Bochs Bochs, BIOS Bochs 01/01/2007 task: ffff88007f868000 ti: ffff88007f864000 task.ti: ffff88007f864000 RIP: 0010:[] [] cgroup_mkdir+0x37c/0x740 RSP: 0018:ffff88007f865df8 EFLAGS: 00010246 RAX: 0000000000000000 RBX: ffffffff81a46ee0 RCX: 0000000000000001 RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffffffff81a415c0 RBP: ffff88007f865ec8 R08: 0000000000000001 R09: 0000000000000000 R10: ffff88007ce6d060 R11: 0000000000000000 R12: ffff88007ce6d000 R13: ffff88007ce6d060 R14: ffffffff81a46d80 R15: ffff88007c6e8018 FS: 00007f13dbf6f840(0000) GS:ffffffff81a23000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000000 CR3: 000000007b7e5000 CR4: 00000000000006b0 Stack: ffffffff810b380d 0000000000000002 ffff88007f865e18 ffffffff81167069 ffff88007f865ed8 ffffffff8116a3f5 ffff880037454400 ffff88007c6e8018 ffff88007c6e8028 ffff88007c6e8328 ffff88007c6e8000 ffff88007ce6d000 Call Trace: [] ? cgroup_mkdir+0x3bd/0x740 [] ? lookup_hash+0x19/0x20 [] ? 
kern_path_create+0x95/0x170 [] vfs_mkdir+0x9e/0xf0 [] SyS_mkdirat+0x60/0xe0 [] SyS_mkdir+0x19/0x20 [] tracesys+0xcf/0xd4 Code: ad 70 ff ff ff 48 89 9d 60 ff ff ff 4d 89 d5 4c 8b bd 68 ff ff ff 4c 8b 65 88 eb 50 0f 1f 00 48 8b 43 18 a8 03 0f 85 6c 03 00 00 00 e8 1d 0a fb ff 85 c0 74 0d 80 3d f0 45 a1 00 00 0f 84 4c RIP [] cgroup_mkdir+0x37c/0x740 RSP CR2: 0000000000000000 ---[ end trace a4b14b49bc46fd60 ]--- Signed-off-by: Li Zhong Acked-by: Li Zefan Signed-off-by: Tejun Heo --- kernel/cgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 66d01078eebe..b69b572131e5 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4494,7 +4494,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, struct cgroup_subsys_state *css = css_ar[ss->subsys_id]; dget(dentry); - percpu_ref_get(&css->parent->refcnt); + css_get(css->parent); } /* hold a ref to the parent's dentry */ -- cgit v1.2.1 From 1cb650b91ba582f6737457b7d22e368585596d2c Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Mon, 19 Aug 2013 10:05:24 +0800 Subject: cgroup: change cgroup_from_id() to css_from_id() Now we want cgroup core to always provide the css to use to the subsystems, so change this API to css_from_id(). Uninline css_from_id(), because it's getting bigger and cgroup_css() has been unexported. While at it, remove the #ifdef, and shuffle the order of the args. Signed-off-by: Li Zefan Signed-off-by: Tejun Heo --- kernel/cgroup.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index b69b572131e5..ff7d642a070a 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -5717,6 +5717,28 @@ struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id) return css ? 
css : ERR_PTR(-ENOENT); } +/** + * css_from_id - lookup css by id + * @id: the cgroup id + * @ss: cgroup subsys to be looked into + * + * Returns the css if there's valid one with @id, otherwise returns NULL. + * Should be called under rcu_read_lock(). + */ +struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss) +{ + struct cgroup *cgrp; + + rcu_lockdep_assert(rcu_read_lock_held() || + lockdep_is_held(&cgroup_mutex), + "css_from_id() needs proper protection"); + + cgrp = idr_find(&ss->root->cgroup_idr, id); + if (cgrp) + return cgroup_css(cgrp, ss->subsys_id); + return NULL; +} + #ifdef CONFIG_CGROUP_DEBUG static struct cgroup_subsys_state * debug_css_alloc(struct cgroup_subsys_state *parent_css) -- cgit v1.2.1 From 0bfb4aa67cef4982adc70590a31624d7b35a0bda Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 15 Aug 2013 11:42:36 -0400 Subject: cgroup: fix subsystem file accesses on the root cgroup 105347ba5 ("cgroup: make cgroup_file_open() rcu_read_lock() around cgroup_css() and add cfent->css") added cfent->css to cache the associated cgroup_subsys_state across file operations. A cfent is associated with a single css throughout its lifetime and the original commit initialized the cache pointer during cgroup_add_file() and verified that it matches the actual one in cgroup_file_open(). While this works fine for !root cgroups, it's broken for root cgroups as files in a root cgroup are created before the css's are associated with the cgroup and thus cgroup_css() call in cgroup_add_file() returns NULL associating all cfents in the root cgroup with NULL css. This makes cgroup_file_open() trigger WARN and fail with -ENODEV for all !core subsystem files in the root cgroups. There's no reason to initialize cfent->css separately from cgroup_add_file().
As the association never changes, cgroup_file_open() can set it unconditionally every time and containing the logic in cgroup_file_open() makes more sense anyway as the only reason it's necessary is file->private_data being already occupied. Fix it by setting cfent->css unconditionally from cgroup_file_open(). Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 24 ++++++++++-------------- 1 file changed, 10 insertions(+), 14 deletions(-) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index ff7d642a070a..896e035eb6e4 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2490,10 +2490,18 @@ static int cgroup_file_open(struct inode *inode, struct file *file) } rcu_read_unlock(); - /* css should match @cfe->css, see cgroup_add_file() for details */ - if (!css || WARN_ON_ONCE(css != cfe->css)) + if (!css) return -ENODEV; + /* + * @cfe->css is used by read/write/close to determine the + * associated css. @file->private_data would be a better place but + * that's already used by seqfile. Multiple accessors may use it + * simultaneously which is okay as the association never changes. + */ + WARN_ON_ONCE(cfe->css && cfe->css != css); + cfe->css = css; + if (cft->read_map || cft->read_seq_string) { file->f_op = &cgroup_seqfile_operations; err = single_open(file, cgroup_seqfile_show, cfe); @@ -2772,18 +2780,6 @@ static int cgroup_add_file(struct cgroup *cgrp, struct cftype *cft) dentry->d_fsdata = cfe; simple_xattrs_init(&cfe->xattrs); - /* - * cfe->css is used by read/write/close to determine the associated - * css. file->private_data would be a better place but that's - * already used by seqfile. Note that open will use the usual - * cgroup_css() and css_tryget() to acquire the css and this - * caching doesn't affect css lifetime management. 
- */ - if (cft->ss) - cfe->css = cgroup_css(cgrp, cft->ss->subsys_id); - else - cfe->css = &cgrp->dummy_css; - mode = cgroup_file_mode(cft); error = cgroup_create_file(dentry, mode | S_IFREG, cgrp->root->sb); if (!error) { -- cgit v1.2.1 From 6e6eab0efdf48fb2d8d7aee904d7740acb4661c6 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 15 Aug 2013 11:43:15 -0400 Subject: cgroup: fix cgroup_write_event_control() 81eeaf0411 ("cgroup: make cftype->[un]register_event() deal with cgroup_subsys_state instead of cgroup") updated the cftype event methods to take @css (cgroup_subsys_state) instead of @cgroup; however, it incorrectly used @css passed to cgroup_write_event_control(), which is the dummy_css for the cgroup as the file is a cgroup core file. This leads to oops on event registration. Fix it by using the css matching the event target file. Note that cgroup_write_event_control() now disallows cgroup core files from being event sources. This is for simplicity and doesn't matter as cgroup_event will be moved and made specific to memcg. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 896e035eb6e4..ef43e3f453ef 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4040,10 +4040,10 @@ static void cgroup_event_ptable_queue_proc(struct file *file, * Input must be in format '<event_fd> <control_fd> <args>'. * Interpretation of args is defined by control file implementation.
*/ -static int cgroup_write_event_control(struct cgroup_subsys_state *css, +static int cgroup_write_event_control(struct cgroup_subsys_state *dummy_css, struct cftype *cft, const char *buffer) { - struct cgroup *cgrp = css->cgroup; + struct cgroup *cgrp = dummy_css->cgroup; struct cgroup_event *event; struct cgroup *cgrp_cfile; unsigned int efd, cfd; @@ -4065,7 +4065,7 @@ static int cgroup_write_event_control(struct cgroup_subsys_state *css, event = kzalloc(sizeof(*event), GFP_KERNEL); if (!event) return -ENOMEM; - event->css = css; + INIT_LIST_HEAD(&event->list); init_poll_funcptr(&event->pt, cgroup_event_ptable_queue_proc); init_waitqueue_func_entry(&event->wait, cgroup_event_wake); @@ -4101,6 +4101,23 @@ static int cgroup_write_event_control(struct cgroup_subsys_state *css, goto out_put_cfile; } + if (!event->cft->ss) { + ret = -EBADF; + goto out_put_cfile; + } + + /* determine the css of @cfile and associate @event with it */ + rcu_read_lock(); + + ret = -EINVAL; + event->css = cgroup_css(cgrp, event->cft->ss->subsys_id); + if (event->css) + ret = 0; + + rcu_read_unlock(); + if (ret) + goto out_put_cfile; + /* * The file to be monitored must be in the same cgroup as * cgroup.event_control is. @@ -4116,7 +4133,7 @@ static int cgroup_write_event_control(struct cgroup_subsys_state *css, goto out_put_cfile; } - ret = event->cft->register_event(css, event->cft, + ret = event->cft->register_event(event->css, event->cft, event->eventfd, buffer); if (ret) goto out_put_cfile; -- cgit v1.2.1 From 35cf083619da5677f83e9a8eae813f0b413d7082 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 26 Aug 2013 18:40:56 -0400 Subject: cgroup: rename cgroup_css_from_dir() to css_from_dir() and update its syntax cgroup_css_from_dir() will grow another user. In preparation, make the following changes. * All css functions are prefixed with just "css_", rename it to css_from_dir(). 
* Take dentry * instead of file * as dentry is what ultimately identifies a cgroup and file may not always be available. Note that the function now checks whether @dentry->d_inode is NULL as the caller now may specify a negative dentry. * Make it take cgroup_subsys * instead of integer subsys_id. This simplifies the function and allows specifying no subsystem for cgroup->dummy_css. * Make return section a bit less verbose. This patch doesn't introduce any behavior changes. Signed-off-by: Tejun Heo Acked-by: Li Zefan Acked-by: Kirill A. Shutemov Cc: Steven Rostedt Cc: Frederic Weisbecker Cc: Ingo Molnar --- kernel/cgroup.c | 26 ++++++++++---------------- 1 file changed, 10 insertions(+), 16 deletions(-) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index ef43e3f453ef..921b1387c944 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -5700,34 +5700,28 @@ struct cgroup_subsys_state *css_lookup(struct cgroup_subsys *ss, int id) EXPORT_SYMBOL_GPL(css_lookup); /** - * cgroup_css_from_dir - get corresponding css from file open on cgroup dir - * @f: directory file of interest - * @id: subsystem id of interest + * css_from_dir - get corresponding css from the dentry of a cgroup dir + * @dentry: directory dentry of interest + * @ss: subsystem of interest * * Must be called under RCU read lock. The caller is responsible for * pinning the returned css if it needs to be accessed outside the RCU * critical section. */ -struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id) +struct cgroup_subsys_state *css_from_dir(struct dentry *dentry, + struct cgroup_subsys *ss) { struct cgroup *cgrp; - struct inode *inode; - struct cgroup_subsys_state *css; WARN_ON_ONCE(!rcu_read_lock_held()); - inode = file_inode(f); - /* check in cgroup filesystem dir */ - if (inode->i_op != &cgroup_dir_inode_operations) + /* is @dentry a cgroup dir?
*/ + if (!dentry->d_inode || + dentry->d_inode->i_op != &cgroup_dir_inode_operations) return ERR_PTR(-EBADF); - if (id < 0 || id >= CGROUP_SUBSYS_COUNT) - return ERR_PTR(-EINVAL); - - /* get cgroup */ - cgrp = __d_cgrp(f->f_dentry); - css = cgroup_css(cgrp, id); - return css ? css : ERR_PTR(-ENOENT); + cgrp = __d_cgrp(dentry); + return cgroup_css(cgrp, ss->subsys_id) ?: ERR_PTR(-ENOENT); } /** -- cgit v1.2.1 From ca8bdcaff0d77990fb69e0f946018c96a70851cc Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 26 Aug 2013 18:40:56 -0400 Subject: cgroup: make cgroup_css() take cgroup_subsys * instead and allow NULL subsys cgroup_css() is no longer used in hot paths. Make it take struct cgroup_subsys * and allow the users to specify NULL subsys to obtain the dummy_css. This removes open-coded NULL subsystem testing in a couple users and generally simplifies the code. After this patch, css_from_dir() also allows NULL @ss and returns the matching dummy_css. This behavior change doesn't affect its only user - perf. Signed-off-by: Tejun Heo Acked-by: Li Zefan Acked-by: Kirill A. Shutemov --- kernel/cgroup.c | 90 +++++++++++++++++++++++++++------------------------------ 1 file changed, 43 insertions(+), 47 deletions(-) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 921b1387c944..7516668d8325 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -226,19 +226,22 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], /** * cgroup_css - obtain a cgroup's css for the specified subsystem * @cgrp: the cgroup of interest - * @subsys_id: the subsystem of interest + * @ss: the subsystem of interest (%NULL returns the dummy_css) * - * Return @cgrp's css (cgroup_subsys_state) associated with @subsys_id. - * This function must be called either under cgroup_mutex or - * rcu_read_lock() and the caller is responsible for pinning the returned - * css if it wants to keep accessing it outside the said locks. 
This - * function may return %NULL if @cgrp doesn't have @subsys_id enabled. + * Return @cgrp's css (cgroup_subsys_state) associated with @ss. This + * function must be called either under cgroup_mutex or rcu_read_lock() and + * the caller is responsible for pinning the returned css if it wants to + * keep accessing it outside the said locks. This function may return + * %NULL if @cgrp doesn't have @subsys_id enabled. */ static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp, - int subsys_id) + struct cgroup_subsys *ss) { - return rcu_dereference_check(cgrp->subsys[subsys_id], - lockdep_is_held(&cgroup_mutex)); + if (ss) + return rcu_dereference_check(cgrp->subsys[ss->subsys_id], + lockdep_is_held(&cgroup_mutex)); + else + return &cgrp->dummy_css; } /* convenient tests for these bits */ @@ -580,7 +583,7 @@ static struct css_set *find_existing_css_set(struct css_set *old_cset, /* Subsystem is in this hierarchy. So we want * the subsystem state from the new * cgroup */ - template[i] = cgroup_css(cgrp, i); + template[i] = cgroup_css(cgrp, ss); } else { /* Subsystem is not in this hierarchy, so we * don't want to change the subsystem state */ @@ -1062,30 +1065,30 @@ static int rebind_subsystems(struct cgroupfs_root *root, if (bit & added_mask) { /* We're binding this subsystem to this hierarchy */ - BUG_ON(cgroup_css(cgrp, i)); - BUG_ON(!cgroup_css(cgroup_dummy_top, i)); - BUG_ON(cgroup_css(cgroup_dummy_top, i)->cgroup != cgroup_dummy_top); + BUG_ON(cgroup_css(cgrp, ss)); + BUG_ON(!cgroup_css(cgroup_dummy_top, ss)); + BUG_ON(cgroup_css(cgroup_dummy_top, ss)->cgroup != cgroup_dummy_top); rcu_assign_pointer(cgrp->subsys[i], - cgroup_css(cgroup_dummy_top, i)); - cgroup_css(cgrp, i)->cgroup = cgrp; + cgroup_css(cgroup_dummy_top, ss)); + cgroup_css(cgrp, ss)->cgroup = cgrp; list_move(&ss->sibling, &root->subsys_list); ss->root = root; if (ss->bind) - ss->bind(cgroup_css(cgrp, i)); + ss->bind(cgroup_css(cgrp, ss)); /* refcount was already taken, and we're keeping 
it */ root->subsys_mask |= bit; } else if (bit & removed_mask) { /* We're removing this subsystem */ - BUG_ON(cgroup_css(cgrp, i) != cgroup_css(cgroup_dummy_top, i)); - BUG_ON(cgroup_css(cgrp, i)->cgroup != cgrp); + BUG_ON(cgroup_css(cgrp, ss) != cgroup_css(cgroup_dummy_top, ss)); + BUG_ON(cgroup_css(cgrp, ss)->cgroup != cgrp); if (ss->bind) - ss->bind(cgroup_css(cgroup_dummy_top, i)); + ss->bind(cgroup_css(cgroup_dummy_top, ss)); - cgroup_css(cgroup_dummy_top, i)->cgroup = cgroup_dummy_top; + cgroup_css(cgroup_dummy_top, ss)->cgroup = cgroup_dummy_top; RCU_INIT_POINTER(cgrp->subsys[i], NULL); cgroup_subsys[i]->root = &cgroup_dummy_root; @@ -1930,7 +1933,7 @@ EXPORT_SYMBOL_GPL(cgroup_taskset_next); struct cgroup_subsys_state *cgroup_taskset_cur_css(struct cgroup_taskset *tset, int subsys_id) { - return cgroup_css(tset->cur_cgrp, subsys_id); + return cgroup_css(tset->cur_cgrp, cgroup_subsys[subsys_id]); } EXPORT_SYMBOL_GPL(cgroup_taskset_cur_css); @@ -2071,7 +2074,7 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk, * step 1: check that we can legitimately attach to the cgroup. */ for_each_root_subsys(root, ss) { - struct cgroup_subsys_state *css = cgroup_css(cgrp, ss->subsys_id); + struct cgroup_subsys_state *css = cgroup_css(cgrp, ss); if (ss->can_attach) { retval = ss->can_attach(css, &tset); @@ -2113,7 +2116,7 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk, * step 4: do subsystem attach callbacks. 
*/ for_each_root_subsys(root, ss) { - struct cgroup_subsys_state *css = cgroup_css(cgrp, ss->subsys_id); + struct cgroup_subsys_state *css = cgroup_css(cgrp, ss); if (ss->attach) ss->attach(css, &tset); @@ -2135,7 +2138,7 @@ out_put_css_set_refs: out_cancel_attach: if (retval) { for_each_root_subsys(root, ss) { - struct cgroup_subsys_state *css = cgroup_css(cgrp, ss->subsys_id); + struct cgroup_subsys_state *css = cgroup_css(cgrp, ss); if (ss == failed_ss) break; @@ -2481,13 +2484,9 @@ static int cgroup_file_open(struct inode *inode, struct file *file) * @css stays alive for all file operations. */ rcu_read_lock(); - if (cft->ss) { - css = cgroup_css(cgrp, cft->ss->subsys_id); - if (!css_tryget(css)) - css = NULL; - } else { - css = &cgrp->dummy_css; - } + css = cgroup_css(cgrp, cft->ss); + if (cft->ss && !css_tryget(css)) + css = NULL; rcu_read_unlock(); if (!css) @@ -2878,7 +2877,7 @@ static int cgroup_cfts_commit(struct cftype *cfts, bool is_add) /* add/rm files for all cgroups created before */ rcu_read_lock(); - css_for_each_descendant_pre(css, cgroup_css(root, ss->subsys_id)) { + css_for_each_descendant_pre(css, cgroup_css(root, ss)) { struct cgroup *cgrp = css->cgroup; if (cgroup_is_dead(cgrp)) @@ -3082,10 +3081,7 @@ css_next_child(struct cgroup_subsys_state *pos_css, if (&next->sibling == &cgrp->children) return NULL; - if (parent_css->ss) - return cgroup_css(next, parent_css->ss->subsys_id); - else - return &next->dummy_css; + return cgroup_css(next, parent_css->ss); } EXPORT_SYMBOL_GPL(css_next_child); @@ -4110,7 +4106,7 @@ static int cgroup_write_event_control(struct cgroup_subsys_state *dummy_css, rcu_read_lock(); ret = -EINVAL; - event->css = cgroup_css(cgrp, event->cft->ss->subsys_id); + event->css = cgroup_css(cgrp, event->cft->ss); if (event->css) ret = 0; @@ -4266,7 +4262,7 @@ static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask) /* This cgroup is ready now */ for_each_root_subsys(cgrp->root, ss) { - struct 
cgroup_subsys_state *css = cgroup_css(cgrp, ss->subsys_id); + struct cgroup_subsys_state *css = cgroup_css(cgrp, ss); struct css_id *id = rcu_dereference_protected(css->id, true); /* @@ -4349,11 +4345,11 @@ static void init_css(struct cgroup_subsys_state *css, struct cgroup_subsys *ss, css->id = NULL; if (cgrp->parent) - css->parent = cgroup_css(cgrp->parent, ss->subsys_id); + css->parent = cgroup_css(cgrp->parent, ss); else css->flags |= CSS_ROOT; - BUG_ON(cgroup_css(cgrp, ss->subsys_id)); + BUG_ON(cgroup_css(cgrp, ss)); } /* invoke ->css_online() on a new CSS and mark it online if successful */ @@ -4466,7 +4462,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, for_each_root_subsys(root, ss) { struct cgroup_subsys_state *css; - css = ss->css_alloc(cgroup_css(parent, ss->subsys_id)); + css = ss->css_alloc(cgroup_css(parent, ss)); if (IS_ERR(css)) { err = PTR_ERR(css); goto err_free_all; @@ -4712,7 +4708,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) * percpu refs of all css's are confirmed to be killed. */ for_each_root_subsys(cgrp->root, ss) - kill_css(cgroup_css(cgrp, ss->subsys_id)); + kill_css(cgroup_css(cgrp, ss)); /* * Mark @cgrp dead. This prevents further task migration and child @@ -4839,7 +4835,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) /* Create the top cgroup state for this subsystem */ list_add(&ss->sibling, &cgroup_dummy_root.subsys_list); ss->root = &cgroup_dummy_root; - css = ss->css_alloc(cgroup_css(cgroup_dummy_top, ss->subsys_id)); + css = ss->css_alloc(cgroup_css(cgroup_dummy_top, ss)); /* We don't handle early failures gracefully */ BUG_ON(IS_ERR(css)); init_css(css, ss, cgroup_dummy_top); @@ -4918,7 +4914,7 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) * struct, so this can happen first (i.e. before the dummy root * attachment). 
*/ - css = ss->css_alloc(cgroup_css(cgroup_dummy_top, ss->subsys_id)); + css = ss->css_alloc(cgroup_css(cgroup_dummy_top, ss)); if (IS_ERR(css)) { /* failure case - need to deassign the cgroup_subsys[] slot. */ cgroup_subsys[ss->subsys_id] = NULL; @@ -5000,7 +4996,7 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss) mutex_lock(&cgroup_mutex); - offline_css(cgroup_css(cgroup_dummy_top, ss->subsys_id)); + offline_css(cgroup_css(cgroup_dummy_top, ss)); if (ss->use_id) idr_destroy(&ss->idr); @@ -5034,7 +5030,7 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss) * the cgrp->subsys pointer to find their state. note that this * also takes care of freeing the css_id. */ - ss->css_free(cgroup_css(cgroup_dummy_top, ss->subsys_id)); + ss->css_free(cgroup_css(cgroup_dummy_top, ss)); RCU_INIT_POINTER(cgroup_dummy_top->subsys[ss->subsys_id], NULL); mutex_unlock(&cgroup_mutex); @@ -5721,7 +5717,7 @@ struct cgroup_subsys_state *css_from_dir(struct dentry *dentry, return ERR_PTR(-EBADF); cgrp = __d_cgrp(dentry); - return cgroup_css(cgrp, ss->subsys_id) ?: ERR_PTR(-ENOENT); + return cgroup_css(cgrp, ss) ?: ERR_PTR(-ENOENT); } /** -- cgit v1.2.1 From 9fa4db334c7d9570aec7a5121e84fae99aae1d04 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 26 Aug 2013 18:40:56 -0400 Subject: cgroup: implement CFTYPE_NO_PREFIX When cgroup files are created, cgroup core automatically prepends the name of the subsystem as prefix. This patch adds CFTYPE_NO_PREFIX, which disables the automatic prefix. This is to work around historical baggage and shouldn't be used for new files. This will be used to move "cgroup.event_control" from cgroup core to memcg. Signed-off-by: Tejun Heo Acked-by: Li Zefan Acked-by: Kirill A.
Shutemov Cc: Glauber Costa --- kernel/cgroup.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 7516668d8325..a41dc87cd07e 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2756,7 +2756,8 @@ static int cgroup_add_file(struct cgroup *cgrp, struct cftype *cft) umode_t mode; char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 }; - if (cft->ss && !(cgrp->root->flags & CGRP_ROOT_NOPREFIX)) { + if (cft->ss && !(cft->flags & CFTYPE_NO_PREFIX) && + !(cgrp->root->flags & CGRP_ROOT_NOPREFIX)) { strcpy(name, cft->ss->name); strcat(name, "."); } -- cgit v1.2.1 From 7941cb027dccedec3c047271554ddcf4be2e0697 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 26 Aug 2013 18:40:56 -0400 Subject: cgroup: make cgroup_event hold onto cgroup_subsys_state instead of cgroup Currently, each registered cgroup_event holds an extra reference to the cgroup. This is a bit weird as events are subsystem specific and will also be incorrect in the planned unified hierarchy as css (cgroup_subsys_state) may come and go dynamically across the lifetime of a cgroup. Holding onto cgroup won't prevent the target css from going away. Update cgroup_event to hold onto the css the target file belongs to instead of cgroup. Signed-off-by: Tejun Heo Acked-by: Li Zefan Acked-by: Kirill A.
Shutemov --- kernel/cgroup.c | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index a41dc87cd07e..12237a291d88 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -3969,7 +3969,6 @@ static void cgroup_event_remove(struct work_struct *work) struct cgroup_event *event = container_of(work, struct cgroup_event, remove); struct cgroup_subsys_state *css = event->css; - struct cgroup *cgrp = css->cgroup; remove_wait_queue(event->wqh, &event->wait); @@ -3980,7 +3979,7 @@ static void cgroup_event_remove(struct work_struct *work) eventfd_ctx_put(event->eventfd); kfree(event); - cgroup_dput(cgrp); + css_put(css); } /* @@ -4103,12 +4102,16 @@ static int cgroup_write_event_control(struct cgroup_subsys_state *dummy_css, goto out_put_cfile; } - /* determine the css of @cfile and associate @event with it */ + /* + * Determine the css of @cfile and associate @event with it. + * Remaining events are automatically removed on cgroup destruction + * but the removal is asynchronous, so take an extra ref. + */ rcu_read_lock(); ret = -EINVAL; event->css = cgroup_css(cgrp, event->cft->ss); - if (event->css) + if (event->css && css_tryget(event->css)) ret = 0; rcu_read_unlock(); @@ -4122,28 +4125,21 @@ static int cgroup_write_event_control(struct cgroup_subsys_state *dummy_css, cgrp_cfile = __d_cgrp(cfile->f_dentry->d_parent); if (cgrp_cfile != cgrp) { ret = -EINVAL; - goto out_put_cfile; + goto out_put_css; } if (!event->cft->register_event || !event->cft->unregister_event) { ret = -EINVAL; - goto out_put_cfile; + goto out_put_css; } ret = event->cft->register_event(event->css, event->cft, event->eventfd, buffer); if (ret) - goto out_put_cfile; + goto out_put_css; efile->f_op->poll(efile, &event->pt); - /* - * Events should be removed after rmdir of cgroup directory, but before - * destroying subsystem state objects. 
Let's take reference to cgroup - * directory dentry to do that. - */ - dget(cgrp->dentry); - spin_lock(&cgrp->event_list_lock); list_add(&event->list, &cgrp->event_list); spin_unlock(&cgrp->event_list_lock); @@ -4153,6 +4149,8 @@ static int cgroup_write_event_control(struct cgroup_subsys_state *dummy_css, return 0; +out_put_css: + css_put(event->css); out_put_cfile: fput(cfile); out_put_eventfd: -- cgit v1.2.1 From 7c918cbbd829669bf70ffcc45962d5d992942243 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 26 Aug 2013 18:40:56 -0400 Subject: cgroup: make cgroup_write_event_control() use css_from_dir() instead of __d_cgrp() cgroup_event will be moved to its only user - memcg. Replace __d_cgrp() usage with css_from_dir(), which is already exported. This also simplifies the code a bit. Signed-off-by: Tejun Heo Acked-by: Li Zefan Acked-by: Kirill A. Shutemov --- kernel/cgroup.c | 18 +++++------------- 1 file changed, 5 insertions(+), 13 deletions(-) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 12237a291d88..e76698dd6c08 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4041,7 +4041,7 @@ static int cgroup_write_event_control(struct cgroup_subsys_state *dummy_css, { struct cgroup *cgrp = dummy_css->cgroup; struct cgroup_event *event; - struct cgroup *cgrp_cfile; + struct cgroup_subsys_state *cfile_css; unsigned int efd, cfd; struct file *efile; struct file *cfile; @@ -4103,7 +4103,8 @@ static int cgroup_write_event_control(struct cgroup_subsys_state *dummy_css, } /* - * Determine the css of @cfile and associate @event with it. + * Determine the css of @cfile, verify it belongs to the same + * cgroup as cgroup.event_control, and associate @event with it. * Remaining events are automatically removed on cgroup destruction * but the removal is asynchronous, so take an extra ref. 
*/ @@ -4111,23 +4112,14 @@ static int cgroup_write_event_control(struct cgroup_subsys_state *dummy_css, ret = -EINVAL; event->css = cgroup_css(cgrp, event->cft->ss); - if (event->css && css_tryget(event->css)) + cfile_css = css_from_dir(cfile->f_dentry->d_parent, event->cft->ss); + if (event->css && event->css == cfile_css && css_tryget(event->css)) ret = 0; rcu_read_unlock(); if (ret) goto out_put_cfile; - /* - * The file to be monitored must be in the same cgroup as - * cgroup.event_control is. - */ - cgrp_cfile = __d_cgrp(cfile->f_dentry->d_parent); - if (cgrp_cfile != cgrp) { - ret = -EINVAL; - goto out_put_css; - } - if (!event->cft->register_event || !event->cft->unregister_event) { ret = -EINVAL; goto out_put_css; -- cgit v1.2.1 From d1625964da51bda61306ad3ec45307a799c21f08 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 27 Aug 2013 14:27:23 -0400 Subject: cgroup: fix cgroup_css() invocation in css_from_id() ca8bdcaff0 ("cgroup: make cgroup_css() take cgroup_subsys * instead and allow NULL subsys") missed one conversion in css_from_id(), which was newly added. As css_from_id() doesn't have any user yet, this doesn't break anything other than generating a build warning. Convert it. Signed-off-by: Tejun Heo Reported-by: Stephen Rothwell Reported-by: kbuild test robot --- kernel/cgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index e76698dd6c08..b5f4989937f2 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -5729,7 +5729,7 @@ struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss) cgrp = idr_find(&ss->root->cgroup_idr, id); if (cgrp) - return cgroup_css(cgrp, ss->subsys_id); + return cgroup_css(cgrp, ss); return NULL; } -- cgit v1.2.1