diff options
Diffstat (limited to 'include/linux/cgroup-defs.h')
-rw-r--r-- | include/linux/cgroup-defs.h | 88 |
1 files changed, 82 insertions, 6 deletions
diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index ed128fed0335..9dc226345e4e 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -544,31 +544,107 @@ static inline void cgroup_threadgroup_change_end(struct task_struct *tsk) {} #ifdef CONFIG_SOCK_CGROUP_DATA +/* + * sock_cgroup_data is embedded at sock->sk_cgrp_data and contains + * per-socket cgroup information except for memcg association. + * + * On legacy hierarchies, net_prio and net_cls controllers directly set + * attributes on each sock which can then be tested by the network layer. + * On the default hierarchy, each sock is associated with the cgroup it was + * created in and the networking layer can match the cgroup directly. + * + * To avoid carrying all three cgroup related fields separately in sock, + * sock_cgroup_data overloads (prioidx, classid) and the cgroup pointer. + * On boot, sock_cgroup_data records the cgroup that the sock was created + * in so that cgroup2 matches can be made; however, once either net_prio or + * net_cls starts being used, the area is overriden to carry prioidx and/or + * classid. The two modes are distinguished by whether the lowest bit is + * set. Clear bit indicates cgroup pointer while set bit prioidx and + * classid. + * + * While userland may start using net_prio or net_cls at any time, once + * either is used, cgroup2 matching no longer works. There is no reason to + * mix the two and this is in line with how legacy and v2 compatibility is + * handled. On mode switch, cgroup references which are already being + * pointed to by socks may be leaked. While this can be remedied by adding + * synchronization around sock_cgroup_data, given that the number of leaked + * cgroups is bound and highly unlikely to be high, this seems to be the + * better trade-off. + */ struct sock_cgroup_data { - u16 prioidx; - u32 classid; + union { +#ifdef __LITTLE_ENDIAN + struct { + u8 is_data; + u8 padding; + u16 prioidx; + u32 classid; + } __packed; +#else + struct { + u32 classid; + u16 prioidx; + u8 padding; + u8 is_data; + } __packed; +#endif + u64 val; + }; }; +/* + * There's a theoretical window where the following accessors race with + * updaters and return part of the previous pointer as the prioidx or + * classid. Such races are short-lived and the result isn't critical. + */ static inline u16 sock_cgroup_prioidx(struct sock_cgroup_data *skcd) { - return skcd->prioidx; + /* fallback to 1 which is always the ID of the root cgroup */ + return (skcd->is_data & 1) ? skcd->prioidx : 1; } static inline u32 sock_cgroup_classid(struct sock_cgroup_data *skcd) { - return skcd->classid; + /* fallback to 0 which is the unconfigured default classid */ + return (skcd->is_data & 1) ? skcd->classid : 0; } +/* + * If invoked concurrently, the updaters may clobber each other. The + * caller is responsible for synchronization. + */ static inline void sock_cgroup_set_prioidx(struct sock_cgroup_data *skcd, u16 prioidx) { - skcd->prioidx = prioidx; + struct sock_cgroup_data skcd_buf = { .val = READ_ONCE(skcd->val) }; + + if (sock_cgroup_prioidx(&skcd_buf) == prioidx) + return; + + if (!(skcd_buf.is_data & 1)) { + skcd_buf.val = 0; + skcd_buf.is_data = 1; + } + + skcd_buf.prioidx = prioidx; + WRITE_ONCE(skcd->val, skcd_buf.val); /* see sock_cgroup_ptr() */ } static inline void sock_cgroup_set_classid(struct sock_cgroup_data *skcd, u32 classid) { - skcd->classid = classid; + struct sock_cgroup_data skcd_buf = { .val = READ_ONCE(skcd->val) }; + + if (sock_cgroup_classid(&skcd_buf) == classid) + return; + + if (!(skcd_buf.is_data & 1)) { + skcd_buf.val = 0; + skcd_buf.is_data = 1; + } + + skcd_buf.classid = classid; + WRITE_ONCE(skcd->val, skcd_buf.val); /* see sock_cgroup_ptr() */ } #else /* CONFIG_SOCK_CGROUP_DATA */ |