From 2a7851bffb008ff4882eee673da74718997b4265 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 17 May 2013 03:56:10 +0000 Subject: netfilter: add nf_ipv6_ops hook to fix xt_addrtype with IPv6 Quoting https://bugzilla.netfilter.org/show_bug.cgi?id=812: [ ip6tables -m addrtype ] When I tried to use in the nat/PREROUTING it messes up the routing cache even if the rule didn't matched at all. [..] If I remove the --limit-iface-in from the non-working scenario, so just use the -m addrtype --dst-type LOCAL it works! This happens when LOCAL type matching is requested with --limit-iface-in, and the default ipv6 route is via the interface the packet we test arrived on. Because xt_addrtype uses ip6_route_output, the ipv6 routing implementation creates an unwanted cached entry, and the packet won't make it to the real/expected destination. Silently ignoring --limit-iface-in makes the routing work but it breaks rule matching (--dst-type LOCAL with limit-iface-in is supposed to only match if the dst address is configured on the incoming interface; without --limit-iface-in it will match if the address is reachable via lo). The test should call ipv6_chk_addr() instead. However, this would add a link-time dependency on ipv6. There are two possible solutions: 1) Revert the commit that moved ipt_addrtype to xt_addrtype, and put ipv6 specific code into ip6t_addrtype. 2) add new "nf_ipv6_ops" struct to register pointers to ipv6 functions. While the former might seem preferable, Pablo pointed out that there are more xt modules with link-time dependeny issues regarding ipv6, so lets go for 2). Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter_ipv6.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netfilter_ipv6.h b/include/linux/netfilter_ipv6.h index 98ffb54988b6..2d4df6ce043e 100644 --- a/include/linux/netfilter_ipv6.h +++ b/include/linux/netfilter_ipv6.h @@ -17,6 +17,22 @@ extern __sum16 nf_ip6_checksum(struct sk_buff *skb, unsigned int hook, extern int ipv6_netfilter_init(void); extern void ipv6_netfilter_fini(void); + +/* + * Hook functions for ipv6 to allow xt_* modules to be built-in even + * if IPv6 is a module. + */ +struct nf_ipv6_ops { + int (*chk_addr)(struct net *net, const struct in6_addr *addr, + const struct net_device *dev, int strict); +}; + +extern const struct nf_ipv6_ops __rcu *nf_ipv6_ops; +static inline const struct nf_ipv6_ops *nf_get_ipv6_ops(void) +{ + return rcu_dereference(nf_ipv6_ops); +} + #else /* CONFIG_NETFILTER */ static inline int ipv6_netfilter_init(void) { return 0; } static inline void ipv6_netfilter_fini(void) { return; } -- cgit v1.2.3 From 7805d000db30a3787a4c969bab6ae4d8a5fd8ce6 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 24 May 2013 10:50:24 +0900 Subject: cgroup: fix a subtle bug in descendant pre-order walk When cgroup_next_descendant_pre() initiates a walk, it checks whether the subtree root doesn't have any children and if not returns NULL. Later code assumes that the subtree isn't empty. This is broken because the subtree may become empty inbetween, which can lead to the traversal escaping the subtree by walking to the sibling of the subtree root. There's no reason to have the early exit path. Remove it along with the later assumption that the subtree isn't empty. This simplifies the code a bit and fixes the subtle bug. While at it, fix the comment of cgroup_for_each_descendant_pre() which was incorrectly referring to ->css_offline() instead of ->css_online(). Signed-off-by: Tejun Heo Reviewed-by: Michal Hocko Cc: stable@vger.kernel.org --- include/linux/cgroup.h | 2 +- kernel/cgroup.c | 9 +++------ 2 files changed, 4 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 5047355b9a0f..8bda1294c035 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -707,7 +707,7 @@ struct cgroup *cgroup_rightmost_descendant(struct cgroup *pos); * * If a subsystem synchronizes against the parent in its ->css_online() and * before starting iterating, and synchronizes against @pos on each - * iteration, any descendant cgroup which finished ->css_offline() is + * iteration, any descendant cgroup which finished ->css_online() is * guaranteed to be visible in the future iterations. * * In other words, the following guarantees that a descendant can't escape diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 38b136553044..31e9ef319070 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2954,11 +2954,8 @@ struct cgroup *cgroup_next_descendant_pre(struct cgroup *pos, WARN_ON_ONCE(!rcu_read_lock_held()); /* if first iteration, pretend we just visited @cgroup */ - if (!pos) { - if (list_empty(&cgroup->children)) - return NULL; + if (!pos) pos = cgroup; - } /* visit the first child if exists */ next = list_first_or_null_rcu(&pos->children, struct cgroup, sibling); @@ -2966,14 +2963,14 @@ struct cgroup *cgroup_next_descendant_pre(struct cgroup *pos, return next; /* no child, visit my or the closest ancestor's next sibling */ - do { + while (pos != cgroup) { next = list_entry_rcu(pos->sibling.next, struct cgroup, sibling); if (&next->sibling != &pos->parent->children) return next; pos = pos->parent; - } while (pos != cgroup); + } return NULL; } -- cgit v1.2.3 From 12bcbe66d7b3cc9f9f86cd02f925666eaa3c2107 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 28 May 2013 14:38:42 -0400 Subject: rcu: Add _notrace variation of rcu_dereference_raw() and hlist_for_each_entry_rcu() As rcu_dereference_raw() under RCU debug config options can add quite a bit of checks, and that tracing uses rcu_dereference_raw(), these checks happen with the function tracer. The function tracer also happens to trace these debug checks too. This added overhead can livelock the system. Add a new interface to RCU for both rcu_dereference_raw_notrace() as well as hlist_for_each_entry_rcu_notrace() as the hlist iterator uses the rcu_dereference_raw() as well, and is used a bit with the function tracer. Link: http://lkml.kernel.org/r/20130528184209.304356745@goodmis.org Acked-by: Paul E. McKenney Signed-off-by: Steven Rostedt --- include/linux/rculist.h | 20 ++++++++++++++++++++ include/linux/rcupdate.h | 9 +++++++++ 2 files changed, 29 insertions(+) (limited to 'include/linux') diff --git a/include/linux/rculist.h b/include/linux/rculist.h index 8089e35d47ac..f4b1001a4676 100644 --- a/include/linux/rculist.h +++ b/include/linux/rculist.h @@ -460,6 +460,26 @@ static inline void hlist_add_after_rcu(struct hlist_node *prev, pos = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(\ &(pos)->member)), typeof(*(pos)), member)) +/** + * hlist_for_each_entry_rcu_notrace - iterate over rcu list of given type (for tracing) + * @pos: the type * to use as a loop cursor. + * @head: the head for your list. + * @member: the name of the hlist_node within the struct. + * + * This list-traversal primitive may safely run concurrently with + * the _rcu list-mutation primitives such as hlist_add_head_rcu() + * as long as the traversal is guarded by rcu_read_lock(). + * + * This is the same as hlist_for_each_entry_rcu() except that it does + * not do any RCU debugging or tracing. + */ +#define hlist_for_each_entry_rcu_notrace(pos, head, member) \ + for (pos = hlist_entry_safe (rcu_dereference_raw_notrace(hlist_first_rcu(head)),\ + typeof(*(pos)), member); \ + pos; \ + pos = hlist_entry_safe(rcu_dereference_raw_notrace(hlist_next_rcu(\ + &(pos)->member)), typeof(*(pos)), member)) + /** * hlist_for_each_entry_rcu_bh - iterate over rcu list of given type * @pos: the type * to use as a loop cursor. diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 4ccd68e49b00..ddcc7826d907 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -640,6 +640,15 @@ static inline void rcu_preempt_sleep_check(void) #define rcu_dereference_raw(p) rcu_dereference_check(p, 1) /*@@@ needed? @@@*/ +/* + * The tracing infrastructure traces RCU (we want that), but unfortunately + * some of the RCU checks causes tracing to lock up the system. + * + * The tracing version of rcu_dereference_raw() must not call + * rcu_read_lock_held(). + */ +#define rcu_dereference_raw_notrace(p) __rcu_dereference_check((p), 1, __rcu) + /** * rcu_access_index() - fetch RCU index with no dereferencing * @p: The index to read -- cgit v1.2.3 From 37448adfc7ce0d6d5892b87aa8d57edde4126f49 Mon Sep 17 00:00:00 2001 From: Lance Ortiz Date: Thu, 30 May 2013 08:25:12 -0600 Subject: aerdrv: Move cper_print_aer() call out of interrupt context The following warning was seen on 3.9 when a corrected PCIe error was being handled by the AER subsystem. WARNING: at .../drivers/pci/search.c:214 pci_get_dev_by_id+0x8a/0x90() This occurred because a call to pci_get_domain_bus_and_slot() was added to cper_print_pcie() to setup for the call to cper_print_aer(). The warning showed up because cper_print_pcie() is called in an interrupt context and pci_get* functions are not supposed to be called in that context. The solution is to move the cper_print_aer() call out of the interrupt context and into aer_recover_work_func() to avoid any warnings when calling pci_get* functions. Signed-off-by: Lance Ortiz Acked-by: Borislav Petkov Acked-by: Rafael J. Wysocki Signed-off-by: Tony Luck --- drivers/acpi/apei/cper.c | 18 ------------------ drivers/acpi/apei/ghes.c | 4 +++- drivers/pci/pcie/aer/aerdrv_core.c | 5 ++++- drivers/pci/pcie/aer/aerdrv_errprint.c | 4 ++-- include/linux/aer.h | 5 +++-- 5 files changed, 12 insertions(+), 24 deletions(-) (limited to 'include/linux') diff --git a/drivers/acpi/apei/cper.c b/drivers/acpi/apei/cper.c index fefc2ca7cc3e..33dc6a004802 100644 --- a/drivers/acpi/apei/cper.c +++ b/drivers/acpi/apei/cper.c @@ -250,10 +250,6 @@ static const char *cper_pcie_port_type_strs[] = { static void cper_print_pcie(const char *pfx, const struct cper_sec_pcie *pcie, const struct acpi_hest_generic_data *gdata) { -#ifdef CONFIG_ACPI_APEI_PCIEAER - struct pci_dev *dev; -#endif - if (pcie->validation_bits & CPER_PCIE_VALID_PORT_TYPE) printk("%s""port_type: %d, %s\n", pfx, pcie->port_type, pcie->port_type < ARRAY_SIZE(cper_pcie_port_type_strs) ? @@ -285,20 +281,6 @@ static void cper_print_pcie(const char *pfx, const struct cper_sec_pcie *pcie, printk( "%s""bridge: secondary_status: 0x%04x, control: 0x%04x\n", pfx, pcie->bridge.secondary_status, pcie->bridge.control); -#ifdef CONFIG_ACPI_APEI_PCIEAER - dev = pci_get_domain_bus_and_slot(pcie->device_id.segment, - pcie->device_id.bus, pcie->device_id.function); - if (!dev) { - pr_err("PCI AER Cannot get PCI device %04x:%02x:%02x.%d\n", - pcie->device_id.segment, pcie->device_id.bus, - pcie->device_id.slot, pcie->device_id.function); - return; - } - if (pcie->validation_bits & CPER_PCIE_VALID_AER_INFO) - cper_print_aer(pfx, dev, gdata->error_severity, - (struct aer_capability_regs *) pcie->aer_info); - pci_dev_put(dev); -#endif } static const char *apei_estatus_section_flag_strs[] = { diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c index d668a8ae602b..403baf4dffc1 100644 --- a/drivers/acpi/apei/ghes.c +++ b/drivers/acpi/apei/ghes.c @@ -454,7 +454,9 @@ static void ghes_do_proc(struct ghes *ghes, aer_severity = cper_severity_to_aer(sev); aer_recover_queue(pcie_err->device_id.segment, pcie_err->device_id.bus, - devfn, aer_severity); + devfn, aer_severity, + (struct aer_capability_regs *) + pcie_err->aer_info); } } diff --git a/drivers/pci/pcie/aer/aerdrv_core.c b/drivers/pci/pcie/aer/aerdrv_core.c index 8ec8b4f48560..0f4554e48cc5 100644 --- a/drivers/pci/pcie/aer/aerdrv_core.c +++ b/drivers/pci/pcie/aer/aerdrv_core.c @@ -580,6 +580,7 @@ struct aer_recover_entry u8 devfn; u16 domain; int severity; + struct aer_capability_regs *regs; }; static DEFINE_KFIFO(aer_recover_ring, struct aer_recover_entry, @@ -593,7 +594,7 @@ static DEFINE_SPINLOCK(aer_recover_ring_lock); static DECLARE_WORK(aer_recover_work, aer_recover_work_func); void aer_recover_queue(int domain, unsigned int bus, unsigned int devfn, - int severity) + int severity, struct aer_capability_regs *aer_regs) { unsigned long flags; struct aer_recover_entry entry = { @@ -601,6 +602,7 @@ void aer_recover_queue(int domain, unsigned int bus, unsigned int devfn, .devfn = devfn, .domain = domain, .severity = severity, + .regs = aer_regs, }; spin_lock_irqsave(&aer_recover_ring_lock, flags); @@ -627,6 +629,7 @@ static void aer_recover_work_func(struct work_struct *work) PCI_SLOT(entry.devfn), PCI_FUNC(entry.devfn)); continue; } + cper_print_aer(pdev, entry.severity, entry.regs); do_recovery(pdev, entry.severity); pci_dev_put(pdev); } diff --git a/drivers/pci/pcie/aer/aerdrv_errprint.c b/drivers/pci/pcie/aer/aerdrv_errprint.c index 5ab14251839d..2c7c9f5f592c 100644 --- a/drivers/pci/pcie/aer/aerdrv_errprint.c +++ b/drivers/pci/pcie/aer/aerdrv_errprint.c @@ -220,7 +220,7 @@ int cper_severity_to_aer(int cper_severity) } EXPORT_SYMBOL_GPL(cper_severity_to_aer); -void cper_print_aer(const char *prefix, struct pci_dev *dev, int cper_severity, +void cper_print_aer(struct pci_dev *dev, int cper_severity, struct aer_capability_regs *aer) { int aer_severity, layer, agent, status_strs_size, tlp_header_valid = 0; @@ -244,7 +244,7 @@ void cper_print_aer(const char *prefix, struct pci_dev *dev, int cper_severity, agent = AER_GET_AGENT(aer_severity, status); dev_err(&dev->dev, "aer_status: 0x%08x, aer_mask: 0x%08x\n", status, mask); - cper_print_bits(prefix, status, status_strs, status_strs_size); + cper_print_bits("", status, status_strs, status_strs_size); dev_err(&dev->dev, "aer_layer=%s, aer_agent=%s\n", aer_error_layer[layer], aer_agent_string[agent]); if (aer_severity != AER_CORRECTABLE) diff --git a/include/linux/aer.h b/include/linux/aer.h index ec10e1b24c1c..737f90ab4b62 100644 --- a/include/linux/aer.h +++ b/include/linux/aer.h @@ -49,10 +49,11 @@ static inline int pci_cleanup_aer_uncorrect_error_status(struct pci_dev *dev) } #endif -extern void cper_print_aer(const char *prefix, struct pci_dev *dev, +extern void cper_print_aer(struct pci_dev *dev, int cper_severity, struct aer_capability_regs *aer); extern int cper_severity_to_aer(int cper_severity); extern void aer_recover_queue(int domain, unsigned int bus, unsigned int devfn, - int severity); + int severity, + struct aer_capability_regs *aer_regs); #endif //_AER_H_ -- cgit v1.2.3 From 1e2bd517c108816220f262d7954b697af03b5f9c Mon Sep 17 00:00:00 2001 From: Pravin B Shelar Date: Thu, 30 May 2013 06:45:27 +0000 Subject: udp6: Fix udp fragmentation for tunnel traffic. udp6 over GRE tunnel does not work after to GRE tso changes. GRE tso handler passes inner packet but keeps track of outer header start in SKB_GSO_CB(skb)->mac_offset. udp6 fragment need to take care of outer header, which start at the mac_offset, while adding fragment header. This bug is introduced by commit 68c3316311 (GRE: Add TCP segmentation offload for GRE). Reported-by: Dmitry Kravkov Signed-off-by: Pravin B Shelar Tested-by: Dmitry Kravkov Signed-off-by: David S. Miller --- include/linux/skbuff.h | 15 +++++++++++++++ net/ipv6/udp_offload.c | 20 ++++++++++++-------- 2 files changed, 27 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 2e0ced1af3b1..9c676eae3968 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -2852,6 +2852,21 @@ static inline int skb_tnl_header_len(const struct sk_buff *inner_skb) SKB_GSO_CB(inner_skb)->mac_offset; } +static inline int gso_pskb_expand_head(struct sk_buff *skb, int extra) +{ + int new_headroom, headroom; + int ret; + + headroom = skb_headroom(skb); + ret = pskb_expand_head(skb, extra, 0, GFP_ATOMIC); + if (ret) + return ret; + + new_headroom = skb_headroom(skb); + SKB_GSO_CB(skb)->mac_offset += (new_headroom - headroom); + return 0; +} + static inline bool skb_is_gso(const struct sk_buff *skb) { return skb_shinfo(skb)->gso_size; diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c index 3bb3a891a424..d3cfaf9c7a08 100644 --- a/net/ipv6/udp_offload.c +++ b/net/ipv6/udp_offload.c @@ -46,11 +46,12 @@ static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb, unsigned int mss; unsigned int unfrag_ip6hlen, unfrag_len; struct frag_hdr *fptr; - u8 *mac_start, *prevhdr; + u8 *packet_start, *prevhdr; u8 nexthdr; u8 frag_hdr_sz = sizeof(struct frag_hdr); int offset; __wsum csum; + int tnl_hlen; mss = skb_shinfo(skb)->gso_size; if (unlikely(skb->len <= mss)) @@ -83,9 +84,11 @@ static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb, skb->ip_summed = CHECKSUM_NONE; /* Check if there is enough headroom to insert fragment header. */ - if ((skb_mac_header(skb) < skb->head + frag_hdr_sz) && - pskb_expand_head(skb, frag_hdr_sz, 0, GFP_ATOMIC)) - goto out; + tnl_hlen = skb_tnl_header_len(skb); + if (skb_headroom(skb) < (tnl_hlen + frag_hdr_sz)) { + if (gso_pskb_expand_head(skb, tnl_hlen + frag_hdr_sz)) + goto out; + } /* Find the unfragmentable header and shift it left by frag_hdr_sz * bytes to insert fragment header. @@ -93,11 +96,12 @@ static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb, unfrag_ip6hlen = ip6_find_1stfragopt(skb, &prevhdr); nexthdr = *prevhdr; *prevhdr = NEXTHDR_FRAGMENT; - unfrag_len = skb_network_header(skb) - skb_mac_header(skb) + - unfrag_ip6hlen; - mac_start = skb_mac_header(skb); - memmove(mac_start-frag_hdr_sz, mac_start, unfrag_len); + unfrag_len = (skb_network_header(skb) - skb_mac_header(skb)) + + unfrag_ip6hlen + tnl_hlen; + packet_start = (u8 *) skb->head + SKB_GSO_CB(skb)->mac_offset; + memmove(packet_start-frag_hdr_sz, packet_start, unfrag_len); + SKB_GSO_CB(skb)->mac_offset -= frag_hdr_sz; skb->mac_header -= frag_hdr_sz; skb->network_header -= frag_hdr_sz; -- cgit v1.2.3 From 6d7581e62f8be462440d7b22c6361f7c9fa4902b Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Wed, 29 May 2013 05:02:56 +0000 Subject: list: introduce list_first_entry_or_null non-rcu variant of list_first_or_null_rcu Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/linux/list.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/list.h b/include/linux/list.h index 6a1f8df9144b..b83e5657365a 100644 --- a/include/linux/list.h +++ b/include/linux/list.h @@ -361,6 +361,17 @@ static inline void list_splice_tail_init(struct list_head *list, #define list_first_entry(ptr, type, member) \ list_entry((ptr)->next, type, member) +/** + * list_first_entry_or_null - get the first element from a list + * @ptr: the list head to take the element from. + * @type: the type of the struct this is embedded in. + * @member: the name of the list_struct within the struct. + * + * Note that if the list is empty, it returns NULL. + */ +#define list_first_entry_or_null(ptr, type, member) \ + (!list_empty(ptr) ? list_first_entry(ptr, type, member) : NULL) + /** * list_for_each - iterate over a list * @pos: the &struct list_head to use as a loop cursor. -- cgit v1.2.3 From c87a124a5d5e8cf8e21c4363c3372bcaf53ea190 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 29 May 2013 09:06:27 +0000 Subject: net: force a reload of first item in hlist_nulls_for_each_entry_rcu Roman Gushchin discovered that udp4_lib_lookup2() was not reloading first item in the rcu protected list, in case the loop was restarted. This produced soft lockups as in https://lkml.org/lkml/2013/4/16/37 rcu_dereference(X)/ACCESS_ONCE(X) seem to not work as intended if X is ptr->field : In some cases, gcc caches the value or ptr->field in a register. Use a barrier() to disallow such caching, as documented in Documentation/atomic_ops.txt line 114 Thanks a lot to Roman for providing analysis and numerous patches. Diagnosed-by: Roman Gushchin Signed-off-by: Eric Dumazet Reported-by: Boris Zhmurov Signed-off-by: Roman Gushchin Acked-by: Paul E. McKenney Signed-off-by: David S. Miller --- include/linux/rculist_nulls.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/rculist_nulls.h b/include/linux/rculist_nulls.h index 2ae13714828b..1c33dd7da4a7 100644 --- a/include/linux/rculist_nulls.h +++ b/include/linux/rculist_nulls.h @@ -105,9 +105,14 @@ static inline void hlist_nulls_add_head_rcu(struct hlist_nulls_node *n, * @head: the head for your list. * @member: the name of the hlist_nulls_node within the struct. * + * The barrier() is needed to make sure compiler doesn't cache first element [1], + * as this loop can be restarted [2] + * [1] Documentation/atomic_ops.txt around line 114 + * [2] Documentation/RCU/rculist_nulls.txt around line 146 */ #define hlist_nulls_for_each_entry_rcu(tpos, pos, head, member) \ - for (pos = rcu_dereference_raw(hlist_nulls_first_rcu(head)); \ + for (({barrier();}), \ + pos = rcu_dereference_raw(hlist_nulls_first_rcu(head)); \ (!is_a_nulls(pos)) && \ ({ tpos = hlist_nulls_entry(pos, typeof(*tpos), member); 1; }); \ pos = rcu_dereference_raw(hlist_nulls_next_rcu(pos))) -- cgit v1.2.3 From a7526eb5d06b0084ef12d7b168d008fcf516caab Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 5 Jun 2013 19:38:26 +0000 Subject: net: Unbreak compat_sys_{send,recv}msg I broke them in this commit: commit 1be374a0518a288147c6a7398792583200a67261 Author: Andy Lutomirski Date: Wed May 22 14:07:44 2013 -0700 net: Block MSG_CMSG_COMPAT in send(m)msg and recv(m)msg This patch adds __sys_sendmsg and __sys_sendmsg as common helpers that accept MSG_CMSG_COMPAT and blocks MSG_CMSG_COMPAT at the syscall entrypoints. It also reverts some unnecessary checks in sys_socketcall. Apparently I was suffering from underscore blindness the first time around. Signed-off-by: Andy Lutomirski Tested-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/socket.h | 3 +++ net/compat.c | 13 +++++++-- net/socket.c | 72 +++++++++++++++++++++++--------------------------- 3 files changed, 47 insertions(+), 41 deletions(-) (limited to 'include/linux') diff --git a/include/linux/socket.h b/include/linux/socket.h index 33bf2dfab19d..b10ce4b341ea 100644 --- a/include/linux/socket.h +++ b/include/linux/socket.h @@ -320,6 +320,9 @@ extern int put_cmsg(struct msghdr*, int level, int type, int len, void *data); struct timespec; +/* The __sys_...msg variants allow MSG_CMSG_COMPAT */ +extern long __sys_recvmsg(int fd, struct msghdr __user *msg, unsigned flags); +extern long __sys_sendmsg(int fd, struct msghdr __user *msg, unsigned flags); extern int __sys_recvmmsg(int fd, struct mmsghdr __user *mmsg, unsigned int vlen, unsigned int flags, struct timespec *timeout); extern int __sys_sendmmsg(int fd, struct mmsghdr __user *mmsg, diff --git a/net/compat.c b/net/compat.c index 79ae88485001..f0a1ba6c8086 100644 --- a/net/compat.c +++ b/net/compat.c @@ -734,19 +734,25 @@ static unsigned char nas[21] = { asmlinkage long compat_sys_sendmsg(int fd, struct compat_msghdr __user *msg, unsigned int flags) { - return sys_sendmsg(fd, (struct msghdr __user *)msg, flags | MSG_CMSG_COMPAT); + if (flags & MSG_CMSG_COMPAT) + return -EINVAL; + return __sys_sendmsg(fd, (struct msghdr __user *)msg, flags | MSG_CMSG_COMPAT); } asmlinkage long compat_sys_sendmmsg(int fd, struct compat_mmsghdr __user *mmsg, unsigned int vlen, unsigned int flags) { + if (flags & MSG_CMSG_COMPAT) + return -EINVAL; return __sys_sendmmsg(fd, (struct mmsghdr __user *)mmsg, vlen, flags | MSG_CMSG_COMPAT); } asmlinkage long compat_sys_recvmsg(int fd, struct compat_msghdr __user *msg, unsigned int flags) { - return sys_recvmsg(fd, (struct msghdr __user *)msg, flags | MSG_CMSG_COMPAT); + if (flags & MSG_CMSG_COMPAT) + return -EINVAL; + return __sys_recvmsg(fd, (struct msghdr __user *)msg, flags | MSG_CMSG_COMPAT); } asmlinkage long compat_sys_recv(int fd, void __user *buf, size_t len, unsigned int flags) @@ -768,6 +774,9 @@ asmlinkage long compat_sys_recvmmsg(int fd, struct compat_mmsghdr __user *mmsg, int datagrams; struct timespec ktspec; + if (flags & MSG_CMSG_COMPAT) + return -EINVAL; + if (COMPAT_USE_64BIT_TIME) return __sys_recvmmsg(fd, (struct mmsghdr __user *)mmsg, vlen, flags | MSG_CMSG_COMPAT, diff --git a/net/socket.c b/net/socket.c index 9ff6366fee13..4ca1526db756 100644 --- a/net/socket.c +++ b/net/socket.c @@ -1956,7 +1956,7 @@ struct used_address { unsigned int name_len; }; -static int __sys_sendmsg(struct socket *sock, struct msghdr __user *msg, +static int ___sys_sendmsg(struct socket *sock, struct msghdr __user *msg, struct msghdr *msg_sys, unsigned int flags, struct used_address *used_address) { @@ -2071,26 +2071,30 @@ out: * BSD sendmsg interface */ -SYSCALL_DEFINE3(sendmsg, int, fd, struct msghdr __user *, msg, unsigned int, flags) +long __sys_sendmsg(int fd, struct msghdr __user *msg, unsigned flags) { int fput_needed, err; struct msghdr msg_sys; struct socket *sock; - if (flags & MSG_CMSG_COMPAT) - return -EINVAL; - sock = sockfd_lookup_light(fd, &err, &fput_needed); if (!sock) goto out; - err = __sys_sendmsg(sock, msg, &msg_sys, flags, NULL); + err = ___sys_sendmsg(sock, msg, &msg_sys, flags, NULL); fput_light(sock->file, fput_needed); out: return err; } +SYSCALL_DEFINE3(sendmsg, int, fd, struct msghdr __user *, msg, unsigned int, flags) +{ + if (flags & MSG_CMSG_COMPAT) + return -EINVAL; + return __sys_sendmsg(fd, msg, flags); +} + /* * Linux sendmmsg interface */ @@ -2121,15 +2125,16 @@ int __sys_sendmmsg(int fd, struct mmsghdr __user *mmsg, unsigned int vlen, while (datagrams < vlen) { if (MSG_CMSG_COMPAT & flags) { - err = __sys_sendmsg(sock, (struct msghdr __user *)compat_entry, - &msg_sys, flags, &used_address); + err = ___sys_sendmsg(sock, (struct msghdr __user *)compat_entry, + &msg_sys, flags, &used_address); if (err < 0) break; err = __put_user(err, &compat_entry->msg_len); ++compat_entry; } else { - err = __sys_sendmsg(sock, (struct msghdr __user *)entry, - &msg_sys, flags, &used_address); + err = ___sys_sendmsg(sock, + (struct msghdr __user *)entry, + &msg_sys, flags, &used_address); if (err < 0) break; err = put_user(err, &entry->msg_len); @@ -2158,7 +2163,7 @@ SYSCALL_DEFINE4(sendmmsg, int, fd, struct mmsghdr __user *, mmsg, return __sys_sendmmsg(fd, mmsg, vlen, flags); } -static int __sys_recvmsg(struct socket *sock, struct msghdr __user *msg, +static int ___sys_recvmsg(struct socket *sock, struct msghdr __user *msg, struct msghdr *msg_sys, unsigned int flags, int nosec) { struct compat_msghdr __user *msg_compat = @@ -2250,27 +2255,31 @@ out: * BSD recvmsg interface */ -SYSCALL_DEFINE3(recvmsg, int, fd, struct msghdr __user *, msg, - unsigned int, flags) +long __sys_recvmsg(int fd, struct msghdr __user *msg, unsigned flags) { int fput_needed, err; struct msghdr msg_sys; struct socket *sock; - if (flags & MSG_CMSG_COMPAT) - return -EINVAL; - sock = sockfd_lookup_light(fd, &err, &fput_needed); if (!sock) goto out; - err = __sys_recvmsg(sock, msg, &msg_sys, flags, 0); + err = ___sys_recvmsg(sock, msg, &msg_sys, flags, 0); fput_light(sock->file, fput_needed); out: return err; } +SYSCALL_DEFINE3(recvmsg, int, fd, struct msghdr __user *, msg, + unsigned int, flags) +{ + if (flags & MSG_CMSG_COMPAT) + return -EINVAL; + return __sys_recvmsg(fd, msg, flags); +} + /* * Linux recvmmsg interface */ @@ -2308,17 +2317,18 @@ int __sys_recvmmsg(int fd, struct mmsghdr __user *mmsg, unsigned int vlen, * No need to ask LSM for more than the first datagram. */ if (MSG_CMSG_COMPAT & flags) { - err = __sys_recvmsg(sock, (struct msghdr __user *)compat_entry, - &msg_sys, flags & ~MSG_WAITFORONE, - datagrams); + err = ___sys_recvmsg(sock, (struct msghdr __user *)compat_entry, + &msg_sys, flags & ~MSG_WAITFORONE, + datagrams); if (err < 0) break; err = __put_user(err, &compat_entry->msg_len); ++compat_entry; } else { - err = __sys_recvmsg(sock, (struct msghdr __user *)entry, - &msg_sys, flags & ~MSG_WAITFORONE, - datagrams); + err = ___sys_recvmsg(sock, + (struct msghdr __user *)entry, + &msg_sys, flags & ~MSG_WAITFORONE, + datagrams); if (err < 0) break; err = put_user(err, &entry->msg_len); @@ -2505,31 +2515,15 @@ SYSCALL_DEFINE2(socketcall, int, call, unsigned long __user *, args) (int __user *)a[4]); break; case SYS_SENDMSG: - if (a[2] & MSG_CMSG_COMPAT) { - err = -EINVAL; - break; - } err = sys_sendmsg(a0, (struct msghdr __user *)a1, a[2]); break; case SYS_SENDMMSG: - if (a[3] & MSG_CMSG_COMPAT) { - err = -EINVAL; - break; - } err = sys_sendmmsg(a0, (struct mmsghdr __user *)a1, a[2], a[3]); break; case SYS_RECVMSG: - if (a[2] & MSG_CMSG_COMPAT) { - err = -EINVAL; - break; - } err = sys_recvmsg(a0, (struct msghdr __user *)a1, a[2]); break; case SYS_RECVMMSG: - if (a[3] & MSG_CMSG_COMPAT) { - err = -EINVAL; - break; - } err = sys_recvmmsg(a0, (struct mmsghdr __user *)a1, a[2], a[3], (struct timespec __user *)a[4]); break; -- cgit v1.2.3