diff options
Diffstat (limited to 'net')
126 files changed, 1995 insertions, 798 deletions
diff --git a/net/appletalk/ddp.c b/net/appletalk/ddp.c index c00897f65a31..425942db17f6 100644 --- a/net/appletalk/ddp.c +++ b/net/appletalk/ddp.c @@ -1758,7 +1758,7 @@ static int atalk_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr copied = size; msg->msg_flags |= MSG_TRUNC; } - err = skb_copy_datagram_iovec(skb, offset, msg->msg_iov, copied); + err = skb_copy_datagram_msg(skb, offset, msg, copied); if (!err && msg->msg_name) { DECLARE_SOCKADDR(struct sockaddr_at *, sat, msg->msg_name); diff --git a/net/atm/common.c b/net/atm/common.c index 6a765156a3f6..9cd1ccae9a11 100644 --- a/net/atm/common.c +++ b/net/atm/common.c @@ -554,7 +554,7 @@ int vcc_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, msg->msg_flags |= MSG_TRUNC; } - error = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + error = skb_copy_datagram_msg(skb, 0, msg, copied); if (error) return error; sock_recv_ts_and_drops(msg, sk, skb); diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c index c35c3f48fc0f..f4f835e19378 100644 --- a/net/ax25/af_ax25.c +++ b/net/ax25/af_ax25.c @@ -1634,7 +1634,7 @@ static int ax25_recvmsg(struct kiocb *iocb, struct socket *sock, msg->msg_flags |= MSG_TRUNC; } - skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + skb_copy_datagram_msg(skb, 0, msg, copied); if (msg->msg_name) { ax25_digi digi; diff --git a/net/bluetooth/af_bluetooth.c b/net/bluetooth/af_bluetooth.c index 339c74ad4553..0a7cc565f93e 100644 --- a/net/bluetooth/af_bluetooth.c +++ b/net/bluetooth/af_bluetooth.c @@ -237,7 +237,7 @@ int bt_sock_recvmsg(struct kiocb *iocb, struct socket *sock, } skb_reset_transport_header(skb); - err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + err = skb_copy_datagram_msg(skb, 0, msg, copied); if (err == 0) { sock_recv_ts_and_drops(msg, sk, skb); @@ -328,7 +328,7 @@ int bt_sock_stream_recvmsg(struct kiocb *iocb, struct socket *sock, } chunk = min_t(unsigned int, skb->len, size); - if (skb_copy_datagram_iovec(skb, 0, msg->msg_iov, chunk)) { + if (skb_copy_datagram_msg(skb, 0, msg, chunk)) { skb_queue_head(&sk->sk_receive_queue, skb); if (!copied) copied = -EFAULT; diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c index 115f149362ba..29e1ec7189bd 100644 --- a/net/bluetooth/hci_sock.c +++ b/net/bluetooth/hci_sock.c @@ -878,7 +878,7 @@ static int hci_sock_recvmsg(struct kiocb *iocb, struct socket *sock, } skb_reset_transport_header(skb); - err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + err = skb_copy_datagram_msg(skb, 0, msg, copied); switch (hci_pi(sk)->channel) { case HCI_CHANNEL_RAW: diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c index 44cb786b925a..f96933a823e3 100644 --- a/net/bridge/br_forward.c +++ b/net/bridge/br_forward.c @@ -184,6 +184,11 @@ static void br_flood(struct net_bridge *br, struct sk_buff *skb, /* Do not flood unicast traffic to ports that turn it off */ if (unicast && !(p->flags & BR_FLOOD)) continue; + + /* Do not flood to ports that enable proxy ARP */ + if (p->flags & BR_PROXYARP) + continue; + prev = maybe_deliver(prev, p, skb, __packet_hook); if (IS_ERR(prev)) goto out; diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c index 6fd5522df696..1f1de715197c 100644 --- a/net/bridge/br_input.c +++ b/net/bridge/br_input.c @@ -16,6 +16,8 @@ #include <linux/netdevice.h> #include <linux/etherdevice.h> #include <linux/netfilter_bridge.h> +#include <linux/neighbour.h> +#include <net/arp.h> #include <linux/export.h> #include <linux/rculist.h> #include "br_private.h" @@ -57,6 +59,60 @@ static int br_pass_frame_up(struct sk_buff *skb) netif_receive_skb); } +static void br_do_proxy_arp(struct sk_buff *skb, struct net_bridge *br, + u16 vid) +{ + struct net_device *dev = br->dev; + struct neighbour *n; + struct arphdr *parp; + u8 *arpptr, *sha; + __be32 sip, tip; + + if (dev->flags & IFF_NOARP) + return; + + if (!pskb_may_pull(skb, arp_hdr_len(dev))) { + dev->stats.tx_dropped++; + return; + } + parp = arp_hdr(skb); + + if (parp->ar_pro != htons(ETH_P_IP) || + parp->ar_op != htons(ARPOP_REQUEST) || + parp->ar_hln != dev->addr_len || + parp->ar_pln != 4) + return; + + arpptr = (u8 *)parp + sizeof(struct arphdr); + sha = arpptr; + arpptr += dev->addr_len; /* sha */ + memcpy(&sip, arpptr, sizeof(sip)); + arpptr += sizeof(sip); + arpptr += dev->addr_len; /* tha */ + memcpy(&tip, arpptr, sizeof(tip)); + + if (ipv4_is_loopback(tip) || + ipv4_is_multicast(tip)) + return; + + n = neigh_lookup(&arp_tbl, &tip, dev); + if (n) { + struct net_bridge_fdb_entry *f; + + if (!(n->nud_state & NUD_VALID)) { + neigh_release(n); + return; + } + + f = __br_fdb_get(br, n->ha, vid); + if (f) + arp_send(ARPOP_REPLY, ETH_P_ARP, sip, skb->dev, tip, + sha, n->ha, sha); + + neigh_release(n); + } +} + /* note: already called with rcu_read_lock */ int br_handle_frame_finish(struct sk_buff *skb) { @@ -98,6 +154,10 @@ int br_handle_frame_finish(struct sk_buff *skb) dst = NULL; if (is_broadcast_ether_addr(dest)) { + if (p->flags & BR_PROXYARP && + skb->protocol == htons(ETH_P_ARP)) + br_do_proxy_arp(skb, br, vid); + skb2 = skb; unicast = false; } else if (is_multicast_ether_addr(dest)) { diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index 2ff9706647f2..86c239b06f6e 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -60,7 +60,8 @@ static int br_port_fill_attrs(struct sk_buff *skb, nla_put_u8(skb, IFLA_BRPORT_PROTECT, !!(p->flags & BR_ROOT_BLOCK)) || nla_put_u8(skb, IFLA_BRPORT_FAST_LEAVE, !!(p->flags & BR_MULTICAST_FAST_LEAVE)) || nla_put_u8(skb, IFLA_BRPORT_LEARNING, !!(p->flags & BR_LEARNING)) || - nla_put_u8(skb, IFLA_BRPORT_UNICAST_FLOOD, !!(p->flags & BR_FLOOD))) + nla_put_u8(skb, IFLA_BRPORT_UNICAST_FLOOD, !!(p->flags & BR_FLOOD)) || + nla_put_u8(skb, IFLA_BRPORT_PROXYARP, !!(p->flags & BR_PROXYARP))) return -EMSGSIZE; return 0; @@ -332,6 +333,7 @@ static int br_setport(struct net_bridge_port *p, struct nlattr *tb[]) br_set_port_flag(p, tb, IFLA_BRPORT_PROTECT, BR_ROOT_BLOCK); br_set_port_flag(p, tb, IFLA_BRPORT_LEARNING, BR_LEARNING); br_set_port_flag(p, tb, IFLA_BRPORT_UNICAST_FLOOD, BR_FLOOD); + br_set_port_flag(p, tb, IFLA_BRPORT_PROXYARP, BR_PROXYARP); if (tb[IFLA_BRPORT_COST]) { err = br_stp_set_path_cost(p, nla_get_u32(tb[IFLA_BRPORT_COST])); diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 4d783d071305..8f3f08140258 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -172,6 +172,7 @@ struct net_bridge_port #define BR_FLOOD 0x00000040 #define BR_AUTO_MASK (BR_FLOOD | BR_LEARNING) #define BR_PROMISC 0x00000080 +#define BR_PROXYARP 0x00000100 #ifdef CONFIG_BRIDGE_IGMP_SNOOPING struct bridge_mcast_own_query ip4_own_query; diff --git a/net/bridge/br_sysfs_if.c b/net/bridge/br_sysfs_if.c index e561cd59b8a6..2de5d91199e8 100644 --- a/net/bridge/br_sysfs_if.c +++ b/net/bridge/br_sysfs_if.c @@ -170,6 +170,7 @@ BRPORT_ATTR_FLAG(bpdu_guard, BR_BPDU_GUARD); BRPORT_ATTR_FLAG(root_block, BR_ROOT_BLOCK); BRPORT_ATTR_FLAG(learning, BR_LEARNING); BRPORT_ATTR_FLAG(unicast_flood, BR_FLOOD); +BRPORT_ATTR_FLAG(proxyarp, BR_PROXYARP); #ifdef CONFIG_BRIDGE_IGMP_SNOOPING static ssize_t show_multicast_router(struct net_bridge_port *p, char *buf) @@ -213,6 +214,7 @@ static const struct brport_attribute *brport_attrs[] = { &brport_attr_multicast_router, &brport_attr_multicast_fast_leave, #endif + &brport_attr_proxyarp, NULL }; diff --git a/net/caif/caif_socket.c b/net/caif/caif_socket.c index 43f750e88e19..fbcd156099fb 100644 --- a/net/caif/caif_socket.c +++ b/net/caif/caif_socket.c @@ -293,7 +293,7 @@ static int caif_seqpkt_recvmsg(struct kiocb *iocb, struct socket *sock, copylen = len; } - ret = skb_copy_datagram_iovec(skb, 0, m->msg_iov, copylen); + ret = skb_copy_datagram_msg(skb, 0, m, copylen); if (ret) goto out_free; diff --git a/net/core/dev.c b/net/core/dev.c index 945bbd001359..70bb609c283d 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -118,6 +118,7 @@ #include <linux/if_vlan.h> #include <linux/ip.h> #include <net/ip.h> +#include <net/mpls.h> #include <linux/ipv6.h> #include <linux/in.h> #include <linux/jhash.h> @@ -2530,7 +2531,7 @@ static netdev_features_t net_mpls_features(struct sk_buff *skb, netdev_features_t features, __be16 type) { - if (type == htons(ETH_P_MPLS_UC) || type == htons(ETH_P_MPLS_MC)) + if (eth_p_mpls(type)) features &= skb->dev->mpls_features; return features; @@ -4316,20 +4317,28 @@ static void net_rps_action_and_irq_enable(struct softnet_data *sd) local_irq_enable(); } +static bool sd_has_rps_ipi_waiting(struct softnet_data *sd) +{ +#ifdef CONFIG_RPS + return sd->rps_ipi_list != NULL; +#else + return false; +#endif +} + static int process_backlog(struct napi_struct *napi, int quota) { int work = 0; struct softnet_data *sd = container_of(napi, struct softnet_data, backlog); -#ifdef CONFIG_RPS /* Check if we have pending ipi, its better to send them now, * not waiting net_rx_action() end. */ - if (sd->rps_ipi_list) { + if (sd_has_rps_ipi_waiting(sd)) { local_irq_disable(); net_rps_action_and_irq_enable(sd); } -#endif + napi->weight = weight_p; local_irq_disable(); while (1) { @@ -4356,7 +4365,6 @@ static int process_backlog(struct napi_struct *napi, int quota) * We can use a plain write instead of clear_bit(), * and we dont need an smp_mb() memory barrier. */ - list_del(&napi->poll_list); napi->state = 0; rps_unlock(sd); @@ -4376,7 +4384,8 @@ static int process_backlog(struct napi_struct *napi, int quota) * __napi_schedule - schedule for receive * @n: entry to schedule * - * The entry's receive function will be scheduled to run + * The entry's receive function will be scheduled to run. + * Consider using __napi_schedule_irqoff() if hard irqs are masked. */ void __napi_schedule(struct napi_struct *n) { @@ -4388,12 +4397,24 @@ void __napi_schedule(struct napi_struct *n) } EXPORT_SYMBOL(__napi_schedule); +/** + * __napi_schedule_irqoff - schedule for receive + * @n: entry to schedule + * + * Variant of __napi_schedule() assuming hard irqs are masked + */ +void __napi_schedule_irqoff(struct napi_struct *n) +{ + ____napi_schedule(this_cpu_ptr(&softnet_data), n); +} +EXPORT_SYMBOL(__napi_schedule_irqoff); + void __napi_complete(struct napi_struct *n) { BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state)); BUG_ON(n->gro_list); - list_del(&n->poll_list); + list_del_init(&n->poll_list); smp_mb__before_atomic(); clear_bit(NAPI_STATE_SCHED, &n->state); } @@ -4411,9 +4432,15 @@ void napi_complete(struct napi_struct *n) return; napi_gro_flush(n, false); - local_irq_save(flags); - __napi_complete(n); - local_irq_restore(flags); + + if (likely(list_empty(&n->poll_list))) { + WARN_ON_ONCE(!test_and_clear_bit(NAPI_STATE_SCHED, &n->state)); + } else { + /* If n->poll_list is not empty, we need to mask irqs */ + local_irq_save(flags); + __napi_complete(n); + local_irq_restore(flags); + } } EXPORT_SYMBOL(napi_complete); @@ -4507,29 +4534,28 @@ static void net_rx_action(struct softirq_action *h) struct softnet_data *sd = this_cpu_ptr(&softnet_data); unsigned long time_limit = jiffies + 2; int budget = netdev_budget; + LIST_HEAD(list); + LIST_HEAD(repoll); void *have; local_irq_disable(); + list_splice_init(&sd->poll_list, &list); + local_irq_enable(); - while (!list_empty(&sd->poll_list)) { + while (!list_empty(&list)) { struct napi_struct *n; int work, weight; - /* If softirq window is exhuasted then punt. + /* If softirq window is exhausted then punt. * Allow this to run for 2 jiffies since which will allow * an average latency of 1.5/HZ. */ if (unlikely(budget <= 0 || time_after_eq(jiffies, time_limit))) goto softnet_break; - local_irq_enable(); - /* Even though interrupts have been re-enabled, this - * access is safe because interrupts can only add new - * entries to the tail of this list, and only ->poll() - * calls can remove this head entry from the list. - */ - n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list); + n = list_first_entry(&list, struct napi_struct, poll_list); + list_del_init(&n->poll_list); have = netpoll_poll_lock(n); @@ -4551,8 +4577,6 @@ static void net_rx_action(struct softirq_action *h) budget -= work; - local_irq_disable(); - /* Drivers must not modify the NAPI state if they * consume the entire weight. In such cases this code * still "owns" the NAPI instance and therefore can @@ -4560,32 +4584,40 @@ static void net_rx_action(struct softirq_action *h) */ if (unlikely(work == weight)) { if (unlikely(napi_disable_pending(n))) { - local_irq_enable(); napi_complete(n); - local_irq_disable(); } else { if (n->gro_list) { /* flush too old packets * If HZ < 1000, flush all packets. */ - local_irq_enable(); napi_gro_flush(n, HZ >= 1000); - local_irq_disable(); } - list_move_tail(&n->poll_list, &sd->poll_list); + list_add_tail(&n->poll_list, &repoll); } } netpoll_poll_unlock(have); } + + if (!sd_has_rps_ipi_waiting(sd) && + list_empty(&list) && + list_empty(&repoll)) + return; out: + local_irq_disable(); + + list_splice_tail_init(&sd->poll_list, &list); + list_splice_tail(&repoll, &list); + list_splice(&list, &sd->poll_list); + if (!list_empty(&sd->poll_list)) + __raise_softirq_irqoff(NET_RX_SOFTIRQ); + net_rps_action_and_irq_enable(sd); return; softnet_break: sd->time_squeeze++; - __raise_softirq_irqoff(NET_RX_SOFTIRQ); goto out; } diff --git a/net/core/ethtool.c b/net/core/ethtool.c index 06dfb293e5aa..b0f84f5ddda8 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -84,7 +84,6 @@ static const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN] [NETIF_F_GSO_IPIP_BIT] = "tx-ipip-segmentation", [NETIF_F_GSO_SIT_BIT] = "tx-sit-segmentation", [NETIF_F_GSO_UDP_TUNNEL_BIT] = "tx-udp_tnl-segmentation", - [NETIF_F_GSO_MPLS_BIT] = "tx-mpls-segmentation", [NETIF_F_FCOE_CRC_BIT] = "tx-checksum-fcoe-crc", [NETIF_F_SCTP_CSUM_BIT] = "tx-checksum-sctp", diff --git a/net/core/neighbour.c b/net/core/neighbour.c index ef31fef25e5a..edd04116ecb7 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -773,7 +773,7 @@ static void neigh_periodic_work(struct work_struct *work) if (time_after(jiffies, tbl->last_rand + 300 * HZ)) { struct neigh_parms *p; tbl->last_rand = jiffies; - for (p = &tbl->parms; p; p = p->next) + list_for_each_entry(p, &tbl->parms_list, list) p->reachable_time = neigh_rand_reach_time(NEIGH_VAR(p, BASE_REACHABLE_TIME)); } @@ -1446,7 +1446,7 @@ static inline struct neigh_parms *lookup_neigh_parms(struct neigh_table *tbl, { struct neigh_parms *p; - for (p = &tbl->parms; p; p = p->next) { + list_for_each_entry(p, &tbl->parms_list, list) { if ((p->dev && p->dev->ifindex == ifindex && net_eq(neigh_parms_net(p), net)) || (!p->dev && !ifindex && net_eq(net, &init_net))) return p; @@ -1481,8 +1481,7 @@ struct neigh_parms *neigh_parms_alloc(struct net_device *dev, } write_lock_bh(&tbl->lock); - p->next = tbl->parms.next; - tbl->parms.next = p; + list_add(&p->list, &tbl->parms.list); write_unlock_bh(&tbl->lock); neigh_parms_data_state_cleanall(p); @@ -1501,24 +1500,15 @@ static void neigh_rcu_free_parms(struct rcu_head *head) void neigh_parms_release(struct neigh_table *tbl, struct neigh_parms *parms) { - struct neigh_parms **p; - if (!parms || parms == &tbl->parms) return; write_lock_bh(&tbl->lock); - for (p = &tbl->parms.next; *p; p = &(*p)->next) { - if (*p == parms) { - *p = parms->next; - parms->dead = 1; - write_unlock_bh(&tbl->lock); - if (parms->dev) - dev_put(parms->dev); - call_rcu(&parms->rcu_head, neigh_rcu_free_parms); - return; - } - } + list_del(&parms->list); + parms->dead = 1; write_unlock_bh(&tbl->lock); - neigh_dbg(1, "%s: not found\n", __func__); + if (parms->dev) + dev_put(parms->dev); + call_rcu(&parms->rcu_head, neigh_rcu_free_parms); } EXPORT_SYMBOL(neigh_parms_release); @@ -1535,6 +1525,8 @@ static void neigh_table_init_no_netlink(struct neigh_table *tbl) unsigned long now = jiffies; unsigned long phsize; + INIT_LIST_HEAD(&tbl->parms_list); + list_add(&tbl->parms.list, &tbl->parms_list); write_pnet(&tbl->parms.net, &init_net); atomic_set(&tbl->parms.refcnt, 1); tbl->parms.reachable_time = @@ -2154,7 +2146,9 @@ static int neightbl_dump_info(struct sk_buff *skb, struct netlink_callback *cb) NLM_F_MULTI) <= 0) break; - for (nidx = 0, p = tbl->parms.next; p; p = p->next) { + nidx = 0; + p = list_next_entry(&tbl->parms, list); + list_for_each_entry_from(p, &tbl->parms_list, list) { if (!net_eq(neigh_parms_net(p), net)) continue; diff --git a/net/core/skbuff.c b/net/core/skbuff.c index c16615bfb61e..700189604f3d 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -3013,7 +3013,7 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb, if (nskb->len == len + doffset) goto perform_csum_check; - if (!sg) { + if (!sg && !nskb->remcsum_offload) { nskb->ip_summed = CHECKSUM_NONE; nskb->csum = skb_copy_and_csum_bits(head_skb, offset, skb_put(nskb, len), @@ -3085,7 +3085,7 @@ skip_fraglist: nskb->truesize += nskb->data_len; perform_csum_check: - if (!csum) { + if (!csum && !nskb->remcsum_offload) { nskb->csum = skb_checksum(nskb, doffset, nskb->len - doffset, 0); nskb->ip_summed = CHECKSUM_NONE; @@ -3099,6 +3099,16 @@ perform_csum_check: * (see validate_xmit_skb_list() for example) */ segs->prev = tail; + + /* Following permits correct backpressure, for protocols + * using skb_set_owner_w(). + * Idea is to tranfert ownership from head_skb to last segment. + */ + if (head_skb->destructor == sock_wfree) { + swap(tail->truesize, head_skb->truesize); + swap(tail->destructor, head_skb->destructor); + swap(tail->sk, head_skb->sk); + } return segs; err: diff --git a/net/core/sock.c b/net/core/sock.c index 15e0c67b1069..ac56dd06c306 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -2457,7 +2457,7 @@ int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len, msg->msg_flags |= MSG_TRUNC; copied = len; } - err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + err = skb_copy_datagram_msg(skb, 0, msg, copied); if (err) goto out_free_skb; diff --git a/net/dccp/proto.c b/net/dccp/proto.c index 5ab6627cf370..8e6ae9422a7b 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -896,7 +896,7 @@ verify_sock_status: else if (len < skb->len) msg->msg_flags |= MSG_TRUNC; - if (skb_copy_datagram_iovec(skb, 0, msg->msg_iov, len)) { + if (skb_copy_datagram_msg(skb, 0, msg, len)) { /* Exception. Bailout! */ len = -EFAULT; break; diff --git a/net/dsa/Kconfig b/net/dsa/Kconfig index a585fd6352eb..5f8ac404535b 100644 --- a/net/dsa/Kconfig +++ b/net/dsa/Kconfig @@ -11,6 +11,17 @@ config NET_DSA if NET_DSA +config NET_DSA_HWMON + bool "Distributed Switch Architecture HWMON support" + default y + depends on HWMON && !(NET_DSA=y && HWMON=m) + ---help--- + Say Y if you want to expose thermal sensor data on switches supported + by the Distributed Switch Architecture. + + Some of those switches contain thermal sensors. This data is available + via the hwmon sysfs interface and exposes the onboard sensors. + # tagging formats config NET_DSA_TAG_BRCM bool diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index 6317b41c99b0..dd646a8025cb 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -9,6 +9,9 @@ * (at your option) any later version. */ +#include <linux/ctype.h> +#include <linux/device.h> +#include <linux/hwmon.h> #include <linux/list.h> #include <linux/platform_device.h> #include <linux/slab.h> @@ -17,6 +20,7 @@ #include <linux/of.h> #include <linux/of_mdio.h> #include <linux/of_platform.h> +#include <linux/sysfs.h> #include "dsa_priv.h" char dsa_driver_version[] = "0.1"; @@ -71,6 +75,104 @@ dsa_switch_probe(struct device *host_dev, int sw_addr, char **_name) return ret; } +/* hwmon support ************************************************************/ + +#ifdef CONFIG_NET_DSA_HWMON + +static ssize_t temp1_input_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct dsa_switch *ds = dev_get_drvdata(dev); + int temp, ret; + + ret = ds->drv->get_temp(ds, &temp); + if (ret < 0) + return ret; + + return sprintf(buf, "%d\n", temp * 1000); +} +static DEVICE_ATTR_RO(temp1_input); + +static ssize_t temp1_max_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct dsa_switch *ds = dev_get_drvdata(dev); + int temp, ret; + + ret = ds->drv->get_temp_limit(ds, &temp); + if (ret < 0) + return ret; + + return sprintf(buf, "%d\n", temp * 1000); +} + +static ssize_t temp1_max_store(struct device *dev, + struct device_attribute *attr, const char *buf, + size_t count) +{ + struct dsa_switch *ds = dev_get_drvdata(dev); + int temp, ret; + + ret = kstrtoint(buf, 0, &temp); + if (ret < 0) + return ret; + + ret = ds->drv->set_temp_limit(ds, DIV_ROUND_CLOSEST(temp, 1000)); + if (ret < 0) + return ret; + + return count; +} +static DEVICE_ATTR(temp1_max, S_IRUGO, temp1_max_show, temp1_max_store); + +static ssize_t temp1_max_alarm_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct dsa_switch *ds = dev_get_drvdata(dev); + bool alarm; + int ret; + + ret = ds->drv->get_temp_alarm(ds, &alarm); + if (ret < 0) + return ret; + + return sprintf(buf, "%d\n", alarm); +} +static DEVICE_ATTR_RO(temp1_max_alarm); + +static struct attribute *dsa_hwmon_attrs[] = { + &dev_attr_temp1_input.attr, /* 0 */ + &dev_attr_temp1_max.attr, /* 1 */ + &dev_attr_temp1_max_alarm.attr, /* 2 */ + NULL +}; + +static umode_t dsa_hwmon_attrs_visible(struct kobject *kobj, + struct attribute *attr, int index) +{ + struct device *dev = container_of(kobj, struct device, kobj); + struct dsa_switch *ds = dev_get_drvdata(dev); + struct dsa_switch_driver *drv = ds->drv; + umode_t mode = attr->mode; + + if (index == 1) { + if (!drv->get_temp_limit) + mode = 0; + else if (drv->set_temp_limit) + mode |= S_IWUSR; + } else if (index == 2 && !drv->get_temp_alarm) { + mode = 0; + } + return mode; +} + +static const struct attribute_group dsa_hwmon_group = { + .attrs = dsa_hwmon_attrs, + .is_visible = dsa_hwmon_attrs_visible, +}; +__ATTRIBUTE_GROUPS(dsa_hwmon); + +#endif /* CONFIG_NET_DSA_HWMON */ /* basic switch operations **************************************************/ static struct dsa_switch * @@ -228,6 +330,31 @@ dsa_switch_setup(struct dsa_switch_tree *dst, int index, ds->ports[i] = slave_dev; } +#ifdef CONFIG_NET_DSA_HWMON + /* If the switch provides a temperature sensor, + * register with hardware monitoring subsystem. + * Treat registration error as non-fatal and ignore it. + */ + if (drv->get_temp) { + const char *netname = netdev_name(dst->master_netdev); + char hname[IFNAMSIZ + 1]; + int i, j; + + /* Create valid hwmon 'name' attribute */ + for (i = j = 0; i < IFNAMSIZ && netname[i]; i++) { + if (isalnum(netname[i])) + hname[j++] = netname[i]; + } + hname[j] = '\0'; + scnprintf(ds->hwmon_name, sizeof(ds->hwmon_name), "%s_dsa%d", + hname, index); + ds->hwmon_dev = hwmon_device_register_with_groups(NULL, + ds->hwmon_name, ds, dsa_hwmon_groups); + if (IS_ERR(ds->hwmon_dev)) + ds->hwmon_dev = NULL; + } +#endif /* CONFIG_NET_DSA_HWMON */ + return ds; out_free: @@ -239,6 +366,10 @@ out: static void dsa_switch_destroy(struct dsa_switch *ds) { +#ifdef CONFIG_NET_DSA_HWMON + if (ds->hwmon_dev) + hwmon_device_unregister(ds->hwmon_dev); +#endif } #ifdef CONFIG_PM_SLEEP @@ -447,6 +578,7 @@ static int dsa_of_probe(struct platform_device *pdev) const char *port_name; int chip_index, port_index; const unsigned int *sw_addr, *port_reg; + u32 eeprom_len; int ret; mdio = of_parse_phandle(np, "dsa,mii-bus", 0); @@ -498,6 +630,9 @@ static int dsa_of_probe(struct platform_device *pdev) if (cd->sw_addr > PHY_MAX_ADDR) continue; + if (!of_property_read_u32(np, "eeprom-length", &eeprom_len)) + cd->eeprom_len = eeprom_len; + for_each_available_child_of_node(child, port) { port_reg = of_get_property(port, "reg", NULL); if (!port_reg) diff --git a/net/dsa/slave.c b/net/dsa/slave.c index ab03e00ffe8f..0ea466dad818 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -249,6 +249,27 @@ static void dsa_slave_get_drvinfo(struct net_device *dev, strlcpy(drvinfo->bus_info, "platform", sizeof(drvinfo->bus_info)); } +static int dsa_slave_get_regs_len(struct net_device *dev) +{ + struct dsa_slave_priv *p = netdev_priv(dev); + struct dsa_switch *ds = p->parent; + + if (ds->drv->get_regs_len) + return ds->drv->get_regs_len(ds, p->port); + + return -EOPNOTSUPP; +} + +static void +dsa_slave_get_regs(struct net_device *dev, struct ethtool_regs *regs, void *_p) +{ + struct dsa_slave_priv *p = netdev_priv(dev); + struct dsa_switch *ds = p->parent; + + if (ds->drv->get_regs) + ds->drv->get_regs(ds, p->port, regs, _p); +} + static int dsa_slave_nway_reset(struct net_device *dev) { struct dsa_slave_priv *p = netdev_priv(dev); @@ -271,6 +292,44 @@ static u32 dsa_slave_get_link(struct net_device *dev) return -EOPNOTSUPP; } +static int dsa_slave_get_eeprom_len(struct net_device *dev) +{ + struct dsa_slave_priv *p = netdev_priv(dev); + struct dsa_switch *ds = p->parent; + + if (ds->pd->eeprom_len) + return ds->pd->eeprom_len; + + if (ds->drv->get_eeprom_len) + return ds->drv->get_eeprom_len(ds); + + return 0; +} + +static int dsa_slave_get_eeprom(struct net_device *dev, + struct ethtool_eeprom *eeprom, u8 *data) +{ + struct dsa_slave_priv *p = netdev_priv(dev); + struct dsa_switch *ds = p->parent; + + if (ds->drv->get_eeprom) + return ds->drv->get_eeprom(ds, eeprom, data); + + return -EOPNOTSUPP; +} + +static int dsa_slave_set_eeprom(struct net_device *dev, + struct ethtool_eeprom *eeprom, u8 *data) +{ + struct dsa_slave_priv *p = netdev_priv(dev); + struct dsa_switch *ds = p->parent; + + if (ds->drv->set_eeprom) + return ds->drv->set_eeprom(ds, eeprom, data); + + return -EOPNOTSUPP; +} + static void dsa_slave_get_strings(struct net_device *dev, uint32_t stringset, uint8_t *data) { @@ -385,8 +444,13 @@ static const struct ethtool_ops dsa_slave_ethtool_ops = { .get_settings = dsa_slave_get_settings, .set_settings = dsa_slave_set_settings, .get_drvinfo = dsa_slave_get_drvinfo, + .get_regs_len = dsa_slave_get_regs_len, + .get_regs = dsa_slave_get_regs, .nway_reset = dsa_slave_nway_reset, .get_link = dsa_slave_get_link, + .get_eeprom_len = dsa_slave_get_eeprom_len, + .get_eeprom = dsa_slave_get_eeprom, + .set_eeprom = dsa_slave_set_eeprom, .get_strings = dsa_slave_get_strings, .get_ethtool_stats = dsa_slave_get_ethtool_stats, .get_sset_count = dsa_slave_get_sset_count, diff --git a/net/dsa/tag_dsa.c b/net/dsa/tag_dsa.c index ce90c8bdc658..2dab27063273 100644 --- a/net/dsa/tag_dsa.c +++ b/net/dsa/tag_dsa.c @@ -63,8 +63,6 @@ static netdev_tx_t dsa_xmit(struct sk_buff *skb, struct net_device *dev) dsa_header[3] = 0x00; } - skb->protocol = htons(ETH_P_DSA); - skb->dev = p->parent->dst->master_netdev; dev_queue_xmit(skb); diff --git a/net/dsa/tag_edsa.c b/net/dsa/tag_edsa.c index 94fcce778679..9aeda596f7ec 100644 --- a/net/dsa/tag_edsa.c +++ b/net/dsa/tag_edsa.c @@ -76,8 +76,6 @@ static netdev_tx_t edsa_xmit(struct sk_buff *skb, struct net_device *dev) edsa_header[7] = 0x00; } - skb->protocol = htons(ETH_P_EDSA); - skb->dev = p->parent->dst->master_netdev; dev_queue_xmit(skb); diff --git a/net/dsa/tag_trailer.c b/net/dsa/tag_trailer.c index 115fdca34077..e268f9db8893 100644 --- a/net/dsa/tag_trailer.c +++ b/net/dsa/tag_trailer.c @@ -57,8 +57,6 @@ static netdev_tx_t trailer_xmit(struct sk_buff *skb, struct net_device *dev) trailer[2] = 0x10; trailer[3] = 0x00; - nskb->protocol = htons(ETH_P_TRAILER); - nskb->dev = p->parent->dst->master_netdev; dev_queue_xmit(nskb); diff --git a/net/ieee802154/dgram.c b/net/ieee802154/dgram.c index ef2ad8aaef13..fc9193eabd41 100644 --- a/net/ieee802154/dgram.c +++ b/net/ieee802154/dgram.c @@ -324,7 +324,7 @@ static int dgram_recvmsg(struct kiocb *iocb, struct sock *sk, } /* FIXME: skip headers if necessary ?! */ - err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + err = skb_copy_datagram_msg(skb, 0, msg, copied); if (err) goto done; diff --git a/net/ieee802154/raw.c b/net/ieee802154/raw.c index 9d1f64806f02..73a4d53463de 100644 --- a/net/ieee802154/raw.c +++ b/net/ieee802154/raw.c @@ -195,7 +195,7 @@ static int raw_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, copied = len; } - err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + err = skb_copy_datagram_msg(skb, 0, msg, copied); if (err) goto done; diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index e682b48e0709..bd2901604842 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -322,6 +322,15 @@ config NET_FOU network mechanisms and optimizations for UDP (such as ECMP and RSS) can be leveraged to provide better service. +config NET_FOU_IP_TUNNELS + bool "IP: FOU encapsulation of IP tunnels" + depends on NET_IPIP || NET_IPGRE || IPV6_SIT + select NET_FOU + ---help--- + Allow configuration of FOU or GUE encapsulation for IP tunnels. + When this option is enabled IP tunnels can be configured to use + FOU or GUE encapsulation. + config GENEVE tristate "Generic Network Virtualization Encapsulation (Geneve)" depends on INET diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 8b7fe5b03906..3a096bb2d596 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1222,7 +1222,7 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb, SKB_GSO_TCPV6 | SKB_GSO_UDP_TUNNEL | SKB_GSO_UDP_TUNNEL_CSUM | - SKB_GSO_MPLS | + SKB_GSO_TUNNEL_REMCSUM | 0))) goto out; diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c index 4715f25dfe03..5160c710f2eb 100644 --- a/net/ipv4/cipso_ipv4.c +++ b/net/ipv4/cipso_ipv4.c @@ -50,7 +50,7 @@ #include <net/netlabel.h> #include <net/cipso_ipv4.h> #include <linux/atomic.h> -#include <asm/bug.h> +#include <linux/bug.h> #include <asm/unaligned.h> /* List of available DOI definitions */ @@ -72,6 +72,7 @@ struct cipso_v4_map_cache_bkt { u32 size; struct list_head list; }; + struct cipso_v4_map_cache_entry { u32 hash; unsigned char *key; @@ -82,7 +83,8 @@ struct cipso_v4_map_cache_entry { u32 activity; struct list_head list; }; -static struct cipso_v4_map_cache_bkt *cipso_v4_cache = NULL; + +static struct cipso_v4_map_cache_bkt *cipso_v4_cache; /* Restricted bitmap (tag #1) flags */ int cipso_v4_rbm_optfmt = 0; @@ -539,7 +541,7 @@ doi_add_return: /** * cipso_v4_doi_free - Frees a DOI definition - * @entry: the entry's RCU field + * @doi_def: the DOI definition * * Description: * This function frees all of the memory associated with a DOI definition. diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index 360b565918c4..60173d4d3a0e 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -392,8 +392,10 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb) if (elen <= 0) goto out; - if ((err = skb_cow_data(skb, 0, &trailer)) < 0) + err = skb_cow_data(skb, 0, &trailer); + if (err < 0) goto out; + nfrags = err; assoclen = sizeof(*esph); @@ -601,12 +603,12 @@ static int esp_init_authenc(struct xfrm_state *x) BUG_ON(!aalg_desc); err = -EINVAL; - if (aalg_desc->uinfo.auth.icv_fullbits/8 != + if (aalg_desc->uinfo.auth.icv_fullbits / 8 != crypto_aead_authsize(aead)) { - NETDEBUG(KERN_INFO "ESP: %s digestsize %u != %hu\n", - x->aalg->alg_name, - crypto_aead_authsize(aead), - aalg_desc->uinfo.auth.icv_fullbits/8); + pr_info("ESP: %s digestsize %u != %hu\n", + x->aalg->alg_name, + crypto_aead_authsize(aead), + aalg_desc->uinfo.auth.icv_fullbits / 8); goto free_key; } diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c index 32e78924e246..740ae099a0d9 100644 --- a/net/ipv4/fou.c +++ b/net/ipv4/fou.c @@ -38,21 +38,17 @@ static inline struct fou *fou_from_sock(struct sock *sk) return sk->sk_user_data; } -static int fou_udp_encap_recv_deliver(struct sk_buff *skb, - u8 protocol, size_t len) +static void fou_recv_pull(struct sk_buff *skb, size_t len) { struct iphdr *iph = ip_hdr(skb); /* Remove 'len' bytes from the packet (UDP header and - * FOU header if present), modify the protocol to the one - * we found, and then call rcv_encap. + * FOU header if present). */ iph->tot_len = htons(ntohs(iph->tot_len) - len); __skb_pull(skb, len); skb_postpull_rcsum(skb, udp_hdr(skb), len); skb_reset_transport_header(skb); - - return -protocol; } static int fou_udp_recv(struct sock *sk, struct sk_buff *skb) @@ -62,16 +58,78 @@ static int fou_udp_recv(struct sock *sk, struct sk_buff *skb) if (!fou) return 1; - return fou_udp_encap_recv_deliver(skb, fou->protocol, - sizeof(struct udphdr)); + fou_recv_pull(skb, sizeof(struct udphdr)); + + return -fou->protocol; +} + +static struct guehdr *gue_remcsum(struct sk_buff *skb, struct guehdr *guehdr, + void *data, int hdrlen, u8 ipproto) +{ + __be16 *pd = data; + u16 start = ntohs(pd[0]); + u16 offset = ntohs(pd[1]); + u16 poffset = 0; + u16 plen; + __wsum csum, delta; + __sum16 *psum; + + if (skb->remcsum_offload) { + /* Already processed in GRO path */ + skb->remcsum_offload = 0; + return guehdr; + } + + if (start > skb->len - hdrlen || + offset > skb->len - hdrlen - sizeof(u16)) + return NULL; + + if (unlikely(skb->ip_summed != CHECKSUM_COMPLETE)) + __skb_checksum_complete(skb); + + plen = hdrlen + offset + sizeof(u16); + if (!pskb_may_pull(skb, plen)) + return NULL; + guehdr = (struct guehdr *)&udp_hdr(skb)[1]; + + if (ipproto == IPPROTO_IP && sizeof(struct iphdr) < plen) { + struct iphdr *ip = (struct iphdr *)(skb->data + hdrlen); + + /* If next header happens to be IP we can skip that for the + * checksum calculation since the IP header checksum is zero + * if correct. + */ + poffset = ip->ihl * 4; + } + + csum = csum_sub(skb->csum, skb_checksum(skb, poffset + hdrlen, + start - poffset - hdrlen, 0)); + + /* Set derived checksum in packet */ + psum = (__sum16 *)(skb->data + hdrlen + offset); + delta = csum_sub(csum_fold(csum), *psum); + *psum = csum_fold(csum); + + /* Adjust skb->csum since we changed the packet */ + skb->csum = csum_add(skb->csum, delta); + + return guehdr; +} + +static int gue_control_message(struct sk_buff *skb, struct guehdr *guehdr) +{ + /* No support yet */ + kfree_skb(skb); + return 0; } static int gue_udp_recv(struct sock *sk, struct sk_buff *skb) { struct fou *fou = fou_from_sock(sk); - size_t len; + size_t len, optlen, hdrlen; struct guehdr *guehdr; - struct udphdr *uh; + void *data; + u16 doffset = 0; if (!fou) return 1; @@ -80,25 +138,61 @@ static int gue_udp_recv(struct sock *sk, struct sk_buff *skb) if (!pskb_may_pull(skb, len)) goto drop; - uh = udp_hdr(skb); - guehdr = (struct guehdr *)&uh[1]; + guehdr = (struct guehdr *)&udp_hdr(skb)[1]; + + optlen = guehdr->hlen << 2; + len += optlen; - len += guehdr->hlen << 2; if (!pskb_may_pull(skb, len)) goto drop; - uh = udp_hdr(skb); - guehdr = (struct guehdr *)&uh[1]; + /* guehdr may change after pull */ + guehdr = (struct guehdr *)&udp_hdr(skb)[1]; - if (guehdr->version != 0) - goto drop; + hdrlen = sizeof(struct guehdr) + optlen; - if (guehdr->flags) { - /* No support yet */ + if (guehdr->version != 0 || validate_gue_flags(guehdr, optlen)) goto drop; + + hdrlen = sizeof(struct guehdr) + optlen; + + ip_hdr(skb)->tot_len = htons(ntohs(ip_hdr(skb)->tot_len) - len); + + /* Pull UDP header now, skb->data points to guehdr */ + __skb_pull(skb, sizeof(struct udphdr)); + + /* Pull csum through the guehdr now . This can be used if + * there is a remote checksum offload. + */ + skb_postpull_rcsum(skb, udp_hdr(skb), len); + + data = &guehdr[1]; + + if (guehdr->flags & GUE_FLAG_PRIV) { + __be32 flags = *(__be32 *)(data + doffset); + + doffset += GUE_LEN_PRIV; + + if (flags & GUE_PFLAG_REMCSUM) { + guehdr = gue_remcsum(skb, guehdr, data + doffset, + hdrlen, guehdr->proto_ctype); + if (!guehdr) + goto drop; + + data = &guehdr[1]; + + doffset += GUE_PLEN_REMCSUM; + } } - return fou_udp_encap_recv_deliver(skb, guehdr->next_hdr, len); + if (unlikely(guehdr->control)) + return gue_control_message(skb, guehdr); + + __skb_pull(skb, hdrlen); + skb_reset_transport_header(skb); + + return -guehdr->proto_ctype; + drop: kfree_skb(skb); return 0; @@ -147,6 +241,66 @@ out_unlock: return err; } +static struct guehdr *gue_gro_remcsum(struct sk_buff *skb, unsigned int off, + struct guehdr *guehdr, void *data, + size_t hdrlen, u8 ipproto) +{ + __be16 *pd = data; + u16 start = ntohs(pd[0]); + u16 offset = ntohs(pd[1]); + u16 poffset = 0; + u16 plen; + void *ptr; + __wsum csum, delta; + __sum16 *psum; + + if (skb->remcsum_offload) + return guehdr; + + if (start > skb_gro_len(skb) - hdrlen || + offset > skb_gro_len(skb) - hdrlen - sizeof(u16) || + !NAPI_GRO_CB(skb)->csum_valid || skb->remcsum_offload) + return NULL; + + plen = hdrlen + offset + sizeof(u16); + + /* Pull checksum that will be written */ + if (skb_gro_header_hard(skb, off + plen)) { + guehdr = skb_gro_header_slow(skb, off + plen, off); + if (!guehdr) + return NULL; + } + + ptr = (void *)guehdr + hdrlen; + + if (ipproto == IPPROTO_IP && + (hdrlen + sizeof(struct iphdr) < plen)) { + struct iphdr *ip = (struct iphdr *)(ptr + hdrlen); + + /* If next header happens to be IP we can skip + * that for the checksum calculation since the + * IP header checksum is zero if correct. + */ + poffset = ip->ihl * 4; + } + + csum = csum_sub(NAPI_GRO_CB(skb)->csum, + csum_partial(ptr + poffset, start - poffset, 0)); + + /* Set derived checksum in packet */ + psum = (__sum16 *)(ptr + offset); + delta = csum_sub(csum_fold(csum), *psum); + *psum = csum_fold(csum); + + /* Adjust skb->csum since we changed the packet */ + skb->csum = csum_add(skb->csum, delta); + NAPI_GRO_CB(skb)->csum = csum_add(NAPI_GRO_CB(skb)->csum, delta); + + skb->remcsum_offload = 1; + + return guehdr; +} + static struct sk_buff **gue_gro_receive(struct sk_buff **head, struct sk_buff *skb) { @@ -154,38 +308,64 @@ static struct sk_buff **gue_gro_receive(struct sk_buff **head, const struct net_offload *ops; struct sk_buff **pp = NULL; struct sk_buff *p; - u8 proto; struct guehdr *guehdr; - unsigned int hlen, guehlen; - unsigned int off; + size_t len, optlen, hdrlen, off; + void *data; + u16 doffset = 0; int flush = 1; off = skb_gro_offset(skb); - hlen = off + sizeof(*guehdr); + len = off + sizeof(*guehdr); + guehdr = skb_gro_header_fast(skb, off); - if (skb_gro_header_hard(skb, hlen)) { - guehdr = skb_gro_header_slow(skb, hlen, off); + if (skb_gro_header_hard(skb, len)) { + guehdr = skb_gro_header_slow(skb, len, off); if (unlikely(!guehdr)) goto out; } - proto = guehdr->next_hdr; + optlen = guehdr->hlen << 2; + len += optlen; - rcu_read_lock(); - offloads = NAPI_GRO_CB(skb)->is_ipv6 ? inet6_offloads : inet_offloads; - ops = rcu_dereference(offloads[proto]); - if (WARN_ON(!ops || !ops->callbacks.gro_receive)) - goto out_unlock; + if (skb_gro_header_hard(skb, len)) { + guehdr = skb_gro_header_slow(skb, len, off); + if (unlikely(!guehdr)) + goto out; + } - guehlen = sizeof(*guehdr) + (guehdr->hlen << 2); + if (unlikely(guehdr->control) || guehdr->version != 0 || + validate_gue_flags(guehdr, optlen)) + goto out; - hlen = off + guehlen; - if (skb_gro_header_hard(skb, hlen)) { - guehdr = skb_gro_header_slow(skb, hlen, off); - if (unlikely(!guehdr)) - goto out_unlock; + hdrlen = sizeof(*guehdr) + optlen; + + /* Adjust NAPI_GRO_CB(skb)->csum to account for guehdr, + * this is needed if there is a remote checkcsum offload. + */ + skb_gro_postpull_rcsum(skb, guehdr, hdrlen); + + data = &guehdr[1]; + + if (guehdr->flags & GUE_FLAG_PRIV) { + __be32 flags = *(__be32 *)(data + doffset); + + doffset += GUE_LEN_PRIV; + + if (flags & GUE_PFLAG_REMCSUM) { + guehdr = gue_gro_remcsum(skb, off, guehdr, + data + doffset, hdrlen, + guehdr->proto_ctype); + if (!guehdr) + goto out; + + data = &guehdr[1]; + + doffset += GUE_PLEN_REMCSUM; + } } + skb_gro_pull(skb, hdrlen); + flush = 0; for (p = *head; p; p = p->next) { @@ -197,7 +377,7 @@ static struct sk_buff **gue_gro_receive(struct sk_buff **head, guehdr2 = (struct guehdr *)(p->data + off); /* Compare base GUE header to be equal (covers - * hlen, version, next_hdr, and flags. + * hlen, version, proto_ctype, and flags. */ if (guehdr->word != guehdr2->word) { NAPI_GRO_CB(p)->same_flow = 0; @@ -212,10 +392,11 @@ static struct sk_buff **gue_gro_receive(struct sk_buff **head, } } - skb_gro_pull(skb, guehlen); - - /* Adjusted NAPI_GRO_CB(skb)->csum after skb_gro_pull()*/ - skb_gro_postpull_rcsum(skb, guehdr, guehlen); + rcu_read_lock(); + offloads = NAPI_GRO_CB(skb)->is_ipv6 ? inet6_offloads : inet_offloads; + ops = rcu_dereference(offloads[guehdr->proto_ctype]); + if (WARN_ON(!ops || !ops->callbacks.gro_receive)) + goto out_unlock; pp = ops->callbacks.gro_receive(head, skb); @@ -236,7 +417,7 @@ static int gue_gro_complete(struct sk_buff *skb, int nhoff) u8 proto; int err = -ENOENT; - proto = guehdr->next_hdr; + proto = guehdr->proto_ctype; guehlen = sizeof(*guehdr) + (guehdr->hlen << 2); @@ -487,6 +668,125 @@ static const struct genl_ops fou_nl_ops[] = { }, }; +static void fou_build_udp(struct sk_buff *skb, struct ip_tunnel_encap *e, + struct flowi4 *fl4, u8 *protocol, __be16 sport) +{ + struct udphdr *uh; + + skb_push(skb, sizeof(struct udphdr)); + skb_reset_transport_header(skb); + + uh = udp_hdr(skb); + + uh->dest = e->dport; + uh->source = sport; + uh->len = htons(skb->len); + uh->check = 0; + udp_set_csum(!(e->flags & TUNNEL_ENCAP_FLAG_CSUM), skb, + fl4->saddr, fl4->daddr, skb->len); + + *protocol = IPPROTO_UDP; +} + +int fou_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e, + u8 *protocol, struct flowi4 *fl4) +{ + bool csum = !!(e->flags & TUNNEL_ENCAP_FLAG_CSUM); + int type = csum ? SKB_GSO_UDP_TUNNEL_CSUM : SKB_GSO_UDP_TUNNEL; + __be16 sport; + + skb = iptunnel_handle_offloads(skb, csum, type); + + if (IS_ERR(skb)) + return PTR_ERR(skb); + + sport = e->sport ? : udp_flow_src_port(dev_net(skb->dev), + skb, 0, 0, false); + fou_build_udp(skb, e, fl4, protocol, sport); + + return 0; +} +EXPORT_SYMBOL(fou_build_header); + +int gue_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e, + u8 *protocol, struct flowi4 *fl4) +{ + bool csum = !!(e->flags & TUNNEL_ENCAP_FLAG_CSUM); + int type = csum ? SKB_GSO_UDP_TUNNEL_CSUM : SKB_GSO_UDP_TUNNEL; + struct guehdr *guehdr; + size_t hdrlen, optlen = 0; + __be16 sport; + void *data; + bool need_priv = false; + + if ((e->flags & TUNNEL_ENCAP_FLAG_REMCSUM) && + skb->ip_summed == CHECKSUM_PARTIAL) { + csum = false; + optlen += GUE_PLEN_REMCSUM; + type |= SKB_GSO_TUNNEL_REMCSUM; + need_priv = true; + } + + optlen += need_priv ? GUE_LEN_PRIV : 0; + + skb = iptunnel_handle_offloads(skb, csum, type); + + if (IS_ERR(skb)) + return PTR_ERR(skb); + + /* Get source port (based on flow hash) before skb_push */ + sport = e->sport ? : udp_flow_src_port(dev_net(skb->dev), + skb, 0, 0, false); + + hdrlen = sizeof(struct guehdr) + optlen; + + skb_push(skb, hdrlen); + + guehdr = (struct guehdr *)skb->data; + + guehdr->control = 0; + guehdr->version = 0; + guehdr->hlen = optlen >> 2; + guehdr->flags = 0; + guehdr->proto_ctype = *protocol; + + data = &guehdr[1]; + + if (need_priv) { + __be32 *flags = data; + + guehdr->flags |= GUE_FLAG_PRIV; + *flags = 0; + data += GUE_LEN_PRIV; + + if (type & SKB_GSO_TUNNEL_REMCSUM) { + u16 csum_start = skb_checksum_start_offset(skb); + __be16 *pd = data; + + if (csum_start < hdrlen) + return -EINVAL; + + csum_start -= hdrlen; + pd[0] = htons(csum_start); + pd[1] = htons(csum_start + skb->csum_offset); + + if (!skb_is_gso(skb)) { + skb->ip_summed = CHECKSUM_NONE; + skb->encapsulation = 0; + } + + *flags |= GUE_PFLAG_REMCSUM; + data += GUE_PLEN_REMCSUM; + } + + } + + fou_build_udp(skb, e, fl4, protocol, sport); + + return 0; +} +EXPORT_SYMBOL(gue_build_header); + static int __init fou_init(void) { int ret; diff --git a/net/ipv4/geneve.c b/net/ipv4/geneve.c index dedb21e99914..31802afce34f 100644 --- a/net/ipv4/geneve.c +++ b/net/ipv4/geneve.c @@ -104,7 +104,7 @@ static void geneve_build_header(struct genevehdr *geneveh, memcpy(geneveh->options, options, options_len); } -/* Transmit a fully formated Geneve frame. +/* Transmit a fully formatted Geneve frame. * * When calling this function. The skb->data should point * to the geneve header which is fully formed. diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index fb70e3ecc3e4..666cf364df86 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -112,17 +112,17 @@ #ifdef CONFIG_IP_MULTICAST /* Parameter names and values are taken from igmp-v2-06 draft */ -#define IGMP_V1_Router_Present_Timeout (400*HZ) -#define IGMP_V2_Router_Present_Timeout (400*HZ) -#define IGMP_V2_Unsolicited_Report_Interval (10*HZ) -#define IGMP_V3_Unsolicited_Report_Interval (1*HZ) -#define IGMP_Query_Response_Interval (10*HZ) -#define IGMP_Query_Robustness_Variable 2 +#define IGMP_V1_ROUTER_PRESENT_TIMEOUT (400*HZ) +#define IGMP_V2_ROUTER_PRESENT_TIMEOUT (400*HZ) +#define IGMP_V2_UNSOLICITED_REPORT_INTERVAL (10*HZ) +#define IGMP_V3_UNSOLICITED_REPORT_INTERVAL (1*HZ) +#define IGMP_QUERY_RESPONSE_INTERVAL (10*HZ) +#define IGMP_QUERY_ROBUSTNESS_VARIABLE 2 -#define IGMP_Initial_Report_Delay (1) +#define IGMP_INITIAL_REPORT_DELAY (1) -/* IGMP_Initial_Report_Delay is not from IGMP specs! +/* IGMP_INITIAL_REPORT_DELAY is not from IGMP specs! * IGMP specs require to report membership immediately after * joining a group, but we delay the first report by a * small interval. It seems more natural and still does not @@ -318,9 +318,7 @@ igmp_scount(struct ip_mc_list *pmc, int type, int gdeleted, int sdeleted) return scount; } -#define igmp_skb_size(skb) (*(unsigned int *)((skb)->cb)) - -static struct sk_buff *igmpv3_newpack(struct net_device *dev, int size) +static struct sk_buff *igmpv3_newpack(struct net_device *dev, unsigned int mtu) { struct sk_buff *skb; struct rtable *rt; @@ -330,6 +328,7 @@ static struct sk_buff *igmpv3_newpack(struct net_device *dev, int size) struct flowi4 fl4; int hlen = LL_RESERVED_SPACE(dev); int tlen = dev->needed_tailroom; + unsigned int size = mtu; while (1) { skb = alloc_skb(size + hlen + tlen, @@ -341,7 +340,6 @@ static struct sk_buff *igmpv3_newpack(struct net_device *dev, int size) return NULL; } skb->priority = TC_PRIO_CONTROL; - igmp_skb_size(skb) = size; rt = ip_route_output_ports(net, &fl4, NULL, IGMPV3_ALL_MCR, 0, 0, 0, @@ -354,6 +352,8 @@ static struct sk_buff *igmpv3_newpack(struct net_device *dev, int size) skb_dst_set(skb, &rt->dst); skb->dev = dev; + skb->reserved_tailroom = skb_end_offset(skb) - + min(mtu, skb_end_offset(skb)); skb_reserve(skb, hlen); skb_reset_network_header(skb); @@ -423,8 +423,7 @@ static struct sk_buff *add_grhead(struct sk_buff *skb, struct ip_mc_list *pmc, return skb; } -#define AVAILABLE(skb) ((skb) ? ((skb)->dev ? igmp_skb_size(skb) - (skb)->len : \ - skb_tailroom(skb)) : 0) +#define AVAILABLE(skb) ((skb) ? skb_availroom(skb) : 0) static struct sk_buff *add_grec(struct sk_buff *skb, struct ip_mc_list *pmc, int type, int gdeleted, int sdeleted) @@ -879,15 +878,15 @@ static bool igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb, if (ih->code == 0) { /* Alas, old v1 router presents here. */ - max_delay = IGMP_Query_Response_Interval; + max_delay = IGMP_QUERY_RESPONSE_INTERVAL; in_dev->mr_v1_seen = jiffies + - IGMP_V1_Router_Present_Timeout; + IGMP_V1_ROUTER_PRESENT_TIMEOUT; group = 0; } else { /* v2 router present */ max_delay = ih->code*(HZ/IGMP_TIMER_SCALE); in_dev->mr_v2_seen = jiffies + - IGMP_V2_Router_Present_Timeout; + IGMP_V2_ROUTER_PRESENT_TIMEOUT; } /* cancel the interface change timer */ in_dev->mr_ifc_count = 0; @@ -899,7 +898,7 @@ static bool igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb, return true; /* ignore bogus packet; freed by caller */ } else if (IGMP_V1_SEEN(in_dev)) { /* This is a v3 query with v1 queriers present */ - max_delay = IGMP_Query_Response_Interval; + max_delay = IGMP_QUERY_RESPONSE_INTERVAL; group = 0; } else if (IGMP_V2_SEEN(in_dev)) { /* this is a v3 query with v2 queriers present; @@ -1218,7 +1217,7 @@ static void igmp_group_added(struct ip_mc_list *im) return; if (IGMP_V1_SEEN(in_dev) || IGMP_V2_SEEN(in_dev)) { spin_lock_bh(&im->lock); - igmp_start_timer(im, IGMP_Initial_Report_Delay); + igmp_start_timer(im, IGMP_INITIAL_REPORT_DELAY); spin_unlock_bh(&im->lock); return; } @@ -1541,7 +1540,7 @@ static struct in_device *ip_mc_find_dev(struct net *net, struct ip_mreqn *imr) int sysctl_igmp_max_memberships __read_mostly = IP_MAX_MEMBERSHIPS; int sysctl_igmp_max_msf __read_mostly = IP_MAX_MSF; #ifdef CONFIG_IP_MULTICAST -int sysctl_igmp_qrv __read_mostly = IGMP_Query_Robustness_Variable; +int sysctl_igmp_qrv __read_mostly = IGMP_QUERY_ROBUSTNESS_VARIABLE; #endif static int ip_mc_del1_src(struct ip_mc_list *pmc, int sfmode, @@ -2687,11 +2686,7 @@ static int igmp_mcf_seq_show(struct seq_file *seq, void *v) struct igmp_mcf_iter_state *state = igmp_mcf_seq_private(seq); if (v == SEQ_START_TOKEN) { - seq_printf(seq, - "%3s %6s " - "%10s %10s %6s %6s\n", "Idx", - "Device", "MCA", - "SRC", "INC", "EXC"); + seq_puts(seq, "Idx Device MCA SRC INC EXC\n"); } else { seq_printf(seq, "%3d %6.6s 0x%08x " diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index 2811cc18701a..4d964dadd655 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -80,7 +80,7 @@ struct ipq { struct inet_peer *peer; }; -static inline u8 ip4_frag_ecn(u8 tos) +static u8 ip4_frag_ecn(u8 tos) { return 1 << (tos & INET_ECN_MASK); } @@ -148,7 +148,7 @@ static void ip4_frag_init(struct inet_frag_queue *q, const void *a) inet_getpeer_v4(net->ipv4.peers, arg->iph->saddr, 1) : NULL; } -static __inline__ void ip4_frag_free(struct inet_frag_queue *q) +static void ip4_frag_free(struct inet_frag_queue *q) { struct ipq *qp; @@ -160,7 +160,7 @@ static __inline__ void ip4_frag_free(struct inet_frag_queue *q) /* Destruction primitives. */ -static __inline__ void ipq_put(struct ipq *ipq) +static void ipq_put(struct ipq *ipq) { inet_frag_put(&ipq->q, &ip4_frags); } @@ -236,7 +236,7 @@ out: /* Find the correct entry in the "incomplete datagrams" queue for * this IP datagram, and create new one, if nothing is found. */ -static inline struct ipq *ip_find(struct net *net, struct iphdr *iph, u32 user) +static struct ipq *ip_find(struct net *net, struct iphdr *iph, u32 user) { struct inet_frag_queue *q; struct ip4_create_arg arg; @@ -256,7 +256,7 @@ static inline struct ipq *ip_find(struct net *net, struct iphdr *iph, u32 user) } /* Is the fragment too far ahead to be part of ipq? */ -static inline int ip_frag_too_far(struct ipq *qp) +static int ip_frag_too_far(struct ipq *qp) { struct inet_peer *peer = qp->peer; unsigned int max = sysctl_ipfrag_max_dist; @@ -795,16 +795,16 @@ static void __init ip4_frags_ctl_register(void) register_net_sysctl(&init_net, "net/ipv4", ip4_frags_ctl_table); } #else -static inline int ip4_frags_ns_ctl_register(struct net *net) +static int ip4_frags_ns_ctl_register(struct net *net) { return 0; } -static inline void ip4_frags_ns_ctl_unregister(struct net *net) +static void ip4_frags_ns_ctl_unregister(struct net *net) { } -static inline void __init ip4_frags_ctl_register(void) +static void __init ip4_frags_ctl_register(void) { } #endif diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 12055fdbe716..ac8491245e5b 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -789,7 +789,7 @@ static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev) nla_put_u16(skb, IFLA_GRE_ENCAP_DPORT, t->encap.dport) || nla_put_u16(skb, IFLA_GRE_ENCAP_FLAGS, - t->encap.dport)) + t->encap.flags)) goto nla_put_failure; return 0; diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index bc6471d4abcd..4a929adf2ab7 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -662,12 +662,10 @@ slow_path: if (len < left) { len &= ~7; } - /* - * Allocate buffer. - */ - if ((skb2 = alloc_skb(len+hlen+ll_rs, GFP_ATOMIC)) == NULL) { - NETDEBUG(KERN_INFO "IP: frag: no memory for new fragment!\n"); + /* Allocate buffer */ + skb2 = alloc_skb(len + hlen + ll_rs, GFP_ATOMIC); + if (!skb2) { err = -ENOMEM; goto fail; } diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index c373a9ad4555..21894df66262 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -424,7 +424,7 @@ int ip_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len) msg->msg_flags |= MSG_TRUNC; copied = len; } - err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + err = skb_copy_datagram_msg(skb, 0, msg, copied); if (err) goto out_free_skb; diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index 0bb8e141eacc..c3587e1c8b82 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -56,7 +56,10 @@ #include <net/netns/generic.h> #include <net/rtnetlink.h> #include <net/udp.h> -#include <net/gue.h> + +#if IS_ENABLED(CONFIG_NET_FOU) +#include <net/fou.h> +#endif #if IS_ENABLED(CONFIG_IPV6) #include <net/ipv6.h> @@ -494,10 +497,12 @@ static int ip_encap_hlen(struct ip_tunnel_encap *e) switch (e->type) { case TUNNEL_ENCAP_NONE: return 0; +#if IS_ENABLED(CONFIG_NET_FOU) case TUNNEL_ENCAP_FOU: - return sizeof(struct udphdr); + return fou_encap_hlen(e); case TUNNEL_ENCAP_GUE: - return sizeof(struct udphdr) + sizeof(struct guehdr); + return gue_encap_hlen(e); +#endif default: return -EINVAL; } @@ -526,60 +531,18 @@ int ip_tunnel_encap_setup(struct ip_tunnel *t, } EXPORT_SYMBOL_GPL(ip_tunnel_encap_setup); -static int fou_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e, - size_t hdr_len, u8 *protocol, struct flowi4 *fl4) -{ - struct udphdr *uh; - __be16 sport; - bool csum = !!(e->flags & TUNNEL_ENCAP_FLAG_CSUM); - int type = csum ? SKB_GSO_UDP_TUNNEL_CSUM : SKB_GSO_UDP_TUNNEL; - - skb = iptunnel_handle_offloads(skb, csum, type); - - if (IS_ERR(skb)) - return PTR_ERR(skb); - - /* Get length and hash before making space in skb */ - - sport = e->sport ? : udp_flow_src_port(dev_net(skb->dev), - skb, 0, 0, false); - - skb_push(skb, hdr_len); - - skb_reset_transport_header(skb); - uh = udp_hdr(skb); - - if (e->type == TUNNEL_ENCAP_GUE) { - struct guehdr *guehdr = (struct guehdr *)&uh[1]; - - guehdr->version = 0; - guehdr->hlen = 0; - guehdr->flags = 0; - guehdr->next_hdr = *protocol; - } - - uh->dest = e->dport; - uh->source = sport; - uh->len = htons(skb->len); - uh->check = 0; - udp_set_csum(!(e->flags & TUNNEL_ENCAP_FLAG_CSUM), skb, - fl4->saddr, fl4->daddr, skb->len); - - *protocol = IPPROTO_UDP; - - return 0; -} - int ip_tunnel_encap(struct sk_buff *skb, struct ip_tunnel *t, u8 *protocol, struct flowi4 *fl4) { switch (t->encap.type) { case TUNNEL_ENCAP_NONE: return 0; +#if IS_ENABLED(CONFIG_NET_FOU) case TUNNEL_ENCAP_FOU: + return fou_build_header(skb, &t->encap, protocol, fl4); case TUNNEL_ENCAP_GUE: - return fou_build_header(skb, &t->encap, t->encap_hlen, - protocol, fl4); + return gue_build_header(skb, &t->encap, protocol, fl4); +#endif default: return -EINVAL; } diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c index 648fa1490ea7..7fa18bc7e47f 100644 --- a/net/ipv4/ipconfig.c +++ b/net/ipv4/ipconfig.c @@ -115,7 +115,7 @@ */ int ic_set_manually __initdata = 0; /* IPconfig parameters set manually */ -static int ic_enable __initdata = 0; /* IP config enabled? */ +static int ic_enable __initdata; /* IP config enabled? */ /* Protocol choice */ int ic_proto_enabled __initdata = 0 @@ -130,7 +130,7 @@ int ic_proto_enabled __initdata = 0 #endif ; -static int ic_host_name_set __initdata = 0; /* Host name set by us? */ +static int ic_host_name_set __initdata; /* Host name set by us? */ __be32 ic_myaddr = NONE; /* My IP address */ static __be32 ic_netmask = NONE; /* Netmask for local subnet */ @@ -160,17 +160,17 @@ static u8 ic_domain[64]; /* DNS (not NIS) domain name */ static char user_dev_name[IFNAMSIZ] __initdata = { 0, }; /* Protocols supported by available interfaces */ -static int ic_proto_have_if __initdata = 0; +static int ic_proto_have_if __initdata; /* MTU for boot device */ -static int ic_dev_mtu __initdata = 0; +static int ic_dev_mtu __initdata; #ifdef IPCONFIG_DYNAMIC static DEFINE_SPINLOCK(ic_recv_lock); -static volatile int ic_got_reply __initdata = 0; /* Proto(s) that replied */ +static volatile int ic_got_reply __initdata; /* Proto(s) that replied */ #endif #ifdef IPCONFIG_DHCP -static int ic_dhcp_msgtype __initdata = 0; /* DHCP msg type received */ +static int ic_dhcp_msgtype __initdata; /* DHCP msg type received */ #endif @@ -186,8 +186,8 @@ struct ic_device { __be32 xid; }; -static struct ic_device *ic_first_dev __initdata = NULL;/* List of open device */ -static struct net_device *ic_dev __initdata = NULL; /* Selected device */ +static struct ic_device *ic_first_dev __initdata; /* List of open device */ +static struct net_device *ic_dev __initdata; /* Selected device */ static bool __init ic_is_init_dev(struct net_device *dev) { @@ -498,7 +498,7 @@ ic_rarp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt struct arphdr *rarp; unsigned char *rarp_ptr; __be32 sip, tip; - unsigned char *sha, *tha; /* s for "source", t for "target" */ + unsigned char *tha; /* t for "target" */ struct ic_device *d; if (!net_eq(dev_net(dev), &init_net)) @@ -549,7 +549,6 @@ ic_rarp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt goto drop_unlock; /* should never happen */ /* Extract variable-width fields */ - sha = rarp_ptr; rarp_ptr += dev->addr_len; memcpy(&sip, rarp_ptr, 4); rarp_ptr += 4; diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index 37096d64730e..40403114f00a 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -465,7 +465,7 @@ static int ipip_fill_info(struct sk_buff *skb, const struct net_device *dev) nla_put_u16(skb, IFLA_IPTUN_ENCAP_DPORT, tunnel->encap.dport) || nla_put_u16(skb, IFLA_IPTUN_ENCAP_FLAGS, - tunnel->encap.dport)) + tunnel->encap.flags)) goto nla_put_failure; return 0; diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index 57f7c9804139..736236c3e554 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -875,7 +875,7 @@ int ping_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, } /* Don't bother checking the checksum */ - err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + err = skb_copy_datagram_msg(skb, 0, msg, copied); if (err) goto done; diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 8e3eb39f84e7..f0d4eb8b99b9 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -296,12 +296,12 @@ static void icmpmsg_put_line(struct seq_file *seq, unsigned long *vals, int j; if (count) { - seq_printf(seq, "\nIcmpMsg:"); + seq_puts(seq, "\nIcmpMsg:"); for (j = 0; j < count; ++j) seq_printf(seq, " %sType%u", type[j] & 0x100 ? "Out" : "In", type[j] & 0xff); - seq_printf(seq, "\nIcmpMsg:"); + seq_puts(seq, "\nIcmpMsg:"); for (j = 0; j < count; ++j) seq_printf(seq, " %lu", vals[j]); } @@ -342,7 +342,7 @@ static void icmp_put(struct seq_file *seq) seq_puts(seq, "\nIcmp: InMsgs InErrors InCsumErrors"); for (i = 0; icmpmibmap[i].name != NULL; i++) seq_printf(seq, " In%s", icmpmibmap[i].name); - seq_printf(seq, " OutMsgs OutErrors"); + seq_puts(seq, " OutMsgs OutErrors"); for (i = 0; icmpmibmap[i].name != NULL; i++) seq_printf(seq, " Out%s", icmpmibmap[i].name); seq_printf(seq, "\nIcmp: %lu %lu %lu", diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 739db3100c23..ee8fa4bf3b73 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -718,7 +718,7 @@ static int raw_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, copied = len; } - err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + err = skb_copy_datagram_msg(skb, 0, msg, copied); if (err) goto done; diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index 32b98d0207b4..45fe60c5238e 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -19,10 +19,6 @@ #include <net/tcp.h> #include <net/route.h> -/* Timestamps: lowest bits store TCP options */ -#define TSBITS 6 -#define TSMASK (((__u32)1 << TSBITS) - 1) - extern int sysctl_tcp_syncookies; static u32 syncookie_secret[2][16-4+SHA_DIGEST_WORDS] __read_mostly; @@ -30,6 +26,30 @@ static u32 syncookie_secret[2][16-4+SHA_DIGEST_WORDS] __read_mostly; #define COOKIEBITS 24 /* Upper bits store count */ #define COOKIEMASK (((__u32)1 << COOKIEBITS) - 1) +/* TCP Timestamp: 6 lowest bits of timestamp sent in the cookie SYN-ACK + * stores TCP options: + * + * MSB LSB + * | 31 ... 6 | 5 | 4 | 3 2 1 0 | + * | Timestamp | ECN | SACK | WScale | + * + * When we receive a valid cookie-ACK, we look at the echoed tsval (if + * any) to figure out which TCP options we should use for the rebuilt + * connection. + * + * A WScale setting of '0xf' (which is an invalid scaling value) + * means that original syn did not include the TCP window scaling option. + */ +#define TS_OPT_WSCALE_MASK 0xf +#define TS_OPT_SACK BIT(4) +#define TS_OPT_ECN BIT(5) +/* There is no TS_OPT_TIMESTAMP: + * if ACK contains timestamp option, we already know it was + * requested/supported by the syn/synack exchange. + */ +#define TSBITS 6 +#define TSMASK (((__u32)1 << TSBITS) - 1) + static DEFINE_PER_CPU(__u32 [16 + 5 + SHA_WORKSPACE_WORDS], ipv4_cookie_scratch); @@ -67,9 +87,11 @@ __u32 cookie_init_timestamp(struct request_sock *req) ireq = inet_rsk(req); - options = ireq->wscale_ok ? ireq->snd_wscale : 0xf; - options |= ireq->sack_ok << 4; - options |= ireq->ecn_ok << 5; + options = ireq->wscale_ok ? ireq->snd_wscale : TS_OPT_WSCALE_MASK; + if (ireq->sack_ok) + options |= TS_OPT_SACK; + if (ireq->ecn_ok) + options |= TS_OPT_ECN; ts = ts_now & ~TSMASK; ts |= options; @@ -219,16 +241,13 @@ static inline struct sock *get_cookie_sock(struct sock *sk, struct sk_buff *skb, * additional tcp options in the timestamp. * This extracts these options from the timestamp echo. * - * The lowest 4 bits store snd_wscale. - * next 2 bits indicate SACK and ECN support. - * - * return false if we decode an option that should not be. + * return false if we decode a tcp option that is disabled + * on the host. */ -bool cookie_check_timestamp(struct tcp_options_received *tcp_opt, - struct net *net, bool *ecn_ok) +bool cookie_timestamp_decode(struct tcp_options_received *tcp_opt) { /* echoed timestamp, lowest bits contain options */ - u32 options = tcp_opt->rcv_tsecr & TSMASK; + u32 options = tcp_opt->rcv_tsecr; if (!tcp_opt->saw_tstamp) { tcp_clear_options(tcp_opt); @@ -238,22 +257,35 @@ bool cookie_check_timestamp(struct tcp_options_received *tcp_opt, if (!sysctl_tcp_timestamps) return false; - tcp_opt->sack_ok = (options & (1 << 4)) ? TCP_SACK_SEEN : 0; - *ecn_ok = (options >> 5) & 1; - if (*ecn_ok && !net->ipv4.sysctl_tcp_ecn) - return false; + tcp_opt->sack_ok = (options & TS_OPT_SACK) ? TCP_SACK_SEEN : 0; if (tcp_opt->sack_ok && !sysctl_tcp_sack) return false; - if ((options & 0xf) == 0xf) + if ((options & TS_OPT_WSCALE_MASK) == TS_OPT_WSCALE_MASK) return true; /* no window scaling */ tcp_opt->wscale_ok = 1; - tcp_opt->snd_wscale = options & 0xf; + tcp_opt->snd_wscale = options & TS_OPT_WSCALE_MASK; + return sysctl_tcp_window_scaling != 0; } -EXPORT_SYMBOL(cookie_check_timestamp); +EXPORT_SYMBOL(cookie_timestamp_decode); + +bool cookie_ecn_ok(const struct tcp_options_received *tcp_opt, + const struct net *net, const struct dst_entry *dst) +{ + bool ecn_ok = tcp_opt->rcv_tsecr & TS_OPT_ECN; + + if (!ecn_ok) + return false; + + if (net->ipv4.sysctl_tcp_ecn) + return true; + + return dst_feature(dst, RTAX_FEATURE_ECN); +} +EXPORT_SYMBOL(cookie_ecn_ok); struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) { @@ -269,14 +301,16 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) int mss; struct rtable *rt; __u8 rcv_wscale; - bool ecn_ok = false; struct flowi4 fl4; if (!sysctl_tcp_syncookies || !th->ack || th->rst) goto out; - if (tcp_synq_no_recent_overflow(sk) || - (mss = __cookie_v4_check(ip_hdr(skb), th, cookie)) == 0) { + if (tcp_synq_no_recent_overflow(sk)) + goto out; + + mss = __cookie_v4_check(ip_hdr(skb), th, cookie); + if (mss == 0) { NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SYNCOOKIESFAILED); goto out; } @@ -287,7 +321,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) memset(&tcp_opt, 0, sizeof(tcp_opt)); tcp_parse_options(skb, &tcp_opt, 0, NULL); - if (!cookie_check_timestamp(&tcp_opt, sock_net(sk), &ecn_ok)) + if (!cookie_timestamp_decode(&tcp_opt)) goto out; ret = NULL; @@ -305,7 +339,6 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) ireq->ir_loc_addr = ip_hdr(skb)->daddr; ireq->ir_rmt_addr = ip_hdr(skb)->saddr; ireq->ir_mark = inet_request_mark(sk, skb); - ireq->ecn_ok = ecn_ok; ireq->snd_wscale = tcp_opt.snd_wscale; ireq->sack_ok = tcp_opt.sack_ok; ireq->wscale_ok = tcp_opt.wscale_ok; @@ -354,6 +387,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) dst_metric(&rt->dst, RTAX_INITRWND)); ireq->rcv_wscale = rcv_wscale; + ireq->ecn_ok = cookie_ecn_ok(&tcp_opt, sock_net(sk), &rt->dst); ret = get_cookie_sock(sk, skb, req, &rt->dst); /* ip_queue_xmit() depends on our flow being setup diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index b3c53c8b331e..e0ee384a448f 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -496,6 +496,13 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_dointvec }, { + .procname = "tcp_max_reordering", + .data = &sysctl_tcp_max_reordering, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { .procname = "tcp_dsack", .data = &sysctl_tcp_dsack, .maxlen = sizeof(int), diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 39ec0c379545..c239f4740d10 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1377,7 +1377,7 @@ static int tcp_peek_sndq(struct sock *sk, struct msghdr *msg, int len) /* XXX -- need to support SO_PEEK_OFF */ skb_queue_walk(&sk->sk_write_queue, skb) { - err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, skb->len); + err = skb_copy_datagram_msg(skb, 0, msg, skb->len); if (err) break; @@ -1833,8 +1833,7 @@ do_prequeue: } if (!(flags & MSG_TRUNC)) { - err = skb_copy_datagram_iovec(skb, offset, - msg->msg_iov, used); + err = skb_copy_datagram_msg(skb, offset, msg, used); if (err) { /* Exception. Bailout! */ if (!copied) diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index b1c5970d47a1..27ead0dd16bc 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -1,5 +1,5 @@ /* - * Plugable TCP congestion control support and newReno + * Pluggable TCP congestion control support and newReno * congestion control. * Based on ideas from I/O scheduler support and Web100. * diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 88fa2d160685..5f979c7f5135 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -81,6 +81,7 @@ int sysctl_tcp_window_scaling __read_mostly = 1; int sysctl_tcp_sack __read_mostly = 1; int sysctl_tcp_fack __read_mostly = 1; int sysctl_tcp_reordering __read_mostly = TCP_FASTRETRANS_THRESH; +int sysctl_tcp_max_reordering __read_mostly = 300; EXPORT_SYMBOL(sysctl_tcp_reordering); int sysctl_tcp_dsack __read_mostly = 1; int sysctl_tcp_app_win __read_mostly = 31; @@ -833,7 +834,7 @@ static void tcp_update_reordering(struct sock *sk, const int metric, if (metric > tp->reordering) { int mib_idx; - tp->reordering = min(TCP_MAX_REORDERING, metric); + tp->reordering = min(sysctl_tcp_max_reordering, metric); /* This exciting event is worth to be remembered. 8) */ if (ts) @@ -5030,7 +5031,7 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb, /* step 3: check security and precedence [ignored] */ /* step 4: Check for a SYN - * RFC 5691 4.2 : Send a challenge ack + * RFC 5961 4.2 : Send a challenge ack */ if (th->syn) { syn_challenge: @@ -5867,7 +5868,7 @@ static inline void pr_drop_req(struct request_sock *req, __u16 port, int family) * If we receive a SYN packet with these bits set, it means a * network is playing bad games with TOS bits. In order to * avoid possible false congestion notifications, we disable - * TCP ECN negociation. + * TCP ECN negotiation. * * Exception: tcp_ca wants ECN. This is required for DCTCP * congestion control; it requires setting ECT on all packets, @@ -5877,20 +5878,22 @@ static inline void pr_drop_req(struct request_sock *req, __u16 port, int family) */ static void tcp_ecn_create_request(struct request_sock *req, const struct sk_buff *skb, - const struct sock *listen_sk) + const struct sock *listen_sk, + const struct dst_entry *dst) { const struct tcphdr *th = tcp_hdr(skb); const struct net *net = sock_net(listen_sk); bool th_ecn = th->ece && th->cwr; - bool ect, need_ecn; + bool ect, need_ecn, ecn_ok; if (!th_ecn) return; ect = !INET_ECN_is_not_ect(TCP_SKB_CB(skb)->ip_dsfield); need_ecn = tcp_ca_needs_ecn(listen_sk); + ecn_ok = net->ipv4.sysctl_tcp_ecn || dst_feature(dst, RTAX_FEATURE_ECN); - if (!ect && !need_ecn && net->ipv4.sysctl_tcp_ecn) + if (!ect && !need_ecn && ecn_ok) inet_rsk(req)->ecn_ok = 1; else if (ect && need_ecn) inet_rsk(req)->ecn_ok = 1; @@ -5955,13 +5958,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, if (security_inet_conn_request(sk, skb, req)) goto drop_and_free; - if (!want_cookie || tmp_opt.tstamp_ok) - tcp_ecn_create_request(req, skb, sk); - - if (want_cookie) { - isn = cookie_init_sequence(af_ops, sk, skb, &req->mss); - req->cookie_ts = tmp_opt.tstamp_ok; - } else if (!isn) { + if (!want_cookie && !isn) { /* VJ's idea. We save last timestamp seen * from the destination in peer table, when entering * state TIME-WAIT, and check against it before @@ -6009,6 +6006,15 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, goto drop_and_free; } + tcp_ecn_create_request(req, skb, sk, dst); + + if (want_cookie) { + isn = cookie_init_sequence(af_ops, sk, skb, &req->mss); + req->cookie_ts = tmp_opt.tstamp_ok; + if (!tmp_opt.tstamp_ok) + inet_rsk(req)->ecn_ok = 0; + } + tcp_rsk(req)->snt_isn = isn; tcp_openreq_init_rwin(req, sk, dst); fastopen = !want_cookie && diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c index 5b90f2f447a5..9d7930ba8e0f 100644 --- a/net/ipv4/tcp_offload.c +++ b/net/ipv4/tcp_offload.c @@ -94,9 +94,9 @@ struct sk_buff *tcp_gso_segment(struct sk_buff *skb, SKB_GSO_GRE_CSUM | SKB_GSO_IPIP | SKB_GSO_SIT | - SKB_GSO_MPLS | SKB_GSO_UDP_TUNNEL | SKB_GSO_UDP_TUNNEL_CSUM | + SKB_GSO_TUNNEL_REMCSUM | 0) || !(type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))) goto out; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index a3d453b94747..0b88158dd4a7 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -333,10 +333,19 @@ static void tcp_ecn_send_synack(struct sock *sk, struct sk_buff *skb) static void tcp_ecn_send_syn(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); + bool use_ecn = sock_net(sk)->ipv4.sysctl_tcp_ecn == 1 || + tcp_ca_needs_ecn(sk); + + if (!use_ecn) { + const struct dst_entry *dst = __sk_dst_get(sk); + + if (dst && dst_feature(dst, RTAX_FEATURE_ECN)) + use_ecn = true; + } tp->ecn_flags = 0; - if (sock_net(sk)->ipv4.sysctl_tcp_ecn == 1 || - tcp_ca_needs_ecn(sk)) { + + if (use_ecn) { TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ECE | TCPHDR_CWR; tp->ecn_flags = TCP_ECN_OK; if (tcp_ca_needs_ecn(sk)) diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index cd0db5471bb5..df19027f44f3 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1281,8 +1281,8 @@ try_again: } if (skb_csum_unnecessary(skb)) - err = skb_copy_datagram_iovec(skb, sizeof(struct udphdr), - msg->msg_iov, copied); + err = skb_copy_datagram_msg(skb, sizeof(struct udphdr), + msg, copied); else { err = skb_copy_and_csum_datagram_iovec(skb, sizeof(struct udphdr), @@ -1777,14 +1777,13 @@ int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, if (ret > 0) return -ret; return 0; - } else { - if (rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST)) - return __udp4_lib_mcast_deliver(net, skb, uh, - saddr, daddr, udptable); - - sk = __udp4_lib_lookup_skb(skb, uh->source, uh->dest, udptable); } + if (rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST)) + return __udp4_lib_mcast_deliver(net, skb, uh, + saddr, daddr, udptable); + + sk = __udp4_lib_lookup_skb(skb, uh->source, uh->dest, udptable); if (sk != NULL) { int ret; diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index 6480cea7aa53..d3e537ef6b7f 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -29,7 +29,7 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb, netdev_features_t features, struct sk_buff *(*gso_inner_segment)(struct sk_buff *skb, netdev_features_t features), - __be16 new_protocol) + __be16 new_protocol, bool is_ipv6) { struct sk_buff *segs = ERR_PTR(-EINVAL); u16 mac_offset = skb->mac_header; @@ -39,7 +39,10 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb, netdev_features_t enc_features; int udp_offset, outer_hlen; unsigned int oldlen; - bool need_csum; + bool need_csum = !!(skb_shinfo(skb)->gso_type & + SKB_GSO_UDP_TUNNEL_CSUM); + bool remcsum = !!(skb_shinfo(skb)->gso_type & SKB_GSO_TUNNEL_REMCSUM); + bool offload_csum = false, dont_encap = (need_csum || remcsum); oldlen = (u16)~skb->len; @@ -52,10 +55,13 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb, skb_set_network_header(skb, skb_inner_network_offset(skb)); skb->mac_len = skb_inner_network_offset(skb); skb->protocol = new_protocol; + skb->encap_hdr_csum = need_csum; + skb->remcsum_offload = remcsum; - need_csum = !!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP_TUNNEL_CSUM); - if (need_csum) - skb->encap_hdr_csum = 1; + /* Try to offload checksum if possible */ + offload_csum = !!(need_csum && + (skb->dev->features & + (is_ipv6 ? NETIF_F_V6_CSUM : NETIF_F_V4_CSUM))); /* segment inner packet. */ enc_features = skb->dev->hw_enc_features & features; @@ -72,11 +78,21 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb, do { struct udphdr *uh; int len; - - skb_reset_inner_headers(skb); - skb->encapsulation = 1; + __be32 delta; + + if (dont_encap) { + skb->encapsulation = 0; + skb->ip_summed = CHECKSUM_NONE; + } else { + /* Only set up inner headers if we might be offloading + * inner checksum. + */ + skb_reset_inner_headers(skb); + skb->encapsulation = 1; + } skb->mac_len = mac_len; + skb->protocol = protocol; skb_push(skb, outer_hlen); skb_reset_mac_header(skb); @@ -86,19 +102,36 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb, uh = udp_hdr(skb); uh->len = htons(len); - if (need_csum) { - __be32 delta = htonl(oldlen + len); + if (!need_csum) + continue; - uh->check = ~csum_fold((__force __wsum) - ((__force u32)uh->check + - (__force u32)delta)); + delta = htonl(oldlen + len); + + uh->check = ~csum_fold((__force __wsum) + ((__force u32)uh->check + + (__force u32)delta)); + if (offload_csum) { + skb->ip_summed = CHECKSUM_PARTIAL; + skb->csum_start = skb_transport_header(skb) - skb->head; + skb->csum_offset = offsetof(struct udphdr, check); + } else if (remcsum) { + /* Need to calculate checksum from scratch, + * inner checksums are never when doing + * remote_checksum_offload. + */ + + skb->csum = skb_checksum(skb, udp_offset, + skb->len - udp_offset, + 0); + uh->check = csum_fold(skb->csum); + if (uh->check == 0) + uh->check = CSUM_MANGLED_0; + } else { uh->check = gso_make_checksum(skb, ~uh->check); if (uh->check == 0) uh->check = CSUM_MANGLED_0; } - - skb->protocol = protocol; } while ((skb = skb->next)); out: return segs; @@ -134,7 +167,7 @@ struct sk_buff *skb_udp_tunnel_segment(struct sk_buff *skb, } segs = __skb_udp_tunnel_segment(skb, features, gso_inner_segment, - protocol); + protocol, is_ipv6); out_unlock: rcu_read_unlock(); @@ -172,9 +205,9 @@ static struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb, if (unlikely(type & ~(SKB_GSO_UDP | SKB_GSO_DODGY | SKB_GSO_UDP_TUNNEL | SKB_GSO_UDP_TUNNEL_CSUM | + SKB_GSO_TUNNEL_REMCSUM | SKB_GSO_IPIP | - SKB_GSO_GRE | SKB_GSO_GRE_CSUM | - SKB_GSO_MPLS) || + SKB_GSO_GRE | SKB_GSO_GRE_CSUM) || !(type & (SKB_GSO_UDP)))) goto out; diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 0169ccf5aa4f..06e897832a7a 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -1170,6 +1170,9 @@ enum { IPV6_SADDR_RULE_PRIVACY, IPV6_SADDR_RULE_ORCHID, IPV6_SADDR_RULE_PREFIX, +#ifdef CONFIG_IPV6_OPTIMISTIC_DAD + IPV6_SADDR_RULE_NOT_OPTIMISTIC, +#endif IPV6_SADDR_RULE_MAX }; @@ -1197,6 +1200,15 @@ static inline int ipv6_saddr_preferred(int type) return 0; } +static inline bool ipv6_use_optimistic_addr(struct inet6_dev *idev) +{ +#ifdef CONFIG_IPV6_OPTIMISTIC_DAD + return idev && idev->cnf.optimistic_dad && idev->cnf.use_optimistic; +#else + return false; +#endif +} + static int ipv6_get_saddr_eval(struct net *net, struct ipv6_saddr_score *score, struct ipv6_saddr_dst *dst, @@ -1257,10 +1269,16 @@ static int ipv6_get_saddr_eval(struct net *net, score->scopedist = ret; break; case IPV6_SADDR_RULE_PREFERRED: + { /* Rule 3: Avoid deprecated and optimistic addresses */ + u8 avoid = IFA_F_DEPRECATED; + + if (!ipv6_use_optimistic_addr(score->ifa->idev)) + avoid |= IFA_F_OPTIMISTIC; ret = ipv6_saddr_preferred(score->addr_type) || - !(score->ifa->flags & (IFA_F_DEPRECATED|IFA_F_OPTIMISTIC)); + !(score->ifa->flags & avoid); break; + } #ifdef CONFIG_IPV6_MIP6 case IPV6_SADDR_RULE_HOA: { @@ -1306,6 +1324,14 @@ static int ipv6_get_saddr_eval(struct net *net, ret = score->ifa->prefix_len; score->matchlen = ret; break; +#ifdef CONFIG_IPV6_OPTIMISTIC_DAD + case IPV6_SADDR_RULE_NOT_OPTIMISTIC: + /* Optimistic addresses still have lower precedence than other + * preferred addresses. + */ + ret = !(score->ifa->flags & IFA_F_OPTIMISTIC); + break; +#endif default: ret = 0; } @@ -2315,8 +2341,8 @@ ok: else stored_lft = 0; if (!update_lft && !create && stored_lft) { - const u32 minimum_lft = min( - stored_lft, (u32)MIN_VALID_LIFETIME); + const u32 minimum_lft = min_t(u32, + stored_lft, MIN_VALID_LIFETIME); valid_lft = max(valid_lft, minimum_lft); /* RFC4862 Section 5.5.3e: @@ -3222,8 +3248,15 @@ static void addrconf_dad_begin(struct inet6_ifaddr *ifp) * Optimistic nodes can start receiving * Frames right away */ - if (ifp->flags & IFA_F_OPTIMISTIC) + if (ifp->flags & IFA_F_OPTIMISTIC) { ip6_ins_rt(ifp->rt); + if (ipv6_use_optimistic_addr(idev)) { + /* Because optimistic nodes can use this address, + * notify listeners. If DAD fails, RTM_DELADDR is sent. + */ + ipv6_ifa_notify(RTM_NEWADDR, ifp); + } + } addrconf_dad_kick(ifp); out: @@ -4330,6 +4363,7 @@ static inline void ipv6_store_devconf(struct ipv6_devconf *cnf, array[DEVCONF_ACCEPT_SOURCE_ROUTE] = cnf->accept_source_route; #ifdef CONFIG_IPV6_OPTIMISTIC_DAD array[DEVCONF_OPTIMISTIC_DAD] = cnf->optimistic_dad; + array[DEVCONF_USE_OPTIMISTIC] = cnf->use_optimistic; #endif #ifdef CONFIG_IPV6_MROUTE array[DEVCONF_MC_FORWARDING] = cnf->mc_forwarding; @@ -5156,6 +5190,14 @@ static struct addrconf_sysctl_table .proc_handler = proc_dointvec, }, + { + .procname = "use_optimistic", + .data = &ipv6_devconf.use_optimistic, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + + }, #endif #ifdef CONFIG_IPV6_MROUTE { diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c index 2cdc38338be3..5c6996e44b14 100644 --- a/net/ipv6/datagram.c +++ b/net/ipv6/datagram.c @@ -351,7 +351,7 @@ int ipv6_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len) msg->msg_flags |= MSG_TRUNC; copied = len; } - err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + err = skb_copy_datagram_msg(skb, 0, msg, copied); if (err) goto out_free_skb; @@ -445,7 +445,7 @@ int ipv6_recv_rxpmtu(struct sock *sk, struct msghdr *msg, int len, msg->msg_flags |= MSG_TRUNC; copied = len; } - err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + err = skb_copy_datagram_msg(skb, 0, msg, copied); if (err) goto out_free_skb; diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c index 83fc3a385a26..d21d7b22eebc 100644 --- a/net/ipv6/esp6.c +++ b/net/ipv6/esp6.c @@ -544,12 +544,12 @@ static int esp_init_authenc(struct xfrm_state *x) BUG_ON(!aalg_desc); err = -EINVAL; - if (aalg_desc->uinfo.auth.icv_fullbits/8 != + if (aalg_desc->uinfo.auth.icv_fullbits / 8 != crypto_aead_authsize(aead)) { - NETDEBUG(KERN_INFO "ESP: %s digestsize %u != %hu\n", - x->aalg->alg_name, - crypto_aead_authsize(aead), - aalg_desc->uinfo.auth.icv_fullbits/8); + pr_info("ESP: %s digestsize %u != %hu\n", + x->aalg->alg_name, + crypto_aead_authsize(aead), + aalg_desc->uinfo.auth.icv_fullbits / 8); goto free_key; } diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c index bfde361b6134..601d896f22d0 100644 --- a/net/ipv6/exthdrs.c +++ b/net/ipv6/exthdrs.c @@ -47,7 +47,7 @@ #include <net/xfrm.h> #endif -#include <asm/uaccess.h> +#include <linux/uaccess.h> /* * Parsing tlv encoded headers. diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index 97ae70077a4f..62c1037d9e83 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -1009,4 +1009,3 @@ struct ctl_table * __net_init ipv6_icmp_sysctl_init(struct net *net) return table; } #endif - diff --git a/net/ipv6/ip6_flowlabel.c b/net/ipv6/ip6_flowlabel.c index 3dd7d4ebd7cd..7221021b2d97 100644 --- a/net/ipv6/ip6_flowlabel.c +++ b/net/ipv6/ip6_flowlabel.c @@ -769,10 +769,9 @@ static void ip6fl_seq_stop(struct seq_file *seq, void *v) static int ip6fl_seq_show(struct seq_file *seq, void *v) { struct ip6fl_iter_state *state = ip6fl_seq_private(seq); - if (v == SEQ_START_TOKEN) - seq_printf(seq, "%-5s %-1s %-6s %-6s %-6s %-8s %-32s %s\n", - "Label", "S", "Owner", "Users", "Linger", "Expires", "Dst", "Opt"); - else { + if (v == SEQ_START_TOKEN) { + seq_puts(seq, "Label S Owner Users Linger Expires Dst Opt\n"); + } else { struct ip6_flowlabel *fl = v; seq_printf(seq, "%05X %-1d %-6d %-6d %-6ld %-8ld %pi6 %-4d\n", diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index 4564e1fca3eb..f6e2533c1145 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -902,7 +902,7 @@ static netdev_tx_t ip6gre_tunnel_xmit(struct sk_buff *skb, struct net_device_stats *stats = &t->dev->stats; int ret; - if (!ip6_tnl_xmit_ctl(t)) + if (!ip6_tnl_xmit_ctl(t, &t->parms.laddr, &t->parms.raddr)) goto tx_err; switch (skb->protocol) { diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c index a071563a7e6e..fd76ce938c32 100644 --- a/net/ipv6/ip6_offload.c +++ b/net/ipv6/ip6_offload.c @@ -78,7 +78,7 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb, SKB_GSO_SIT | SKB_GSO_UDP_TUNNEL | SKB_GSO_UDP_TUNNEL_CSUM | - SKB_GSO_MPLS | + SKB_GSO_TUNNEL_REMCSUM | SKB_GSO_TCPV6 | 0))) goto out; diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 8e950c250ada..916d2a166a9b 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -747,13 +747,11 @@ slow_path: if (len < left) { len &= ~7; } - /* - * Allocate buffer. - */ - if ((frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) + - hroom + troom, GFP_ATOMIC)) == NULL) { - NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n"); + /* Allocate buffer */ + frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) + + hroom + troom, GFP_ATOMIC); + if (!frag) { IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS); err = -ENOMEM; diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 9cb94cfa0ae7..e2b6cfba873c 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -183,6 +183,7 @@ ip6_tnl_lookup(struct net *net, const struct in6_addr *remote, const struct in6_ unsigned int hash = HASH(remote, local); struct ip6_tnl *t; struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); + struct in6_addr any; for_each_ip6_tunnel_rcu(ip6n->tnls_r_l[hash]) { if (ipv6_addr_equal(local, &t->parms.laddr) && @@ -190,6 +191,22 @@ ip6_tnl_lookup(struct net *net, const struct in6_addr *remote, const struct in6_ (t->dev->flags & IFF_UP)) return t; } + + memset(&any, 0, sizeof(any)); + hash = HASH(&any, local); + for_each_ip6_tunnel_rcu(ip6n->tnls_r_l[hash]) { + if (ipv6_addr_equal(local, &t->parms.laddr) && + (t->dev->flags & IFF_UP)) + return t; + } + + hash = HASH(remote, &any); + for_each_ip6_tunnel_rcu(ip6n->tnls_r_l[hash]) { + if (ipv6_addr_equal(remote, &t->parms.raddr) && + (t->dev->flags & IFF_UP)) + return t; + } + t = rcu_dereference(ip6n->tnls_wc[0]); if (t && (t->dev->flags & IFF_UP)) return t; @@ -474,6 +491,7 @@ ip6_tnl_err(struct sk_buff *skb, __u8 ipproto, struct inet6_skb_parm *opt, int rel_msg = 0; u8 rel_type = ICMPV6_DEST_UNREACH; u8 rel_code = ICMPV6_ADDR_UNREACH; + u8 tproto; __u32 rel_info = 0; __u16 len; int err = -ENOENT; @@ -487,7 +505,8 @@ ip6_tnl_err(struct sk_buff *skb, __u8 ipproto, struct inet6_skb_parm *opt, &ipv6h->saddr)) == NULL) goto out; - if (t->parms.proto != ipproto && t->parms.proto != 0) + tproto = ACCESS_ONCE(t->parms.proto); + if (tproto != ipproto && tproto != 0) goto out; err = 0; @@ -788,6 +807,7 @@ static int ip6_tnl_rcv(struct sk_buff *skb, __u16 protocol, { struct ip6_tnl *t; const struct ipv6hdr *ipv6h = ipv6_hdr(skb); + u8 tproto; int err; rcu_read_lock(); @@ -796,7 +816,8 @@ static int ip6_tnl_rcv(struct sk_buff *skb, __u16 protocol, &ipv6h->daddr)) != NULL) { struct pcpu_sw_netstats *tstats; - if (t->parms.proto != ipproto && t->parms.proto != 0) { + tproto = ACCESS_ONCE(t->parms.proto); + if (tproto != ipproto && tproto != 0) { rcu_read_unlock(); goto discard; } @@ -902,24 +923,28 @@ ip6_tnl_addr_conflict(const struct ip6_tnl *t, const struct ipv6hdr *hdr) return ipv6_addr_equal(&t->parms.raddr, &hdr->saddr); } -int ip6_tnl_xmit_ctl(struct ip6_tnl *t) +int ip6_tnl_xmit_ctl(struct ip6_tnl *t, + const struct in6_addr *laddr, + const struct in6_addr *raddr) { struct __ip6_tnl_parm *p = &t->parms; int ret = 0; struct net *net = t->net; - if (p->flags & IP6_TNL_F_CAP_XMIT) { + if ((p->flags & IP6_TNL_F_CAP_XMIT) || + ((p->flags & IP6_TNL_F_CAP_PER_PACKET) && + (ip6_tnl_get_cap(t, laddr, raddr) & IP6_TNL_F_CAP_XMIT))) { struct net_device *ldev = NULL; rcu_read_lock(); if (p->link) ldev = dev_get_by_index_rcu(net, p->link); - if (unlikely(!ipv6_chk_addr(net, &p->laddr, ldev, 0))) + if (unlikely(!ipv6_chk_addr(net, laddr, ldev, 0))) pr_warn("%s xmit: Local address not yet configured!\n", p->name); - else if (!ipv6_addr_is_multicast(&p->raddr) && - unlikely(ipv6_chk_addr(net, &p->raddr, NULL, 0))) + else if (!ipv6_addr_is_multicast(raddr) && + unlikely(ipv6_chk_addr(net, raddr, NULL, 0))) pr_warn("%s xmit: Routing loop! Remote address found on this node!\n", p->name); else @@ -968,8 +993,34 @@ static int ip6_tnl_xmit2(struct sk_buff *skb, u8 proto; int err = -1; - if (!fl6->flowi6_mark) + /* NBMA tunnel */ + if (ipv6_addr_any(&t->parms.raddr)) { + struct in6_addr *addr6; + struct neighbour *neigh; + int addr_type; + + if (!skb_dst(skb)) + goto tx_err_link_failure; + + neigh = dst_neigh_lookup(skb_dst(skb), + &ipv6_hdr(skb)->daddr); + if (!neigh) + goto tx_err_link_failure; + + addr6 = (struct in6_addr *)&neigh->primary_key; + addr_type = ipv6_addr_type(addr6); + + if (addr_type == IPV6_ADDR_ANY) + addr6 = &ipv6_hdr(skb)->daddr; + + memcpy(&fl6->daddr, addr6, sizeof(fl6->daddr)); + neigh_release(neigh); + } else if (!fl6->flowi6_mark) dst = ip6_tnl_dst_check(t); + + if (!ip6_tnl_xmit_ctl(t, &fl6->saddr, &fl6->daddr)) + goto tx_err_link_failure; + if (!dst) { ndst = ip6_route_output(net, NULL, fl6); @@ -1075,10 +1126,11 @@ ip4ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev) struct flowi6 fl6; __u8 dsfield; __u32 mtu; + u8 tproto; int err; - if ((t->parms.proto != IPPROTO_IPIP && t->parms.proto != 0) || - !ip6_tnl_xmit_ctl(t)) + tproto = ACCESS_ONCE(t->parms.proto); + if (tproto != IPPROTO_IPIP && tproto != 0) return -1; if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) @@ -1117,10 +1169,12 @@ ip6ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev) struct flowi6 fl6; __u8 dsfield; __u32 mtu; + u8 tproto; int err; - if ((t->parms.proto != IPPROTO_IPV6 && t->parms.proto != 0) || - !ip6_tnl_xmit_ctl(t) || ip6_tnl_addr_conflict(t, ipv6h)) + tproto = ACCESS_ONCE(t->parms.proto); + if ((tproto != IPPROTO_IPV6 && tproto != 0) || + ip6_tnl_addr_conflict(t, ipv6h)) return -1; offset = ip6_tnl_parse_tlv_enc_lim(skb, skb_network_header(skb)); @@ -1282,6 +1336,14 @@ static int ip6_tnl_update(struct ip6_tnl *t, struct __ip6_tnl_parm *p) return err; } +static int ip6_tnl0_update(struct ip6_tnl *t, struct __ip6_tnl_parm *p) +{ + /* for default tnl0 device allow to change only the proto */ + t->parms.proto = p->proto; + netdev_state_change(t->dev); + return 0; +} + static void ip6_tnl_parm_from_user(struct __ip6_tnl_parm *p, const struct ip6_tnl_parm *u) { @@ -1381,7 +1443,7 @@ ip6_tnl_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) break; ip6_tnl_parm_from_user(&p1, &p); t = ip6_tnl_locate(net, &p1, cmd == SIOCADDTUNNEL); - if (dev != ip6n->fb_tnl_dev && cmd == SIOCCHGTUNNEL) { + if (cmd == SIOCCHGTUNNEL) { if (t != NULL) { if (t->dev != dev) { err = -EEXIST; @@ -1389,8 +1451,10 @@ ip6_tnl_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) } } else t = netdev_priv(dev); - - err = ip6_tnl_update(t, &p1); + if (dev == ip6n->fb_tnl_dev) + err = ip6_tnl0_update(t, &p1); + else + err = ip6_tnl_update(t, &p1); } if (t) { err = 0; diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c index 31089d153fd3..ec84d03491c7 100644 --- a/net/ipv6/ip6_vti.c +++ b/net/ipv6/ip6_vti.c @@ -412,6 +412,7 @@ vti6_xmit(struct sk_buff *skb, struct net_device *dev, struct flowi *fl) struct net_device_stats *stats = &t->dev->stats; struct dst_entry *dst = skb_dst(skb); struct net_device *tdev; + struct xfrm_state *x; int err = -1; if (!dst) @@ -425,7 +426,12 @@ vti6_xmit(struct sk_buff *skb, struct net_device *dev, struct flowi *fl) goto tx_err_link_failure; } - if (!vti6_state_check(dst->xfrm, &t->parms.raddr, &t->parms.laddr)) + x = dst->xfrm; + if (!vti6_state_check(x, &t->parms.raddr, &t->parms.laddr)) + goto tx_err_link_failure; + + if (!ip6_tnl_xmit_ctl(t, (const struct in6_addr *)&x->props.saddr, + (const struct in6_addr *)&x->id.daddr)) goto tx_err_link_failure; tdev = dst->dev; @@ -480,7 +486,7 @@ vti6_tnl_xmit(struct sk_buff *skb, struct net_device *dev) ipv6h = ipv6_hdr(skb); if ((t->parms.proto != IPPROTO_IPV6 && t->parms.proto != 0) || - !ip6_tnl_xmit_ctl(t) || vti6_addr_conflict(t, ipv6h)) + vti6_addr_conflict(t, ipv6h)) goto tx_err; xfrm_decode_session(skb, &fl, AF_INET6); diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index 0171f08325c3..467f310dbbb3 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -2090,7 +2090,7 @@ static void ip6_mr_forward(struct net *net, struct mr6_table *mrt, if (ipv6_addr_any(&cache->mf6c_origin) && true_vifi >= 0) { struct mfc6_cache *cache_proxy; - /* For an (*,G) entry, we only check that the incomming + /* For an (*,G) entry, we only check that the incoming * interface is part of the static tree. */ cache_proxy = ip6mr_cache_find_any_parent(mrt, vif); diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index 9648de2b6745..5ce107c8aab3 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -1550,7 +1550,7 @@ static void ip6_mc_hdr(struct sock *sk, struct sk_buff *skb, hdr->daddr = *daddr; } -static struct sk_buff *mld_newpack(struct inet6_dev *idev, int size) +static struct sk_buff *mld_newpack(struct inet6_dev *idev, unsigned int mtu) { struct net_device *dev = idev->dev; struct net *net = dev_net(dev); @@ -1561,13 +1561,13 @@ static struct sk_buff *mld_newpack(struct inet6_dev *idev, int size) const struct in6_addr *saddr; int hlen = LL_RESERVED_SPACE(dev); int tlen = dev->needed_tailroom; + unsigned int size = mtu + hlen + tlen; int err; u8 ra[8] = { IPPROTO_ICMPV6, 0, IPV6_TLV_ROUTERALERT, 2, 0, 0, IPV6_TLV_PADN, 0 }; /* we assume size > sizeof(ra) here */ - size += hlen + tlen; /* limit our allocations to order-0 page */ size = min_t(int, size, SKB_MAX_ORDER(0, 0)); skb = sock_alloc_send_skb(sk, size, 1, &err); @@ -1576,6 +1576,8 @@ static struct sk_buff *mld_newpack(struct inet6_dev *idev, int size) return NULL; skb->priority = TC_PRIO_CONTROL; + skb->reserved_tailroom = skb_end_offset(skb) - + min(mtu, skb_end_offset(skb)); skb_reserve(skb, hlen); if (__ipv6_get_lladdr(idev, &addr_buf, IFA_F_TENTATIVE)) { @@ -1690,8 +1692,7 @@ static struct sk_buff *add_grhead(struct sk_buff *skb, struct ifmcaddr6 *pmc, return skb; } -#define AVAILABLE(skb) ((skb) ? ((skb)->dev ? (skb)->dev->mtu - (skb)->len : \ - skb_tailroom(skb)) : 0) +#define AVAILABLE(skb) ((skb) ? skb_availroom(skb) : 0) static struct sk_buff *add_grec(struct sk_buff *skb, struct ifmcaddr6 *pmc, int type, int gdeleted, int sdeleted, int crsend) @@ -2823,11 +2824,7 @@ static int igmp6_mcf_seq_show(struct seq_file *seq, void *v) struct igmp6_mcf_iter_state *state = igmp6_mcf_seq_private(seq); if (v == SEQ_START_TOKEN) { - seq_printf(seq, - "%3s %6s " - "%32s %32s %6s %6s\n", "Idx", - "Device", "Multicast Address", - "Source Address", "INC", "EXC"); + seq_puts(seq, "Idx Device Multicast Address Source Address INC EXC\n"); } else { seq_printf(seq, "%3d %6.6s %pi6 %pi6 %6lu %6lu\n", diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index 896af8807979..0cbcf98f2cab 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -486,11 +486,11 @@ static int rawv6_recvmsg(struct kiocb *iocb, struct sock *sk, } if (skb_csum_unnecessary(skb)) { - err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + err = skb_copy_datagram_msg(skb, 0, msg, copied); } else if (msg->msg_flags&MSG_TRUNC) { if (__skb_checksum_complete(skb)) goto csum_copy_err; - err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + err = skb_copy_datagram_msg(skb, 0, msg, copied); } else { err = skb_copy_and_csum_datagram_iovec(skb, 0, msg->msg_iov); if (err == -EINVAL) @@ -548,7 +548,8 @@ static int rawv6_push_pending_frames(struct sock *sk, struct flowi6 *fl6, if (!rp->checksum) goto send; - if ((skb = skb_peek(&sk->sk_write_queue)) == NULL) + skb = skb_peek(&sk->sk_write_queue); + if (!skb) goto out; offset = rp->offset; diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c index 1a157ca2ebc1..51ab096ae574 100644 --- a/net/ipv6/reassembly.c +++ b/net/ipv6/reassembly.c @@ -69,7 +69,7 @@ struct ip6frag_skb_cb { #define FRAG6_CB(skb) ((struct ip6frag_skb_cb *)((skb)->cb)) -static inline u8 ip6_frag_ecn(const struct ipv6hdr *ipv6h) +static u8 ip6_frag_ecn(const struct ipv6hdr *ipv6h) { return 1 << (ipv6_get_dsfield(ipv6h) & INET_ECN_MASK); } @@ -178,7 +178,7 @@ static void ip6_frag_expire(unsigned long data) ip6_expire_frag_queue(net, fq, &ip6_frags); } -static __inline__ struct frag_queue * +static struct frag_queue * fq_find(struct net *net, __be32 id, const struct in6_addr *src, const struct in6_addr *dst, u8 ecn) { @@ -684,21 +684,21 @@ static void ip6_frags_sysctl_unregister(void) unregister_net_sysctl_table(ip6_ctl_header); } #else -static inline int ip6_frags_ns_sysctl_register(struct net *net) +static int ip6_frags_ns_sysctl_register(struct net *net) { return 0; } -static inline void ip6_frags_ns_sysctl_unregister(struct net *net) +static void ip6_frags_ns_sysctl_unregister(struct net *net) { } -static inline int ip6_frags_sysctl_register(void) +static int ip6_frags_sysctl_register(void) { return 0; } -static inline void ip6_frags_sysctl_unregister(void) +static void ip6_frags_sysctl_unregister(void) { } #endif diff --git a/net/ipv6/route.c b/net/ipv6/route.c index a318dd89b6d9..c91083156edb 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -772,23 +772,22 @@ int rt6_route_rcv(struct net_device *dev, u8 *opt, int len, } #endif -#define BACKTRACK(__net, saddr) \ -do { \ - if (rt == __net->ipv6.ip6_null_entry) { \ - struct fib6_node *pn; \ - while (1) { \ - if (fn->fn_flags & RTN_TL_ROOT) \ - goto out; \ - pn = fn->parent; \ - if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn) \ - fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr); \ - else \ - fn = pn; \ - if (fn->fn_flags & RTN_RTINFO) \ - goto restart; \ - } \ - } \ -} while (0) +static struct fib6_node* fib6_backtrack(struct fib6_node *fn, + struct in6_addr *saddr) +{ + struct fib6_node *pn; + while (1) { + if (fn->fn_flags & RTN_TL_ROOT) + return NULL; + pn = fn->parent; + if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn) + fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr); + else + fn = pn; + if (fn->fn_flags & RTN_RTINFO) + return fn; + } +} static struct rt6_info *ip6_pol_route_lookup(struct net *net, struct fib6_table *table, @@ -804,8 +803,11 @@ restart: rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags); if (rt->rt6i_nsiblings && fl6->flowi6_oif == 0) rt = rt6_multipath_select(rt, fl6, fl6->flowi6_oif, flags); - BACKTRACK(net, &fl6->saddr); -out: + if (rt == net->ipv6.ip6_null_entry) { + fn = fib6_backtrack(fn, &fl6->saddr); + if (fn) + goto restart; + } dst_use(&rt->dst, jiffies); read_unlock_bh(&table->tb6_lock); return rt; @@ -915,33 +917,48 @@ static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort, static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif, struct flowi6 *fl6, int flags) { - struct fib6_node *fn; + struct fib6_node *fn, *saved_fn; struct rt6_info *rt, *nrt; int strict = 0; int attempts = 3; int err; - int reachable = net->ipv6.devconf_all->forwarding ? 0 : RT6_LOOKUP_F_REACHABLE; strict |= flags & RT6_LOOKUP_F_IFACE; + if (net->ipv6.devconf_all->forwarding == 0) + strict |= RT6_LOOKUP_F_REACHABLE; -relookup: +redo_fib6_lookup_lock: read_lock_bh(&table->tb6_lock); -restart_2: fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr); + saved_fn = fn; -restart: - rt = rt6_select(fn, oif, strict | reachable); +redo_rt6_select: + rt = rt6_select(fn, oif, strict); if (rt->rt6i_nsiblings) - rt = rt6_multipath_select(rt, fl6, oif, strict | reachable); - BACKTRACK(net, &fl6->saddr); - if (rt == net->ipv6.ip6_null_entry || - rt->rt6i_flags & RTF_CACHE) - goto out; + rt = rt6_multipath_select(rt, fl6, oif, strict); + if (rt == net->ipv6.ip6_null_entry) { + fn = fib6_backtrack(fn, &fl6->saddr); + if (fn) + goto redo_rt6_select; + else if (strict & RT6_LOOKUP_F_REACHABLE) { + /* also consider unreachable route */ + strict &= ~RT6_LOOKUP_F_REACHABLE; + fn = saved_fn; + goto redo_rt6_select; + } else { + dst_hold(&rt->dst); + read_unlock_bh(&table->tb6_lock); + goto out2; + } + } dst_hold(&rt->dst); read_unlock_bh(&table->tb6_lock); + if (rt->rt6i_flags & RTF_CACHE) + goto out2; + if (!(rt->rt6i_flags & (RTF_NONEXTHOP | RTF_GATEWAY))) nrt = rt6_alloc_cow(rt, &fl6->daddr, &fl6->saddr); else if (!(rt->dst.flags & DST_HOST)) @@ -967,15 +984,8 @@ restart: * released someone could insert this route. Relookup. */ ip6_rt_put(rt); - goto relookup; + goto redo_fib6_lookup_lock; -out: - if (reachable) { - reachable = 0; - goto restart_2; - } - dst_hold(&rt->dst); - read_unlock_bh(&table->tb6_lock); out2: rt->dst.lastuse = jiffies; rt->dst.__use++; @@ -1235,10 +1245,12 @@ restart: rt = net->ipv6.ip6_null_entry; else if (rt->dst.error) { rt = net->ipv6.ip6_null_entry; - goto out; + } else if (rt == net->ipv6.ip6_null_entry) { + fn = fib6_backtrack(fn, &fl6->saddr); + if (fn) + goto restart; } - BACKTRACK(net, &fl6->saddr); -out: + dst_hold(&rt->dst); read_unlock_bh(&table->tb6_lock); diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index a24557a1c1d8..660496de6125 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -1711,7 +1711,7 @@ static int ipip6_fill_info(struct sk_buff *skb, const struct net_device *dev) nla_put_u16(skb, IFLA_IPTUN_ENCAP_DPORT, tunnel->encap.dport) || nla_put_u16(skb, IFLA_IPTUN_ENCAP_FLAGS, - tunnel->encap.dport)) + tunnel->encap.flags)) goto nla_put_failure; return 0; diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c index 2f25cb6347ca..7337fc7947e2 100644 --- a/net/ipv6/syncookies.c +++ b/net/ipv6/syncookies.c @@ -166,13 +166,15 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) int mss; struct dst_entry *dst; __u8 rcv_wscale; - bool ecn_ok = false; if (!sysctl_tcp_syncookies || !th->ack || th->rst) goto out; - if (tcp_synq_no_recent_overflow(sk) || - (mss = __cookie_v6_check(ipv6_hdr(skb), th, cookie)) == 0) { + if (tcp_synq_no_recent_overflow(sk)) + goto out; + + mss = __cookie_v6_check(ipv6_hdr(skb), th, cookie); + if (mss == 0) { NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SYNCOOKIESFAILED); goto out; } @@ -183,7 +185,7 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) memset(&tcp_opt, 0, sizeof(tcp_opt)); tcp_parse_options(skb, &tcp_opt, 0, NULL); - if (!cookie_check_timestamp(&tcp_opt, sock_net(sk), &ecn_ok)) + if (!cookie_timestamp_decode(&tcp_opt)) goto out; ret = NULL; @@ -220,7 +222,6 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) req->expires = 0UL; req->num_retrans = 0; - ireq->ecn_ok = ecn_ok; ireq->snd_wscale = tcp_opt.snd_wscale; ireq->sack_ok = tcp_opt.sack_ok; ireq->wscale_ok = tcp_opt.wscale_ok; @@ -261,6 +262,7 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) dst_metric(dst, RTAX_INITRWND)); ireq->rcv_wscale = rcv_wscale; + ireq->ecn_ok = cookie_ecn_ok(&tcp_opt, sock_net(sk), dst); ret = get_cookie_sock(sk, skb, req, dst); out: @@ -269,4 +271,3 @@ out_free: reqsk_free(req); return NULL; } - diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index f6ba535b6feb..9b6809232b17 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -424,8 +424,8 @@ try_again: } if (skb_csum_unnecessary(skb)) - err = skb_copy_datagram_iovec(skb, sizeof(struct udphdr), - msg->msg_iov, copied); + err = skb_copy_datagram_msg(skb, sizeof(struct udphdr), + msg, copied); else { err = skb_copy_and_csum_datagram_iovec(skb, sizeof(struct udphdr), msg->msg_iov); if (err == -EINVAL) diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c index 6b8f543f6ac6..b6aa8ed18257 100644 --- a/net/ipv6/udp_offload.c +++ b/net/ipv6/udp_offload.c @@ -42,11 +42,11 @@ static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb, SKB_GSO_DODGY | SKB_GSO_UDP_TUNNEL | SKB_GSO_UDP_TUNNEL_CSUM | + SKB_GSO_TUNNEL_REMCSUM | SKB_GSO_GRE | SKB_GSO_GRE_CSUM | SKB_GSO_IPIP | - SKB_GSO_SIT | - SKB_GSO_MPLS) || + SKB_GSO_SIT) || !(type & (SKB_GSO_UDP)))) goto out; diff --git a/net/ipx/af_ipx.c b/net/ipx/af_ipx.c index 91729b807c7d..a0c75366c93b 100644 --- a/net/ipx/af_ipx.c +++ b/net/ipx/af_ipx.c @@ -306,7 +306,7 @@ void ipxitf_down(struct ipx_interface *intrfc) spin_unlock_bh(&ipx_interfaces_lock); } -static __inline__ void __ipxitf_put(struct ipx_interface *intrfc) +static void __ipxitf_put(struct ipx_interface *intrfc) { if (atomic_dec_and_test(&intrfc->refcnt)) __ipxitf_down(intrfc); @@ -1805,8 +1805,7 @@ static int ipx_recvmsg(struct kiocb *iocb, struct socket *sock, msg->msg_flags |= MSG_TRUNC; } - rc = skb_copy_datagram_iovec(skb, sizeof(struct ipxhdr), msg->msg_iov, - copied); + rc = skb_copy_datagram_msg(skb, sizeof(struct ipxhdr), msg, copied); if (rc) goto out_free; if (skb->tstamp.tv64) diff --git a/net/ipx/ipx_proc.c b/net/ipx/ipx_proc.c index e15c16a517e7..c1d247ebe916 100644 --- a/net/ipx/ipx_proc.c +++ b/net/ipx/ipx_proc.c @@ -45,7 +45,7 @@ static int ipx_seq_interface_show(struct seq_file *seq, void *v) } i = list_entry(v, struct ipx_interface, node); - seq_printf(seq, "%08lX ", (unsigned long int)ntohl(i->if_netnum)); + seq_printf(seq, "%08X ", ntohl(i->if_netnum)); seq_printf(seq, "%02X%02X%02X%02X%02X%02X ", i->if_node[0], i->if_node[1], i->if_node[2], i->if_node[3], i->if_node[4], i->if_node[5]); @@ -87,10 +87,10 @@ static int ipx_seq_route_show(struct seq_file *seq, void *v) rt = list_entry(v, struct ipx_route, node); - seq_printf(seq, "%08lX ", (unsigned long int)ntohl(rt->ir_net)); + seq_printf(seq, "%08X ", ntohl(rt->ir_net)); if (rt->ir_routed) - seq_printf(seq, "%08lX %02X%02X%02X%02X%02X%02X\n", - (long unsigned int)ntohl(rt->ir_intrfc->if_netnum), + seq_printf(seq, "%08X %02X%02X%02X%02X%02X%02X\n", + ntohl(rt->ir_intrfc->if_netnum), rt->ir_router_node[0], rt->ir_router_node[1], rt->ir_router_node[2], rt->ir_router_node[3], rt->ir_router_node[4], rt->ir_router_node[5]); @@ -194,19 +194,19 @@ static int ipx_seq_socket_show(struct seq_file *seq, void *v) s = v; ipxs = ipx_sk(s); #ifdef CONFIG_IPX_INTERN - seq_printf(seq, "%08lX:%02X%02X%02X%02X%02X%02X:%04X ", - (unsigned long)ntohl(ipxs->intrfc->if_netnum), + seq_printf(seq, "%08X:%02X%02X%02X%02X%02X%02X:%04X ", + ntohl(ipxs->intrfc->if_netnum), ipxs->node[0], ipxs->node[1], ipxs->node[2], ipxs->node[3], ipxs->node[4], ipxs->node[5], ntohs(ipxs->port)); #else - seq_printf(seq, "%08lX:%04X ", (unsigned long) ntohl(ipxs->intrfc->if_netnum), + seq_printf(seq, "%08X:%04X ", ntohl(ipxs->intrfc->if_netnum), ntohs(ipxs->port)); #endif /* CONFIG_IPX_INTERN */ if (s->sk_state != TCP_ESTABLISHED) seq_printf(seq, "%-28s", "Not_Connected"); else { - seq_printf(seq, "%08lX:%02X%02X%02X%02X%02X%02X:%04X ", - (unsigned long)ntohl(ipxs->dest_addr.net), + seq_printf(seq, "%08X:%02X%02X%02X%02X%02X%02X:%04X ", + ntohl(ipxs->dest_addr.net), ipxs->dest_addr.node[0], ipxs->dest_addr.node[1], ipxs->dest_addr.node[2], ipxs->dest_addr.node[3], ipxs->dest_addr.node[4], ipxs->dest_addr.node[5], diff --git a/net/ipx/sysctl_net_ipx.c b/net/ipx/sysctl_net_ipx.c index ad7c03dedaab..0dafcc561ed6 100644 --- a/net/ipx/sysctl_net_ipx.c +++ b/net/ipx/sysctl_net_ipx.c @@ -9,14 +9,12 @@ #include <linux/mm.h> #include <linux/sysctl.h> #include <net/net_namespace.h> +#include <net/ipx.h> #ifndef CONFIG_SYSCTL #error This file should not be compiled without CONFIG_SYSCTL defined #endif -/* From af_ipx.c */ -extern int sysctl_ipx_pprop_broadcasting; - static struct ctl_table ipx_table[] = { { .procname = "ipx_pprop_broadcasting", diff --git a/net/irda/af_irda.c b/net/irda/af_irda.c index 92fafd485deb..980bc2670a13 100644 --- a/net/irda/af_irda.c +++ b/net/irda/af_irda.c @@ -1396,7 +1396,7 @@ static int irda_recvmsg_dgram(struct kiocb *iocb, struct socket *sock, copied = size; msg->msg_flags |= MSG_TRUNC; } - skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + skb_copy_datagram_msg(skb, 0, msg, copied); skb_free_datagram(sk, skb); diff --git a/net/iucv/af_iucv.c b/net/iucv/af_iucv.c index a089b6b91650..057b5647ef92 100644 --- a/net/iucv/af_iucv.c +++ b/net/iucv/af_iucv.c @@ -1355,7 +1355,7 @@ static int iucv_sock_recvmsg(struct kiocb *iocb, struct socket *sock, sk->sk_shutdown = sk->sk_shutdown | RCV_SHUTDOWN; cskb = skb; - if (skb_copy_datagram_iovec(cskb, offset, msg->msg_iov, copied)) { + if (skb_copy_datagram_msg(cskb, offset, msg, copied)) { if (!(flags & MSG_PEEK)) skb_queue_head(&sk->sk_receive_queue, skb); return -EFAULT; diff --git a/net/key/af_key.c b/net/key/af_key.c index 1847ec4e3930..e5883091a8c6 100644 --- a/net/key/af_key.c +++ b/net/key/af_key.c @@ -3654,7 +3654,7 @@ static int pfkey_recvmsg(struct kiocb *kiocb, } skb_reset_transport_header(skb); - err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + err = skb_copy_datagram_msg(skb, 0, msg, copied); if (err) goto out_free; diff --git a/net/l2tp/l2tp_ip.c b/net/l2tp/l2tp_ip.c index 369a9822488c..a6cc1fed2b52 100644 --- a/net/l2tp/l2tp_ip.c +++ b/net/l2tp/l2tp_ip.c @@ -528,7 +528,7 @@ static int l2tp_ip_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *m copied = len; } - err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + err = skb_copy_datagram_msg(skb, 0, msg, copied); if (err) goto done; diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c index 0edb263cc002..2177b960da87 100644 --- a/net/l2tp/l2tp_ip6.c +++ b/net/l2tp/l2tp_ip6.c @@ -672,7 +672,7 @@ static int l2tp_ip6_recvmsg(struct kiocb *iocb, struct sock *sk, copied = len; } - err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + err = skb_copy_datagram_msg(skb, 0, msg, copied); if (err) goto done; diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c index b704a9356208..c559bcdf4679 100644 --- a/net/l2tp/l2tp_ppp.c +++ b/net/l2tp/l2tp_ppp.c @@ -208,7 +208,7 @@ static int pppol2tp_recvmsg(struct kiocb *iocb, struct socket *sock, else if (len < skb->len) msg->msg_flags |= MSG_TRUNC; - err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, len); + err = skb_copy_datagram_msg(skb, 0, msg, len); if (likely(err == 0)) err = len; diff --git a/net/lapb/lapb_iface.c b/net/lapb/lapb_iface.c index 3cdaa046c1bc..fc60d9d738b5 100644 --- a/net/lapb/lapb_iface.c +++ b/net/lapb/lapb_iface.c @@ -73,6 +73,7 @@ static void __lapb_remove_cb(struct lapb_cb *lapb) lapb_put(lapb); } } +EXPORT_SYMBOL(lapb_register); /* * Add a socket to the bound sockets list. @@ -195,6 +196,7 @@ out: write_unlock_bh(&lapb_list_lock); return rc; } +EXPORT_SYMBOL(lapb_unregister); int lapb_getparms(struct net_device *dev, struct lapb_parms_struct *parms) { @@ -227,6 +229,7 @@ int lapb_getparms(struct net_device *dev, struct lapb_parms_struct *parms) out: return rc; } +EXPORT_SYMBOL(lapb_getparms); int lapb_setparms(struct net_device *dev, struct lapb_parms_struct *parms) { @@ -262,6 +265,7 @@ out_put: out: return rc; } +EXPORT_SYMBOL(lapb_setparms); int lapb_connect_request(struct net_device *dev) { @@ -290,6 +294,7 @@ out_put: out: return rc; } +EXPORT_SYMBOL(lapb_connect_request); int lapb_disconnect_request(struct net_device *dev) { @@ -334,6 +339,7 @@ out_put: out: return rc; } +EXPORT_SYMBOL(lapb_disconnect_request); int lapb_data_request(struct net_device *dev, struct sk_buff *skb) { @@ -355,6 +361,7 @@ out_put: out: return rc; } +EXPORT_SYMBOL(lapb_data_request); int lapb_data_received(struct net_device *dev, struct sk_buff *skb) { @@ -369,6 +376,7 @@ int lapb_data_received(struct net_device *dev, struct sk_buff *skb) return rc; } +EXPORT_SYMBOL(lapb_data_received); void lapb_connect_confirmation(struct lapb_cb *lapb, int reason) { @@ -415,15 +423,6 @@ int lapb_data_transmit(struct lapb_cb *lapb, struct sk_buff *skb) return used; } -EXPORT_SYMBOL(lapb_register); -EXPORT_SYMBOL(lapb_unregister); -EXPORT_SYMBOL(lapb_getparms); -EXPORT_SYMBOL(lapb_setparms); -EXPORT_SYMBOL(lapb_connect_request); -EXPORT_SYMBOL(lapb_disconnect_request); -EXPORT_SYMBOL(lapb_data_request); -EXPORT_SYMBOL(lapb_data_received); - static int __init lapb_init(void) { return 0; diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c index bb9cbc17d926..af662669f951 100644 --- a/net/llc/af_llc.c +++ b/net/llc/af_llc.c @@ -819,8 +819,7 @@ static int llc_ui_recvmsg(struct kiocb *iocb, struct socket *sock, used = len; if (!(flags & MSG_TRUNC)) { - int rc = skb_copy_datagram_iovec(skb, offset, - msg->msg_iov, used); + int rc = skb_copy_datagram_msg(skb, offset, msg, used); if (rc) { /* Exception. Bailout! */ if (!copied) diff --git a/net/llc/llc_if.c b/net/llc/llc_if.c index 25c31c0a3fdb..6daf391b3e84 100644 --- a/net/llc/llc_if.c +++ b/net/llc/llc_if.c @@ -15,7 +15,7 @@ #include <linux/module.h> #include <linux/kernel.h> #include <linux/netdevice.h> -#include <asm/errno.h> +#include <linux/errno.h> #include <net/llc_if.h> #include <net/llc_sap.h> #include <net/llc_s_ev.h> diff --git a/net/mpls/mpls_gso.c b/net/mpls/mpls_gso.c index e3545f21a099..ca27837974fe 100644 --- a/net/mpls/mpls_gso.c +++ b/net/mpls/mpls_gso.c @@ -34,8 +34,7 @@ static struct sk_buff *mpls_gso_segment(struct sk_buff *skb, SKB_GSO_TCP_ECN | SKB_GSO_GRE | SKB_GSO_GRE_CSUM | - SKB_GSO_IPIP | - SKB_GSO_MPLS))) + SKB_GSO_IPIP))) goto out; /* Setup inner SKB. */ diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index f1de72de273e..580b79452bec 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -2401,7 +2401,7 @@ static int netlink_recvmsg(struct kiocb *kiocb, struct socket *sock, } skb_reset_transport_header(data_skb); - err = skb_copy_datagram_iovec(data_skb, 0, msg->msg_iov, copied); + err = skb_copy_datagram_msg(data_skb, 0, msg, copied); if (msg->msg_name) { DECLARE_SOCKADDR(struct sockaddr_nl *, addr, msg->msg_name); diff --git a/net/netrom/af_netrom.c b/net/netrom/af_netrom.c index 1b06a1fcf3e8..7e13f6afcd1f 100644 --- a/net/netrom/af_netrom.c +++ b/net/netrom/af_netrom.c @@ -1167,7 +1167,7 @@ static int nr_recvmsg(struct kiocb *iocb, struct socket *sock, msg->msg_flags |= MSG_TRUNC; } - er = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + er = skb_copy_datagram_msg(skb, 0, msg, copied); if (er < 0) { skb_free_datagram(sk, skb); release_sock(sk); diff --git a/net/nfc/llcp_sock.c b/net/nfc/llcp_sock.c index 51f077a92fa9..83bc785d5855 100644 --- a/net/nfc/llcp_sock.c +++ b/net/nfc/llcp_sock.c @@ -832,7 +832,7 @@ static int llcp_sock_recvmsg(struct kiocb *iocb, struct socket *sock, copied = min_t(unsigned int, rlen, len); cskb = skb; - if (skb_copy_datagram_iovec(cskb, 0, msg->msg_iov, copied)) { + if (skb_copy_datagram_msg(cskb, 0, msg, copied)) { if (!(flags & MSG_PEEK)) skb_queue_head(&sk->sk_receive_queue, skb); return -EFAULT; diff --git a/net/nfc/rawsock.c b/net/nfc/rawsock.c index 11c3544ea546..9d7d2b7ba5e4 100644 --- a/net/nfc/rawsock.c +++ b/net/nfc/rawsock.c @@ -269,7 +269,7 @@ static int rawsock_recvmsg(struct kiocb *iocb, struct socket *sock, copied = len; } - rc = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + rc = skb_copy_datagram_msg(skb, 0, msg, copied); skb_free_datagram(sk, skb); diff --git a/net/openvswitch/Kconfig b/net/openvswitch/Kconfig index ba3bb8203b99..454ce12efbbf 100644 --- a/net/openvswitch/Kconfig +++ b/net/openvswitch/Kconfig @@ -29,11 +29,12 @@ config OPENVSWITCH If unsure, say N. config OPENVSWITCH_GRE - bool "Open vSwitch GRE tunneling support" + tristate "Open vSwitch GRE tunneling support" + select NET_MPLS_GSO depends on INET depends on OPENVSWITCH - depends on NET_IPGRE_DEMUX && !(OPENVSWITCH=y && NET_IPGRE_DEMUX=m) - default y + depends on NET_IPGRE_DEMUX + default OPENVSWITCH ---help--- If you say Y here, then the Open vSwitch will be able create GRE vport. @@ -43,11 +44,11 @@ config OPENVSWITCH_GRE If unsure, say Y. config OPENVSWITCH_VXLAN - bool "Open vSwitch VXLAN tunneling support" + tristate "Open vSwitch VXLAN tunneling support" depends on INET depends on OPENVSWITCH - depends on VXLAN && !(OPENVSWITCH=y && VXLAN=m) - default y + depends on VXLAN + default OPENVSWITCH ---help--- If you say Y here, then the Open vSwitch will be able create vxlan vport. @@ -56,11 +57,11 @@ config OPENVSWITCH_VXLAN If unsure, say Y. config OPENVSWITCH_GENEVE - bool "Open vSwitch Geneve tunneling support" + tristate "Open vSwitch Geneve tunneling support" depends on INET depends on OPENVSWITCH - depends on GENEVE && !(OPENVSWITCH=y && GENEVE=m) - default y + depends on GENEVE + default OPENVSWITCH ---help--- If you say Y here, then the Open vSwitch will be able create geneve vport. diff --git a/net/openvswitch/Makefile b/net/openvswitch/Makefile index 9a33a273c375..91b9478413ef 100644 --- a/net/openvswitch/Makefile +++ b/net/openvswitch/Makefile @@ -15,14 +15,6 @@ openvswitch-y := \ vport-internal_dev.o \ vport-netdev.o -ifneq ($(CONFIG_OPENVSWITCH_GENEVE),) -openvswitch-y += vport-geneve.o -endif - -ifneq ($(CONFIG_OPENVSWITCH_VXLAN),) -openvswitch-y += vport-vxlan.o -endif - -ifneq ($(CONFIG_OPENVSWITCH_GRE),) -openvswitch-y += vport-gre.o -endif +obj-$(CONFIG_OPENVSWITCH_GENEVE)+= vport-geneve.o +obj-$(CONFIG_OPENVSWITCH_VXLAN) += vport-vxlan.o +obj-$(CONFIG_OPENVSWITCH_GRE) += vport-gre.o diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index 006886dbee36..f7e589159e4a 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -28,10 +28,12 @@ #include <linux/in6.h> #include <linux/if_arp.h> #include <linux/if_vlan.h> + #include <net/ip.h> #include <net/ipv6.h> #include <net/checksum.h> #include <net/dsfield.h> +#include <net/mpls.h> #include <net/sctp/checksum.h> #include "datapath.h" @@ -118,6 +120,92 @@ static int make_writable(struct sk_buff *skb, int write_len) return pskb_expand_head(skb, 0, 0, GFP_ATOMIC); } +static int push_mpls(struct sk_buff *skb, + const struct ovs_action_push_mpls *mpls) +{ + __be32 *new_mpls_lse; + struct ethhdr *hdr; + + /* Networking stack do not allow simultaneous Tunnel and MPLS GSO. */ + if (skb->encapsulation) + return -ENOTSUPP; + + if (skb_cow_head(skb, MPLS_HLEN) < 0) + return -ENOMEM; + + skb_push(skb, MPLS_HLEN); + memmove(skb_mac_header(skb) - MPLS_HLEN, skb_mac_header(skb), + skb->mac_len); + skb_reset_mac_header(skb); + + new_mpls_lse = (__be32 *)skb_mpls_header(skb); + *new_mpls_lse = mpls->mpls_lse; + + if (skb->ip_summed == CHECKSUM_COMPLETE) + skb->csum = csum_add(skb->csum, csum_partial(new_mpls_lse, + MPLS_HLEN, 0)); + + hdr = eth_hdr(skb); + hdr->h_proto = mpls->mpls_ethertype; + + skb_set_inner_protocol(skb, skb->protocol); + skb->protocol = mpls->mpls_ethertype; + + return 0; +} + +static int pop_mpls(struct sk_buff *skb, const __be16 ethertype) +{ + struct ethhdr *hdr; + int err; + + err = make_writable(skb, skb->mac_len + MPLS_HLEN); + if (unlikely(err)) + return err; + + if (skb->ip_summed == CHECKSUM_COMPLETE) + skb->csum = csum_sub(skb->csum, + csum_partial(skb_mpls_header(skb), + MPLS_HLEN, 0)); + + memmove(skb_mac_header(skb) + MPLS_HLEN, skb_mac_header(skb), + skb->mac_len); + + __skb_pull(skb, MPLS_HLEN); + skb_reset_mac_header(skb); + + /* skb_mpls_header() is used to locate the ethertype + * field correctly in the presence of VLAN tags. + */ + hdr = (struct ethhdr *)(skb_mpls_header(skb) - ETH_HLEN); + hdr->h_proto = ethertype; + if (eth_p_mpls(skb->protocol)) + skb->protocol = ethertype; + return 0; +} + +static int set_mpls(struct sk_buff *skb, const __be32 *mpls_lse) +{ + __be32 *stack; + int err; + + err = make_writable(skb, skb->mac_len + MPLS_HLEN); + if (unlikely(err)) + return err; + + stack = (__be32 *)skb_mpls_header(skb); + if (skb->ip_summed == CHECKSUM_COMPLETE) { + __be32 diff[] = { ~(*stack), *mpls_lse }; + + skb->csum = ~csum_partial((char *)diff, sizeof(diff), + ~skb->csum); + } + + *stack = *mpls_lse; + + return 0; +} + /* remove VLAN header from packet and update csum accordingly. */ static int __pop_vlan_tci(struct sk_buff *skb, __be16 *current_tci) { @@ -140,10 +228,12 @@ static int __pop_vlan_tci(struct sk_buff *skb, __be16 *current_tci) vlan_set_encap_proto(skb, vhdr); skb->mac_header += VLAN_HLEN; + if (skb_network_offset(skb) < ETH_HLEN) skb_set_network_header(skb, ETH_HLEN); - skb_reset_mac_len(skb); + /* Update mac_len for subsequent MPLS actions */ + skb_reset_mac_len(skb); return 0; } @@ -186,6 +276,8 @@ static int push_vlan(struct sk_buff *skb, const struct ovs_action_push_vlan *vla if (!__vlan_put_tag(skb, skb->vlan_proto, current_tag)) return -ENOMEM; + /* Update mac_len for subsequent MPLS actions */ + skb->mac_len += VLAN_HLEN; if (skb->ip_summed == CHECKSUM_COMPLETE) skb->csum = csum_add(skb->csum, csum_partial(skb->data @@ -459,21 +551,14 @@ static int set_sctp(struct sk_buff *skb, return 0; } -static int do_output(struct datapath *dp, struct sk_buff *skb, int out_port) +static void do_output(struct datapath *dp, struct sk_buff *skb, int out_port) { - struct vport *vport; - - if (unlikely(!skb)) - return -ENOMEM; + struct vport *vport = ovs_vport_rcu(dp, out_port); - vport = ovs_vport_rcu(dp, out_port); - if (unlikely(!vport)) { + if (likely(vport)) + ovs_vport_send(vport, skb); + else kfree_skb(skb); - return -ENODEV; - } - - ovs_vport_send(vport, skb); - return 0; } static int output_userspace(struct datapath *dp, struct sk_buff *skb, @@ -504,11 +589,6 @@ static int output_userspace(struct datapath *dp, struct sk_buff *skb, return ovs_dp_upcall(dp, skb, &upcall); } -static bool last_action(const struct nlattr *a, int rem) -{ - return a->nla_len == rem; -} - static int sample(struct datapath *dp, struct sk_buff *skb, struct sw_flow_key *key, const struct nlattr *attr) { @@ -543,7 +623,7 @@ static int sample(struct datapath *dp, struct sk_buff *skb, * user space. This skb will be consumed by its caller. */ if (likely(nla_type(a) == OVS_ACTION_ATTR_USERSPACE && - last_action(a, rem))) + nla_is_last(a, rem))) return output_userspace(dp, skb, key, a); skb = skb_clone(skb, GFP_ATOMIC); @@ -617,6 +697,10 @@ static int execute_set_action(struct sk_buff *skb, case OVS_KEY_ATTR_SCTP: err = set_sctp(skb, nla_data(nested_attr)); break; + + case OVS_KEY_ATTR_MPLS: + err = set_mpls(skb, nla_data(nested_attr)); + break; } return err; @@ -633,7 +717,7 @@ static int execute_recirc(struct datapath *dp, struct sk_buff *skb, if (err) return err; - if (!last_action(a, rem)) { + if (!nla_is_last(a, rem)) { /* Recirc action is the not the last action * of the action list, need to clone the skb. */ @@ -677,8 +761,12 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb, a = nla_next(a, &rem)) { int err = 0; - if (prev_port != -1) { - do_output(dp, skb_clone(skb, GFP_ATOMIC), prev_port); + if (unlikely(prev_port != -1)) { + struct sk_buff *out_skb = skb_clone(skb, GFP_ATOMIC); + + if (out_skb) + do_output(dp, out_skb, prev_port); + prev_port = -1; } @@ -695,6 +783,14 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb, execute_hash(skb, key, a); break; + case OVS_ACTION_ATTR_PUSH_MPLS: + err = push_mpls(skb, nla_data(a)); + break; + + case OVS_ACTION_ATTR_POP_MPLS: + err = pop_mpls(skb, nla_get_be16(a)); + break; + case OVS_ACTION_ATTR_PUSH_VLAN: err = push_vlan(skb, nla_data(a)); if (unlikely(err)) /* skb already freed. */ @@ -707,7 +803,7 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb, case OVS_ACTION_ATTR_RECIRC: err = execute_recirc(dp, skb, key, a, rem); - if (last_action(a, rem)) { + if (nla_is_last(a, rem)) { /* If this is the last action, the skb has * been consumed or freed. * Return immediately. @@ -769,14 +865,11 @@ static void process_deferred_actions(struct datapath *dp) /* Execute a list of actions against 'skb'. */ int ovs_execute_actions(struct datapath *dp, struct sk_buff *skb, - struct sw_flow_key *key) + struct sw_flow_actions *acts, struct sw_flow_key *key) { int level = this_cpu_read(exec_actions_level); - struct sw_flow_actions *acts; int err; - acts = rcu_dereference(OVS_CB(skb)->flow->sf_acts); - this_cpu_inc(exec_actions_level); OVS_CB(skb)->egress_tun_info = NULL; err = do_execute_actions(dp, skb, key, diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index e6d7255183eb..014485ec4b0d 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -59,6 +59,7 @@ #include "vport-netdev.h" int ovs_net_id __read_mostly; +EXPORT_SYMBOL(ovs_net_id); static struct genl_family dp_packet_genl_family; static struct genl_family dp_flow_genl_family; @@ -130,6 +131,7 @@ int lockdep_ovsl_is_held(void) else return 1; } +EXPORT_SYMBOL(lockdep_ovsl_is_held); #endif static struct vport *new_vport(const struct vport_parms *); @@ -138,19 +140,30 @@ static int queue_gso_packets(struct datapath *dp, struct sk_buff *, static int queue_userspace_packet(struct datapath *dp, struct sk_buff *, const struct dp_upcall_info *); -/* Must be called with rcu_read_lock or ovs_mutex. */ -static struct datapath *get_dp(struct net *net, int dp_ifindex) +/* Must be called with rcu_read_lock. */ +static struct datapath *get_dp_rcu(struct net *net, int dp_ifindex) { - struct datapath *dp = NULL; - struct net_device *dev; + struct net_device *dev = dev_get_by_index_rcu(net, dp_ifindex); - rcu_read_lock(); - dev = dev_get_by_index_rcu(net, dp_ifindex); if (dev) { struct vport *vport = ovs_internal_dev_get_vport(dev); if (vport) - dp = vport->dp; + return vport->dp; } + + return NULL; +} + +/* The caller must hold either ovs_mutex or rcu_read_lock to keep the + * returned dp pointer valid. + */ +static inline struct datapath *get_dp(struct net *net, int dp_ifindex) +{ + struct datapath *dp; + + WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_ovsl_is_held()); + rcu_read_lock(); + dp = get_dp_rcu(net, dp_ifindex); rcu_read_unlock(); return dp; @@ -185,6 +198,7 @@ static void destroy_dp_rcu(struct rcu_head *rcu) { struct datapath *dp = container_of(rcu, struct datapath, rcu); + ovs_flow_tbl_destroy(&dp->table); free_percpu(dp->stats_percpu); release_net(ovs_dp_get_net(dp)); kfree(dp->ports); @@ -243,6 +257,7 @@ void ovs_dp_process_packet(struct sk_buff *skb, struct sw_flow_key *key) const struct vport *p = OVS_CB(skb)->input_vport; struct datapath *dp = p->dp; struct sw_flow *flow; + struct sw_flow_actions *sf_acts; struct dp_stats_percpu *stats; u64 *stats_counter; u32 n_mask_hit; @@ -268,10 +283,10 @@ void ovs_dp_process_packet(struct sk_buff *skb, struct sw_flow_key *key) goto out; } - OVS_CB(skb)->flow = flow; + ovs_flow_stats_update(flow, key->tp.flags, skb); + sf_acts = rcu_dereference(flow->sf_acts); + ovs_execute_actions(dp, skb, sf_acts, key); - ovs_flow_stats_update(OVS_CB(skb)->flow, key->tp.flags, skb); - ovs_execute_actions(dp, skb, key); stats_counter = &stats->n_hit; out: @@ -360,37 +375,12 @@ static int queue_gso_packets(struct datapath *dp, struct sk_buff *skb, return err; } -static size_t key_attr_size(void) -{ - return nla_total_size(4) /* OVS_KEY_ATTR_PRIORITY */ - + nla_total_size(0) /* OVS_KEY_ATTR_TUNNEL */ - + nla_total_size(8) /* OVS_TUNNEL_KEY_ATTR_ID */ - + nla_total_size(4) /* OVS_TUNNEL_KEY_ATTR_IPV4_SRC */ - + nla_total_size(4) /* OVS_TUNNEL_KEY_ATTR_IPV4_DST */ - + nla_total_size(1) /* OVS_TUNNEL_KEY_ATTR_TOS */ - + nla_total_size(1) /* OVS_TUNNEL_KEY_ATTR_TTL */ - + nla_total_size(0) /* OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT */ - + nla_total_size(0) /* OVS_TUNNEL_KEY_ATTR_CSUM */ - + nla_total_size(0) /* OVS_TUNNEL_KEY_ATTR_OAM */ - + nla_total_size(256) /* OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS */ - + nla_total_size(4) /* OVS_KEY_ATTR_IN_PORT */ - + nla_total_size(4) /* OVS_KEY_ATTR_SKB_MARK */ - + nla_total_size(12) /* OVS_KEY_ATTR_ETHERNET */ - + nla_total_size(2) /* OVS_KEY_ATTR_ETHERTYPE */ - + nla_total_size(4) /* OVS_KEY_ATTR_8021Q */ - + nla_total_size(0) /* OVS_KEY_ATTR_ENCAP */ - + nla_total_size(2) /* OVS_KEY_ATTR_ETHERTYPE */ - + nla_total_size(40) /* OVS_KEY_ATTR_IPV6 */ - + nla_total_size(2) /* OVS_KEY_ATTR_ICMPV6 */ - + nla_total_size(28); /* OVS_KEY_ATTR_ND */ -} - static size_t upcall_msg_size(const struct nlattr *userdata, unsigned int hdrlen) { size_t size = NLMSG_ALIGN(sizeof(struct ovs_header)) + nla_total_size(hdrlen) /* OVS_PACKET_ATTR_PACKET */ - + nla_total_size(key_attr_size()); /* OVS_PACKET_ATTR_KEY */ + + nla_total_size(ovs_key_attr_size()); /* OVS_PACKET_ATTR_KEY */ /* OVS_PACKET_ATTR_USERDATA */ if (userdata) @@ -510,6 +500,7 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info) struct sw_flow_actions *acts; struct sk_buff *packet; struct sw_flow *flow; + struct sw_flow_actions *sf_acts; struct datapath *dp; struct ethhdr *eth; struct vport *input_vport; @@ -552,25 +543,18 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info) if (err) goto err_flow_free; - acts = ovs_nla_alloc_flow_actions(nla_len(a[OVS_PACKET_ATTR_ACTIONS])); - err = PTR_ERR(acts); - if (IS_ERR(acts)) - goto err_flow_free; - err = ovs_nla_copy_actions(a[OVS_PACKET_ATTR_ACTIONS], - &flow->key, 0, &acts); + &flow->key, &acts); if (err) goto err_flow_free; rcu_assign_pointer(flow->sf_acts, acts); - OVS_CB(packet)->egress_tun_info = NULL; - OVS_CB(packet)->flow = flow; packet->priority = flow->key.phy.priority; packet->mark = flow->key.phy.skb_mark; rcu_read_lock(); - dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex); + dp = get_dp_rcu(sock_net(skb->sk), ovs_header->dp_ifindex); err = -ENODEV; if (!dp) goto err_unlock; @@ -583,9 +567,10 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info) goto err_unlock; OVS_CB(packet)->input_vport = input_vport; + sf_acts = rcu_dereference(flow->sf_acts); local_bh_disable(); - err = ovs_execute_actions(dp, packet, &flow->key); + err = ovs_execute_actions(dp, packet, sf_acts, &flow->key); local_bh_enable(); rcu_read_unlock(); @@ -662,8 +647,8 @@ static void get_dp_stats(struct datapath *dp, struct ovs_dp_stats *stats, static size_t ovs_flow_cmd_msg_size(const struct sw_flow_actions *acts) { return NLMSG_ALIGN(sizeof(struct ovs_header)) - + nla_total_size(key_attr_size()) /* OVS_FLOW_ATTR_KEY */ - + nla_total_size(key_attr_size()) /* OVS_FLOW_ATTR_MASK */ + + nla_total_size(ovs_key_attr_size()) /* OVS_FLOW_ATTR_KEY */ + + nla_total_size(ovs_key_attr_size()) /* OVS_FLOW_ATTR_MASK */ + nla_total_size(sizeof(struct ovs_flow_stats)) /* OVS_FLOW_ATTR_STATS */ + nla_total_size(1) /* OVS_FLOW_ATTR_TCP_FLAGS */ + nla_total_size(8) /* OVS_FLOW_ATTR_USED */ @@ -671,58 +656,67 @@ static size_t ovs_flow_cmd_msg_size(const struct sw_flow_actions *acts) } /* Called with ovs_mutex or RCU read lock. */ -static int ovs_flow_cmd_fill_info(const struct sw_flow *flow, int dp_ifindex, - struct sk_buff *skb, u32 portid, - u32 seq, u32 flags, u8 cmd) +static int ovs_flow_cmd_fill_match(const struct sw_flow *flow, + struct sk_buff *skb) { - const int skb_orig_len = skb->len; - struct nlattr *start; - struct ovs_flow_stats stats; - __be16 tcp_flags; - unsigned long used; - struct ovs_header *ovs_header; struct nlattr *nla; int err; - ovs_header = genlmsg_put(skb, portid, seq, &dp_flow_genl_family, flags, cmd); - if (!ovs_header) - return -EMSGSIZE; - - ovs_header->dp_ifindex = dp_ifindex; - /* Fill flow key. */ nla = nla_nest_start(skb, OVS_FLOW_ATTR_KEY); if (!nla) - goto nla_put_failure; + return -EMSGSIZE; err = ovs_nla_put_flow(&flow->unmasked_key, &flow->unmasked_key, skb); if (err) - goto error; + return err; + nla_nest_end(skb, nla); + /* Fill flow mask. */ nla = nla_nest_start(skb, OVS_FLOW_ATTR_MASK); if (!nla) - goto nla_put_failure; + return -EMSGSIZE; err = ovs_nla_put_flow(&flow->key, &flow->mask->key, skb); if (err) - goto error; + return err; nla_nest_end(skb, nla); + return 0; +} + +/* Called with ovs_mutex or RCU read lock. */ +static int ovs_flow_cmd_fill_stats(const struct sw_flow *flow, + struct sk_buff *skb) +{ + struct ovs_flow_stats stats; + __be16 tcp_flags; + unsigned long used; ovs_flow_stats_get(flow, &stats, &used, &tcp_flags); if (used && nla_put_u64(skb, OVS_FLOW_ATTR_USED, ovs_flow_used_time(used))) - goto nla_put_failure; + return -EMSGSIZE; if (stats.n_packets && nla_put(skb, OVS_FLOW_ATTR_STATS, sizeof(struct ovs_flow_stats), &stats)) - goto nla_put_failure; + return -EMSGSIZE; if ((u8)ntohs(tcp_flags) && nla_put_u8(skb, OVS_FLOW_ATTR_TCP_FLAGS, (u8)ntohs(tcp_flags))) - goto nla_put_failure; + return -EMSGSIZE; + + return 0; +} + +/* Called with ovs_mutex or RCU read lock. */ +static int ovs_flow_cmd_fill_actions(const struct sw_flow *flow, + struct sk_buff *skb, int skb_orig_len) +{ + struct nlattr *start; + int err; /* If OVS_FLOW_ATTR_ACTIONS doesn't fit, skip dumping the actions if * this is the first flow to be dumped into 'skb'. This is unusual for @@ -746,17 +740,47 @@ static int ovs_flow_cmd_fill_info(const struct sw_flow *flow, int dp_ifindex, nla_nest_end(skb, start); else { if (skb_orig_len) - goto error; + return err; nla_nest_cancel(skb, start); } - } else if (skb_orig_len) - goto nla_put_failure; + } else if (skb_orig_len) { + return -EMSGSIZE; + } + + return 0; +} + +/* Called with ovs_mutex or RCU read lock. */ +static int ovs_flow_cmd_fill_info(const struct sw_flow *flow, int dp_ifindex, + struct sk_buff *skb, u32 portid, + u32 seq, u32 flags, u8 cmd) +{ + const int skb_orig_len = skb->len; + struct ovs_header *ovs_header; + int err; + + ovs_header = genlmsg_put(skb, portid, seq, &dp_flow_genl_family, + flags, cmd); + if (!ovs_header) + return -EMSGSIZE; + + ovs_header->dp_ifindex = dp_ifindex; + + err = ovs_flow_cmd_fill_match(flow, skb); + if (err) + goto error; + + err = ovs_flow_cmd_fill_stats(flow, skb); + if (err) + goto error; + + err = ovs_flow_cmd_fill_actions(flow, skb, skb_orig_len); + if (err) + goto error; return genlmsg_end(skb, ovs_header); -nla_put_failure: - err = -EMSGSIZE; error: genlmsg_cancel(skb, ovs_header); return err; @@ -814,10 +838,14 @@ static int ovs_flow_cmd_new(struct sk_buff *skb, struct genl_info *info) /* Must have key and actions. */ error = -EINVAL; - if (!a[OVS_FLOW_ATTR_KEY]) + if (!a[OVS_FLOW_ATTR_KEY]) { + OVS_NLERR("Flow key attribute not present in new flow.\n"); goto error; - if (!a[OVS_FLOW_ATTR_ACTIONS]) + } + if (!a[OVS_FLOW_ATTR_ACTIONS]) { + OVS_NLERR("Flow actions attribute not present in new flow.\n"); goto error; + } /* Most of the time we need to allocate a new flow, do it before * locking. @@ -838,16 +866,11 @@ static int ovs_flow_cmd_new(struct sk_buff *skb, struct genl_info *info) ovs_flow_mask_key(&new_flow->key, &new_flow->unmasked_key, &mask); /* Validate actions. */ - acts = ovs_nla_alloc_flow_actions(nla_len(a[OVS_FLOW_ATTR_ACTIONS])); - error = PTR_ERR(acts); - if (IS_ERR(acts)) - goto err_kfree_flow; - error = ovs_nla_copy_actions(a[OVS_FLOW_ATTR_ACTIONS], &new_flow->key, - 0, &acts); + &acts); if (error) { OVS_NLERR("Flow actions may not be safe on all matching packets.\n"); - goto err_kfree_acts; + goto err_kfree_flow; } reply = ovs_flow_cmd_alloc_info(acts, info, false); @@ -938,6 +961,7 @@ error: return error; } +/* Factor out action copy to avoid "Wframe-larger-than=1024" warning. */ static struct sw_flow_actions *get_flow_actions(const struct nlattr *a, const struct sw_flow_key *key, const struct sw_flow_mask *mask) @@ -946,15 +970,10 @@ static struct sw_flow_actions *get_flow_actions(const struct nlattr *a, struct sw_flow_key masked_key; int error; - acts = ovs_nla_alloc_flow_actions(nla_len(a)); - if (IS_ERR(acts)) - return acts; - ovs_flow_mask_key(&masked_key, key, mask); - error = ovs_nla_copy_actions(a, &masked_key, 0, &acts); + error = ovs_nla_copy_actions(a, &masked_key, &acts); if (error) { - OVS_NLERR("Flow actions may not be safe on all matching packets.\n"); - kfree(acts); + OVS_NLERR("Actions may not be safe on all matching packets.\n"); return ERR_PTR(error); } @@ -976,8 +995,10 @@ static int ovs_flow_cmd_set(struct sk_buff *skb, struct genl_info *info) /* Extract key. */ error = -EINVAL; - if (!a[OVS_FLOW_ATTR_KEY]) + if (!a[OVS_FLOW_ATTR_KEY]) { + OVS_NLERR("Flow key attribute not present in set flow.\n"); goto error; + } ovs_match_init(&match, &key, &mask); error = ovs_nla_get_match(&match, @@ -992,10 +1013,8 @@ static int ovs_flow_cmd_set(struct sk_buff *skb, struct genl_info *info) error = PTR_ERR(acts); goto error; } - } - /* Can allocate before locking if have acts. */ - if (acts) { + /* Can allocate before locking if have acts. */ reply = ovs_flow_cmd_alloc_info(acts, info, false); if (IS_ERR(reply)) { error = PTR_ERR(reply); @@ -1179,7 +1198,7 @@ static int ovs_flow_cmd_dump(struct sk_buff *skb, struct netlink_callback *cb) struct datapath *dp; rcu_read_lock(); - dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex); + dp = get_dp_rcu(sock_net(skb->sk), ovs_header->dp_ifindex); if (!dp) { rcu_read_unlock(); return -ENODEV; @@ -1442,7 +1461,7 @@ err_destroy_ports_array: err_destroy_percpu: free_percpu(dp->stats_percpu); err_destroy_table: - ovs_flow_tbl_destroy(&dp->table, false); + ovs_flow_tbl_destroy(&dp->table); err_free_dp: release_net(ovs_dp_get_net(dp)); kfree(dp); @@ -1474,8 +1493,6 @@ static void __dp_destroy(struct datapath *dp) ovs_dp_detach_port(ovs_vport_ovsl(dp, OVSP_LOCAL)); /* RCU destroy the flow table */ - ovs_flow_tbl_destroy(&dp->table, true); - call_rcu(&dp->rcu, destroy_dp_rcu); } @@ -1764,6 +1781,7 @@ static int ovs_vport_cmd_new(struct sk_buff *skb, struct genl_info *info) return -ENOMEM; ovs_lock(); +restart: dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex); err = -ENODEV; if (!dp) @@ -1795,8 +1813,11 @@ static int ovs_vport_cmd_new(struct sk_buff *skb, struct genl_info *info) vport = new_vport(&parms); err = PTR_ERR(vport); - if (IS_ERR(vport)) + if (IS_ERR(vport)) { + if (err == -EAGAIN) + goto restart; goto exit_unlock_free; + } err = ovs_vport_cmd_fill_info(vport, reply, info->snd_portid, info->snd_seq, 0, OVS_VPORT_CMD_NEW); @@ -1939,7 +1960,7 @@ static int ovs_vport_cmd_dump(struct sk_buff *skb, struct netlink_callback *cb) int i, j = 0; rcu_read_lock(); - dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex); + dp = get_dp_rcu(sock_net(skb->sk), ovs_header->dp_ifindex); if (!dp) { rcu_read_unlock(); return -ENODEV; @@ -2112,12 +2133,18 @@ static int __init dp_init(void) if (err) goto error_netns_exit; + err = ovs_netdev_init(); + if (err) + goto error_unreg_notifier; + err = dp_register_genl(); if (err < 0) - goto error_unreg_notifier; + goto error_unreg_netdev; return 0; +error_unreg_netdev: + ovs_netdev_exit(); error_unreg_notifier: unregister_netdevice_notifier(&ovs_dp_device_notifier); error_netns_exit: @@ -2137,6 +2164,7 @@ error: static void dp_cleanup(void) { dp_unregister_genl(ARRAY_SIZE(dp_genl_families)); + ovs_netdev_exit(); unregister_netdevice_notifier(&ovs_dp_device_notifier); unregister_pernet_device(&ovs_net_ops); rcu_barrier(); diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h index 974135439c5c..1c56a80d6677 100644 --- a/net/openvswitch/datapath.h +++ b/net/openvswitch/datapath.h @@ -94,14 +94,12 @@ struct datapath { /** * struct ovs_skb_cb - OVS data in skb CB - * @flow: The flow associated with this packet. May be %NULL if no flow. * @egress_tun_key: Tunnel information about this packet on egress path. * NULL if the packet is not being tunneled. * @input_vport: The original vport packet came in on. This value is cached * when a packet is received by OVS. */ struct ovs_skb_cb { - struct sw_flow *flow; struct ovs_tunnel_info *egress_tun_info; struct vport *input_vport; }; @@ -194,7 +192,7 @@ struct sk_buff *ovs_vport_cmd_build_info(struct vport *, u32 pid, u32 seq, u8 cmd); int ovs_execute_actions(struct datapath *dp, struct sk_buff *skb, - struct sw_flow_key *); + struct sw_flow_actions *acts, struct sw_flow_key *); void ovs_dp_notify_wq(struct work_struct *work); diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c index 2b78789ea7c5..90a21010fc8f 100644 --- a/net/openvswitch/flow.c +++ b/net/openvswitch/flow.c @@ -32,6 +32,7 @@ #include <linux/if_arp.h> #include <linux/ip.h> #include <linux/ipv6.h> +#include <linux/mpls.h> #include <linux/sctp.h> #include <linux/smp.h> #include <linux/tcp.h> @@ -42,6 +43,7 @@ #include <net/ip.h> #include <net/ip_tunnels.h> #include <net/ipv6.h> +#include <net/mpls.h> #include <net/ndisc.h> #include "datapath.h" @@ -480,6 +482,7 @@ static int key_extract(struct sk_buff *skb, struct sw_flow_key *key) return -ENOMEM; skb_reset_network_header(skb); + skb_reset_mac_len(skb); __skb_push(skb, skb->data - skb_mac_header(skb)); /* Network layer. */ @@ -584,6 +587,33 @@ static int key_extract(struct sk_buff *skb, struct sw_flow_key *key) memset(&key->ip, 0, sizeof(key->ip)); memset(&key->ipv4, 0, sizeof(key->ipv4)); } + } else if (eth_p_mpls(key->eth.type)) { + size_t stack_len = MPLS_HLEN; + + /* In the presence of an MPLS label stack the end of the L2 + * header and the beginning of the L3 header differ. + * + * Advance network_header to the beginning of the L3 + * header. mac_len corresponds to the end of the L2 header. + */ + while (1) { + __be32 lse; + + error = check_header(skb, skb->mac_len + stack_len); + if (unlikely(error)) + return 0; + + memcpy(&lse, skb_network_header(skb), MPLS_HLEN); + + if (stack_len == MPLS_HLEN) + memcpy(&key->mpls.top_lse, &lse, MPLS_HLEN); + + skb_set_network_header(skb, skb->mac_len + stack_len); + if (lse & htonl(MPLS_LS_S_MASK)) + break; + + stack_len += MPLS_HLEN; + } } else if (key->eth.type == htons(ETH_P_IPV6)) { int nh_len; /* IPv6 Header + Extensions */ diff --git a/net/openvswitch/flow.h b/net/openvswitch/flow.h index 71813318c8c7..4962bee81a11 100644 --- a/net/openvswitch/flow.h +++ b/net/openvswitch/flow.h @@ -102,12 +102,17 @@ struct sw_flow_key { __be16 tci; /* 0 if no VLAN, VLAN_TAG_PRESENT set otherwise. */ __be16 type; /* Ethernet frame type. */ } eth; - struct { - u8 proto; /* IP protocol or lower 8 bits of ARP opcode. */ - u8 tos; /* IP ToS. */ - u8 ttl; /* IP TTL/hop limit. */ - u8 frag; /* One of OVS_FRAG_TYPE_*. */ - } ip; + union { + struct { + __be32 top_lse; /* top label stack entry */ + } mpls; + struct { + u8 proto; /* IP protocol or lower 8 bits of ARP opcode. */ + u8 tos; /* IP ToS. */ + u8 ttl; /* IP TTL/hop limit. */ + u8 frag; /* One of OVS_FRAG_TYPE_*. */ + } ip; + }; struct { __be16 src; /* TCP/UDP/SCTP source port. */ __be16 dst; /* TCP/UDP/SCTP destination port. */ diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index 939bcb32100f..ed3109761827 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -46,24 +46,22 @@ #include <net/ip.h> #include <net/ipv6.h> #include <net/ndisc.h> +#include <net/mpls.h> #include "flow_netlink.h" -static void update_range__(struct sw_flow_match *match, - size_t offset, size_t size, bool is_mask) +static void update_range(struct sw_flow_match *match, + size_t offset, size_t size, bool is_mask) { - struct sw_flow_key_range *range = NULL; + struct sw_flow_key_range *range; size_t start = rounddown(offset, sizeof(long)); size_t end = roundup(offset + size, sizeof(long)); if (!is_mask) range = &match->range; - else if (match->mask) + else range = &match->mask->range; - if (!range) - return; - if (range->start == range->end) { range->start = start; range->end = end; @@ -79,22 +77,20 @@ static void update_range__(struct sw_flow_match *match, #define SW_FLOW_KEY_PUT(match, field, value, is_mask) \ do { \ - update_range__(match, offsetof(struct sw_flow_key, field), \ - sizeof((match)->key->field), is_mask); \ - if (is_mask) { \ - if ((match)->mask) \ - (match)->mask->key.field = value; \ - } else { \ + update_range(match, offsetof(struct sw_flow_key, field), \ + sizeof((match)->key->field), is_mask); \ + if (is_mask) \ + (match)->mask->key.field = value; \ + else \ (match)->key->field = value; \ - } \ } while (0) #define SW_FLOW_KEY_MEMCPY_OFFSET(match, offset, value_p, len, is_mask) \ do { \ - update_range__(match, offset, len, is_mask); \ + update_range(match, offset, len, is_mask); \ if (is_mask) \ memcpy((u8 *)&(match)->mask->key + offset, value_p, \ - len); \ + len); \ else \ memcpy((u8 *)(match)->key + offset, value_p, len); \ } while (0) @@ -103,18 +99,16 @@ static void update_range__(struct sw_flow_match *match, SW_FLOW_KEY_MEMCPY_OFFSET(match, offsetof(struct sw_flow_key, field), \ value_p, len, is_mask) -#define SW_FLOW_KEY_MEMSET_FIELD(match, field, value, is_mask) \ - do { \ - update_range__(match, offsetof(struct sw_flow_key, field), \ - sizeof((match)->key->field), is_mask); \ - if (is_mask) { \ - if ((match)->mask) \ - memset((u8 *)&(match)->mask->key.field, value,\ - sizeof((match)->mask->key.field)); \ - } else { \ +#define SW_FLOW_KEY_MEMSET_FIELD(match, field, value, is_mask) \ + do { \ + update_range(match, offsetof(struct sw_flow_key, field), \ + sizeof((match)->key->field), is_mask); \ + if (is_mask) \ + memset((u8 *)&(match)->mask->key.field, value, \ + sizeof((match)->mask->key.field)); \ + else \ memset((u8 *)&(match)->key->field, value, \ sizeof((match)->key->field)); \ - } \ } while (0) static bool match_validate(const struct sw_flow_match *match, @@ -134,7 +128,8 @@ static bool match_validate(const struct sw_flow_match *match, | (1 << OVS_KEY_ATTR_ICMP) | (1 << OVS_KEY_ATTR_ICMPV6) | (1 << OVS_KEY_ATTR_ARP) - | (1 << OVS_KEY_ATTR_ND)); + | (1 << OVS_KEY_ATTR_ND) + | (1 << OVS_KEY_ATTR_MPLS)); /* Always allowed mask fields. */ mask_allowed |= ((1 << OVS_KEY_ATTR_TUNNEL) @@ -149,6 +144,12 @@ static bool match_validate(const struct sw_flow_match *match, mask_allowed |= 1 << OVS_KEY_ATTR_ARP; } + if (eth_p_mpls(match->key->eth.type)) { + key_expected |= 1 << OVS_KEY_ATTR_MPLS; + if (match->mask && (match->mask->key.eth.type == htons(0xffff))) + mask_allowed |= 1 << OVS_KEY_ATTR_MPLS; + } + if (match->key->eth.type == htons(ETH_P_IP)) { key_expected |= 1 << OVS_KEY_ATTR_IPV4; if (match->mask && (match->mask->key.eth.type == htons(0xffff))) @@ -244,6 +245,38 @@ static bool match_validate(const struct sw_flow_match *match, return true; } +size_t ovs_key_attr_size(void) +{ + /* Whenever adding new OVS_KEY_ FIELDS, we should consider + * updating this function. + */ + BUILD_BUG_ON(OVS_KEY_ATTR_TUNNEL_INFO != 22); + + return nla_total_size(4) /* OVS_KEY_ATTR_PRIORITY */ + + nla_total_size(0) /* OVS_KEY_ATTR_TUNNEL */ + + nla_total_size(8) /* OVS_TUNNEL_KEY_ATTR_ID */ + + nla_total_size(4) /* OVS_TUNNEL_KEY_ATTR_IPV4_SRC */ + + nla_total_size(4) /* OVS_TUNNEL_KEY_ATTR_IPV4_DST */ + + nla_total_size(1) /* OVS_TUNNEL_KEY_ATTR_TOS */ + + nla_total_size(1) /* OVS_TUNNEL_KEY_ATTR_TTL */ + + nla_total_size(0) /* OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT */ + + nla_total_size(0) /* OVS_TUNNEL_KEY_ATTR_CSUM */ + + nla_total_size(0) /* OVS_TUNNEL_KEY_ATTR_OAM */ + + nla_total_size(256) /* OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS */ + + nla_total_size(4) /* OVS_KEY_ATTR_IN_PORT */ + + nla_total_size(4) /* OVS_KEY_ATTR_SKB_MARK */ + + nla_total_size(4) /* OVS_KEY_ATTR_DP_HASH */ + + nla_total_size(4) /* OVS_KEY_ATTR_RECIRC_ID */ + + nla_total_size(12) /* OVS_KEY_ATTR_ETHERNET */ + + nla_total_size(2) /* OVS_KEY_ATTR_ETHERTYPE */ + + nla_total_size(4) /* OVS_KEY_ATTR_VLAN */ + + nla_total_size(0) /* OVS_KEY_ATTR_ENCAP */ + + nla_total_size(2) /* OVS_KEY_ATTR_ETHERTYPE */ + + nla_total_size(40) /* OVS_KEY_ATTR_IPV6 */ + + nla_total_size(2) /* OVS_KEY_ATTR_ICMPV6 */ + + nla_total_size(28); /* OVS_KEY_ATTR_ND */ +} + /* The size of the argument for each %OVS_KEY_ATTR_* Netlink attribute. */ static const int ovs_key_lens[OVS_KEY_ATTR_MAX + 1] = { [OVS_KEY_ATTR_ENCAP] = -1, @@ -266,6 +299,7 @@ static const int ovs_key_lens[OVS_KEY_ATTR_MAX + 1] = { [OVS_KEY_ATTR_RECIRC_ID] = sizeof(u32), [OVS_KEY_ATTR_DP_HASH] = sizeof(u32), [OVS_KEY_ATTR_TUNNEL] = -1, + [OVS_KEY_ATTR_MPLS] = sizeof(struct ovs_key_mpls), }; static bool is_all_zero(const u8 *fp, size_t size) @@ -572,10 +606,13 @@ static int metadata_from_nlattrs(struct sw_flow_match *match, u64 *attrs, if (*attrs & (1 << OVS_KEY_ATTR_IN_PORT)) { u32 in_port = nla_get_u32(a[OVS_KEY_ATTR_IN_PORT]); - if (is_mask) + if (is_mask) { in_port = 0xffffffff; /* Always exact match in_port. */ - else if (in_port >= DP_MAX_PORTS) + } else if (in_port >= DP_MAX_PORTS) { + OVS_NLERR("Port (%d) exceeds maximum allowable (%d).\n", + in_port, DP_MAX_PORTS); return -EINVAL; + } SW_FLOW_KEY_PUT(match, phy.in_port, in_port, is_mask); *attrs &= ~(1 << OVS_KEY_ATTR_IN_PORT); @@ -602,7 +639,6 @@ static int ovs_key_from_nlattrs(struct sw_flow_match *match, u64 attrs, const struct nlattr **a, bool is_mask) { int err; - u64 orig_attrs = attrs; err = metadata_from_nlattrs(match, &attrs, a, is_mask); if (err) @@ -634,8 +670,7 @@ static int ovs_key_from_nlattrs(struct sw_flow_match *match, u64 attrs, SW_FLOW_KEY_PUT(match, eth.tci, tci, is_mask); attrs &= ~(1 << OVS_KEY_ATTR_VLAN); - } else if (!is_mask) - SW_FLOW_KEY_PUT(match, eth.tci, htons(0xffff), true); + } if (attrs & (1 << OVS_KEY_ATTR_ETHERTYPE)) { __be16 eth_type; @@ -735,6 +770,16 @@ static int ovs_key_from_nlattrs(struct sw_flow_match *match, u64 attrs, attrs &= ~(1 << OVS_KEY_ATTR_ARP); } + if (attrs & (1 << OVS_KEY_ATTR_MPLS)) { + const struct ovs_key_mpls *mpls_key; + + mpls_key = nla_data(a[OVS_KEY_ATTR_MPLS]); + SW_FLOW_KEY_PUT(match, mpls.top_lse, + mpls_key->mpls_lse, is_mask); + + attrs &= ~(1 << OVS_KEY_ATTR_MPLS); + } + if (attrs & (1 << OVS_KEY_ATTR_TCP)) { const struct ovs_key_tcp *tcp_key; @@ -745,15 +790,9 @@ static int ovs_key_from_nlattrs(struct sw_flow_match *match, u64 attrs, } if (attrs & (1 << OVS_KEY_ATTR_TCP_FLAGS)) { - if (orig_attrs & (1 << OVS_KEY_ATTR_IPV4)) { - SW_FLOW_KEY_PUT(match, tp.flags, - nla_get_be16(a[OVS_KEY_ATTR_TCP_FLAGS]), - is_mask); - } else { - SW_FLOW_KEY_PUT(match, tp.flags, - nla_get_be16(a[OVS_KEY_ATTR_TCP_FLAGS]), - is_mask); - } + SW_FLOW_KEY_PUT(match, tp.flags, + nla_get_be16(a[OVS_KEY_ATTR_TCP_FLAGS]), + is_mask); attrs &= ~(1 << OVS_KEY_ATTR_TCP_FLAGS); } @@ -812,8 +851,11 @@ static int ovs_key_from_nlattrs(struct sw_flow_match *match, u64 attrs, attrs &= ~(1 << OVS_KEY_ATTR_ND); } - if (attrs != 0) + if (attrs != 0) { + OVS_NLERR("Unknown key attributes (%llx).\n", + (unsigned long long)attrs); return -EINVAL; + } return 0; } @@ -853,8 +895,8 @@ static void mask_set_nlattr(struct nlattr *attr, u8 val) * attribute specifies the mask field of the wildcarded flow. */ int ovs_nla_get_match(struct sw_flow_match *match, - const struct nlattr *key, - const struct nlattr *mask) + const struct nlattr *nla_key, + const struct nlattr *nla_mask) { const struct nlattr *a[OVS_KEY_ATTR_MAX + 1]; const struct nlattr *encap; @@ -864,7 +906,7 @@ int ovs_nla_get_match(struct sw_flow_match *match, bool encap_valid = false; int err; - err = parse_flow_nlattrs(key, a, &key_attrs); + err = parse_flow_nlattrs(nla_key, a, &key_attrs); if (err) return err; @@ -905,36 +947,43 @@ int ovs_nla_get_match(struct sw_flow_match *match, if (err) return err; - if (match->mask && !mask) { - /* Create an exact match mask. We need to set to 0xff all the - * 'match->mask' fields that have been touched in 'match->key'. - * We cannot simply memset 'match->mask', because padding bytes - * and fields not specified in 'match->key' should be left to 0. - * Instead, we use a stream of netlink attributes, copied from - * 'key' and set to 0xff: ovs_key_from_nlattrs() will take care - * of filling 'match->mask' appropriately. - */ - newmask = kmemdup(key, nla_total_size(nla_len(key)), - GFP_KERNEL); - if (!newmask) - return -ENOMEM; + if (match->mask) { + if (!nla_mask) { + /* Create an exact match mask. We need to set to 0xff + * all the 'match->mask' fields that have been touched + * in 'match->key'. We cannot simply memset + * 'match->mask', because padding bytes and fields not + * specified in 'match->key' should be left to 0. + * Instead, we use a stream of netlink attributes, + * copied from 'key' and set to 0xff. + * ovs_key_from_nlattrs() will take care of filling + * 'match->mask' appropriately. + */ + newmask = kmemdup(nla_key, + nla_total_size(nla_len(nla_key)), + GFP_KERNEL); + if (!newmask) + return -ENOMEM; - mask_set_nlattr(newmask, 0xff); + mask_set_nlattr(newmask, 0xff); - /* The userspace does not send tunnel attributes that are 0, - * but we should not wildcard them nonetheless. - */ - if (match->key->tun_key.ipv4_dst) - SW_FLOW_KEY_MEMSET_FIELD(match, tun_key, 0xff, true); + /* The userspace does not send tunnel attributes that + * are 0, but we should not wildcard them nonetheless. + */ + if (match->key->tun_key.ipv4_dst) + SW_FLOW_KEY_MEMSET_FIELD(match, tun_key, + 0xff, true); - mask = newmask; - } + nla_mask = newmask; + } - if (mask) { - err = parse_flow_mask_nlattrs(mask, a, &mask_attrs); + err = parse_flow_mask_nlattrs(nla_mask, a, &mask_attrs); if (err) goto free_newmask; + /* Always match on tci. */ + SW_FLOW_KEY_PUT(match, eth.tci, htons(0xffff), true); + if (mask_attrs & 1 << OVS_KEY_ATTR_ENCAP) { __be16 eth_type = 0; __be16 tci = 0; @@ -1140,6 +1189,14 @@ int ovs_nla_put_flow(const struct sw_flow_key *swkey, arp_key->arp_op = htons(output->ip.proto); ether_addr_copy(arp_key->arp_sha, output->ipv4.arp.sha); ether_addr_copy(arp_key->arp_tha, output->ipv4.arp.tha); + } else if (eth_p_mpls(swkey->eth.type)) { + struct ovs_key_mpls *mpls_key; + + nla = nla_reserve(skb, OVS_KEY_ATTR_MPLS, sizeof(*mpls_key)); + if (!nla) + goto nla_put_failure; + mpls_key = nla_data(nla); + mpls_key->mpls_lse = output->mpls.top_lse; } if ((swkey->eth.type == htons(ETH_P_IP) || @@ -1226,12 +1283,14 @@ nla_put_failure: #define MAX_ACTIONS_BUFSIZE (32 * 1024) -struct sw_flow_actions *ovs_nla_alloc_flow_actions(int size) +static struct sw_flow_actions *nla_alloc_flow_actions(int size) { struct sw_flow_actions *sfa; - if (size > MAX_ACTIONS_BUFSIZE) + if (size > MAX_ACTIONS_BUFSIZE) { + OVS_NLERR("Flow action size (%u bytes) exceeds maximum", size); return ERR_PTR(-EINVAL); + } sfa = kmalloc(sizeof(*sfa) + size, GFP_KERNEL); if (!sfa) @@ -1269,7 +1328,7 @@ static struct nlattr *reserve_sfa_size(struct sw_flow_actions **sfa, new_acts_size = MAX_ACTIONS_BUFSIZE; } - acts = ovs_nla_alloc_flow_actions(new_acts_size); + acts = nla_alloc_flow_actions(new_acts_size); if (IS_ERR(acts)) return (void *)acts; @@ -1336,9 +1395,15 @@ static inline void add_nested_action_end(struct sw_flow_actions *sfa, a->nla_len = sfa->actions_len - st_offset; } +static int __ovs_nla_copy_actions(const struct nlattr *attr, + const struct sw_flow_key *key, + int depth, struct sw_flow_actions **sfa, + __be16 eth_type, __be16 vlan_tci); + static int validate_and_copy_sample(const struct nlattr *attr, const struct sw_flow_key *key, int depth, - struct sw_flow_actions **sfa) + struct sw_flow_actions **sfa, + __be16 eth_type, __be16 vlan_tci) { const struct nlattr *attrs[OVS_SAMPLE_ATTR_MAX + 1]; const struct nlattr *probability, *actions; @@ -1375,7 +1440,8 @@ static int validate_and_copy_sample(const struct nlattr *attr, if (st_acts < 0) return st_acts; - err = ovs_nla_copy_actions(actions, key, depth + 1, sfa); + err = __ovs_nla_copy_actions(actions, key, depth + 1, sfa, + eth_type, vlan_tci); if (err) return err; @@ -1385,10 +1451,10 @@ static int validate_and_copy_sample(const struct nlattr *attr, return 0; } -static int validate_tp_port(const struct sw_flow_key *flow_key) +static int validate_tp_port(const struct sw_flow_key *flow_key, + __be16 eth_type) { - if ((flow_key->eth.type == htons(ETH_P_IP) || - flow_key->eth.type == htons(ETH_P_IPV6)) && + if ((eth_type == htons(ETH_P_IP) || eth_type == htons(ETH_P_IPV6)) && (flow_key->tp.src || flow_key->tp.dst)) return 0; @@ -1483,7 +1549,7 @@ static int validate_and_copy_set_tun(const struct nlattr *attr, static int validate_set(const struct nlattr *a, const struct sw_flow_key *flow_key, struct sw_flow_actions **sfa, - bool *set_tun) + bool *set_tun, __be16 eth_type) { const struct nlattr *ovs_key = nla_data(a); int key_type = nla_type(ovs_key); @@ -1508,6 +1574,9 @@ static int validate_set(const struct nlattr *a, break; case OVS_KEY_ATTR_TUNNEL: + if (eth_p_mpls(eth_type)) + return -EINVAL; + *set_tun = true; err = validate_and_copy_set_tun(a, sfa); if (err) @@ -1515,7 +1584,7 @@ static int validate_set(const struct nlattr *a, break; case OVS_KEY_ATTR_IPV4: - if (flow_key->eth.type != htons(ETH_P_IP)) + if (eth_type != htons(ETH_P_IP)) return -EINVAL; if (!flow_key->ip.proto) @@ -1531,7 +1600,7 @@ static int validate_set(const struct nlattr *a, break; case OVS_KEY_ATTR_IPV6: - if (flow_key->eth.type != htons(ETH_P_IPV6)) + if (eth_type != htons(ETH_P_IPV6)) return -EINVAL; if (!flow_key->ip.proto) @@ -1553,19 +1622,24 @@ static int validate_set(const struct nlattr *a, if (flow_key->ip.proto != IPPROTO_TCP) return -EINVAL; - return validate_tp_port(flow_key); + return validate_tp_port(flow_key, eth_type); case OVS_KEY_ATTR_UDP: if (flow_key->ip.proto != IPPROTO_UDP) return -EINVAL; - return validate_tp_port(flow_key); + return validate_tp_port(flow_key, eth_type); + + case OVS_KEY_ATTR_MPLS: + if (!eth_p_mpls(eth_type)) + return -EINVAL; + break; case OVS_KEY_ATTR_SCTP: if (flow_key->ip.proto != IPPROTO_SCTP) return -EINVAL; - return validate_tp_port(flow_key); + return validate_tp_port(flow_key, eth_type); default: return -EINVAL; @@ -1609,12 +1683,13 @@ static int copy_action(const struct nlattr *from, return 0; } -int ovs_nla_copy_actions(const struct nlattr *attr, - const struct sw_flow_key *key, - int depth, - struct sw_flow_actions **sfa) +static int __ovs_nla_copy_actions(const struct nlattr *attr, + const struct sw_flow_key *key, + int depth, struct sw_flow_actions **sfa, + __be16 eth_type, __be16 vlan_tci) { const struct nlattr *a; + bool out_tnl_port = false; int rem, err; if (depth >= SAMPLE_ACTION_DEPTH) @@ -1626,6 +1701,8 @@ int ovs_nla_copy_actions(const struct nlattr *attr, [OVS_ACTION_ATTR_OUTPUT] = sizeof(u32), [OVS_ACTION_ATTR_RECIRC] = sizeof(u32), [OVS_ACTION_ATTR_USERSPACE] = (u32)-1, + [OVS_ACTION_ATTR_PUSH_MPLS] = sizeof(struct ovs_action_push_mpls), + [OVS_ACTION_ATTR_POP_MPLS] = sizeof(__be16), [OVS_ACTION_ATTR_PUSH_VLAN] = sizeof(struct ovs_action_push_vlan), [OVS_ACTION_ATTR_POP_VLAN] = 0, [OVS_ACTION_ATTR_SET] = (u32)-1, @@ -1655,6 +1732,8 @@ int ovs_nla_copy_actions(const struct nlattr *attr, case OVS_ACTION_ATTR_OUTPUT: if (nla_get_u32(a) >= DP_MAX_PORTS) return -EINVAL; + out_tnl_port = false; + break; case OVS_ACTION_ATTR_HASH: { @@ -1671,6 +1750,7 @@ int ovs_nla_copy_actions(const struct nlattr *attr, } case OVS_ACTION_ATTR_POP_VLAN: + vlan_tci = htons(0); break; case OVS_ACTION_ATTR_PUSH_VLAN: @@ -1679,25 +1759,73 @@ int ovs_nla_copy_actions(const struct nlattr *attr, return -EINVAL; if (!(vlan->vlan_tci & htons(VLAN_TAG_PRESENT))) return -EINVAL; + vlan_tci = vlan->vlan_tci; break; case OVS_ACTION_ATTR_RECIRC: break; + case OVS_ACTION_ATTR_PUSH_MPLS: { + const struct ovs_action_push_mpls *mpls = nla_data(a); + + /* Networking stack do not allow simultaneous Tunnel + * and MPLS GSO. + */ + if (out_tnl_port) + return -EINVAL; + + if (!eth_p_mpls(mpls->mpls_ethertype)) + return -EINVAL; + /* Prohibit push MPLS other than to a white list + * for packets that have a known tag order. + */ + if (vlan_tci & htons(VLAN_TAG_PRESENT) || + (eth_type != htons(ETH_P_IP) && + eth_type != htons(ETH_P_IPV6) && + eth_type != htons(ETH_P_ARP) && + eth_type != htons(ETH_P_RARP) && + !eth_p_mpls(eth_type))) + return -EINVAL; + eth_type = mpls->mpls_ethertype; + break; + } + + case OVS_ACTION_ATTR_POP_MPLS: + if (vlan_tci & htons(VLAN_TAG_PRESENT) || + !eth_p_mpls(eth_type)) + return -EINVAL; + + /* Disallow subsequent L2.5+ set and mpls_pop actions + * as there is no check here to ensure that the new + * eth_type is valid and thus set actions could + * write off the end of the packet or otherwise + * corrupt it. + * + * Support for these actions is planned using packet + * recirculation. + */ + eth_type = htons(0); + break; + case OVS_ACTION_ATTR_SET: - err = validate_set(a, key, sfa, &skip_copy); + err = validate_set(a, key, sfa, + &out_tnl_port, eth_type); if (err) return err; + + skip_copy = out_tnl_port; break; case OVS_ACTION_ATTR_SAMPLE: - err = validate_and_copy_sample(a, key, depth, sfa); + err = validate_and_copy_sample(a, key, depth, sfa, + eth_type, vlan_tci); if (err) return err; skip_copy = true; break; default: + OVS_NLERR("Unknown tunnel attribute (%d).\n", type); return -EINVAL; } if (!skip_copy) { @@ -1713,6 +1841,24 @@ int ovs_nla_copy_actions(const struct nlattr *attr, return 0; } +int ovs_nla_copy_actions(const struct nlattr *attr, + const struct sw_flow_key *key, + struct sw_flow_actions **sfa) +{ + int err; + + *sfa = nla_alloc_flow_actions(nla_len(attr)); + if (IS_ERR(*sfa)) + return PTR_ERR(*sfa); + + err = __ovs_nla_copy_actions(attr, key, 0, sfa, key->eth.type, + key->eth.tci); + if (err) + kfree(*sfa); + + return err; +} + static int sample_action_to_attr(const struct nlattr *attr, struct sk_buff *skb) { const struct nlattr *a; diff --git a/net/openvswitch/flow_netlink.h b/net/openvswitch/flow_netlink.h index 206e45add888..eb0b177300ad 100644 --- a/net/openvswitch/flow_netlink.h +++ b/net/openvswitch/flow_netlink.h @@ -37,6 +37,8 @@ #include "flow.h" +size_t ovs_key_attr_size(void); + void ovs_match_init(struct sw_flow_match *match, struct sw_flow_key *key, struct sw_flow_mask *mask); @@ -49,12 +51,11 @@ int ovs_nla_get_match(struct sw_flow_match *match, const struct nlattr *); int ovs_nla_copy_actions(const struct nlattr *attr, - const struct sw_flow_key *key, int depth, + const struct sw_flow_key *key, struct sw_flow_actions **sfa); int ovs_nla_put_actions(const struct nlattr *attr, int len, struct sk_buff *skb); -struct sw_flow_actions *ovs_nla_alloc_flow_actions(int actions_len); void ovs_nla_free_flow_actions(struct sw_flow_actions *); #endif /* flow_netlink.h */ diff --git a/net/openvswitch/flow_table.c b/net/openvswitch/flow_table.c index cf2d853646f0..90f8b40a350b 100644 --- a/net/openvswitch/flow_table.c +++ b/net/openvswitch/flow_table.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2007-2013 Nicira, Inc. + * Copyright (c) 2007-2014 Nicira, Inc. * * This program is free software; you can redistribute it and/or * modify it under the terms of version 2 of the GNU General Public @@ -250,11 +250,14 @@ skip_flows: __table_instance_destroy(ti); } -void ovs_flow_tbl_destroy(struct flow_table *table, bool deferred) +/* No need for locking this function is called from RCU callback or + * error path. + */ +void ovs_flow_tbl_destroy(struct flow_table *table) { - struct table_instance *ti = ovsl_dereference(table->ti); + struct table_instance *ti = rcu_dereference_raw(table->ti); - table_instance_destroy(ti, deferred); + table_instance_destroy(ti, false); } struct sw_flow *ovs_flow_tbl_dump_next(struct table_instance *ti, diff --git a/net/openvswitch/flow_table.h b/net/openvswitch/flow_table.h index 5918bff7f3f6..f682c8c07f44 100644 --- a/net/openvswitch/flow_table.h +++ b/net/openvswitch/flow_table.h @@ -62,7 +62,7 @@ void ovs_flow_free(struct sw_flow *, bool deferred); int ovs_flow_tbl_init(struct flow_table *); int ovs_flow_tbl_count(struct flow_table *table); -void ovs_flow_tbl_destroy(struct flow_table *table, bool deferred); +void ovs_flow_tbl_destroy(struct flow_table *table); int ovs_flow_tbl_flush(struct flow_table *flow_table); int ovs_flow_tbl_insert(struct flow_table *table, struct sw_flow *flow, diff --git a/net/openvswitch/vport-geneve.c b/net/openvswitch/vport-geneve.c index 106a9d80b663..70c9765011f4 100644 --- a/net/openvswitch/vport-geneve.c +++ b/net/openvswitch/vport-geneve.c @@ -17,6 +17,7 @@ #include <linux/rculist.h> #include <linux/udp.h> #include <linux/if_vlan.h> +#include <linux/module.h> #include <net/geneve.h> #include <net/icmp.h> @@ -28,6 +29,8 @@ #include "datapath.h" #include "vport.h" +static struct vport_ops ovs_geneve_vport_ops; + /** * struct geneve_port - Keeps track of open UDP ports * @gs: The socket created for this port number. @@ -225,11 +228,29 @@ static const char *geneve_get_name(const struct vport *vport) return geneve_port->name; } -const struct vport_ops ovs_geneve_vport_ops = { +static struct vport_ops ovs_geneve_vport_ops = { .type = OVS_VPORT_TYPE_GENEVE, .create = geneve_tnl_create, .destroy = geneve_tnl_destroy, .get_name = geneve_get_name, .get_options = geneve_get_options, .send = geneve_tnl_send, + .owner = THIS_MODULE, }; + +static int __init ovs_geneve_tnl_init(void) +{ + return ovs_vport_ops_register(&ovs_geneve_vport_ops); +} + +static void __exit ovs_geneve_tnl_exit(void) +{ + ovs_vport_ops_unregister(&ovs_geneve_vport_ops); +} + +module_init(ovs_geneve_tnl_init); +module_exit(ovs_geneve_tnl_exit); + +MODULE_DESCRIPTION("OVS: Geneve swiching port"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("vport-type-5"); diff --git a/net/openvswitch/vport-gre.c b/net/openvswitch/vport-gre.c index 108b82da2fd9..00270b608844 100644 --- a/net/openvswitch/vport-gre.c +++ b/net/openvswitch/vport-gre.c @@ -29,6 +29,7 @@ #include <linux/jhash.h> #include <linux/list.h> #include <linux/kernel.h> +#include <linux/module.h> #include <linux/workqueue.h> #include <linux/rculist.h> #include <net/route.h> @@ -45,6 +46,8 @@ #include "datapath.h" #include "vport.h" +static struct vport_ops ovs_gre_vport_ops; + /* Returns the least-significant 32 bits of a __be64. */ static __be32 be64_get_low32(__be64 x) { @@ -281,10 +284,28 @@ static void gre_tnl_destroy(struct vport *vport) gre_exit(); } -const struct vport_ops ovs_gre_vport_ops = { +static struct vport_ops ovs_gre_vport_ops = { .type = OVS_VPORT_TYPE_GRE, .create = gre_create, .destroy = gre_tnl_destroy, .get_name = gre_get_name, .send = gre_tnl_send, + .owner = THIS_MODULE, }; + +static int __init ovs_gre_tnl_init(void) +{ + return ovs_vport_ops_register(&ovs_gre_vport_ops); +} + +static void __exit ovs_gre_tnl_exit(void) +{ + ovs_vport_ops_unregister(&ovs_gre_vport_ops); +} + +module_init(ovs_gre_tnl_init); +module_exit(ovs_gre_tnl_exit); + +MODULE_DESCRIPTION("OVS: GRE switching port"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("vport-type-3"); diff --git a/net/openvswitch/vport-internal_dev.c b/net/openvswitch/vport-internal_dev.c index 84516126e5f3..6a55f7105505 100644 --- a/net/openvswitch/vport-internal_dev.c +++ b/net/openvswitch/vport-internal_dev.c @@ -36,6 +36,8 @@ struct internal_dev { struct vport *vport; }; +static struct vport_ops ovs_internal_vport_ops; + static struct internal_dev *internal_dev_priv(struct net_device *netdev) { return netdev_priv(netdev); @@ -222,6 +224,11 @@ static int internal_dev_recv(struct vport *vport, struct sk_buff *skb) struct net_device *netdev = netdev_vport_priv(vport)->dev; int len; + if (unlikely(!(netdev->flags & IFF_UP))) { + kfree_skb(skb); + return 0; + } + len = skb->len; skb_dst_drop(skb); @@ -238,7 +245,7 @@ static int internal_dev_recv(struct vport *vport, struct sk_buff *skb) return len; } -const struct vport_ops ovs_internal_vport_ops = { +static struct vport_ops ovs_internal_vport_ops = { .type = OVS_VPORT_TYPE_INTERNAL, .create = internal_dev_create, .destroy = internal_dev_destroy, @@ -261,10 +268,21 @@ struct vport *ovs_internal_dev_get_vport(struct net_device *netdev) int ovs_internal_dev_rtnl_link_register(void) { - return rtnl_link_register(&internal_dev_link_ops); + int err; + + err = rtnl_link_register(&internal_dev_link_ops); + if (err < 0) + return err; + + err = ovs_vport_ops_register(&ovs_internal_vport_ops); + if (err < 0) + rtnl_link_unregister(&internal_dev_link_ops); + + return err; } void ovs_internal_dev_rtnl_link_unregister(void) { + ovs_vport_ops_unregister(&ovs_internal_vport_ops); rtnl_link_unregister(&internal_dev_link_ops); } diff --git a/net/openvswitch/vport-netdev.c b/net/openvswitch/vport-netdev.c index d21f77d875ba..877ee74b4f08 100644 --- a/net/openvswitch/vport-netdev.c +++ b/net/openvswitch/vport-netdev.c @@ -33,6 +33,8 @@ #include "vport-internal_dev.h" #include "vport-netdev.h" +static struct vport_ops ovs_netdev_vport_ops; + /* Must be called with rcu_read_lock. */ static void netdev_port_receive(struct vport *vport, struct sk_buff *skb) { @@ -224,10 +226,20 @@ struct vport *ovs_netdev_get_vport(struct net_device *dev) return NULL; } -const struct vport_ops ovs_netdev_vport_ops = { +static struct vport_ops ovs_netdev_vport_ops = { .type = OVS_VPORT_TYPE_NETDEV, .create = netdev_create, .destroy = netdev_destroy, .get_name = ovs_netdev_get_name, .send = netdev_send, }; + +int __init ovs_netdev_init(void) +{ + return ovs_vport_ops_register(&ovs_netdev_vport_ops); +} + +void ovs_netdev_exit(void) +{ + ovs_vport_ops_unregister(&ovs_netdev_vport_ops); +} diff --git a/net/openvswitch/vport-netdev.h b/net/openvswitch/vport-netdev.h index 8df01c1127e5..6f7038e79c52 100644 --- a/net/openvswitch/vport-netdev.h +++ b/net/openvswitch/vport-netdev.h @@ -41,4 +41,7 @@ netdev_vport_priv(const struct vport *vport) const char *ovs_netdev_get_name(const struct vport *); void ovs_netdev_detach_dev(struct vport *); +int __init ovs_netdev_init(void); +void ovs_netdev_exit(void); + #endif /* vport_netdev.h */ diff --git a/net/openvswitch/vport-vxlan.c b/net/openvswitch/vport-vxlan.c index 2735e01dca73..965e7500c5a6 100644 --- a/net/openvswitch/vport-vxlan.c +++ b/net/openvswitch/vport-vxlan.c @@ -24,6 +24,7 @@ #include <linux/net.h> #include <linux/rculist.h> #include <linux/udp.h> +#include <linux/module.h> #include <net/icmp.h> #include <net/ip.h> @@ -50,6 +51,8 @@ struct vxlan_port { char name[IFNAMSIZ]; }; +static struct vport_ops ovs_vxlan_vport_ops; + static inline struct vxlan_port *vxlan_vport(const struct vport *vport) { return vport_priv(vport); @@ -192,11 +195,29 @@ static const char *vxlan_get_name(const struct vport *vport) return vxlan_port->name; } -const struct vport_ops ovs_vxlan_vport_ops = { +static struct vport_ops ovs_vxlan_vport_ops = { .type = OVS_VPORT_TYPE_VXLAN, .create = vxlan_tnl_create, .destroy = vxlan_tnl_destroy, .get_name = vxlan_get_name, .get_options = vxlan_get_options, .send = vxlan_tnl_send, + .owner = THIS_MODULE, }; + +static int __init ovs_vxlan_tnl_init(void) +{ + return ovs_vport_ops_register(&ovs_vxlan_vport_ops); +} + +static void __exit ovs_vxlan_tnl_exit(void) +{ + ovs_vport_ops_unregister(&ovs_vxlan_vport_ops); +} + +module_init(ovs_vxlan_tnl_init); +module_exit(ovs_vxlan_tnl_exit); + +MODULE_DESCRIPTION("OVS: VXLAN switching port"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("vport-type-4"); diff --git a/net/openvswitch/vport.c b/net/openvswitch/vport.c index 6015802ebe6f..8168ef021337 100644 --- a/net/openvswitch/vport.c +++ b/net/openvswitch/vport.c @@ -28,6 +28,7 @@ #include <linux/rtnetlink.h> #include <linux/compat.h> #include <net/net_namespace.h> +#include <linux/module.h> #include "datapath.h" #include "vport.h" @@ -36,22 +37,7 @@ static void ovs_vport_record_error(struct vport *, enum vport_err_type err_type); -/* List of statically compiled vport implementations. Don't forget to also - * add yours to the list at the bottom of vport.h. */ -static const struct vport_ops *vport_ops_list[] = { - &ovs_netdev_vport_ops, - &ovs_internal_vport_ops, - -#ifdef CONFIG_OPENVSWITCH_GRE - &ovs_gre_vport_ops, -#endif -#ifdef CONFIG_OPENVSWITCH_VXLAN - &ovs_vxlan_vport_ops, -#endif -#ifdef CONFIG_OPENVSWITCH_GENEVE - &ovs_geneve_vport_ops, -#endif -}; +static LIST_HEAD(vport_ops_list); /* Protected by RCU read lock for reading, ovs_mutex for writing. */ static struct hlist_head *dev_table; @@ -88,6 +74,32 @@ static struct hlist_head *hash_bucket(struct net *net, const char *name) return &dev_table[hash & (VPORT_HASH_BUCKETS - 1)]; } +int ovs_vport_ops_register(struct vport_ops *ops) +{ + int err = -EEXIST; + struct vport_ops *o; + + ovs_lock(); + list_for_each_entry(o, &vport_ops_list, list) + if (ops->type == o->type) + goto errout; + + list_add_tail(&ops->list, &vport_ops_list); + err = 0; +errout: + ovs_unlock(); + return err; +} +EXPORT_SYMBOL(ovs_vport_ops_register); + +void ovs_vport_ops_unregister(struct vport_ops *ops) +{ + ovs_lock(); + list_del(&ops->list); + ovs_unlock(); +} +EXPORT_SYMBOL(ovs_vport_ops_unregister); + /** * ovs_vport_locate - find a port that has already been created * @@ -153,6 +165,7 @@ struct vport *ovs_vport_alloc(int priv_size, const struct vport_ops *ops, return vport; } +EXPORT_SYMBOL(ovs_vport_alloc); /** * ovs_vport_free - uninitialize and free vport @@ -173,6 +186,18 @@ void ovs_vport_free(struct vport *vport) free_percpu(vport->percpu_stats); kfree(vport); } +EXPORT_SYMBOL(ovs_vport_free); + +static struct vport_ops *ovs_vport_lookup(const struct vport_parms *parms) +{ + struct vport_ops *ops; + + list_for_each_entry(ops, &vport_ops_list, list) + if (ops->type == parms->type) + return ops; + + return NULL; +} /** * ovs_vport_add - add vport device (for kernel callers) @@ -184,31 +209,40 @@ void ovs_vport_free(struct vport *vport) */ struct vport *ovs_vport_add(const struct vport_parms *parms) { + struct vport_ops *ops; struct vport *vport; - int err = 0; - int i; - for (i = 0; i < ARRAY_SIZE(vport_ops_list); i++) { - if (vport_ops_list[i]->type == parms->type) { - struct hlist_head *bucket; + ops = ovs_vport_lookup(parms); + if (ops) { + struct hlist_head *bucket; - vport = vport_ops_list[i]->create(parms); - if (IS_ERR(vport)) { - err = PTR_ERR(vport); - goto out; - } + if (!try_module_get(ops->owner)) + return ERR_PTR(-EAFNOSUPPORT); - bucket = hash_bucket(ovs_dp_get_net(vport->dp), - vport->ops->get_name(vport)); - hlist_add_head_rcu(&vport->hash_node, bucket); + vport = ops->create(parms); + if (IS_ERR(vport)) { + module_put(ops->owner); return vport; } + + bucket = hash_bucket(ovs_dp_get_net(vport->dp), + vport->ops->get_name(vport)); + hlist_add_head_rcu(&vport->hash_node, bucket); + return vport; } - err = -EAFNOSUPPORT; + /* Unlock to attempt module load and return -EAGAIN if load + * was successful as we need to restart the port addition + * workflow. + */ + ovs_unlock(); + request_module("vport-type-%d", parms->type); + ovs_lock(); -out: - return ERR_PTR(err); + if (!ovs_vport_lookup(parms)) + return ERR_PTR(-EAFNOSUPPORT); + else + return ERR_PTR(-EAGAIN); } /** @@ -242,6 +276,8 @@ void ovs_vport_del(struct vport *vport) hlist_del_rcu(&vport->hash_node); vport->ops->destroy(vport); + + module_put(vport->ops->owner); } /** @@ -457,6 +493,7 @@ void ovs_vport_receive(struct vport *vport, struct sk_buff *skb, } ovs_dp_process_packet(skb, &key); } +EXPORT_SYMBOL(ovs_vport_receive); /** * ovs_vport_send - send a packet on a device @@ -535,3 +572,4 @@ void ovs_vport_deferred_free(struct vport *vport) call_rcu(&vport->rcu, free_vport_rcu); } +EXPORT_SYMBOL(ovs_vport_deferred_free); diff --git a/net/openvswitch/vport.h b/net/openvswitch/vport.h index 8942125de3a6..e41c3facf799 100644 --- a/net/openvswitch/vport.h +++ b/net/openvswitch/vport.h @@ -161,6 +161,9 @@ struct vport_ops { const char *(*get_name)(const struct vport *); int (*send)(struct vport *, struct sk_buff *); + + struct module *owner; + struct list_head list; }; enum vport_err_type { @@ -209,14 +212,6 @@ static inline struct vport *vport_from_priv(void *priv) void ovs_vport_receive(struct vport *, struct sk_buff *, struct ovs_tunnel_info *); -/* List of statically compiled vport implementations. Don't forget to also - * add yours to the list at the top of vport.c. */ -extern const struct vport_ops ovs_netdev_vport_ops; -extern const struct vport_ops ovs_internal_vport_ops; -extern const struct vport_ops ovs_gre_vport_ops; -extern const struct vport_ops ovs_vxlan_vport_ops; -extern const struct vport_ops ovs_geneve_vport_ops; - static inline void ovs_skb_postpush_rcsum(struct sk_buff *skb, const void *start, unsigned int len) { @@ -224,4 +219,7 @@ static inline void ovs_skb_postpush_rcsum(struct sk_buff *skb, skb->csum = csum_add(skb->csum, csum_partial(start, len, 0)); } +int ovs_vport_ops_register(struct vport_ops *ops); +void ovs_vport_ops_unregister(struct vport_ops *ops); + #endif /* vport.h */ diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 87d20f48ff06..4cd13d8de44b 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -2953,7 +2953,7 @@ static int packet_recvmsg(struct kiocb *iocb, struct socket *sock, msg->msg_flags |= MSG_TRUNC; } - err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + err = skb_copy_datagram_msg(skb, 0, msg, copied); if (err) goto out_free; diff --git a/net/phonet/datagram.c b/net/phonet/datagram.c index 290352c0e6b4..0918bc21eae6 100644 --- a/net/phonet/datagram.c +++ b/net/phonet/datagram.c @@ -150,7 +150,7 @@ static int pn_recvmsg(struct kiocb *iocb, struct sock *sk, copylen = len; } - rval = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copylen); + rval = skb_copy_datagram_msg(skb, 0, msg, copylen); if (rval) { rval = -EFAULT; goto out; diff --git a/net/phonet/pep.c b/net/phonet/pep.c index 70a547ea5177..44b2123e22b8 100644 --- a/net/phonet/pep.c +++ b/net/phonet/pep.c @@ -1296,7 +1296,7 @@ copy: else len = skb->len; - err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, len); + err = skb_copy_datagram_msg(skb, 0, msg, len); if (!err) err = (flags & MSG_TRUNC) ? skb->len : len; diff --git a/net/rose/af_rose.c b/net/rose/af_rose.c index a85c1a086ae4..9b600c20a7a3 100644 --- a/net/rose/af_rose.c +++ b/net/rose/af_rose.c @@ -1249,7 +1249,7 @@ static int rose_recvmsg(struct kiocb *iocb, struct socket *sock, msg->msg_flags |= MSG_TRUNC; } - skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + skb_copy_datagram_msg(skb, 0, msg, copied); if (msg->msg_name) { struct sockaddr_rose *srose; diff --git a/net/rxrpc/ar-recvmsg.c b/net/rxrpc/ar-recvmsg.c index e9aaa65c0778..4575485ad1b4 100644 --- a/net/rxrpc/ar-recvmsg.c +++ b/net/rxrpc/ar-recvmsg.c @@ -180,7 +180,7 @@ int rxrpc_recvmsg(struct kiocb *iocb, struct socket *sock, if (copy > len - copied) copy = len - copied; - ret = skb_copy_datagram_iovec(skb, offset, msg->msg_iov, copy); + ret = skb_copy_datagram_msg(skb, offset, msg, copy); if (ret < 0) goto copy_error; diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c index d6bcbd9f7791..7fffc2272701 100644 --- a/net/sched/act_gact.c +++ b/net/sched/act_gact.c @@ -1,5 +1,5 @@ /* - * net/sched/gact.c Generic actions + * net/sched/act_gact.c Generic actions * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c index 8a64a0734aee..cbc8dd7dd48a 100644 --- a/net/sched/act_ipt.c +++ b/net/sched/act_ipt.c @@ -1,5 +1,5 @@ /* - * net/sched/ipt.c iptables target interface + * net/sched/act_ipt.c iptables target interface * *TODO: Add other tables. For now we only support the ipv4 table targets * diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index eb48306033d9..5953517ec059 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -1,5 +1,5 @@ /* - * net/sched/mirred.c packet mirroring and redirect actions + * net/sched/act_mirred.c packet mirroring and redirect actions * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c index 5f9bcb2e080b..59649d588d79 100644 --- a/net/sched/act_pedit.c +++ b/net/sched/act_pedit.c @@ -1,5 +1,5 @@ /* - * net/sched/pedit.c Generic packet editor + * net/sched/act_pedit.c Generic packet editor * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License diff --git a/net/sched/act_police.c b/net/sched/act_police.c index 69791ca77a05..9a1c42a43f92 100644 --- a/net/sched/act_police.c +++ b/net/sched/act_police.c @@ -1,5 +1,5 @@ /* - * net/sched/police.c Input police filter. + * net/sched/act_police.c Input police filter * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License diff --git a/net/sched/act_simple.c b/net/sched/act_simple.c index 992c2317ce88..6a8d9488613a 100644 --- a/net/sched/act_simple.c +++ b/net/sched/act_simple.c @@ -1,5 +1,5 @@ /* - * net/sched/simp.c Simple example of an action + * net/sched/act_simple.c Simple example of an action * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index b34331967e02..179f1c8c0d8b 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -139,33 +139,20 @@ struct netem_sched_data { /* Time stamp put into socket buffer control block * Only valid when skbs are in our internal t(ime)fifo queue. + * + * As skb->rbnode uses same storage than skb->next, skb->prev and skb->tstamp, + * and skb->next & skb->prev are scratch space for a qdisc, + * we save skb->tstamp value in skb->cb[] before destroying it. */ struct netem_skb_cb { psched_time_t time_to_send; ktime_t tstamp_save; }; -/* Because space in skb->cb[] is tight, netem overloads skb->next/prev/tstamp - * to hold a rb_node structure. - * - * If struct sk_buff layout is changed, the following checks will complain. - */ -static struct rb_node *netem_rb_node(struct sk_buff *skb) -{ - BUILD_BUG_ON(offsetof(struct sk_buff, next) != 0); - BUILD_BUG_ON(offsetof(struct sk_buff, prev) != - offsetof(struct sk_buff, next) + sizeof(skb->next)); - BUILD_BUG_ON(offsetof(struct sk_buff, tstamp) != - offsetof(struct sk_buff, prev) + sizeof(skb->prev)); - BUILD_BUG_ON(sizeof(struct rb_node) > sizeof(skb->next) + - sizeof(skb->prev) + - sizeof(skb->tstamp)); - return (struct rb_node *)&skb->next; -} static struct sk_buff *netem_rb_to_skb(struct rb_node *rb) { - return (struct sk_buff *)rb; + return container_of(rb, struct sk_buff, rbnode); } static inline struct netem_skb_cb *netem_skb_cb(struct sk_buff *skb) @@ -403,8 +390,8 @@ static void tfifo_enqueue(struct sk_buff *nskb, struct Qdisc *sch) else p = &parent->rb_left; } - rb_link_node(netem_rb_node(nskb), parent, p); - rb_insert_color(netem_rb_node(nskb), &q->t_root); + rb_link_node(&nskb->rbnode, parent, p); + rb_insert_color(&nskb->rbnode, &q->t_root); sch->q.qlen++; } diff --git a/net/sctp/proc.c b/net/sctp/proc.c index 34229ee7f379..0697eda5aed8 100644 --- a/net/sctp/proc.c +++ b/net/sctp/proc.c @@ -417,7 +417,7 @@ static void *sctp_remaddr_seq_start(struct seq_file *seq, loff_t *pos) if (*pos == 0) seq_printf(seq, "ADDR ASSOC_ID HB_ACT RTO MAX_PATH_RTX " - "REM_ADDR_RTX START\n"); + "REM_ADDR_RTX START STATE\n"); return (void *)pos; } @@ -490,14 +490,20 @@ static int sctp_remaddr_seq_show(struct seq_file *seq, void *v) * Note: We don't have a way to tally this at the moment * so lets just leave it as zero for the moment */ - seq_printf(seq, "0 "); + seq_puts(seq, "0 "); /* * remote address start time (START). This is also not * currently implemented, but we can record it with a * jiffies marker in a subsequent patch */ - seq_printf(seq, "0"); + seq_puts(seq, "0 "); + + /* + * The current state of this destination. I.e. + * SCTP_ACTIVE, SCTP_INACTIVE, ... + */ + seq_printf(seq, "%d", tsp->state); seq_printf(seq, "\n"); } diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 634a2abb5f3a..2120292c842d 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -2095,7 +2095,7 @@ static int sctp_recvmsg(struct kiocb *iocb, struct sock *sk, if (copied > len) copied = len; - err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + err = skb_copy_datagram_msg(skb, 0, msg, copied); event = sctp_skb2event(skb); diff --git a/net/tipc/msg.c b/net/tipc/msg.c index 74745a47d72a..ec18076e81ec 100644 --- a/net/tipc/msg.c +++ b/net/tipc/msg.c @@ -91,7 +91,7 @@ struct sk_buff *tipc_msg_create(uint user, uint type, uint hdr_sz, * @*headbuf: in: NULL for first frag, otherwise value returned from prev call * out: set when successful non-complete reassembly, otherwise NULL * @*buf: in: the buffer to append. Always defined - * out: head buf after sucessful complete reassembly, otherwise NULL + * out: head buf after successful complete reassembly, otherwise NULL * Returns 1 when reassembly complete, otherwise 0 */ int tipc_buf_append(struct sk_buff **headbuf, struct sk_buff **buf) @@ -311,7 +311,7 @@ bool tipc_msg_bundle(struct sk_buff *bbuf, struct sk_buff *buf, u32 mtu) * @mtu: max allowable size for the bundle buffer, inclusive header * @dnode: destination node for message. (Not always present in header) * Replaces buffer if successful - * Returns true if sucess, otherwise false + * Returns true if success, otherwise false */ bool tipc_msg_make_bundle(struct sk_buff **buf, u32 mtu, u32 dnode) { diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 51bddc236a15..591bbfa082a0 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -1372,8 +1372,7 @@ restart: sz = buf_len; m->msg_flags |= MSG_TRUNC; } - res = skb_copy_datagram_iovec(buf, msg_hdr_sz(msg), - m->msg_iov, sz); + res = skb_copy_datagram_msg(buf, msg_hdr_sz(msg), m, sz); if (res) goto exit; res = sz; @@ -1473,8 +1472,8 @@ restart: needed = (buf_len - sz_copied); sz_to_copy = (sz <= needed) ? sz : needed; - res = skb_copy_datagram_iovec(buf, msg_hdr_sz(msg) + offset, - m->msg_iov, sz_to_copy); + res = skb_copy_datagram_msg(buf, msg_hdr_sz(msg) + offset, + m, sz_to_copy); if (res) goto exit; @@ -1556,7 +1555,7 @@ static void tipc_data_ready(struct sock *sk) * @tsk: TIPC socket * @msg: message * - * Returns 0 (TIPC_OK) if everyting ok, -TIPC_ERR_NO_PORT otherwise + * Returns 0 (TIPC_OK) if everything ok, -TIPC_ERR_NO_PORT otherwise */ static int filter_connect(struct tipc_sock *tsk, struct sk_buff **buf) { diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index e96884380732..5eee625d113f 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -1825,7 +1825,7 @@ static int unix_dgram_recvmsg(struct kiocb *iocb, struct socket *sock, else if (size < skb->len - skip) msg->msg_flags |= MSG_TRUNC; - err = skb_copy_datagram_iovec(skb, skip, msg->msg_iov, size); + err = skb_copy_datagram_msg(skb, skip, msg, size); if (err) goto out_free; @@ -2030,8 +2030,8 @@ again: } chunk = min_t(unsigned int, unix_skb_len(skb) - skip, size); - if (skb_copy_datagram_iovec(skb, UNIXCB(skb).consumed + skip, - msg->msg_iov, chunk)) { + if (skb_copy_datagram_msg(skb, UNIXCB(skb).consumed + skip, + msg, chunk)) { if (copied == 0) copied = -EFAULT; break; diff --git a/net/vmw_vsock/vmci_transport.c b/net/vmw_vsock/vmci_transport.c index 9bb63ffec4f2..a57ddef7d5af 100644 --- a/net/vmw_vsock/vmci_transport.c +++ b/net/vmw_vsock/vmci_transport.c @@ -1773,8 +1773,7 @@ static int vmci_transport_dgram_dequeue(struct kiocb *kiocb, } /* Place the datagram payload in the user's iovec. */ - err = skb_copy_datagram_iovec(skb, sizeof(*dg), msg->msg_iov, - payload_len); + err = skb_copy_datagram_msg(skb, sizeof(*dg), msg, payload_len); if (err) goto out; diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c index 5ad4418ef093..59e785bfde65 100644 --- a/net/x25/af_x25.c +++ b/net/x25/af_x25.c @@ -1335,7 +1335,7 @@ static int x25_recvmsg(struct kiocb *iocb, struct socket *sock, /* Currently, each datagram always contains a complete record */ msg->msg_flags |= MSG_EOR; - rc = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + rc = skb_copy_datagram_msg(skb, 0, msg, copied); if (rc) goto out_free_dgram; |