summaryrefslogtreecommitdiffstats
path: root/net
diff options
context:
space:
mode:
Diffstat (limited to 'net')
-rw-r--r--net/8021q/vlan.c3
-rw-r--r--net/8021q/vlan_dev.c4
-rw-r--r--net/9p/trans_fd.c4
-rw-r--r--net/9p/trans_xen.c8
-rw-r--r--net/batman-adv/distributed-arp-table.c5
-rw-r--r--net/batman-adv/routing.c2
-rw-r--r--net/batman-adv/soft-interface.c5
-rw-r--r--net/bluetooth/6lowpan.c2
-rw-r--r--net/bluetooth/bnep/core.c2
-rw-r--r--net/bluetooth/cmtp/core.c2
-rw-r--r--net/bluetooth/hidp/core.c2
-rw-r--r--net/bridge/br_device.c2
-rw-r--r--net/bridge/br_netlink.c9
-rw-r--r--net/bridge/br_stp_if.c2
-rw-r--r--net/bridge/br_stp_timer.c2
-rw-r--r--net/bridge/netfilter/ebt_arpreply.c3
-rw-r--r--net/bridge/netfilter/ebtables.c9
-rw-r--r--net/caif/caif_socket.c4
-rw-r--r--net/caif/cfpkt_skbuff.c6
-rw-r--r--net/caif/chnl_net.c4
-rw-r--r--net/can/af_can.c3
-rw-r--r--net/ceph/auth_x.c13
-rw-r--r--net/ceph/ceph_common.c34
-rw-r--r--net/ceph/cls_lock_client.c51
-rw-r--r--net/ceph/debugfs.c7
-rw-r--r--net/ceph/messenger.c26
-rw-r--r--net/ceph/mon_client.c4
-rw-r--r--net/ceph/osd_client.c139
-rw-r--r--net/ceph/osdmap.c1
-rw-r--r--net/ceph/pagelist.c2
-rw-r--r--net/ceph/snapshot.c6
-rw-r--r--net/core/datagram.c2
-rw-r--r--net/core/dev.c131
-rw-r--r--net/core/dev_ioctl.c19
-rw-r--r--net/core/devlink.c8
-rw-r--r--net/core/dst.c37
-rw-r--r--net/core/fib_rules.c21
-rw-r--r--net/core/filter.c1
-rw-r--r--net/core/neighbour.c14
-rw-r--r--net/core/net_namespace.c19
-rw-r--r--net/core/rtnetlink.c93
-rw-r--r--net/core/skbuff.c5
-rw-r--r--net/core/sock.c23
-rw-r--r--net/core/sysctl_net_core.c2
-rw-r--r--net/dccp/ipv4.c2
-rw-r--r--net/dccp/ipv6.c8
-rw-r--r--net/decnet/dn_route.c14
-rw-r--r--net/decnet/netfilter/dn_rtmsg.c4
-rw-r--r--net/dsa/dsa.c47
-rw-r--r--net/dsa/dsa2.c4
-rw-r--r--net/dsa/legacy.c47
-rw-r--r--net/hsr/hsr_device.c4
-rw-r--r--net/hsr/hsr_forward.c3
-rw-r--r--net/hsr/hsr_framereg.c9
-rw-r--r--net/hsr/hsr_framereg.h2
-rw-r--r--net/ieee802154/6lowpan/core.c2
-rw-r--r--net/ipv4/af_inet.c2
-rw-r--r--net/ipv4/arp.c48
-rw-r--r--net/ipv4/esp4.c5
-rw-r--r--net/ipv4/fib_frontend.c15
-rw-r--r--net/ipv4/fib_semantics.c17
-rw-r--r--net/ipv4/fib_trie.c26
-rw-r--r--net/ipv4/icmp.c8
-rw-r--r--net/ipv4/igmp.c22
-rw-r--r--net/ipv4/ip_output.c3
-rw-r--r--net/ipv4/ip_tunnel.c6
-rw-r--r--net/ipv4/ipmr.c52
-rw-r--r--net/ipv4/route.c10
-rw-r--r--net/ipv4/tcp.c19
-rw-r--r--net/ipv4/tcp_cong.c1
-rw-r--r--net/ipv4/tcp_input.c11
-rw-r--r--net/ipv4/tcp_ipv4.c2
-rw-r--r--net/ipv4/udp.c4
-rw-r--r--net/ipv4/udp_impl.h1
-rw-r--r--net/ipv6/addrconf.c16
-rw-r--r--net/ipv6/calipso.c6
-rw-r--r--net/ipv6/datagram.c8
-rw-r--r--net/ipv6/esp6_offload.c25
-rw-r--r--net/ipv6/fib6_rules.c22
-rw-r--r--net/ipv6/icmp.c2
-rw-r--r--net/ipv6/ila/ila_xlat.c1
-rw-r--r--net/ipv6/ip6_fib.c3
-rw-r--r--net/ipv6/ip6_gre.c22
-rw-r--r--net/ipv6/ip6_offload.c9
-rw-r--r--net/ipv6/ip6_output.c22
-rw-r--r--net/ipv6/ip6_tunnel.c34
-rw-r--r--net/ipv6/ip6_vti.c8
-rw-r--r--net/ipv6/ip6mr.c2
-rw-r--r--net/ipv6/output_core.c14
-rw-r--r--net/ipv6/ping.c2
-rw-r--r--net/ipv6/proc.c2
-rw-r--r--net/ipv6/raw.c2
-rw-r--r--net/ipv6/route.c7
-rw-r--r--net/ipv6/sit.c8
-rw-r--r--net/ipv6/tcp_ipv6.c4
-rw-r--r--net/ipv6/udp.c7
-rw-r--r--net/ipv6/udp_impl.h1
-rw-r--r--net/ipv6/udp_offload.c6
-rw-r--r--net/ipv6/xfrm6_input.c2
-rw-r--r--net/ipv6/xfrm6_mode_ro.c2
-rw-r--r--net/ipv6/xfrm6_mode_transport.c2
-rw-r--r--net/irda/irlan/irlan_eth.c2
-rw-r--r--net/key/af_key.c21
-rw-r--r--net/l2tp/l2tp_eth.c15
-rw-r--r--net/llc/af_llc.c5
-rw-r--r--net/llc/llc_conn.c4
-rw-r--r--net/llc/llc_sap.c2
-rw-r--r--net/mac80211/agg-tx.c128
-rw-r--r--net/mac80211/cfg.c2
-rw-r--r--net/mac80211/ht.c16
-rw-r--r--net/mac80211/ieee80211_i.h16
-rw-r--r--net/mac80211/iface.c18
-rw-r--r--net/mac80211/mlme.c62
-rw-r--r--net/mac80211/rx.c9
-rw-r--r--net/mac80211/sta_info.c2
-rw-r--r--net/mac80211/sta_info.h2
-rw-r--r--net/mac80211/wpa.c9
-rw-r--r--net/mac802154/iface.c7
-rw-r--r--net/mpls/af_mpls.c2
-rw-r--r--net/netfilter/ipvs/ip_vs_core.c19
-rw-r--r--net/netfilter/nf_conntrack_core.c8
-rw-r--r--net/netfilter/nf_conntrack_helper.c12
-rw-r--r--net/netfilter/nf_conntrack_netlink.c18
-rw-r--r--net/netfilter/nf_conntrack_proto_sctp.c9
-rw-r--r--net/netfilter/nf_nat_core.c6
-rw-r--r--net/netfilter/nf_tables_api.c160
-rw-r--r--net/netfilter/nfnetlink_cthelper.c17
-rw-r--r--net/netfilter/nft_bitwise.c19
-rw-r--r--net/netfilter/nft_cmp.c12
-rw-r--r--net/netfilter/nft_ct.c4
-rw-r--r--net/netfilter/nft_immediate.c5
-rw-r--r--net/netfilter/nft_range.c4
-rw-r--r--net/netfilter/nft_set_hash.c2
-rw-r--r--net/netfilter/nft_set_rbtree.c22
-rw-r--r--net/netfilter/x_tables.c24
-rw-r--r--net/netfilter/xt_CT.c6
-rw-r--r--net/netlink/af_netlink.c4
-rw-r--r--net/openvswitch/conntrack.c4
-rw-r--r--net/openvswitch/vport-internal_dev.c4
-rw-r--r--net/packet/af_packet.c14
-rw-r--r--net/phonet/pep-gprs.c2
-rw-r--r--net/rxrpc/key.c64
-rw-r--r--net/sched/act_pedit.c4
-rw-r--r--net/sched/act_police.c8
-rw-r--r--net/sched/cls_matchall.c1
-rw-r--r--net/sched/sch_api.c9
-rw-r--r--net/sctp/associola.c4
-rw-r--r--net/sctp/endpointola.c1
-rw-r--r--net/sctp/input.c16
-rw-r--r--net/sctp/ipv6.c49
-rw-r--r--net/sctp/sctp_diag.c5
-rw-r--r--net/sctp/sm_make_chunk.c13
-rw-r--r--net/sctp/sm_statefuns.c3
-rw-r--r--net/sctp/socket.c9
-rw-r--r--net/smc/Kconfig4
-rw-r--r--net/smc/af_smc.c2
-rw-r--r--net/smc/smc_clc.c4
-rw-r--r--net/smc/smc_core.c16
-rw-r--r--net/smc/smc_core.h2
-rw-r--r--net/smc/smc_ib.c21
-rw-r--r--net/smc/smc_ib.h2
-rw-r--r--net/sunrpc/Kconfig1
-rw-r--r--net/sunrpc/clnt.c8
-rw-r--r--net/sunrpc/sched.c5
-rw-r--r--net/sunrpc/svc.c134
-rw-r--r--net/sunrpc/xdr.c2
-rw-r--r--net/sunrpc/xprt.c1
-rw-r--r--net/sunrpc/xprtrdma/Makefile2
-rw-r--r--net/sunrpc/xprtrdma/backchannel.c6
-rw-r--r--net/sunrpc/xprtrdma/rpc_rdma.c12
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma.c8
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma_backchannel.c71
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma_marshal.c89
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma_recvfrom.c79
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma_rw.c512
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma_sendto.c978
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma_transport.c110
-rw-r--r--net/sunrpc/xprtrdma/transport.c57
-rw-r--r--net/sunrpc/xprtrdma/verbs.c323
-rw-r--r--net/sunrpc/xprtrdma/xprt_rdma.h22
-rw-r--r--net/sunrpc/xprtsock.c7
-rw-r--r--net/tipc/msg.c2
-rw-r--r--net/tipc/socket.c38
-rw-r--r--net/unix/af_unix.c11
-rw-r--r--net/vmw_vsock/af_vsock.c21
-rw-r--r--net/vmw_vsock/virtio_transport.c6
-rw-r--r--net/wireless/scan.c8
-rw-r--r--net/wireless/util.c10
-rw-r--r--net/wireless/wext-core.c22
-rw-r--r--net/x25/af_x25.c24
-rw-r--r--net/x25/sysctl_net_x25.c5
-rw-r--r--net/xfrm/Makefile3
-rw-r--r--net/xfrm/xfrm_device.c4
-rw-r--r--net/xfrm/xfrm_policy.c51
-rw-r--r--net/xfrm/xfrm_state.c2
-rw-r--r--net/xfrm/xfrm_user.c1
196 files changed, 3056 insertions, 1891 deletions
diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c
index 467069b73ce1..9649579b5b9f 100644
--- a/net/8021q/vlan.c
+++ b/net/8021q/vlan.c
@@ -277,7 +277,8 @@ static int register_vlan_device(struct net_device *real_dev, u16 vlan_id)
return 0;
out_free_newdev:
- free_netdev(new_dev);
+ if (new_dev->reg_state == NETREG_UNINITIALIZED)
+ free_netdev(new_dev);
return err;
}
diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c
index 953b6728bd00..abc5f400fc71 100644
--- a/net/8021q/vlan_dev.c
+++ b/net/8021q/vlan_dev.c
@@ -813,7 +813,6 @@ static void vlan_dev_free(struct net_device *dev)
free_percpu(vlan->vlan_pcpu_stats);
vlan->vlan_pcpu_stats = NULL;
- free_netdev(dev);
}
void vlan_setup(struct net_device *dev)
@@ -826,7 +825,8 @@ void vlan_setup(struct net_device *dev)
netif_keep_dst(dev);
dev->netdev_ops = &vlan_netdev_ops;
- dev->destructor = vlan_dev_free;
+ dev->needs_free_netdev = true;
+ dev->priv_destructor = vlan_dev_free;
dev->ethtool_ops = &vlan_ethtool_ops;
dev->min_mtu = 0;
diff --git a/net/9p/trans_fd.c b/net/9p/trans_fd.c
index 7bc2208b6cc4..dca3cdd1a014 100644
--- a/net/9p/trans_fd.c
+++ b/net/9p/trans_fd.c
@@ -95,7 +95,7 @@ enum {
struct p9_poll_wait {
struct p9_conn *conn;
- wait_queue_t wait;
+ wait_queue_entry_t wait;
wait_queue_head_t *wait_addr;
};
@@ -522,7 +522,7 @@ error:
clear_bit(Wworksched, &m->wsched);
}
-static int p9_pollwake(wait_queue_t *wait, unsigned int mode, int sync, void *key)
+static int p9_pollwake(wait_queue_entry_t *wait, unsigned int mode, int sync, void *key)
{
struct p9_poll_wait *pwait =
container_of(wait, struct p9_poll_wait, wait);
diff --git a/net/9p/trans_xen.c b/net/9p/trans_xen.c
index 71e85643b3f9..6ad3e043c617 100644
--- a/net/9p/trans_xen.c
+++ b/net/9p/trans_xen.c
@@ -454,8 +454,8 @@ static int xen_9pfs_front_probe(struct xenbus_device *dev,
goto error_xenbus;
}
priv->tag = xenbus_read(xbt, dev->nodename, "tag", NULL);
- if (!priv->tag) {
- ret = -EINVAL;
+ if (IS_ERR(priv->tag)) {
+ ret = PTR_ERR(priv->tag);
goto error_xenbus;
}
ret = xenbus_transaction_end(xbt, 0);
@@ -525,7 +525,7 @@ static struct xenbus_driver xen_9pfs_front_driver = {
.otherend_changed = xen_9pfs_front_changed,
};
-int p9_trans_xen_init(void)
+static int p9_trans_xen_init(void)
{
if (!xen_domain())
return -ENODEV;
@@ -537,7 +537,7 @@ int p9_trans_xen_init(void)
}
module_init(p9_trans_xen_init);
-void p9_trans_xen_exit(void)
+static void p9_trans_xen_exit(void)
{
v9fs_unregister_trans(&p9_xen_trans);
return xenbus_unregister_driver(&xen_9pfs_front_driver);
diff --git a/net/batman-adv/distributed-arp-table.c b/net/batman-adv/distributed-arp-table.c
index 013e970eff39..000ca2f113ab 100644
--- a/net/batman-adv/distributed-arp-table.c
+++ b/net/batman-adv/distributed-arp-table.c
@@ -1064,8 +1064,9 @@ bool batadv_dat_snoop_outgoing_arp_request(struct batadv_priv *bat_priv,
skb_new->protocol = eth_type_trans(skb_new, soft_iface);
- soft_iface->stats.rx_packets++;
- soft_iface->stats.rx_bytes += skb->len + ETH_HLEN + hdr_size;
+ batadv_inc_counter(bat_priv, BATADV_CNT_RX);
+ batadv_add_counter(bat_priv, BATADV_CNT_RX_BYTES,
+ skb->len + ETH_HLEN + hdr_size);
netif_rx(skb_new);
batadv_dbg(BATADV_DBG_DAT, bat_priv, "ARP request replied locally\n");
diff --git a/net/batman-adv/routing.c b/net/batman-adv/routing.c
index e1ebe14ee2a6..ae9f4d37d34f 100644
--- a/net/batman-adv/routing.c
+++ b/net/batman-adv/routing.c
@@ -987,7 +987,7 @@ int batadv_recv_unicast_packet(struct sk_buff *skb,
batadv_dbg(BATADV_DBG_BLA, bat_priv,
"recv_unicast_packet(): Dropped unicast pkt received from another backbone gw %pM.\n",
orig_addr_gw);
- return NET_RX_DROP;
+ goto free_skb;
}
}
diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c
index b25789abf7b9..10f7edfb176e 100644
--- a/net/batman-adv/soft-interface.c
+++ b/net/batman-adv/soft-interface.c
@@ -1034,8 +1034,6 @@ static void batadv_softif_free(struct net_device *dev)
* netdev and its private data (bat_priv)
*/
rcu_barrier();
-
- free_netdev(dev);
}
/**
@@ -1047,7 +1045,8 @@ static void batadv_softif_init_early(struct net_device *dev)
ether_setup(dev);
dev->netdev_ops = &batadv_netdev_ops;
- dev->destructor = batadv_softif_free;
+ dev->needs_free_netdev = true;
+ dev->priv_destructor = batadv_softif_free;
dev->features |= NETIF_F_HW_VLAN_CTAG_FILTER | NETIF_F_NETNS_LOCAL;
dev->priv_flags |= IFF_NO_QUEUE;
diff --git a/net/bluetooth/6lowpan.c b/net/bluetooth/6lowpan.c
index 608959989f8e..ab3b654b05cc 100644
--- a/net/bluetooth/6lowpan.c
+++ b/net/bluetooth/6lowpan.c
@@ -598,7 +598,7 @@ static void netdev_setup(struct net_device *dev)
dev->netdev_ops = &netdev_ops;
dev->header_ops = &header_ops;
- dev->destructor = free_netdev;
+ dev->needs_free_netdev = true;
}
static struct device_type bt_type = {
diff --git a/net/bluetooth/bnep/core.c b/net/bluetooth/bnep/core.c
index fbf251fef70f..5c4808b3da2d 100644
--- a/net/bluetooth/bnep/core.c
+++ b/net/bluetooth/bnep/core.c
@@ -484,7 +484,7 @@ static int bnep_session(void *arg)
struct net_device *dev = s->dev;
struct sock *sk = s->sock->sk;
struct sk_buff *skb;
- wait_queue_t wait;
+ wait_queue_entry_t wait;
BT_DBG("");
diff --git a/net/bluetooth/cmtp/core.c b/net/bluetooth/cmtp/core.c
index 9e59b6654126..14f7c8135c31 100644
--- a/net/bluetooth/cmtp/core.c
+++ b/net/bluetooth/cmtp/core.c
@@ -280,7 +280,7 @@ static int cmtp_session(void *arg)
struct cmtp_session *session = arg;
struct sock *sk = session->sock->sk;
struct sk_buff *skb;
- wait_queue_t wait;
+ wait_queue_entry_t wait;
BT_DBG("session %p", session);
diff --git a/net/bluetooth/hidp/core.c b/net/bluetooth/hidp/core.c
index 0bec4588c3c8..fc31161e98f2 100644
--- a/net/bluetooth/hidp/core.c
+++ b/net/bluetooth/hidp/core.c
@@ -1244,7 +1244,7 @@ static void hidp_session_run(struct hidp_session *session)
static int hidp_session_thread(void *arg)
{
struct hidp_session *session = arg;
- wait_queue_t ctrl_wait, intr_wait;
+ wait_queue_entry_t ctrl_wait, intr_wait;
BT_DBG("session %p", session);
diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c
index 430b53e7d941..f0f3447e8aa4 100644
--- a/net/bridge/br_device.c
+++ b/net/bridge/br_device.c
@@ -379,7 +379,7 @@ void br_dev_setup(struct net_device *dev)
ether_setup(dev);
dev->netdev_ops = &br_netdev_ops;
- dev->destructor = free_netdev;
+ dev->needs_free_netdev = true;
dev->ethtool_ops = &br_ethtool_ops;
SET_NETDEV_DEVTYPE(dev, &br_type);
dev->priv_flags = IFF_EBRIDGE | IFF_NO_QUEUE;
diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c
index c5ce7745b230..32bd3ead9ba1 100644
--- a/net/bridge/br_netlink.c
+++ b/net/bridge/br_netlink.c
@@ -595,7 +595,7 @@ static int br_afspec(struct net_bridge *br,
err = 0;
switch (nla_type(attr)) {
case IFLA_BRIDGE_VLAN_TUNNEL_INFO:
- if (!(p->flags & BR_VLAN_TUNNEL))
+ if (!p || !(p->flags & BR_VLAN_TUNNEL))
return -EINVAL;
err = br_parse_vlan_tunnel_info(attr, &tinfo_curr);
if (err)
@@ -835,6 +835,13 @@ static int br_validate(struct nlattr *tb[], struct nlattr *data[])
return -EPROTONOSUPPORT;
}
}
+
+ if (data[IFLA_BR_VLAN_DEFAULT_PVID]) {
+ __u16 defpvid = nla_get_u16(data[IFLA_BR_VLAN_DEFAULT_PVID]);
+
+ if (defpvid >= VLAN_VID_MASK)
+ return -EINVAL;
+ }
#endif
return 0;
diff --git a/net/bridge/br_stp_if.c b/net/bridge/br_stp_if.c
index 08341d2aa9c9..6f12a5271219 100644
--- a/net/bridge/br_stp_if.c
+++ b/net/bridge/br_stp_if.c
@@ -179,6 +179,8 @@ static void br_stp_start(struct net_bridge *br)
br_debug(br, "using kernel STP\n");
/* To start timers on any ports left in blocking */
+ if (br->dev->flags & IFF_UP)
+ mod_timer(&br->hello_timer, jiffies + br->hello_time);
br_port_state_selection(br);
}
diff --git a/net/bridge/br_stp_timer.c b/net/bridge/br_stp_timer.c
index c98b3e5c140a..60b6fe277a8b 100644
--- a/net/bridge/br_stp_timer.c
+++ b/net/bridge/br_stp_timer.c
@@ -40,7 +40,7 @@ static void br_hello_timer_expired(unsigned long arg)
if (br->dev->flags & IFF_UP) {
br_config_bpdu_generation(br);
- if (br->stp_enabled != BR_USER_STP)
+ if (br->stp_enabled == BR_KERNEL_STP)
mod_timer(&br->hello_timer,
round_jiffies(jiffies + br->hello_time));
}
diff --git a/net/bridge/netfilter/ebt_arpreply.c b/net/bridge/netfilter/ebt_arpreply.c
index 5929309beaa1..db85230e49c3 100644
--- a/net/bridge/netfilter/ebt_arpreply.c
+++ b/net/bridge/netfilter/ebt_arpreply.c
@@ -68,6 +68,9 @@ static int ebt_arpreply_tg_check(const struct xt_tgchk_param *par)
if (e->ethproto != htons(ETH_P_ARP) ||
e->invflags & EBT_IPROTO)
return -EINVAL;
+ if (ebt_invalid_target(info->target))
+ return -EINVAL;
+
return 0;
}
diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c
index 9ec0c9f908fa..9c6e619f452b 100644
--- a/net/bridge/netfilter/ebtables.c
+++ b/net/bridge/netfilter/ebtables.c
@@ -1373,7 +1373,8 @@ static inline int ebt_obj_to_user(char __user *um, const char *_name,
strlcpy(name, _name, sizeof(name));
if (copy_to_user(um, name, EBT_FUNCTION_MAXNAMELEN) ||
put_user(datasize, (int __user *)(um + EBT_FUNCTION_MAXNAMELEN)) ||
- xt_data_to_user(um + entrysize, data, usersize, datasize))
+ xt_data_to_user(um + entrysize, data, usersize, datasize,
+ XT_ALIGN(datasize)))
return -EFAULT;
return 0;
@@ -1658,7 +1659,8 @@ static int compat_match_to_user(struct ebt_entry_match *m, void __user **dstptr,
if (match->compat_to_user(cm->data, m->data))
return -EFAULT;
} else {
- if (xt_data_to_user(cm->data, m->data, match->usersize, msize))
+ if (xt_data_to_user(cm->data, m->data, match->usersize, msize,
+ COMPAT_XT_ALIGN(msize)))
return -EFAULT;
}
@@ -1687,7 +1689,8 @@ static int compat_target_to_user(struct ebt_entry_target *t,
if (target->compat_to_user(cm->data, t->data))
return -EFAULT;
} else {
- if (xt_data_to_user(cm->data, t->data, target->usersize, tsize))
+ if (xt_data_to_user(cm->data, t->data, target->usersize, tsize,
+ COMPAT_XT_ALIGN(tsize)))
return -EFAULT;
}
diff --git a/net/caif/caif_socket.c b/net/caif/caif_socket.c
index adcad344c843..21f18ea2fce4 100644
--- a/net/caif/caif_socket.c
+++ b/net/caif/caif_socket.c
@@ -754,6 +754,10 @@ static int caif_connect(struct socket *sock, struct sockaddr *uaddr,
lock_sock(sk);
+ err = -EINVAL;
+ if (addr_len < offsetofend(struct sockaddr, sa_family))
+ goto out;
+
err = -EAFNOSUPPORT;
if (uaddr->sa_family != AF_CAIF)
goto out;
diff --git a/net/caif/cfpkt_skbuff.c b/net/caif/cfpkt_skbuff.c
index 59ce1fcc220c..71b6ab240dea 100644
--- a/net/caif/cfpkt_skbuff.c
+++ b/net/caif/cfpkt_skbuff.c
@@ -81,11 +81,7 @@ static struct cfpkt *cfpkt_create_pfx(u16 len, u16 pfx)
{
struct sk_buff *skb;
- if (likely(in_interrupt()))
- skb = alloc_skb(len + pfx, GFP_ATOMIC);
- else
- skb = alloc_skb(len + pfx, GFP_KERNEL);
-
+ skb = alloc_skb(len + pfx, GFP_ATOMIC);
if (unlikely(skb == NULL))
return NULL;
diff --git a/net/caif/chnl_net.c b/net/caif/chnl_net.c
index 1816fc9f1ee7..fe3c53efb949 100644
--- a/net/caif/chnl_net.c
+++ b/net/caif/chnl_net.c
@@ -392,14 +392,14 @@ static void chnl_net_destructor(struct net_device *dev)
{
struct chnl_net *priv = netdev_priv(dev);
caif_free_client(&priv->chnl);
- free_netdev(dev);
}
static void ipcaif_net_setup(struct net_device *dev)
{
struct chnl_net *priv;
dev->netdev_ops = &netdev_ops;
- dev->destructor = chnl_net_destructor;
+ dev->needs_free_netdev = true;
+ dev->priv_destructor = chnl_net_destructor;
dev->flags |= IFF_NOARP;
dev->flags |= IFF_POINTOPOINT;
dev->mtu = GPRS_PDP_MTU;
diff --git a/net/can/af_can.c b/net/can/af_can.c
index b6406fe33c76..88edac0f3e36 100644
--- a/net/can/af_can.c
+++ b/net/can/af_can.c
@@ -872,8 +872,7 @@ static int can_notifier(struct notifier_block *nb, unsigned long msg,
static int can_pernet_init(struct net *net)
{
- net->can.can_rcvlists_lock =
- __SPIN_LOCK_UNLOCKED(net->can.can_rcvlists_lock);
+ spin_lock_init(&net->can.can_rcvlists_lock);
net->can.can_rx_alldev_list =
kzalloc(sizeof(struct dev_rcv_lists), GFP_KERNEL);
diff --git a/net/ceph/auth_x.c b/net/ceph/auth_x.c
index 2034fb926670..8757fb87dab8 100644
--- a/net/ceph/auth_x.c
+++ b/net/ceph/auth_x.c
@@ -151,7 +151,7 @@ static int process_one_ticket(struct ceph_auth_client *ac,
struct timespec validity;
void *tp, *tpend;
void **ptp;
- struct ceph_crypto_key new_session_key;
+ struct ceph_crypto_key new_session_key = { 0 };
struct ceph_buffer *new_ticket_blob;
unsigned long new_expires, new_renew_after;
u64 new_secret_id;
@@ -215,6 +215,9 @@ static int process_one_ticket(struct ceph_auth_client *ac,
dout(" ticket blob is %d bytes\n", dlen);
ceph_decode_need(ptp, tpend, 1 + sizeof(u64), bad);
blob_struct_v = ceph_decode_8(ptp);
+ if (blob_struct_v != 1)
+ goto bad;
+
new_secret_id = ceph_decode_64(ptp);
ret = ceph_decode_buffer(&new_ticket_blob, ptp, tpend);
if (ret)
@@ -234,13 +237,13 @@ static int process_one_ticket(struct ceph_auth_client *ac,
type, ceph_entity_type_name(type), th->secret_id,
(int)th->ticket_blob->vec.iov_len);
xi->have_keys |= th->service;
-
-out:
- return ret;
+ return 0;
bad:
ret = -EINVAL;
- goto out;
+out:
+ ceph_crypto_key_destroy(&new_session_key);
+ return ret;
}
static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac,
diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c
index 4eb773ccce11..47e94b560ba0 100644
--- a/net/ceph/ceph_common.c
+++ b/net/ceph/ceph_common.c
@@ -45,18 +45,16 @@ bool libceph_compatible(void *data)
}
EXPORT_SYMBOL(libceph_compatible);
-/*
- * find filename portion of a path (/foo/bar/baz -> baz)
- */
-const char *ceph_file_part(const char *s, int len)
+static int param_get_supported_features(char *buffer,
+ const struct kernel_param *kp)
{
- const char *e = s + len;
-
- while (e != s && *(e-1) != '/')
- e--;
- return e;
+ return sprintf(buffer, "0x%llx", CEPH_FEATURES_SUPPORTED_DEFAULT);
}
-EXPORT_SYMBOL(ceph_file_part);
+static const struct kernel_param_ops param_ops_supported_features = {
+ .get = param_get_supported_features,
+};
+module_param_cb(supported_features, &param_ops_supported_features, NULL,
+ S_IRUGO);
const char *ceph_msg_type_name(int type)
{
@@ -596,9 +594,7 @@ EXPORT_SYMBOL(ceph_client_gid);
/*
* create a fresh client instance
*/
-struct ceph_client *ceph_create_client(struct ceph_options *opt, void *private,
- u64 supported_features,
- u64 required_features)
+struct ceph_client *ceph_create_client(struct ceph_options *opt, void *private)
{
struct ceph_client *client;
struct ceph_entity_addr *myaddr = NULL;
@@ -615,14 +611,12 @@ struct ceph_client *ceph_create_client(struct ceph_options *opt, void *private,
init_waitqueue_head(&client->auth_wq);
client->auth_err = 0;
- if (!ceph_test_opt(client, NOMSGAUTH))
- required_features |= CEPH_FEATURE_MSG_AUTH;
-
client->extra_mon_dispatch = NULL;
- client->supported_features = CEPH_FEATURES_SUPPORTED_DEFAULT |
- supported_features;
- client->required_features = CEPH_FEATURES_REQUIRED_DEFAULT |
- required_features;
+ client->supported_features = CEPH_FEATURES_SUPPORTED_DEFAULT;
+ client->required_features = CEPH_FEATURES_REQUIRED_DEFAULT;
+
+ if (!ceph_test_opt(client, NOMSGAUTH))
+ client->required_features |= CEPH_FEATURE_MSG_AUTH;
/* msgr */
if (ceph_test_opt(client, MYIP))
diff --git a/net/ceph/cls_lock_client.c b/net/ceph/cls_lock_client.c
index b9233b990399..08ada893f01e 100644
--- a/net/ceph/cls_lock_client.c
+++ b/net/ceph/cls_lock_client.c
@@ -179,6 +179,57 @@ int ceph_cls_break_lock(struct ceph_osd_client *osdc,
}
EXPORT_SYMBOL(ceph_cls_break_lock);
+int ceph_cls_set_cookie(struct ceph_osd_client *osdc,
+ struct ceph_object_id *oid,
+ struct ceph_object_locator *oloc,
+ char *lock_name, u8 type, char *old_cookie,
+ char *tag, char *new_cookie)
+{
+ int cookie_op_buf_size;
+ int name_len = strlen(lock_name);
+ int old_cookie_len = strlen(old_cookie);
+ int tag_len = strlen(tag);
+ int new_cookie_len = strlen(new_cookie);
+ void *p, *end;
+ struct page *cookie_op_page;
+ int ret;
+
+ cookie_op_buf_size = name_len + sizeof(__le32) +
+ old_cookie_len + sizeof(__le32) +
+ tag_len + sizeof(__le32) +
+ new_cookie_len + sizeof(__le32) +
+ sizeof(u8) + CEPH_ENCODING_START_BLK_LEN;
+ if (cookie_op_buf_size > PAGE_SIZE)
+ return -E2BIG;
+
+ cookie_op_page = alloc_page(GFP_NOIO);
+ if (!cookie_op_page)
+ return -ENOMEM;
+
+ p = page_address(cookie_op_page);
+ end = p + cookie_op_buf_size;
+
+ /* encode cls_lock_set_cookie_op struct */
+ ceph_start_encoding(&p, 1, 1,
+ cookie_op_buf_size - CEPH_ENCODING_START_BLK_LEN);
+ ceph_encode_string(&p, end, lock_name, name_len);
+ ceph_encode_8(&p, type);
+ ceph_encode_string(&p, end, old_cookie, old_cookie_len);
+ ceph_encode_string(&p, end, tag, tag_len);
+ ceph_encode_string(&p, end, new_cookie, new_cookie_len);
+
+ dout("%s lock_name %s type %d old_cookie %s tag %s new_cookie %s\n",
+ __func__, lock_name, type, old_cookie, tag, new_cookie);
+ ret = ceph_osdc_call(osdc, oid, oloc, "lock", "set_cookie",
+ CEPH_OSD_FLAG_WRITE, cookie_op_page,
+ cookie_op_buf_size, NULL, NULL);
+
+ dout("%s: status %d\n", __func__, ret);
+ __free_page(cookie_op_page);
+ return ret;
+}
+EXPORT_SYMBOL(ceph_cls_set_cookie);
+
void ceph_free_lockers(struct ceph_locker *lockers, u32 num_lockers)
{
int i;
diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c
index c62b2b029a6e..71ba13927b3d 100644
--- a/net/ceph/debugfs.c
+++ b/net/ceph/debugfs.c
@@ -62,7 +62,8 @@ static int osdmap_show(struct seq_file *s, void *p)
return 0;
down_read(&osdc->lock);
- seq_printf(s, "epoch %d flags 0x%x\n", map->epoch, map->flags);
+ seq_printf(s, "epoch %u barrier %u flags 0x%x\n", map->epoch,
+ osdc->epoch_barrier, map->flags);
for (n = rb_first(&map->pg_pools); n; n = rb_next(n)) {
struct ceph_pg_pool_info *pi =
@@ -177,9 +178,7 @@ static void dump_request(struct seq_file *s, struct ceph_osd_request *req)
seq_printf(s, "%llu\t", req->r_tid);
dump_target(s, &req->r_t);
- seq_printf(s, "\t%d\t%u'%llu", req->r_attempts,
- le32_to_cpu(req->r_replay_version.epoch),
- le64_to_cpu(req->r_replay_version.version));
+ seq_printf(s, "\t%d", req->r_attempts);
for (i = 0; i < req->r_num_ops; i++) {
struct ceph_osd_req_op *op = &req->r_ops[i];
diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index 5766a6c896c4..588a91930051 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -1174,8 +1174,8 @@ static struct page *ceph_msg_data_next(struct ceph_msg_data_cursor *cursor,
* Returns true if the result moves the cursor on to the next piece
* of the data item.
*/
-static bool ceph_msg_data_advance(struct ceph_msg_data_cursor *cursor,
- size_t bytes)
+static void ceph_msg_data_advance(struct ceph_msg_data_cursor *cursor,
+ size_t bytes)
{
bool new_piece;
@@ -1207,8 +1207,6 @@ static bool ceph_msg_data_advance(struct ceph_msg_data_cursor *cursor,
new_piece = true;
}
cursor->need_crc = new_piece;
-
- return new_piece;
}
static size_t sizeof_footer(struct ceph_connection *con)
@@ -1577,7 +1575,6 @@ static int write_partial_message_data(struct ceph_connection *con)
size_t page_offset;
size_t length;
bool last_piece;
- bool need_crc;
int ret;
page = ceph_msg_data_next(cursor, &page_offset, &length,
@@ -1592,7 +1589,7 @@ static int write_partial_message_data(struct ceph_connection *con)
}
if (do_datacrc && cursor->need_crc)
crc = ceph_crc32c_page(crc, page, page_offset, length);
- need_crc = ceph_msg_data_advance(cursor, (size_t)ret);
+ ceph_msg_data_advance(cursor, (size_t)ret);
}
dout("%s %p msg %p done\n", __func__, con, msg);
@@ -2231,10 +2228,18 @@ static void process_ack(struct ceph_connection *con)
struct ceph_msg *m;
u64 ack = le64_to_cpu(con->in_temp_ack);
u64 seq;
+ bool reconnect = (con->in_tag == CEPH_MSGR_TAG_SEQ);
+ struct list_head *list = reconnect ? &con->out_queue : &con->out_sent;
- while (!list_empty(&con->out_sent)) {
- m = list_first_entry(&con->out_sent, struct ceph_msg,
- list_head);
+ /*
+ * In the reconnect case, con_fault() has requeued messages
+ * in out_sent. We should cleanup old messages according to
+ * the reconnect seq.
+ */
+ while (!list_empty(list)) {
+ m = list_first_entry(list, struct ceph_msg, list_head);
+ if (reconnect && m->needs_out_seq)
+ break;
seq = le64_to_cpu(m->hdr.seq);
if (seq > ack)
break;
@@ -2243,6 +2248,7 @@ static void process_ack(struct ceph_connection *con)
m->ack_stamp = jiffies;
ceph_msg_remove(m);
}
+
prepare_read_tag(con);
}
@@ -2299,7 +2305,7 @@ static int read_partial_msg_data(struct ceph_connection *con)
if (do_datacrc)
crc = ceph_crc32c_page(crc, page, page_offset, ret);
- (void) ceph_msg_data_advance(cursor, (size_t)ret);
+ ceph_msg_data_advance(cursor, (size_t)ret);
}
if (do_datacrc)
con->in_data_crc = crc;
diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c
index 29a0ef351c5e..250f11f78609 100644
--- a/net/ceph/mon_client.c
+++ b/net/ceph/mon_client.c
@@ -43,15 +43,13 @@ struct ceph_monmap *ceph_monmap_decode(void *p, void *end)
int i, err = -EINVAL;
struct ceph_fsid fsid;
u32 epoch, num_mon;
- u16 version;
u32 len;
ceph_decode_32_safe(&p, end, len, bad);
ceph_decode_need(&p, end, len, bad);
dout("monmap_decode %p %p len %d\n", p, end, (int)(end-p));
-
- ceph_decode_16_safe(&p, end, version, bad);
+ p += sizeof(u16); /* skip version */
ceph_decode_need(&p, end, sizeof(fsid) + 2*sizeof(u32), bad);
ceph_decode_copy(&p, &fsid, sizeof(fsid));
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index 242d7c0d92f8..924f07c36ddb 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -961,6 +961,7 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
truncate_size, truncate_seq);
}
+ req->r_abort_on_full = true;
req->r_flags = flags;
req->r_base_oloc.pool = layout->pool_id;
req->r_base_oloc.pool_ns = ceph_try_get_string(layout->pool_ns);
@@ -1005,7 +1006,7 @@ static bool osd_registered(struct ceph_osd *osd)
*/
static void osd_init(struct ceph_osd *osd)
{
- atomic_set(&osd->o_ref, 1);
+ refcount_set(&osd->o_ref, 1);
RB_CLEAR_NODE(&osd->o_node);
osd->o_requests = RB_ROOT;
osd->o_linger_requests = RB_ROOT;
@@ -1050,9 +1051,9 @@ static struct ceph_osd *create_osd(struct ceph_osd_client *osdc, int onum)
static struct ceph_osd *get_osd(struct ceph_osd *osd)
{
- if (atomic_inc_not_zero(&osd->o_ref)) {
- dout("get_osd %p %d -> %d\n", osd, atomic_read(&osd->o_ref)-1,
- atomic_read(&osd->o_ref));
+ if (refcount_inc_not_zero(&osd->o_ref)) {
+ dout("get_osd %p %d -> %d\n", osd, refcount_read(&osd->o_ref)-1,
+ refcount_read(&osd->o_ref));
return osd;
} else {
dout("get_osd %p FAIL\n", osd);
@@ -1062,9 +1063,9 @@ static struct ceph_osd *get_osd(struct ceph_osd *osd)
static void put_osd(struct ceph_osd *osd)
{
- dout("put_osd %p %d -> %d\n", osd, atomic_read(&osd->o_ref),
- atomic_read(&osd->o_ref) - 1);
- if (atomic_dec_and_test(&osd->o_ref)) {
+ dout("put_osd %p %d -> %d\n", osd, refcount_read(&osd->o_ref),
+ refcount_read(&osd->o_ref) - 1);
+ if (refcount_dec_and_test(&osd->o_ref)) {
osd_cleanup(osd);
kfree(osd);
}
@@ -1297,8 +1298,9 @@ static bool target_should_be_paused(struct ceph_osd_client *osdc,
__pool_full(pi);
WARN_ON(pi->id != t->base_oloc.pool);
- return (t->flags & CEPH_OSD_FLAG_READ && pauserd) ||
- (t->flags & CEPH_OSD_FLAG_WRITE && pausewr);
+ return ((t->flags & CEPH_OSD_FLAG_READ) && pauserd) ||
+ ((t->flags & CEPH_OSD_FLAG_WRITE) && pausewr) ||
+ (osdc->osdmap->epoch < osdc->epoch_barrier);
}
enum calc_target_result {
@@ -1503,9 +1505,10 @@ static void encode_request(struct ceph_osd_request *req, struct ceph_msg *msg)
ceph_encode_32(&p, req->r_flags);
ceph_encode_timespec(p, &req->r_mtime);
p += sizeof(struct ceph_timespec);
- /* aka reassert_version */
- memcpy(p, &req->r_replay_version, sizeof(req->r_replay_version));
- p += sizeof(req->r_replay_version);
+
+ /* reassert_version */
+ memset(p, 0, sizeof(struct ceph_eversion));
+ p += sizeof(struct ceph_eversion);
/* oloc */
ceph_start_encoding(&p, 5, 4,
@@ -1626,6 +1629,7 @@ static void maybe_request_map(struct ceph_osd_client *osdc)
ceph_monc_renew_subs(&osdc->client->monc);
}
+static void complete_request(struct ceph_osd_request *req, int err);
static void send_map_check(struct ceph_osd_request *req);
static void __submit_request(struct ceph_osd_request *req, bool wrlocked)
@@ -1635,6 +1639,7 @@ static void __submit_request(struct ceph_osd_request *req, bool wrlocked)
enum calc_target_result ct_res;
bool need_send = false;
bool promoted = false;
+ bool need_abort = false;
WARN_ON(req->r_tid);
dout("%s req %p wrlocked %d\n", __func__, req, wrlocked);
@@ -1650,8 +1655,13 @@ again:
goto promote;
}
- if ((req->r_flags & CEPH_OSD_FLAG_WRITE) &&
- ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSEWR)) {
+ if (osdc->osdmap->epoch < osdc->epoch_barrier) {
+ dout("req %p epoch %u barrier %u\n", req, osdc->osdmap->epoch,
+ osdc->epoch_barrier);
+ req->r_t.paused = true;
+ maybe_request_map(osdc);
+ } else if ((req->r_flags & CEPH_OSD_FLAG_WRITE) &&
+ ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSEWR)) {
dout("req %p pausewr\n", req);
req->r_t.paused = true;
maybe_request_map(osdc);
@@ -1669,6 +1679,8 @@ again:
pr_warn_ratelimited("FULL or reached pool quota\n");
req->r_t.paused = true;
maybe_request_map(osdc);
+ if (req->r_abort_on_full)
+ need_abort = true;
} else if (!osd_homeless(osd)) {
need_send = true;
} else {
@@ -1685,6 +1697,8 @@ again:
link_request(osd, req);
if (need_send)
send_request(req);
+ else if (need_abort)
+ complete_request(req, -ENOSPC);
mutex_unlock(&osd->lock);
if (ct_res == CALC_TARGET_POOL_DNE)
@@ -1799,6 +1813,97 @@ static void abort_request(struct ceph_osd_request *req, int err)
complete_request(req, err);
}
+static void update_epoch_barrier(struct ceph_osd_client *osdc, u32 eb)
+{
+ if (likely(eb > osdc->epoch_barrier)) {
+ dout("updating epoch_barrier from %u to %u\n",
+ osdc->epoch_barrier, eb);
+ osdc->epoch_barrier = eb;
+ /* Request map if we're not to the barrier yet */
+ if (eb > osdc->osdmap->epoch)
+ maybe_request_map(osdc);
+ }
+}
+
+void ceph_osdc_update_epoch_barrier(struct ceph_osd_client *osdc, u32 eb)
+{
+ down_read(&osdc->lock);
+ if (unlikely(eb > osdc->epoch_barrier)) {
+ up_read(&osdc->lock);
+ down_write(&osdc->lock);
+ update_epoch_barrier(osdc, eb);
+ up_write(&osdc->lock);
+ } else {
+ up_read(&osdc->lock);
+ }
+}
+EXPORT_SYMBOL(ceph_osdc_update_epoch_barrier);
+
+/*
+ * Drop all pending requests that are stalled waiting on a full condition to
+ * clear, and complete them with ENOSPC as the return code. Set the
+ * osdc->epoch_barrier to the latest map epoch that we've seen if any were
+ * cancelled.
+ */
+static void ceph_osdc_abort_on_full(struct ceph_osd_client *osdc)
+{
+ struct rb_node *n;
+ bool victims = false;
+
+ dout("enter abort_on_full\n");
+
+ if (!ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) && !have_pool_full(osdc))
+ goto out;
+
+ /* Scan list and see if there is anything to abort */
+ for (n = rb_first(&osdc->osds); n; n = rb_next(n)) {
+ struct ceph_osd *osd = rb_entry(n, struct ceph_osd, o_node);
+ struct rb_node *m;
+
+ m = rb_first(&osd->o_requests);
+ while (m) {
+ struct ceph_osd_request *req = rb_entry(m,
+ struct ceph_osd_request, r_node);
+ m = rb_next(m);
+
+ if (req->r_abort_on_full) {
+ victims = true;
+ break;
+ }
+ }
+ if (victims)
+ break;
+ }
+
+ if (!victims)
+ goto out;
+
+ /*
+ * Update the barrier to current epoch if it's behind that point,
+ * since we know we have some calls to be aborted in the tree.
+ */
+ update_epoch_barrier(osdc, osdc->osdmap->epoch);
+
+ for (n = rb_first(&osdc->osds); n; n = rb_next(n)) {
+ struct ceph_osd *osd = rb_entry(n, struct ceph_osd, o_node);
+ struct rb_node *m;
+
+ m = rb_first(&osd->o_requests);
+ while (m) {
+ struct ceph_osd_request *req = rb_entry(m,
+ struct ceph_osd_request, r_node);
+ m = rb_next(m);
+
+ if (req->r_abort_on_full &&
+ (ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) ||
+ pool_full(osdc, req->r_t.target_oloc.pool)))
+ abort_request(req, -ENOSPC);
+ }
+ }
+out:
+ dout("return abort_on_full barrier=%u\n", osdc->epoch_barrier);
+}
+
static void check_pool_dne(struct ceph_osd_request *req)
{
struct ceph_osd_client *osdc = req->r_osdc;
@@ -3252,11 +3357,13 @@ done:
pausewr = ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSEWR) ||
ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) ||
have_pool_full(osdc);
- if (was_pauserd || was_pausewr || pauserd || pausewr)
+ if (was_pauserd || was_pausewr || pauserd || pausewr ||
+ osdc->osdmap->epoch < osdc->epoch_barrier)
maybe_request_map(osdc);
kick_requests(osdc, &need_resend, &need_resend_linger);
+ ceph_osdc_abort_on_full(osdc);
ceph_monc_got_map(&osdc->client->monc, CEPH_SUB_OSDMAP,
osdc->osdmap->epoch);
up_write(&osdc->lock);
@@ -4126,7 +4233,7 @@ void ceph_osdc_stop(struct ceph_osd_client *osdc)
close_osd(osd);
}
up_write(&osdc->lock);
- WARN_ON(atomic_read(&osdc->homeless_osd.o_ref) != 1);
+ WARN_ON(refcount_read(&osdc->homeless_osd.o_ref) != 1);
osd_cleanup(&osdc->homeless_osd);
WARN_ON(!list_empty(&osdc->osd_lru));
diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c
index ffe9e904d4d1..55e3a477f92d 100644
--- a/net/ceph/osdmap.c
+++ b/net/ceph/osdmap.c
@@ -317,6 +317,7 @@ static struct crush_map *crush_decode(void *pbyval, void *end)
u32 yes;
struct crush_rule *r;
+ err = -EINVAL;
ceph_decode_32_safe(p, end, yes, bad);
if (!yes) {
dout("crush_decode NO rule %d off %x %p to %p\n",
diff --git a/net/ceph/pagelist.c b/net/ceph/pagelist.c
index 6864007e64fc..ce09f73be759 100644
--- a/net/ceph/pagelist.c
+++ b/net/ceph/pagelist.c
@@ -16,7 +16,7 @@ static void ceph_pagelist_unmap_tail(struct ceph_pagelist *pl)
void ceph_pagelist_release(struct ceph_pagelist *pl)
{
- if (!atomic_dec_and_test(&pl->refcnt))
+ if (!refcount_dec_and_test(&pl->refcnt))
return;
ceph_pagelist_unmap_tail(pl);
while (!list_empty(&pl->head)) {
diff --git a/net/ceph/snapshot.c b/net/ceph/snapshot.c
index 705414e78ae0..e14a5d038656 100644
--- a/net/ceph/snapshot.c
+++ b/net/ceph/snapshot.c
@@ -49,7 +49,7 @@ struct ceph_snap_context *ceph_create_snap_context(u32 snap_count,
if (!snapc)
return NULL;
- atomic_set(&snapc->nref, 1);
+ refcount_set(&snapc->nref, 1);
snapc->num_snaps = snap_count;
return snapc;
@@ -59,7 +59,7 @@ EXPORT_SYMBOL(ceph_create_snap_context);
struct ceph_snap_context *ceph_get_snap_context(struct ceph_snap_context *sc)
{
if (sc)
- atomic_inc(&sc->nref);
+ refcount_inc(&sc->nref);
return sc;
}
EXPORT_SYMBOL(ceph_get_snap_context);
@@ -68,7 +68,7 @@ void ceph_put_snap_context(struct ceph_snap_context *sc)
{
if (!sc)
return;
- if (atomic_dec_and_test(&sc->nref)) {
+ if (refcount_dec_and_test(&sc->nref)) {
/*printk(" deleting snap_context %p\n", sc);*/
kfree(sc);
}
diff --git a/net/core/datagram.c b/net/core/datagram.c
index db1866f2ffcf..34678828e2bb 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -68,7 +68,7 @@ static inline int connection_based(struct sock *sk)
return sk->sk_type == SOCK_SEQPACKET || sk->sk_type == SOCK_STREAM;
}
-static int receiver_wake_function(wait_queue_t *wait, unsigned int mode, int sync,
+static int receiver_wake_function(wait_queue_entry_t *wait, unsigned int mode, int sync,
void *key)
{
unsigned long bits = (unsigned long)key;
diff --git a/net/core/dev.c b/net/core/dev.c
index 96cf83da0d66..416137c64bf8 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1253,8 +1253,9 @@ int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
if (!new_ifalias)
return -ENOMEM;
dev->ifalias = new_ifalias;
+ memcpy(dev->ifalias, alias, len);
+ dev->ifalias[len] = 0;
- strlcpy(dev->ifalias, alias, len+1);
return len;
}
@@ -4766,6 +4767,13 @@ struct packet_offload *gro_find_complete_by_type(__be16 type)
}
EXPORT_SYMBOL(gro_find_complete_by_type);
+static void napi_skb_free_stolen_head(struct sk_buff *skb)
+{
+ skb_dst_drop(skb);
+ secpath_reset(skb);
+ kmem_cache_free(skbuff_head_cache, skb);
+}
+
static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
{
switch (ret) {
@@ -4779,13 +4787,10 @@ static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
break;
case GRO_MERGED_FREE:
- if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD) {
- skb_dst_drop(skb);
- secpath_reset(skb);
- kmem_cache_free(skbuff_head_cache, skb);
- } else {
+ if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
+ napi_skb_free_stolen_head(skb);
+ else
__kfree_skb(skb);
- }
break;
case GRO_HELD:
@@ -4857,10 +4862,16 @@ static gro_result_t napi_frags_finish(struct napi_struct *napi,
break;
case GRO_DROP:
- case GRO_MERGED_FREE:
napi_reuse_skb(napi, skb);
break;
+ case GRO_MERGED_FREE:
+ if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
+ napi_skb_free_stolen_head(skb);
+ else
+ napi_reuse_skb(napi, skb);
+ break;
+
case GRO_MERGED:
case GRO_CONSUMED:
break;
@@ -4948,6 +4959,19 @@ __sum16 __skb_gro_checksum_complete(struct sk_buff *skb)
}
EXPORT_SYMBOL(__skb_gro_checksum_complete);
+static void net_rps_send_ipi(struct softnet_data *remsd)
+{
+#ifdef CONFIG_RPS
+ while (remsd) {
+ struct softnet_data *next = remsd->rps_ipi_next;
+
+ if (cpu_online(remsd->cpu))
+ smp_call_function_single_async(remsd->cpu, &remsd->csd);
+ remsd = next;
+ }
+#endif
+}
+
/*
* net_rps_action_and_irq_enable sends any pending IPI's for rps.
* Note: called with local irq disabled, but exits with local irq enabled.
@@ -4963,14 +4987,7 @@ static void net_rps_action_and_irq_enable(struct softnet_data *sd)
local_irq_enable();
/* Send pending IPI's to kick RPS processing on remote cpus. */
- while (remsd) {
- struct softnet_data *next = remsd->rps_ipi_next;
-
- if (cpu_online(remsd->cpu))
- smp_call_function_single_async(remsd->cpu,
- &remsd->csd);
- remsd = next;
- }
+ net_rps_send_ipi(remsd);
} else
#endif
local_irq_enable();
@@ -5199,8 +5216,6 @@ static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock)
if (rc == BUSY_POLL_BUDGET)
__napi_schedule(napi);
local_bh_enable();
- if (local_softirq_pending())
- do_softirq();
}
void napi_busy_loop(unsigned int napi_id,
@@ -6852,6 +6867,32 @@ int dev_change_proto_down(struct net_device *dev, bool proto_down)
}
EXPORT_SYMBOL(dev_change_proto_down);
+bool __dev_xdp_attached(struct net_device *dev, xdp_op_t xdp_op)
+{
+ struct netdev_xdp xdp;
+
+ memset(&xdp, 0, sizeof(xdp));
+ xdp.command = XDP_QUERY_PROG;
+
+ /* Query must always succeed. */
+ WARN_ON(xdp_op(dev, &xdp) < 0);
+ return xdp.prog_attached;
+}
+
+static int dev_xdp_install(struct net_device *dev, xdp_op_t xdp_op,
+ struct netlink_ext_ack *extack,
+ struct bpf_prog *prog)
+{
+ struct netdev_xdp xdp;
+
+ memset(&xdp, 0, sizeof(xdp));
+ xdp.command = XDP_SETUP_PROG;
+ xdp.extack = extack;
+ xdp.prog = prog;
+
+ return xdp_op(dev, &xdp);
+}
+
/**
* dev_change_xdp_fd - set or clear a bpf program for a device rx path
* @dev: device
@@ -6864,41 +6905,34 @@ EXPORT_SYMBOL(dev_change_proto_down);
int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack,
int fd, u32 flags)
{
- int (*xdp_op)(struct net_device *dev, struct netdev_xdp *xdp);
const struct net_device_ops *ops = dev->netdev_ops;
struct bpf_prog *prog = NULL;
- struct netdev_xdp xdp;
+ xdp_op_t xdp_op, xdp_chk;
int err;
ASSERT_RTNL();
- xdp_op = ops->ndo_xdp;
+ xdp_op = xdp_chk = ops->ndo_xdp;
+ if (!xdp_op && (flags & XDP_FLAGS_DRV_MODE))
+ return -EOPNOTSUPP;
if (!xdp_op || (flags & XDP_FLAGS_SKB_MODE))
xdp_op = generic_xdp_install;
+ if (xdp_op == xdp_chk)
+ xdp_chk = generic_xdp_install;
if (fd >= 0) {
- if (flags & XDP_FLAGS_UPDATE_IF_NOEXIST) {
- memset(&xdp, 0, sizeof(xdp));
- xdp.command = XDP_QUERY_PROG;
-
- err = xdp_op(dev, &xdp);
- if (err < 0)
- return err;
- if (xdp.prog_attached)
- return -EBUSY;
- }
+ if (xdp_chk && __dev_xdp_attached(dev, xdp_chk))
+ return -EEXIST;
+ if ((flags & XDP_FLAGS_UPDATE_IF_NOEXIST) &&
+ __dev_xdp_attached(dev, xdp_op))
+ return -EBUSY;
prog = bpf_prog_get_type(fd, BPF_PROG_TYPE_XDP);
if (IS_ERR(prog))
return PTR_ERR(prog);
}
- memset(&xdp, 0, sizeof(xdp));
- xdp.command = XDP_SETUP_PROG;
- xdp.extack = extack;
- xdp.prog = prog;
-
- err = xdp_op(dev, &xdp);
+ err = dev_xdp_install(dev, xdp_op, extack, prog);
if (err < 0 && prog)
bpf_prog_put(prog);
@@ -7482,6 +7516,8 @@ out:
err_uninit:
if (dev->netdev_ops->ndo_uninit)
dev->netdev_ops->ndo_uninit(dev);
+ if (dev->priv_destructor)
+ dev->priv_destructor(dev);
goto out;
}
EXPORT_SYMBOL(register_netdevice);
@@ -7689,8 +7725,10 @@ void netdev_run_todo(void)
WARN_ON(rcu_access_pointer(dev->ip6_ptr));
WARN_ON(dev->dn_ptr);
- if (dev->destructor)
- dev->destructor(dev);
+ if (dev->priv_destructor)
+ dev->priv_destructor(dev);
+ if (dev->needs_free_netdev)
+ free_netdev(dev);
/* Report a network device has been unregistered */
rtnl_lock();
@@ -7755,9 +7793,9 @@ struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
} else {
netdev_stats_to_stats64(storage, &dev->stats);
}
- storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
- storage->tx_dropped += atomic_long_read(&dev->tx_dropped);
- storage->rx_nohandler += atomic_long_read(&dev->rx_nohandler);
+ storage->rx_dropped += (unsigned long)atomic_long_read(&dev->rx_dropped);
+ storage->tx_dropped += (unsigned long)atomic_long_read(&dev->tx_dropped);
+ storage->rx_nohandler += (unsigned long)atomic_long_read(&dev->rx_nohandler);
return storage;
}
EXPORT_SYMBOL(dev_get_stats);
@@ -8173,7 +8211,7 @@ static int dev_cpu_dead(unsigned int oldcpu)
struct sk_buff **list_skb;
struct sk_buff *skb;
unsigned int cpu;
- struct softnet_data *sd, *oldsd;
+ struct softnet_data *sd, *oldsd, *remsd = NULL;
local_irq_disable();
cpu = smp_processor_id();
@@ -8214,6 +8252,13 @@ static int dev_cpu_dead(unsigned int oldcpu)
raise_softirq_irqoff(NET_TX_SOFTIRQ);
local_irq_enable();
+#ifdef CONFIG_RPS
+ remsd = oldsd->rps_ipi_list;
+ oldsd->rps_ipi_list = NULL;
+#endif
+ /* send out pending IPI's on offline CPU */
+ net_rps_send_ipi(remsd);
+
/* Process offline CPU's input_pkt_queue */
while ((skb = __skb_dequeue(&oldsd->process_queue))) {
netif_rx_ni(skb);
diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c
index b94b1d293506..27fad31784a8 100644
--- a/net/core/dev_ioctl.c
+++ b/net/core/dev_ioctl.c
@@ -410,6 +410,22 @@ int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
if (cmd == SIOCGIFNAME)
return dev_ifname(net, (struct ifreq __user *)arg);
+ /*
+ * Take care of Wireless Extensions. Unfortunately struct iwreq
+ * isn't a proper subset of struct ifreq (it's 8 byte shorter)
+ * so we need to treat it specially, otherwise applications may
+ * fault if the struct they're passing happens to land at the
+ * end of a mapped page.
+ */
+ if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST) {
+ struct iwreq iwr;
+
+ if (copy_from_user(&iwr, arg, sizeof(iwr)))
+ return -EFAULT;
+
+ return wext_handle_ioctl(net, &iwr, cmd, arg);
+ }
+
if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
return -EFAULT;
@@ -559,9 +575,6 @@ int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
ret = -EFAULT;
return ret;
}
- /* Take care of Wireless Extensions */
- if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST)
- return wext_handle_ioctl(net, &ifr, cmd, arg);
return -ENOTTY;
}
}
diff --git a/net/core/devlink.c b/net/core/devlink.c
index b0b87a292e7c..a0adfc31a3fe 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -1680,8 +1680,10 @@ start_again:
hdr = genlmsg_put(skb, info->snd_portid, info->snd_seq,
&devlink_nl_family, NLM_F_MULTI, cmd);
- if (!hdr)
+ if (!hdr) {
+ nlmsg_free(skb);
return -EMSGSIZE;
+ }
if (devlink_nl_put_handle(skb, devlink))
goto nla_put_failure;
@@ -2098,8 +2100,10 @@ start_again:
hdr = genlmsg_put(skb, info->snd_portid, info->snd_seq,
&devlink_nl_family, NLM_F_MULTI, cmd);
- if (!hdr)
+ if (!hdr) {
+ nlmsg_free(skb);
return -EMSGSIZE;
+ }
if (devlink_nl_put_handle(skb, devlink))
goto nla_put_failure;
diff --git a/net/core/dst.c b/net/core/dst.c
index 960e503b5a52..13ba4a090c41 100644
--- a/net/core/dst.c
+++ b/net/core/dst.c
@@ -151,13 +151,13 @@ int dst_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
}
EXPORT_SYMBOL(dst_discard_out);
-const u32 dst_default_metrics[RTAX_MAX + 1] = {
+const struct dst_metrics dst_default_metrics = {
/* This initializer is needed to force linker to place this variable
* into const section. Otherwise it might end into bss section.
* We really want to avoid false sharing on this variable, and catch
* any writes on it.
*/
- [RTAX_MAX] = 0xdeadbeef,
+ .refcnt = ATOMIC_INIT(1),
};
void dst_init(struct dst_entry *dst, struct dst_ops *ops,
@@ -169,7 +169,7 @@ void dst_init(struct dst_entry *dst, struct dst_ops *ops,
if (dev)
dev_hold(dev);
dst->ops = ops;
- dst_init_metrics(dst, dst_default_metrics, true);
+ dst_init_metrics(dst, dst_default_metrics.metrics, true);
dst->expires = 0UL;
dst->path = dst;
dst->from = NULL;
@@ -314,25 +314,30 @@ EXPORT_SYMBOL(dst_release);
u32 *dst_cow_metrics_generic(struct dst_entry *dst, unsigned long old)
{
- u32 *p = kmalloc(sizeof(u32) * RTAX_MAX, GFP_ATOMIC);
+ struct dst_metrics *p = kmalloc(sizeof(*p), GFP_ATOMIC);
if (p) {
- u32 *old_p = __DST_METRICS_PTR(old);
+ struct dst_metrics *old_p = (struct dst_metrics *)__DST_METRICS_PTR(old);
unsigned long prev, new;
- memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
+ atomic_set(&p->refcnt, 1);
+ memcpy(p->metrics, old_p->metrics, sizeof(p->metrics));
new = (unsigned long) p;
prev = cmpxchg(&dst->_metrics, old, new);
if (prev != old) {
kfree(p);
- p = __DST_METRICS_PTR(prev);
+ p = (struct dst_metrics *)__DST_METRICS_PTR(prev);
if (prev & DST_METRICS_READ_ONLY)
p = NULL;
+ } else if (prev & DST_METRICS_REFCOUNTED) {
+ if (atomic_dec_and_test(&old_p->refcnt))
+ kfree(old_p);
}
}
- return p;
+ BUILD_BUG_ON(offsetof(struct dst_metrics, metrics) != 0);
+ return (u32 *)p;
}
EXPORT_SYMBOL(dst_cow_metrics_generic);
@@ -341,7 +346,7 @@ void __dst_destroy_metrics_generic(struct dst_entry *dst, unsigned long old)
{
unsigned long prev, new;
- new = ((unsigned long) dst_default_metrics) | DST_METRICS_READ_ONLY;
+ new = ((unsigned long) &dst_default_metrics) | DST_METRICS_READ_ONLY;
prev = cmpxchg(&dst->_metrics, old, new);
if (prev == old)
kfree(__DST_METRICS_PTR(old));
@@ -464,6 +469,20 @@ static int dst_dev_event(struct notifier_block *this, unsigned long event,
spin_lock_bh(&dst_garbage.lock);
dst = dst_garbage.list;
dst_garbage.list = NULL;
+ /* The code in dst_ifdown places a hold on the loopback device.
+ * If the gc entry processing is set to expire after a lengthy
+ * interval, this hold can cause netdev_wait_allrefs() to hang
+ * out and wait for a long time -- until the the loopback
+ * interface is released. If we're really unlucky, it'll emit
+ * pr_emerg messages to console too. Reset the interval here,
+ * so dst cleanups occur in a more timely fashion.
+ */
+ if (dst_garbage.timer_inc > DST_GC_INC) {
+ dst_garbage.timer_inc = DST_GC_INC;
+ dst_garbage.timer_expires = DST_GC_MIN;
+ mod_delayed_work(system_wq, &dst_gc_work,
+ dst_garbage.timer_expires);
+ }
spin_unlock_bh(&dst_garbage.lock);
if (last)
diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c
index f21c4d3aeae0..3bba291c6c32 100644
--- a/net/core/fib_rules.c
+++ b/net/core/fib_rules.c
@@ -568,7 +568,7 @@ int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr *nlh,
struct net *net = sock_net(skb->sk);
struct fib_rule_hdr *frh = nlmsg_data(nlh);
struct fib_rules_ops *ops = NULL;
- struct fib_rule *rule, *tmp;
+ struct fib_rule *rule, *r;
struct nlattr *tb[FRA_MAX+1];
struct fib_kuid_range range;
int err = -EINVAL;
@@ -668,16 +668,23 @@ int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr *nlh,
/*
* Check if this rule is a target to any of them. If so,
+ * adjust to the next one with the same preference or
* disable them. As this operation is eventually very
- * expensive, it is only performed if goto rules have
- * actually been added.
+ * expensive, it is only performed if goto rules, except
+ * current if it is goto rule, have actually been added.
*/
if (ops->nr_goto_rules > 0) {
- list_for_each_entry(tmp, &ops->rules_list, list) {
- if (rtnl_dereference(tmp->ctarget) == rule) {
- RCU_INIT_POINTER(tmp->ctarget, NULL);
+ struct fib_rule *n;
+
+ n = list_next_entry(rule, list);
+ if (&n->list == &ops->rules_list || n->pref != rule->pref)
+ n = NULL;
+ list_for_each_entry(r, &ops->rules_list, list) {
+ if (rtnl_dereference(r->ctarget) != rule)
+ continue;
+ rcu_assign_pointer(r->ctarget, n);
+ if (!n)
ops->unresolved_rules++;
- }
}
}
diff --git a/net/core/filter.c b/net/core/filter.c
index a253a6197e6b..a6bb95fa87b2 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -2281,6 +2281,7 @@ bool bpf_helper_changes_pkt_data(void *func)
func == bpf_skb_change_head ||
func == bpf_skb_change_tail ||
func == bpf_skb_pull_data ||
+ func == bpf_clone_redirect ||
func == bpf_l3_csum_replace ||
func == bpf_l4_csum_replace ||
func == bpf_xdp_adjust_head)
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index 58b0bcc125b5..d274f81fcc2c 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -1132,10 +1132,6 @@ int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new,
lladdr = neigh->ha;
}
- if (new & NUD_CONNECTED)
- neigh->confirmed = jiffies;
- neigh->updated = jiffies;
-
/* If entry was valid and address is not changed,
do not change entry state, if new one is STALE.
*/
@@ -1157,6 +1153,16 @@ int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new,
}
}
+ /* Update timestamps only once we know we will make a change to the
+ * neighbour entry. Otherwise we risk to move the locktime window with
+ * noop updates and ignore relevant ARP updates.
+ */
+ if (new != old || lladdr != neigh->ha) {
+ if (new & NUD_CONNECTED)
+ neigh->confirmed = jiffies;
+ neigh->updated = jiffies;
+ }
+
if (new != old) {
neigh_del_timer(neigh);
if (new & NUD_PROBE)
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index 1934efd4a9d4..26bbfababff2 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -315,6 +315,25 @@ out_undo:
goto out;
}
+static int __net_init net_defaults_init_net(struct net *net)
+{
+ net->core.sysctl_somaxconn = SOMAXCONN;
+ return 0;
+}
+
+static struct pernet_operations net_defaults_ops = {
+ .init = net_defaults_init_net,
+};
+
+static __init int net_defaults_init(void)
+{
+ if (register_pernet_subsys(&net_defaults_ops))
+ panic("Cannot initialize net default settings");
+
+ return 0;
+}
+
+core_initcall(net_defaults_init);
#ifdef CONFIG_NET_NS
static struct ucounts *inc_net_namespaces(struct user_namespace *ns)
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index bcb0f610ee42..467a2f4510a7 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -899,8 +899,7 @@ static size_t rtnl_port_size(const struct net_device *dev,
static size_t rtnl_xdp_size(void)
{
size_t xdp_size = nla_total_size(0) + /* nest IFLA_XDP */
- nla_total_size(1) + /* XDP_ATTACHED */
- nla_total_size(4); /* XDP_FLAGS */
+ nla_total_size(1); /* XDP_ATTACHED */
return xdp_size;
}
@@ -932,6 +931,7 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev,
+ nla_total_size(1) /* IFLA_LINKMODE */
+ nla_total_size(4) /* IFLA_CARRIER_CHANGES */
+ nla_total_size(4) /* IFLA_LINK_NETNSID */
+ + nla_total_size(4) /* IFLA_GROUP */
+ nla_total_size(ext_filter_mask
& RTEXT_FILTER_VF ? 4 : 0) /* IFLA_NUM_VF */
+ rtnl_vfinfo_size(dev, ext_filter_mask) /* IFLA_VFINFO_LIST */
@@ -1125,6 +1125,8 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb,
struct ifla_vf_mac vf_mac;
struct ifla_vf_info ivi;
+ memset(&ivi, 0, sizeof(ivi));
+
/* Not all SR-IOV capable drivers support the
* spoofcheck and "RSS query enable" query. Preset to
* -1 so the user space tool can detect that the driver
@@ -1133,7 +1135,6 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb,
ivi.spoofchk = -1;
ivi.rss_query_en = -1;
ivi.trusted = -1;
- memset(ivi.mac, 0, sizeof(ivi.mac));
/* The default value for VF link state is "auto"
* IFLA_VF_LINK_STATE_AUTO which equals zero
*/
@@ -1247,37 +1248,34 @@ static int rtnl_fill_link_ifmap(struct sk_buff *skb, struct net_device *dev)
return 0;
}
+static u8 rtnl_xdp_attached_mode(struct net_device *dev)
+{
+ const struct net_device_ops *ops = dev->netdev_ops;
+
+ ASSERT_RTNL();
+
+ if (rcu_access_pointer(dev->xdp_prog))
+ return XDP_ATTACHED_SKB;
+ if (ops->ndo_xdp && __dev_xdp_attached(dev, ops->ndo_xdp))
+ return XDP_ATTACHED_DRV;
+
+ return XDP_ATTACHED_NONE;
+}
+
static int rtnl_xdp_fill(struct sk_buff *skb, struct net_device *dev)
{
struct nlattr *xdp;
- u32 xdp_flags = 0;
- u8 val = 0;
int err;
xdp = nla_nest_start(skb, IFLA_XDP);
if (!xdp)
return -EMSGSIZE;
- if (rcu_access_pointer(dev->xdp_prog)) {
- xdp_flags = XDP_FLAGS_SKB_MODE;
- val = 1;
- } else if (dev->netdev_ops->ndo_xdp) {
- struct netdev_xdp xdp_op = {};
-
- xdp_op.command = XDP_QUERY_PROG;
- err = dev->netdev_ops->ndo_xdp(dev, &xdp_op);
- if (err)
- goto err_cancel;
- val = xdp_op.prog_attached;
- }
- err = nla_put_u8(skb, IFLA_XDP_ATTACHED, val);
+
+ err = nla_put_u8(skb, IFLA_XDP_ATTACHED,
+ rtnl_xdp_attached_mode(dev));
if (err)
goto err_cancel;
- if (xdp_flags) {
- err = nla_put_u32(skb, IFLA_XDP_FLAGS, xdp_flags);
- if (err)
- goto err_cancel;
- }
nla_nest_end(skb, xdp);
return 0;
@@ -1471,6 +1469,7 @@ static const struct nla_policy ifla_policy[IFLA_MAX+1] = {
[IFLA_LINK_NETNSID] = { .type = NLA_S32 },
[IFLA_PROTO_DOWN] = { .type = NLA_U8 },
[IFLA_XDP] = { .type = NLA_NESTED },
+ [IFLA_GROUP] = { .type = NLA_U32 },
};
static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = {
@@ -1631,13 +1630,13 @@ static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb)
cb->nlh->nlmsg_seq, 0,
flags,
ext_filter_mask);
- /* If we ran out of room on the first message,
- * we're in trouble
- */
- WARN_ON((err == -EMSGSIZE) && (skb->len == 0));
- if (err < 0)
- goto out;
+ if (err < 0) {
+ if (likely(skb->len))
+ goto out;
+
+ goto out_err;
+ }
nl_dump_check_consistent(cb, nlmsg_hdr(skb));
cont:
@@ -1645,10 +1644,12 @@ cont:
}
}
out:
+ err = skb->len;
+out_err:
cb->args[1] = idx;
cb->args[0] = h;
- return skb->len;
+ return err;
}
int rtnl_nla_parse_ifla(struct nlattr **tb, const struct nlattr *head, int len,
@@ -2199,6 +2200,11 @@ static int do_setlink(const struct sk_buff *skb,
err = -EINVAL;
goto errout;
}
+ if ((xdp_flags & XDP_FLAGS_SKB_MODE) &&
+ (xdp_flags & XDP_FLAGS_DRV_MODE)) {
+ err = -EINVAL;
+ goto errout;
+ }
}
if (xdp[IFLA_XDP_FD]) {
@@ -3228,8 +3234,11 @@ static int rtnl_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb)
int err = 0;
int fidx = 0;
- if (nlmsg_parse(cb->nlh, sizeof(struct ifinfomsg), tb,
- IFLA_MAX, ifla_policy, NULL) == 0) {
+ err = nlmsg_parse(cb->nlh, sizeof(struct ifinfomsg), tb,
+ IFLA_MAX, ifla_policy, NULL);
+ if (err < 0) {
+ return -EINVAL;
+ } else if (err == 0) {
if (tb[IFLA_MASTER])
br_idx = nla_get_u32(tb[IFLA_MASTER]);
}
@@ -3452,8 +3461,12 @@ static int rtnl_bridge_getlink(struct sk_buff *skb, struct netlink_callback *cb)
err = br_dev->netdev_ops->ndo_bridge_getlink(
skb, portid, seq, dev,
filter_mask, NLM_F_MULTI);
- if (err < 0 && err != -EOPNOTSUPP)
- break;
+ if (err < 0 && err != -EOPNOTSUPP) {
+ if (likely(skb->len))
+ break;
+
+ goto out_err;
+ }
}
idx++;
}
@@ -3464,16 +3477,22 @@ static int rtnl_bridge_getlink(struct sk_buff *skb, struct netlink_callback *cb)
seq, dev,
filter_mask,
NLM_F_MULTI);
- if (err < 0 && err != -EOPNOTSUPP)
- break;
+ if (err < 0 && err != -EOPNOTSUPP) {
+ if (likely(skb->len))
+ break;
+
+ goto out_err;
+ }
}
idx++;
}
}
+ err = skb->len;
+out_err:
rcu_read_unlock();
cb->args[0] = idx;
- return skb->len;
+ return err;
}
static inline size_t bridge_nlmsg_size(void)
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 346d3e85dfbc..b1be7c01efe2 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -3754,8 +3754,11 @@ struct sk_buff *sock_dequeue_err_skb(struct sock *sk)
spin_lock_irqsave(&q->lock, flags);
skb = __skb_dequeue(q);
- if (skb && (skb_next = skb_peek(q)))
+ if (skb && (skb_next = skb_peek(q))) {
icmp_next = is_icmp_err_skb(skb_next);
+ if (icmp_next)
+ sk->sk_err = SKB_EXT_ERR(skb_next)->ee.ee_origin;
+ }
spin_unlock_irqrestore(&q->lock, flags);
if (is_icmp_err_skb(skb) && !icmp_next)
diff --git a/net/core/sock.c b/net/core/sock.c
index 79c6aee6af9b..727f924b7f91 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -139,10 +139,7 @@
#include <trace/events/sock.h>
-#ifdef CONFIG_INET
#include <net/tcp.h>
-#endif
-
#include <net/busy_poll.h>
static DEFINE_MUTEX(proto_list_mutex);
@@ -1803,28 +1800,24 @@ EXPORT_SYMBOL(skb_set_owner_w);
* delay queue. We want to allow the owner socket to send more
* packets, as if they were already TX completed by a typical driver.
* But we also want to keep skb->sk set because some packet schedulers
- * rely on it (sch_fq for example). So we set skb->truesize to a small
- * amount (1) and decrease sk_wmem_alloc accordingly.
+ * rely on it (sch_fq for example).
*/
void skb_orphan_partial(struct sk_buff *skb)
{
- /* If this skb is a TCP pure ACK or already went here,
- * we have nothing to do. 2 is already a very small truesize.
- */
- if (skb->truesize <= 2)
+ if (skb_is_tcp_pure_ack(skb))
return;
- /* TCP stack sets skb->ooo_okay based on sk_wmem_alloc,
- * so we do not completely orphan skb, but transfert all
- * accounted bytes but one, to avoid unexpected reorders.
- */
if (skb->destructor == sock_wfree
#ifdef CONFIG_INET
|| skb->destructor == tcp_wfree
#endif
) {
- atomic_sub(skb->truesize - 1, &skb->sk->sk_wmem_alloc);
- skb->truesize = 1;
+ struct sock *sk = skb->sk;
+
+ if (atomic_inc_not_zero(&sk->sk_refcnt)) {
+ atomic_sub(skb->truesize, &sk->sk_wmem_alloc);
+ skb->destructor = sock_efree;
+ }
} else {
skb_orphan(skb);
}
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index ea23254b2457..b7cd9aafe99e 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -479,8 +479,6 @@ static __net_init int sysctl_core_net_init(struct net *net)
{
struct ctl_table *tbl;
- net->core.sysctl_somaxconn = SOMAXCONN;
-
tbl = netns_core_table;
if (!net_eq(net, &init_net)) {
tbl = kmemdup(tbl, sizeof(netns_core_table), GFP_KERNEL);
diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
index b99168b0fabf..f75482bdee9a 100644
--- a/net/dccp/ipv4.c
+++ b/net/dccp/ipv4.c
@@ -951,7 +951,7 @@ static struct proto dccp_v4_prot = {
.orphan_count = &dccp_orphan_count,
.max_header = MAX_DCCP_HEADER,
.obj_size = sizeof(struct dccp_sock),
- .slab_flags = SLAB_DESTROY_BY_RCU,
+ .slab_flags = SLAB_TYPESAFE_BY_RCU,
.rsk_prot = &dccp_request_sock_ops,
.twsk_prot = &dccp_timewait_sock_ops,
.h.hashinfo = &dccp_hashinfo,
diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c
index d9b6a4e403e7..992621172220 100644
--- a/net/dccp/ipv6.c
+++ b/net/dccp/ipv6.c
@@ -426,6 +426,9 @@ static struct sock *dccp_v6_request_recv_sock(const struct sock *sk,
newsk->sk_backlog_rcv = dccp_v4_do_rcv;
newnp->pktoptions = NULL;
newnp->opt = NULL;
+ newnp->ipv6_mc_list = NULL;
+ newnp->ipv6_ac_list = NULL;
+ newnp->ipv6_fl_list = NULL;
newnp->mcast_oif = inet6_iif(skb);
newnp->mcast_hops = ipv6_hdr(skb)->hop_limit;
@@ -490,6 +493,9 @@ static struct sock *dccp_v6_request_recv_sock(const struct sock *sk,
/* Clone RX bits */
newnp->rxopt.all = np->rxopt.all;
+ newnp->ipv6_mc_list = NULL;
+ newnp->ipv6_ac_list = NULL;
+ newnp->ipv6_fl_list = NULL;
newnp->pktoptions = NULL;
newnp->opt = NULL;
newnp->mcast_oif = inet6_iif(skb);
@@ -1014,7 +1020,7 @@ static struct proto dccp_v6_prot = {
.orphan_count = &dccp_orphan_count,
.max_header = MAX_DCCP_HEADER,
.obj_size = sizeof(struct dccp6_sock),
- .slab_flags = SLAB_DESTROY_BY_RCU,
+ .slab_flags = SLAB_TYPESAFE_BY_RCU,
.rsk_prot = &dccp6_request_sock_ops,
.twsk_prot = &dccp6_timewait_sock_ops,
.h.hashinfo = &dccp_hashinfo,
diff --git a/net/decnet/dn_route.c b/net/decnet/dn_route.c
index 4b9518a0d248..6f95612b4d32 100644
--- a/net/decnet/dn_route.c
+++ b/net/decnet/dn_route.c
@@ -188,12 +188,6 @@ static inline void dnrt_free(struct dn_route *rt)
call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
}
-static inline void dnrt_drop(struct dn_route *rt)
-{
- dst_release(&rt->dst);
- call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
-}
-
static void dn_dst_check_expire(unsigned long dummy)
{
int i;
@@ -248,7 +242,7 @@ static int dn_dst_gc(struct dst_ops *ops)
}
*rtp = rt->dst.dn_next;
rt->dst.dn_next = NULL;
- dnrt_drop(rt);
+ dnrt_free(rt);
break;
}
spin_unlock_bh(&dn_rt_hash_table[i].lock);
@@ -350,7 +344,7 @@ static int dn_insert_route(struct dn_route *rt, unsigned int hash, struct dn_rou
dst_use(&rth->dst, now);
spin_unlock_bh(&dn_rt_hash_table[hash].lock);
- dnrt_drop(rt);
+ dst_free(&rt->dst);
*rp = rth;
return 0;
}
@@ -380,7 +374,7 @@ static void dn_run_flush(unsigned long dummy)
for(; rt; rt = next) {
next = rcu_dereference_raw(rt->dst.dn_next);
RCU_INIT_POINTER(rt->dst.dn_next, NULL);
- dst_free((struct dst_entry *)rt);
+ dnrt_free(rt);
}
nothing_to_declare:
@@ -1187,7 +1181,7 @@ make_route:
if (dev_out->flags & IFF_LOOPBACK)
flags |= RTCF_LOCAL;
- rt = dst_alloc(&dn_dst_ops, dev_out, 1, DST_OBSOLETE_NONE, DST_HOST);
+ rt = dst_alloc(&dn_dst_ops, dev_out, 0, DST_OBSOLETE_NONE, DST_HOST);
if (rt == NULL)
goto e_nobufs;
diff --git a/net/decnet/netfilter/dn_rtmsg.c b/net/decnet/netfilter/dn_rtmsg.c
index 1ed81ac6dd1a..aa8ffecc46a4 100644
--- a/net/decnet/netfilter/dn_rtmsg.c
+++ b/net/decnet/netfilter/dn_rtmsg.c
@@ -102,7 +102,9 @@ static inline void dnrmg_receive_user_skb(struct sk_buff *skb)
{
struct nlmsghdr *nlh = nlmsg_hdr(skb);
- if (nlh->nlmsg_len < sizeof(*nlh) || skb->len < nlh->nlmsg_len)
+ if (skb->len < sizeof(*nlh) ||
+ nlh->nlmsg_len < sizeof(*nlh) ||
+ skb->len < nlh->nlmsg_len)
return;
if (!netlink_capable(skb, CAP_NET_ADMIN))
diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c
index 26130ae438da..90038d45a547 100644
--- a/net/dsa/dsa.c
+++ b/net/dsa/dsa.c
@@ -223,6 +223,53 @@ static int dsa_switch_rcv(struct sk_buff *skb, struct net_device *dev,
return 0;
}
+#ifdef CONFIG_PM_SLEEP
+int dsa_switch_suspend(struct dsa_switch *ds)
+{
+ int i, ret = 0;
+
+ /* Suspend slave network devices */
+ for (i = 0; i < ds->num_ports; i++) {
+ if (!dsa_is_port_initialized(ds, i))
+ continue;
+
+ ret = dsa_slave_suspend(ds->ports[i].netdev);
+ if (ret)
+ return ret;
+ }
+
+ if (ds->ops->suspend)
+ ret = ds->ops->suspend(ds);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(dsa_switch_suspend);
+
+int dsa_switch_resume(struct dsa_switch *ds)
+{
+ int i, ret = 0;
+
+ if (ds->ops->resume)
+ ret = ds->ops->resume(ds);
+
+ if (ret)
+ return ret;
+
+ /* Resume slave network devices */
+ for (i = 0; i < ds->num_ports; i++) {
+ if (!dsa_is_port_initialized(ds, i))
+ continue;
+
+ ret = dsa_slave_resume(ds->ports[i].netdev);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(dsa_switch_resume);
+#endif
+
static struct packet_type dsa_pack_type __read_mostly = {
.type = cpu_to_be16(ETH_P_XDSA),
.func = dsa_switch_rcv,
diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c
index 033b3bfb63dc..7796580e99ee 100644
--- a/net/dsa/dsa2.c
+++ b/net/dsa/dsa2.c
@@ -484,8 +484,10 @@ static void dsa_dst_unapply(struct dsa_switch_tree *dst)
dsa_ds_unapply(dst, ds);
}
- if (dst->cpu_switch)
+ if (dst->cpu_switch) {
dsa_cpu_port_ethtool_restore(dst->cpu_switch);
+ dst->cpu_switch = NULL;
+ }
pr_info("DSA: tree %d unapplied\n", dst->tree);
dst->applied = false;
diff --git a/net/dsa/legacy.c b/net/dsa/legacy.c
index ad345c8b0b06..7281098df04e 100644
--- a/net/dsa/legacy.c
+++ b/net/dsa/legacy.c
@@ -289,53 +289,6 @@ static void dsa_switch_destroy(struct dsa_switch *ds)
dsa_switch_unregister_notifier(ds);
}
-#ifdef CONFIG_PM_SLEEP
-int dsa_switch_suspend(struct dsa_switch *ds)
-{
- int i, ret = 0;
-
- /* Suspend slave network devices */
- for (i = 0; i < ds->num_ports; i++) {
- if (!dsa_is_port_initialized(ds, i))
- continue;
-
- ret = dsa_slave_suspend(ds->ports[i].netdev);
- if (ret)
- return ret;
- }
-
- if (ds->ops->suspend)
- ret = ds->ops->suspend(ds);
-
- return ret;
-}
-EXPORT_SYMBOL_GPL(dsa_switch_suspend);
-
-int dsa_switch_resume(struct dsa_switch *ds)
-{
- int i, ret = 0;
-
- if (ds->ops->resume)
- ret = ds->ops->resume(ds);
-
- if (ret)
- return ret;
-
- /* Resume slave network devices */
- for (i = 0; i < ds->num_ports; i++) {
- if (!dsa_is_port_initialized(ds, i))
- continue;
-
- ret = dsa_slave_resume(ds->ports[i].netdev);
- if (ret)
- return ret;
- }
-
- return 0;
-}
-EXPORT_SYMBOL_GPL(dsa_switch_resume);
-#endif
-
/* platform driver init and cleanup *****************************************/
static int dev_is_class(struct device *dev, void *class)
{
diff --git a/net/hsr/hsr_device.c b/net/hsr/hsr_device.c
index c73160fb11e7..0a0a392dc2bd 100644
--- a/net/hsr/hsr_device.c
+++ b/net/hsr/hsr_device.c
@@ -378,7 +378,6 @@ static void hsr_dev_destroy(struct net_device *hsr_dev)
del_timer_sync(&hsr->announce_timer);
synchronize_rcu();
- free_netdev(hsr_dev);
}
static const struct net_device_ops hsr_device_ops = {
@@ -404,7 +403,8 @@ void hsr_dev_setup(struct net_device *dev)
SET_NETDEV_DEVTYPE(dev, &hsr_type);
dev->priv_flags |= IFF_NO_QUEUE;
- dev->destructor = hsr_dev_destroy;
+ dev->needs_free_netdev = true;
+ dev->priv_destructor = hsr_dev_destroy;
dev->hw_features = NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HIGHDMA |
NETIF_F_GSO_MASK | NETIF_F_HW_CSUM |
diff --git a/net/hsr/hsr_forward.c b/net/hsr/hsr_forward.c
index 4ebe2aa3e7d3..04b5450c5a55 100644
--- a/net/hsr/hsr_forward.c
+++ b/net/hsr/hsr_forward.c
@@ -324,8 +324,7 @@ static int hsr_fill_frame_info(struct hsr_frame_info *frame,
unsigned long irqflags;
frame->is_supervision = is_supervision_frame(port->hsr, skb);
- frame->node_src = hsr_get_node(&port->hsr->node_db, skb,
- frame->is_supervision);
+ frame->node_src = hsr_get_node(port, skb, frame->is_supervision);
if (frame->node_src == NULL)
return -1; /* Unknown node and !is_supervision, or no mem */
diff --git a/net/hsr/hsr_framereg.c b/net/hsr/hsr_framereg.c
index 7ea925816f79..284a9b820df8 100644
--- a/net/hsr/hsr_framereg.c
+++ b/net/hsr/hsr_framereg.c
@@ -158,9 +158,10 @@ struct hsr_node *hsr_add_node(struct list_head *node_db, unsigned char addr[],
/* Get the hsr_node from which 'skb' was sent.
*/
-struct hsr_node *hsr_get_node(struct list_head *node_db, struct sk_buff *skb,
+struct hsr_node *hsr_get_node(struct hsr_port *port, struct sk_buff *skb,
bool is_sup)
{
+ struct list_head *node_db = &port->hsr->node_db;
struct hsr_node *node;
struct ethhdr *ethhdr;
u16 seq_out;
@@ -186,7 +187,11 @@ struct hsr_node *hsr_get_node(struct list_head *node_db, struct sk_buff *skb,
*/
seq_out = hsr_get_skb_sequence_nr(skb) - 1;
} else {
- WARN_ONCE(1, "%s: Non-HSR frame\n", __func__);
+ /* this is called also for frames from master port and
+ * so warn only for non master ports
+ */
+ if (port->type != HSR_PT_MASTER)
+ WARN_ONCE(1, "%s: Non-HSR frame\n", __func__);
seq_out = HSR_SEQNR_START;
}
diff --git a/net/hsr/hsr_framereg.h b/net/hsr/hsr_framereg.h
index 438b40f98f5a..4e04f0e868e9 100644
--- a/net/hsr/hsr_framereg.h
+++ b/net/hsr/hsr_framereg.h
@@ -18,7 +18,7 @@ struct hsr_node;
struct hsr_node *hsr_add_node(struct list_head *node_db, unsigned char addr[],
u16 seq_out);
-struct hsr_node *hsr_get_node(struct list_head *node_db, struct sk_buff *skb,
+struct hsr_node *hsr_get_node(struct hsr_port *port, struct sk_buff *skb,
bool is_sup);
void hsr_handle_sup_frame(struct sk_buff *skb, struct hsr_node *node_curr,
struct hsr_port *port);
diff --git a/net/ieee802154/6lowpan/core.c b/net/ieee802154/6lowpan/core.c
index d7efbf0dad20..0a866f332290 100644
--- a/net/ieee802154/6lowpan/core.c
+++ b/net/ieee802154/6lowpan/core.c
@@ -107,7 +107,7 @@ static void lowpan_setup(struct net_device *ldev)
ldev->netdev_ops = &lowpan_netdev_ops;
ldev->header_ops = &lowpan_header_ops;
- ldev->destructor = free_netdev;
+ ldev->needs_free_netdev = true;
ldev->features |= NETIF_F_NETNS_LOCAL;
}
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index f3dad1661343..58925b6597de 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1043,7 +1043,7 @@ static struct inet_protosw inetsw_array[] =
.type = SOCK_DGRAM,
.protocol = IPPROTO_ICMP,
.prot = &ping_prot,
- .ops = &inet_dgram_ops,
+ .ops = &inet_sockraw_ops,
.flags = INET_PROTOSW_REUSE,
},
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index 0937b34c27ca..e9f3386a528b 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -641,6 +641,32 @@ void arp_xmit(struct sk_buff *skb)
}
EXPORT_SYMBOL(arp_xmit);
+static bool arp_is_garp(struct net *net, struct net_device *dev,
+ int *addr_type, __be16 ar_op,
+ __be32 sip, __be32 tip,
+ unsigned char *sha, unsigned char *tha)
+{
+ bool is_garp = tip == sip;
+
+ /* Gratuitous ARP _replies_ also require target hwaddr to be
+ * the same as source.
+ */
+ if (is_garp && ar_op == htons(ARPOP_REPLY))
+ is_garp =
+ /* IPv4 over IEEE 1394 doesn't provide target
+ * hardware address field in its ARP payload.
+ */
+ tha &&
+ !memcmp(tha, sha, dev->addr_len);
+
+ if (is_garp) {
+ *addr_type = inet_addr_type_dev_table(net, dev, sip);
+ if (*addr_type != RTN_UNICAST)
+ is_garp = false;
+ }
+ return is_garp;
+}
+
/*
* Process an arp request.
*/
@@ -653,6 +679,7 @@ static int arp_process(struct net *net, struct sock *sk, struct sk_buff *skb)
unsigned char *arp_ptr;
struct rtable *rt;
unsigned char *sha;
+ unsigned char *tha = NULL;
__be32 sip, tip;
u16 dev_type = dev->type;
int addr_type;
@@ -724,6 +751,7 @@ static int arp_process(struct net *net, struct sock *sk, struct sk_buff *skb)
break;
#endif
default:
+ tha = arp_ptr;
arp_ptr += dev->addr_len;
}
memcpy(&tip, arp_ptr, 4);
@@ -835,19 +863,25 @@ static int arp_process(struct net *net, struct sock *sk, struct sk_buff *skb)
n = __neigh_lookup(&arp_tbl, &sip, dev, 0);
- if (IN_DEV_ARP_ACCEPT(in_dev)) {
- unsigned int addr_type = inet_addr_type_dev_table(net, dev, sip);
+ addr_type = -1;
+ if (n || IN_DEV_ARP_ACCEPT(in_dev)) {
+ is_garp = arp_is_garp(net, dev, &addr_type, arp->ar_op,
+ sip, tip, sha, tha);
+ }
+ if (IN_DEV_ARP_ACCEPT(in_dev)) {
/* Unsolicited ARP is not accepted by default.
It is possible, that this option should be enabled for some
devices (strip is candidate)
*/
- is_garp = arp->ar_op == htons(ARPOP_REQUEST) && tip == sip &&
- addr_type == RTN_UNICAST;
-
if (!n &&
- ((arp->ar_op == htons(ARPOP_REPLY) &&
- addr_type == RTN_UNICAST) || is_garp))
+ (is_garp ||
+ (arp->ar_op == htons(ARPOP_REPLY) &&
+ (addr_type == RTN_UNICAST ||
+ (addr_type < 0 &&
+ /* postpone calculation to as late as possible */
+ inet_addr_type_dev_table(net, dev, sip) ==
+ RTN_UNICAST)))))
n = __neigh_lookup(&arp_tbl, &sip, dev, 1);
}
diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
index 65cc02bd82bc..93322f895eab 100644
--- a/net/ipv4/esp4.c
+++ b/net/ipv4/esp4.c
@@ -248,6 +248,7 @@ int esp_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info *
u8 *tail;
u8 *vaddr;
int nfrags;
+ int esph_offset;
struct page *page;
struct sk_buff *trailer;
int tailen = esp->tailen;
@@ -313,11 +314,13 @@ int esp_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info *
}
cow:
+ esph_offset = (unsigned char *)esp->esph - skb_transport_header(skb);
+
nfrags = skb_cow_data(skb, tailen, &trailer);
if (nfrags < 0)
goto out;
tail = skb_tail_pointer(trailer);
- esp->esph = ip_esp_hdr(skb);
+ esp->esph = (struct ip_esp_hdr *)(skb_transport_header(skb) + esph_offset);
skip_cow:
esp_output_fill_trailer(tail, esp->tfclen, esp->plen, esp->proto);
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 39bd1edee676..83e3ed258467 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -763,7 +763,7 @@ static int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
unsigned int e = 0, s_e;
struct fib_table *tb;
struct hlist_head *head;
- int dumped = 0;
+ int dumped = 0, err;
if (nlmsg_len(cb->nlh) >= sizeof(struct rtmsg) &&
((struct rtmsg *) nlmsg_data(cb->nlh))->rtm_flags & RTM_F_CLONED)
@@ -783,20 +783,27 @@ static int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
if (dumped)
memset(&cb->args[2], 0, sizeof(cb->args) -
2 * sizeof(cb->args[0]));
- if (fib_table_dump(tb, skb, cb) < 0)
- goto out;
+ err = fib_table_dump(tb, skb, cb);
+ if (err < 0) {
+ if (likely(skb->len))
+ goto out;
+
+ goto out_err;
+ }
dumped = 1;
next:
e++;
}
}
out:
+ err = skb->len;
+out_err:
rcu_read_unlock();
cb->args[1] = e;
cb->args[0] = h;
- return skb->len;
+ return err;
}
/* Prepare and feed intra-kernel routing request.
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index da449ddb8cc1..ad9ad4aab5da 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -203,6 +203,7 @@ static void rt_fibinfo_free_cpus(struct rtable __rcu * __percpu *rtp)
static void free_fib_info_rcu(struct rcu_head *head)
{
struct fib_info *fi = container_of(head, struct fib_info, rcu);
+ struct dst_metrics *m;
change_nexthops(fi) {
if (nexthop_nh->nh_dev)
@@ -213,8 +214,9 @@ static void free_fib_info_rcu(struct rcu_head *head)
rt_fibinfo_free(&nexthop_nh->nh_rth_input);
} endfor_nexthops(fi);
- if (fi->fib_metrics != (u32 *) dst_default_metrics)
- kfree(fi->fib_metrics);
+ m = fi->fib_metrics;
+ if (m != &dst_default_metrics && atomic_dec_and_test(&m->refcnt))
+ kfree(m);
kfree(fi);
}
@@ -971,11 +973,11 @@ fib_convert_metrics(struct fib_info *fi, const struct fib_config *cfg)
val = 255;
if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK))
return -EINVAL;
- fi->fib_metrics[type - 1] = val;
+ fi->fib_metrics->metrics[type - 1] = val;
}
if (ecn_ca)
- fi->fib_metrics[RTAX_FEATURES - 1] |= DST_FEATURE_ECN_CA;
+ fi->fib_metrics->metrics[RTAX_FEATURES - 1] |= DST_FEATURE_ECN_CA;
return 0;
}
@@ -1033,11 +1035,12 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
goto failure;
fib_info_cnt++;
if (cfg->fc_mx) {
- fi->fib_metrics = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
+ fi->fib_metrics = kzalloc(sizeof(*fi->fib_metrics), GFP_KERNEL);
if (!fi->fib_metrics)
goto failure;
+ atomic_set(&fi->fib_metrics->refcnt, 1);
} else
- fi->fib_metrics = (u32 *) dst_default_metrics;
+ fi->fib_metrics = (struct dst_metrics *)&dst_default_metrics;
fi->fib_net = net;
fi->fib_protocol = cfg->fc_protocol;
@@ -1238,7 +1241,7 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event,
if (fi->fib_priority &&
nla_put_u32(skb, RTA_PRIORITY, fi->fib_priority))
goto nla_put_failure;
- if (rtnetlink_put_metrics(skb, fi->fib_metrics) < 0)
+ if (rtnetlink_put_metrics(skb, fi->fib_metrics->metrics) < 0)
goto nla_put_failure;
if (fi->fib_prefsrc &&
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 1201409ba1dc..51182ff2b441 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -1983,6 +1983,8 @@ static int fn_trie_dump_leaf(struct key_vector *l, struct fib_table *tb,
/* rcu_read_lock is hold by caller */
hlist_for_each_entry_rcu(fa, &l->leaf, fa_list) {
+ int err;
+
if (i < s_i) {
i++;
continue;
@@ -1993,17 +1995,14 @@ static int fn_trie_dump_leaf(struct key_vector *l, struct fib_table *tb,
continue;
}
- if (fib_dump_info(skb, NETLINK_CB(cb->skb).portid,
- cb->nlh->nlmsg_seq,
- RTM_NEWROUTE,
- tb->tb_id,
- fa->fa_type,
- xkey,
- KEYLENGTH - fa->fa_slen,
- fa->fa_tos,
- fa->fa_info, NLM_F_MULTI) < 0) {
+ err = fib_dump_info(skb, NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq, RTM_NEWROUTE,
+ tb->tb_id, fa->fa_type,
+ xkey, KEYLENGTH - fa->fa_slen,
+ fa->fa_tos, fa->fa_info, NLM_F_MULTI);
+ if (err < 0) {
cb->args[4] = i;
- return -1;
+ return err;
}
i++;
}
@@ -2025,10 +2024,13 @@ int fib_table_dump(struct fib_table *tb, struct sk_buff *skb,
t_key key = cb->args[3];
while ((l = leaf_walk_rcu(&tp, key)) != NULL) {
- if (fn_trie_dump_leaf(l, tb, skb, cb) < 0) {
+ int err;
+
+ err = fn_trie_dump_leaf(l, tb, skb, cb);
+ if (err < 0) {
cb->args[3] = key;
cb->args[2] = count;
- return -1;
+ return err;
}
++count;
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 43318b5f5647..9144fa7df2ad 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -657,8 +657,12 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
/* Needed by both icmp_global_allow and icmp_xmit_lock */
local_bh_disable();
- /* Check global sysctl_icmp_msgs_per_sec ratelimit */
- if (!icmpv4_global_allow(net, type, code))
+ /* Check global sysctl_icmp_msgs_per_sec ratelimit, unless
+ * incoming dev is loopback. If outgoing dev change to not be
+ * loopback, then peer ratelimit still work (in icmpv4_xrlim_allow)
+ */
+ if (!(skb_in->dev && (skb_in->dev->flags&IFF_LOOPBACK)) &&
+ !icmpv4_global_allow(net, type, code))
goto out_bh_enable;
sk = icmp_xmit_lock(net);
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index 44fd86de2823..ec9a396fa466 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -1112,6 +1112,7 @@ static void igmpv3_add_delrec(struct in_device *in_dev, struct ip_mc_list *im)
pmc = kzalloc(sizeof(*pmc), GFP_KERNEL);
if (!pmc)
return;
+ spin_lock_init(&pmc->lock);
spin_lock_bh(&im->lock);
pmc->interface = im->interface;
in_dev_hold(in_dev);
@@ -2071,21 +2072,26 @@ static int ip_mc_add_src(struct in_device *in_dev, __be32 *pmca, int sfmode,
static void ip_mc_clear_src(struct ip_mc_list *pmc)
{
- struct ip_sf_list *psf, *nextpsf;
+ struct ip_sf_list *psf, *nextpsf, *tomb, *sources;
- for (psf = pmc->tomb; psf; psf = nextpsf) {
+ spin_lock_bh(&pmc->lock);
+ tomb = pmc->tomb;
+ pmc->tomb = NULL;
+ sources = pmc->sources;
+ pmc->sources = NULL;
+ pmc->sfmode = MCAST_EXCLUDE;
+ pmc->sfcount[MCAST_INCLUDE] = 0;
+ pmc->sfcount[MCAST_EXCLUDE] = 1;
+ spin_unlock_bh(&pmc->lock);
+
+ for (psf = tomb; psf; psf = nextpsf) {
nextpsf = psf->sf_next;
kfree(psf);
}
- pmc->tomb = NULL;
- for (psf = pmc->sources; psf; psf = nextpsf) {
+ for (psf = sources; psf; psf = nextpsf) {
nextpsf = psf->sf_next;
kfree(psf);
}
- pmc->sources = NULL;
- pmc->sfmode = MCAST_EXCLUDE;
- pmc->sfcount[MCAST_INCLUDE] = 0;
- pmc->sfcount[MCAST_EXCLUDE] = 1;
}
/* Join a multicast group
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 7a3fd25e8913..532b36e9ce2a 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -964,7 +964,8 @@ static int __ip_append_data(struct sock *sk,
csummode = CHECKSUM_PARTIAL;
cork->length += length;
- if ((((length + fragheaderlen) > mtu) || (skb && skb_is_gso(skb))) &&
+ if ((((length + (skb ? skb->len : fragheaderlen)) > mtu) ||
+ (skb && skb_is_gso(skb))) &&
(sk->sk_protocol == IPPROTO_UDP) &&
(rt->dst.dev->features & NETIF_F_UFO) && !dst_xfrm(&rt->dst) &&
(sk->sk_type == SOCK_DGRAM) && !sk->sk_no_check_tx) {
diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c
index b878ecbc0608..129d1a3616f8 100644
--- a/net/ipv4/ip_tunnel.c
+++ b/net/ipv4/ip_tunnel.c
@@ -446,6 +446,8 @@ int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
return 0;
drop:
+ if (tun_dst)
+ dst_release((struct dst_entry *)tun_dst);
kfree_skb(skb);
return 0;
}
@@ -967,7 +969,6 @@ static void ip_tunnel_dev_free(struct net_device *dev)
gro_cells_destroy(&tunnel->gro_cells);
dst_cache_destroy(&tunnel->dst_cache);
free_percpu(dev->tstats);
- free_netdev(dev);
}
void ip_tunnel_dellink(struct net_device *dev, struct list_head *head)
@@ -1155,7 +1156,8 @@ int ip_tunnel_init(struct net_device *dev)
struct iphdr *iph = &tunnel->parms.iph;
int err;
- dev->destructor = ip_tunnel_dev_free;
+ dev->needs_free_netdev = true;
+ dev->priv_destructor = ip_tunnel_dev_free;
dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
if (!dev->tstats)
return -ENOMEM;
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 3a02d52ed50e..8ae425cad818 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -101,8 +101,8 @@ static struct mr_table *ipmr_new_table(struct net *net, u32 id);
static void ipmr_free_table(struct mr_table *mrt);
static void ip_mr_forward(struct net *net, struct mr_table *mrt,
- struct sk_buff *skb, struct mfc_cache *cache,
- int local);
+ struct net_device *dev, struct sk_buff *skb,
+ struct mfc_cache *cache, int local);
static int ipmr_cache_report(struct mr_table *mrt,
struct sk_buff *pkt, vifi_t vifi, int assert);
static int __ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
@@ -501,7 +501,7 @@ static void reg_vif_setup(struct net_device *dev)
dev->mtu = ETH_DATA_LEN - sizeof(struct iphdr) - 8;
dev->flags = IFF_NOARP;
dev->netdev_ops = &reg_vif_netdev_ops;
- dev->destructor = free_netdev;
+ dev->needs_free_netdev = true;
dev->features |= NETIF_F_NETNS_LOCAL;
}
@@ -988,7 +988,7 @@ static void ipmr_cache_resolve(struct net *net, struct mr_table *mrt,
rtnl_unicast(skb, net, NETLINK_CB(skb).portid);
} else {
- ip_mr_forward(net, mrt, skb, c, 0);
+ ip_mr_forward(net, mrt, skb->dev, skb, c, 0);
}
}
}
@@ -1073,7 +1073,7 @@ static int ipmr_cache_report(struct mr_table *mrt,
/* Queue a packet for resolution. It gets locked cache entry! */
static int ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi,
- struct sk_buff *skb)
+ struct sk_buff *skb, struct net_device *dev)
{
const struct iphdr *iph = ip_hdr(skb);
struct mfc_cache *c;
@@ -1130,6 +1130,10 @@ static int ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi,
kfree_skb(skb);
err = -ENOBUFS;
} else {
+ if (dev) {
+ skb->dev = dev;
+ skb->skb_iif = dev->ifindex;
+ }
skb_queue_tail(&c->mfc_un.unres.unresolved, skb);
err = 0;
}
@@ -1828,10 +1832,10 @@ static int ipmr_find_vif(struct mr_table *mrt, struct net_device *dev)
/* "local" means that we should preserve one skb (for local delivery) */
static void ip_mr_forward(struct net *net, struct mr_table *mrt,
- struct sk_buff *skb, struct mfc_cache *cache,
- int local)
+ struct net_device *dev, struct sk_buff *skb,
+ struct mfc_cache *cache, int local)
{
- int true_vifi = ipmr_find_vif(mrt, skb->dev);
+ int true_vifi = ipmr_find_vif(mrt, dev);
int psend = -1;
int vif, ct;
@@ -1853,13 +1857,7 @@ static void ip_mr_forward(struct net *net, struct mr_table *mrt,
}
/* Wrong interface: drop packet and (maybe) send PIM assert. */
- if (mrt->vif_table[vif].dev != skb->dev) {
- struct net_device *mdev;
-
- mdev = l3mdev_master_dev_rcu(mrt->vif_table[vif].dev);
- if (mdev == skb->dev)
- goto forward;
-
+ if (mrt->vif_table[vif].dev != dev) {
if (rt_is_output_route(skb_rtable(skb))) {
/* It is our own packet, looped back.
* Very complicated situation...
@@ -1980,6 +1978,20 @@ int ip_mr_input(struct sk_buff *skb)
struct net *net = dev_net(skb->dev);
int local = skb_rtable(skb)->rt_flags & RTCF_LOCAL;
struct mr_table *mrt;
+ struct net_device *dev;
+
+ /* skb->dev passed in is the loX master dev for vrfs.
+ * As there are no vifs associated with loopback devices,
+ * get the proper interface that does have a vif associated with it.
+ */
+ dev = skb->dev;
+ if (netif_is_l3_master(skb->dev)) {
+ dev = dev_get_by_index_rcu(net, IPCB(skb)->iif);
+ if (!dev) {
+ kfree_skb(skb);
+ return -ENODEV;
+ }
+ }
/* Packet is looped back after forward, it should not be
* forwarded second time, but still can be delivered locally.
@@ -2017,7 +2029,7 @@ int ip_mr_input(struct sk_buff *skb)
/* already under rcu_read_lock() */
cache = ipmr_cache_find(mrt, ip_hdr(skb)->saddr, ip_hdr(skb)->daddr);
if (!cache) {
- int vif = ipmr_find_vif(mrt, skb->dev);
+ int vif = ipmr_find_vif(mrt, dev);
if (vif >= 0)
cache = ipmr_cache_find_any(mrt, ip_hdr(skb)->daddr,
@@ -2037,9 +2049,9 @@ int ip_mr_input(struct sk_buff *skb)
}
read_lock(&mrt_lock);
- vif = ipmr_find_vif(mrt, skb->dev);
+ vif = ipmr_find_vif(mrt, dev);
if (vif >= 0) {
- int err2 = ipmr_cache_unresolved(mrt, vif, skb);
+ int err2 = ipmr_cache_unresolved(mrt, vif, skb, dev);
read_unlock(&mrt_lock);
return err2;
@@ -2050,7 +2062,7 @@ int ip_mr_input(struct sk_buff *skb)
}
read_lock(&mrt_lock);
- ip_mr_forward(net, mrt, skb, cache, local);
+ ip_mr_forward(net, mrt, dev, skb, cache, local);
read_unlock(&mrt_lock);
if (local)
@@ -2224,7 +2236,7 @@ int ipmr_get_route(struct net *net, struct sk_buff *skb,
iph->saddr = saddr;
iph->daddr = daddr;
iph->version = 0;
- err = ipmr_cache_unresolved(mrt, vif, skb2);
+ err = ipmr_cache_unresolved(mrt, vif, skb2, dev);
read_unlock(&mrt_lock);
rcu_read_unlock();
return err;
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 655d9eebe43e..6883b3d4ba8f 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1385,8 +1385,12 @@ static void rt_add_uncached_list(struct rtable *rt)
static void ipv4_dst_destroy(struct dst_entry *dst)
{
+ struct dst_metrics *p = (struct dst_metrics *)DST_METRICS_PTR(dst);
struct rtable *rt = (struct rtable *) dst;
+ if (p != &dst_default_metrics && atomic_dec_and_test(&p->refcnt))
+ kfree(p);
+
if (!list_empty(&rt->rt_uncached)) {
struct uncached_list *ul = rt->rt_uncached_list;
@@ -1438,7 +1442,11 @@ static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
rt->rt_gateway = nh->nh_gw;
rt->rt_uses_gateway = 1;
}
- dst_init_metrics(&rt->dst, fi->fib_metrics, true);
+ dst_init_metrics(&rt->dst, fi->fib_metrics->metrics, true);
+ if (fi->fib_metrics != &dst_default_metrics) {
+ rt->dst._metrics |= DST_METRICS_REFCOUNTED;
+ atomic_inc(&fi->fib_metrics->refcnt);
+ }
#ifdef CONFIG_IP_ROUTE_CLASSID
rt->dst.tclassid = nh->nh_tclassid;
#endif
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 1e4c76d2b827..40aca7803cf2 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1084,9 +1084,12 @@ static int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg,
{
struct tcp_sock *tp = tcp_sk(sk);
struct inet_sock *inet = inet_sk(sk);
+ struct sockaddr *uaddr = msg->msg_name;
int err, flags;
- if (!(sysctl_tcp_fastopen & TFO_CLIENT_ENABLE))
+ if (!(sysctl_tcp_fastopen & TFO_CLIENT_ENABLE) ||
+ (uaddr && msg->msg_namelen >= sizeof(uaddr->sa_family) &&
+ uaddr->sa_family == AF_UNSPEC))
return -EOPNOTSUPP;
if (tp->fastopen_req)
return -EALREADY; /* Another Fast Open is in progress */
@@ -1108,7 +1111,7 @@ static int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg,
}
}
flags = (msg->msg_flags & MSG_DONTWAIT) ? O_NONBLOCK : 0;
- err = __inet_stream_connect(sk->sk_socket, msg->msg_name,
+ err = __inet_stream_connect(sk->sk_socket, uaddr,
msg->msg_namelen, flags, 1);
/* fastopen_req could already be freed in __inet_stream_connect
* if the connection times out or gets rst
@@ -2320,9 +2323,15 @@ int tcp_disconnect(struct sock *sk, int flags)
tcp_set_ca_state(sk, TCP_CA_Open);
tcp_clear_retrans(tp);
inet_csk_delack_init(sk);
+ /* Initialize rcv_mss to TCP_MIN_MSS to avoid division by 0
+ * issue in __tcp_select_window()
+ */
+ icsk->icsk_ack.rcv_mss = TCP_MIN_MSS;
tcp_init_send_head(sk);
memset(&tp->rx_opt, 0, sizeof(tp->rx_opt));
__sk_dst_reset(sk);
+ dst_release(sk->sk_rx_dst);
+ sk->sk_rx_dst = NULL;
tcp_saved_syn_free(tp);
/* Clean up fastopen related fields */
@@ -2374,9 +2383,10 @@ static int tcp_repair_set_window(struct tcp_sock *tp, char __user *optbuf, int l
return 0;
}
-static int tcp_repair_options_est(struct tcp_sock *tp,
+static int tcp_repair_options_est(struct sock *sk,
struct tcp_repair_opt __user *optbuf, unsigned int len)
{
+ struct tcp_sock *tp = tcp_sk(sk);
struct tcp_repair_opt opt;
while (len >= sizeof(opt)) {
@@ -2389,6 +2399,7 @@ static int tcp_repair_options_est(struct tcp_sock *tp,
switch (opt.opt_code) {
case TCPOPT_MSS:
tp->rx_opt.mss_clamp = opt.opt_val;
+ tcp_mtup_init(sk);
break;
case TCPOPT_WINDOW:
{
@@ -2548,7 +2559,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
if (!tp->repair)
err = -EINVAL;
else if (sk->sk_state == TCP_ESTABLISHED)
- err = tcp_repair_options_est(tp,
+ err = tcp_repair_options_est(sk,
(struct tcp_repair_opt __user *)optval,
optlen);
else
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
index 6e3c512054a6..324c9bcc5456 100644
--- a/net/ipv4/tcp_cong.c
+++ b/net/ipv4/tcp_cong.c
@@ -180,6 +180,7 @@ void tcp_init_congestion_control(struct sock *sk)
{
const struct inet_connection_sock *icsk = inet_csk(sk);
+ tcp_sk(sk)->prior_ssthresh = 0;
if (icsk->icsk_ca_ops->init)
icsk->icsk_ca_ops->init(sk);
if (tcp_ca_needs_ecn(sk))
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 5a3ad09e2786..174d4376baa5 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -1179,13 +1179,14 @@ static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb,
*/
if (pkt_len > mss) {
unsigned int new_len = (pkt_len / mss) * mss;
- if (!in_sack && new_len < pkt_len) {
+ if (!in_sack && new_len < pkt_len)
new_len += mss;
- if (new_len >= skb->len)
- return 0;
- }
pkt_len = new_len;
}
+
+ if (pkt_len >= skb->len && !in_sack)
+ return 0;
+
err = tcp_fragment(sk, skb, pkt_len, mss, GFP_ATOMIC);
if (err < 0)
return err;
@@ -3189,7 +3190,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
int delta;
/* Non-retransmitted hole got filled? That's reordering */
- if (reord < prior_fackets)
+ if (reord < prior_fackets && reord <= tp->fackets_out)
tcp_update_reordering(sk, tp->fackets_out - reord, 0);
delta = tcp_is_fack(tp) ? pkts_acked :
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 3a51582bef55..5ab2aac5ca19 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2395,7 +2395,7 @@ struct proto tcp_prot = {
.sysctl_rmem = sysctl_tcp_rmem,
.max_header = MAX_TCP_HEADER,
.obj_size = sizeof(struct tcp_sock),
- .slab_flags = SLAB_DESTROY_BY_RCU,
+ .slab_flags = SLAB_TYPESAFE_BY_RCU,
.twsk_prot = &tcp_timewait_sock_ops,
.rsk_prot = &tcp_request_sock_ops,
.h.hashinfo = &tcp_hashinfo,
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index ea6e4cff9faf..1d6219bf2d6b 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1612,7 +1612,7 @@ static void udp_v4_rehash(struct sock *sk)
udp_lib_rehash(sk, new_hash);
}
-int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
+static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
int rc;
@@ -1657,7 +1657,7 @@ EXPORT_SYMBOL(udp_encap_enable);
* Note that in the success and error cases, the skb is assumed to
* have either been requeued or freed.
*/
-int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
+static int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
struct udp_sock *up = udp_sk(sk);
int is_udplite = IS_UDPLITE(sk);
diff --git a/net/ipv4/udp_impl.h b/net/ipv4/udp_impl.h
index feb50a16398d..a8cf8c6fb60c 100644
--- a/net/ipv4/udp_impl.h
+++ b/net/ipv4/udp_impl.h
@@ -25,7 +25,6 @@ int udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int noblock,
int flags, int *addr_len);
int udp_sendpage(struct sock *sk, struct page *page, int offset, size_t size,
int flags);
-int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb);
void udp_destroy_sock(struct sock *sk);
#ifdef CONFIG_PROC_FS
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 8d297a79b568..1d2dbace42ff 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -332,9 +332,9 @@ static void addrconf_mod_rs_timer(struct inet6_dev *idev,
static void addrconf_mod_dad_work(struct inet6_ifaddr *ifp,
unsigned long delay)
{
- if (!delayed_work_pending(&ifp->dad_work))
- in6_ifa_hold(ifp);
- mod_delayed_work(addrconf_wq, &ifp->dad_work, delay);
+ in6_ifa_hold(ifp);
+ if (mod_delayed_work(addrconf_wq, &ifp->dad_work, delay))
+ in6_ifa_put(ifp);
}
static int snmp6_alloc_dev(struct inet6_dev *idev)
@@ -1022,7 +1022,10 @@ ipv6_add_addr(struct inet6_dev *idev, const struct in6_addr *addr,
INIT_HLIST_NODE(&ifa->addr_lst);
ifa->scope = scope;
ifa->prefix_len = pfxlen;
- ifa->flags = flags | IFA_F_TENTATIVE;
+ ifa->flags = flags;
+ /* No need to add the TENTATIVE flag for addresses with NODAD */
+ if (!(flags & IFA_F_NODAD))
+ ifa->flags |= IFA_F_TENTATIVE;
ifa->valid_lft = valid_lft;
ifa->prefered_lft = prefered_lft;
ifa->cstamp = ifa->tstamp = jiffies;
@@ -3366,6 +3369,7 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event,
struct net_device *dev = netdev_notifier_info_to_dev(ptr);
struct netdev_notifier_changeupper_info *info;
struct inet6_dev *idev = __in6_dev_get(dev);
+ struct net *net = dev_net(dev);
int run_pending = 0;
int err;
@@ -3381,7 +3385,7 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event,
case NETDEV_CHANGEMTU:
/* if MTU under IPV6_MIN_MTU stop IPv6 on this interface. */
if (dev->mtu < IPV6_MIN_MTU) {
- addrconf_ifdown(dev, 1);
+ addrconf_ifdown(dev, dev != net->loopback_dev);
break;
}
@@ -3497,7 +3501,7 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event,
* IPV6_MIN_MTU stop IPv6 on this interface.
*/
if (dev->mtu < IPV6_MIN_MTU)
- addrconf_ifdown(dev, 1);
+ addrconf_ifdown(dev, dev != net->loopback_dev);
}
break;
diff --git a/net/ipv6/calipso.c b/net/ipv6/calipso.c
index 37ac9de713c6..8d772fea1dde 100644
--- a/net/ipv6/calipso.c
+++ b/net/ipv6/calipso.c
@@ -1319,7 +1319,7 @@ static int calipso_skbuff_setattr(struct sk_buff *skb,
struct ipv6hdr *ip6_hdr;
struct ipv6_opt_hdr *hop;
unsigned char buf[CALIPSO_MAX_BUFFER];
- int len_delta, new_end, pad;
+ int len_delta, new_end, pad, payload;
unsigned int start, end;
ip6_hdr = ipv6_hdr(skb);
@@ -1346,6 +1346,8 @@ static int calipso_skbuff_setattr(struct sk_buff *skb,
if (ret_val < 0)
return ret_val;
+ ip6_hdr = ipv6_hdr(skb); /* Reset as skb_cow() may have moved it */
+
if (len_delta) {
if (len_delta > 0)
skb_push(skb, len_delta);
@@ -1355,6 +1357,8 @@ static int calipso_skbuff_setattr(struct sk_buff *skb,
sizeof(*ip6_hdr) + start);
skb_reset_network_header(skb);
ip6_hdr = ipv6_hdr(skb);
+ payload = ntohs(ip6_hdr->payload_len);
+ ip6_hdr->payload_len = htons(payload + len_delta);
}
hop = (struct ipv6_opt_hdr *)(ip6_hdr + 1);
diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c
index e011122ebd43..5c786f5ab961 100644
--- a/net/ipv6/datagram.c
+++ b/net/ipv6/datagram.c
@@ -250,8 +250,14 @@ ipv4_connected:
*/
err = ip6_datagram_dst_update(sk, true);
- if (err)
+ if (err) {
+ /* Reset daddr and dport so that udp_v6_early_demux()
+ * fails to find this socket
+ */
+ memset(&sk->sk_v6_daddr, 0, sizeof(sk->sk_v6_daddr));
+ inet->inet_dport = 0;
goto out;
+ }
sk->sk_state = TCP_ESTABLISHED;
sk_set_txhash(sk);
diff --git a/net/ipv6/esp6_offload.c b/net/ipv6/esp6_offload.c
index d950d43ba255..f02f131f6435 100644
--- a/net/ipv6/esp6_offload.c
+++ b/net/ipv6/esp6_offload.c
@@ -30,6 +30,25 @@
#include <net/ipv6.h>
#include <linux/icmpv6.h>
+static __u16 esp6_nexthdr_esp_offset(struct ipv6hdr *ipv6_hdr, int nhlen)
+{
+ int off = sizeof(struct ipv6hdr);
+ struct ipv6_opt_hdr *exthdr;
+
+ if (likely(ipv6_hdr->nexthdr == NEXTHDR_ESP))
+ return offsetof(struct ipv6hdr, nexthdr);
+
+ while (off < nhlen) {
+ exthdr = (void *)ipv6_hdr + off;
+ if (exthdr->nexthdr == NEXTHDR_ESP)
+ return off;
+
+ off += ipv6_optlen(exthdr);
+ }
+
+ return 0;
+}
+
static struct sk_buff **esp6_gro_receive(struct sk_buff **head,
struct sk_buff *skb)
{
@@ -38,6 +57,7 @@ static struct sk_buff **esp6_gro_receive(struct sk_buff **head,
struct xfrm_state *x;
__be32 seq;
__be32 spi;
+ int nhoff;
int err;
skb_pull(skb, offset);
@@ -72,6 +92,11 @@ static struct sk_buff **esp6_gro_receive(struct sk_buff **head,
xo->flags |= XFRM_GRO;
+ nhoff = esp6_nexthdr_esp_offset(ipv6_hdr(skb), offset);
+ if (!nhoff)
+ goto out;
+
+ IP6CB(skb)->nhoff = nhoff;
XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6 = NULL;
XFRM_SPI_SKB_CB(skb)->family = AF_INET6;
XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct ipv6hdr, daddr);
diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c
index eea23b57c6a5..ec849d88a662 100644
--- a/net/ipv6/fib6_rules.c
+++ b/net/ipv6/fib6_rules.c
@@ -32,7 +32,6 @@ struct fib6_rule {
struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6,
int flags, pol_lookup_t lookup)
{
- struct rt6_info *rt;
struct fib_lookup_arg arg = {
.lookup_ptr = lookup,
.flags = FIB_LOOKUP_NOREF,
@@ -44,21 +43,11 @@ struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6,
fib_rules_lookup(net->ipv6.fib6_rules_ops,
flowi6_to_flowi(fl6), flags, &arg);
- rt = arg.result;
+ if (arg.result)
+ return arg.result;
- if (!rt) {
- dst_hold(&net->ipv6.ip6_null_entry->dst);
- return &net->ipv6.ip6_null_entry->dst;
- }
-
- if (rt->rt6i_flags & RTF_REJECT &&
- rt->dst.error == -EAGAIN) {
- ip6_rt_put(rt);
- rt = net->ipv6.ip6_null_entry;
- dst_hold(&rt->dst);
- }
-
- return &rt->dst;
+ dst_hold(&net->ipv6.ip6_null_entry->dst);
+ return &net->ipv6.ip6_null_entry->dst;
}
static int fib6_rule_action(struct fib_rule *rule, struct flowi *flp,
@@ -121,7 +110,8 @@ static int fib6_rule_action(struct fib_rule *rule, struct flowi *flp,
flp6->saddr = saddr;
}
err = rt->dst.error;
- goto out;
+ if (err != -EAGAIN)
+ goto out;
}
again:
ip6_rt_put(rt);
diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c
index 230b5aac9f03..8d7b113958b1 100644
--- a/net/ipv6/icmp.c
+++ b/net/ipv6/icmp.c
@@ -491,7 +491,7 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info,
local_bh_disable();
/* Check global sysctl_icmp_msgs_per_sec ratelimit */
- if (!icmpv6_global_allow(type))
+ if (!(skb->dev->flags&IFF_LOOPBACK) && !icmpv6_global_allow(type))
goto out_bh_enable;
mip6_addr_swap(skb);
diff --git a/net/ipv6/ila/ila_xlat.c b/net/ipv6/ila/ila_xlat.c
index 2fd5ca151dcf..77f7f8c7d93d 100644
--- a/net/ipv6/ila/ila_xlat.c
+++ b/net/ipv6/ila/ila_xlat.c
@@ -62,6 +62,7 @@ static inline u32 ila_locator_hash(struct ila_locator loc)
{
u32 *v = (u32 *)loc.v32;
+ __ila_hash_secret_init();
return jhash_2words(v[0], v[1], hashrnd);
}
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index d4bf2c68a545..e6b78ba0e636 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -289,8 +289,7 @@ struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6,
struct rt6_info *rt;
rt = lookup(net, net->ipv6.fib6_main_tbl, fl6, flags);
- if (rt->rt6i_flags & RTF_REJECT &&
- rt->dst.error == -EAGAIN) {
+ if (rt->dst.error == -EAGAIN) {
ip6_rt_put(rt);
rt = net->ipv6.ip6_null_entry;
dst_hold(&rt->dst);
diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c
index 8d128ba79b66..64eea3962733 100644
--- a/net/ipv6/ip6_gre.c
+++ b/net/ipv6/ip6_gre.c
@@ -537,11 +537,10 @@ static inline int ip6gre_xmit_ipv4(struct sk_buff *skb, struct net_device *dev)
memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6));
- dsfield = ipv4_get_dsfield(iph);
-
if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS)
- fl6.flowlabel |= htonl((__u32)iph->tos << IPV6_TCLASS_SHIFT)
- & IPV6_TCLASS_MASK;
+ dsfield = ipv4_get_dsfield(iph);
+ else
+ dsfield = ip6_tclass(t->parms.flowinfo);
if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK)
fl6.flowi6_mark = skb->mark;
else
@@ -598,9 +597,11 @@ static inline int ip6gre_xmit_ipv6(struct sk_buff *skb, struct net_device *dev)
memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6));
- dsfield = ipv6_get_dsfield(ipv6h);
if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS)
- fl6.flowlabel |= (*(__be32 *) ipv6h & IPV6_TCLASS_MASK);
+ dsfield = ipv6_get_dsfield(ipv6h);
+ else
+ dsfield = ip6_tclass(t->parms.flowinfo);
+
if (t->parms.flags & IP6_TNL_F_USE_ORIG_FLOWLABEL)
fl6.flowlabel |= ip6_flowlabel(ipv6h);
if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK)
@@ -990,13 +991,13 @@ static void ip6gre_dev_free(struct net_device *dev)
dst_cache_destroy(&t->dst_cache);
free_percpu(dev->tstats);
- free_netdev(dev);
}
static void ip6gre_tunnel_setup(struct net_device *dev)
{
dev->netdev_ops = &ip6gre_netdev_ops;
- dev->destructor = ip6gre_dev_free;
+ dev->needs_free_netdev = true;
+ dev->priv_destructor = ip6gre_dev_free;
dev->type = ARPHRD_IP6GRE;
@@ -1147,7 +1148,7 @@ static int __net_init ip6gre_init_net(struct net *net)
return 0;
err_reg_dev:
- ip6gre_dev_free(ign->fb_tunnel_dev);
+ free_netdev(ign->fb_tunnel_dev);
err_alloc_dev:
return err;
}
@@ -1299,7 +1300,8 @@ static void ip6gre_tap_setup(struct net_device *dev)
ether_setup(dev);
dev->netdev_ops = &ip6gre_tap_netdev_ops;
- dev->destructor = ip6gre_dev_free;
+ dev->needs_free_netdev = true;
+ dev->priv_destructor = ip6gre_dev_free;
dev->features |= NETIF_F_NETNS_LOCAL;
dev->priv_flags &= ~IFF_TX_SKB_SHARING;
diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c
index 93e58a5e1837..cdb3728faca7 100644
--- a/net/ipv6/ip6_offload.c
+++ b/net/ipv6/ip6_offload.c
@@ -63,7 +63,6 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb,
const struct net_offload *ops;
int proto;
struct frag_hdr *fptr;
- unsigned int unfrag_ip6hlen;
unsigned int payload_len;
u8 *prevhdr;
int offset = 0;
@@ -116,8 +115,12 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb,
skb->network_header = (u8 *)ipv6h - skb->head;
if (udpfrag) {
- unfrag_ip6hlen = ip6_find_1stfragopt(skb, &prevhdr);
- fptr = (struct frag_hdr *)((u8 *)ipv6h + unfrag_ip6hlen);
+ int err = ip6_find_1stfragopt(skb, &prevhdr);
+ if (err < 0) {
+ kfree_skb_list(segs);
+ return ERR_PTR(err);
+ }
+ fptr = (struct frag_hdr *)((u8 *)ipv6h + err);
fptr->frag_off = htons(offset);
if (skb->next)
fptr->frag_off |= htons(IP6_MF);
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 58f6288e9ba5..1699acb2fa2c 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -597,7 +597,10 @@ int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
int ptr, offset = 0, err = 0;
u8 *prevhdr, nexthdr = 0;
- hlen = ip6_find_1stfragopt(skb, &prevhdr);
+ err = ip6_find_1stfragopt(skb, &prevhdr);
+ if (err < 0)
+ goto fail;
+ hlen = err;
nexthdr = *prevhdr;
mtu = ip6_skb_dst_mtu(skb);
@@ -1387,7 +1390,7 @@ emsgsize:
*/
cork->length += length;
- if ((((length + fragheaderlen) > mtu) ||
+ if ((((length + (skb ? skb->len : headersize)) > mtu) ||
(skb && skb_is_gso(skb))) &&
(sk->sk_protocol == IPPROTO_UDP) &&
(rt->dst.dev->features & NETIF_F_UFO) && !dst_xfrm(&rt->dst) &&
@@ -1463,6 +1466,11 @@ alloc_new_skb:
*/
alloclen += sizeof(struct frag_hdr);
+ copy = datalen - transhdrlen - fraggap;
+ if (copy < 0) {
+ err = -EINVAL;
+ goto error;
+ }
if (transhdrlen) {
skb = sock_alloc_send_skb(sk,
alloclen + hh_len,
@@ -1512,13 +1520,9 @@ alloc_new_skb:
data += fraggap;
pskb_trim_unique(skb_prev, maxfraglen);
}
- copy = datalen - transhdrlen - fraggap;
-
- if (copy < 0) {
- err = -EINVAL;
- kfree_skb(skb);
- goto error;
- } else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
+ if (copy > 0 &&
+ getfrag(from, data + transhdrlen, offset,
+ copy, fraggap, skb) < 0) {
err = -EFAULT;
kfree_skb(skb);
goto error;
diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c
index 6eb2ae507500..8c6c3c8e7eef 100644
--- a/net/ipv6/ip6_tunnel.c
+++ b/net/ipv6/ip6_tunnel.c
@@ -254,7 +254,6 @@ static void ip6_dev_free(struct net_device *dev)
gro_cells_destroy(&t->gro_cells);
dst_cache_destroy(&t->dst_cache);
free_percpu(dev->tstats);
- free_netdev(dev);
}
static int ip6_tnl_create2(struct net_device *dev)
@@ -322,7 +321,7 @@ static struct ip6_tnl *ip6_tnl_create(struct net *net, struct __ip6_tnl_parm *p)
return t;
failed_free:
- ip6_dev_free(dev);
+ free_netdev(dev);
failed:
return ERR_PTR(err);
}
@@ -859,6 +858,8 @@ static int __ip6_tnl_rcv(struct ip6_tnl *tunnel, struct sk_buff *skb,
return 0;
drop:
+ if (tun_dst)
+ dst_release((struct dst_entry *)tun_dst);
kfree_skb(skb);
return 0;
}
@@ -1095,6 +1096,9 @@ int ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev, __u8 dsfield,
if (!dst) {
route_lookup:
+ /* add dsfield to flowlabel for route lookup */
+ fl6->flowlabel = ip6_make_flowinfo(dsfield, fl6->flowlabel);
+
dst = ip6_route_output(net, NULL, fl6);
if (dst->error)
@@ -1196,7 +1200,7 @@ route_lookup:
skb_push(skb, sizeof(struct ipv6hdr));
skb_reset_network_header(skb);
ipv6h = ipv6_hdr(skb);
- ip6_flow_hdr(ipv6h, INET_ECN_encapsulate(0, dsfield),
+ ip6_flow_hdr(ipv6h, dsfield,
ip6_make_flowlabel(net, skb, fl6->flowlabel, true, fl6));
ipv6h->hop_limit = hop_limit;
ipv6h->nexthdr = proto;
@@ -1231,8 +1235,6 @@ ip4ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev)
if (tproto != IPPROTO_IPIP && tproto != 0)
return -1;
- dsfield = ipv4_get_dsfield(iph);
-
if (t->parms.collect_md) {
struct ip_tunnel_info *tun_info;
const struct ip_tunnel_key *key;
@@ -1246,6 +1248,7 @@ ip4ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev)
fl6.flowi6_proto = IPPROTO_IPIP;
fl6.daddr = key->u.ipv6.dst;
fl6.flowlabel = key->label;
+ dsfield = key->tos;
} else {
if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT))
encap_limit = t->parms.encap_limit;
@@ -1254,8 +1257,9 @@ ip4ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev)
fl6.flowi6_proto = IPPROTO_IPIP;
if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS)
- fl6.flowlabel |= htonl((__u32)iph->tos << IPV6_TCLASS_SHIFT)
- & IPV6_TCLASS_MASK;
+ dsfield = ipv4_get_dsfield(iph);
+ else
+ dsfield = ip6_tclass(t->parms.flowinfo);
if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK)
fl6.flowi6_mark = skb->mark;
else
@@ -1267,6 +1271,8 @@ ip4ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev)
if (iptunnel_handle_offloads(skb, SKB_GSO_IPXIP6))
return -1;
+ dsfield = INET_ECN_encapsulate(dsfield, ipv4_get_dsfield(iph));
+
skb_set_inner_ipproto(skb, IPPROTO_IPIP);
err = ip6_tnl_xmit(skb, dev, dsfield, &fl6, encap_limit, &mtu,
@@ -1300,8 +1306,6 @@ ip6ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev)
ip6_tnl_addr_conflict(t, ipv6h))
return -1;
- dsfield = ipv6_get_dsfield(ipv6h);
-
if (t->parms.collect_md) {
struct ip_tunnel_info *tun_info;
const struct ip_tunnel_key *key;
@@ -1315,6 +1319,7 @@ ip6ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev)
fl6.flowi6_proto = IPPROTO_IPV6;
fl6.daddr = key->u.ipv6.dst;
fl6.flowlabel = key->label;
+ dsfield = key->tos;
} else {
offset = ip6_tnl_parse_tlv_enc_lim(skb, skb_network_header(skb));
/* ip6_tnl_parse_tlv_enc_lim() might have reallocated skb->head */
@@ -1337,7 +1342,9 @@ ip6ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev)
fl6.flowi6_proto = IPPROTO_IPV6;
if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS)
- fl6.flowlabel |= (*(__be32 *)ipv6h & IPV6_TCLASS_MASK);
+ dsfield = ipv6_get_dsfield(ipv6h);
+ else
+ dsfield = ip6_tclass(t->parms.flowinfo);
if (t->parms.flags & IP6_TNL_F_USE_ORIG_FLOWLABEL)
fl6.flowlabel |= ip6_flowlabel(ipv6h);
if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK)
@@ -1351,6 +1358,8 @@ ip6ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev)
if (iptunnel_handle_offloads(skb, SKB_GSO_IPXIP6))
return -1;
+ dsfield = INET_ECN_encapsulate(dsfield, ipv6_get_dsfield(ipv6h));
+
skb_set_inner_ipproto(skb, IPPROTO_IPV6);
err = ip6_tnl_xmit(skb, dev, dsfield, &fl6, encap_limit, &mtu,
@@ -1769,7 +1778,8 @@ static const struct net_device_ops ip6_tnl_netdev_ops = {
static void ip6_tnl_dev_setup(struct net_device *dev)
{
dev->netdev_ops = &ip6_tnl_netdev_ops;
- dev->destructor = ip6_dev_free;
+ dev->needs_free_netdev = true;
+ dev->priv_destructor = ip6_dev_free;
dev->type = ARPHRD_TUNNEL6;
dev->flags |= IFF_NOARP;
@@ -2216,7 +2226,7 @@ static int __net_init ip6_tnl_init_net(struct net *net)
return 0;
err_register:
- ip6_dev_free(ip6n->fb_tnl_dev);
+ free_netdev(ip6n->fb_tnl_dev);
err_alloc_dev:
return err;
}
diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c
index d67ef56454b2..837ea1eefe7f 100644
--- a/net/ipv6/ip6_vti.c
+++ b/net/ipv6/ip6_vti.c
@@ -180,7 +180,6 @@ vti6_tnl_unlink(struct vti6_net *ip6n, struct ip6_tnl *t)
static void vti6_dev_free(struct net_device *dev)
{
free_percpu(dev->tstats);
- free_netdev(dev);
}
static int vti6_tnl_create2(struct net_device *dev)
@@ -235,7 +234,7 @@ static struct ip6_tnl *vti6_tnl_create(struct net *net, struct __ip6_tnl_parm *p
return t;
failed_free:
- vti6_dev_free(dev);
+ free_netdev(dev);
failed:
return NULL;
}
@@ -842,7 +841,8 @@ static const struct net_device_ops vti6_netdev_ops = {
static void vti6_dev_setup(struct net_device *dev)
{
dev->netdev_ops = &vti6_netdev_ops;
- dev->destructor = vti6_dev_free;
+ dev->needs_free_netdev = true;
+ dev->priv_destructor = vti6_dev_free;
dev->type = ARPHRD_TUNNEL6;
dev->hard_header_len = LL_MAX_HEADER + sizeof(struct ipv6hdr);
@@ -1100,7 +1100,7 @@ static int __net_init vti6_init_net(struct net *net)
return 0;
err_register:
- vti6_dev_free(ip6n->fb_tnl_dev);
+ free_netdev(ip6n->fb_tnl_dev);
err_alloc_dev:
return err;
}
diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c
index 374997d26488..2ecb39b943b5 100644
--- a/net/ipv6/ip6mr.c
+++ b/net/ipv6/ip6mr.c
@@ -733,7 +733,7 @@ static void reg_vif_setup(struct net_device *dev)
dev->mtu = 1500 - sizeof(struct ipv6hdr) - 8;
dev->flags = IFF_NOARP;
dev->netdev_ops = &reg_vif_netdev_ops;
- dev->destructor = free_netdev;
+ dev->needs_free_netdev = true;
dev->features |= NETIF_F_NETNS_LOCAL;
}
diff --git a/net/ipv6/output_core.c b/net/ipv6/output_core.c
index cd4252346a32..e9065b8d3af8 100644
--- a/net/ipv6/output_core.c
+++ b/net/ipv6/output_core.c
@@ -79,14 +79,13 @@ EXPORT_SYMBOL(ipv6_select_ident);
int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
{
u16 offset = sizeof(struct ipv6hdr);
- struct ipv6_opt_hdr *exthdr =
- (struct ipv6_opt_hdr *)(ipv6_hdr(skb) + 1);
unsigned int packet_len = skb_tail_pointer(skb) -
skb_network_header(skb);
int found_rhdr = 0;
*nexthdr = &ipv6_hdr(skb)->nexthdr;
- while (offset + 1 <= packet_len) {
+ while (offset <= packet_len) {
+ struct ipv6_opt_hdr *exthdr;
switch (**nexthdr) {
@@ -107,13 +106,16 @@ int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
return offset;
}
- offset += ipv6_optlen(exthdr);
- *nexthdr = &exthdr->nexthdr;
+ if (offset + sizeof(struct ipv6_opt_hdr) > packet_len)
+ return -EINVAL;
+
exthdr = (struct ipv6_opt_hdr *)(skb_network_header(skb) +
offset);
+ offset += ipv6_optlen(exthdr);
+ *nexthdr = &exthdr->nexthdr;
}
- return offset;
+ return -EINVAL;
}
EXPORT_SYMBOL(ip6_find_1stfragopt);
diff --git a/net/ipv6/ping.c b/net/ipv6/ping.c
index 9b522fa90e6d..ac826dd338ff 100644
--- a/net/ipv6/ping.c
+++ b/net/ipv6/ping.c
@@ -192,7 +192,7 @@ static struct inet_protosw pingv6_protosw = {
.type = SOCK_DGRAM,
.protocol = IPPROTO_ICMPV6,
.prot = &pingv6_prot,
- .ops = &inet6_dgram_ops,
+ .ops = &inet6_sockraw_ops,
.flags = INET_PROTOSW_REUSE,
};
diff --git a/net/ipv6/proc.c b/net/ipv6/proc.c
index cc8e3ae9ca73..e88bcb8ff0fd 100644
--- a/net/ipv6/proc.c
+++ b/net/ipv6/proc.c
@@ -219,7 +219,7 @@ static void snmp6_seq_show_item64(struct seq_file *seq, void __percpu *mib,
u64 buff64[SNMP_MIB_MAX];
int i;
- memset(buff64, 0, sizeof(unsigned long) * SNMP_MIB_MAX);
+ memset(buff64, 0, sizeof(u64) * SNMP_MIB_MAX);
snmp_get_cpu_field64_batch(buff64, itemlist, mib, syncpoff);
for (i = 0; itemlist[i].name; i++)
diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c
index 1f992d9e261d..60be012fe708 100644
--- a/net/ipv6/raw.c
+++ b/net/ipv6/raw.c
@@ -1338,7 +1338,7 @@ void raw6_proc_exit(void)
#endif /* CONFIG_PROC_FS */
/* Same as inet6_dgram_ops, sans udp_poll. */
-static const struct proto_ops inet6_sockraw_ops = {
+const struct proto_ops inet6_sockraw_ops = {
.family = PF_INET6,
.owner = THIS_MODULE,
.release = inet6_release,
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index dc61b0b5e64e..322bd62e688b 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -2804,6 +2804,7 @@ static int fib6_ifdown(struct rt6_info *rt, void *arg)
if ((rt->dst.dev == dev || !dev) &&
rt != adn->net->ipv6.ip6_null_entry &&
(rt->rt6i_nsiblings == 0 ||
+ (dev && netdev_unregistering(dev)) ||
!rt->rt6i_idev->cnf.ignore_routes_with_linkdown))
return -1;
@@ -3721,7 +3722,11 @@ static int ip6_route_dev_notify(struct notifier_block *this,
net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
#endif
- } else if (event == NETDEV_UNREGISTER) {
+ } else if (event == NETDEV_UNREGISTER &&
+ dev->reg_state != NETREG_UNREGISTERED) {
+ /* NETDEV_UNREGISTER could be fired for multiple times by
+ * netdev_wait_allrefs(). Make sure we only call this once.
+ */
in6_dev_put(net->ipv6.ip6_null_entry->rt6i_idev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
in6_dev_put(net->ipv6.ip6_prohibit_entry->rt6i_idev);
diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c
index 61e5902f0687..f8ad15891cd7 100644
--- a/net/ipv6/sit.c
+++ b/net/ipv6/sit.c
@@ -265,7 +265,7 @@ static struct ip_tunnel *ipip6_tunnel_locate(struct net *net,
return nt;
failed_free:
- ipip6_dev_free(dev);
+ free_netdev(dev);
failed:
return NULL;
}
@@ -305,7 +305,7 @@ static int ipip6_tunnel_get_prl(struct ip_tunnel *t,
* we try harder to allocate.
*/
kp = (cmax <= 1 || capable(CAP_NET_ADMIN)) ?
- kcalloc(cmax, sizeof(*kp), GFP_KERNEL) :
+ kcalloc(cmax, sizeof(*kp), GFP_KERNEL | __GFP_NOWARN) :
NULL;
rcu_read_lock();
@@ -1336,7 +1336,6 @@ static void ipip6_dev_free(struct net_device *dev)
dst_cache_destroy(&tunnel->dst_cache);
free_percpu(dev->tstats);
- free_netdev(dev);
}
#define SIT_FEATURES (NETIF_F_SG | \
@@ -1351,7 +1350,8 @@ static void ipip6_tunnel_setup(struct net_device *dev)
int t_hlen = tunnel->hlen + sizeof(struct iphdr);
dev->netdev_ops = &ipip6_netdev_ops;
- dev->destructor = ipip6_dev_free;
+ dev->needs_free_netdev = true;
+ dev->priv_destructor = ipip6_dev_free;
dev->type = ARPHRD_SIT;
dev->hard_header_len = LL_MAX_HEADER + t_hlen;
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index aeb9497b5bb7..4f4310a36a04 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1062,6 +1062,7 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff *
newtp->af_specific = &tcp_sock_ipv6_mapped_specific;
#endif
+ newnp->ipv6_mc_list = NULL;
newnp->ipv6_ac_list = NULL;
newnp->ipv6_fl_list = NULL;
newnp->pktoptions = NULL;
@@ -1131,6 +1132,7 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff *
First: no IPv4 options.
*/
newinet->inet_opt = NULL;
+ newnp->ipv6_mc_list = NULL;
newnp->ipv6_ac_list = NULL;
newnp->ipv6_fl_list = NULL;
@@ -1917,7 +1919,7 @@ struct proto tcpv6_prot = {
.sysctl_rmem = sysctl_tcp_rmem,
.max_header = MAX_TCP_HEADER,
.obj_size = sizeof(struct tcp6_sock),
- .slab_flags = SLAB_DESTROY_BY_RCU,
+ .slab_flags = SLAB_TYPESAFE_BY_RCU,
.twsk_prot = &tcp6_timewait_sock_ops,
.rsk_prot = &tcp6_request_sock_ops,
.h.hashinfo = &tcp_hashinfo,
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 04862abfe4ec..75703fda23e7 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -526,7 +526,7 @@ out:
return;
}
-int __udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
+static int __udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
int rc;
@@ -569,7 +569,7 @@ void udpv6_encap_enable(void)
}
EXPORT_SYMBOL(udpv6_encap_enable);
-int udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
+static int udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
struct udp_sock *up = udp_sk(sk);
int is_udplite = IS_UDPLITE(sk);
@@ -879,7 +879,8 @@ static struct sock *__udp6_lib_demux_lookup(struct net *net,
struct sock *sk;
udp_portaddr_for_each_entry_rcu(sk, &hslot2->head) {
- if (INET6_MATCH(sk, net, rmt_addr, loc_addr, ports, dif))
+ if (sk->sk_state == TCP_ESTABLISHED &&
+ INET6_MATCH(sk, net, rmt_addr, loc_addr, ports, dif))
return sk;
/* Only check first socket in chain */
break;
diff --git a/net/ipv6/udp_impl.h b/net/ipv6/udp_impl.h
index e78bdc76dcc3..f180b3d85e31 100644
--- a/net/ipv6/udp_impl.h
+++ b/net/ipv6/udp_impl.h
@@ -26,7 +26,6 @@ int compat_udpv6_getsockopt(struct sock *sk, int level, int optname,
int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len);
int udpv6_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int noblock,
int flags, int *addr_len);
-int __udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb);
void udpv6_destroy_sock(struct sock *sk);
#ifdef CONFIG_PROC_FS
diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c
index ac858c480f2f..a2267f80febb 100644
--- a/net/ipv6/udp_offload.c
+++ b/net/ipv6/udp_offload.c
@@ -29,6 +29,7 @@ static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb,
u8 frag_hdr_sz = sizeof(struct frag_hdr);
__wsum csum;
int tnl_hlen;
+ int err;
mss = skb_shinfo(skb)->gso_size;
if (unlikely(skb->len <= mss))
@@ -90,7 +91,10 @@ static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb,
/* Find the unfragmentable header and shift it left by frag_hdr_sz
* bytes to insert fragment header.
*/
- unfrag_ip6hlen = ip6_find_1stfragopt(skb, &prevhdr);
+ err = ip6_find_1stfragopt(skb, &prevhdr);
+ if (err < 0)
+ return ERR_PTR(err);
+ unfrag_ip6hlen = err;
nexthdr = *prevhdr;
*prevhdr = NEXTHDR_FRAGMENT;
unfrag_len = (skb_network_header(skb) - skb_mac_header(skb)) +
diff --git a/net/ipv6/xfrm6_input.c b/net/ipv6/xfrm6_input.c
index 08a807b29298..3ef5d913e7a3 100644
--- a/net/ipv6/xfrm6_input.c
+++ b/net/ipv6/xfrm6_input.c
@@ -43,8 +43,8 @@ int xfrm6_transport_finish(struct sk_buff *skb, int async)
return 1;
#endif
- ipv6_hdr(skb)->payload_len = htons(skb->len);
__skb_push(skb, skb->data - skb_network_header(skb));
+ ipv6_hdr(skb)->payload_len = htons(skb->len - sizeof(struct ipv6hdr));
if (xo && (xo->flags & XFRM_GRO)) {
skb_mac_header_rebuild(skb);
diff --git a/net/ipv6/xfrm6_mode_ro.c b/net/ipv6/xfrm6_mode_ro.c
index 0e015906f9ca..07d36573f50b 100644
--- a/net/ipv6/xfrm6_mode_ro.c
+++ b/net/ipv6/xfrm6_mode_ro.c
@@ -47,6 +47,8 @@ static int xfrm6_ro_output(struct xfrm_state *x, struct sk_buff *skb)
iph = ipv6_hdr(skb);
hdr_len = x->type->hdr_offset(x, skb, &prevhdr);
+ if (hdr_len < 0)
+ return hdr_len;
skb_set_mac_header(skb, (prevhdr - x->props.header_len) - skb->data);
skb_set_network_header(skb, -x->props.header_len);
skb->transport_header = skb->network_header + hdr_len;
diff --git a/net/ipv6/xfrm6_mode_transport.c b/net/ipv6/xfrm6_mode_transport.c
index 7a92c0f31912..9ad07a91708e 100644
--- a/net/ipv6/xfrm6_mode_transport.c
+++ b/net/ipv6/xfrm6_mode_transport.c
@@ -30,6 +30,8 @@ static int xfrm6_transport_output(struct xfrm_state *x, struct sk_buff *skb)
skb_set_inner_transport_header(skb, skb_transport_offset(skb));
hdr_len = x->type->hdr_offset(x, skb, &prevhdr);
+ if (hdr_len < 0)
+ return hdr_len;
skb_set_mac_header(skb, (prevhdr - x->props.header_len) - skb->data);
skb_set_network_header(skb, -x->props.header_len);
skb->transport_header = skb->network_header + hdr_len;
diff --git a/net/irda/irlan/irlan_eth.c b/net/irda/irlan/irlan_eth.c
index 74d09f91709e..3be852808a9d 100644
--- a/net/irda/irlan/irlan_eth.c
+++ b/net/irda/irlan/irlan_eth.c
@@ -65,7 +65,7 @@ static void irlan_eth_setup(struct net_device *dev)
ether_setup(dev);
dev->netdev_ops = &irlan_eth_netdev_ops;
- dev->destructor = free_netdev;
+ dev->needs_free_netdev = true;
dev->min_mtu = 0;
dev->max_mtu = ETH_MAX_MTU;
diff --git a/net/key/af_key.c b/net/key/af_key.c
index c1950bb14735..b1432b668033 100644
--- a/net/key/af_key.c
+++ b/net/key/af_key.c
@@ -1157,6 +1157,7 @@ static struct xfrm_state * pfkey_msg2xfrm_state(struct net *net,
goto out;
}
+ err = -ENOBUFS;
key = ext_hdrs[SADB_EXT_KEY_AUTH - 1];
if (sa->sadb_sa_auth) {
int keysize = 0;
@@ -1168,8 +1169,10 @@ static struct xfrm_state * pfkey_msg2xfrm_state(struct net *net,
if (key)
keysize = (key->sadb_key_bits + 7) / 8;
x->aalg = kmalloc(sizeof(*x->aalg) + keysize, GFP_KERNEL);
- if (!x->aalg)
+ if (!x->aalg) {
+ err = -ENOMEM;
goto out;
+ }
strcpy(x->aalg->alg_name, a->name);
x->aalg->alg_key_len = 0;
if (key) {
@@ -1188,8 +1191,10 @@ static struct xfrm_state * pfkey_msg2xfrm_state(struct net *net,
goto out;
}
x->calg = kmalloc(sizeof(*x->calg), GFP_KERNEL);
- if (!x->calg)
+ if (!x->calg) {
+ err = -ENOMEM;
goto out;
+ }
strcpy(x->calg->alg_name, a->name);
x->props.calgo = sa->sadb_sa_encrypt;
} else {
@@ -1203,8 +1208,10 @@ static struct xfrm_state * pfkey_msg2xfrm_state(struct net *net,
if (key)
keysize = (key->sadb_key_bits + 7) / 8;
x->ealg = kmalloc(sizeof(*x->ealg) + keysize, GFP_KERNEL);
- if (!x->ealg)
+ if (!x->ealg) {
+ err = -ENOMEM;
goto out;
+ }
strcpy(x->ealg->alg_name, a->name);
x->ealg->alg_key_len = 0;
if (key) {
@@ -1249,8 +1256,10 @@ static struct xfrm_state * pfkey_msg2xfrm_state(struct net *net,
struct xfrm_encap_tmpl *natt;
x->encap = kmalloc(sizeof(*x->encap), GFP_KERNEL);
- if (!x->encap)
+ if (!x->encap) {
+ err = -ENOMEM;
goto out;
+ }
natt = x->encap;
n_type = ext_hdrs[SADB_X_EXT_NAT_T_TYPE-1];
@@ -2755,6 +2764,8 @@ static int pfkey_spdflush(struct sock *sk, struct sk_buff *skb, const struct sad
int err, err2;
err = xfrm_policy_flush(net, XFRM_POLICY_TYPE_MAIN, true);
+ if (!err)
+ xfrm_garbage_collect(net);
err2 = unicast_flush_resp(sk, hdr);
if (err || err2) {
if (err == -ESRCH) /* empty table - old silent behavior */
@@ -3285,7 +3296,7 @@ static struct xfrm_policy *pfkey_compile_policy(struct sock *sk, int opt,
p += pol->sadb_x_policy_len*8;
sec_ctx = (struct sadb_x_sec_ctx *)p;
if (len < pol->sadb_x_policy_len*8 +
- sec_ctx->sadb_x_sec_len) {
+ sec_ctx->sadb_x_sec_len*8) {
*dir = -EINVAL;
goto out;
}
diff --git a/net/l2tp/l2tp_eth.c b/net/l2tp/l2tp_eth.c
index 8b21af7321b9..4de2ec94b08c 100644
--- a/net/l2tp/l2tp_eth.c
+++ b/net/l2tp/l2tp_eth.c
@@ -114,12 +114,13 @@ static void l2tp_eth_get_stats64(struct net_device *dev,
{
struct l2tp_eth *priv = netdev_priv(dev);
- stats->tx_bytes = atomic_long_read(&priv->tx_bytes);
- stats->tx_packets = atomic_long_read(&priv->tx_packets);
- stats->tx_dropped = atomic_long_read(&priv->tx_dropped);
- stats->rx_bytes = atomic_long_read(&priv->rx_bytes);
- stats->rx_packets = atomic_long_read(&priv->rx_packets);
- stats->rx_errors = atomic_long_read(&priv->rx_errors);
+ stats->tx_bytes = (unsigned long) atomic_long_read(&priv->tx_bytes);
+ stats->tx_packets = (unsigned long) atomic_long_read(&priv->tx_packets);
+ stats->tx_dropped = (unsigned long) atomic_long_read(&priv->tx_dropped);
+ stats->rx_bytes = (unsigned long) atomic_long_read(&priv->rx_bytes);
+ stats->rx_packets = (unsigned long) atomic_long_read(&priv->rx_packets);
+ stats->rx_errors = (unsigned long) atomic_long_read(&priv->rx_errors);
+
}
static const struct net_device_ops l2tp_eth_netdev_ops = {
@@ -141,7 +142,7 @@ static void l2tp_eth_dev_setup(struct net_device *dev)
dev->priv_flags &= ~IFF_TX_SKB_SHARING;
dev->features |= NETIF_F_LLTX;
dev->netdev_ops = &l2tp_eth_netdev_ops;
- dev->destructor = free_netdev;
+ dev->needs_free_netdev = true;
}
static void l2tp_eth_dev_recv(struct l2tp_session *session, struct sk_buff *skb, int data_len)
diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c
index cb4fff785cbf..c38d16f22d2a 100644
--- a/net/llc/af_llc.c
+++ b/net/llc/af_llc.c
@@ -142,7 +142,7 @@ static struct proto llc_proto = {
.name = "LLC",
.owner = THIS_MODULE,
.obj_size = sizeof(struct llc_sock),
- .slab_flags = SLAB_DESTROY_BY_RCU,
+ .slab_flags = SLAB_TYPESAFE_BY_RCU,
};
/**
@@ -311,6 +311,8 @@ static int llc_ui_bind(struct socket *sock, struct sockaddr *uaddr, int addrlen)
int rc = -EINVAL;
dprintk("%s: binding %02X\n", __func__, addr->sllc_sap);
+
+ lock_sock(sk);
if (unlikely(!sock_flag(sk, SOCK_ZAPPED) || addrlen != sizeof(*addr)))
goto out;
rc = -EAFNOSUPPORT;
@@ -382,6 +384,7 @@ static int llc_ui_bind(struct socket *sock, struct sockaddr *uaddr, int addrlen)
out_put:
llc_sap_put(sap);
out:
+ release_sock(sk);
return rc;
}
diff --git a/net/llc/llc_conn.c b/net/llc/llc_conn.c
index 8bc5a1bd2d45..9b02c13d258b 100644
--- a/net/llc/llc_conn.c
+++ b/net/llc/llc_conn.c
@@ -506,7 +506,7 @@ static struct sock *__llc_lookup_established(struct llc_sap *sap,
again:
sk_nulls_for_each_rcu(rc, node, laddr_hb) {
if (llc_estab_match(sap, daddr, laddr, rc)) {
- /* Extra checks required by SLAB_DESTROY_BY_RCU */
+ /* Extra checks required by SLAB_TYPESAFE_BY_RCU */
if (unlikely(!atomic_inc_not_zero(&rc->sk_refcnt)))
goto again;
if (unlikely(llc_sk(rc)->sap != sap ||
@@ -565,7 +565,7 @@ static struct sock *__llc_lookup_listener(struct llc_sap *sap,
again:
sk_nulls_for_each_rcu(rc, node, laddr_hb) {
if (llc_listener_match(sap, laddr, rc)) {
- /* Extra checks required by SLAB_DESTROY_BY_RCU */
+ /* Extra checks required by SLAB_TYPESAFE_BY_RCU */
if (unlikely(!atomic_inc_not_zero(&rc->sk_refcnt)))
goto again;
if (unlikely(llc_sk(rc)->sap != sap ||
diff --git a/net/llc/llc_sap.c b/net/llc/llc_sap.c
index 5404d0d195cc..63b6ab056370 100644
--- a/net/llc/llc_sap.c
+++ b/net/llc/llc_sap.c
@@ -328,7 +328,7 @@ static struct sock *llc_lookup_dgram(struct llc_sap *sap,
again:
sk_nulls_for_each_rcu(rc, node, laddr_hb) {
if (llc_dgram_match(sap, laddr, rc)) {
- /* Extra checks required by SLAB_DESTROY_BY_RCU */
+ /* Extra checks required by SLAB_TYPESAFE_BY_RCU */
if (unlikely(!atomic_inc_not_zero(&rc->sk_refcnt)))
goto again;
if (unlikely(llc_sk(rc)->sap != sap ||
diff --git a/net/mac80211/agg-tx.c b/net/mac80211/agg-tx.c
index 60e2a62f7bef..cf2392b2ac71 100644
--- a/net/mac80211/agg-tx.c
+++ b/net/mac80211/agg-tx.c
@@ -7,7 +7,7 @@
* Copyright 2006-2007 Jiri Benc <jbenc@suse.cz>
* Copyright 2007, Michael Wu <flamingice@sourmilk.net>
* Copyright 2007-2010, Intel Corporation
- * Copyright(c) 2015 Intel Deutschland GmbH
+ * Copyright(c) 2015-2017 Intel Deutschland GmbH
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
@@ -741,46 +741,43 @@ static void ieee80211_agg_tx_operational(struct ieee80211_local *local,
ieee80211_agg_start_txq(sta, tid, true);
}
-void ieee80211_start_tx_ba_cb(struct ieee80211_vif *vif, u8 *ra, u16 tid)
+void ieee80211_start_tx_ba_cb(struct sta_info *sta, int tid,
+ struct tid_ampdu_tx *tid_tx)
{
- struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif);
+ struct ieee80211_sub_if_data *sdata = sta->sdata;
struct ieee80211_local *local = sdata->local;
- struct sta_info *sta;
- struct tid_ampdu_tx *tid_tx;
- trace_api_start_tx_ba_cb(sdata, ra, tid);
+ if (WARN_ON(test_and_set_bit(HT_AGG_STATE_DRV_READY, &tid_tx->state)))
+ return;
+
+ if (test_bit(HT_AGG_STATE_RESPONSE_RECEIVED, &tid_tx->state))
+ ieee80211_agg_tx_operational(local, sta, tid);
+}
+
+static struct tid_ampdu_tx *
+ieee80211_lookup_tid_tx(struct ieee80211_sub_if_data *sdata,
+ const u8 *ra, u16 tid, struct sta_info **sta)
+{
+ struct tid_ampdu_tx *tid_tx;
if (tid >= IEEE80211_NUM_TIDS) {
ht_dbg(sdata, "Bad TID value: tid = %d (>= %d)\n",
tid, IEEE80211_NUM_TIDS);
- return;
+ return NULL;
}
- mutex_lock(&local->sta_mtx);
- sta = sta_info_get_bss(sdata, ra);
- if (!sta) {
- mutex_unlock(&local->sta_mtx);
+ *sta = sta_info_get_bss(sdata, ra);
+ if (!*sta) {
ht_dbg(sdata, "Could not find station: %pM\n", ra);
- return;
+ return NULL;
}
- mutex_lock(&sta->ampdu_mlme.mtx);
- tid_tx = rcu_dereference_protected_tid_tx(sta, tid);
+ tid_tx = rcu_dereference((*sta)->ampdu_mlme.tid_tx[tid]);
- if (WARN_ON(!tid_tx)) {
+ if (WARN_ON(!tid_tx))
ht_dbg(sdata, "addBA was not requested!\n");
- goto unlock;
- }
- if (WARN_ON(test_and_set_bit(HT_AGG_STATE_DRV_READY, &tid_tx->state)))
- goto unlock;
-
- if (test_bit(HT_AGG_STATE_RESPONSE_RECEIVED, &tid_tx->state))
- ieee80211_agg_tx_operational(local, sta, tid);
-
- unlock:
- mutex_unlock(&sta->ampdu_mlme.mtx);
- mutex_unlock(&local->sta_mtx);
+ return tid_tx;
}
void ieee80211_start_tx_ba_cb_irqsafe(struct ieee80211_vif *vif,
@@ -788,19 +785,20 @@ void ieee80211_start_tx_ba_cb_irqsafe(struct ieee80211_vif *vif,
{
struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif);
struct ieee80211_local *local = sdata->local;
- struct ieee80211_ra_tid *ra_tid;
- struct sk_buff *skb = dev_alloc_skb(0);
+ struct sta_info *sta;
+ struct tid_ampdu_tx *tid_tx;
- if (unlikely(!skb))
- return;
+ trace_api_start_tx_ba_cb(sdata, ra, tid);
- ra_tid = (struct ieee80211_ra_tid *) &skb->cb;
- memcpy(&ra_tid->ra, ra, ETH_ALEN);
- ra_tid->tid = tid;
+ rcu_read_lock();
+ tid_tx = ieee80211_lookup_tid_tx(sdata, ra, tid, &sta);
+ if (!tid_tx)
+ goto out;
- skb->pkt_type = IEEE80211_SDATA_QUEUE_AGG_START;
- skb_queue_tail(&sdata->skb_queue, skb);
- ieee80211_queue_work(&local->hw, &sdata->work);
+ set_bit(HT_AGG_STATE_START_CB, &tid_tx->state);
+ ieee80211_queue_work(&local->hw, &sta->ampdu_mlme.work);
+ out:
+ rcu_read_unlock();
}
EXPORT_SYMBOL(ieee80211_start_tx_ba_cb_irqsafe);
@@ -860,37 +858,18 @@ int ieee80211_stop_tx_ba_session(struct ieee80211_sta *pubsta, u16 tid)
}
EXPORT_SYMBOL(ieee80211_stop_tx_ba_session);
-void ieee80211_stop_tx_ba_cb(struct ieee80211_vif *vif, u8 *ra, u8 tid)
+void ieee80211_stop_tx_ba_cb(struct sta_info *sta, int tid,
+ struct tid_ampdu_tx *tid_tx)
{
- struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif);
- struct ieee80211_local *local = sdata->local;
- struct sta_info *sta;
- struct tid_ampdu_tx *tid_tx;
+ struct ieee80211_sub_if_data *sdata = sta->sdata;
bool send_delba = false;
- trace_api_stop_tx_ba_cb(sdata, ra, tid);
-
- if (tid >= IEEE80211_NUM_TIDS) {
- ht_dbg(sdata, "Bad TID value: tid = %d (>= %d)\n",
- tid, IEEE80211_NUM_TIDS);
- return;
- }
-
- ht_dbg(sdata, "Stopping Tx BA session for %pM tid %d\n", ra, tid);
-
- mutex_lock(&local->sta_mtx);
-
- sta = sta_info_get_bss(sdata, ra);
- if (!sta) {
- ht_dbg(sdata, "Could not find station: %pM\n", ra);
- goto unlock;
- }
+ ht_dbg(sdata, "Stopping Tx BA session for %pM tid %d\n",
+ sta->sta.addr, tid);
- mutex_lock(&sta->ampdu_mlme.mtx);
spin_lock_bh(&sta->lock);
- tid_tx = rcu_dereference_protected_tid_tx(sta, tid);
- if (!tid_tx || !test_bit(HT_AGG_STATE_STOPPING, &tid_tx->state)) {
+ if (!test_bit(HT_AGG_STATE_STOPPING, &tid_tx->state)) {
ht_dbg(sdata,
"unexpected callback to A-MPDU stop for %pM tid %d\n",
sta->sta.addr, tid);
@@ -906,12 +885,8 @@ void ieee80211_stop_tx_ba_cb(struct ieee80211_vif *vif, u8 *ra, u8 tid)
spin_unlock_bh(&sta->lock);
if (send_delba)
- ieee80211_send_delba(sdata, ra, tid,
+ ieee80211_send_delba(sdata, sta->sta.addr, tid,
WLAN_BACK_INITIATOR, WLAN_REASON_QSTA_NOT_USE);
-
- mutex_unlock(&sta->ampdu_mlme.mtx);
- unlock:
- mutex_unlock(&local->sta_mtx);
}
void ieee80211_stop_tx_ba_cb_irqsafe(struct ieee80211_vif *vif,
@@ -919,19 +894,20 @@ void ieee80211_stop_tx_ba_cb_irqsafe(struct ieee80211_vif *vif,
{
struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif);
struct ieee80211_local *local = sdata->local;
- struct ieee80211_ra_tid *ra_tid;
- struct sk_buff *skb = dev_alloc_skb(0);
+ struct sta_info *sta;
+ struct tid_ampdu_tx *tid_tx;
- if (unlikely(!skb))
- return;
+ trace_api_stop_tx_ba_cb(sdata, ra, tid);
- ra_tid = (struct ieee80211_ra_tid *) &skb->cb;
- memcpy(&ra_tid->ra, ra, ETH_ALEN);
- ra_tid->tid = tid;
+ rcu_read_lock();
+ tid_tx = ieee80211_lookup_tid_tx(sdata, ra, tid, &sta);
+ if (!tid_tx)
+ goto out;
- skb->pkt_type = IEEE80211_SDATA_QUEUE_AGG_STOP;
- skb_queue_tail(&sdata->skb_queue, skb);
- ieee80211_queue_work(&local->hw, &sdata->work);
+ set_bit(HT_AGG_STATE_STOP_CB, &tid_tx->state);
+ ieee80211_queue_work(&local->hw, &sta->ampdu_mlme.work);
+ out:
+ rcu_read_unlock();
}
EXPORT_SYMBOL(ieee80211_stop_tx_ba_cb_irqsafe);
diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c
index 6c2e6060cd54..4a388fe8c2d1 100644
--- a/net/mac80211/cfg.c
+++ b/net/mac80211/cfg.c
@@ -902,6 +902,8 @@ static int ieee80211_start_ap(struct wiphy *wiphy, struct net_device *dev,
default:
return -EINVAL;
}
+ sdata->u.ap.req_smps = sdata->smps_mode;
+
sdata->needed_rx_chains = sdata->local->rx_chains;
sdata->vif.bss_conf.beacon_int = params->beacon_interval;
diff --git a/net/mac80211/ht.c b/net/mac80211/ht.c
index f4a528773563..6ca5442b1e03 100644
--- a/net/mac80211/ht.c
+++ b/net/mac80211/ht.c
@@ -7,6 +7,7 @@
* Copyright 2006-2007 Jiri Benc <jbenc@suse.cz>
* Copyright 2007, Michael Wu <flamingice@sourmilk.net>
* Copyright 2007-2010, Intel Corporation
+ * Copyright 2017 Intel Deutschland GmbH
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
@@ -289,8 +290,6 @@ void ieee80211_sta_tear_down_BA_sessions(struct sta_info *sta,
{
int i;
- cancel_work_sync(&sta->ampdu_mlme.work);
-
for (i = 0; i < IEEE80211_NUM_TIDS; i++) {
__ieee80211_stop_tx_ba_session(sta, i, reason);
__ieee80211_stop_rx_ba_session(sta, i, WLAN_BACK_RECIPIENT,
@@ -298,6 +297,9 @@ void ieee80211_sta_tear_down_BA_sessions(struct sta_info *sta,
reason != AGG_STOP_DESTROY_STA &&
reason != AGG_STOP_PEER_REQUEST);
}
+
+ /* stopping might queue the work again - so cancel only afterwards */
+ cancel_work_sync(&sta->ampdu_mlme.work);
}
void ieee80211_ba_session_work(struct work_struct *work)
@@ -352,10 +354,16 @@ void ieee80211_ba_session_work(struct work_struct *work)
spin_unlock_bh(&sta->lock);
tid_tx = rcu_dereference_protected_tid_tx(sta, tid);
- if (tid_tx && test_and_clear_bit(HT_AGG_STATE_WANT_STOP,
- &tid_tx->state))
+ if (!tid_tx)
+ continue;
+
+ if (test_and_clear_bit(HT_AGG_STATE_START_CB, &tid_tx->state))
+ ieee80211_start_tx_ba_cb(sta, tid, tid_tx);
+ if (test_and_clear_bit(HT_AGG_STATE_WANT_STOP, &tid_tx->state))
___ieee80211_stop_tx_ba_session(sta, tid,
AGG_STOP_LOCAL_REQUEST);
+ if (test_and_clear_bit(HT_AGG_STATE_STOP_CB, &tid_tx->state))
+ ieee80211_stop_tx_ba_cb(sta, tid, tid_tx);
}
mutex_unlock(&sta->ampdu_mlme.mtx);
}
diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index f8f6c148f554..5e002f62c235 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -1036,8 +1036,6 @@ struct ieee80211_rx_agg {
enum sdata_queue_type {
IEEE80211_SDATA_QUEUE_TYPE_FRAME = 0,
- IEEE80211_SDATA_QUEUE_AGG_START = 1,
- IEEE80211_SDATA_QUEUE_AGG_STOP = 2,
IEEE80211_SDATA_QUEUE_RX_AGG_START = 3,
IEEE80211_SDATA_QUEUE_RX_AGG_STOP = 4,
};
@@ -1427,12 +1425,6 @@ ieee80211_get_sband(struct ieee80211_sub_if_data *sdata)
return local->hw.wiphy->bands[band];
}
-/* this struct represents 802.11n's RA/TID combination */
-struct ieee80211_ra_tid {
- u8 ra[ETH_ALEN];
- u16 tid;
-};
-
/* this struct holds the value parsing from channel switch IE */
struct ieee80211_csa_ie {
struct cfg80211_chan_def chandef;
@@ -1539,7 +1531,7 @@ ieee80211_have_rx_timestamp(struct ieee80211_rx_status *status)
return true;
/* can't handle non-legacy preamble yet */
if (status->flag & RX_FLAG_MACTIME_PLCP_START &&
- status->encoding != RX_ENC_LEGACY)
+ status->encoding == RX_ENC_LEGACY)
return true;
return false;
}
@@ -1794,8 +1786,10 @@ int __ieee80211_stop_tx_ba_session(struct sta_info *sta, u16 tid,
enum ieee80211_agg_stop_reason reason);
int ___ieee80211_stop_tx_ba_session(struct sta_info *sta, u16 tid,
enum ieee80211_agg_stop_reason reason);
-void ieee80211_start_tx_ba_cb(struct ieee80211_vif *vif, u8 *ra, u16 tid);
-void ieee80211_stop_tx_ba_cb(struct ieee80211_vif *vif, u8 *ra, u8 tid);
+void ieee80211_start_tx_ba_cb(struct sta_info *sta, int tid,
+ struct tid_ampdu_tx *tid_tx);
+void ieee80211_stop_tx_ba_cb(struct sta_info *sta, int tid,
+ struct tid_ampdu_tx *tid_tx);
void ieee80211_ba_session_work(struct work_struct *work);
void ieee80211_tx_ba_session_handle_start(struct sta_info *sta, int tid);
void ieee80211_release_reorder_timeout(struct sta_info *sta, int tid);
diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c
index 3bd5b81f5d81..f5f50150ba1c 100644
--- a/net/mac80211/iface.c
+++ b/net/mac80211/iface.c
@@ -1213,7 +1213,6 @@ static const struct net_device_ops ieee80211_monitorif_ops = {
static void ieee80211_if_free(struct net_device *dev)
{
free_percpu(dev->tstats);
- free_netdev(dev);
}
static void ieee80211_if_setup(struct net_device *dev)
@@ -1221,7 +1220,8 @@ static void ieee80211_if_setup(struct net_device *dev)
ether_setup(dev);
dev->priv_flags &= ~IFF_TX_SKB_SHARING;
dev->netdev_ops = &ieee80211_dataif_ops;
- dev->destructor = ieee80211_if_free;
+ dev->needs_free_netdev = true;
+ dev->priv_destructor = ieee80211_if_free;
}
static void ieee80211_if_setup_no_queue(struct net_device *dev)
@@ -1237,7 +1237,6 @@ static void ieee80211_iface_work(struct work_struct *work)
struct ieee80211_local *local = sdata->local;
struct sk_buff *skb;
struct sta_info *sta;
- struct ieee80211_ra_tid *ra_tid;
struct ieee80211_rx_agg *rx_agg;
if (!ieee80211_sdata_running(sdata))
@@ -1253,15 +1252,7 @@ static void ieee80211_iface_work(struct work_struct *work)
while ((skb = skb_dequeue(&sdata->skb_queue))) {
struct ieee80211_mgmt *mgmt = (void *)skb->data;
- if (skb->pkt_type == IEEE80211_SDATA_QUEUE_AGG_START) {
- ra_tid = (void *)&skb->cb;
- ieee80211_start_tx_ba_cb(&sdata->vif, ra_tid->ra,
- ra_tid->tid);
- } else if (skb->pkt_type == IEEE80211_SDATA_QUEUE_AGG_STOP) {
- ra_tid = (void *)&skb->cb;
- ieee80211_stop_tx_ba_cb(&sdata->vif, ra_tid->ra,
- ra_tid->tid);
- } else if (skb->pkt_type == IEEE80211_SDATA_QUEUE_RX_AGG_START) {
+ if (skb->pkt_type == IEEE80211_SDATA_QUEUE_RX_AGG_START) {
rx_agg = (void *)&skb->cb;
mutex_lock(&local->sta_mtx);
sta = sta_info_get_bss(sdata, rx_agg->addr);
@@ -1825,6 +1816,7 @@ int ieee80211_if_add(struct ieee80211_local *local, const char *name,
ret = dev_alloc_name(ndev, ndev->name);
if (ret < 0) {
ieee80211_if_free(ndev);
+ free_netdev(ndev);
return ret;
}
@@ -1914,7 +1906,7 @@ int ieee80211_if_add(struct ieee80211_local *local, const char *name,
ret = register_netdevice(ndev);
if (ret) {
- ieee80211_if_free(ndev);
+ free_netdev(ndev);
return ret;
}
}
diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index 0ea9712bd99e..cc8e6ea1b27e 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -601,7 +601,7 @@ static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata)
struct ieee80211_supported_band *sband;
struct ieee80211_chanctx_conf *chanctx_conf;
struct ieee80211_channel *chan;
- u32 rate_flags, rates = 0;
+ u32 rates = 0;
sdata_assert_lock(sdata);
@@ -612,7 +612,6 @@ static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata)
return;
}
chan = chanctx_conf->def.chan;
- rate_flags = ieee80211_chandef_rate_flags(&chanctx_conf->def);
rcu_read_unlock();
sband = local->hw.wiphy->bands[chan->band];
shift = ieee80211_vif_get_shift(&sdata->vif);
@@ -636,9 +635,6 @@ static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata)
*/
rates_len = 0;
for (i = 0; i < sband->n_bitrates; i++) {
- if ((rate_flags & sband->bitrates[i].flags)
- != rate_flags)
- continue;
rates |= BIT(i);
rates_len++;
}
@@ -2818,7 +2814,7 @@ static void ieee80211_get_rates(struct ieee80211_supported_band *sband,
u32 *rates, u32 *basic_rates,
bool *have_higher_than_11mbit,
int *min_rate, int *min_rate_index,
- int shift, u32 rate_flags)
+ int shift)
{
int i, j;
@@ -2846,8 +2842,6 @@ static void ieee80211_get_rates(struct ieee80211_supported_band *sband,
int brate;
br = &sband->bitrates[j];
- if ((rate_flags & br->flags) != rate_flags)
- continue;
brate = DIV_ROUND_UP(br->bitrate, (1 << shift) * 5);
if (brate == rate) {
@@ -4398,40 +4392,32 @@ static int ieee80211_prep_connection(struct ieee80211_sub_if_data *sdata,
return -ENOMEM;
}
- if (new_sta || override) {
- err = ieee80211_prep_channel(sdata, cbss);
- if (err) {
- if (new_sta)
- sta_info_free(local, new_sta);
- return -EINVAL;
- }
- }
-
+ /*
+ * Set up the information for the new channel before setting the
+ * new channel. We can't - completely race-free - change the basic
+ * rates bitmap and the channel (sband) that it refers to, but if
+ * we set it up before we at least avoid calling into the driver's
+ * bss_info_changed() method with invalid information (since we do
+ * call that from changing the channel - only for IDLE and perhaps
+ * some others, but ...).
+ *
+ * So to avoid that, just set up all the new information before the
+ * channel, but tell the driver to apply it only afterwards, since
+ * it might need the new channel for that.
+ */
if (new_sta) {
u32 rates = 0, basic_rates = 0;
bool have_higher_than_11mbit;
int min_rate = INT_MAX, min_rate_index = -1;
- struct ieee80211_chanctx_conf *chanctx_conf;
const struct cfg80211_bss_ies *ies;
int shift = ieee80211_vif_get_shift(&sdata->vif);
- u32 rate_flags;
-
- rcu_read_lock();
- chanctx_conf = rcu_dereference(sdata->vif.chanctx_conf);
- if (WARN_ON(!chanctx_conf)) {
- rcu_read_unlock();
- sta_info_free(local, new_sta);
- return -EINVAL;
- }
- rate_flags = ieee80211_chandef_rate_flags(&chanctx_conf->def);
- rcu_read_unlock();
ieee80211_get_rates(sband, bss->supp_rates,
bss->supp_rates_len,
&rates, &basic_rates,
&have_higher_than_11mbit,
&min_rate, &min_rate_index,
- shift, rate_flags);
+ shift);
/*
* This used to be a workaround for basic rates missing
@@ -4489,8 +4475,22 @@ static int ieee80211_prep_connection(struct ieee80211_sub_if_data *sdata,
sdata->vif.bss_conf.sync_dtim_count = 0;
}
rcu_read_unlock();
+ }
- /* tell driver about BSSID, basic rates and timing */
+ if (new_sta || override) {
+ err = ieee80211_prep_channel(sdata, cbss);
+ if (err) {
+ if (new_sta)
+ sta_info_free(local, new_sta);
+ return -EINVAL;
+ }
+ }
+
+ if (new_sta) {
+ /*
+ * tell driver about BSSID, basic rates and timing
+ * this was set up above, before setting the channel
+ */
ieee80211_bss_info_change_notify(sdata,
BSS_CHANGED_BSSID | BSS_CHANGED_BASIC_RATES |
BSS_CHANGED_BEACON_INT);
diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index 35f4c7d7a500..3674fe3d67dc 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -1613,12 +1613,16 @@ ieee80211_rx_h_sta_process(struct ieee80211_rx_data *rx)
*/
if (!ieee80211_hw_check(&sta->local->hw, AP_LINK_PS) &&
!ieee80211_has_morefrags(hdr->frame_control) &&
+ !ieee80211_is_back_req(hdr->frame_control) &&
!(status->rx_flags & IEEE80211_RX_DEFERRED_RELEASE) &&
(rx->sdata->vif.type == NL80211_IFTYPE_AP ||
rx->sdata->vif.type == NL80211_IFTYPE_AP_VLAN) &&
- /* PM bit is only checked in frames where it isn't reserved,
+ /*
+ * PM bit is only checked in frames where it isn't reserved,
* in AP mode it's reserved in non-bufferable management frames
* (cf. IEEE 802.11-2012 8.2.4.1.7 Power Management field)
+ * BAR frames should be ignored as specified in
+ * IEEE 802.11-2012 10.2.1.2.
*/
(!ieee80211_is_mgmt(hdr->frame_control) ||
ieee80211_is_bufferable_mmpdu(hdr->frame_control))) {
@@ -2492,7 +2496,8 @@ ieee80211_rx_h_mesh_fwding(struct ieee80211_rx_data *rx)
if (is_multicast_ether_addr(hdr->addr1)) {
mpp_addr = hdr->addr3;
proxied_addr = mesh_hdr->eaddr1;
- } else if (mesh_hdr->flags & MESH_FLAGS_AE_A5_A6) {
+ } else if ((mesh_hdr->flags & MESH_FLAGS_AE) ==
+ MESH_FLAGS_AE_A5_A6) {
/* has_a4 already checked in ieee80211_rx_mesh_check */
mpp_addr = hdr->addr4;
proxied_addr = mesh_hdr->eaddr2;
diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c
index 7cdf7a835bb0..403e3cc58b57 100644
--- a/net/mac80211/sta_info.c
+++ b/net/mac80211/sta_info.c
@@ -2155,7 +2155,7 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo)
struct ieee80211_sta_rx_stats *cpurxs;
cpurxs = per_cpu_ptr(sta->pcpu_rx_stats, cpu);
- sinfo->rx_packets += cpurxs->dropped;
+ sinfo->rx_dropped_misc += cpurxs->dropped;
}
}
diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h
index 5609cacb20d5..ea0747d6a6da 100644
--- a/net/mac80211/sta_info.h
+++ b/net/mac80211/sta_info.h
@@ -116,6 +116,8 @@ enum ieee80211_sta_info_flags {
#define HT_AGG_STATE_STOPPING 3
#define HT_AGG_STATE_WANT_START 4
#define HT_AGG_STATE_WANT_STOP 5
+#define HT_AGG_STATE_START_CB 6
+#define HT_AGG_STATE_STOP_CB 7
enum ieee80211_agg_stop_reason {
AGG_STOP_DECLINED,
diff --git a/net/mac80211/wpa.c b/net/mac80211/wpa.c
index c1ef22df865f..cc19614ff4e6 100644
--- a/net/mac80211/wpa.c
+++ b/net/mac80211/wpa.c
@@ -17,6 +17,7 @@
#include <asm/unaligned.h>
#include <net/mac80211.h>
#include <crypto/aes.h>
+#include <crypto/algapi.h>
#include "ieee80211_i.h"
#include "michael.h"
@@ -153,7 +154,7 @@ ieee80211_rx_h_michael_mic_verify(struct ieee80211_rx_data *rx)
data_len = skb->len - hdrlen - MICHAEL_MIC_LEN;
key = &rx->key->conf.key[NL80211_TKIP_DATA_OFFSET_RX_MIC_KEY];
michael_mic(key, hdr, data, data_len, mic);
- if (memcmp(mic, data + data_len, MICHAEL_MIC_LEN) != 0)
+ if (crypto_memneq(mic, data + data_len, MICHAEL_MIC_LEN))
goto mic_fail;
/* remove Michael MIC from payload */
@@ -1048,7 +1049,7 @@ ieee80211_crypto_aes_cmac_decrypt(struct ieee80211_rx_data *rx)
bip_aad(skb, aad);
ieee80211_aes_cmac(key->u.aes_cmac.tfm, aad,
skb->data + 24, skb->len - 24, mic);
- if (memcmp(mic, mmie->mic, sizeof(mmie->mic)) != 0) {
+ if (crypto_memneq(mic, mmie->mic, sizeof(mmie->mic))) {
key->u.aes_cmac.icverrors++;
return RX_DROP_UNUSABLE;
}
@@ -1098,7 +1099,7 @@ ieee80211_crypto_aes_cmac_256_decrypt(struct ieee80211_rx_data *rx)
bip_aad(skb, aad);
ieee80211_aes_cmac_256(key->u.aes_cmac.tfm, aad,
skb->data + 24, skb->len - 24, mic);
- if (memcmp(mic, mmie->mic, sizeof(mmie->mic)) != 0) {
+ if (crypto_memneq(mic, mmie->mic, sizeof(mmie->mic))) {
key->u.aes_cmac.icverrors++;
return RX_DROP_UNUSABLE;
}
@@ -1202,7 +1203,7 @@ ieee80211_crypto_aes_gmac_decrypt(struct ieee80211_rx_data *rx)
if (ieee80211_aes_gmac(key->u.aes_gmac.tfm, aad, nonce,
skb->data + 24, skb->len - 24,
mic) < 0 ||
- memcmp(mic, mmie->mic, sizeof(mmie->mic)) != 0) {
+ crypto_memneq(mic, mmie->mic, sizeof(mmie->mic))) {
key->u.aes_gmac.icverrors++;
return RX_DROP_UNUSABLE;
}
diff --git a/net/mac802154/iface.c b/net/mac802154/iface.c
index 06019dba4b10..bd88a9b80773 100644
--- a/net/mac802154/iface.c
+++ b/net/mac802154/iface.c
@@ -526,8 +526,6 @@ static void mac802154_wpan_free(struct net_device *dev)
struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev);
mac802154_llsec_destroy(&sdata->sec);
-
- free_netdev(dev);
}
static void ieee802154_if_setup(struct net_device *dev)
@@ -593,7 +591,8 @@ ieee802154_setup_sdata(struct ieee802154_sub_if_data *sdata,
sdata->dev->dev_addr);
sdata->dev->header_ops = &mac802154_header_ops;
- sdata->dev->destructor = mac802154_wpan_free;
+ sdata->dev->needs_free_netdev = true;
+ sdata->dev->priv_destructor = mac802154_wpan_free;
sdata->dev->netdev_ops = &mac802154_wpan_ops;
sdata->dev->ml_priv = &mac802154_mlme_wpan;
wpan_dev->promiscuous_mode = false;
@@ -608,7 +607,7 @@ ieee802154_setup_sdata(struct ieee802154_sub_if_data *sdata,
break;
case NL802154_IFTYPE_MONITOR:
- sdata->dev->destructor = free_netdev;
+ sdata->dev->needs_free_netdev = true;
sdata->dev->netdev_ops = &mac802154_monitor_ops;
wpan_dev->promiscuous_mode = true;
break;
diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c
index 257ec66009da..7b05fd1497ce 100644
--- a/net/mpls/af_mpls.c
+++ b/net/mpls/af_mpls.c
@@ -1418,7 +1418,7 @@ static void mpls_ifup(struct net_device *dev, unsigned int flags)
continue;
alive++;
nh_flags &= ~flags;
- WRITE_ONCE(nh->nh_flags, flags);
+ WRITE_ONCE(nh->nh_flags, nh_flags);
} endfor_nexthops(rt);
WRITE_ONCE(rt->rt_nhn_alive, alive);
diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c
index d2d7bdf1d510..ad99c1ceea6f 100644
--- a/net/netfilter/ipvs/ip_vs_core.c
+++ b/net/netfilter/ipvs/ip_vs_core.c
@@ -849,10 +849,8 @@ static int handle_response_icmp(int af, struct sk_buff *skb,
{
unsigned int verdict = NF_DROP;
- if (IP_VS_FWD_METHOD(cp) != 0) {
- pr_err("shouldn't reach here, because the box is on the "
- "half connection in the tun/dr module.\n");
- }
+ if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ)
+ goto ignore_cp;
/* Ensure the checksum is correct */
if (!skb_csum_unnecessary(skb) && ip_vs_checksum_complete(skb, ihl)) {
@@ -886,6 +884,8 @@ static int handle_response_icmp(int af, struct sk_buff *skb,
ip_vs_notrack(skb);
else
ip_vs_update_conntrack(skb, cp, 0);
+
+ignore_cp:
verdict = NF_ACCEPT;
out:
@@ -1385,8 +1385,11 @@ ip_vs_out(struct netns_ipvs *ipvs, unsigned int hooknum, struct sk_buff *skb, in
*/
cp = pp->conn_out_get(ipvs, af, skb, &iph);
- if (likely(cp))
+ if (likely(cp)) {
+ if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ)
+ goto ignore_cp;
return handle_response(af, skb, pd, cp, &iph, hooknum);
+ }
/* Check for real-server-started requests */
if (atomic_read(&ipvs->conn_out_counter)) {
@@ -1444,9 +1447,15 @@ ip_vs_out(struct netns_ipvs *ipvs, unsigned int hooknum, struct sk_buff *skb, in
}
}
}
+
+out:
IP_VS_DBG_PKT(12, af, pp, skb, iph.off,
"ip_vs_out: packet continues traversal as normal");
return NF_ACCEPT;
+
+ignore_cp:
+ __ip_vs_conn_put(cp);
+ goto out;
}
/*
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 3c8f1ed2f555..e847dbaa0c6b 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -911,7 +911,7 @@ static unsigned int early_drop_list(struct net *net,
continue;
/* kill only if still in same netns -- might have moved due to
- * SLAB_DESTROY_BY_RCU rules.
+ * SLAB_TYPESAFE_BY_RCU rules.
*
* We steal the timer reference. If that fails timer has
* already fired or someone else deleted it. Just drop ref
@@ -1114,7 +1114,7 @@ __nf_conntrack_alloc(struct net *net,
/*
* Do not use kmem_cache_zalloc(), as this cache uses
- * SLAB_DESTROY_BY_RCU.
+ * SLAB_TYPESAFE_BY_RCU.
*/
ct = kmem_cache_alloc(nf_conntrack_cachep, gfp);
if (ct == NULL)
@@ -1159,7 +1159,7 @@ void nf_conntrack_free(struct nf_conn *ct)
struct net *net = nf_ct_net(ct);
/* A freed object has refcnt == 0, that's
- * the golden rule for SLAB_DESTROY_BY_RCU
+ * the golden rule for SLAB_TYPESAFE_BY_RCU
*/
NF_CT_ASSERT(atomic_read(&ct->ct_general.use) == 0);
@@ -1929,7 +1929,7 @@ int nf_conntrack_init_start(void)
nf_conntrack_cachep = kmem_cache_create("nf_conntrack",
sizeof(struct nf_conn),
NFCT_INFOMASK + 1,
- SLAB_DESTROY_BY_RCU | SLAB_HWCACHE_ALIGN, NULL);
+ SLAB_TYPESAFE_BY_RCU | SLAB_HWCACHE_ALIGN, NULL);
if (!nf_conntrack_cachep)
goto err_cachep;
diff --git a/net/netfilter/nf_conntrack_helper.c b/net/netfilter/nf_conntrack_helper.c
index 3a60efa7799b..7f6100ca63be 100644
--- a/net/netfilter/nf_conntrack_helper.c
+++ b/net/netfilter/nf_conntrack_helper.c
@@ -174,6 +174,10 @@ nf_conntrack_helper_try_module_get(const char *name, u16 l3num, u8 protonum)
#endif
if (h != NULL && !try_module_get(h->me))
h = NULL;
+ if (h != NULL && !refcount_inc_not_zero(&h->refcnt)) {
+ module_put(h->me);
+ h = NULL;
+ }
rcu_read_unlock();
@@ -181,6 +185,13 @@ nf_conntrack_helper_try_module_get(const char *name, u16 l3num, u8 protonum)
}
EXPORT_SYMBOL_GPL(nf_conntrack_helper_try_module_get);
+void nf_conntrack_helper_put(struct nf_conntrack_helper *helper)
+{
+ refcount_dec(&helper->refcnt);
+ module_put(helper->me);
+}
+EXPORT_SYMBOL_GPL(nf_conntrack_helper_put);
+
struct nf_conn_help *
nf_ct_helper_ext_add(struct nf_conn *ct,
struct nf_conntrack_helper *helper, gfp_t gfp)
@@ -417,6 +428,7 @@ int nf_conntrack_helper_register(struct nf_conntrack_helper *me)
}
}
}
+ refcount_set(&me->refcnt, 1);
hlist_add_head_rcu(&me->hnode, &nf_ct_helper_hash[h]);
nf_ct_helper_count++;
out:
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index dcf561b5c97a..a8be9b72e6cd 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -45,6 +45,8 @@
#include <net/netfilter/nf_conntrack_zones.h>
#include <net/netfilter/nf_conntrack_timestamp.h>
#include <net/netfilter/nf_conntrack_labels.h>
+#include <net/netfilter/nf_conntrack_seqadj.h>
+#include <net/netfilter/nf_conntrack_synproxy.h>
#ifdef CONFIG_NF_NAT_NEEDED
#include <net/netfilter/nf_nat_core.h>
#include <net/netfilter/nf_nat_l4proto.h>
@@ -888,8 +890,13 @@ restart:
}
out:
local_bh_enable();
- if (last)
+ if (last) {
+ /* nf ct hash resize happened, now clear the leftover. */
+ if ((struct nf_conn *)cb->args[1] == last)
+ cb->args[1] = 0;
+
nf_ct_put(last);
+ }
while (i) {
i--;
@@ -1007,9 +1014,8 @@ static const struct nla_policy tuple_nla_policy[CTA_TUPLE_MAX+1] = {
static int
ctnetlink_parse_tuple(const struct nlattr * const cda[],
- struct nf_conntrack_tuple *tuple,
- enum ctattr_type type, u_int8_t l3num,
- struct nf_conntrack_zone *zone)
+ struct nf_conntrack_tuple *tuple, u32 type,
+ u_int8_t l3num, struct nf_conntrack_zone *zone)
{
struct nlattr *tb[CTA_TUPLE_MAX+1];
int err;
@@ -1828,6 +1834,8 @@ ctnetlink_create_conntrack(struct net *net,
nf_ct_tstamp_ext_add(ct, GFP_ATOMIC);
nf_ct_ecache_ext_add(ct, 0, 0, GFP_ATOMIC);
nf_ct_labels_ext_add(ct);
+ nfct_seqadj_ext_add(ct);
+ nfct_synproxy_ext_add(ct);
/* we must add conntrack extensions before confirmation. */
ct->status |= IPS_CONFIRMED;
@@ -2447,7 +2455,7 @@ static struct nfnl_ct_hook ctnetlink_glue_hook = {
static int ctnetlink_exp_dump_tuple(struct sk_buff *skb,
const struct nf_conntrack_tuple *tuple,
- enum ctattr_expect type)
+ u32 type)
{
struct nlattr *nest_parms;
diff --git a/net/netfilter/nf_conntrack_proto_sctp.c b/net/netfilter/nf_conntrack_proto_sctp.c
index 13875d599a85..1c5b14a6cab3 100644
--- a/net/netfilter/nf_conntrack_proto_sctp.c
+++ b/net/netfilter/nf_conntrack_proto_sctp.c
@@ -512,16 +512,19 @@ static int sctp_error(struct net *net, struct nf_conn *tpl, struct sk_buff *skb,
u8 pf, unsigned int hooknum)
{
const struct sctphdr *sh;
- struct sctphdr _sctph;
const char *logmsg;
- sh = skb_header_pointer(skb, dataoff, sizeof(_sctph), &_sctph);
- if (!sh) {
+ if (skb->len < dataoff + sizeof(struct sctphdr)) {
logmsg = "nf_ct_sctp: short packet ";
goto out_invalid;
}
if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING &&
skb->ip_summed == CHECKSUM_NONE) {
+ if (!skb_make_writable(skb, dataoff + sizeof(struct sctphdr))) {
+ logmsg = "nf_ct_sctp: failed to read header ";
+ goto out_invalid;
+ }
+ sh = (const struct sctphdr *)(skb->data + dataoff);
if (sh->checksum != sctp_compute_cksum(skb, dataoff)) {
logmsg = "nf_ct_sctp: bad CRC ";
goto out_invalid;
diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c
index b48d6b5aae8a..6c72922d20ca 100644
--- a/net/netfilter/nf_nat_core.c
+++ b/net/netfilter/nf_nat_core.c
@@ -409,6 +409,10 @@ nf_nat_setup_info(struct nf_conn *ct,
{
struct nf_conntrack_tuple curr_tuple, new_tuple;
+ /* Can't setup nat info for confirmed ct. */
+ if (nf_ct_is_confirmed(ct))
+ return NF_ACCEPT;
+
NF_CT_ASSERT(maniptype == NF_NAT_MANIP_SRC ||
maniptype == NF_NAT_MANIP_DST);
BUG_ON(nf_nat_initialized(ct, maniptype));
@@ -562,7 +566,7 @@ static int nf_nat_proto_clean(struct nf_conn *ct, void *data)
* Else, when the conntrack is destoyed, nf_nat_cleanup_conntrack()
* will delete entry from already-freed table.
*/
- ct->status &= ~IPS_NAT_DONE_MASK;
+ clear_bit(IPS_SRC_NAT_DONE_BIT, &ct->status);
rhltable_remove(&nf_nat_bysource_table, &ct->nat_bysource,
nf_nat_bysource_params);
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 559225029740..da314be0c048 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -3367,35 +3367,50 @@ static int nf_tables_dump_setelem(const struct nft_ctx *ctx,
return nf_tables_fill_setelem(args->skb, set, elem);
}
+struct nft_set_dump_ctx {
+ const struct nft_set *set;
+ struct nft_ctx ctx;
+};
+
static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb)
{
+ struct nft_set_dump_ctx *dump_ctx = cb->data;
struct net *net = sock_net(skb->sk);
- u8 genmask = nft_genmask_cur(net);
+ struct nft_af_info *afi;
+ struct nft_table *table;
struct nft_set *set;
struct nft_set_dump_args args;
- struct nft_ctx ctx;
- struct nlattr *nla[NFTA_SET_ELEM_LIST_MAX + 1];
+ bool set_found = false;
struct nfgenmsg *nfmsg;
struct nlmsghdr *nlh;
struct nlattr *nest;
u32 portid, seq;
- int event, err;
+ int event;
- err = nlmsg_parse(cb->nlh, sizeof(struct nfgenmsg), nla,
- NFTA_SET_ELEM_LIST_MAX, nft_set_elem_list_policy,
- NULL);
- if (err < 0)
- return err;
+ rcu_read_lock();
+ list_for_each_entry_rcu(afi, &net->nft.af_info, list) {
+ if (afi != dump_ctx->ctx.afi)
+ continue;
- err = nft_ctx_init_from_elemattr(&ctx, net, cb->skb, cb->nlh,
- (void *)nla, genmask);
- if (err < 0)
- return err;
+ list_for_each_entry_rcu(table, &afi->tables, list) {
+ if (table != dump_ctx->ctx.table)
+ continue;
- set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_ELEM_LIST_SET],
- genmask);
- if (IS_ERR(set))
- return PTR_ERR(set);
+ list_for_each_entry_rcu(set, &table->sets, list) {
+ if (set == dump_ctx->set) {
+ set_found = true;
+ break;
+ }
+ }
+ break;
+ }
+ break;
+ }
+
+ if (!set_found) {
+ rcu_read_unlock();
+ return -ENOENT;
+ }
event = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, NFT_MSG_NEWSETELEM);
portid = NETLINK_CB(cb->skb).portid;
@@ -3407,11 +3422,11 @@ static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb)
goto nla_put_failure;
nfmsg = nlmsg_data(nlh);
- nfmsg->nfgen_family = ctx.afi->family;
+ nfmsg->nfgen_family = afi->family;
nfmsg->version = NFNETLINK_V0;
- nfmsg->res_id = htons(ctx.net->nft.base_seq & 0xffff);
+ nfmsg->res_id = htons(net->nft.base_seq & 0xffff);
- if (nla_put_string(skb, NFTA_SET_ELEM_LIST_TABLE, ctx.table->name))
+ if (nla_put_string(skb, NFTA_SET_ELEM_LIST_TABLE, table->name))
goto nla_put_failure;
if (nla_put_string(skb, NFTA_SET_ELEM_LIST_SET, set->name))
goto nla_put_failure;
@@ -3422,12 +3437,13 @@ static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb)
args.cb = cb;
args.skb = skb;
- args.iter.genmask = nft_genmask_cur(ctx.net);
+ args.iter.genmask = nft_genmask_cur(net);
args.iter.skip = cb->args[0];
args.iter.count = 0;
args.iter.err = 0;
args.iter.fn = nf_tables_dump_setelem;
- set->ops->walk(&ctx, set, &args.iter);
+ set->ops->walk(&dump_ctx->ctx, set, &args.iter);
+ rcu_read_unlock();
nla_nest_end(skb, nest);
nlmsg_end(skb, nlh);
@@ -3441,9 +3457,16 @@ static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb)
return skb->len;
nla_put_failure:
+ rcu_read_unlock();
return -ENOSPC;
}
+static int nf_tables_dump_set_done(struct netlink_callback *cb)
+{
+ kfree(cb->data);
+ return 0;
+}
+
static int nf_tables_getsetelem(struct net *net, struct sock *nlsk,
struct sk_buff *skb, const struct nlmsghdr *nlh,
const struct nlattr * const nla[])
@@ -3465,7 +3488,18 @@ static int nf_tables_getsetelem(struct net *net, struct sock *nlsk,
if (nlh->nlmsg_flags & NLM_F_DUMP) {
struct netlink_dump_control c = {
.dump = nf_tables_dump_set,
+ .done = nf_tables_dump_set_done,
};
+ struct nft_set_dump_ctx *dump_ctx;
+
+ dump_ctx = kmalloc(sizeof(*dump_ctx), GFP_KERNEL);
+ if (!dump_ctx)
+ return -ENOMEM;
+
+ dump_ctx->set = set;
+ dump_ctx->ctx = ctx;
+
+ c.data = dump_ctx;
return netlink_dump_start(nlsk, skb, nlh, &c);
}
return -EOPNOTSUPP;
@@ -3593,9 +3627,9 @@ void nft_set_elem_destroy(const struct nft_set *set, void *elem,
{
struct nft_set_ext *ext = nft_set_elem_ext(set, elem);
- nft_data_uninit(nft_set_ext_key(ext), NFT_DATA_VALUE);
+ nft_data_release(nft_set_ext_key(ext), NFT_DATA_VALUE);
if (nft_set_ext_exists(ext, NFT_SET_EXT_DATA))
- nft_data_uninit(nft_set_ext_data(ext), set->dtype);
+ nft_data_release(nft_set_ext_data(ext), set->dtype);
if (destroy_expr && nft_set_ext_exists(ext, NFT_SET_EXT_EXPR))
nf_tables_expr_destroy(NULL, nft_set_ext_expr(ext));
if (nft_set_ext_exists(ext, NFT_SET_EXT_OBJREF))
@@ -3604,6 +3638,18 @@ void nft_set_elem_destroy(const struct nft_set *set, void *elem,
}
EXPORT_SYMBOL_GPL(nft_set_elem_destroy);
+/* Only called from commit path, nft_set_elem_deactivate() already deals with
+ * the refcounting from the preparation phase.
+ */
+static void nf_tables_set_elem_destroy(const struct nft_set *set, void *elem)
+{
+ struct nft_set_ext *ext = nft_set_elem_ext(set, elem);
+
+ if (nft_set_ext_exists(ext, NFT_SET_EXT_EXPR))
+ nf_tables_expr_destroy(NULL, nft_set_ext_expr(ext));
+ kfree(elem);
+}
+
static int nft_setelem_parse_flags(const struct nft_set *set,
const struct nlattr *attr, u32 *flags)
{
@@ -3815,9 +3861,9 @@ err4:
kfree(elem.priv);
err3:
if (nla[NFTA_SET_ELEM_DATA] != NULL)
- nft_data_uninit(&data, d2.type);
+ nft_data_release(&data, d2.type);
err2:
- nft_data_uninit(&elem.key.val, d1.type);
+ nft_data_release(&elem.key.val, d1.type);
err1:
return err;
}
@@ -3862,6 +3908,53 @@ static int nf_tables_newsetelem(struct net *net, struct sock *nlsk,
return err;
}
+/**
+ * nft_data_hold - hold a nft_data item
+ *
+ * @data: struct nft_data to release
+ * @type: type of data
+ *
+ * Hold a nft_data item. NFT_DATA_VALUE types can be silently discarded,
+ * NFT_DATA_VERDICT bumps the reference to chains in case of NFT_JUMP and
+ * NFT_GOTO verdicts. This function must be called on active data objects
+ * from the second phase of the commit protocol.
+ */
+static void nft_data_hold(const struct nft_data *data, enum nft_data_types type)
+{
+ if (type == NFT_DATA_VERDICT) {
+ switch (data->verdict.code) {
+ case NFT_JUMP:
+ case NFT_GOTO:
+ data->verdict.chain->use++;
+ break;
+ }
+ }
+}
+
+static void nft_set_elem_activate(const struct net *net,
+ const struct nft_set *set,
+ struct nft_set_elem *elem)
+{
+ const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv);
+
+ if (nft_set_ext_exists(ext, NFT_SET_EXT_DATA))
+ nft_data_hold(nft_set_ext_data(ext), set->dtype);
+ if (nft_set_ext_exists(ext, NFT_SET_EXT_OBJREF))
+ (*nft_set_ext_obj(ext))->use++;
+}
+
+static void nft_set_elem_deactivate(const struct net *net,
+ const struct nft_set *set,
+ struct nft_set_elem *elem)
+{
+ const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv);
+
+ if (nft_set_ext_exists(ext, NFT_SET_EXT_DATA))
+ nft_data_release(nft_set_ext_data(ext), set->dtype);
+ if (nft_set_ext_exists(ext, NFT_SET_EXT_OBJREF))
+ (*nft_set_ext_obj(ext))->use--;
+}
+
static int nft_del_setelem(struct nft_ctx *ctx, struct nft_set *set,
const struct nlattr *attr)
{
@@ -3927,6 +4020,8 @@ static int nft_del_setelem(struct nft_ctx *ctx, struct nft_set *set,
kfree(elem.priv);
elem.priv = priv;
+ nft_set_elem_deactivate(ctx->net, set, &elem);
+
nft_trans_elem(trans) = elem;
list_add_tail(&trans->list, &ctx->net->nft.commit_list);
return 0;
@@ -3936,7 +4031,7 @@ err4:
err3:
kfree(elem.priv);
err2:
- nft_data_uninit(&elem.key.val, desc.type);
+ nft_data_release(&elem.key.val, desc.type);
err1:
return err;
}
@@ -4743,8 +4838,8 @@ static void nf_tables_commit_release(struct nft_trans *trans)
nft_set_destroy(nft_trans_set(trans));
break;
case NFT_MSG_DELSETELEM:
- nft_set_elem_destroy(nft_trans_elem_set(trans),
- nft_trans_elem(trans).priv, true);
+ nf_tables_set_elem_destroy(nft_trans_elem_set(trans),
+ nft_trans_elem(trans).priv);
break;
case NFT_MSG_DELOBJ:
nft_obj_destroy(nft_trans_obj(trans));
@@ -4979,6 +5074,7 @@ static int nf_tables_abort(struct net *net, struct sk_buff *skb)
case NFT_MSG_DELSETELEM:
te = (struct nft_trans_elem *)trans->data;
+ nft_set_elem_activate(net, te->set, &te->elem);
te->set->ops->activate(net, te->set, &te->elem);
te->set->ndeact--;
@@ -5464,7 +5560,7 @@ int nft_data_init(const struct nft_ctx *ctx,
EXPORT_SYMBOL_GPL(nft_data_init);
/**
- * nft_data_uninit - release a nft_data item
+ * nft_data_release - release a nft_data item
*
* @data: struct nft_data to release
* @type: type of data
@@ -5472,7 +5568,7 @@ EXPORT_SYMBOL_GPL(nft_data_init);
* Release a nft_data item. NFT_DATA_VALUE types can be silently discarded,
* all others need to be released by calling this function.
*/
-void nft_data_uninit(const struct nft_data *data, enum nft_data_types type)
+void nft_data_release(const struct nft_data *data, enum nft_data_types type)
{
if (type < NFT_DATA_VERDICT)
return;
@@ -5483,7 +5579,7 @@ void nft_data_uninit(const struct nft_data *data, enum nft_data_types type)
WARN_ON(1);
}
}
-EXPORT_SYMBOL_GPL(nft_data_uninit);
+EXPORT_SYMBOL_GPL(nft_data_release);
int nft_data_dump(struct sk_buff *skb, int attr, const struct nft_data *data,
enum nft_data_types type, unsigned int len)
diff --git a/net/netfilter/nfnetlink_cthelper.c b/net/netfilter/nfnetlink_cthelper.c
index 950bf6eadc65..be678a323598 100644
--- a/net/netfilter/nfnetlink_cthelper.c
+++ b/net/netfilter/nfnetlink_cthelper.c
@@ -686,6 +686,7 @@ static int nfnl_cthelper_del(struct net *net, struct sock *nfnl,
tuple_set = true;
}
+ ret = -ENOENT;
list_for_each_entry_safe(nlcth, n, &nfnl_cthelper_list, list) {
cur = &nlcth->helper;
j++;
@@ -699,16 +700,20 @@ static int nfnl_cthelper_del(struct net *net, struct sock *nfnl,
tuple.dst.protonum != cur->tuple.dst.protonum))
continue;
- found = true;
- nf_conntrack_helper_unregister(cur);
- kfree(cur->expect_policy);
+ if (refcount_dec_if_one(&cur->refcnt)) {
+ found = true;
+ nf_conntrack_helper_unregister(cur);
+ kfree(cur->expect_policy);
- list_del(&nlcth->list);
- kfree(nlcth);
+ list_del(&nlcth->list);
+ kfree(nlcth);
+ } else {
+ ret = -EBUSY;
+ }
}
/* Make sure we return success if we flush and there is no helpers */
- return (found || j == 0) ? 0 : -ENOENT;
+ return (found || j == 0) ? 0 : ret;
}
static const struct nla_policy nfnl_cthelper_policy[NFCTH_MAX+1] = {
diff --git a/net/netfilter/nft_bitwise.c b/net/netfilter/nft_bitwise.c
index 877d9acd91ef..fff8073e2a56 100644
--- a/net/netfilter/nft_bitwise.c
+++ b/net/netfilter/nft_bitwise.c
@@ -83,17 +83,26 @@ static int nft_bitwise_init(const struct nft_ctx *ctx,
tb[NFTA_BITWISE_MASK]);
if (err < 0)
return err;
- if (d1.len != priv->len)
- return -EINVAL;
+ if (d1.len != priv->len) {
+ err = -EINVAL;
+ goto err1;
+ }
err = nft_data_init(NULL, &priv->xor, sizeof(priv->xor), &d2,
tb[NFTA_BITWISE_XOR]);
if (err < 0)
- return err;
- if (d2.len != priv->len)
- return -EINVAL;
+ goto err1;
+ if (d2.len != priv->len) {
+ err = -EINVAL;
+ goto err2;
+ }
return 0;
+err2:
+ nft_data_release(&priv->xor, d2.type);
+err1:
+ nft_data_release(&priv->mask, d1.type);
+ return err;
}
static int nft_bitwise_dump(struct sk_buff *skb, const struct nft_expr *expr)
diff --git a/net/netfilter/nft_cmp.c b/net/netfilter/nft_cmp.c
index 2b96effeadc1..c2945eb3397c 100644
--- a/net/netfilter/nft_cmp.c
+++ b/net/netfilter/nft_cmp.c
@@ -201,10 +201,18 @@ nft_cmp_select_ops(const struct nft_ctx *ctx, const struct nlattr * const tb[])
if (err < 0)
return ERR_PTR(err);
+ if (desc.type != NFT_DATA_VALUE) {
+ err = -EINVAL;
+ goto err1;
+ }
+
if (desc.len <= sizeof(u32) && op == NFT_CMP_EQ)
return &nft_cmp_fast_ops;
- else
- return &nft_cmp_ops;
+
+ return &nft_cmp_ops;
+err1:
+ nft_data_release(&data, desc.type);
+ return ERR_PTR(-EINVAL);
}
struct nft_expr_type nft_cmp_type __read_mostly = {
diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c
index a34ceb38fc55..1678e9e75e8e 100644
--- a/net/netfilter/nft_ct.c
+++ b/net/netfilter/nft_ct.c
@@ -826,9 +826,9 @@ static void nft_ct_helper_obj_destroy(struct nft_object *obj)
struct nft_ct_helper_obj *priv = nft_obj_data(obj);
if (priv->helper4)
- module_put(priv->helper4->me);
+ nf_conntrack_helper_put(priv->helper4);
if (priv->helper6)
- module_put(priv->helper6->me);
+ nf_conntrack_helper_put(priv->helper6);
}
static void nft_ct_helper_obj_eval(struct nft_object *obj,
diff --git a/net/netfilter/nft_immediate.c b/net/netfilter/nft_immediate.c
index 728baf88295a..4717d7796927 100644
--- a/net/netfilter/nft_immediate.c
+++ b/net/netfilter/nft_immediate.c
@@ -65,7 +65,7 @@ static int nft_immediate_init(const struct nft_ctx *ctx,
return 0;
err1:
- nft_data_uninit(&priv->data, desc.type);
+ nft_data_release(&priv->data, desc.type);
return err;
}
@@ -73,7 +73,8 @@ static void nft_immediate_destroy(const struct nft_ctx *ctx,
const struct nft_expr *expr)
{
const struct nft_immediate_expr *priv = nft_expr_priv(expr);
- return nft_data_uninit(&priv->data, nft_dreg_to_type(priv->dreg));
+
+ return nft_data_release(&priv->data, nft_dreg_to_type(priv->dreg));
}
static int nft_immediate_dump(struct sk_buff *skb, const struct nft_expr *expr)
diff --git a/net/netfilter/nft_range.c b/net/netfilter/nft_range.c
index 9edc74eedc10..cedb96c3619f 100644
--- a/net/netfilter/nft_range.c
+++ b/net/netfilter/nft_range.c
@@ -102,9 +102,9 @@ static int nft_range_init(const struct nft_ctx *ctx, const struct nft_expr *expr
priv->len = desc_from.len;
return 0;
err2:
- nft_data_uninit(&priv->data_to, desc_to.type);
+ nft_data_release(&priv->data_to, desc_to.type);
err1:
- nft_data_uninit(&priv->data_from, desc_from.type);
+ nft_data_release(&priv->data_from, desc_from.type);
return err;
}
diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c
index 8ec086b6b56b..3d3a6df4ce70 100644
--- a/net/netfilter/nft_set_hash.c
+++ b/net/netfilter/nft_set_hash.c
@@ -222,7 +222,7 @@ static void nft_hash_walk(const struct nft_ctx *ctx, struct nft_set *set,
struct nft_set_elem elem;
int err;
- err = rhashtable_walk_init(&priv->ht, &hti, GFP_KERNEL);
+ err = rhashtable_walk_init(&priv->ht, &hti, GFP_ATOMIC);
iter->err = err;
if (err)
return;
diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c
index e97e2fb53f0a..fbdbaa00dd5f 100644
--- a/net/netfilter/nft_set_rbtree.c
+++ b/net/netfilter/nft_set_rbtree.c
@@ -116,17 +116,17 @@ static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set,
else if (d > 0)
p = &parent->rb_right;
else {
- if (nft_set_elem_active(&rbe->ext, genmask)) {
- if (nft_rbtree_interval_end(rbe) &&
- !nft_rbtree_interval_end(new))
- p = &parent->rb_left;
- else if (!nft_rbtree_interval_end(rbe) &&
- nft_rbtree_interval_end(new))
- p = &parent->rb_right;
- else {
- *ext = &rbe->ext;
- return -EEXIST;
- }
+ if (nft_rbtree_interval_end(rbe) &&
+ !nft_rbtree_interval_end(new)) {
+ p = &parent->rb_left;
+ } else if (!nft_rbtree_interval_end(rbe) &&
+ nft_rbtree_interval_end(new)) {
+ p = &parent->rb_right;
+ } else if (nft_set_elem_active(&rbe->ext, genmask)) {
+ *ext = &rbe->ext;
+ return -EEXIST;
+ } else {
+ p = &parent->rb_left;
}
}
}
diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c
index 8876b7da6884..1770c1d9b37f 100644
--- a/net/netfilter/x_tables.c
+++ b/net/netfilter/x_tables.c
@@ -283,28 +283,30 @@ static int xt_obj_to_user(u16 __user *psize, u16 size,
&U->u.user.revision, K->u.kernel.TYPE->revision)
int xt_data_to_user(void __user *dst, const void *src,
- int usersize, int size)
+ int usersize, int size, int aligned_size)
{
usersize = usersize ? : size;
if (copy_to_user(dst, src, usersize))
return -EFAULT;
- if (usersize != size && clear_user(dst + usersize, size - usersize))
+ if (usersize != aligned_size &&
+ clear_user(dst + usersize, aligned_size - usersize))
return -EFAULT;
return 0;
}
EXPORT_SYMBOL_GPL(xt_data_to_user);
-#define XT_DATA_TO_USER(U, K, TYPE, C_SIZE) \
+#define XT_DATA_TO_USER(U, K, TYPE) \
xt_data_to_user(U->data, K->data, \
K->u.kernel.TYPE->usersize, \
- C_SIZE ? : K->u.kernel.TYPE->TYPE##size)
+ K->u.kernel.TYPE->TYPE##size, \
+ XT_ALIGN(K->u.kernel.TYPE->TYPE##size))
int xt_match_to_user(const struct xt_entry_match *m,
struct xt_entry_match __user *u)
{
return XT_OBJ_TO_USER(u, m, match, 0) ||
- XT_DATA_TO_USER(u, m, match, 0);
+ XT_DATA_TO_USER(u, m, match);
}
EXPORT_SYMBOL_GPL(xt_match_to_user);
@@ -312,7 +314,7 @@ int xt_target_to_user(const struct xt_entry_target *t,
struct xt_entry_target __user *u)
{
return XT_OBJ_TO_USER(u, t, target, 0) ||
- XT_DATA_TO_USER(u, t, target, 0);
+ XT_DATA_TO_USER(u, t, target);
}
EXPORT_SYMBOL_GPL(xt_target_to_user);
@@ -611,6 +613,12 @@ void xt_compat_match_from_user(struct xt_entry_match *m, void **dstptr,
}
EXPORT_SYMBOL_GPL(xt_compat_match_from_user);
+#define COMPAT_XT_DATA_TO_USER(U, K, TYPE, C_SIZE) \
+ xt_data_to_user(U->data, K->data, \
+ K->u.kernel.TYPE->usersize, \
+ C_SIZE, \
+ COMPAT_XT_ALIGN(C_SIZE))
+
int xt_compat_match_to_user(const struct xt_entry_match *m,
void __user **dstptr, unsigned int *size)
{
@@ -626,7 +634,7 @@ int xt_compat_match_to_user(const struct xt_entry_match *m,
if (match->compat_to_user((void __user *)cm->data, m->data))
return -EFAULT;
} else {
- if (XT_DATA_TO_USER(cm, m, match, msize - sizeof(*cm)))
+ if (COMPAT_XT_DATA_TO_USER(cm, m, match, msize - sizeof(*cm)))
return -EFAULT;
}
@@ -972,7 +980,7 @@ int xt_compat_target_to_user(const struct xt_entry_target *t,
if (target->compat_to_user((void __user *)ct->data, t->data))
return -EFAULT;
} else {
- if (XT_DATA_TO_USER(ct, t, target, tsize - sizeof(*ct)))
+ if (COMPAT_XT_DATA_TO_USER(ct, t, target, tsize - sizeof(*ct)))
return -EFAULT;
}
diff --git a/net/netfilter/xt_CT.c b/net/netfilter/xt_CT.c
index bb7ad82dcd56..623ef37de886 100644
--- a/net/netfilter/xt_CT.c
+++ b/net/netfilter/xt_CT.c
@@ -96,7 +96,7 @@ xt_ct_set_helper(struct nf_conn *ct, const char *helper_name,
help = nf_ct_helper_ext_add(ct, helper, GFP_KERNEL);
if (help == NULL) {
- module_put(helper->me);
+ nf_conntrack_helper_put(helper);
return -ENOMEM;
}
@@ -263,7 +263,7 @@ out:
err4:
help = nfct_help(ct);
if (help)
- module_put(help->helper->me);
+ nf_conntrack_helper_put(help->helper);
err3:
nf_ct_tmpl_free(ct);
err2:
@@ -346,7 +346,7 @@ static void xt_ct_tg_destroy(const struct xt_tgdtor_param *par,
if (ct) {
help = nfct_help(ct);
if (help)
- module_put(help->helper->me);
+ nf_conntrack_helper_put(help->helper);
nf_ct_netns_put(par->net, par->family);
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index ee841f00a6ec..7586d446d7dc 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -62,6 +62,7 @@
#include <asm/cacheflush.h>
#include <linux/hash.h>
#include <linux/genetlink.h>
+#include <linux/net_namespace.h>
#include <net/net_namespace.h>
#include <net/sock.h>
@@ -1415,7 +1416,8 @@ static void do_one_broadcast(struct sock *sk,
goto out;
}
NETLINK_CB(p->skb2).nsid = peernet2id(sock_net(sk), p->net);
- NETLINK_CB(p->skb2).nsid_is_set = true;
+ if (NETLINK_CB(p->skb2).nsid != NETNSA_NSID_NOT_ASSIGNED)
+ NETLINK_CB(p->skb2).nsid_is_set = true;
val = netlink_broadcast_deliver(sk, p->skb2);
if (val < 0) {
netlink_overrun(sk);
diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c
index bf602e33c40a..08679ebb3068 100644
--- a/net/openvswitch/conntrack.c
+++ b/net/openvswitch/conntrack.c
@@ -1123,7 +1123,7 @@ static int ovs_ct_add_helper(struct ovs_conntrack_info *info, const char *name,
help = nf_ct_helper_ext_add(info->ct, helper, GFP_KERNEL);
if (!help) {
- module_put(helper->me);
+ nf_conntrack_helper_put(helper);
return -ENOMEM;
}
@@ -1584,7 +1584,7 @@ void ovs_ct_free_action(const struct nlattr *a)
static void __ovs_ct_free_action(struct ovs_conntrack_info *ct_info)
{
if (ct_info->helper)
- module_put(ct_info->helper->me);
+ nf_conntrack_helper_put(ct_info->helper);
if (ct_info->ct)
nf_ct_tmpl_free(ct_info->ct);
}
diff --git a/net/openvswitch/vport-internal_dev.c b/net/openvswitch/vport-internal_dev.c
index 89193a634da4..04a3128adcf0 100644
--- a/net/openvswitch/vport-internal_dev.c
+++ b/net/openvswitch/vport-internal_dev.c
@@ -94,7 +94,6 @@ static void internal_dev_destructor(struct net_device *dev)
struct vport *vport = ovs_internal_dev_get_vport(dev);
ovs_vport_free(vport);
- free_netdev(dev);
}
static void
@@ -156,7 +155,8 @@ static void do_setup(struct net_device *netdev)
netdev->priv_flags &= ~IFF_TX_SKB_SHARING;
netdev->priv_flags |= IFF_LIVE_ADDR_CHANGE | IFF_OPENVSWITCH |
IFF_PHONY_HEADROOM | IFF_NO_QUEUE;
- netdev->destructor = internal_dev_destructor;
+ netdev->needs_free_netdev = true;
+ netdev->priv_destructor = internal_dev_destructor;
netdev->ethtool_ops = &internal_dev_ethtool_ops;
netdev->rtnl_link_ops = &internal_dev_link_ops;
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index f4001763134d..e3eeed19cc7a 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -2658,13 +2658,6 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
dev = dev_get_by_index(sock_net(&po->sk), saddr->sll_ifindex);
}
- sockc.tsflags = po->sk.sk_tsflags;
- if (msg->msg_controllen) {
- err = sock_cmsg_send(&po->sk, msg, &sockc);
- if (unlikely(err))
- goto out;
- }
-
err = -ENXIO;
if (unlikely(dev == NULL))
goto out;
@@ -2672,6 +2665,13 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
if (unlikely(!(dev->flags & IFF_UP)))
goto out_put;
+ sockc.tsflags = po->sk.sk_tsflags;
+ if (msg->msg_controllen) {
+ err = sock_cmsg_send(&po->sk, msg, &sockc);
+ if (unlikely(err))
+ goto out_put;
+ }
+
if (po->sk.sk_socket->type == SOCK_RAW)
reserve = dev->hard_header_len;
size_max = po->tx_ring.frame_size
diff --git a/net/phonet/pep-gprs.c b/net/phonet/pep-gprs.c
index 21c28b51be94..2c9337946e30 100644
--- a/net/phonet/pep-gprs.c
+++ b/net/phonet/pep-gprs.c
@@ -236,7 +236,7 @@ static void gprs_setup(struct net_device *dev)
dev->tx_queue_len = 10;
dev->netdev_ops = &gprs_netdev_ops;
- dev->destructor = free_netdev;
+ dev->needs_free_netdev = true;
}
/*
diff --git a/net/rxrpc/key.c b/net/rxrpc/key.c
index 0a4e28477ad9..54369225766e 100644
--- a/net/rxrpc/key.c
+++ b/net/rxrpc/key.c
@@ -217,7 +217,7 @@ static int rxrpc_krb5_decode_principal(struct krb5_principal *princ,
unsigned int *_toklen)
{
const __be32 *xdr = *_xdr;
- unsigned int toklen = *_toklen, n_parts, loop, tmp;
+ unsigned int toklen = *_toklen, n_parts, loop, tmp, paddedlen;
/* there must be at least one name, and at least #names+1 length
* words */
@@ -247,16 +247,16 @@ static int rxrpc_krb5_decode_principal(struct krb5_principal *princ,
toklen -= 4;
if (tmp <= 0 || tmp > AFSTOKEN_STRING_MAX)
return -EINVAL;
- if (tmp > toklen)
+ paddedlen = (tmp + 3) & ~3;
+ if (paddedlen > toklen)
return -EINVAL;
princ->name_parts[loop] = kmalloc(tmp + 1, GFP_KERNEL);
if (!princ->name_parts[loop])
return -ENOMEM;
memcpy(princ->name_parts[loop], xdr, tmp);
princ->name_parts[loop][tmp] = 0;
- tmp = (tmp + 3) & ~3;
- toklen -= tmp;
- xdr += tmp >> 2;
+ toklen -= paddedlen;
+ xdr += paddedlen >> 2;
}
if (toklen < 4)
@@ -265,16 +265,16 @@ static int rxrpc_krb5_decode_principal(struct krb5_principal *princ,
toklen -= 4;
if (tmp <= 0 || tmp > AFSTOKEN_K5_REALM_MAX)
return -EINVAL;
- if (tmp > toklen)
+ paddedlen = (tmp + 3) & ~3;
+ if (paddedlen > toklen)
return -EINVAL;
princ->realm = kmalloc(tmp + 1, GFP_KERNEL);
if (!princ->realm)
return -ENOMEM;
memcpy(princ->realm, xdr, tmp);
princ->realm[tmp] = 0;
- tmp = (tmp + 3) & ~3;
- toklen -= tmp;
- xdr += tmp >> 2;
+ toklen -= paddedlen;
+ xdr += paddedlen >> 2;
_debug("%s/...@%s", princ->name_parts[0], princ->realm);
@@ -293,7 +293,7 @@ static int rxrpc_krb5_decode_tagged_data(struct krb5_tagged_data *td,
unsigned int *_toklen)
{
const __be32 *xdr = *_xdr;
- unsigned int toklen = *_toklen, len;
+ unsigned int toklen = *_toklen, len, paddedlen;
/* there must be at least one tag and one length word */
if (toklen <= 8)
@@ -307,15 +307,17 @@ static int rxrpc_krb5_decode_tagged_data(struct krb5_tagged_data *td,
toklen -= 8;
if (len > max_data_size)
return -EINVAL;
+ paddedlen = (len + 3) & ~3;
+ if (paddedlen > toklen)
+ return -EINVAL;
td->data_len = len;
if (len > 0) {
td->data = kmemdup(xdr, len, GFP_KERNEL);
if (!td->data)
return -ENOMEM;
- len = (len + 3) & ~3;
- toklen -= len;
- xdr += len >> 2;
+ toklen -= paddedlen;
+ xdr += paddedlen >> 2;
}
_debug("tag %x len %x", td->tag, td->data_len);
@@ -387,7 +389,7 @@ static int rxrpc_krb5_decode_ticket(u8 **_ticket, u16 *_tktlen,
const __be32 **_xdr, unsigned int *_toklen)
{
const __be32 *xdr = *_xdr;
- unsigned int toklen = *_toklen, len;
+ unsigned int toklen = *_toklen, len, paddedlen;
/* there must be at least one length word */
if (toklen <= 4)
@@ -399,6 +401,9 @@ static int rxrpc_krb5_decode_ticket(u8 **_ticket, u16 *_tktlen,
toklen -= 4;
if (len > AFSTOKEN_K5_TIX_MAX)
return -EINVAL;
+ paddedlen = (len + 3) & ~3;
+ if (paddedlen > toklen)
+ return -EINVAL;
*_tktlen = len;
_debug("ticket len %u", len);
@@ -407,9 +412,8 @@ static int rxrpc_krb5_decode_ticket(u8 **_ticket, u16 *_tktlen,
*_ticket = kmemdup(xdr, len, GFP_KERNEL);
if (!*_ticket)
return -ENOMEM;
- len = (len + 3) & ~3;
- toklen -= len;
- xdr += len >> 2;
+ toklen -= paddedlen;
+ xdr += paddedlen >> 2;
}
*_xdr = xdr;
@@ -552,7 +556,7 @@ static int rxrpc_preparse_xdr(struct key_preparsed_payload *prep)
{
const __be32 *xdr = prep->data, *token;
const char *cp;
- unsigned int len, tmp, loop, ntoken, toklen, sec_ix;
+ unsigned int len, paddedlen, loop, ntoken, toklen, sec_ix;
size_t datalen = prep->datalen;
int ret;
@@ -578,22 +582,21 @@ static int rxrpc_preparse_xdr(struct key_preparsed_payload *prep)
if (len < 1 || len > AFSTOKEN_CELL_MAX)
goto not_xdr;
datalen -= 4;
- tmp = (len + 3) & ~3;
- if (tmp > datalen)
+ paddedlen = (len + 3) & ~3;
+ if (paddedlen > datalen)
goto not_xdr;
cp = (const char *) xdr;
for (loop = 0; loop < len; loop++)
if (!isprint(cp[loop]))
goto not_xdr;
- if (len < tmp)
- for (; loop < tmp; loop++)
- if (cp[loop])
- goto not_xdr;
+ for (; loop < paddedlen; loop++)
+ if (cp[loop])
+ goto not_xdr;
_debug("cellname: [%u/%u] '%*.*s'",
- len, tmp, len, len, (const char *) xdr);
- datalen -= tmp;
- xdr += tmp >> 2;
+ len, paddedlen, len, len, (const char *) xdr);
+ datalen -= paddedlen;
+ xdr += paddedlen >> 2;
/* get the token count */
if (datalen < 12)
@@ -614,10 +617,11 @@ static int rxrpc_preparse_xdr(struct key_preparsed_payload *prep)
sec_ix = ntohl(*xdr);
datalen -= 4;
_debug("token: [%x/%zx] %x", toklen, datalen, sec_ix);
- if (toklen < 20 || toklen > datalen)
+ paddedlen = (toklen + 3) & ~3;
+ if (toklen < 20 || toklen > datalen || paddedlen > datalen)
goto not_xdr;
- datalen -= (toklen + 3) & ~3;
- xdr += (toklen + 3) >> 2;
+ datalen -= paddedlen;
+ xdr += paddedlen >> 2;
} while (--loop > 0);
diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c
index 164b5ac094be..7dc5892671c8 100644
--- a/net/sched/act_pedit.c
+++ b/net/sched/act_pedit.c
@@ -94,8 +94,10 @@ static struct tcf_pedit_key_ex *tcf_pedit_keys_ex_parse(struct nlattr *nla,
k++;
}
- if (n)
+ if (n) {
+ err = -EINVAL;
goto err_out;
+ }
return keys_ex;
diff --git a/net/sched/act_police.c b/net/sched/act_police.c
index f42008b29311..b062bc80c7cb 100644
--- a/net/sched/act_police.c
+++ b/net/sched/act_police.c
@@ -132,21 +132,21 @@ static int tcf_act_police_init(struct net *net, struct nlattr *nla,
}
}
- spin_lock_bh(&police->tcf_lock);
if (est) {
err = gen_replace_estimator(&police->tcf_bstats, NULL,
&police->tcf_rate_est,
&police->tcf_lock,
NULL, est);
if (err)
- goto failure_unlock;
+ goto failure;
} else if (tb[TCA_POLICE_AVRATE] &&
(ret == ACT_P_CREATED ||
!gen_estimator_active(&police->tcf_rate_est))) {
err = -EINVAL;
- goto failure_unlock;
+ goto failure;
}
+ spin_lock_bh(&police->tcf_lock);
/* No failure allowed after this point */
police->tcfp_mtu = parm->mtu;
if (police->tcfp_mtu == 0) {
@@ -192,8 +192,6 @@ static int tcf_act_police_init(struct net *net, struct nlattr *nla,
return ret;
-failure_unlock:
- spin_unlock_bh(&police->tcf_lock);
failure:
qdisc_put_rtab(P_tab);
qdisc_put_rtab(R_tab);
diff --git a/net/sched/cls_matchall.c b/net/sched/cls_matchall.c
index dee469fed967..51859b8edd7e 100644
--- a/net/sched/cls_matchall.c
+++ b/net/sched/cls_matchall.c
@@ -203,7 +203,6 @@ static int mall_change(struct net *net, struct sk_buff *in_skb,
*arg = (unsigned long) head;
rcu_assign_pointer(tp->root, new);
- call_rcu(&head->rcu, mall_destroy_rcu);
return 0;
err_replace_hw_filter:
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index bbe57d57b67f..cfdbfa18a95e 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -1019,7 +1019,8 @@ static struct Qdisc *qdisc_create(struct net_device *dev,
return sch;
}
/* ops->init() failed, we call ->destroy() like qdisc_create_dflt() */
- ops->destroy(sch);
+ if (ops->destroy)
+ ops->destroy(sch);
err_out3:
dev_put(dev);
kfree((char *) sch - sch->padded);
@@ -1831,6 +1832,12 @@ static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb,
if (!qdisc_dev(root))
return 0;
+ if (tcm->tcm_parent) {
+ q = qdisc_match_from_root(root, TC_H_MAJ(tcm->tcm_parent));
+ if (q && tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
+ return -1;
+ return 0;
+ }
hash_for_each(qdisc_dev(root)->qdisc_hash, b, q, hash) {
if (tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
return -1;
diff --git a/net/sctp/associola.c b/net/sctp/associola.c
index a9708da28eb5..95238284c422 100644
--- a/net/sctp/associola.c
+++ b/net/sctp/associola.c
@@ -1176,7 +1176,9 @@ void sctp_assoc_update(struct sctp_association *asoc,
asoc->ctsn_ack_point = asoc->next_tsn - 1;
asoc->adv_peer_ack_point = asoc->ctsn_ack_point;
- if (!asoc->stream) {
+
+ if (sctp_state(asoc, COOKIE_WAIT)) {
+ sctp_stream_free(asoc->stream);
asoc->stream = new->stream;
new->stream = NULL;
}
diff --git a/net/sctp/endpointola.c b/net/sctp/endpointola.c
index 8c589230794f..3dcd0ecf3d99 100644
--- a/net/sctp/endpointola.c
+++ b/net/sctp/endpointola.c
@@ -275,6 +275,7 @@ static void sctp_endpoint_destroy(struct sctp_endpoint *ep)
if (sctp_sk(sk)->bind_hash)
sctp_put_port(sk);
+ sctp_sk(sk)->ep = NULL;
sock_put(sk);
}
diff --git a/net/sctp/input.c b/net/sctp/input.c
index 0e06a278d2a9..ba9ad32fc447 100644
--- a/net/sctp/input.c
+++ b/net/sctp/input.c
@@ -473,15 +473,14 @@ struct sock *sctp_err_lookup(struct net *net, int family, struct sk_buff *skb,
struct sctp_association **app,
struct sctp_transport **tpp)
{
+ struct sctp_init_chunk *chunkhdr, _chunkhdr;
union sctp_addr saddr;
union sctp_addr daddr;
struct sctp_af *af;
struct sock *sk = NULL;
struct sctp_association *asoc;
struct sctp_transport *transport = NULL;
- struct sctp_init_chunk *chunkhdr;
__u32 vtag = ntohl(sctphdr->vtag);
- int len = skb->len - ((void *)sctphdr - (void *)skb->data);
*app = NULL; *tpp = NULL;
@@ -516,13 +515,16 @@ struct sock *sctp_err_lookup(struct net *net, int family, struct sk_buff *skb,
* discard the packet.
*/
if (vtag == 0) {
- chunkhdr = (void *)sctphdr + sizeof(struct sctphdr);
- if (len < sizeof(struct sctphdr) + sizeof(sctp_chunkhdr_t)
- + sizeof(__be32) ||
+ /* chunk header + first 4 octects of init header */
+ chunkhdr = skb_header_pointer(skb, skb_transport_offset(skb) +
+ sizeof(struct sctphdr),
+ sizeof(struct sctp_chunkhdr) +
+ sizeof(__be32), &_chunkhdr);
+ if (!chunkhdr ||
chunkhdr->chunk_hdr.type != SCTP_CID_INIT ||
- ntohl(chunkhdr->init_hdr.init_tag) != asoc->c.my_vtag) {
+ ntohl(chunkhdr->init_hdr.init_tag) != asoc->c.my_vtag)
goto out;
- }
+
} else if (vtag != asoc->c.peer_vtag) {
goto out;
}
diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c
index 961ee59f696a..f5b45b8b8b16 100644
--- a/net/sctp/ipv6.c
+++ b/net/sctp/ipv6.c
@@ -240,12 +240,10 @@ static void sctp_v6_get_dst(struct sctp_transport *t, union sctp_addr *saddr,
struct sctp_bind_addr *bp;
struct ipv6_pinfo *np = inet6_sk(sk);
struct sctp_sockaddr_entry *laddr;
- union sctp_addr *baddr = NULL;
union sctp_addr *daddr = &t->ipaddr;
union sctp_addr dst_saddr;
struct in6_addr *final_p, final;
__u8 matchlen = 0;
- __u8 bmatchlen;
sctp_scope_t scope;
memset(fl6, 0, sizeof(struct flowi6));
@@ -312,23 +310,37 @@ static void sctp_v6_get_dst(struct sctp_transport *t, union sctp_addr *saddr,
*/
rcu_read_lock();
list_for_each_entry_rcu(laddr, &bp->address_list, list) {
- if (!laddr->valid)
+ struct dst_entry *bdst;
+ __u8 bmatchlen;
+
+ if (!laddr->valid ||
+ laddr->state != SCTP_ADDR_SRC ||
+ laddr->a.sa.sa_family != AF_INET6 ||
+ scope > sctp_scope(&laddr->a))
continue;
- if ((laddr->state == SCTP_ADDR_SRC) &&
- (laddr->a.sa.sa_family == AF_INET6) &&
- (scope <= sctp_scope(&laddr->a))) {
- bmatchlen = sctp_v6_addr_match_len(daddr, &laddr->a);
- if (!baddr || (matchlen < bmatchlen)) {
- baddr = &laddr->a;
- matchlen = bmatchlen;
- }
- }
- }
- if (baddr) {
- fl6->saddr = baddr->v6.sin6_addr;
- fl6->fl6_sport = baddr->v6.sin6_port;
+
+ fl6->saddr = laddr->a.v6.sin6_addr;
+ fl6->fl6_sport = laddr->a.v6.sin6_port;
final_p = fl6_update_dst(fl6, rcu_dereference(np->opt), &final);
- dst = ip6_dst_lookup_flow(sk, fl6, final_p);
+ bdst = ip6_dst_lookup_flow(sk, fl6, final_p);
+
+ if (!IS_ERR(bdst) &&
+ ipv6_chk_addr(dev_net(bdst->dev),
+ &laddr->a.v6.sin6_addr, bdst->dev, 1)) {
+ if (!IS_ERR_OR_NULL(dst))
+ dst_release(dst);
+ dst = bdst;
+ break;
+ }
+
+ bmatchlen = sctp_v6_addr_match_len(daddr, &laddr->a);
+ if (matchlen > bmatchlen)
+ continue;
+
+ if (!IS_ERR_OR_NULL(dst))
+ dst_release(dst);
+ dst = bdst;
+ matchlen = bmatchlen;
}
rcu_read_unlock();
@@ -665,6 +677,9 @@ static struct sock *sctp_v6_create_accept_sk(struct sock *sk,
newnp = inet6_sk(newsk);
memcpy(newnp, np, sizeof(struct ipv6_pinfo));
+ newnp->ipv6_mc_list = NULL;
+ newnp->ipv6_ac_list = NULL;
+ newnp->ipv6_fl_list = NULL;
rcu_read_lock();
opt = rcu_dereference(np->opt);
diff --git a/net/sctp/sctp_diag.c b/net/sctp/sctp_diag.c
index 048954eee984..9a647214a91e 100644
--- a/net/sctp/sctp_diag.c
+++ b/net/sctp/sctp_diag.c
@@ -278,7 +278,6 @@ out:
static int sctp_sock_dump(struct sock *sk, void *p)
{
- struct sctp_endpoint *ep = sctp_sk(sk)->ep;
struct sctp_comm_param *commp = p;
struct sk_buff *skb = commp->skb;
struct netlink_callback *cb = commp->cb;
@@ -287,7 +286,9 @@ static int sctp_sock_dump(struct sock *sk, void *p)
int err = 0;
lock_sock(sk);
- list_for_each_entry(assoc, &ep->asocs, asocs) {
+ if (!sctp_sk(sk)->ep)
+ goto release;
+ list_for_each_entry(assoc, &sctp_sk(sk)->ep->asocs, asocs) {
if (cb->args[4] < cb->args[1])
goto next;
diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c
index 8a08f13469c4..92e332e17391 100644
--- a/net/sctp/sm_make_chunk.c
+++ b/net/sctp/sm_make_chunk.c
@@ -2454,16 +2454,11 @@ int sctp_process_init(struct sctp_association *asoc, struct sctp_chunk *chunk,
* stream sequence number shall be set to 0.
*/
- /* Allocate storage for the negotiated streams if it is not a temporary
- * association.
- */
- if (!asoc->temp) {
- if (sctp_stream_init(asoc, gfp))
- goto clean_up;
+ if (sctp_stream_init(asoc, gfp))
+ goto clean_up;
- if (sctp_assoc_set_id(asoc, gfp))
- goto clean_up;
- }
+ if (!asoc->temp && sctp_assoc_set_id(asoc, gfp))
+ goto clean_up;
/* ADDIP Section 4.1 ASCONF Chunk Procedures
*
diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c
index 4f5e6cfc7f60..f863b5573e42 100644
--- a/net/sctp/sm_statefuns.c
+++ b/net/sctp/sm_statefuns.c
@@ -2088,6 +2088,9 @@ sctp_disposition_t sctp_sf_do_5_2_4_dupcook(struct net *net,
}
}
+ /* Set temp so that it won't be added into hashtable */
+ new_asoc->temp = 1;
+
/* Compare the tie_tag in cookie with the verification tag of
* current association.
*/
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index f16c8d97b7f3..3a8318e518f1 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -4622,13 +4622,13 @@ int sctp_for_each_endpoint(int (*cb)(struct sctp_endpoint *, void *),
for (head = sctp_ep_hashtable; hash < sctp_ep_hashsize;
hash++, head++) {
- read_lock(&head->lock);
+ read_lock_bh(&head->lock);
sctp_for_each_hentry(epb, &head->chain) {
err = cb(sctp_ep(epb), p);
if (err)
break;
}
- read_unlock(&head->lock);
+ read_unlock_bh(&head->lock);
}
return err;
@@ -4666,9 +4666,8 @@ int sctp_for_each_transport(int (*cb)(struct sctp_transport *, void *),
if (err)
return err;
- sctp_transport_get_idx(net, &hti, pos);
- obj = sctp_transport_get_next(net, &hti);
- for (; obj && !IS_ERR(obj); obj = sctp_transport_get_next(net, &hti)) {
+ obj = sctp_transport_get_idx(net, &hti, pos + 1);
+ for (; !IS_ERR_OR_NULL(obj); obj = sctp_transport_get_next(net, &hti)) {
struct sctp_transport *transport = obj;
if (!sctp_transport_hold(transport))
diff --git a/net/smc/Kconfig b/net/smc/Kconfig
index c717ef0896aa..33954852f3f8 100644
--- a/net/smc/Kconfig
+++ b/net/smc/Kconfig
@@ -8,6 +8,10 @@ config SMC
The Linux implementation of the SMC-R solution is designed as
a separate socket family SMC.
+ Warning: SMC will expose all memory for remote reads and writes
+ once a connection is established. Don't enable this option except
+ for tightly controlled lab environment.
+
Select this option if you want to run SMC socket applications
config SMC_DIAG
diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
index 5b6ee21368a6..6793d7348cc8 100644
--- a/net/smc/af_smc.c
+++ b/net/smc/af_smc.c
@@ -101,7 +101,7 @@ struct proto smc_proto = {
.unhash = smc_unhash_sk,
.obj_size = sizeof(struct smc_sock),
.h.smc_hash = &smc_v4_hashinfo,
- .slab_flags = SLAB_DESTROY_BY_RCU,
+ .slab_flags = SLAB_TYPESAFE_BY_RCU,
};
EXPORT_SYMBOL_GPL(smc_proto);
diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c
index e41f594a1e1d..03ec058d18df 100644
--- a/net/smc/smc_clc.c
+++ b/net/smc/smc_clc.c
@@ -204,7 +204,7 @@ int smc_clc_send_confirm(struct smc_sock *smc)
memcpy(&cclc.lcl.mac, &link->smcibdev->mac[link->ibport - 1], ETH_ALEN);
hton24(cclc.qpn, link->roce_qp->qp_num);
cclc.rmb_rkey =
- htonl(conn->rmb_desc->mr_rx[SMC_SINGLE_LINK]->rkey);
+ htonl(conn->rmb_desc->rkey[SMC_SINGLE_LINK]);
cclc.conn_idx = 1; /* for now: 1 RMB = 1 RMBE */
cclc.rmbe_alert_token = htonl(conn->alert_token_local);
cclc.qp_mtu = min(link->path_mtu, link->peer_mtu);
@@ -256,7 +256,7 @@ int smc_clc_send_accept(struct smc_sock *new_smc, int srv_first_contact)
memcpy(&aclc.lcl.mac, link->smcibdev->mac[link->ibport - 1], ETH_ALEN);
hton24(aclc.qpn, link->roce_qp->qp_num);
aclc.rmb_rkey =
- htonl(conn->rmb_desc->mr_rx[SMC_SINGLE_LINK]->rkey);
+ htonl(conn->rmb_desc->rkey[SMC_SINGLE_LINK]);
aclc.conn_idx = 1; /* as long as 1 RMB = 1 RMBE */
aclc.rmbe_alert_token = htonl(conn->alert_token_local);
aclc.qp_mtu = link->path_mtu;
diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c
index 65020e93ff21..3ac09a629ea1 100644
--- a/net/smc/smc_core.c
+++ b/net/smc/smc_core.c
@@ -613,19 +613,8 @@ int smc_rmb_create(struct smc_sock *smc)
rmb_desc = NULL;
continue; /* if mapping failed, try smaller one */
}
- rc = smc_ib_get_memory_region(lgr->lnk[SMC_SINGLE_LINK].roce_pd,
- IB_ACCESS_REMOTE_WRITE |
- IB_ACCESS_LOCAL_WRITE,
- &rmb_desc->mr_rx[SMC_SINGLE_LINK]);
- if (rc) {
- smc_ib_buf_unmap(lgr->lnk[SMC_SINGLE_LINK].smcibdev,
- tmp_bufsize, rmb_desc,
- DMA_FROM_DEVICE);
- kfree(rmb_desc->cpu_addr);
- kfree(rmb_desc);
- rmb_desc = NULL;
- continue;
- }
+ rmb_desc->rkey[SMC_SINGLE_LINK] =
+ lgr->lnk[SMC_SINGLE_LINK].roce_pd->unsafe_global_rkey;
rmb_desc->used = 1;
write_lock_bh(&lgr->rmbs_lock);
list_add(&rmb_desc->list,
@@ -668,6 +657,7 @@ int smc_rmb_rtoken_handling(struct smc_connection *conn,
for (i = 0; i < SMC_RMBS_PER_LGR_MAX; i++) {
if ((lgr->rtokens[i][SMC_SINGLE_LINK].rkey == rkey) &&
+ (lgr->rtokens[i][SMC_SINGLE_LINK].dma_addr == dma_addr) &&
test_bit(i, lgr->rtokens_used_mask)) {
conn->rtoken_idx = i;
return 0;
diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h
index 27eb38056a27..b013cb43a327 100644
--- a/net/smc/smc_core.h
+++ b/net/smc/smc_core.h
@@ -93,7 +93,7 @@ struct smc_buf_desc {
u64 dma_addr[SMC_LINKS_PER_LGR_MAX];
/* mapped address of buffer */
void *cpu_addr; /* virtual address of buffer */
- struct ib_mr *mr_rx[SMC_LINKS_PER_LGR_MAX];
+ u32 rkey[SMC_LINKS_PER_LGR_MAX];
/* for rmb only:
* rkey provided to peer
*/
diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c
index cb69ab977cd7..b31715505a35 100644
--- a/net/smc/smc_ib.c
+++ b/net/smc/smc_ib.c
@@ -37,24 +37,6 @@ u8 local_systemid[SMC_SYSTEMID_LEN] = SMC_LOCAL_SYSTEMID_RESET; /* unique system
* identifier
*/
-int smc_ib_get_memory_region(struct ib_pd *pd, int access_flags,
- struct ib_mr **mr)
-{
- int rc;
-
- if (*mr)
- return 0; /* already done */
-
- /* obtain unique key -
- * next invocation of get_dma_mr returns a different key!
- */
- *mr = pd->device->get_dma_mr(pd, access_flags);
- rc = PTR_ERR_OR_ZERO(*mr);
- if (IS_ERR(*mr))
- *mr = NULL;
- return rc;
-}
-
static int smc_ib_modify_qp_init(struct smc_link *lnk)
{
struct ib_qp_attr qp_attr;
@@ -210,7 +192,8 @@ int smc_ib_create_protection_domain(struct smc_link *lnk)
{
int rc;
- lnk->roce_pd = ib_alloc_pd(lnk->smcibdev->ibdev, 0);
+ lnk->roce_pd = ib_alloc_pd(lnk->smcibdev->ibdev,
+ IB_PD_UNSAFE_GLOBAL_RKEY);
rc = PTR_ERR_OR_ZERO(lnk->roce_pd);
if (IS_ERR(lnk->roce_pd))
lnk->roce_pd = NULL;
diff --git a/net/smc/smc_ib.h b/net/smc/smc_ib.h
index 7e1f0e24d177..b567152a526d 100644
--- a/net/smc/smc_ib.h
+++ b/net/smc/smc_ib.h
@@ -61,8 +61,6 @@ void smc_ib_dealloc_protection_domain(struct smc_link *lnk);
int smc_ib_create_protection_domain(struct smc_link *lnk);
void smc_ib_destroy_queue_pair(struct smc_link *lnk);
int smc_ib_create_queue_pair(struct smc_link *lnk);
-int smc_ib_get_memory_region(struct ib_pd *pd, int access_flags,
- struct ib_mr **mr);
int smc_ib_ready_link(struct smc_link *lnk);
int smc_ib_modify_qp_rts(struct smc_link *lnk);
int smc_ib_modify_qp_reset(struct smc_link *lnk);
diff --git a/net/sunrpc/Kconfig b/net/sunrpc/Kconfig
index 04ce2c0b660e..ac09ca803296 100644
--- a/net/sunrpc/Kconfig
+++ b/net/sunrpc/Kconfig
@@ -52,6 +52,7 @@ config SUNRPC_XPRT_RDMA
tristate "RPC-over-RDMA transport"
depends on SUNRPC && INFINIBAND && INFINIBAND_ADDR_TRANS
default SUNRPC && INFINIBAND
+ select SG_POOL
help
This option allows the NFS client and server to use RDMA
transports (InfiniBand, iWARP, or RoCE).
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 52da3ce54bb5..b5cb921775a0 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -1042,8 +1042,6 @@ struct rpc_task *rpc_run_task(const struct rpc_task_setup *task_setup_data)
struct rpc_task *task;
task = rpc_new_task(task_setup_data);
- if (IS_ERR(task))
- goto out;
rpc_task_set_client(task, task_setup_data->rpc_client);
rpc_task_set_rpc_message(task, task_setup_data->rpc_message);
@@ -1053,7 +1051,6 @@ struct rpc_task *rpc_run_task(const struct rpc_task_setup *task_setup_data)
atomic_inc(&task->tk_count);
rpc_execute(task);
-out:
return task;
}
EXPORT_SYMBOL_GPL(rpc_run_task);
@@ -1140,10 +1137,6 @@ struct rpc_task *rpc_run_bc_task(struct rpc_rqst *req)
* Create an rpc_task to send the data
*/
task = rpc_new_task(&task_setup_data);
- if (IS_ERR(task)) {
- xprt_free_bc_request(req);
- goto out;
- }
task->tk_rqstp = req;
/*
@@ -1158,7 +1151,6 @@ struct rpc_task *rpc_run_bc_task(struct rpc_rqst *req)
WARN_ON_ONCE(atomic_read(&task->tk_count) != 2);
rpc_execute(task);
-out:
dprintk("RPC: rpc_run_bc_task: task= %p\n", task);
return task;
}
diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c
index 5db68b371db2..0cc83839c13c 100644
--- a/net/sunrpc/sched.c
+++ b/net/sunrpc/sched.c
@@ -965,11 +965,6 @@ struct rpc_task *rpc_new_task(const struct rpc_task_setup *setup_data)
if (task == NULL) {
task = rpc_alloc_task();
- if (task == NULL) {
- rpc_release_calldata(setup_data->callback_ops,
- setup_data->callback_data);
- return ERR_PTR(-ENOMEM);
- }
flags = RPC_TASK_DYNAMIC;
}
diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index a08aeb56b8e4..bc0f5a0ecbdc 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -702,59 +702,32 @@ found_pool:
return task;
}
-/*
- * Create or destroy enough new threads to make the number
- * of threads the given number. If `pool' is non-NULL, applies
- * only to threads in that pool, otherwise round-robins between
- * all pools. Caller must ensure that mutual exclusion between this and
- * server startup or shutdown.
- *
- * Destroying threads relies on the service threads filling in
- * rqstp->rq_task, which only the nfs ones do. Assumes the serv
- * has been created using svc_create_pooled().
- *
- * Based on code that used to be in nfsd_svc() but tweaked
- * to be pool-aware.
- */
-int
-svc_set_num_threads(struct svc_serv *serv, struct svc_pool *pool, int nrservs)
+/* create new threads */
+static int
+svc_start_kthreads(struct svc_serv *serv, struct svc_pool *pool, int nrservs)
{
struct svc_rqst *rqstp;
struct task_struct *task;
struct svc_pool *chosen_pool;
- int error = 0;
unsigned int state = serv->sv_nrthreads-1;
int node;
- if (pool == NULL) {
- /* The -1 assumes caller has done a svc_get() */
- nrservs -= (serv->sv_nrthreads-1);
- } else {
- spin_lock_bh(&pool->sp_lock);
- nrservs -= pool->sp_nrthreads;
- spin_unlock_bh(&pool->sp_lock);
- }
-
- /* create new threads */
- while (nrservs > 0) {
+ do {
nrservs--;
chosen_pool = choose_pool(serv, pool, &state);
node = svc_pool_map_get_node(chosen_pool->sp_id);
rqstp = svc_prepare_thread(serv, chosen_pool, node);
- if (IS_ERR(rqstp)) {
- error = PTR_ERR(rqstp);
- break;
- }
+ if (IS_ERR(rqstp))
+ return PTR_ERR(rqstp);
__module_get(serv->sv_ops->svo_module);
task = kthread_create_on_node(serv->sv_ops->svo_function, rqstp,
node, "%s", serv->sv_name);
if (IS_ERR(task)) {
- error = PTR_ERR(task);
module_put(serv->sv_ops->svo_module);
svc_exit_thread(rqstp);
- break;
+ return PTR_ERR(task);
}
rqstp->rq_task = task;
@@ -763,18 +736,103 @@ svc_set_num_threads(struct svc_serv *serv, struct svc_pool *pool, int nrservs)
svc_sock_update_bufs(serv);
wake_up_process(task);
- }
+ } while (nrservs > 0);
+
+ return 0;
+}
+
+
+/* destroy old threads */
+static int
+svc_signal_kthreads(struct svc_serv *serv, struct svc_pool *pool, int nrservs)
+{
+ struct task_struct *task;
+ unsigned int state = serv->sv_nrthreads-1;
+
/* destroy old threads */
- while (nrservs < 0 &&
- (task = choose_victim(serv, pool, &state)) != NULL) {
+ do {
+ task = choose_victim(serv, pool, &state);
+ if (task == NULL)
+ break;
send_sig(SIGINT, task, 1);
nrservs++;
+ } while (nrservs < 0);
+
+ return 0;
+}
+
+/*
+ * Create or destroy enough new threads to make the number
+ * of threads the given number. If `pool' is non-NULL, applies
+ * only to threads in that pool, otherwise round-robins between
+ * all pools. Caller must ensure that mutual exclusion between this and
+ * server startup or shutdown.
+ *
+ * Destroying threads relies on the service threads filling in
+ * rqstp->rq_task, which only the nfs ones do. Assumes the serv
+ * has been created using svc_create_pooled().
+ *
+ * Based on code that used to be in nfsd_svc() but tweaked
+ * to be pool-aware.
+ */
+int
+svc_set_num_threads(struct svc_serv *serv, struct svc_pool *pool, int nrservs)
+{
+ if (pool == NULL) {
+ /* The -1 assumes caller has done a svc_get() */
+ nrservs -= (serv->sv_nrthreads-1);
+ } else {
+ spin_lock_bh(&pool->sp_lock);
+ nrservs -= pool->sp_nrthreads;
+ spin_unlock_bh(&pool->sp_lock);
}
- return error;
+ if (nrservs > 0)
+ return svc_start_kthreads(serv, pool, nrservs);
+ if (nrservs < 0)
+ return svc_signal_kthreads(serv, pool, nrservs);
+ return 0;
}
EXPORT_SYMBOL_GPL(svc_set_num_threads);
+/* destroy old threads */
+static int
+svc_stop_kthreads(struct svc_serv *serv, struct svc_pool *pool, int nrservs)
+{
+ struct task_struct *task;
+ unsigned int state = serv->sv_nrthreads-1;
+
+ /* destroy old threads */
+ do {
+ task = choose_victim(serv, pool, &state);
+ if (task == NULL)
+ break;
+ kthread_stop(task);
+ nrservs++;
+ } while (nrservs < 0);
+ return 0;
+}
+
+int
+svc_set_num_threads_sync(struct svc_serv *serv, struct svc_pool *pool, int nrservs)
+{
+ if (pool == NULL) {
+ /* The -1 assumes caller has done a svc_get() */
+ nrservs -= (serv->sv_nrthreads-1);
+ } else {
+ spin_lock_bh(&pool->sp_lock);
+ nrservs -= pool->sp_nrthreads;
+ spin_unlock_bh(&pool->sp_lock);
+ }
+
+ if (nrservs > 0)
+ return svc_start_kthreads(serv, pool, nrservs);
+ if (nrservs < 0)
+ return svc_stop_kthreads(serv, pool, nrservs);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(svc_set_num_threads_sync);
+
/*
* Called from a server thread as it's exiting. Caller must hold the "service
* mutex" for the service.
diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c
index 1f7082144e01..e34f4ee7f2b6 100644
--- a/net/sunrpc/xdr.c
+++ b/net/sunrpc/xdr.c
@@ -807,7 +807,7 @@ void xdr_init_decode(struct xdr_stream *xdr, struct xdr_buf *buf, __be32 *p)
EXPORT_SYMBOL_GPL(xdr_init_decode);
/**
- * xdr_init_decode - Initialize an xdr_stream for decoding data.
+ * xdr_init_decode_pages - Initialize an xdr_stream for decoding into pages
* @xdr: pointer to xdr_stream struct
* @buf: pointer to XDR buffer from which to decode data
* @pages: list of pages to decode into
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index b530a2852ba8..3e63c5e97ebe 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -651,6 +651,7 @@ void xprt_force_disconnect(struct rpc_xprt *xprt)
xprt_wake_pending_tasks(xprt, -EAGAIN);
spin_unlock_bh(&xprt->transport_lock);
}
+EXPORT_SYMBOL_GPL(xprt_force_disconnect);
/**
* xprt_conditional_disconnect - force a transport to disconnect
diff --git a/net/sunrpc/xprtrdma/Makefile b/net/sunrpc/xprtrdma/Makefile
index ef19fa42c50f..c1ae8142ab73 100644
--- a/net/sunrpc/xprtrdma/Makefile
+++ b/net/sunrpc/xprtrdma/Makefile
@@ -4,5 +4,5 @@ rpcrdma-y := transport.o rpc_rdma.o verbs.o \
fmr_ops.o frwr_ops.o \
svc_rdma.o svc_rdma_backchannel.o svc_rdma_transport.o \
svc_rdma_marshal.o svc_rdma_sendto.o svc_rdma_recvfrom.o \
- module.o
+ svc_rdma_rw.o module.o
rpcrdma-$(CONFIG_SUNRPC_BACKCHANNEL) += backchannel.o
diff --git a/net/sunrpc/xprtrdma/backchannel.c b/net/sunrpc/xprtrdma/backchannel.c
index 24fedd4b117e..03f6b5840764 100644
--- a/net/sunrpc/xprtrdma/backchannel.c
+++ b/net/sunrpc/xprtrdma/backchannel.c
@@ -119,11 +119,9 @@ int xprt_rdma_bc_setup(struct rpc_xprt *xprt, unsigned int reqs)
for (i = 0; i < (reqs << 1); i++) {
rqst = kzalloc(sizeof(*rqst), GFP_KERNEL);
- if (!rqst) {
- pr_err("RPC: %s: Failed to create bc rpc_rqst\n",
- __func__);
+ if (!rqst)
goto out_free;
- }
+
dprintk("RPC: %s: new rqst %p\n", __func__, rqst);
rqst->rq_xprt = &r_xprt->rx_xprt;
diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
index a044be2d6ad7..694e9b13ecf0 100644
--- a/net/sunrpc/xprtrdma/rpc_rdma.c
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -494,7 +494,7 @@ rpcrdma_prepare_hdr_sge(struct rpcrdma_ia *ia, struct rpcrdma_req *req,
}
sge->length = len;
- ib_dma_sync_single_for_device(ia->ri_device, sge->addr,
+ ib_dma_sync_single_for_device(rdmab_device(rb), sge->addr,
sge->length, DMA_TO_DEVICE);
req->rl_send_wr.num_sge++;
return true;
@@ -523,7 +523,7 @@ rpcrdma_prepare_msg_sges(struct rpcrdma_ia *ia, struct rpcrdma_req *req,
sge[sge_no].addr = rdmab_addr(rb);
sge[sge_no].length = xdr->head[0].iov_len;
sge[sge_no].lkey = rdmab_lkey(rb);
- ib_dma_sync_single_for_device(device, sge[sge_no].addr,
+ ib_dma_sync_single_for_device(rdmab_device(rb), sge[sge_no].addr,
sge[sge_no].length, DMA_TO_DEVICE);
/* If there is a Read chunk, the page list is being handled
@@ -781,9 +781,11 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
return 0;
out_err:
- pr_err("rpcrdma: rpcrdma_marshal_req failed, status %ld\n",
- PTR_ERR(iptr));
- r_xprt->rx_stats.failed_marshal_count++;
+ if (PTR_ERR(iptr) != -ENOBUFS) {
+ pr_err("rpcrdma: rpcrdma_marshal_req failed, status %ld\n",
+ PTR_ERR(iptr));
+ r_xprt->rx_stats.failed_marshal_count++;
+ }
return PTR_ERR(iptr);
}
diff --git a/net/sunrpc/xprtrdma/svc_rdma.c b/net/sunrpc/xprtrdma/svc_rdma.c
index c846ca9f1eba..a4a8f6989ee7 100644
--- a/net/sunrpc/xprtrdma/svc_rdma.c
+++ b/net/sunrpc/xprtrdma/svc_rdma.c
@@ -58,9 +58,9 @@ unsigned int svcrdma_max_requests = RPCRDMA_MAX_REQUESTS;
unsigned int svcrdma_max_bc_requests = RPCRDMA_MAX_BC_REQUESTS;
static unsigned int min_max_requests = 4;
static unsigned int max_max_requests = 16384;
-unsigned int svcrdma_max_req_size = RPCRDMA_MAX_REQ_SIZE;
-static unsigned int min_max_inline = 4096;
-static unsigned int max_max_inline = 65536;
+unsigned int svcrdma_max_req_size = RPCRDMA_DEF_INLINE_THRESH;
+static unsigned int min_max_inline = RPCRDMA_DEF_INLINE_THRESH;
+static unsigned int max_max_inline = RPCRDMA_MAX_INLINE_THRESH;
atomic_t rdma_stat_recv;
atomic_t rdma_stat_read;
@@ -247,8 +247,6 @@ int svc_rdma_init(void)
dprintk("SVCRDMA Module Init, register RPC RDMA transport\n");
dprintk("\tsvcrdma_ord : %d\n", svcrdma_ord);
dprintk("\tmax_requests : %u\n", svcrdma_max_requests);
- dprintk("\tsq_depth : %u\n",
- svcrdma_max_requests * RPCRDMA_SQ_DEPTH_MULT);
dprintk("\tmax_bc_requests : %u\n", svcrdma_max_bc_requests);
dprintk("\tmax_inline : %d\n", svcrdma_max_req_size);
diff --git a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c
index ff1df40f0d26..c676ed0efb5a 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c
@@ -12,7 +12,17 @@
#undef SVCRDMA_BACKCHANNEL_DEBUG
-int svc_rdma_handle_bc_reply(struct rpc_xprt *xprt, struct rpcrdma_msg *rmsgp,
+/**
+ * svc_rdma_handle_bc_reply - Process incoming backchannel reply
+ * @xprt: controlling backchannel transport
+ * @rdma_resp: pointer to incoming transport header
+ * @rcvbuf: XDR buffer into which to decode the reply
+ *
+ * Returns:
+ * %0 if @rcvbuf is filled in, xprt_complete_rqst called,
+ * %-EAGAIN if server should call ->recvfrom again.
+ */
+int svc_rdma_handle_bc_reply(struct rpc_xprt *xprt, __be32 *rdma_resp,
struct xdr_buf *rcvbuf)
{
struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
@@ -27,13 +37,13 @@ int svc_rdma_handle_bc_reply(struct rpc_xprt *xprt, struct rpcrdma_msg *rmsgp,
p = (__be32 *)src->iov_base;
len = src->iov_len;
- xid = rmsgp->rm_xid;
+ xid = *rdma_resp;
#ifdef SVCRDMA_BACKCHANNEL_DEBUG
pr_info("%s: xid=%08x, length=%zu\n",
__func__, be32_to_cpu(xid), len);
pr_info("%s: RPC/RDMA: %*ph\n",
- __func__, (int)RPCRDMA_HDRLEN_MIN, rmsgp);
+ __func__, (int)RPCRDMA_HDRLEN_MIN, rdma_resp);
pr_info("%s: RPC: %*ph\n",
__func__, (int)len, p);
#endif
@@ -53,7 +63,7 @@ int svc_rdma_handle_bc_reply(struct rpc_xprt *xprt, struct rpcrdma_msg *rmsgp,
goto out_unlock;
memcpy(dst->iov_base, p, len);
- credits = be32_to_cpu(rmsgp->rm_credit);
+ credits = be32_to_cpup(rdma_resp + 2);
if (credits == 0)
credits = 1; /* don't deadlock */
else if (credits > r_xprt->rx_buf.rb_bc_max_requests)
@@ -90,9 +100,9 @@ out_notfound:
* Caller holds the connection's mutex and has already marshaled
* the RPC/RDMA request.
*
- * This is similar to svc_rdma_reply, but takes an rpc_rqst
- * instead, does not support chunks, and avoids blocking memory
- * allocation.
+ * This is similar to svc_rdma_send_reply_msg, but takes a struct
+ * rpc_rqst instead, does not support chunks, and avoids blocking
+ * memory allocation.
*
* XXX: There is still an opportunity to block in svc_rdma_send()
* if there are no SQ entries to post the Send. This may occur if
@@ -101,59 +111,36 @@ out_notfound:
static int svc_rdma_bc_sendto(struct svcxprt_rdma *rdma,
struct rpc_rqst *rqst)
{
- struct xdr_buf *sndbuf = &rqst->rq_snd_buf;
struct svc_rdma_op_ctxt *ctxt;
- struct svc_rdma_req_map *vec;
- struct ib_send_wr send_wr;
int ret;
- vec = svc_rdma_get_req_map(rdma);
- ret = svc_rdma_map_xdr(rdma, sndbuf, vec, false);
- if (ret)
+ ctxt = svc_rdma_get_context(rdma);
+
+ /* rpcrdma_bc_send_request builds the transport header and
+ * the backchannel RPC message in the same buffer. Thus only
+ * one SGE is needed to send both.
+ */
+ ret = svc_rdma_map_reply_hdr(rdma, ctxt, rqst->rq_buffer,
+ rqst->rq_snd_buf.len);
+ if (ret < 0)
goto out_err;
ret = svc_rdma_repost_recv(rdma, GFP_NOIO);
if (ret)
goto out_err;
- ctxt = svc_rdma_get_context(rdma);
- ctxt->pages[0] = virt_to_page(rqst->rq_buffer);
- ctxt->count = 1;
-
- ctxt->direction = DMA_TO_DEVICE;
- ctxt->sge[0].lkey = rdma->sc_pd->local_dma_lkey;
- ctxt->sge[0].length = sndbuf->len;
- ctxt->sge[0].addr =
- ib_dma_map_page(rdma->sc_cm_id->device, ctxt->pages[0], 0,
- sndbuf->len, DMA_TO_DEVICE);
- if (ib_dma_mapping_error(rdma->sc_cm_id->device, ctxt->sge[0].addr)) {
- ret = -EIO;
- goto out_unmap;
- }
- svc_rdma_count_mappings(rdma, ctxt);
-
- memset(&send_wr, 0, sizeof(send_wr));
- ctxt->cqe.done = svc_rdma_wc_send;
- send_wr.wr_cqe = &ctxt->cqe;
- send_wr.sg_list = ctxt->sge;
- send_wr.num_sge = 1;
- send_wr.opcode = IB_WR_SEND;
- send_wr.send_flags = IB_SEND_SIGNALED;
-
- ret = svc_rdma_send(rdma, &send_wr);
- if (ret) {
- ret = -EIO;
+ ret = svc_rdma_post_send_wr(rdma, ctxt, 1, 0);
+ if (ret)
goto out_unmap;
- }
out_err:
- svc_rdma_put_req_map(rdma, vec);
dprintk("svcrdma: %s returns %d\n", __func__, ret);
return ret;
out_unmap:
svc_rdma_unmap_dma(ctxt);
svc_rdma_put_context(ctxt, 1);
+ ret = -EIO;
goto out_err;
}
diff --git a/net/sunrpc/xprtrdma/svc_rdma_marshal.c b/net/sunrpc/xprtrdma/svc_rdma_marshal.c
index 1c4aabf0f657..bdcf7d85a3dc 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_marshal.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_marshal.c
@@ -166,92 +166,3 @@ out_inval:
dprintk("svcrdma: failed to parse transport header\n");
return -EINVAL;
}
-
-int svc_rdma_xdr_encode_error(struct svcxprt_rdma *xprt,
- struct rpcrdma_msg *rmsgp,
- enum rpcrdma_errcode err, __be32 *va)
-{
- __be32 *startp = va;
-
- *va++ = rmsgp->rm_xid;
- *va++ = rmsgp->rm_vers;
- *va++ = xprt->sc_fc_credits;
- *va++ = rdma_error;
- *va++ = cpu_to_be32(err);
- if (err == ERR_VERS) {
- *va++ = rpcrdma_version;
- *va++ = rpcrdma_version;
- }
-
- return (int)((unsigned long)va - (unsigned long)startp);
-}
-
-/**
- * svc_rdma_xdr_get_reply_hdr_length - Get length of Reply transport header
- * @rdma_resp: buffer containing Reply transport header
- *
- * Returns length of transport header, in bytes.
- */
-unsigned int svc_rdma_xdr_get_reply_hdr_len(__be32 *rdma_resp)
-{
- unsigned int nsegs;
- __be32 *p;
-
- p = rdma_resp;
-
- /* RPC-over-RDMA V1 replies never have a Read list. */
- p += rpcrdma_fixed_maxsz + 1;
-
- /* Skip Write list. */
- while (*p++ != xdr_zero) {
- nsegs = be32_to_cpup(p++);
- p += nsegs * rpcrdma_segment_maxsz;
- }
-
- /* Skip Reply chunk. */
- if (*p++ != xdr_zero) {
- nsegs = be32_to_cpup(p++);
- p += nsegs * rpcrdma_segment_maxsz;
- }
-
- return (unsigned long)p - (unsigned long)rdma_resp;
-}
-
-void svc_rdma_xdr_encode_write_list(struct rpcrdma_msg *rmsgp, int chunks)
-{
- struct rpcrdma_write_array *ary;
-
- /* no read-list */
- rmsgp->rm_body.rm_chunks[0] = xdr_zero;
-
- /* write-array discrim */
- ary = (struct rpcrdma_write_array *)
- &rmsgp->rm_body.rm_chunks[1];
- ary->wc_discrim = xdr_one;
- ary->wc_nchunks = cpu_to_be32(chunks);
-
- /* write-list terminator */
- ary->wc_array[chunks].wc_target.rs_handle = xdr_zero;
-
- /* reply-array discriminator */
- ary->wc_array[chunks].wc_target.rs_length = xdr_zero;
-}
-
-void svc_rdma_xdr_encode_reply_array(struct rpcrdma_write_array *ary,
- int chunks)
-{
- ary->wc_discrim = xdr_one;
- ary->wc_nchunks = cpu_to_be32(chunks);
-}
-
-void svc_rdma_xdr_encode_array_chunk(struct rpcrdma_write_array *ary,
- int chunk_no,
- __be32 rs_handle,
- __be64 rs_offset,
- u32 write_len)
-{
- struct rpcrdma_segment *seg = &ary->wc_array[chunk_no].wc_target;
- seg->rs_handle = rs_handle;
- seg->rs_offset = rs_offset;
- seg->rs_length = cpu_to_be32(write_len);
-}
diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
index f7b2daf72a86..27a99bf5b1a6 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
@@ -558,33 +558,85 @@ static void rdma_read_complete(struct svc_rqst *rqstp,
rqstp->rq_arg.buflen = head->arg.buflen;
}
+static void svc_rdma_send_error(struct svcxprt_rdma *xprt,
+ __be32 *rdma_argp, int status)
+{
+ struct svc_rdma_op_ctxt *ctxt;
+ __be32 *p, *err_msgp;
+ unsigned int length;
+ struct page *page;
+ int ret;
+
+ ret = svc_rdma_repost_recv(xprt, GFP_KERNEL);
+ if (ret)
+ return;
+
+ page = alloc_page(GFP_KERNEL);
+ if (!page)
+ return;
+ err_msgp = page_address(page);
+
+ p = err_msgp;
+ *p++ = *rdma_argp;
+ *p++ = *(rdma_argp + 1);
+ *p++ = xprt->sc_fc_credits;
+ *p++ = rdma_error;
+ if (status == -EPROTONOSUPPORT) {
+ *p++ = err_vers;
+ *p++ = rpcrdma_version;
+ *p++ = rpcrdma_version;
+ } else {
+ *p++ = err_chunk;
+ }
+ length = (unsigned long)p - (unsigned long)err_msgp;
+
+ /* Map transport header; no RPC message payload */
+ ctxt = svc_rdma_get_context(xprt);
+ ret = svc_rdma_map_reply_hdr(xprt, ctxt, err_msgp, length);
+ if (ret) {
+ dprintk("svcrdma: Error %d mapping send for protocol error\n",
+ ret);
+ return;
+ }
+
+ ret = svc_rdma_post_send_wr(xprt, ctxt, 1, 0);
+ if (ret) {
+ dprintk("svcrdma: Error %d posting send for protocol error\n",
+ ret);
+ svc_rdma_unmap_dma(ctxt);
+ svc_rdma_put_context(ctxt, 1);
+ }
+}
+
/* By convention, backchannel calls arrive via rdma_msg type
* messages, and never populate the chunk lists. This makes
* the RPC/RDMA header small and fixed in size, so it is
* straightforward to check the RPC header's direction field.
*/
-static bool
-svc_rdma_is_backchannel_reply(struct svc_xprt *xprt, struct rpcrdma_msg *rmsgp)
+static bool svc_rdma_is_backchannel_reply(struct svc_xprt *xprt,
+ __be32 *rdma_resp)
{
- __be32 *p = (__be32 *)rmsgp;
+ __be32 *p;
if (!xprt->xpt_bc_xprt)
return false;
- if (rmsgp->rm_type != rdma_msg)
+ p = rdma_resp + 3;
+ if (*p++ != rdma_msg)
return false;
- if (rmsgp->rm_body.rm_chunks[0] != xdr_zero)
+
+ if (*p++ != xdr_zero)
return false;
- if (rmsgp->rm_body.rm_chunks[1] != xdr_zero)
+ if (*p++ != xdr_zero)
return false;
- if (rmsgp->rm_body.rm_chunks[2] != xdr_zero)
+ if (*p++ != xdr_zero)
return false;
- /* sanity */
- if (p[7] != rmsgp->rm_xid)
+ /* XID sanity */
+ if (*p++ != *rdma_resp)
return false;
/* call direction */
- if (p[8] == cpu_to_be32(RPC_CALL))
+ if (*p == cpu_to_be32(RPC_CALL))
return false;
return true;
@@ -650,8 +702,9 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp)
goto out_drop;
rqstp->rq_xprt_hlen = ret;
- if (svc_rdma_is_backchannel_reply(xprt, rmsgp)) {
- ret = svc_rdma_handle_bc_reply(xprt->xpt_bc_xprt, rmsgp,
+ if (svc_rdma_is_backchannel_reply(xprt, &rmsgp->rm_xid)) {
+ ret = svc_rdma_handle_bc_reply(xprt->xpt_bc_xprt,
+ &rmsgp->rm_xid,
&rqstp->rq_arg);
svc_rdma_put_context(ctxt, 0);
if (ret)
@@ -686,7 +739,7 @@ complete:
return ret;
out_err:
- svc_rdma_send_error(rdma_xprt, rmsgp, ret);
+ svc_rdma_send_error(rdma_xprt, &rmsgp->rm_xid, ret);
svc_rdma_put_context(ctxt, 0);
return 0;
diff --git a/net/sunrpc/xprtrdma/svc_rdma_rw.c b/net/sunrpc/xprtrdma/svc_rdma_rw.c
new file mode 100644
index 000000000000..0cf620277693
--- /dev/null
+++ b/net/sunrpc/xprtrdma/svc_rdma_rw.c
@@ -0,0 +1,512 @@
+/*
+ * Copyright (c) 2016 Oracle. All rights reserved.
+ *
+ * Use the core R/W API to move RPC-over-RDMA Read and Write chunks.
+ */
+
+#include <linux/sunrpc/rpc_rdma.h>
+#include <linux/sunrpc/svc_rdma.h>
+#include <linux/sunrpc/debug.h>
+
+#include <rdma/rw.h>
+
+#define RPCDBG_FACILITY RPCDBG_SVCXPRT
+
+/* Each R/W context contains state for one chain of RDMA Read or
+ * Write Work Requests.
+ *
+ * Each WR chain handles a single contiguous server-side buffer,
+ * because scatterlist entries after the first have to start on
+ * page alignment. xdr_buf iovecs cannot guarantee alignment.
+ *
+ * Each WR chain handles only one R_key. Each RPC-over-RDMA segment
+ * from a client may contain a unique R_key, so each WR chain moves
+ * up to one segment at a time.
+ *
+ * The scatterlist makes this data structure over 4KB in size. To
+ * make it less likely to fail, and to handle the allocation for
+ * smaller I/O requests without disabling bottom-halves, these
+ * contexts are created on demand, but cached and reused until the
+ * controlling svcxprt_rdma is destroyed.
+ */
+struct svc_rdma_rw_ctxt {
+ struct list_head rw_list;
+ struct rdma_rw_ctx rw_ctx;
+ int rw_nents;
+ struct sg_table rw_sg_table;
+ struct scatterlist rw_first_sgl[0];
+};
+
+static inline struct svc_rdma_rw_ctxt *
+svc_rdma_next_ctxt(struct list_head *list)
+{
+ return list_first_entry_or_null(list, struct svc_rdma_rw_ctxt,
+ rw_list);
+}
+
+static struct svc_rdma_rw_ctxt *
+svc_rdma_get_rw_ctxt(struct svcxprt_rdma *rdma, unsigned int sges)
+{
+ struct svc_rdma_rw_ctxt *ctxt;
+
+ spin_lock(&rdma->sc_rw_ctxt_lock);
+
+ ctxt = svc_rdma_next_ctxt(&rdma->sc_rw_ctxts);
+ if (ctxt) {
+ list_del(&ctxt->rw_list);
+ spin_unlock(&rdma->sc_rw_ctxt_lock);
+ } else {
+ spin_unlock(&rdma->sc_rw_ctxt_lock);
+ ctxt = kmalloc(sizeof(*ctxt) +
+ SG_CHUNK_SIZE * sizeof(struct scatterlist),
+ GFP_KERNEL);
+ if (!ctxt)
+ goto out;
+ INIT_LIST_HEAD(&ctxt->rw_list);
+ }
+
+ ctxt->rw_sg_table.sgl = ctxt->rw_first_sgl;
+ if (sg_alloc_table_chained(&ctxt->rw_sg_table, sges,
+ ctxt->rw_sg_table.sgl)) {
+ kfree(ctxt);
+ ctxt = NULL;
+ }
+out:
+ return ctxt;
+}
+
+static void svc_rdma_put_rw_ctxt(struct svcxprt_rdma *rdma,
+ struct svc_rdma_rw_ctxt *ctxt)
+{
+ sg_free_table_chained(&ctxt->rw_sg_table, true);
+
+ spin_lock(&rdma->sc_rw_ctxt_lock);
+ list_add(&ctxt->rw_list, &rdma->sc_rw_ctxts);
+ spin_unlock(&rdma->sc_rw_ctxt_lock);
+}
+
+/**
+ * svc_rdma_destroy_rw_ctxts - Free accumulated R/W contexts
+ * @rdma: transport about to be destroyed
+ *
+ */
+void svc_rdma_destroy_rw_ctxts(struct svcxprt_rdma *rdma)
+{
+ struct svc_rdma_rw_ctxt *ctxt;
+
+ while ((ctxt = svc_rdma_next_ctxt(&rdma->sc_rw_ctxts)) != NULL) {
+ list_del(&ctxt->rw_list);
+ kfree(ctxt);
+ }
+}
+
+/* A chunk context tracks all I/O for moving one Read or Write
+ * chunk. This is a a set of rdma_rw's that handle data movement
+ * for all segments of one chunk.
+ *
+ * These are small, acquired with a single allocator call, and
+ * no more than one is needed per chunk. They are allocated on
+ * demand, and not cached.
+ */
+struct svc_rdma_chunk_ctxt {
+ struct ib_cqe cc_cqe;
+ struct svcxprt_rdma *cc_rdma;
+ struct list_head cc_rwctxts;
+ int cc_sqecount;
+ enum dma_data_direction cc_dir;
+};
+
+static void svc_rdma_cc_init(struct svcxprt_rdma *rdma,
+ struct svc_rdma_chunk_ctxt *cc,
+ enum dma_data_direction dir)
+{
+ cc->cc_rdma = rdma;
+ svc_xprt_get(&rdma->sc_xprt);
+
+ INIT_LIST_HEAD(&cc->cc_rwctxts);
+ cc->cc_sqecount = 0;
+ cc->cc_dir = dir;
+}
+
+static void svc_rdma_cc_release(struct svc_rdma_chunk_ctxt *cc)
+{
+ struct svcxprt_rdma *rdma = cc->cc_rdma;
+ struct svc_rdma_rw_ctxt *ctxt;
+
+ while ((ctxt = svc_rdma_next_ctxt(&cc->cc_rwctxts)) != NULL) {
+ list_del(&ctxt->rw_list);
+
+ rdma_rw_ctx_destroy(&ctxt->rw_ctx, rdma->sc_qp,
+ rdma->sc_port_num, ctxt->rw_sg_table.sgl,
+ ctxt->rw_nents, cc->cc_dir);
+ svc_rdma_put_rw_ctxt(rdma, ctxt);
+ }
+ svc_xprt_put(&rdma->sc_xprt);
+}
+
+/* State for sending a Write or Reply chunk.
+ * - Tracks progress of writing one chunk over all its segments
+ * - Stores arguments for the SGL constructor functions
+ */
+struct svc_rdma_write_info {
+ /* write state of this chunk */
+ unsigned int wi_seg_off;
+ unsigned int wi_seg_no;
+ unsigned int wi_nsegs;
+ __be32 *wi_segs;
+
+ /* SGL constructor arguments */
+ struct xdr_buf *wi_xdr;
+ unsigned char *wi_base;
+ unsigned int wi_next_off;
+
+ struct svc_rdma_chunk_ctxt wi_cc;
+};
+
+static struct svc_rdma_write_info *
+svc_rdma_write_info_alloc(struct svcxprt_rdma *rdma, __be32 *chunk)
+{
+ struct svc_rdma_write_info *info;
+
+ info = kmalloc(sizeof(*info), GFP_KERNEL);
+ if (!info)
+ return info;
+
+ info->wi_seg_off = 0;
+ info->wi_seg_no = 0;
+ info->wi_nsegs = be32_to_cpup(++chunk);
+ info->wi_segs = ++chunk;
+ svc_rdma_cc_init(rdma, &info->wi_cc, DMA_TO_DEVICE);
+ return info;
+}
+
+static void svc_rdma_write_info_free(struct svc_rdma_write_info *info)
+{
+ svc_rdma_cc_release(&info->wi_cc);
+ kfree(info);
+}
+
+/**
+ * svc_rdma_write_done - Write chunk completion
+ * @cq: controlling Completion Queue
+ * @wc: Work Completion
+ *
+ * Pages under I/O are freed by a subsequent Send completion.
+ */
+static void svc_rdma_write_done(struct ib_cq *cq, struct ib_wc *wc)
+{
+ struct ib_cqe *cqe = wc->wr_cqe;
+ struct svc_rdma_chunk_ctxt *cc =
+ container_of(cqe, struct svc_rdma_chunk_ctxt, cc_cqe);
+ struct svcxprt_rdma *rdma = cc->cc_rdma;
+ struct svc_rdma_write_info *info =
+ container_of(cc, struct svc_rdma_write_info, wi_cc);
+
+ atomic_add(cc->cc_sqecount, &rdma->sc_sq_avail);
+ wake_up(&rdma->sc_send_wait);
+
+ if (unlikely(wc->status != IB_WC_SUCCESS)) {
+ set_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags);
+ if (wc->status != IB_WC_WR_FLUSH_ERR)
+ pr_err("svcrdma: write ctx: %s (%u/0x%x)\n",
+ ib_wc_status_msg(wc->status),
+ wc->status, wc->vendor_err);
+ }
+
+ svc_rdma_write_info_free(info);
+}
+
+/* This function sleeps when the transport's Send Queue is congested.
+ *
+ * Assumptions:
+ * - If ib_post_send() succeeds, only one completion is expected,
+ * even if one or more WRs are flushed. This is true when posting
+ * an rdma_rw_ctx or when posting a single signaled WR.
+ */
+static int svc_rdma_post_chunk_ctxt(struct svc_rdma_chunk_ctxt *cc)
+{
+ struct svcxprt_rdma *rdma = cc->cc_rdma;
+ struct svc_xprt *xprt = &rdma->sc_xprt;
+ struct ib_send_wr *first_wr, *bad_wr;
+ struct list_head *tmp;
+ struct ib_cqe *cqe;
+ int ret;
+
+ first_wr = NULL;
+ cqe = &cc->cc_cqe;
+ list_for_each(tmp, &cc->cc_rwctxts) {
+ struct svc_rdma_rw_ctxt *ctxt;
+
+ ctxt = list_entry(tmp, struct svc_rdma_rw_ctxt, rw_list);
+ first_wr = rdma_rw_ctx_wrs(&ctxt->rw_ctx, rdma->sc_qp,
+ rdma->sc_port_num, cqe, first_wr);
+ cqe = NULL;
+ }
+
+ do {
+ if (atomic_sub_return(cc->cc_sqecount,
+ &rdma->sc_sq_avail) > 0) {
+ ret = ib_post_send(rdma->sc_qp, first_wr, &bad_wr);
+ if (ret)
+ break;
+ return 0;
+ }
+
+ atomic_inc(&rdma_stat_sq_starve);
+ atomic_add(cc->cc_sqecount, &rdma->sc_sq_avail);
+ wait_event(rdma->sc_send_wait,
+ atomic_read(&rdma->sc_sq_avail) > cc->cc_sqecount);
+ } while (1);
+
+ pr_err("svcrdma: ib_post_send failed (%d)\n", ret);
+ set_bit(XPT_CLOSE, &xprt->xpt_flags);
+
+ /* If even one was posted, there will be a completion. */
+ if (bad_wr != first_wr)
+ return 0;
+
+ atomic_add(cc->cc_sqecount, &rdma->sc_sq_avail);
+ wake_up(&rdma->sc_send_wait);
+ return -ENOTCONN;
+}
+
+/* Build and DMA-map an SGL that covers one kvec in an xdr_buf
+ */
+static void svc_rdma_vec_to_sg(struct svc_rdma_write_info *info,
+ unsigned int len,
+ struct svc_rdma_rw_ctxt *ctxt)
+{
+ struct scatterlist *sg = ctxt->rw_sg_table.sgl;
+
+ sg_set_buf(&sg[0], info->wi_base, len);
+ info->wi_base += len;
+
+ ctxt->rw_nents = 1;
+}
+
+/* Build and DMA-map an SGL that covers part of an xdr_buf's pagelist.
+ */
+static void svc_rdma_pagelist_to_sg(struct svc_rdma_write_info *info,
+ unsigned int remaining,
+ struct svc_rdma_rw_ctxt *ctxt)
+{
+ unsigned int sge_no, sge_bytes, page_off, page_no;
+ struct xdr_buf *xdr = info->wi_xdr;
+ struct scatterlist *sg;
+ struct page **page;
+
+ page_off = (info->wi_next_off + xdr->page_base) & ~PAGE_MASK;
+ page_no = (info->wi_next_off + xdr->page_base) >> PAGE_SHIFT;
+ page = xdr->pages + page_no;
+ info->wi_next_off += remaining;
+ sg = ctxt->rw_sg_table.sgl;
+ sge_no = 0;
+ do {
+ sge_bytes = min_t(unsigned int, remaining,
+ PAGE_SIZE - page_off);
+ sg_set_page(sg, *page, sge_bytes, page_off);
+
+ remaining -= sge_bytes;
+ sg = sg_next(sg);
+ page_off = 0;
+ sge_no++;
+ page++;
+ } while (remaining);
+
+ ctxt->rw_nents = sge_no;
+}
+
+/* Construct RDMA Write WRs to send a portion of an xdr_buf containing
+ * an RPC Reply.
+ */
+static int
+svc_rdma_build_writes(struct svc_rdma_write_info *info,
+ void (*constructor)(struct svc_rdma_write_info *info,
+ unsigned int len,
+ struct svc_rdma_rw_ctxt *ctxt),
+ unsigned int remaining)
+{
+ struct svc_rdma_chunk_ctxt *cc = &info->wi_cc;
+ struct svcxprt_rdma *rdma = cc->cc_rdma;
+ struct svc_rdma_rw_ctxt *ctxt;
+ __be32 *seg;
+ int ret;
+
+ cc->cc_cqe.done = svc_rdma_write_done;
+ seg = info->wi_segs + info->wi_seg_no * rpcrdma_segment_maxsz;
+ do {
+ unsigned int write_len;
+ u32 seg_length, seg_handle;
+ u64 seg_offset;
+
+ if (info->wi_seg_no >= info->wi_nsegs)
+ goto out_overflow;
+
+ seg_handle = be32_to_cpup(seg);
+ seg_length = be32_to_cpup(seg + 1);
+ xdr_decode_hyper(seg + 2, &seg_offset);
+ seg_offset += info->wi_seg_off;
+
+ write_len = min(remaining, seg_length - info->wi_seg_off);
+ ctxt = svc_rdma_get_rw_ctxt(rdma,
+ (write_len >> PAGE_SHIFT) + 2);
+ if (!ctxt)
+ goto out_noctx;
+
+ constructor(info, write_len, ctxt);
+ ret = rdma_rw_ctx_init(&ctxt->rw_ctx, rdma->sc_qp,
+ rdma->sc_port_num, ctxt->rw_sg_table.sgl,
+ ctxt->rw_nents, 0, seg_offset,
+ seg_handle, DMA_TO_DEVICE);
+ if (ret < 0)
+ goto out_initerr;
+
+ list_add(&ctxt->rw_list, &cc->cc_rwctxts);
+ cc->cc_sqecount += ret;
+ if (write_len == seg_length - info->wi_seg_off) {
+ seg += 4;
+ info->wi_seg_no++;
+ info->wi_seg_off = 0;
+ } else {
+ info->wi_seg_off += write_len;
+ }
+ remaining -= write_len;
+ } while (remaining);
+
+ return 0;
+
+out_overflow:
+ dprintk("svcrdma: inadequate space in Write chunk (%u)\n",
+ info->wi_nsegs);
+ return -E2BIG;
+
+out_noctx:
+ dprintk("svcrdma: no R/W ctxs available\n");
+ return -ENOMEM;
+
+out_initerr:
+ svc_rdma_put_rw_ctxt(rdma, ctxt);
+ pr_err("svcrdma: failed to map pagelist (%d)\n", ret);
+ return -EIO;
+}
+
+/* Send one of an xdr_buf's kvecs by itself. To send a Reply
+ * chunk, the whole RPC Reply is written back to the client.
+ * This function writes either the head or tail of the xdr_buf
+ * containing the Reply.
+ */
+static int svc_rdma_send_xdr_kvec(struct svc_rdma_write_info *info,
+ struct kvec *vec)
+{
+ info->wi_base = vec->iov_base;
+ return svc_rdma_build_writes(info, svc_rdma_vec_to_sg,
+ vec->iov_len);
+}
+
+/* Send an xdr_buf's page list by itself. A Write chunk is
+ * just the page list. a Reply chunk is the head, page list,
+ * and tail. This function is shared between the two types
+ * of chunk.
+ */
+static int svc_rdma_send_xdr_pagelist(struct svc_rdma_write_info *info,
+ struct xdr_buf *xdr)
+{
+ info->wi_xdr = xdr;
+ info->wi_next_off = 0;
+ return svc_rdma_build_writes(info, svc_rdma_pagelist_to_sg,
+ xdr->page_len);
+}
+
+/**
+ * svc_rdma_send_write_chunk - Write all segments in a Write chunk
+ * @rdma: controlling RDMA transport
+ * @wr_ch: Write chunk provided by client
+ * @xdr: xdr_buf containing the data payload
+ *
+ * Returns a non-negative number of bytes the chunk consumed, or
+ * %-E2BIG if the payload was larger than the Write chunk,
+ * %-ENOMEM if rdma_rw context pool was exhausted,
+ * %-ENOTCONN if posting failed (connection is lost),
+ * %-EIO if rdma_rw initialization failed (DMA mapping, etc).
+ */
+int svc_rdma_send_write_chunk(struct svcxprt_rdma *rdma, __be32 *wr_ch,
+ struct xdr_buf *xdr)
+{
+ struct svc_rdma_write_info *info;
+ int ret;
+
+ if (!xdr->page_len)
+ return 0;
+
+ info = svc_rdma_write_info_alloc(rdma, wr_ch);
+ if (!info)
+ return -ENOMEM;
+
+ ret = svc_rdma_send_xdr_pagelist(info, xdr);
+ if (ret < 0)
+ goto out_err;
+
+ ret = svc_rdma_post_chunk_ctxt(&info->wi_cc);
+ if (ret < 0)
+ goto out_err;
+ return xdr->page_len;
+
+out_err:
+ svc_rdma_write_info_free(info);
+ return ret;
+}
+
+/**
+ * svc_rdma_send_reply_chunk - Write all segments in the Reply chunk
+ * @rdma: controlling RDMA transport
+ * @rp_ch: Reply chunk provided by client
+ * @writelist: true if client provided a Write list
+ * @xdr: xdr_buf containing an RPC Reply
+ *
+ * Returns a non-negative number of bytes the chunk consumed, or
+ * %-E2BIG if the payload was larger than the Reply chunk,
+ * %-ENOMEM if rdma_rw context pool was exhausted,
+ * %-ENOTCONN if posting failed (connection is lost),
+ * %-EIO if rdma_rw initialization failed (DMA mapping, etc).
+ */
+int svc_rdma_send_reply_chunk(struct svcxprt_rdma *rdma, __be32 *rp_ch,
+ bool writelist, struct xdr_buf *xdr)
+{
+ struct svc_rdma_write_info *info;
+ int consumed, ret;
+
+ info = svc_rdma_write_info_alloc(rdma, rp_ch);
+ if (!info)
+ return -ENOMEM;
+
+ ret = svc_rdma_send_xdr_kvec(info, &xdr->head[0]);
+ if (ret < 0)
+ goto out_err;
+ consumed = xdr->head[0].iov_len;
+
+ /* Send the page list in the Reply chunk only if the
+ * client did not provide Write chunks.
+ */
+ if (!writelist && xdr->page_len) {
+ ret = svc_rdma_send_xdr_pagelist(info, xdr);
+ if (ret < 0)
+ goto out_err;
+ consumed += xdr->page_len;
+ }
+
+ if (xdr->tail[0].iov_len) {
+ ret = svc_rdma_send_xdr_kvec(info, &xdr->tail[0]);
+ if (ret < 0)
+ goto out_err;
+ consumed += xdr->tail[0].iov_len;
+ }
+
+ ret = svc_rdma_post_chunk_ctxt(&info->wi_cc);
+ if (ret < 0)
+ goto out_err;
+ return consumed;
+
+out_err:
+ svc_rdma_write_info_free(info);
+ return ret;
+}
diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
index 515221b16d09..1736337f3a55 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
@@ -1,4 +1,5 @@
/*
+ * Copyright (c) 2016 Oracle. All rights reserved.
* Copyright (c) 2014 Open Grid Computing, Inc. All rights reserved.
* Copyright (c) 2005-2006 Network Appliance, Inc. All rights reserved.
*
@@ -40,6 +41,63 @@
* Author: Tom Tucker <tom@opengridcomputing.com>
*/
+/* Operation
+ *
+ * The main entry point is svc_rdma_sendto. This is called by the
+ * RPC server when an RPC Reply is ready to be transmitted to a client.
+ *
+ * The passed-in svc_rqst contains a struct xdr_buf which holds an
+ * XDR-encoded RPC Reply message. sendto must construct the RPC-over-RDMA
+ * transport header, post all Write WRs needed for this Reply, then post
+ * a Send WR conveying the transport header and the RPC message itself to
+ * the client.
+ *
+ * svc_rdma_sendto must fully transmit the Reply before returning, as
+ * the svc_rqst will be recycled as soon as sendto returns. Remaining
+ * resources referred to by the svc_rqst are also recycled at that time.
+ * Therefore any resources that must remain longer must be detached
+ * from the svc_rqst and released later.
+ *
+ * Page Management
+ *
+ * The I/O that performs Reply transmission is asynchronous, and may
+ * complete well after sendto returns. Thus pages under I/O must be
+ * removed from the svc_rqst before sendto returns.
+ *
+ * The logic here depends on Send Queue and completion ordering. Since
+ * the Send WR is always posted last, it will always complete last. Thus
+ * when it completes, it is guaranteed that all previous Write WRs have
+ * also completed.
+ *
+ * Write WRs are constructed and posted. Each Write segment gets its own
+ * svc_rdma_rw_ctxt, allowing the Write completion handler to find and
+ * DMA-unmap the pages under I/O for that Write segment. The Write
+ * completion handler does not release any pages.
+ *
+ * When the Send WR is constructed, it also gets its own svc_rdma_op_ctxt.
+ * The ownership of all of the Reply's pages are transferred into that
+ * ctxt, the Send WR is posted, and sendto returns.
+ *
+ * The svc_rdma_op_ctxt is presented when the Send WR completes. The
+ * Send completion handler finally releases the Reply's pages.
+ *
+ * This mechanism also assumes that completions on the transport's Send
+ * Completion Queue do not run in parallel. Otherwise a Write completion
+ * and Send completion running at the same time could release pages that
+ * are still DMA-mapped.
+ *
+ * Error Handling
+ *
+ * - If the Send WR is posted successfully, it will either complete
+ * successfully, or get flushed. Either way, the Send completion
+ * handler releases the Reply's pages.
+ * - If the Send WR cannot be not posted, the forward path releases
+ * the Reply's pages.
+ *
+ * This handles the case, without the use of page reference counting,
+ * where two different Write segments send portions of the same page.
+ */
+
#include <linux/sunrpc/debug.h>
#include <linux/sunrpc/rpc_rdma.h>
#include <linux/spinlock.h>
@@ -55,113 +113,141 @@ static u32 xdr_padsize(u32 len)
return (len & 3) ? (4 - (len & 3)) : 0;
}
-int svc_rdma_map_xdr(struct svcxprt_rdma *xprt,
- struct xdr_buf *xdr,
- struct svc_rdma_req_map *vec,
- bool write_chunk_present)
+/* Returns length of transport header, in bytes.
+ */
+static unsigned int svc_rdma_reply_hdr_len(__be32 *rdma_resp)
{
- int sge_no;
- u32 sge_bytes;
- u32 page_bytes;
- u32 page_off;
- int page_no;
-
- if (xdr->len !=
- (xdr->head[0].iov_len + xdr->page_len + xdr->tail[0].iov_len)) {
- pr_err("svcrdma: %s: XDR buffer length error\n", __func__);
- return -EIO;
- }
+ unsigned int nsegs;
+ __be32 *p;
- /* Skip the first sge, this is for the RPCRDMA header */
- sge_no = 1;
+ p = rdma_resp;
+
+ /* RPC-over-RDMA V1 replies never have a Read list. */
+ p += rpcrdma_fixed_maxsz + 1;
- /* Head SGE */
- vec->sge[sge_no].iov_base = xdr->head[0].iov_base;
- vec->sge[sge_no].iov_len = xdr->head[0].iov_len;
- sge_no++;
-
- /* pages SGE */
- page_no = 0;
- page_bytes = xdr->page_len;
- page_off = xdr->page_base;
- while (page_bytes) {
- vec->sge[sge_no].iov_base =
- page_address(xdr->pages[page_no]) + page_off;
- sge_bytes = min_t(u32, page_bytes, (PAGE_SIZE - page_off));
- page_bytes -= sge_bytes;
- vec->sge[sge_no].iov_len = sge_bytes;
-
- sge_no++;
- page_no++;
- page_off = 0; /* reset for next time through loop */
+ /* Skip Write list. */
+ while (*p++ != xdr_zero) {
+ nsegs = be32_to_cpup(p++);
+ p += nsegs * rpcrdma_segment_maxsz;
}
- /* Tail SGE */
- if (xdr->tail[0].iov_len) {
- unsigned char *base = xdr->tail[0].iov_base;
- size_t len = xdr->tail[0].iov_len;
- u32 xdr_pad = xdr_padsize(xdr->page_len);
+ /* Skip Reply chunk. */
+ if (*p++ != xdr_zero) {
+ nsegs = be32_to_cpup(p++);
+ p += nsegs * rpcrdma_segment_maxsz;
+ }
- if (write_chunk_present && xdr_pad) {
- base += xdr_pad;
- len -= xdr_pad;
- }
+ return (unsigned long)p - (unsigned long)rdma_resp;
+}
- if (len) {
- vec->sge[sge_no].iov_base = base;
- vec->sge[sge_no].iov_len = len;
- sge_no++;
+/* One Write chunk is copied from Call transport header to Reply
+ * transport header. Each segment's length field is updated to
+ * reflect number of bytes consumed in the segment.
+ *
+ * Returns number of segments in this chunk.
+ */
+static unsigned int xdr_encode_write_chunk(__be32 *dst, __be32 *src,
+ unsigned int remaining)
+{
+ unsigned int i, nsegs;
+ u32 seg_len;
+
+ /* Write list discriminator */
+ *dst++ = *src++;
+
+ /* number of segments in this chunk */
+ nsegs = be32_to_cpup(src);
+ *dst++ = *src++;
+
+ for (i = nsegs; i; i--) {
+ /* segment's RDMA handle */
+ *dst++ = *src++;
+
+ /* bytes returned in this segment */
+ seg_len = be32_to_cpu(*src);
+ if (remaining >= seg_len) {
+ /* entire segment was consumed */
+ *dst = *src;
+ remaining -= seg_len;
+ } else {
+ /* segment only partly filled */
+ *dst = cpu_to_be32(remaining);
+ remaining = 0;
}
- }
+ dst++; src++;
- dprintk("svcrdma: %s: sge_no %d page_no %d "
- "page_base %u page_len %u head_len %zu tail_len %zu\n",
- __func__, sge_no, page_no, xdr->page_base, xdr->page_len,
- xdr->head[0].iov_len, xdr->tail[0].iov_len);
+ /* segment's RDMA offset */
+ *dst++ = *src++;
+ *dst++ = *src++;
+ }
- vec->count = sge_no;
- return 0;
+ return nsegs;
}
-static dma_addr_t dma_map_xdr(struct svcxprt_rdma *xprt,
- struct xdr_buf *xdr,
- u32 xdr_off, size_t len, int dir)
+/* The client provided a Write list in the Call message. Fill in
+ * the segments in the first Write chunk in the Reply's transport
+ * header with the number of bytes consumed in each segment.
+ * Remaining chunks are returned unused.
+ *
+ * Assumptions:
+ * - Client has provided only one Write chunk
+ */
+static void svc_rdma_xdr_encode_write_list(__be32 *rdma_resp, __be32 *wr_ch,
+ unsigned int consumed)
{
- struct page *page;
- dma_addr_t dma_addr;
- if (xdr_off < xdr->head[0].iov_len) {
- /* This offset is in the head */
- xdr_off += (unsigned long)xdr->head[0].iov_base & ~PAGE_MASK;
- page = virt_to_page(xdr->head[0].iov_base);
- } else {
- xdr_off -= xdr->head[0].iov_len;
- if (xdr_off < xdr->page_len) {
- /* This offset is in the page list */
- xdr_off += xdr->page_base;
- page = xdr->pages[xdr_off >> PAGE_SHIFT];
- xdr_off &= ~PAGE_MASK;
- } else {
- /* This offset is in the tail */
- xdr_off -= xdr->page_len;
- xdr_off += (unsigned long)
- xdr->tail[0].iov_base & ~PAGE_MASK;
- page = virt_to_page(xdr->tail[0].iov_base);
- }
+ unsigned int nsegs;
+ __be32 *p, *q;
+
+ /* RPC-over-RDMA V1 replies never have a Read list. */
+ p = rdma_resp + rpcrdma_fixed_maxsz + 1;
+
+ q = wr_ch;
+ while (*q != xdr_zero) {
+ nsegs = xdr_encode_write_chunk(p, q, consumed);
+ q += 2 + nsegs * rpcrdma_segment_maxsz;
+ p += 2 + nsegs * rpcrdma_segment_maxsz;
+ consumed = 0;
}
- dma_addr = ib_dma_map_page(xprt->sc_cm_id->device, page, xdr_off,
- min_t(size_t, PAGE_SIZE, len), dir);
- return dma_addr;
+
+ /* Terminate Write list */
+ *p++ = xdr_zero;
+
+ /* Reply chunk discriminator; may be replaced later */
+ *p = xdr_zero;
+}
+
+/* The client provided a Reply chunk in the Call message. Fill in
+ * the segments in the Reply chunk in the Reply message with the
+ * number of bytes consumed in each segment.
+ *
+ * Assumptions:
+ * - Reply can always fit in the provided Reply chunk
+ */
+static void svc_rdma_xdr_encode_reply_chunk(__be32 *rdma_resp, __be32 *rp_ch,
+ unsigned int consumed)
+{
+ __be32 *p;
+
+ /* Find the Reply chunk in the Reply's xprt header.
+ * RPC-over-RDMA V1 replies never have a Read list.
+ */
+ p = rdma_resp + rpcrdma_fixed_maxsz + 1;
+
+ /* Skip past Write list */
+ while (*p++ != xdr_zero)
+ p += 1 + be32_to_cpup(p) * rpcrdma_segment_maxsz;
+
+ xdr_encode_write_chunk(p, rp_ch, consumed);
}
/* Parse the RPC Call's transport header.
*/
-static void svc_rdma_get_write_arrays(struct rpcrdma_msg *rmsgp,
- struct rpcrdma_write_array **write,
- struct rpcrdma_write_array **reply)
+static void svc_rdma_get_write_arrays(__be32 *rdma_argp,
+ __be32 **write, __be32 **reply)
{
__be32 *p;
- p = (__be32 *)&rmsgp->rm_body.rm_chunks[0];
+ p = rdma_argp + rpcrdma_fixed_maxsz;
/* Read list */
while (*p++ != xdr_zero)
@@ -169,7 +255,7 @@ static void svc_rdma_get_write_arrays(struct rpcrdma_msg *rmsgp,
/* Write list */
if (*p != xdr_zero) {
- *write = (struct rpcrdma_write_array *)p;
+ *write = p;
while (*p++ != xdr_zero)
p += 1 + be32_to_cpu(*p) * 4;
} else {
@@ -179,7 +265,7 @@ static void svc_rdma_get_write_arrays(struct rpcrdma_msg *rmsgp,
/* Reply chunk */
if (*p != xdr_zero)
- *reply = (struct rpcrdma_write_array *)p;
+ *reply = p;
else
*reply = NULL;
}
@@ -189,360 +275,321 @@ static void svc_rdma_get_write_arrays(struct rpcrdma_msg *rmsgp,
* Invalidate, and responder chooses one rkey to invalidate.
*
* Find a candidate rkey to invalidate when sending a reply. Picks the
- * first rkey it finds in the chunks lists.
+ * first R_key it finds in the chunk lists.
*
* Returns zero if RPC's chunk lists are empty.
*/
-static u32 svc_rdma_get_inv_rkey(struct rpcrdma_msg *rdma_argp,
- struct rpcrdma_write_array *wr_ary,
- struct rpcrdma_write_array *rp_ary)
+static u32 svc_rdma_get_inv_rkey(__be32 *rdma_argp,
+ __be32 *wr_lst, __be32 *rp_ch)
{
- struct rpcrdma_read_chunk *rd_ary;
- struct rpcrdma_segment *arg_ch;
+ __be32 *p;
- rd_ary = (struct rpcrdma_read_chunk *)&rdma_argp->rm_body.rm_chunks[0];
- if (rd_ary->rc_discrim != xdr_zero)
- return be32_to_cpu(rd_ary->rc_target.rs_handle);
+ p = rdma_argp + rpcrdma_fixed_maxsz;
+ if (*p != xdr_zero)
+ p += 2;
+ else if (wr_lst && be32_to_cpup(wr_lst + 1))
+ p = wr_lst + 2;
+ else if (rp_ch && be32_to_cpup(rp_ch + 1))
+ p = rp_ch + 2;
+ else
+ return 0;
+ return be32_to_cpup(p);
+}
- if (wr_ary && be32_to_cpu(wr_ary->wc_nchunks)) {
- arg_ch = &wr_ary->wc_array[0].wc_target;
- return be32_to_cpu(arg_ch->rs_handle);
- }
+/* ib_dma_map_page() is used here because svc_rdma_dma_unmap()
+ * is used during completion to DMA-unmap this memory, and
+ * it uses ib_dma_unmap_page() exclusively.
+ */
+static int svc_rdma_dma_map_buf(struct svcxprt_rdma *rdma,
+ struct svc_rdma_op_ctxt *ctxt,
+ unsigned int sge_no,
+ unsigned char *base,
+ unsigned int len)
+{
+ unsigned long offset = (unsigned long)base & ~PAGE_MASK;
+ struct ib_device *dev = rdma->sc_cm_id->device;
+ dma_addr_t dma_addr;
- if (rp_ary && be32_to_cpu(rp_ary->wc_nchunks)) {
- arg_ch = &rp_ary->wc_array[0].wc_target;
- return be32_to_cpu(arg_ch->rs_handle);
- }
+ dma_addr = ib_dma_map_page(dev, virt_to_page(base),
+ offset, len, DMA_TO_DEVICE);
+ if (ib_dma_mapping_error(dev, dma_addr))
+ return -EIO;
+ ctxt->sge[sge_no].addr = dma_addr;
+ ctxt->sge[sge_no].length = len;
+ ctxt->sge[sge_no].lkey = rdma->sc_pd->local_dma_lkey;
+ svc_rdma_count_mappings(rdma, ctxt);
return 0;
}
-/* Assumptions:
- * - The specified write_len can be represented in sc_max_sge * PAGE_SIZE
- */
-static int send_write(struct svcxprt_rdma *xprt, struct svc_rqst *rqstp,
- u32 rmr, u64 to,
- u32 xdr_off, int write_len,
- struct svc_rdma_req_map *vec)
+static int svc_rdma_dma_map_page(struct svcxprt_rdma *rdma,
+ struct svc_rdma_op_ctxt *ctxt,
+ unsigned int sge_no,
+ struct page *page,
+ unsigned int offset,
+ unsigned int len)
{
- struct ib_rdma_wr write_wr;
- struct ib_sge *sge;
- int xdr_sge_no;
- int sge_no;
- int sge_bytes;
- int sge_off;
- int bc;
- struct svc_rdma_op_ctxt *ctxt;
+ struct ib_device *dev = rdma->sc_cm_id->device;
+ dma_addr_t dma_addr;
- if (vec->count > RPCSVC_MAXPAGES) {
- pr_err("svcrdma: Too many pages (%lu)\n", vec->count);
+ dma_addr = ib_dma_map_page(dev, page, offset, len, DMA_TO_DEVICE);
+ if (ib_dma_mapping_error(dev, dma_addr))
return -EIO;
- }
- dprintk("svcrdma: RDMA_WRITE rmr=%x, to=%llx, xdr_off=%d, "
- "write_len=%d, vec->sge=%p, vec->count=%lu\n",
- rmr, (unsigned long long)to, xdr_off,
- write_len, vec->sge, vec->count);
+ ctxt->sge[sge_no].addr = dma_addr;
+ ctxt->sge[sge_no].length = len;
+ ctxt->sge[sge_no].lkey = rdma->sc_pd->local_dma_lkey;
+ svc_rdma_count_mappings(rdma, ctxt);
+ return 0;
+}
- ctxt = svc_rdma_get_context(xprt);
+/**
+ * svc_rdma_map_reply_hdr - DMA map the transport header buffer
+ * @rdma: controlling transport
+ * @ctxt: op_ctxt for the Send WR
+ * @rdma_resp: buffer containing transport header
+ * @len: length of transport header
+ *
+ * Returns:
+ * %0 if the header is DMA mapped,
+ * %-EIO if DMA mapping failed.
+ */
+int svc_rdma_map_reply_hdr(struct svcxprt_rdma *rdma,
+ struct svc_rdma_op_ctxt *ctxt,
+ __be32 *rdma_resp,
+ unsigned int len)
+{
ctxt->direction = DMA_TO_DEVICE;
- sge = ctxt->sge;
-
- /* Find the SGE associated with xdr_off */
- for (bc = xdr_off, xdr_sge_no = 1; bc && xdr_sge_no < vec->count;
- xdr_sge_no++) {
- if (vec->sge[xdr_sge_no].iov_len > bc)
- break;
- bc -= vec->sge[xdr_sge_no].iov_len;
- }
-
- sge_off = bc;
- bc = write_len;
- sge_no = 0;
-
- /* Copy the remaining SGE */
- while (bc != 0) {
- sge_bytes = min_t(size_t,
- bc, vec->sge[xdr_sge_no].iov_len-sge_off);
- sge[sge_no].length = sge_bytes;
- sge[sge_no].addr =
- dma_map_xdr(xprt, &rqstp->rq_res, xdr_off,
- sge_bytes, DMA_TO_DEVICE);
- xdr_off += sge_bytes;
- if (ib_dma_mapping_error(xprt->sc_cm_id->device,
- sge[sge_no].addr))
- goto err;
- svc_rdma_count_mappings(xprt, ctxt);
- sge[sge_no].lkey = xprt->sc_pd->local_dma_lkey;
- ctxt->count++;
- sge_off = 0;
- sge_no++;
- xdr_sge_no++;
- if (xdr_sge_no > vec->count) {
- pr_err("svcrdma: Too many sges (%d)\n", xdr_sge_no);
- goto err;
- }
- bc -= sge_bytes;
- if (sge_no == xprt->sc_max_sge)
- break;
- }
-
- /* Prepare WRITE WR */
- memset(&write_wr, 0, sizeof write_wr);
- ctxt->cqe.done = svc_rdma_wc_write;
- write_wr.wr.wr_cqe = &ctxt->cqe;
- write_wr.wr.sg_list = &sge[0];
- write_wr.wr.num_sge = sge_no;
- write_wr.wr.opcode = IB_WR_RDMA_WRITE;
- write_wr.wr.send_flags = IB_SEND_SIGNALED;
- write_wr.rkey = rmr;
- write_wr.remote_addr = to;
-
- /* Post It */
- atomic_inc(&rdma_stat_write);
- if (svc_rdma_send(xprt, &write_wr.wr))
- goto err;
- return write_len - bc;
- err:
- svc_rdma_unmap_dma(ctxt);
- svc_rdma_put_context(ctxt, 0);
- return -EIO;
+ ctxt->pages[0] = virt_to_page(rdma_resp);
+ ctxt->count = 1;
+ return svc_rdma_dma_map_page(rdma, ctxt, 0, ctxt->pages[0], 0, len);
}
-noinline
-static int send_write_chunks(struct svcxprt_rdma *xprt,
- struct rpcrdma_write_array *wr_ary,
- struct rpcrdma_msg *rdma_resp,
- struct svc_rqst *rqstp,
- struct svc_rdma_req_map *vec)
+/* Load the xdr_buf into the ctxt's sge array, and DMA map each
+ * element as it is added.
+ *
+ * Returns the number of sge elements loaded on success, or
+ * a negative errno on failure.
+ */
+static int svc_rdma_map_reply_msg(struct svcxprt_rdma *rdma,
+ struct svc_rdma_op_ctxt *ctxt,
+ struct xdr_buf *xdr, __be32 *wr_lst)
{
- u32 xfer_len = rqstp->rq_res.page_len;
- int write_len;
- u32 xdr_off;
- int chunk_off;
- int chunk_no;
- int nchunks;
- struct rpcrdma_write_array *res_ary;
+ unsigned int len, sge_no, remaining, page_off;
+ struct page **ppages;
+ unsigned char *base;
+ u32 xdr_pad;
int ret;
- res_ary = (struct rpcrdma_write_array *)
- &rdma_resp->rm_body.rm_chunks[1];
-
- /* Write chunks start at the pagelist */
- nchunks = be32_to_cpu(wr_ary->wc_nchunks);
- for (xdr_off = rqstp->rq_res.head[0].iov_len, chunk_no = 0;
- xfer_len && chunk_no < nchunks;
- chunk_no++) {
- struct rpcrdma_segment *arg_ch;
- u64 rs_offset;
-
- arg_ch = &wr_ary->wc_array[chunk_no].wc_target;
- write_len = min(xfer_len, be32_to_cpu(arg_ch->rs_length));
-
- /* Prepare the response chunk given the length actually
- * written */
- xdr_decode_hyper((__be32 *)&arg_ch->rs_offset, &rs_offset);
- svc_rdma_xdr_encode_array_chunk(res_ary, chunk_no,
- arg_ch->rs_handle,
- arg_ch->rs_offset,
- write_len);
- chunk_off = 0;
- while (write_len) {
- ret = send_write(xprt, rqstp,
- be32_to_cpu(arg_ch->rs_handle),
- rs_offset + chunk_off,
- xdr_off,
- write_len,
- vec);
- if (ret <= 0)
- goto out_err;
- chunk_off += ret;
- xdr_off += ret;
- xfer_len -= ret;
- write_len -= ret;
+ sge_no = 1;
+
+ ret = svc_rdma_dma_map_buf(rdma, ctxt, sge_no++,
+ xdr->head[0].iov_base,
+ xdr->head[0].iov_len);
+ if (ret < 0)
+ return ret;
+
+ /* If a Write chunk is present, the xdr_buf's page list
+ * is not included inline. However the Upper Layer may
+ * have added XDR padding in the tail buffer, and that
+ * should not be included inline.
+ */
+ if (wr_lst) {
+ base = xdr->tail[0].iov_base;
+ len = xdr->tail[0].iov_len;
+ xdr_pad = xdr_padsize(xdr->page_len);
+
+ if (len && xdr_pad) {
+ base += xdr_pad;
+ len -= xdr_pad;
}
+
+ goto tail;
+ }
+
+ ppages = xdr->pages + (xdr->page_base >> PAGE_SHIFT);
+ page_off = xdr->page_base & ~PAGE_MASK;
+ remaining = xdr->page_len;
+ while (remaining) {
+ len = min_t(u32, PAGE_SIZE - page_off, remaining);
+
+ ret = svc_rdma_dma_map_page(rdma, ctxt, sge_no++,
+ *ppages++, page_off, len);
+ if (ret < 0)
+ return ret;
+
+ remaining -= len;
+ page_off = 0;
}
- /* Update the req with the number of chunks actually used */
- svc_rdma_xdr_encode_write_list(rdma_resp, chunk_no);
- return rqstp->rq_res.page_len;
+ base = xdr->tail[0].iov_base;
+ len = xdr->tail[0].iov_len;
+tail:
+ if (len) {
+ ret = svc_rdma_dma_map_buf(rdma, ctxt, sge_no++, base, len);
+ if (ret < 0)
+ return ret;
+ }
-out_err:
- pr_err("svcrdma: failed to send write chunks, rc=%d\n", ret);
- return -EIO;
+ return sge_no - 1;
}
-noinline
-static int send_reply_chunks(struct svcxprt_rdma *xprt,
- struct rpcrdma_write_array *rp_ary,
- struct rpcrdma_msg *rdma_resp,
- struct svc_rqst *rqstp,
- struct svc_rdma_req_map *vec)
+/* The svc_rqst and all resources it owns are released as soon as
+ * svc_rdma_sendto returns. Transfer pages under I/O to the ctxt
+ * so they are released by the Send completion handler.
+ */
+static void svc_rdma_save_io_pages(struct svc_rqst *rqstp,
+ struct svc_rdma_op_ctxt *ctxt)
{
- u32 xfer_len = rqstp->rq_res.len;
- int write_len;
- u32 xdr_off;
- int chunk_no;
- int chunk_off;
- int nchunks;
- struct rpcrdma_segment *ch;
- struct rpcrdma_write_array *res_ary;
- int ret;
+ int i, pages = rqstp->rq_next_page - rqstp->rq_respages;
- /* XXX: need to fix when reply lists occur with read-list and or
- * write-list */
- res_ary = (struct rpcrdma_write_array *)
- &rdma_resp->rm_body.rm_chunks[2];
-
- /* xdr offset starts at RPC message */
- nchunks = be32_to_cpu(rp_ary->wc_nchunks);
- for (xdr_off = 0, chunk_no = 0;
- xfer_len && chunk_no < nchunks;
- chunk_no++) {
- u64 rs_offset;
- ch = &rp_ary->wc_array[chunk_no].wc_target;
- write_len = min(xfer_len, be32_to_cpu(ch->rs_length));
-
- /* Prepare the reply chunk given the length actually
- * written */
- xdr_decode_hyper((__be32 *)&ch->rs_offset, &rs_offset);
- svc_rdma_xdr_encode_array_chunk(res_ary, chunk_no,
- ch->rs_handle, ch->rs_offset,
- write_len);
- chunk_off = 0;
- while (write_len) {
- ret = send_write(xprt, rqstp,
- be32_to_cpu(ch->rs_handle),
- rs_offset + chunk_off,
- xdr_off,
- write_len,
- vec);
- if (ret <= 0)
- goto out_err;
- chunk_off += ret;
- xdr_off += ret;
- xfer_len -= ret;
- write_len -= ret;
- }
+ ctxt->count += pages;
+ for (i = 0; i < pages; i++) {
+ ctxt->pages[i + 1] = rqstp->rq_respages[i];
+ rqstp->rq_respages[i] = NULL;
}
- /* Update the req with the number of chunks actually used */
- svc_rdma_xdr_encode_reply_array(res_ary, chunk_no);
+ rqstp->rq_next_page = rqstp->rq_respages + 1;
+}
- return rqstp->rq_res.len;
+/**
+ * svc_rdma_post_send_wr - Set up and post one Send Work Request
+ * @rdma: controlling transport
+ * @ctxt: op_ctxt for transmitting the Send WR
+ * @num_sge: number of SGEs to send
+ * @inv_rkey: R_key argument to Send With Invalidate, or zero
+ *
+ * Returns:
+ * %0 if the Send* was posted successfully,
+ * %-ENOTCONN if the connection was lost or dropped,
+ * %-EINVAL if there was a problem with the Send we built,
+ * %-ENOMEM if ib_post_send failed.
+ */
+int svc_rdma_post_send_wr(struct svcxprt_rdma *rdma,
+ struct svc_rdma_op_ctxt *ctxt, int num_sge,
+ u32 inv_rkey)
+{
+ struct ib_send_wr *send_wr = &ctxt->send_wr;
-out_err:
- pr_err("svcrdma: failed to send reply chunks, rc=%d\n", ret);
- return -EIO;
+ dprintk("svcrdma: posting Send WR with %u sge(s)\n", num_sge);
+
+ send_wr->next = NULL;
+ ctxt->cqe.done = svc_rdma_wc_send;
+ send_wr->wr_cqe = &ctxt->cqe;
+ send_wr->sg_list = ctxt->sge;
+ send_wr->num_sge = num_sge;
+ send_wr->send_flags = IB_SEND_SIGNALED;
+ if (inv_rkey) {
+ send_wr->opcode = IB_WR_SEND_WITH_INV;
+ send_wr->ex.invalidate_rkey = inv_rkey;
+ } else {
+ send_wr->opcode = IB_WR_SEND;
+ }
+
+ return svc_rdma_send(rdma, send_wr);
}
-/* This function prepares the portion of the RPCRDMA message to be
- * sent in the RDMA_SEND. This function is called after data sent via
- * RDMA has already been transmitted. There are three cases:
- * - The RPCRDMA header, RPC header, and payload are all sent in a
- * single RDMA_SEND. This is the "inline" case.
- * - The RPCRDMA header and some portion of the RPC header and data
- * are sent via this RDMA_SEND and another portion of the data is
- * sent via RDMA.
- * - The RPCRDMA header [NOMSG] is sent in this RDMA_SEND and the RPC
- * header and data are all transmitted via RDMA.
- * In all three cases, this function prepares the RPCRDMA header in
- * sge[0], the 'type' parameter indicates the type to place in the
- * RPCRDMA header, and the 'byte_count' field indicates how much of
- * the XDR to include in this RDMA_SEND. NB: The offset of the payload
- * to send is zero in the XDR.
+/* Prepare the portion of the RPC Reply that will be transmitted
+ * via RDMA Send. The RPC-over-RDMA transport header is prepared
+ * in sge[0], and the RPC xdr_buf is prepared in following sges.
+ *
+ * Depending on whether a Write list or Reply chunk is present,
+ * the server may send all, a portion of, or none of the xdr_buf.
+ * In the latter case, only the transport header (sge[0]) is
+ * transmitted.
+ *
+ * RDMA Send is the last step of transmitting an RPC reply. Pages
+ * involved in the earlier RDMA Writes are here transferred out
+ * of the rqstp and into the ctxt's page array. These pages are
+ * DMA unmapped by each Write completion, but the subsequent Send
+ * completion finally releases these pages.
+ *
+ * Assumptions:
+ * - The Reply's transport header will never be larger than a page.
*/
-static int send_reply(struct svcxprt_rdma *rdma,
- struct svc_rqst *rqstp,
- struct page *page,
- struct rpcrdma_msg *rdma_resp,
- struct svc_rdma_req_map *vec,
- int byte_count,
- u32 inv_rkey)
+static int svc_rdma_send_reply_msg(struct svcxprt_rdma *rdma,
+ __be32 *rdma_argp, __be32 *rdma_resp,
+ struct svc_rqst *rqstp,
+ __be32 *wr_lst, __be32 *rp_ch)
{
struct svc_rdma_op_ctxt *ctxt;
- struct ib_send_wr send_wr;
- u32 xdr_off;
- int sge_no;
- int sge_bytes;
- int page_no;
- int pages;
- int ret = -EIO;
-
- /* Prepare the context */
+ u32 inv_rkey;
+ int ret;
+
+ dprintk("svcrdma: sending %s reply: head=%zu, pagelen=%u, tail=%zu\n",
+ (rp_ch ? "RDMA_NOMSG" : "RDMA_MSG"),
+ rqstp->rq_res.head[0].iov_len,
+ rqstp->rq_res.page_len,
+ rqstp->rq_res.tail[0].iov_len);
+
ctxt = svc_rdma_get_context(rdma);
- ctxt->direction = DMA_TO_DEVICE;
- ctxt->pages[0] = page;
- ctxt->count = 1;
- /* Prepare the SGE for the RPCRDMA Header */
- ctxt->sge[0].lkey = rdma->sc_pd->local_dma_lkey;
- ctxt->sge[0].length =
- svc_rdma_xdr_get_reply_hdr_len((__be32 *)rdma_resp);
- ctxt->sge[0].addr =
- ib_dma_map_page(rdma->sc_cm_id->device, page, 0,
- ctxt->sge[0].length, DMA_TO_DEVICE);
- if (ib_dma_mapping_error(rdma->sc_cm_id->device, ctxt->sge[0].addr))
+ ret = svc_rdma_map_reply_hdr(rdma, ctxt, rdma_resp,
+ svc_rdma_reply_hdr_len(rdma_resp));
+ if (ret < 0)
goto err;
- svc_rdma_count_mappings(rdma, ctxt);
-
- ctxt->direction = DMA_TO_DEVICE;
- /* Map the payload indicated by 'byte_count' */
- xdr_off = 0;
- for (sge_no = 1; byte_count && sge_no < vec->count; sge_no++) {
- sge_bytes = min_t(size_t, vec->sge[sge_no].iov_len, byte_count);
- byte_count -= sge_bytes;
- ctxt->sge[sge_no].addr =
- dma_map_xdr(rdma, &rqstp->rq_res, xdr_off,
- sge_bytes, DMA_TO_DEVICE);
- xdr_off += sge_bytes;
- if (ib_dma_mapping_error(rdma->sc_cm_id->device,
- ctxt->sge[sge_no].addr))
+ if (!rp_ch) {
+ ret = svc_rdma_map_reply_msg(rdma, ctxt,
+ &rqstp->rq_res, wr_lst);
+ if (ret < 0)
goto err;
- svc_rdma_count_mappings(rdma, ctxt);
- ctxt->sge[sge_no].lkey = rdma->sc_pd->local_dma_lkey;
- ctxt->sge[sge_no].length = sge_bytes;
}
- if (byte_count != 0) {
- pr_err("svcrdma: Could not map %d bytes\n", byte_count);
+
+ svc_rdma_save_io_pages(rqstp, ctxt);
+
+ inv_rkey = 0;
+ if (rdma->sc_snd_w_inv)
+ inv_rkey = svc_rdma_get_inv_rkey(rdma_argp, wr_lst, rp_ch);
+ ret = svc_rdma_post_send_wr(rdma, ctxt, 1 + ret, inv_rkey);
+ if (ret)
goto err;
- }
- /* Save all respages in the ctxt and remove them from the
- * respages array. They are our pages until the I/O
- * completes.
+ return 0;
+
+err:
+ pr_err("svcrdma: failed to post Send WR (%d)\n", ret);
+ svc_rdma_unmap_dma(ctxt);
+ svc_rdma_put_context(ctxt, 1);
+ return ret;
+}
+
+/* Given the client-provided Write and Reply chunks, the server was not
+ * able to form a complete reply. Return an RDMA_ERROR message so the
+ * client can retire this RPC transaction. As above, the Send completion
+ * routine releases payload pages that were part of a previous RDMA Write.
+ *
+ * Remote Invalidation is skipped for simplicity.
+ */
+static int svc_rdma_send_error_msg(struct svcxprt_rdma *rdma,
+ __be32 *rdma_resp, struct svc_rqst *rqstp)
+{
+ struct svc_rdma_op_ctxt *ctxt;
+ __be32 *p;
+ int ret;
+
+ ctxt = svc_rdma_get_context(rdma);
+
+ /* Replace the original transport header with an
+ * RDMA_ERROR response. XID etc are preserved.
*/
- pages = rqstp->rq_next_page - rqstp->rq_respages;
- for (page_no = 0; page_no < pages; page_no++) {
- ctxt->pages[page_no+1] = rqstp->rq_respages[page_no];
- ctxt->count++;
- rqstp->rq_respages[page_no] = NULL;
- }
- rqstp->rq_next_page = rqstp->rq_respages + 1;
+ p = rdma_resp + 3;
+ *p++ = rdma_error;
+ *p = err_chunk;
- if (sge_no > rdma->sc_max_sge) {
- pr_err("svcrdma: Too many sges (%d)\n", sge_no);
+ ret = svc_rdma_map_reply_hdr(rdma, ctxt, rdma_resp, 20);
+ if (ret < 0)
goto err;
- }
- memset(&send_wr, 0, sizeof send_wr);
- ctxt->cqe.done = svc_rdma_wc_send;
- send_wr.wr_cqe = &ctxt->cqe;
- send_wr.sg_list = ctxt->sge;
- send_wr.num_sge = sge_no;
- if (inv_rkey) {
- send_wr.opcode = IB_WR_SEND_WITH_INV;
- send_wr.ex.invalidate_rkey = inv_rkey;
- } else
- send_wr.opcode = IB_WR_SEND;
- send_wr.send_flags = IB_SEND_SIGNALED;
- ret = svc_rdma_send(rdma, &send_wr);
+ svc_rdma_save_io_pages(rqstp, ctxt);
+
+ ret = svc_rdma_post_send_wr(rdma, ctxt, 1 + ret, 0);
if (ret)
goto err;
return 0;
- err:
+err:
+ pr_err("svcrdma: failed to post Send WR (%d)\n", ret);
svc_rdma_unmap_dma(ctxt);
svc_rdma_put_context(ctxt, 1);
return ret;
@@ -552,39 +599,36 @@ void svc_rdma_prep_reply_hdr(struct svc_rqst *rqstp)
{
}
+/**
+ * svc_rdma_sendto - Transmit an RPC reply
+ * @rqstp: processed RPC request, reply XDR already in ::rq_res
+ *
+ * Any resources still associated with @rqstp are released upon return.
+ * If no reply message was possible, the connection is closed.
+ *
+ * Returns:
+ * %0 if an RPC reply has been successfully posted,
+ * %-ENOMEM if a resource shortage occurred (connection is lost),
+ * %-ENOTCONN if posting failed (connection is lost).
+ */
int svc_rdma_sendto(struct svc_rqst *rqstp)
{
struct svc_xprt *xprt = rqstp->rq_xprt;
struct svcxprt_rdma *rdma =
container_of(xprt, struct svcxprt_rdma, sc_xprt);
- struct rpcrdma_msg *rdma_argp;
- struct rpcrdma_msg *rdma_resp;
- struct rpcrdma_write_array *wr_ary, *rp_ary;
- int ret;
- int inline_bytes;
+ __be32 *p, *rdma_argp, *rdma_resp, *wr_lst, *rp_ch;
+ struct xdr_buf *xdr = &rqstp->rq_res;
struct page *res_page;
- struct svc_rdma_req_map *vec;
- u32 inv_rkey;
- __be32 *p;
-
- dprintk("svcrdma: sending response for rqstp=%p\n", rqstp);
+ int ret;
- /* Get the RDMA request header. The receive logic always
- * places this at the start of page 0.
+ /* Find the call's chunk lists to decide how to send the reply.
+ * Receive places the Call's xprt header at the start of page 0.
*/
rdma_argp = page_address(rqstp->rq_pages[0]);
- svc_rdma_get_write_arrays(rdma_argp, &wr_ary, &rp_ary);
-
- inv_rkey = 0;
- if (rdma->sc_snd_w_inv)
- inv_rkey = svc_rdma_get_inv_rkey(rdma_argp, wr_ary, rp_ary);
+ svc_rdma_get_write_arrays(rdma_argp, &wr_lst, &rp_ch);
- /* Build an req vec for the XDR */
- vec = svc_rdma_get_req_map(rdma);
- ret = svc_rdma_map_xdr(rdma, &rqstp->rq_res, vec, wr_ary != NULL);
- if (ret)
- goto err0;
- inline_bytes = rqstp->rq_res.len;
+ dprintk("svcrdma: preparing response for XID 0x%08x\n",
+ be32_to_cpup(rdma_argp));
/* Create the RDMA response header. xprt->xpt_mutex,
* acquired in svc_send(), serializes RPC replies. The
@@ -598,115 +642,57 @@ int svc_rdma_sendto(struct svc_rqst *rqstp)
goto err0;
rdma_resp = page_address(res_page);
- p = &rdma_resp->rm_xid;
- *p++ = rdma_argp->rm_xid;
- *p++ = rdma_argp->rm_vers;
+ p = rdma_resp;
+ *p++ = *rdma_argp;
+ *p++ = *(rdma_argp + 1);
*p++ = rdma->sc_fc_credits;
- *p++ = rp_ary ? rdma_nomsg : rdma_msg;
+ *p++ = rp_ch ? rdma_nomsg : rdma_msg;
/* Start with empty chunks */
*p++ = xdr_zero;
*p++ = xdr_zero;
*p = xdr_zero;
- /* Send any write-chunk data and build resp write-list */
- if (wr_ary) {
- ret = send_write_chunks(rdma, wr_ary, rdma_resp, rqstp, vec);
+ if (wr_lst) {
+ /* XXX: Presume the client sent only one Write chunk */
+ ret = svc_rdma_send_write_chunk(rdma, wr_lst, xdr);
if (ret < 0)
- goto err1;
- inline_bytes -= ret + xdr_padsize(ret);
+ goto err2;
+ svc_rdma_xdr_encode_write_list(rdma_resp, wr_lst, ret);
}
-
- /* Send any reply-list data and update resp reply-list */
- if (rp_ary) {
- ret = send_reply_chunks(rdma, rp_ary, rdma_resp, rqstp, vec);
+ if (rp_ch) {
+ ret = svc_rdma_send_reply_chunk(rdma, rp_ch, wr_lst, xdr);
if (ret < 0)
- goto err1;
- inline_bytes -= ret;
+ goto err2;
+ svc_rdma_xdr_encode_reply_chunk(rdma_resp, rp_ch, ret);
}
- /* Post a fresh Receive buffer _before_ sending the reply */
ret = svc_rdma_post_recv(rdma, GFP_KERNEL);
if (ret)
goto err1;
-
- ret = send_reply(rdma, rqstp, res_page, rdma_resp, vec,
- inline_bytes, inv_rkey);
+ ret = svc_rdma_send_reply_msg(rdma, rdma_argp, rdma_resp, rqstp,
+ wr_lst, rp_ch);
if (ret < 0)
goto err0;
+ return 0;
- svc_rdma_put_req_map(rdma, vec);
- dprintk("svcrdma: send_reply returns %d\n", ret);
- return ret;
+ err2:
+ if (ret != -E2BIG)
+ goto err1;
+
+ ret = svc_rdma_post_recv(rdma, GFP_KERNEL);
+ if (ret)
+ goto err1;
+ ret = svc_rdma_send_error_msg(rdma, rdma_resp, rqstp);
+ if (ret < 0)
+ goto err0;
+ return 0;
err1:
put_page(res_page);
err0:
- svc_rdma_put_req_map(rdma, vec);
pr_err("svcrdma: Could not send reply, err=%d. Closing transport.\n",
ret);
- set_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags);
+ set_bit(XPT_CLOSE, &xprt->xpt_flags);
return -ENOTCONN;
}
-
-void svc_rdma_send_error(struct svcxprt_rdma *xprt, struct rpcrdma_msg *rmsgp,
- int status)
-{
- struct ib_send_wr err_wr;
- struct page *p;
- struct svc_rdma_op_ctxt *ctxt;
- enum rpcrdma_errcode err;
- __be32 *va;
- int length;
- int ret;
-
- ret = svc_rdma_repost_recv(xprt, GFP_KERNEL);
- if (ret)
- return;
-
- p = alloc_page(GFP_KERNEL);
- if (!p)
- return;
- va = page_address(p);
-
- /* XDR encode an error reply */
- err = ERR_CHUNK;
- if (status == -EPROTONOSUPPORT)
- err = ERR_VERS;
- length = svc_rdma_xdr_encode_error(xprt, rmsgp, err, va);
-
- ctxt = svc_rdma_get_context(xprt);
- ctxt->direction = DMA_TO_DEVICE;
- ctxt->count = 1;
- ctxt->pages[0] = p;
-
- /* Prepare SGE for local address */
- ctxt->sge[0].lkey = xprt->sc_pd->local_dma_lkey;
- ctxt->sge[0].length = length;
- ctxt->sge[0].addr = ib_dma_map_page(xprt->sc_cm_id->device,
- p, 0, length, DMA_TO_DEVICE);
- if (ib_dma_mapping_error(xprt->sc_cm_id->device, ctxt->sge[0].addr)) {
- dprintk("svcrdma: Error mapping buffer for protocol error\n");
- svc_rdma_put_context(ctxt, 1);
- return;
- }
- svc_rdma_count_mappings(xprt, ctxt);
-
- /* Prepare SEND WR */
- memset(&err_wr, 0, sizeof(err_wr));
- ctxt->cqe.done = svc_rdma_wc_send;
- err_wr.wr_cqe = &ctxt->cqe;
- err_wr.sg_list = ctxt->sge;
- err_wr.num_sge = 1;
- err_wr.opcode = IB_WR_SEND;
- err_wr.send_flags = IB_SEND_SIGNALED;
-
- /* Post It */
- ret = svc_rdma_send(xprt, &err_wr);
- if (ret) {
- dprintk("svcrdma: Error %d posting send for protocol error\n",
- ret);
- svc_rdma_unmap_dma(ctxt);
- svc_rdma_put_context(ctxt, 1);
- }
-}
diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
index fc8f14c7bfec..a9d9cb1ba4c6 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
@@ -272,85 +272,6 @@ static void svc_rdma_destroy_ctxts(struct svcxprt_rdma *xprt)
}
}
-static struct svc_rdma_req_map *alloc_req_map(gfp_t flags)
-{
- struct svc_rdma_req_map *map;
-
- map = kmalloc(sizeof(*map), flags);
- if (map)
- INIT_LIST_HEAD(&map->free);
- return map;
-}
-
-static bool svc_rdma_prealloc_maps(struct svcxprt_rdma *xprt)
-{
- unsigned int i;
-
- /* One for each receive buffer on this connection. */
- i = xprt->sc_max_requests;
-
- while (i--) {
- struct svc_rdma_req_map *map;
-
- map = alloc_req_map(GFP_KERNEL);
- if (!map) {
- dprintk("svcrdma: No memory for request map\n");
- return false;
- }
- list_add(&map->free, &xprt->sc_maps);
- }
- return true;
-}
-
-struct svc_rdma_req_map *svc_rdma_get_req_map(struct svcxprt_rdma *xprt)
-{
- struct svc_rdma_req_map *map = NULL;
-
- spin_lock(&xprt->sc_map_lock);
- if (list_empty(&xprt->sc_maps))
- goto out_empty;
-
- map = list_first_entry(&xprt->sc_maps,
- struct svc_rdma_req_map, free);
- list_del_init(&map->free);
- spin_unlock(&xprt->sc_map_lock);
-
-out:
- map->count = 0;
- return map;
-
-out_empty:
- spin_unlock(&xprt->sc_map_lock);
-
- /* Pre-allocation amount was incorrect */
- map = alloc_req_map(GFP_NOIO);
- if (map)
- goto out;
-
- WARN_ONCE(1, "svcrdma: empty request map list?\n");
- return NULL;
-}
-
-void svc_rdma_put_req_map(struct svcxprt_rdma *xprt,
- struct svc_rdma_req_map *map)
-{
- spin_lock(&xprt->sc_map_lock);
- list_add(&map->free, &xprt->sc_maps);
- spin_unlock(&xprt->sc_map_lock);
-}
-
-static void svc_rdma_destroy_maps(struct svcxprt_rdma *xprt)
-{
- while (!list_empty(&xprt->sc_maps)) {
- struct svc_rdma_req_map *map;
-
- map = list_first_entry(&xprt->sc_maps,
- struct svc_rdma_req_map, free);
- list_del(&map->free);
- kfree(map);
- }
-}
-
/* QP event handler */
static void qp_event_handler(struct ib_event *event, void *context)
{
@@ -474,24 +395,6 @@ void svc_rdma_wc_send(struct ib_cq *cq, struct ib_wc *wc)
}
/**
- * svc_rdma_wc_write - Invoked by RDMA provider for each polled Write WC
- * @cq: completion queue
- * @wc: completed WR
- *
- */
-void svc_rdma_wc_write(struct ib_cq *cq, struct ib_wc *wc)
-{
- struct ib_cqe *cqe = wc->wr_cqe;
- struct svc_rdma_op_ctxt *ctxt;
-
- svc_rdma_send_wc_common_put(cq, wc, "write");
-
- ctxt = container_of(cqe, struct svc_rdma_op_ctxt, cqe);
- svc_rdma_unmap_dma(ctxt);
- svc_rdma_put_context(ctxt, 0);
-}
-
-/**
* svc_rdma_wc_reg - Invoked by RDMA provider for each polled FASTREG WC
* @cq: completion queue
* @wc: completed WR
@@ -561,14 +464,14 @@ static struct svcxprt_rdma *rdma_create_xprt(struct svc_serv *serv,
INIT_LIST_HEAD(&cma_xprt->sc_read_complete_q);
INIT_LIST_HEAD(&cma_xprt->sc_frmr_q);
INIT_LIST_HEAD(&cma_xprt->sc_ctxts);
- INIT_LIST_HEAD(&cma_xprt->sc_maps);
+ INIT_LIST_HEAD(&cma_xprt->sc_rw_ctxts);
init_waitqueue_head(&cma_xprt->sc_send_wait);
spin_lock_init(&cma_xprt->sc_lock);
spin_lock_init(&cma_xprt->sc_rq_dto_lock);
spin_lock_init(&cma_xprt->sc_frmr_q_lock);
spin_lock_init(&cma_xprt->sc_ctxt_lock);
- spin_lock_init(&cma_xprt->sc_map_lock);
+ spin_lock_init(&cma_xprt->sc_rw_ctxt_lock);
/*
* Note that this implies that the underlying transport support
@@ -999,6 +902,7 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
newxprt, newxprt->sc_cm_id);
dev = newxprt->sc_cm_id->device;
+ newxprt->sc_port_num = newxprt->sc_cm_id->port_num;
/* Qualify the transport resource defaults with the
* capabilities of this particular device */
@@ -1014,13 +918,11 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
svcrdma_max_bc_requests);
newxprt->sc_rq_depth = newxprt->sc_max_requests +
newxprt->sc_max_bc_requests;
- newxprt->sc_sq_depth = RPCRDMA_SQ_DEPTH_MULT * newxprt->sc_rq_depth;
+ newxprt->sc_sq_depth = newxprt->sc_rq_depth;
atomic_set(&newxprt->sc_sq_avail, newxprt->sc_sq_depth);
if (!svc_rdma_prealloc_ctxts(newxprt))
goto errout;
- if (!svc_rdma_prealloc_maps(newxprt))
- goto errout;
/*
* Limit ORD based on client limit, local device limit, and
@@ -1050,6 +952,8 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
memset(&qp_attr, 0, sizeof qp_attr);
qp_attr.event_handler = qp_event_handler;
qp_attr.qp_context = &newxprt->sc_xprt;
+ qp_attr.port_num = newxprt->sc_cm_id->port_num;
+ qp_attr.cap.max_rdma_ctxs = newxprt->sc_max_requests;
qp_attr.cap.max_send_wr = newxprt->sc_sq_depth;
qp_attr.cap.max_recv_wr = newxprt->sc_rq_depth;
qp_attr.cap.max_send_sge = newxprt->sc_max_sge;
@@ -1248,8 +1152,8 @@ static void __svc_rdma_free(struct work_struct *work)
}
rdma_dealloc_frmr_q(rdma);
+ svc_rdma_destroy_rw_ctxts(rdma);
svc_rdma_destroy_ctxts(rdma);
- svc_rdma_destroy_maps(rdma);
/* Destroy the QP if present (not a listener) */
if (rdma->sc_qp && !IS_ERR(rdma->sc_qp))
diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
index c717f5410776..62ecbccd9748 100644
--- a/net/sunrpc/xprtrdma/transport.c
+++ b/net/sunrpc/xprtrdma/transport.c
@@ -66,8 +66,8 @@ static unsigned int xprt_rdma_slot_table_entries = RPCRDMA_DEF_SLOT_TABLE;
unsigned int xprt_rdma_max_inline_read = RPCRDMA_DEF_INLINE;
static unsigned int xprt_rdma_max_inline_write = RPCRDMA_DEF_INLINE;
static unsigned int xprt_rdma_inline_write_padding;
-static unsigned int xprt_rdma_memreg_strategy = RPCRDMA_FRMR;
- int xprt_rdma_pad_optimize = 0;
+unsigned int xprt_rdma_memreg_strategy = RPCRDMA_FRMR;
+int xprt_rdma_pad_optimize;
#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
@@ -396,7 +396,7 @@ xprt_setup_rdma(struct xprt_create *args)
new_xprt = rpcx_to_rdmax(xprt);
- rc = rpcrdma_ia_open(new_xprt, sap, xprt_rdma_memreg_strategy);
+ rc = rpcrdma_ia_open(new_xprt, sap);
if (rc)
goto out1;
@@ -457,19 +457,33 @@ out1:
return ERR_PTR(rc);
}
-/*
- * Close a connection, during shutdown or timeout/reconnect
+/**
+ * xprt_rdma_close - Close down RDMA connection
+ * @xprt: generic transport to be closed
+ *
+ * Called during transport shutdown reconnect, or device
+ * removal. Caller holds the transport's write lock.
*/
static void
xprt_rdma_close(struct rpc_xprt *xprt)
{
struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
+ struct rpcrdma_ep *ep = &r_xprt->rx_ep;
+ struct rpcrdma_ia *ia = &r_xprt->rx_ia;
+
+ dprintk("RPC: %s: closing xprt %p\n", __func__, xprt);
- dprintk("RPC: %s: closing\n", __func__);
- if (r_xprt->rx_ep.rep_connected > 0)
+ if (test_and_clear_bit(RPCRDMA_IAF_REMOVING, &ia->ri_flags)) {
+ xprt_clear_connected(xprt);
+ rpcrdma_ia_remove(ia);
+ return;
+ }
+ if (ep->rep_connected == -ENODEV)
+ return;
+ if (ep->rep_connected > 0)
xprt->reestablish_timeout = 0;
xprt_disconnect_done(xprt);
- rpcrdma_ep_disconnect(&r_xprt->rx_ep, &r_xprt->rx_ia);
+ rpcrdma_ep_disconnect(ep, ia);
}
static void
@@ -484,6 +498,27 @@ xprt_rdma_set_port(struct rpc_xprt *xprt, u16 port)
dprintk("RPC: %s: %u\n", __func__, port);
}
+/**
+ * xprt_rdma_timer - invoked when an RPC times out
+ * @xprt: controlling RPC transport
+ * @task: RPC task that timed out
+ *
+ * Invoked when the transport is still connected, but an RPC
+ * retransmit timeout occurs.
+ *
+ * Since RDMA connections don't have a keep-alive, forcibly
+ * disconnect and retry to connect. This drives full
+ * detection of the network path, and retransmissions of
+ * all pending RPCs.
+ */
+static void
+xprt_rdma_timer(struct rpc_xprt *xprt, struct rpc_task *task)
+{
+ dprintk("RPC: %5u %s: xprt = %p\n", task->tk_pid, __func__, xprt);
+
+ xprt_force_disconnect(xprt);
+}
+
static void
xprt_rdma_connect(struct rpc_xprt *xprt, struct rpc_task *task)
{
@@ -659,6 +694,8 @@ xprt_rdma_free(struct rpc_task *task)
* xprt_rdma_send_request - marshal and send an RPC request
* @task: RPC task with an RPC message in rq_snd_buf
*
+ * Caller holds the transport's write lock.
+ *
* Return values:
* 0: The request has been sent
* ENOTCONN: Caller needs to invoke connect logic then call again
@@ -685,6 +722,9 @@ xprt_rdma_send_request(struct rpc_task *task)
struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
int rc = 0;
+ if (!xprt_connected(xprt))
+ goto drop_connection;
+
/* On retransmit, remove any previously registered chunks */
if (unlikely(!list_empty(&req->rl_registered)))
r_xprt->rx_ia.ri_ops->ro_unmap_safe(r_xprt, req, false);
@@ -776,6 +816,7 @@ static struct rpc_xprt_ops xprt_rdma_procs = {
.alloc_slot = xprt_alloc_slot,
.release_request = xprt_release_rqst_cong, /* ditto */
.set_retrans_timeout = xprt_set_retrans_timeout_def, /* ditto */
+ .timer = xprt_rdma_timer,
.rpcbind = rpcb_getport_async, /* sunrpc/rpcb_clnt.c */
.set_port = xprt_rdma_set_port,
.connect = xprt_rdma_connect,
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index 3b332b395045..3dbce9ac4327 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -53,7 +53,7 @@
#include <linux/sunrpc/addr.h>
#include <linux/sunrpc/svc_rdma.h>
#include <asm/bitops.h>
-#include <linux/module.h> /* try_module_get()/module_put() */
+
#include <rdma/ib_cm.h>
#include "xprt_rdma.h"
@@ -69,8 +69,11 @@
/*
* internal functions
*/
+static void rpcrdma_create_mrs(struct rpcrdma_xprt *r_xprt);
+static void rpcrdma_destroy_mrs(struct rpcrdma_buffer *buf);
+static void rpcrdma_dma_unmap_regbuf(struct rpcrdma_regbuf *rb);
-static struct workqueue_struct *rpcrdma_receive_wq;
+static struct workqueue_struct *rpcrdma_receive_wq __read_mostly;
int
rpcrdma_alloc_wq(void)
@@ -180,7 +183,7 @@ rpcrdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc)
rep->rr_wc_flags = wc->wc_flags;
rep->rr_inv_rkey = wc->ex.invalidate_rkey;
- ib_dma_sync_single_for_cpu(rep->rr_device,
+ ib_dma_sync_single_for_cpu(rdmab_device(rep->rr_rdmabuf),
rdmab_addr(rep->rr_rdmabuf),
rep->rr_len, DMA_FROM_DEVICE);
@@ -262,6 +265,21 @@ rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event)
__func__, ep);
complete(&ia->ri_done);
break;
+ case RDMA_CM_EVENT_DEVICE_REMOVAL:
+#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
+ pr_info("rpcrdma: removing device for %pIS:%u\n",
+ sap, rpc_get_port(sap));
+#endif
+ set_bit(RPCRDMA_IAF_REMOVING, &ia->ri_flags);
+ ep->rep_connected = -ENODEV;
+ xprt_force_disconnect(&xprt->rx_xprt);
+ wait_for_completion(&ia->ri_remove_done);
+
+ ia->ri_id = NULL;
+ ia->ri_pd = NULL;
+ ia->ri_device = NULL;
+ /* Return 1 to ensure the core destroys the id. */
+ return 1;
case RDMA_CM_EVENT_ESTABLISHED:
connstate = 1;
ib_query_qp(ia->ri_id->qp, attr,
@@ -291,9 +309,6 @@ rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event)
goto connected;
case RDMA_CM_EVENT_DISCONNECTED:
connstate = -ECONNABORTED;
- goto connected;
- case RDMA_CM_EVENT_DEVICE_REMOVAL:
- connstate = -ENODEV;
connected:
dprintk("RPC: %s: %sconnected\n",
__func__, connstate > 0 ? "" : "dis");
@@ -329,14 +344,6 @@ connected:
return 0;
}
-static void rpcrdma_destroy_id(struct rdma_cm_id *id)
-{
- if (id) {
- module_put(id->device->owner);
- rdma_destroy_id(id);
- }
-}
-
static struct rdma_cm_id *
rpcrdma_create_id(struct rpcrdma_xprt *xprt,
struct rpcrdma_ia *ia, struct sockaddr *addr)
@@ -346,6 +353,7 @@ rpcrdma_create_id(struct rpcrdma_xprt *xprt,
int rc;
init_completion(&ia->ri_done);
+ init_completion(&ia->ri_remove_done);
id = rdma_create_id(&init_net, rpcrdma_conn_upcall, xprt, RDMA_PS_TCP,
IB_QPT_RC);
@@ -370,16 +378,6 @@ rpcrdma_create_id(struct rpcrdma_xprt *xprt,
goto out;
}
- /* FIXME:
- * Until xprtrdma supports DEVICE_REMOVAL, the provider must
- * be pinned while there are active NFS/RDMA mounts to prevent
- * hangs and crashes at umount time.
- */
- if (!ia->ri_async_rc && !try_module_get(id->device->owner)) {
- dprintk("RPC: %s: Failed to get device module\n",
- __func__);
- ia->ri_async_rc = -ENODEV;
- }
rc = ia->ri_async_rc;
if (rc)
goto out;
@@ -389,21 +387,20 @@ rpcrdma_create_id(struct rpcrdma_xprt *xprt,
if (rc) {
dprintk("RPC: %s: rdma_resolve_route() failed %i\n",
__func__, rc);
- goto put;
+ goto out;
}
rc = wait_for_completion_interruptible_timeout(&ia->ri_done, wtimeout);
if (rc < 0) {
dprintk("RPC: %s: wait() exited: %i\n",
__func__, rc);
- goto put;
+ goto out;
}
rc = ia->ri_async_rc;
if (rc)
- goto put;
+ goto out;
return id;
-put:
- module_put(id->device->owner);
+
out:
rdma_destroy_id(id);
return ERR_PTR(rc);
@@ -413,13 +410,16 @@ out:
* Exported functions.
*/
-/*
- * Open and initialize an Interface Adapter.
- * o initializes fields of struct rpcrdma_ia, including
- * interface and provider attributes and protection zone.
+/**
+ * rpcrdma_ia_open - Open and initialize an Interface Adapter.
+ * @xprt: controlling transport
+ * @addr: IP address of remote peer
+ *
+ * Returns 0 on success, negative errno if an appropriate
+ * Interface Adapter could not be found and opened.
*/
int
-rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
+rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr)
{
struct rpcrdma_ia *ia = &xprt->rx_ia;
int rc;
@@ -427,7 +427,7 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
ia->ri_id = rpcrdma_create_id(xprt, ia, addr);
if (IS_ERR(ia->ri_id)) {
rc = PTR_ERR(ia->ri_id);
- goto out1;
+ goto out_err;
}
ia->ri_device = ia->ri_id->device;
@@ -435,10 +435,10 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
if (IS_ERR(ia->ri_pd)) {
rc = PTR_ERR(ia->ri_pd);
pr_err("rpcrdma: ib_alloc_pd() returned %d\n", rc);
- goto out2;
+ goto out_err;
}
- switch (memreg) {
+ switch (xprt_rdma_memreg_strategy) {
case RPCRDMA_FRMR:
if (frwr_is_supported(ia)) {
ia->ri_ops = &rpcrdma_frwr_memreg_ops;
@@ -452,28 +452,73 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
}
/*FALLTHROUGH*/
default:
- pr_err("rpcrdma: Unsupported memory registration mode: %d\n",
- memreg);
+ pr_err("rpcrdma: Device %s does not support memreg mode %d\n",
+ ia->ri_device->name, xprt_rdma_memreg_strategy);
rc = -EINVAL;
- goto out3;
+ goto out_err;
}
return 0;
-out3:
- ib_dealloc_pd(ia->ri_pd);
- ia->ri_pd = NULL;
-out2:
- rpcrdma_destroy_id(ia->ri_id);
- ia->ri_id = NULL;
-out1:
+out_err:
+ rpcrdma_ia_close(ia);
return rc;
}
-/*
- * Clean up/close an IA.
- * o if event handles and PD have been initialized, free them.
- * o close the IA
+/**
+ * rpcrdma_ia_remove - Handle device driver unload
+ * @ia: interface adapter being removed
+ *
+ * Divest transport H/W resources associated with this adapter,
+ * but allow it to be restored later.
+ */
+void
+rpcrdma_ia_remove(struct rpcrdma_ia *ia)
+{
+ struct rpcrdma_xprt *r_xprt = container_of(ia, struct rpcrdma_xprt,
+ rx_ia);
+ struct rpcrdma_ep *ep = &r_xprt->rx_ep;
+ struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
+ struct rpcrdma_req *req;
+ struct rpcrdma_rep *rep;
+
+ cancel_delayed_work_sync(&buf->rb_refresh_worker);
+
+ /* This is similar to rpcrdma_ep_destroy, but:
+ * - Don't cancel the connect worker.
+ * - Don't call rpcrdma_ep_disconnect, which waits
+ * for another conn upcall, which will deadlock.
+ * - rdma_disconnect is unneeded, the underlying
+ * connection is already gone.
+ */
+ if (ia->ri_id->qp) {
+ ib_drain_qp(ia->ri_id->qp);
+ rdma_destroy_qp(ia->ri_id);
+ ia->ri_id->qp = NULL;
+ }
+ ib_free_cq(ep->rep_attr.recv_cq);
+ ib_free_cq(ep->rep_attr.send_cq);
+
+ /* The ULP is responsible for ensuring all DMA
+ * mappings and MRs are gone.
+ */
+ list_for_each_entry(rep, &buf->rb_recv_bufs, rr_list)
+ rpcrdma_dma_unmap_regbuf(rep->rr_rdmabuf);
+ list_for_each_entry(req, &buf->rb_allreqs, rl_all) {
+ rpcrdma_dma_unmap_regbuf(req->rl_rdmabuf);
+ rpcrdma_dma_unmap_regbuf(req->rl_sendbuf);
+ rpcrdma_dma_unmap_regbuf(req->rl_recvbuf);
+ }
+ rpcrdma_destroy_mrs(buf);
+
+ /* Allow waiters to continue */
+ complete(&ia->ri_remove_done);
+}
+
+/**
+ * rpcrdma_ia_close - Clean up/close an IA.
+ * @ia: interface adapter to close
+ *
*/
void
rpcrdma_ia_close(struct rpcrdma_ia *ia)
@@ -482,13 +527,15 @@ rpcrdma_ia_close(struct rpcrdma_ia *ia)
if (ia->ri_id != NULL && !IS_ERR(ia->ri_id)) {
if (ia->ri_id->qp)
rdma_destroy_qp(ia->ri_id);
- rpcrdma_destroy_id(ia->ri_id);
- ia->ri_id = NULL;
+ rdma_destroy_id(ia->ri_id);
}
+ ia->ri_id = NULL;
+ ia->ri_device = NULL;
/* If the pd is still busy, xprtrdma missed freeing a resource */
if (ia->ri_pd && !IS_ERR(ia->ri_pd))
ib_dealloc_pd(ia->ri_pd);
+ ia->ri_pd = NULL;
}
/*
@@ -646,6 +693,99 @@ rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
ib_free_cq(ep->rep_attr.send_cq);
}
+/* Re-establish a connection after a device removal event.
+ * Unlike a normal reconnection, a fresh PD and a new set
+ * of MRs and buffers is needed.
+ */
+static int
+rpcrdma_ep_recreate_xprt(struct rpcrdma_xprt *r_xprt,
+ struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
+{
+ struct sockaddr *sap = (struct sockaddr *)&r_xprt->rx_data.addr;
+ int rc, err;
+
+ pr_info("%s: r_xprt = %p\n", __func__, r_xprt);
+
+ rc = -EHOSTUNREACH;
+ if (rpcrdma_ia_open(r_xprt, sap))
+ goto out1;
+
+ rc = -ENOMEM;
+ err = rpcrdma_ep_create(ep, ia, &r_xprt->rx_data);
+ if (err) {
+ pr_err("rpcrdma: rpcrdma_ep_create returned %d\n", err);
+ goto out2;
+ }
+
+ rc = -ENETUNREACH;
+ err = rdma_create_qp(ia->ri_id, ia->ri_pd, &ep->rep_attr);
+ if (err) {
+ pr_err("rpcrdma: rdma_create_qp returned %d\n", err);
+ goto out3;
+ }
+
+ rpcrdma_create_mrs(r_xprt);
+ return 0;
+
+out3:
+ rpcrdma_ep_destroy(ep, ia);
+out2:
+ rpcrdma_ia_close(ia);
+out1:
+ return rc;
+}
+
+static int
+rpcrdma_ep_reconnect(struct rpcrdma_xprt *r_xprt, struct rpcrdma_ep *ep,
+ struct rpcrdma_ia *ia)
+{
+ struct sockaddr *sap = (struct sockaddr *)&r_xprt->rx_data.addr;
+ struct rdma_cm_id *id, *old;
+ int err, rc;
+
+ dprintk("RPC: %s: reconnecting...\n", __func__);
+
+ rpcrdma_ep_disconnect(ep, ia);
+
+ rc = -EHOSTUNREACH;
+ id = rpcrdma_create_id(r_xprt, ia, sap);
+ if (IS_ERR(id))
+ goto out;
+
+ /* As long as the new ID points to the same device as the
+ * old ID, we can reuse the transport's existing PD and all
+ * previously allocated MRs. Also, the same device means
+ * the transport's previous DMA mappings are still valid.
+ *
+ * This is a sanity check only. There should be no way these
+ * point to two different devices here.
+ */
+ old = id;
+ rc = -ENETUNREACH;
+ if (ia->ri_device != id->device) {
+ pr_err("rpcrdma: can't reconnect on different device!\n");
+ goto out_destroy;
+ }
+
+ err = rdma_create_qp(id, ia->ri_pd, &ep->rep_attr);
+ if (err) {
+ dprintk("RPC: %s: rdma_create_qp returned %d\n",
+ __func__, err);
+ goto out_destroy;
+ }
+
+ /* Atomically replace the transport's ID and QP. */
+ rc = 0;
+ old = ia->ri_id;
+ ia->ri_id = id;
+ rdma_destroy_qp(old);
+
+out_destroy:
+ rdma_destroy_id(old);
+out:
+ return rc;
+}
+
/*
* Connect unconnected endpoint.
*/
@@ -654,61 +794,30 @@ rpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
{
struct rpcrdma_xprt *r_xprt = container_of(ia, struct rpcrdma_xprt,
rx_ia);
- struct rdma_cm_id *id, *old;
- struct sockaddr *sap;
unsigned int extras;
- int rc = 0;
+ int rc;
- if (ep->rep_connected != 0) {
retry:
- dprintk("RPC: %s: reconnecting...\n", __func__);
-
- rpcrdma_ep_disconnect(ep, ia);
-
- sap = (struct sockaddr *)&r_xprt->rx_data.addr;
- id = rpcrdma_create_id(r_xprt, ia, sap);
- if (IS_ERR(id)) {
- rc = -EHOSTUNREACH;
- goto out;
- }
- /* TEMP TEMP TEMP - fail if new device:
- * Deregister/remarshal *all* requests!
- * Close and recreate adapter, pd, etc!
- * Re-determine all attributes still sane!
- * More stuff I haven't thought of!
- * Rrrgh!
- */
- if (ia->ri_device != id->device) {
- printk("RPC: %s: can't reconnect on "
- "different device!\n", __func__);
- rpcrdma_destroy_id(id);
- rc = -ENETUNREACH;
- goto out;
- }
- /* END TEMP */
- rc = rdma_create_qp(id, ia->ri_pd, &ep->rep_attr);
- if (rc) {
- dprintk("RPC: %s: rdma_create_qp failed %i\n",
- __func__, rc);
- rpcrdma_destroy_id(id);
- rc = -ENETUNREACH;
- goto out;
- }
-
- old = ia->ri_id;
- ia->ri_id = id;
-
- rdma_destroy_qp(old);
- rpcrdma_destroy_id(old);
- } else {
+ switch (ep->rep_connected) {
+ case 0:
dprintk("RPC: %s: connecting...\n", __func__);
rc = rdma_create_qp(ia->ri_id, ia->ri_pd, &ep->rep_attr);
if (rc) {
dprintk("RPC: %s: rdma_create_qp failed %i\n",
__func__, rc);
- /* do not update ep->rep_connected */
- return -ENETUNREACH;
+ rc = -ENETUNREACH;
+ goto out_noupdate;
}
+ break;
+ case -ENODEV:
+ rc = rpcrdma_ep_recreate_xprt(r_xprt, ep, ia);
+ if (rc)
+ goto out_noupdate;
+ break;
+ default:
+ rc = rpcrdma_ep_reconnect(r_xprt, ep, ia);
+ if (rc)
+ goto out;
}
ep->rep_connected = 0;
@@ -736,6 +845,8 @@ retry:
out:
if (rc)
ep->rep_connected = rc;
+
+out_noupdate:
return rc;
}
@@ -878,7 +989,6 @@ struct rpcrdma_rep *
rpcrdma_create_rep(struct rpcrdma_xprt *r_xprt)
{
struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data;
- struct rpcrdma_ia *ia = &r_xprt->rx_ia;
struct rpcrdma_rep *rep;
int rc;
@@ -894,7 +1004,6 @@ rpcrdma_create_rep(struct rpcrdma_xprt *r_xprt)
goto out_free;
}
- rep->rr_device = ia->ri_device;
rep->rr_cqe.done = rpcrdma_wc_receive;
rep->rr_rxprt = r_xprt;
INIT_WORK(&rep->rr_work, rpcrdma_reply_handler);
@@ -1037,6 +1146,7 @@ void
rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
{
cancel_delayed_work_sync(&buf->rb_recovery_worker);
+ cancel_delayed_work_sync(&buf->rb_refresh_worker);
while (!list_empty(&buf->rb_recv_bufs)) {
struct rpcrdma_rep *rep;
@@ -1081,7 +1191,8 @@ rpcrdma_get_mw(struct rpcrdma_xprt *r_xprt)
out_nomws:
dprintk("RPC: %s: no MWs available\n", __func__);
- schedule_delayed_work(&buf->rb_refresh_worker, 0);
+ if (r_xprt->rx_ep.rep_connected != -ENODEV)
+ schedule_delayed_work(&buf->rb_refresh_worker, 0);
/* Allow the reply handler and refresh worker to run */
cond_resched();
@@ -1231,17 +1342,19 @@ rpcrdma_alloc_regbuf(size_t size, enum dma_data_direction direction,
bool
__rpcrdma_dma_map_regbuf(struct rpcrdma_ia *ia, struct rpcrdma_regbuf *rb)
{
+ struct ib_device *device = ia->ri_device;
+
if (rb->rg_direction == DMA_NONE)
return false;
- rb->rg_iov.addr = ib_dma_map_single(ia->ri_device,
+ rb->rg_iov.addr = ib_dma_map_single(device,
(void *)rb->rg_base,
rdmab_length(rb),
rb->rg_direction);
- if (ib_dma_mapping_error(ia->ri_device, rdmab_addr(rb)))
+ if (ib_dma_mapping_error(device, rdmab_addr(rb)))
return false;
- rb->rg_device = ia->ri_device;
+ rb->rg_device = device;
rb->rg_iov.lkey = ia->ri_pd->local_dma_lkey;
return true;
}
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index 171a35116de9..1d66acf1a723 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -69,6 +69,7 @@ struct rpcrdma_ia {
struct rdma_cm_id *ri_id;
struct ib_pd *ri_pd;
struct completion ri_done;
+ struct completion ri_remove_done;
int ri_async_rc;
unsigned int ri_max_segs;
unsigned int ri_max_frmr_depth;
@@ -78,10 +79,15 @@ struct rpcrdma_ia {
bool ri_reminv_expected;
bool ri_implicit_roundup;
enum ib_mr_type ri_mrtype;
+ unsigned long ri_flags;
struct ib_qp_attr ri_qp_attr;
struct ib_qp_init_attr ri_qp_init_attr;
};
+enum {
+ RPCRDMA_IAF_REMOVING = 0,
+};
+
/*
* RDMA Endpoint -- one per transport instance
*/
@@ -164,6 +170,12 @@ rdmab_to_msg(struct rpcrdma_regbuf *rb)
return (struct rpcrdma_msg *)rb->rg_base;
}
+static inline struct ib_device *
+rdmab_device(struct rpcrdma_regbuf *rb)
+{
+ return rb->rg_device;
+}
+
#define RPCRDMA_DEF_GFP (GFP_NOIO | __GFP_NOWARN)
/* To ensure a transport can always make forward progress,
@@ -209,7 +221,6 @@ struct rpcrdma_rep {
unsigned int rr_len;
int rr_wc_flags;
u32 rr_inv_rkey;
- struct ib_device *rr_device;
struct rpcrdma_xprt *rr_rxprt;
struct work_struct rr_work;
struct list_head rr_list;
@@ -380,7 +391,6 @@ struct rpcrdma_buffer {
spinlock_t rb_mwlock; /* protect rb_mws list */
struct list_head rb_mws;
struct list_head rb_all;
- char *rb_pool;
spinlock_t rb_lock; /* protect buf lists */
int rb_send_count, rb_recv_count;
@@ -497,10 +507,16 @@ struct rpcrdma_xprt {
* Default is 0, see sysctl entry and rpc_rdma.c rpcrdma_convert_iovs() */
extern int xprt_rdma_pad_optimize;
+/* This setting controls the hunt for a supported memory
+ * registration strategy.
+ */
+extern unsigned int xprt_rdma_memreg_strategy;
+
/*
* Interface Adapter calls - xprtrdma/verbs.c
*/
-int rpcrdma_ia_open(struct rpcrdma_xprt *, struct sockaddr *, int);
+int rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr);
+void rpcrdma_ia_remove(struct rpcrdma_ia *ia);
void rpcrdma_ia_close(struct rpcrdma_ia *);
bool frwr_is_supported(struct rpcrdma_ia *);
bool fmr_is_supported(struct rpcrdma_ia *);
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index 16aff8ddc16f..d5b54c020dec 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -2432,7 +2432,12 @@ static void xs_tcp_setup_socket(struct work_struct *work)
case -ENETUNREACH:
case -EADDRINUSE:
case -ENOBUFS:
- /* retry with existing socket, after a delay */
+ /*
+ * xs_tcp_force_close() wakes tasks with -EIO.
+ * We need to wake them first to ensure the
+ * correct error code.
+ */
+ xprt_wake_pending_tasks(xprt, status);
xs_tcp_force_close(xprt);
goto out;
}
diff --git a/net/tipc/msg.c b/net/tipc/msg.c
index 312ef7de57d7..ab3087687a32 100644
--- a/net/tipc/msg.c
+++ b/net/tipc/msg.c
@@ -508,7 +508,7 @@ bool tipc_msg_reverse(u32 own_node, struct sk_buff **skb, int err)
}
if (skb_cloned(_skb) &&
- pskb_expand_head(_skb, BUF_HEADROOM, BUF_TAILROOM, GFP_KERNEL))
+ pskb_expand_head(_skb, BUF_HEADROOM, BUF_TAILROOM, GFP_ATOMIC))
goto exit;
/* Now reverse the concerned fields */
diff --git a/net/tipc/socket.c b/net/tipc/socket.c
index 0d4f2f455a7c..1b92b72e812f 100644
--- a/net/tipc/socket.c
+++ b/net/tipc/socket.c
@@ -362,25 +362,25 @@ static int tipc_sk_sock_err(struct socket *sock, long *timeout)
return 0;
}
-#define tipc_wait_for_cond(sock_, timeout_, condition_) \
-({ \
- int rc_ = 0; \
- int done_ = 0; \
- \
- while (!(condition_) && !done_) { \
- struct sock *sk_ = sock->sk; \
- DEFINE_WAIT_FUNC(wait_, woken_wake_function); \
- \
- rc_ = tipc_sk_sock_err(sock_, timeout_); \
- if (rc_) \
- break; \
- prepare_to_wait(sk_sleep(sk_), &wait_, \
- TASK_INTERRUPTIBLE); \
- done_ = sk_wait_event(sk_, timeout_, \
- (condition_), &wait_); \
- remove_wait_queue(sk_sleep(sk_), &wait_); \
- } \
- rc_; \
+#define tipc_wait_for_cond(sock_, timeo_, condition_) \
+({ \
+ struct sock *sk_; \
+ int rc_; \
+ \
+ while ((rc_ = !(condition_))) { \
+ DEFINE_WAIT_FUNC(wait_, woken_wake_function); \
+ sk_ = (sock_)->sk; \
+ rc_ = tipc_sk_sock_err((sock_), timeo_); \
+ if (rc_) \
+ break; \
+ prepare_to_wait(sk_sleep(sk_), &wait_, TASK_INTERRUPTIBLE); \
+ release_sock(sk_); \
+ *(timeo_) = wait_woken(&wait_, TASK_INTERRUPTIBLE, *(timeo_)); \
+ sched_annotate_sleep(); \
+ lock_sock(sk_); \
+ remove_wait_queue(sk_sleep(sk_), &wait_); \
+ } \
+ rc_; \
})
/**
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 6a7fe7660551..c77ced0109b7 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -343,7 +343,7 @@ found:
* are still connected to it and there's no way to inform "a polling
* implementation" that it should let go of a certain wait queue
*
- * In order to propagate a wake up, a wait_queue_t of the client
+ * In order to propagate a wake up, a wait_queue_entry_t of the client
* socket is enqueued on the peer_wait queue of the server socket
* whose wake function does a wake_up on the ordinary client socket
* wait queue. This connection is established whenever a write (or
@@ -352,7 +352,7 @@ found:
* was relayed.
*/
-static int unix_dgram_peer_wake_relay(wait_queue_t *q, unsigned mode, int flags,
+static int unix_dgram_peer_wake_relay(wait_queue_entry_t *q, unsigned mode, int flags,
void *key)
{
struct unix_sock *u;
@@ -999,7 +999,8 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
struct path path = { };
err = -EINVAL;
- if (sunaddr->sun_family != AF_UNIX)
+ if (addr_len < offsetofend(struct sockaddr_un, sun_family) ||
+ sunaddr->sun_family != AF_UNIX)
goto out;
if (addr_len == sizeof(short)) {
@@ -1110,6 +1111,10 @@ static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr,
unsigned int hash;
int err;
+ err = -EINVAL;
+ if (alen < offsetofend(struct sockaddr, sa_family))
+ goto out;
+
if (addr->sa_family != AF_UNSPEC) {
err = unix_mkname(sunaddr, alen, &hash);
if (err < 0)
diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
index 6f7f6757ceef..dfc8c51e4d74 100644
--- a/net/vmw_vsock/af_vsock.c
+++ b/net/vmw_vsock/af_vsock.c
@@ -1540,8 +1540,7 @@ static int vsock_stream_sendmsg(struct socket *sock, struct msghdr *msg,
long timeout;
int err;
struct vsock_transport_send_notify_data send_data;
-
- DEFINE_WAIT(wait);
+ DEFINE_WAIT_FUNC(wait, woken_wake_function);
sk = sock->sk;
vsk = vsock_sk(sk);
@@ -1584,11 +1583,10 @@ static int vsock_stream_sendmsg(struct socket *sock, struct msghdr *msg,
if (err < 0)
goto out;
-
while (total_written < len) {
ssize_t written;
- prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
+ add_wait_queue(sk_sleep(sk), &wait);
while (vsock_stream_has_space(vsk) == 0 &&
sk->sk_err == 0 &&
!(sk->sk_shutdown & SEND_SHUTDOWN) &&
@@ -1597,33 +1595,30 @@ static int vsock_stream_sendmsg(struct socket *sock, struct msghdr *msg,
/* Don't wait for non-blocking sockets. */
if (timeout == 0) {
err = -EAGAIN;
- finish_wait(sk_sleep(sk), &wait);
+ remove_wait_queue(sk_sleep(sk), &wait);
goto out_err;
}
err = transport->notify_send_pre_block(vsk, &send_data);
if (err < 0) {
- finish_wait(sk_sleep(sk), &wait);
+ remove_wait_queue(sk_sleep(sk), &wait);
goto out_err;
}
release_sock(sk);
- timeout = schedule_timeout(timeout);
+ timeout = wait_woken(&wait, TASK_INTERRUPTIBLE, timeout);
lock_sock(sk);
if (signal_pending(current)) {
err = sock_intr_errno(timeout);
- finish_wait(sk_sleep(sk), &wait);
+ remove_wait_queue(sk_sleep(sk), &wait);
goto out_err;
} else if (timeout == 0) {
err = -EAGAIN;
- finish_wait(sk_sleep(sk), &wait);
+ remove_wait_queue(sk_sleep(sk), &wait);
goto out_err;
}
-
- prepare_to_wait(sk_sleep(sk), &wait,
- TASK_INTERRUPTIBLE);
}
- finish_wait(sk_sleep(sk), &wait);
+ remove_wait_queue(sk_sleep(sk), &wait);
/* These checks occur both as part of and after the loop
* conditional since we need to check before and after
diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c
index 9dffe0282ad4..403d86e80162 100644
--- a/net/vmw_vsock/virtio_transport.c
+++ b/net/vmw_vsock/virtio_transport.c
@@ -576,9 +576,9 @@ static int virtio_vsock_probe(struct virtio_device *vdev)
vsock->vdev = vdev;
- ret = vsock->vdev->config->find_vqs(vsock->vdev, VSOCK_VQ_MAX,
- vsock->vqs, callbacks, names,
- NULL);
+ ret = virtio_find_vqs(vsock->vdev, VSOCK_VQ_MAX,
+ vsock->vqs, callbacks, names,
+ NULL);
if (ret < 0)
goto out;
diff --git a/net/wireless/scan.c b/net/wireless/scan.c
index 14d5f0c8c45f..9f0901f3e42b 100644
--- a/net/wireless/scan.c
+++ b/net/wireless/scan.c
@@ -322,9 +322,9 @@ cfg80211_find_sched_scan_req(struct cfg80211_registered_device *rdev, u64 reqid)
{
struct cfg80211_sched_scan_request *pos;
- ASSERT_RTNL();
+ WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
- list_for_each_entry(pos, &rdev->sched_scan_req_list, list) {
+ list_for_each_entry_rcu(pos, &rdev->sched_scan_req_list, list) {
if (pos->reqid == reqid)
return pos;
}
@@ -398,13 +398,13 @@ void cfg80211_sched_scan_results(struct wiphy *wiphy, u64 reqid)
trace_cfg80211_sched_scan_results(wiphy, reqid);
/* ignore if we're not scanning */
- rtnl_lock();
+ rcu_read_lock();
request = cfg80211_find_sched_scan_req(rdev, reqid);
if (request) {
request->report_results = true;
queue_work(cfg80211_wq, &rdev->sched_scan_res_wk);
}
- rtnl_unlock();
+ rcu_read_unlock();
}
EXPORT_SYMBOL(cfg80211_sched_scan_results);
diff --git a/net/wireless/util.c b/net/wireless/util.c
index 7198373e2920..4992f1025c9d 100644
--- a/net/wireless/util.c
+++ b/net/wireless/util.c
@@ -454,6 +454,8 @@ int ieee80211_data_to_8023_exthdr(struct sk_buff *skb, struct ethhdr *ehdr,
if (iftype == NL80211_IFTYPE_MESH_POINT)
skb_copy_bits(skb, hdrlen, &mesh_flags, 1);
+ mesh_flags &= MESH_FLAGS_AE;
+
switch (hdr->frame_control &
cpu_to_le16(IEEE80211_FCTL_TODS | IEEE80211_FCTL_FROMDS)) {
case cpu_to_le16(IEEE80211_FCTL_TODS):
@@ -469,9 +471,9 @@ int ieee80211_data_to_8023_exthdr(struct sk_buff *skb, struct ethhdr *ehdr,
iftype != NL80211_IFTYPE_STATION))
return -1;
if (iftype == NL80211_IFTYPE_MESH_POINT) {
- if (mesh_flags & MESH_FLAGS_AE_A4)
+ if (mesh_flags == MESH_FLAGS_AE_A4)
return -1;
- if (mesh_flags & MESH_FLAGS_AE_A5_A6) {
+ if (mesh_flags == MESH_FLAGS_AE_A5_A6) {
skb_copy_bits(skb, hdrlen +
offsetof(struct ieee80211s_hdr, eaddr1),
tmp.h_dest, 2 * ETH_ALEN);
@@ -487,9 +489,9 @@ int ieee80211_data_to_8023_exthdr(struct sk_buff *skb, struct ethhdr *ehdr,
ether_addr_equal(tmp.h_source, addr)))
return -1;
if (iftype == NL80211_IFTYPE_MESH_POINT) {
- if (mesh_flags & MESH_FLAGS_AE_A5_A6)
+ if (mesh_flags == MESH_FLAGS_AE_A5_A6)
return -1;
- if (mesh_flags & MESH_FLAGS_AE_A4)
+ if (mesh_flags == MESH_FLAGS_AE_A4)
skb_copy_bits(skb, hdrlen +
offsetof(struct ieee80211s_hdr, eaddr1),
tmp.h_source, ETH_ALEN);
diff --git a/net/wireless/wext-core.c b/net/wireless/wext-core.c
index 1a4db6790e20..6cdb054484d6 100644
--- a/net/wireless/wext-core.c
+++ b/net/wireless/wext-core.c
@@ -914,13 +914,12 @@ int call_commit_handler(struct net_device *dev)
* Main IOCTl dispatcher.
* Check the type of IOCTL and call the appropriate wrapper...
*/
-static int wireless_process_ioctl(struct net *net, struct ifreq *ifr,
+static int wireless_process_ioctl(struct net *net, struct iwreq *iwr,
unsigned int cmd,
struct iw_request_info *info,
wext_ioctl_func standard,
wext_ioctl_func private)
{
- struct iwreq *iwr = (struct iwreq *) ifr;
struct net_device *dev;
iw_handler handler;
@@ -928,7 +927,7 @@ static int wireless_process_ioctl(struct net *net, struct ifreq *ifr,
* The copy_to/from_user() of ifr is also dealt with in there */
/* Make sure the device exist */
- if ((dev = __dev_get_by_name(net, ifr->ifr_name)) == NULL)
+ if ((dev = __dev_get_by_name(net, iwr->ifr_name)) == NULL)
return -ENODEV;
/* A bunch of special cases, then the generic case...
@@ -957,9 +956,6 @@ static int wireless_process_ioctl(struct net *net, struct ifreq *ifr,
else if (private)
return private(dev, iwr, cmd, info, handler);
}
- /* Old driver API : call driver ioctl handler */
- if (dev->netdev_ops->ndo_do_ioctl)
- return dev->netdev_ops->ndo_do_ioctl(dev, ifr, cmd);
return -EOPNOTSUPP;
}
@@ -977,7 +973,7 @@ static int wext_permission_check(unsigned int cmd)
}
/* entry point from dev ioctl */
-static int wext_ioctl_dispatch(struct net *net, struct ifreq *ifr,
+static int wext_ioctl_dispatch(struct net *net, struct iwreq *iwr,
unsigned int cmd, struct iw_request_info *info,
wext_ioctl_func standard,
wext_ioctl_func private)
@@ -987,9 +983,9 @@ static int wext_ioctl_dispatch(struct net *net, struct ifreq *ifr,
if (ret)
return ret;
- dev_load(net, ifr->ifr_name);
+ dev_load(net, iwr->ifr_name);
rtnl_lock();
- ret = wireless_process_ioctl(net, ifr, cmd, info, standard, private);
+ ret = wireless_process_ioctl(net, iwr, cmd, info, standard, private);
rtnl_unlock();
return ret;
@@ -1039,18 +1035,18 @@ static int ioctl_standard_call(struct net_device * dev,
}
-int wext_handle_ioctl(struct net *net, struct ifreq *ifr, unsigned int cmd,
+int wext_handle_ioctl(struct net *net, struct iwreq *iwr, unsigned int cmd,
void __user *arg)
{
struct iw_request_info info = { .cmd = cmd, .flags = 0 };
int ret;
- ret = wext_ioctl_dispatch(net, ifr, cmd, &info,
+ ret = wext_ioctl_dispatch(net, iwr, cmd, &info,
ioctl_standard_call,
ioctl_private_call);
if (ret >= 0 &&
IW_IS_GET(cmd) &&
- copy_to_user(arg, ifr, sizeof(struct iwreq)))
+ copy_to_user(arg, iwr, sizeof(struct iwreq)))
return -EFAULT;
return ret;
@@ -1107,7 +1103,7 @@ int compat_wext_handle_ioctl(struct net *net, unsigned int cmd,
info.cmd = cmd;
info.flags = IW_REQUEST_FLAG_COMPAT;
- ret = wext_ioctl_dispatch(net, (struct ifreq *) &iwr, cmd, &info,
+ ret = wext_ioctl_dispatch(net, &iwr, cmd, &info,
compat_standard_call,
compat_private_call);
diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c
index 8b911c29860e..5a1a98df3499 100644
--- a/net/x25/af_x25.c
+++ b/net/x25/af_x25.c
@@ -1791,32 +1791,40 @@ void x25_kill_by_neigh(struct x25_neigh *nb)
static int __init x25_init(void)
{
- int rc = proto_register(&x25_proto, 0);
+ int rc;
- if (rc != 0)
+ rc = proto_register(&x25_proto, 0);
+ if (rc)
goto out;
rc = sock_register(&x25_family_ops);
- if (rc != 0)
+ if (rc)
goto out_proto;
dev_add_pack(&x25_packet_type);
rc = register_netdevice_notifier(&x25_dev_notifier);
- if (rc != 0)
+ if (rc)
goto out_sock;
- pr_info("Linux Version 0.2\n");
+ rc = x25_register_sysctl();
+ if (rc)
+ goto out_dev;
- x25_register_sysctl();
rc = x25_proc_init();
- if (rc != 0)
- goto out_dev;
+ if (rc)
+ goto out_sysctl;
+
+ pr_info("Linux Version 0.2\n");
+
out:
return rc;
+out_sysctl:
+ x25_unregister_sysctl();
out_dev:
unregister_netdevice_notifier(&x25_dev_notifier);
out_sock:
+ dev_remove_pack(&x25_packet_type);
sock_unregister(AF_X25);
out_proto:
proto_unregister(&x25_proto);
diff --git a/net/x25/sysctl_net_x25.c b/net/x25/sysctl_net_x25.c
index a06dfe143c67..ba078c85f0a1 100644
--- a/net/x25/sysctl_net_x25.c
+++ b/net/x25/sysctl_net_x25.c
@@ -73,9 +73,12 @@ static struct ctl_table x25_table[] = {
{ },
};
-void __init x25_register_sysctl(void)
+int __init x25_register_sysctl(void)
{
x25_table_header = register_net_sysctl(&init_net, "net/x25", x25_table);
+ if (!x25_table_header)
+ return -ENOMEM;
+ return 0;
}
void x25_unregister_sysctl(void)
diff --git a/net/xfrm/Makefile b/net/xfrm/Makefile
index abf81b329dc1..55b2ac300995 100644
--- a/net/xfrm/Makefile
+++ b/net/xfrm/Makefile
@@ -4,8 +4,7 @@
obj-$(CONFIG_XFRM) := xfrm_policy.o xfrm_state.o xfrm_hash.o \
xfrm_input.o xfrm_output.o \
- xfrm_sysctl.o xfrm_replay.o
-obj-$(CONFIG_XFRM_OFFLOAD) += xfrm_device.o
+ xfrm_sysctl.o xfrm_replay.o xfrm_device.o
obj-$(CONFIG_XFRM_STATISTICS) += xfrm_proc.o
obj-$(CONFIG_XFRM_ALGO) += xfrm_algo.o
obj-$(CONFIG_XFRM_USER) += xfrm_user.o
diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c
index 8ec8a3fcf8d4..5aba03685d7d 100644
--- a/net/xfrm/xfrm_device.c
+++ b/net/xfrm/xfrm_device.c
@@ -22,6 +22,7 @@
#include <net/xfrm.h>
#include <linux/notifier.h>
+#ifdef CONFIG_XFRM_OFFLOAD
int validate_xmit_xfrm(struct sk_buff *skb, netdev_features_t features)
{
int err;
@@ -137,6 +138,7 @@ ok:
return true;
}
EXPORT_SYMBOL_GPL(xfrm_dev_offload_ok);
+#endif
int xfrm_dev_register(struct net_device *dev)
{
@@ -170,7 +172,7 @@ static int xfrm_dev_feat_change(struct net_device *dev)
static int xfrm_dev_down(struct net_device *dev)
{
- if (dev->hw_features & NETIF_F_HW_ESP)
+ if (dev->features & NETIF_F_HW_ESP)
xfrm_dev_state_flush(dev_net(dev), dev, true);
xfrm_garbage_collect(dev_net(dev));
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index b00a1d5a7f52..643a18f72032 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -1006,10 +1006,6 @@ int xfrm_policy_flush(struct net *net, u8 type, bool task_valid)
err = -ESRCH;
out:
spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
-
- if (cnt)
- xfrm_garbage_collect(net);
-
return err;
}
EXPORT_SYMBOL(xfrm_policy_flush);
@@ -1797,43 +1793,6 @@ free_dst:
goto out;
}
-#ifdef CONFIG_XFRM_SUB_POLICY
-static int xfrm_dst_alloc_copy(void **target, const void *src, int size)
-{
- if (!*target) {
- *target = kmalloc(size, GFP_ATOMIC);
- if (!*target)
- return -ENOMEM;
- }
-
- memcpy(*target, src, size);
- return 0;
-}
-#endif
-
-static int xfrm_dst_update_parent(struct dst_entry *dst,
- const struct xfrm_selector *sel)
-{
-#ifdef CONFIG_XFRM_SUB_POLICY
- struct xfrm_dst *xdst = (struct xfrm_dst *)dst;
- return xfrm_dst_alloc_copy((void **)&(xdst->partner),
- sel, sizeof(*sel));
-#else
- return 0;
-#endif
-}
-
-static int xfrm_dst_update_origin(struct dst_entry *dst,
- const struct flowi *fl)
-{
-#ifdef CONFIG_XFRM_SUB_POLICY
- struct xfrm_dst *xdst = (struct xfrm_dst *)dst;
- return xfrm_dst_alloc_copy((void **)&(xdst->origin), fl, sizeof(*fl));
-#else
- return 0;
-#endif
-}
-
static int xfrm_expand_policies(const struct flowi *fl, u16 family,
struct xfrm_policy **pols,
int *num_pols, int *num_xfrms)
@@ -1905,16 +1864,6 @@ xfrm_resolve_and_create_bundle(struct xfrm_policy **pols, int num_pols,
xdst = (struct xfrm_dst *)dst;
xdst->num_xfrms = err;
- if (num_pols > 1)
- err = xfrm_dst_update_parent(dst, &pols[1]->selector);
- else
- err = xfrm_dst_update_origin(dst, fl);
- if (unlikely(err)) {
- dst_free(dst);
- XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTBUNDLECHECKERROR);
- return ERR_PTR(err);
- }
-
xdst->num_pols = num_pols;
memcpy(xdst->pols, pols, sizeof(struct xfrm_policy *) * num_pols);
xdst->policy_genid = atomic_read(&pols[0]->genid);
diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
index fc3c5aa38754..2e291bc5f1fc 100644
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@ -1383,6 +1383,8 @@ static struct xfrm_state *xfrm_state_clone(struct xfrm_state *orig)
x->curlft.add_time = orig->curlft.add_time;
x->km.state = orig->km.state;
x->km.seq = orig->km.seq;
+ x->replay = orig->replay;
+ x->preplay = orig->preplay;
return x;
diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c
index 38614df33ec8..86116e9aaf3d 100644
--- a/net/xfrm/xfrm_user.c
+++ b/net/xfrm/xfrm_user.c
@@ -2027,6 +2027,7 @@ static int xfrm_flush_policy(struct sk_buff *skb, struct nlmsghdr *nlh,
return 0;
return err;
}
+ xfrm_garbage_collect(net);
c.data.type = type;
c.event = nlh->nlmsg_type;
OpenPOWER on IntegriCloud