136 files changed, 5584 insertions, 2820 deletions
diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c
index a1e273af6fc8..82a116ba590e 100644
--- a/net/8021q/vlan.c
+++ b/net/8021q/vlan.c
@@ -290,6 +290,10 @@ static void vlan_sync_address(struct net_device *dev,
 	if (ether_addr_equal(vlan->real_dev_addr, dev->dev_addr))
 		return;
 
+	/* vlan continues to inherit address of lower device */
+	if (vlan_dev_inherit_address(vlandev, dev))
+		goto out;
+
 	/* vlan address was different from the old address and is equal to
 	 * the new address */
 	if (!ether_addr_equal(vlandev->dev_addr, vlan->real_dev_addr) &&
@@ -302,6 +306,7 @@ static void vlan_sync_address(struct net_device *dev,
 	    !ether_addr_equal(vlandev->dev_addr, dev->dev_addr))
 		dev_uc_add(dev, vlandev->dev_addr);
 
+out:
 	ether_addr_copy(vlan->real_dev_addr, dev->dev_addr);
 }
 
diff --git a/net/8021q/vlan.h b/net/8021q/vlan.h
index 9d010a09ab98..cc1557978066 100644
--- a/net/8021q/vlan.h
+++ b/net/8021q/vlan.h
@@ -109,6 +109,8 @@ int vlan_check_real_dev(struct net_device *real_dev,
 void vlan_setup(struct net_device *dev);
 int register_vlan_dev(struct net_device *dev);
 void unregister_vlan_dev(struct net_device *dev, struct list_head *head);
+bool vlan_dev_inherit_address(struct net_device *dev,
+			      struct net_device *real_dev);
 
 static inline u32 vlan_get_ingress_priority(struct net_device *dev,
 					    u16 vlan_tci)
diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c
index e7e62570bdb8..86ae75b77390 100644
--- a/net/8021q/vlan_dev.c
+++ b/net/8021q/vlan_dev.c
@@ -245,6 +245,17 @@ void vlan_dev_get_realdev_name(const struct net_device *dev, char *result)
 	strncpy(result, vlan_dev_priv(dev)->real_dev->name, 23);
 }
 
+bool vlan_dev_inherit_address(struct net_device *dev,
+			      struct net_device *real_dev)
+{
+	if (dev->addr_assign_type != NET_ADDR_STOLEN)
+		return false;
+
+	ether_addr_copy(dev->dev_addr, real_dev->dev_addr);
+	call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
+	return true;
+}
+
 static int vlan_dev_open(struct net_device *dev)
 {
 	struct vlan_dev_priv *vlan = vlan_dev_priv(dev);
@@ -255,7 +266,8 @@ static int vlan_dev_open(struct net_device *dev)
 	    !(vlan->flags & VLAN_FLAG_LOOSE_BINDING))
 		return -ENETDOWN;
 
-	if (!ether_addr_equal(dev->dev_addr, real_dev->dev_addr)) {
+	if (!ether_addr_equal(dev->dev_addr, real_dev->dev_addr) &&
+	    !vlan_dev_inherit_address(dev, real_dev)) {
 		err = dev_uc_add(real_dev, dev->dev_addr);
 		if (err < 0)
 			goto out;
@@ -560,8 +572,10 @@ static int vlan_dev_init(struct net_device *dev)
 	/* ipv6 shared card related stuff */
 	dev->dev_id = real_dev->dev_id;
 
-	if (is_zero_ether_addr(dev->dev_addr))
-		eth_hw_addr_inherit(dev, real_dev);
+	if (is_zero_ether_addr(dev->dev_addr)) {
+		ether_addr_copy(dev->dev_addr, real_dev->dev_addr);
+		dev->addr_assign_type = NET_ADDR_STOLEN;
+	}
 	if (is_zero_ether_addr(dev->broadcast))
 		memcpy(dev->broadcast, real_dev->broadcast, dev->addr_len);
 
diff --git a/net/9p/client.c b/net/9p/client.c
index ea79ee9a7348..3fc94a49ccd5 100644
--- a/net/9p/client.c
+++ b/net/9p/client.c
@@ -518,10 +518,10 @@ static int p9_check_errors(struct p9_client *c, struct p9_req_t *req)
 		if (err)
 			goto out_err;
 
-		if (p9_is_proto_dotu(c))
+		if (p9_is_proto_dotu(c) && ecode < 512)
 			err = -ecode;
 
-		if (!err || !IS_ERR_VALUE(err)) {
+		if (!err) {
 			err = p9_errstr2errno(ename, strlen(ename));
 
 			p9_debug(P9_DEBUG_9P, "<<< RERROR (%d) %s\n",
@@ -605,10 +605,10 @@ static int p9_check_zc_errors(struct p9_client *c, struct p9_req_t *req,
 		if (err)
 			goto out_err;
 
-		if (p9_is_proto_dotu(c))
+		if (p9_is_proto_dotu(c) && ecode < 512)
 			err = -ecode;
 
-		if (!err || !IS_ERR_VALUE(err)) {
+		if (!err) {
 			err = p9_errstr2errno(ename, strlen(ename));
 
 			p9_debug(P9_DEBUG_9P, "<<< RERROR (%d) %s\n",
diff --git a/net/atm/signaling.c b/net/atm/signaling.c
index 4fd6af47383a..adb6e3d21b1e 100644
--- a/net/atm/signaling.c
+++ b/net/atm/signaling.c
@@ -124,7 +124,7 @@ as_indicate_complete:
 		break;
 	case as_addparty:
 	case as_dropparty:
-		sk->sk_err_soft = msg->reply;
+		sk->sk_err_soft = -msg->reply;
 					/* < 0 failure, otherwise ep_ref */
 		clear_bit(ATM_VF_WAITING, &vcc->flags);
 		break;
diff --git a/net/atm/svc.c b/net/atm/svc.c
index 3fa0a9ee98d1..878563a8354d 100644
--- a/net/atm/svc.c
+++ b/net/atm/svc.c
@@ -546,7 +546,7 @@ static int svc_addparty(struct socket *sock, struct sockaddr *sockaddr,
 		schedule();
 	}
 	finish_wait(sk_sleep(sk), &wait);
-	error = xchg(&sk->sk_err_soft, 0);
+	error = -xchg(&sk->sk_err_soft, 0);
 out:
 	release_sock(sk);
 	return error;
@@ -573,7 +573,7 @@ static int svc_dropparty(struct socket *sock, int ep_ref)
 		error = -EUNATCH;
 		goto out;
 	}
-	error = xchg(&sk->sk_err_soft, 0);
+	error = -xchg(&sk->sk_err_soft, 0);
 out:
 	release_sock(sk);
 	return error;
diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c
index 7f98a9d39883..ce2f203048d3 100644
--- a/net/batman-adv/bat_iv_ogm.c
+++ b/net/batman-adv/bat_iv_ogm.c
@@ -157,10 +157,8 @@ static int batadv_iv_ogm_orig_add_if(struct batadv_orig_node *orig_node,
 	orig_node->bat_iv.bcast_own = data_ptr;
 
 	data_ptr = kmalloc_array(max_if_num, sizeof(u8), GFP_ATOMIC);
-	if (!data_ptr) {
-		kfree(orig_node->bat_iv.bcast_own);
+	if (!data_ptr)
 		goto unlock;
-	}
 
 	memcpy(data_ptr, orig_node->bat_iv.bcast_own_sum,
 	       (max_if_num - 1) * sizeof(u8));
@@ -1182,9 +1180,10 @@ static bool batadv_iv_ogm_calc_tq(struct batadv_orig_node *orig_node,
 	u8 total_count;
 	u8 orig_eq_count, neigh_rq_count, neigh_rq_inv, tq_own;
 	unsigned int neigh_rq_inv_cube, neigh_rq_max_cube;
-	int tq_asym_penalty, inv_asym_penalty, if_num;
+	int if_num;
+	unsigned int tq_asym_penalty, inv_asym_penalty;
 	unsigned int combined_tq;
-	int tq_iface_penalty;
+	unsigned int tq_iface_penalty;
 	bool ret = false;
 
 	/* find corresponding one hop neighbor */
diff --git a/net/batman-adv/bat_v.c b/net/batman-adv/bat_v.c
index 3ff8bd1b7bdc..0a12e5cdd65d 100644
--- a/net/batman-adv/bat_v.c
+++ b/net/batman-adv/bat_v.c
@@ -27,6 +27,7 @@
 #include <linux/rculist.h>
 #include <linux/rcupdate.h>
 #include <linux/seq_file.h>
+#include <linux/stddef.h>
 #include <linux/types.h>
 #include <linux/workqueue.h>
 
@@ -39,6 +40,16 @@
 
 static void batadv_v_iface_activate(struct batadv_hard_iface *hard_iface)
 {
+	struct batadv_priv *bat_priv = netdev_priv(hard_iface->soft_iface);
+	struct batadv_hard_iface *primary_if;
+
+	primary_if = batadv_primary_if_get_selected(bat_priv);
+
+	if (primary_if) {
+		batadv_v_elp_iface_activate(primary_if, hard_iface);
+		batadv_hardif_put(primary_if);
+	}
+
 	/* B.A.T.M.A.N. V does not use any queuing mechanism, therefore it can
 	 * set the interface as ACTIVE right away, without any risk of race
 	 * condition
@@ -72,16 +83,34 @@ static void batadv_v_iface_disable(struct batadv_hard_iface *hard_iface)
 	batadv_v_elp_iface_disable(hard_iface);
 }
 
-static void batadv_v_iface_update_mac(struct batadv_hard_iface *hard_iface)
-{
-}
-
 static void batadv_v_primary_iface_set(struct batadv_hard_iface *hard_iface)
 {
 	batadv_v_elp_primary_iface_set(hard_iface);
 	batadv_v_ogm_primary_iface_set(hard_iface);
 }
 
+/**
+ * batadv_v_iface_update_mac - react to hard-interface MAC address change
+ * @hard_iface: the modified interface
+ *
+ * If the modified interface is the primary one, update the originator
+ * address in the ELP and OGM messages to reflect the new MAC address.
+ */
+static void batadv_v_iface_update_mac(struct batadv_hard_iface *hard_iface)
+{
+	struct batadv_priv *bat_priv = netdev_priv(hard_iface->soft_iface);
+	struct batadv_hard_iface *primary_if;
+
+	primary_if = batadv_primary_if_get_selected(bat_priv);
+	if (primary_if != hard_iface)
+		goto out;
+
+	batadv_v_primary_iface_set(hard_iface);
+out:
+	if (primary_if)
+		batadv_hardif_put(primary_if);
+}
+
 static void
 batadv_v_hardif_neigh_init(struct batadv_hardif_neigh_node *hardif_neigh)
 {
@@ -255,14 +284,23 @@ static int batadv_v_neigh_cmp(struct batadv_neigh_node *neigh1,
 			      struct batadv_hard_iface *if_outgoing2)
 {
 	struct batadv_neigh_ifinfo *ifinfo1, *ifinfo2;
+	int ret = 0;
 
 	ifinfo1 = batadv_neigh_ifinfo_get(neigh1, if_outgoing1);
+	if (WARN_ON(!ifinfo1))
+		goto err_ifinfo1;
+
 	ifinfo2 = batadv_neigh_ifinfo_get(neigh2, if_outgoing2);
+	if (WARN_ON(!ifinfo2))
+		goto err_ifinfo2;
 
-	if (WARN_ON(!ifinfo1 || !ifinfo2))
-		return 0;
+	ret = ifinfo1->bat_v.throughput - ifinfo2->bat_v.throughput;
 
-	return ifinfo1->bat_v.throughput - ifinfo2->bat_v.throughput;
+	batadv_neigh_ifinfo_put(ifinfo2);
+err_ifinfo2:
+	batadv_neigh_ifinfo_put(ifinfo1);
+err_ifinfo1:
+	return ret;
 }
 
 static bool batadv_v_neigh_is_sob(struct batadv_neigh_node *neigh1,
@@ -272,14 +310,26 @@ static bool batadv_v_neigh_is_sob(struct batadv_neigh_node *neigh1,
 {
 	struct batadv_neigh_ifinfo *ifinfo1, *ifinfo2;
 	u32 threshold;
+	bool ret = false;
 
 	ifinfo1 = batadv_neigh_ifinfo_get(neigh1, if_outgoing1);
+	if (WARN_ON(!ifinfo1))
+		goto err_ifinfo1;
+
 	ifinfo2 = batadv_neigh_ifinfo_get(neigh2, if_outgoing2);
+	if (WARN_ON(!ifinfo2))
+		goto err_ifinfo2;
 
 	threshold = ifinfo1->bat_v.throughput / 4;
 	threshold = ifinfo1->bat_v.throughput - threshold;
 
-	return ifinfo2->bat_v.throughput > threshold;
+	ret = ifinfo2->bat_v.throughput > threshold;
+
+	batadv_neigh_ifinfo_put(ifinfo2);
+err_ifinfo2:
+	batadv_neigh_ifinfo_put(ifinfo1);
+err_ifinfo1:
+	return ret;
 }
 
 static struct batadv_algo_ops batadv_batman_v __read_mostly = {
diff --git a/net/batman-adv/bat_v_elp.c b/net/batman-adv/bat_v_elp.c
index 3844e7efd0b0..df42eb1365a0 100644
--- a/net/batman-adv/bat_v_elp.c
+++ b/net/batman-adv/bat_v_elp.c
@@ -377,6 +377,27 @@ void batadv_v_elp_iface_disable(struct batadv_hard_iface *hard_iface)
 }
 
 /**
+ * batadv_v_elp_iface_activate - update the ELP buffer belonging to the given
+ *  hard-interface
+ * @primary_iface: the new primary interface
+ * @hard_iface: interface holding the to-be-updated buffer
+ */
+void batadv_v_elp_iface_activate(struct batadv_hard_iface *primary_iface,
+				 struct batadv_hard_iface *hard_iface)
+{
+	struct batadv_elp_packet *elp_packet;
+	struct sk_buff *skb;
+
+	if (!hard_iface->bat_v.elp_skb)
+		return;
+
+	skb = hard_iface->bat_v.elp_skb;
+	elp_packet = (struct batadv_elp_packet *)skb->data;
+	ether_addr_copy(elp_packet->orig,
+			primary_iface->net_dev->dev_addr);
+}
+
+/**
  * batadv_v_elp_primary_iface_set - change internal data to reflect the new
  *  primary interface
  * @primary_iface: the new primary interface
@@ -384,8 +405,6 @@ void batadv_v_elp_iface_disable(struct batadv_hard_iface *hard_iface)
 void batadv_v_elp_primary_iface_set(struct batadv_hard_iface *primary_iface)
 {
 	struct batadv_hard_iface *hard_iface;
-	struct batadv_elp_packet *elp_packet;
-	struct sk_buff *skb;
 
 	/* update orig field of every elp iface belonging to this mesh */
 	rcu_read_lock();
@@ -393,13 +412,7 @@ void batadv_v_elp_primary_iface_set(struct batadv_hard_iface *primary_iface)
 		if (primary_iface->soft_iface != hard_iface->soft_iface)
 			continue;
 
-		if (!hard_iface->bat_v.elp_skb)
-			continue;
-
-		skb = hard_iface->bat_v.elp_skb;
-		elp_packet = (struct batadv_elp_packet *)skb->data;
-		ether_addr_copy(elp_packet->orig,
-				primary_iface->net_dev->dev_addr);
+		batadv_v_elp_iface_activate(primary_iface, hard_iface);
 	}
 	rcu_read_unlock();
 }
diff --git a/net/batman-adv/bat_v_elp.h b/net/batman-adv/bat_v_elp.h
index e95f1bca0785..cc130b2d05e5 100644
--- a/net/batman-adv/bat_v_elp.h
+++ b/net/batman-adv/bat_v_elp.h
@@ -25,6 +25,8 @@ struct work_struct;
 
 int batadv_v_elp_iface_enable(struct batadv_hard_iface *hard_iface);
 void batadv_v_elp_iface_disable(struct batadv_hard_iface *hard_iface);
+void batadv_v_elp_iface_activate(struct batadv_hard_iface *primary_iface,
+				 struct batadv_hard_iface *hard_iface);
 void batadv_v_elp_primary_iface_set(struct batadv_hard_iface *primary_iface);
 int batadv_v_elp_packet_recv(struct sk_buff *skb,
 			     struct batadv_hard_iface *if_incoming);
diff --git a/net/batman-adv/originator.c b/net/batman-adv/originator.c
index 1ff4ee473966..7f51bc2c06eb 100644
--- a/net/batman-adv/originator.c
+++ b/net/batman-adv/originator.c
@@ -619,6 +619,8 @@ batadv_neigh_node_new(struct batadv_orig_node *orig_node,
 	struct batadv_neigh_node *neigh_node;
 	struct batadv_hardif_neigh_node *hardif_neigh = NULL;
 
+	spin_lock_bh(&orig_node->neigh_list_lock);
+
 	neigh_node = batadv_neigh_node_get(orig_node, hard_iface, neigh_addr);
 	if (neigh_node)
 		goto out;
@@ -650,15 +652,15 @@ batadv_neigh_node_new(struct batadv_orig_node *orig_node,
 	kref_init(&neigh_node->refcount);
 	kref_get(&neigh_node->refcount);
 
-	spin_lock_bh(&orig_node->neigh_list_lock);
 	hlist_add_head_rcu(&neigh_node->list, &orig_node->neigh_list);
-	spin_unlock_bh(&orig_node->neigh_list_lock);
 
 	batadv_dbg(BATADV_DBG_BATMAN, orig_node->bat_priv,
 		   "Creating new neighbor %pM for orig_node %pM on interface %s\n",
 		   neigh_addr, orig_node->orig, hard_iface->net_dev->name);
 
 out:
+	spin_unlock_bh(&orig_node->neigh_list_lock);
+
 	if (hardif_neigh)
 		batadv_hardif_neigh_put(hardif_neigh);
 	return neigh_node;
diff --git a/net/batman-adv/routing.c b/net/batman-adv/routing.c
index ae850f2d11cb..e3857ed4057f 100644
--- a/net/batman-adv/routing.c
+++ b/net/batman-adv/routing.c
@@ -601,6 +601,7 @@ static int batadv_route_unicast_packet(struct sk_buff *skb,
 	struct batadv_unicast_packet *unicast_packet;
 	struct ethhdr *ethhdr = eth_hdr(skb);
 	int res, hdr_len, ret = NET_RX_DROP;
+	unsigned int len;
 
 	unicast_packet = (struct batadv_unicast_packet *)skb->data;
 
@@ -641,6 +642,7 @@ static int batadv_route_unicast_packet(struct sk_buff *skb,
 	if (hdr_len > 0)
 		batadv_skb_set_priority(skb, hdr_len);
 
+	len = skb->len;
 	res = batadv_send_skb_to_orig(skb, orig_node, recv_if);
 
 	/* translate transmit result into receive result */
@@ -648,7 +650,7 @@ static int batadv_route_unicast_packet(struct sk_buff *skb,
 		/* skb was transmitted and consumed */
 		batadv_inc_counter(bat_priv, BATADV_CNT_FORWARD);
 		batadv_add_counter(bat_priv, BATADV_CNT_FORWARD_BYTES,
-				   skb->len + ETH_HLEN);
+				   len + ETH_HLEN);
 
 		ret = NET_RX_SUCCESS;
 	} else if (res == NET_XMIT_POLICED) {
diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c
index dcea4f4c62b3..c18080ad4085 100644
--- a/net/bridge/br_fdb.c
+++ b/net/bridge/br_fdb.c
@@ -279,6 +279,8 @@ void br_fdb_change_mac_address(struct net_bridge *br, const u8 *newaddr)
 	 * change from under us.
 	 */
 	list_for_each_entry(v, &vg->vlan_list, vlist) {
+		if (!br_vlan_should_use(v))
+			continue;
 		f = __br_fdb_get(br, br->dev->dev_addr, v->vid);
 		if (f && f->is_local && !f->dst)
 			fdb_delete_local(br, NULL, f);
diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c
index dcc18c6f7cf9..55d2bfee16d7 100644
--- a/net/ceph/ceph_common.c
+++ b/net/ceph/ceph_common.c
@@ -651,7 +651,7 @@ EXPORT_SYMBOL(ceph_destroy_client);
 /*
  * true if we have the mon map (and have thus joined the cluster)
  */
-static int have_mon_and_osd_map(struct ceph_client *client)
+static bool have_mon_and_osd_map(struct ceph_client *client)
 {
 	return client->monc.monmap && client->monc.monmap->epoch &&
 	       client->osdc.osdmap && client->osdc.osdmap->epoch;
diff --git a/net/ceph/ceph_strings.c b/net/ceph/ceph_strings.c
index 139a9cb19b0c..3773a4fa11e3 100644
--- a/net/ceph/ceph_strings.c
+++ b/net/ceph/ceph_strings.c
@@ -27,6 +27,22 @@ __CEPH_FORALL_OSD_OPS(GENERATE_CASE)
 	}
 }
 
+const char *ceph_osd_watch_op_name(int o)
+{
+	switch (o) {
+	case CEPH_OSD_WATCH_OP_UNWATCH:
+		return "unwatch";
+	case CEPH_OSD_WATCH_OP_WATCH:
+		return "watch";
+	case CEPH_OSD_WATCH_OP_RECONNECT:
+		return "reconnect";
+	case CEPH_OSD_WATCH_OP_PING:
+		return "ping";
+	default:
+		return "???";
+	}
+}
+
 const char *ceph_osd_state_name(int s)
 {
 	switch (s) {
diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c
index b902fbc7863e..e77b04ca7802 100644
--- a/net/ceph/debugfs.c
+++ b/net/ceph/debugfs.c
@@ -54,24 +54,25 @@ static int osdmap_show(struct seq_file *s, void *p)
 {
 	int i;
 	struct ceph_client *client = s->private;
-	struct ceph_osdmap *map = client->osdc.osdmap;
+	struct ceph_osd_client *osdc = &client->osdc;
+	struct ceph_osdmap *map = osdc->osdmap;
 	struct rb_node *n;
 
 	if (map == NULL)
 		return 0;
 
-	seq_printf(s, "epoch %d\n", map->epoch);
-	seq_printf(s, "flags%s%s\n",
-		   (map->flags & CEPH_OSDMAP_NEARFULL) ?  " NEARFULL" : "",
-		   (map->flags & CEPH_OSDMAP_FULL) ?  " FULL" : "");
+	down_read(&osdc->lock);
+	seq_printf(s, "epoch %d flags 0x%x\n", map->epoch, map->flags);
 
 	for (n = rb_first(&map->pg_pools); n; n = rb_next(n)) {
-		struct ceph_pg_pool_info *pool =
+		struct ceph_pg_pool_info *pi =
 			rb_entry(n, struct ceph_pg_pool_info, node);
 
-		seq_printf(s, "pool %lld pg_num %u (%d) read_tier %lld write_tier %lld\n",
-			   pool->id, pool->pg_num, pool->pg_num_mask,
-			   pool->read_tier, pool->write_tier);
+		seq_printf(s, "pool %lld '%s' type %d size %d min_size %d pg_num %u pg_num_mask %d flags 0x%llx lfor %u read_tier %lld write_tier %lld\n",
+			   pi->id, pi->name, pi->type, pi->size, pi->min_size,
+			   pi->pg_num, pi->pg_num_mask, pi->flags,
+			   pi->last_force_request_resend, pi->read_tier,
+			   pi->write_tier);
 	}
 	for (i = 0; i < map->max_osd; i++) {
 		struct ceph_entity_addr *addr = &map->osd_addr[i];
@@ -103,6 +104,7 @@ static int osdmap_show(struct seq_file *s, void *p)
 			   pg->pgid.seed, pg->primary_temp.osd);
 	}
 
+	up_read(&osdc->lock);
 	return 0;
 }
 
@@ -126,6 +128,7 @@ static int monc_show(struct seq_file *s, void *p)
 					CEPH_SUBSCRIBE_ONETIME ?  "" : "+"));
 		seq_putc(s, '\n');
 	}
+	seq_printf(s, "fs_cluster_id %d\n", monc->fs_cluster_id);
 
 	for (rp = rb_first(&monc->generic_request_tree); rp; rp = rb_next(rp)) {
 		__u16 op;
@@ -143,43 +146,113 @@ static int monc_show(struct seq_file *s, void *p)
 	return 0;
 }
 
-static int osdc_show(struct seq_file *s, void *pp)
+static void dump_target(struct seq_file *s, struct ceph_osd_request_target *t)
 {
-	struct ceph_client *client = s->private;
-	struct ceph_osd_client *osdc = &client->osdc;
-	struct rb_node *p;
+	int i;
 
-	mutex_lock(&osdc->request_mutex);
-	for (p = rb_first(&osdc->requests); p; p = rb_next(p)) {
-		struct ceph_osd_request *req;
-		unsigned int i;
-		int opcode;
+	seq_printf(s, "osd%d\t%llu.%x\t[", t->osd, t->pgid.pool, t->pgid.seed);
+	for (i = 0; i < t->up.size; i++)
+		seq_printf(s, "%s%d", (!i ? "" : ","), t->up.osds[i]);
+	seq_printf(s, "]/%d\t[", t->up.primary);
+	for (i = 0; i < t->acting.size; i++)
+		seq_printf(s, "%s%d", (!i ? "" : ","), t->acting.osds[i]);
+	seq_printf(s, "]/%d\t%*pE\t0x%x", t->acting.primary,
+		   t->target_oid.name_len, t->target_oid.name, t->flags);
+	if (t->paused)
+		seq_puts(s, "\tP");
+}
 
-		req = rb_entry(p, struct ceph_osd_request, r_node);
+static void dump_request(struct seq_file *s, struct ceph_osd_request *req)
+{
+	int i;
 
-		seq_printf(s, "%lld\tosd%d\t%lld.%x\t", req->r_tid,
-			   req->r_osd ? req->r_osd->o_osd : -1,
-			   req->r_pgid.pool, req->r_pgid.seed);
+	seq_printf(s, "%llu\t", req->r_tid);
+	dump_target(s, &req->r_t);
 
-		seq_printf(s, "%.*s", req->r_base_oid.name_len,
-			   req->r_base_oid.name);
+	seq_printf(s, "\t%d\t%u'%llu", req->r_attempts,
+		   le32_to_cpu(req->r_replay_version.epoch),
+		   le64_to_cpu(req->r_replay_version.version));
 
-		if (req->r_reassert_version.epoch)
-			seq_printf(s, "\t%u'%llu",
-			   (unsigned int)le32_to_cpu(req->r_reassert_version.epoch),
-			   le64_to_cpu(req->r_reassert_version.version));
-		else
-			seq_printf(s, "\t");
+	for (i = 0; i < req->r_num_ops; i++) {
+		struct ceph_osd_req_op *op = &req->r_ops[i];
+
+		seq_printf(s, "%s%s", (i == 0 ? "\t" : ","),
+			   ceph_osd_op_name(op->op));
+		if (op->op == CEPH_OSD_OP_WATCH)
+			seq_printf(s, "-%s",
+				   ceph_osd_watch_op_name(op->watch.op));
+	}
+
+	seq_putc(s, '\n');
+}
+
+static void dump_requests(struct seq_file *s, struct ceph_osd *osd)
+{
+	struct rb_node *n;
+
+	mutex_lock(&osd->lock);
+	for (n = rb_first(&osd->o_requests); n; n = rb_next(n)) {
+		struct ceph_osd_request *req =
+		    rb_entry(n, struct ceph_osd_request, r_node);
+
+		dump_request(s, req);
+	}
+
+	mutex_unlock(&osd->lock);
+}
 
-		for (i = 0; i < req->r_num_ops; i++) {
-			opcode = req->r_ops[i].op;
-			seq_printf(s, "%s%s", (i == 0 ? "\t" : ","),
-				   ceph_osd_op_name(opcode));
-		}
+static void dump_linger_request(struct seq_file *s,
+				struct ceph_osd_linger_request *lreq)
+{
+	seq_printf(s, "%llu\t", lreq->linger_id);
+	dump_target(s, &lreq->t);
+
+	seq_printf(s, "\t%u\t%s%s/%d\n", lreq->register_gen,
+		   lreq->is_watch ? "W" : "N", lreq->committed ? "C" : "",
+		   lreq->last_error);
+}
+
+static void dump_linger_requests(struct seq_file *s, struct ceph_osd *osd)
+{
+	struct rb_node *n;
+
+	mutex_lock(&osd->lock);
+	for (n = rb_first(&osd->o_linger_requests); n; n = rb_next(n)) {
+		struct ceph_osd_linger_request *lreq =
+		    rb_entry(n, struct ceph_osd_linger_request, node);
+
+		dump_linger_request(s, lreq);
+	}
+
+	mutex_unlock(&osd->lock);
+}
 
-		seq_printf(s, "\n");
+static int osdc_show(struct seq_file *s, void *pp)
+{
+	struct ceph_client *client = s->private;
+	struct ceph_osd_client *osdc = &client->osdc;
+	struct rb_node *n;
+
+	down_read(&osdc->lock);
+	seq_printf(s, "REQUESTS %d homeless %d\n",
+		   atomic_read(&osdc->num_requests),
+		   atomic_read(&osdc->num_homeless));
+	for (n = rb_first(&osdc->osds); n; n = rb_next(n)) {
+		struct ceph_osd *osd = rb_entry(n, struct ceph_osd, o_node);
+
+		dump_requests(s, osd);
 	}
-	mutex_unlock(&osdc->request_mutex);
+	dump_requests(s, &osdc->homeless_osd);
+
+	seq_puts(s, "LINGER REQUESTS\n");
+	for (n = rb_first(&osdc->osds); n; n = rb_next(n)) {
+		struct ceph_osd *osd = rb_entry(n, struct ceph_osd, o_node);
+
+		dump_linger_requests(s, osd);
+	}
+	dump_linger_requests(s, &osdc->homeless_osd);
+
+	up_read(&osdc->lock);
 	return 0;
 }
 
diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c
index cf638c009cfa..37c38a7fb5c5 100644
--- a/net/ceph/mon_client.c
+++ b/net/ceph/mon_client.c
@@ -260,20 +260,26 @@ static void __send_subscribe(struct ceph_mon_client *monc)
 	BUG_ON(num < 1); /* monmap sub is always there */
 	ceph_encode_32(&p, num);
 	for (i = 0; i < ARRAY_SIZE(monc->subs); i++) {
-		const char *s = ceph_sub_str[i];
+		char buf[32];
+		int len;
 
 		if (!monc->subs[i].want)
 			continue;
 
-		dout("%s %s start %llu flags 0x%x\n", __func__, s,
+		len = sprintf(buf, "%s", ceph_sub_str[i]);
+		if (i == CEPH_SUB_MDSMAP &&
+		    monc->fs_cluster_id != CEPH_FS_CLUSTER_ID_NONE)
+			len += sprintf(buf + len, ".%d", monc->fs_cluster_id);
+
+		dout("%s %s start %llu flags 0x%x\n", __func__, buf,
 		     le64_to_cpu(monc->subs[i].item.start),
 		     monc->subs[i].item.flags);
-		ceph_encode_string(&p, end, s, strlen(s));
+		ceph_encode_string(&p, end, buf, len);
 		memcpy(p, &monc->subs[i].item, sizeof(monc->subs[i].item));
 		p += sizeof(monc->subs[i].item);
 	}
 
-	BUG_ON(p != (end - 35 - (ARRAY_SIZE(monc->subs) - num) * 19));
+	BUG_ON(p > end);
 	msg->front.iov_len = p - msg->front.iov_base;
 	msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
 	ceph_msg_revoke(msg);
@@ -376,19 +382,13 @@ void ceph_monc_got_map(struct ceph_mon_client *monc, int sub, u32 epoch)
 }
 EXPORT_SYMBOL(ceph_monc_got_map);
 
-/*
- * Register interest in the next osdmap
- */
-void ceph_monc_request_next_osdmap(struct ceph_mon_client *monc)
+void ceph_monc_renew_subs(struct ceph_mon_client *monc)
 {
-	dout("%s have %u\n", __func__, monc->subs[CEPH_SUB_OSDMAP].have);
 	mutex_lock(&monc->mutex);
-	if (__ceph_monc_want_map(monc, CEPH_SUB_OSDMAP,
-				 monc->subs[CEPH_SUB_OSDMAP].have + 1, false))
-		__send_subscribe(monc);
+	__send_subscribe(monc);
 	mutex_unlock(&monc->mutex);
 }
-EXPORT_SYMBOL(ceph_monc_request_next_osdmap);
+EXPORT_SYMBOL(ceph_monc_renew_subs);
 
 /*
  * Wait for an osdmap with a given epoch.
@@ -478,51 +478,17 @@ out:
 /*
  * generic requests (currently statfs, mon_get_version)
  */
-static struct ceph_mon_generic_request *__lookup_generic_req(
-	struct ceph_mon_client *monc, u64 tid)
-{
-	struct ceph_mon_generic_request *req;
-	struct rb_node *n = monc->generic_request_tree.rb_node;
-
-	while (n) {
-		req = rb_entry(n, struct ceph_mon_generic_request, node);
-		if (tid < req->tid)
-			n = n->rb_left;
-		else if (tid > req->tid)
-			n = n->rb_right;
-		else
-			return req;
-	}
-	return NULL;
-}
-
-static void __insert_generic_request(struct ceph_mon_client *monc,
-			    struct ceph_mon_generic_request *new)
-{
-	struct rb_node **p = &monc->generic_request_tree.rb_node;
-	struct rb_node *parent = NULL;
-	struct ceph_mon_generic_request *req = NULL;
-
-	while (*p) {
-		parent = *p;
-		req = rb_entry(parent, struct ceph_mon_generic_request, node);
-		if (new->tid < req->tid)
-			p = &(*p)->rb_left;
-		else if (new->tid > req->tid)
-			p = &(*p)->rb_right;
-		else
-			BUG();
-	}
-
-	rb_link_node(&new->node, parent, p);
-	rb_insert_color(&new->node, &monc->generic_request_tree);
-}
+DEFINE_RB_FUNCS(generic_request, struct ceph_mon_generic_request, tid, node)
 
 static void release_generic_request(struct kref *kref)
 {
 	struct ceph_mon_generic_request *req =
 		container_of(kref, struct ceph_mon_generic_request, kref);
 
+	dout("%s greq %p request %p reply %p\n", __func__, req, req->request,
+	     req->reply);
+	WARN_ON(!RB_EMPTY_NODE(&req->node));
+
 	if (req->reply)
 		ceph_msg_put(req->reply);
 	if (req->request)
@@ -533,7 +499,8 @@ static void release_generic_request(struct kref *kref)
 
 static void put_generic_request(struct ceph_mon_generic_request *req)
 {
-	kref_put(&req->kref, release_generic_request);
+	if (req)
+		kref_put(&req->kref, release_generic_request);
 }
 
 static void get_generic_request(struct ceph_mon_generic_request *req)
@@ -541,6 +508,103 @@ static void get_generic_request(struct ceph_mon_generic_request *req)
 	kref_get(&req->kref);
 }
 
+static struct ceph_mon_generic_request *
+alloc_generic_request(struct ceph_mon_client *monc, gfp_t gfp)
+{
+	struct ceph_mon_generic_request *req;
+
+	req = kzalloc(sizeof(*req), gfp);
+	if (!req)
+		return NULL;
+
+	req->monc = monc;
+	kref_init(&req->kref);
+	RB_CLEAR_NODE(&req->node);
+	init_completion(&req->completion);
+
+	dout("%s greq %p\n", __func__, req);
+	return req;
+}
+
+static void register_generic_request(struct ceph_mon_generic_request *req)
+{
+	struct ceph_mon_client *monc = req->monc;
+
+	WARN_ON(req->tid);
+
+	get_generic_request(req);
+	req->tid = ++monc->last_tid;
+	insert_generic_request(&monc->generic_request_tree, req);
+}
+
+static void send_generic_request(struct ceph_mon_client *monc,
+				 struct ceph_mon_generic_request *req)
+{
+	WARN_ON(!req->tid);
+
+	dout("%s greq %p tid %llu\n", __func__, req, req->tid);
+	req->request->hdr.tid = cpu_to_le64(req->tid);
+	ceph_con_send(&monc->con, ceph_msg_get(req->request));
+}
+
+static void __finish_generic_request(struct ceph_mon_generic_request *req)
+{
+	struct ceph_mon_client *monc = req->monc;
+
+	dout("%s greq %p tid %llu\n", __func__, req, req->tid);
+	erase_generic_request(&monc->generic_request_tree, req);
+
+	ceph_msg_revoke(req->request);
+	ceph_msg_revoke_incoming(req->reply);
+}
+
+static void finish_generic_request(struct ceph_mon_generic_request *req)
+{
+	__finish_generic_request(req);
+	put_generic_request(req);
+}
+
+static void complete_generic_request(struct ceph_mon_generic_request *req)
+{
+	if (req->complete_cb)
+		req->complete_cb(req);
+	else
+		complete_all(&req->completion);
+	put_generic_request(req);
+}
+
+void cancel_generic_request(struct ceph_mon_generic_request *req)
+{
+	struct ceph_mon_client *monc = req->monc;
+	struct ceph_mon_generic_request *lookup_req;
+
+	dout("%s greq %p tid %llu\n", __func__, req, req->tid);
+
+	mutex_lock(&monc->mutex);
+	lookup_req = lookup_generic_request(&monc->generic_request_tree,
+					    req->tid);
+	if (lookup_req) {
+		WARN_ON(lookup_req != req);
+		finish_generic_request(req);
+	}
+
+	mutex_unlock(&monc->mutex);
+}
+
+static int wait_generic_request(struct ceph_mon_generic_request *req)
+{
+	int ret;
+
+	dout("%s greq %p tid %llu\n", __func__, req, req->tid);
+	ret = wait_for_completion_interruptible(&req->completion);
+	if (ret)
+		cancel_generic_request(req);
+	else
+		ret = req->result; /* completed */
+
+	return ret;
+}
+
 static struct ceph_msg *get_generic_reply(struct ceph_connection *con,
 					 struct ceph_msg_header *hdr,
 					 int *skip)
@@ -551,7 +615,7 @@ static struct ceph_msg *get_generic_reply(struct ceph_connection *con,
 	struct ceph_msg *m;
 
 	mutex_lock(&monc->mutex);
-	req = __lookup_generic_req(monc, tid);
+	req = lookup_generic_request(&monc->generic_request_tree, tid);
 	if (!req) {
 		dout("get_generic_reply %lld dne\n", tid);
 		*skip = 1;
@@ -570,42 +634,6 @@ static struct ceph_msg *get_generic_reply(struct ceph_connection *con,
 	return m;
 }
 
-static int __do_generic_request(struct ceph_mon_client *monc, u64 tid,
-				struct ceph_mon_generic_request *req)
-{
-	int err;
-
-	/* register request */
-	req->tid = tid != 0 ? tid : ++monc->last_tid;
-	req->request->hdr.tid = cpu_to_le64(req->tid);
-	__insert_generic_request(monc, req);
-	monc->num_generic_requests++;
-	ceph_con_send(&monc->con, ceph_msg_get(req->request));
-	mutex_unlock(&monc->mutex);
-
-	err = wait_for_completion_interruptible(&req->completion);
-
-	mutex_lock(&monc->mutex);
-	rb_erase(&req->node, &monc->generic_request_tree);
-	monc->num_generic_requests--;
-
-	if (!err)
-		err = req->result;
-	return err;
-}
-
-static int do_generic_request(struct ceph_mon_client *monc,
-			      struct ceph_mon_generic_request *req)
-{
-	int err;
-
-	mutex_lock(&monc->mutex);
-	err = __do_generic_request(monc, 0, req);
-	mutex_unlock(&monc->mutex);
-
-	return err;
-}
-
 /*
  * statfs
  */
@@ -616,22 +644,24 @@ static void handle_statfs_reply(struct ceph_mon_client *monc,
 	struct ceph_mon_statfs_reply *reply = msg->front.iov_base;
 	u64 tid = le64_to_cpu(msg->hdr.tid);
 
+	dout("%s msg %p tid %llu\n", __func__, msg, tid);
+
 	if (msg->front.iov_len != sizeof(*reply))
 		goto bad;
-	dout("handle_statfs_reply %p tid %llu\n", msg, tid);
 
 	mutex_lock(&monc->mutex);
-	req = __lookup_generic_req(monc, tid);
-	if (req) {
-		*(struct ceph_statfs *)req->buf = reply->st;
-		req->result = 0;
-		get_generic_request(req);
+	req = lookup_generic_request(&monc->generic_request_tree, tid);
+	if (!req) {
+		mutex_unlock(&monc->mutex);
+		return;
 	}
+
+	req->result = 0;
+	*req->u.st = reply->st; /* struct */
+	__finish_generic_request(req);
 	mutex_unlock(&monc->mutex);
-	if (req) {
-		complete_all(&req->completion);
-		put_generic_request(req);
-	}
+
+	complete_generic_request(req);
 	return;
 
 bad:
@@ -646,38 +676,38 @@ int ceph_monc_do_statfs(struct ceph_mon_client *monc, struct ceph_statfs *buf)
 {
 	struct ceph_mon_generic_request *req;
 	struct ceph_mon_statfs *h;
-	int err;
+	int ret = -ENOMEM;
 
-	req = kzalloc(sizeof(*req), GFP_NOFS);
+	req = alloc_generic_request(monc, GFP_NOFS);
 	if (!req)
-		return -ENOMEM;
-
-	kref_init(&req->kref);
-	req->buf = buf;
-	init_completion(&req->completion);
+		goto out;
 
-	err = -ENOMEM;
 	req->request = ceph_msg_new(CEPH_MSG_STATFS, sizeof(*h), GFP_NOFS,
 				    true);
 	if (!req->request)
 		goto out;
-	req->reply = ceph_msg_new(CEPH_MSG_STATFS_REPLY, 1024, GFP_NOFS,
-				  true);
+
+	req->reply = ceph_msg_new(CEPH_MSG_STATFS_REPLY, 64, GFP_NOFS, true);
 	if (!req->reply)
 		goto out;
 
+	req->u.st = buf;
+
+	mutex_lock(&monc->mutex);
+	register_generic_request(req);
 	/* fill out request */
 	h = req->request->front.iov_base;
 	h->monhdr.have_version = 0;
 	h->monhdr.session_mon = cpu_to_le16(-1);
 	h->monhdr.session_mon_tid = 0;
 	h->fsid = monc->monmap->fsid;
+	send_generic_request(monc, req);
+	mutex_unlock(&monc->mutex);
 
-	err = do_generic_request(monc, req);
-
+	ret = wait_generic_request(req);
 out:
 	put_generic_request(req);
-	return err;
+	return ret;
 }
 EXPORT_SYMBOL(ceph_monc_do_statfs);
 
@@ -690,7 +720,7 @@ static void handle_get_version_reply(struct ceph_mon_client *monc,
 	void *end = p + msg->front_alloc_len;
 	u64 handle;
 
-	dout("%s %p tid %llu\n", __func__, msg, tid);
+	dout("%s msg %p tid %llu\n", __func__, msg, tid);
 
 	ceph_decode_need(&p, end, 2*sizeof(u64), bad);
 	handle = ceph_decode_64(&p);
@@ -698,77 +728,111 @@ static void handle_get_version_reply(struct ceph_mon_client *monc,
 		goto bad;
 
 	mutex_lock(&monc->mutex);
-	req = __lookup_generic_req(monc, handle);
-	if (req) {
-		*(u64 *)req->buf = ceph_decode_64(&p);
-		req->result = 0;
-		get_generic_request(req);
+	req = lookup_generic_request(&monc->generic_request_tree, handle);
+	if (!req) {
+		mutex_unlock(&monc->mutex);
+		return;
 	}
+
+	req->result = 0;
+	req->u.newest = ceph_decode_64(&p);
+	__finish_generic_request(req);
 	mutex_unlock(&monc->mutex);
-	if (req) {
-		complete_all(&req->completion);
-		put_generic_request(req);
-	}
 
+	complete_generic_request(req);
 	return;
+
 bad:
 	pr_err("corrupt mon_get_version reply, tid %llu\n", tid);
 	ceph_msg_dump(msg);
 }
 
-/*
- * Send MMonGetVersion and wait for the reply.
- *
- * @what: one of "mdsmap", "osdmap" or "monmap"
- */
-int ceph_monc_do_get_version(struct ceph_mon_client *monc, const char *what,
-			     u64 *newest)
+static struct ceph_mon_generic_request *
+__ceph_monc_get_version(struct ceph_mon_client *monc, const char *what,
+			ceph_monc_callback_t cb, u64 private_data)
 {
 	struct ceph_mon_generic_request *req;
-	void *p, *end;
-	u64 tid;
-	int err;
 
-	req = kzalloc(sizeof(*req), GFP_NOFS);
+	req = alloc_generic_request(monc, GFP_NOIO);
 	if (!req)
-		return -ENOMEM;
-
-	kref_init(&req->kref);
-	req->buf = newest;
-	init_completion(&req->completion);
+		goto err_put_req;
 
 	req->request = ceph_msg_new(CEPH_MSG_MON_GET_VERSION,
 				    sizeof(u64) + sizeof(u32) + strlen(what),
-				    GFP_NOFS, true);
-	if (!req->request) {
-		err = -ENOMEM;
-		goto out;
-	}
+				    GFP_NOIO, true);
+	if (!req->request)
+		goto err_put_req;
 
-	req->reply = ceph_msg_new(CEPH_MSG_MON_GET_VERSION_REPLY, 1024,
-				  GFP_NOFS, true);
-	if (!req->reply) {
-		err = -ENOMEM;
-		goto out;
-	}
+	req->reply = ceph_msg_new(CEPH_MSG_MON_GET_VERSION_REPLY, 32, GFP_NOIO,
+				  true);
+	if (!req->reply)
+		goto err_put_req;
 
-	p = req->request->front.iov_base;
-	end = p + req->request->front_alloc_len;
+	req->complete_cb = cb;
+	req->private_data = private_data;
 
-	/* fill out request */
 	mutex_lock(&monc->mutex);
-	tid = ++monc->last_tid;
-	ceph_encode_64(&p, tid); /* handle */
-	ceph_encode_string(&p, end, what, strlen(what));
+	register_generic_request(req);
+	{
+		void *p = req->request->front.iov_base;
+		void *const end = p + req->request->front_alloc_len;
+
+		ceph_encode_64(&p, req->tid); /* handle */
+		ceph_encode_string(&p, end, what, strlen(what));
+		WARN_ON(p != end);
+	}
+	send_generic_request(monc, req);
+	mutex_unlock(&monc->mutex);
 
-	err = __do_generic_request(monc, tid, req);
+	return req;
 
-	mutex_unlock(&monc->mutex);
-out:
+err_put_req:
 	put_generic_request(req);
-	return err;
+	return ERR_PTR(-ENOMEM);
+}
+
+/*
+ * Send MMonGetVersion and wait for the reply.
+ *
+ * @what: one of "mdsmap", "osdmap" or "monmap"
+ */
+int ceph_monc_get_version(struct ceph_mon_client *monc, const char *what,
+			  u64 *newest)
+{
+	struct ceph_mon_generic_request *req;
+	int ret;
+
+	req = __ceph_monc_get_version(monc, what, NULL, 0);
+	if (IS_ERR(req))
+		return PTR_ERR(req);
+
+	ret = wait_generic_request(req);
+	if (!ret)
+		*newest = req->u.newest;
+
+	put_generic_request(req);
+	return ret;
 }
-EXPORT_SYMBOL(ceph_monc_do_get_version);
+EXPORT_SYMBOL(ceph_monc_get_version);
+
+/*
+ * Send MMonGetVersion,
+ *
+ * @what: one of "mdsmap", "osdmap" or "monmap"
+ */
+int ceph_monc_get_version_async(struct ceph_mon_client *monc, const char *what,
+				ceph_monc_callback_t cb, u64 private_data)
+{
+	struct ceph_mon_generic_request *req;
+
+	req = __ceph_monc_get_version(monc, what, cb, private_data);
+	if (IS_ERR(req))
+		return PTR_ERR(req);
+
+	put_generic_request(req);
+	return 0;
+}
+EXPORT_SYMBOL(ceph_monc_get_version_async);
 
 /*
  * Resend pending generic requests.
@@ -890,7 +954,7 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl)
 	if (!monc->m_subscribe_ack)
 		goto out_auth;
 
-	monc->m_subscribe = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE, 96, GFP_NOFS,
+	monc->m_subscribe = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE, 128, GFP_NOFS,
 					 true);
 	if (!monc->m_subscribe)
 		goto out_subscribe_ack;
@@ -914,9 +978,10 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl)
 
 	INIT_DELAYED_WORK(&monc->delayed_work, delayed_work);
 	monc->generic_request_tree = RB_ROOT;
-	monc->num_generic_requests = 0;
 	monc->last_tid = 0;
 
+	monc->fs_cluster_id = CEPH_FS_CLUSTER_ID_NONE;
+
 	return 0;
 
 out_auth_reply:
@@ -954,6 +1019,8 @@ void ceph_monc_stop(struct ceph_mon_client *monc)
 
 	ceph_auth_destroy(monc->auth);
 
+	WARN_ON(!RB_EMPTY_ROOT(&monc->generic_request_tree));
+
 	ceph_msg_put(monc->m_auth);
 	ceph_msg_put(monc->m_auth_reply);
 	ceph_msg_put(monc->m_subscribe);
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index 40a53a70efdf..89469592076c 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -19,25 +19,12 @@
 #include <linux/ceph/auth.h>
 #include <linux/ceph/pagelist.h>
 
-#define OSD_OP_FRONT_LEN	4096
 #define OSD_OPREPLY_FRONT_LEN	512
 
 static struct kmem_cache	*ceph_osd_request_cache;
 
 static const struct ceph_connection_operations osd_con_ops;
 
-static void __send_queued(struct ceph_osd_client *osdc);
-static int __reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd);
-static void __register_request(struct ceph_osd_client *osdc,
-			       struct ceph_osd_request *req);
-static void __unregister_request(struct ceph_osd_client *osdc,
-				 struct ceph_osd_request *req);
-static void __unregister_linger_request(struct ceph_osd_client *osdc,
-					struct ceph_osd_request *req);
-static void __enqueue_request(struct ceph_osd_request *req);
-static void __send_request(struct ceph_osd_client *osdc,
-			   struct ceph_osd_request *req);
-
 /*
  * Implement client access to distributed object storage cluster.
  *
@@ -56,6 +43,52 @@ static void __send_request(struct ceph_osd_client *osdc,
  * channel with an OSD is reset.
  */
 
+static void link_request(struct ceph_osd *osd, struct ceph_osd_request *req);
+static void unlink_request(struct ceph_osd *osd, struct ceph_osd_request *req);
+static void link_linger(struct ceph_osd *osd,
+			struct ceph_osd_linger_request *lreq);
+static void unlink_linger(struct ceph_osd *osd,
+			  struct ceph_osd_linger_request *lreq);
+
+#if 1
+static inline bool rwsem_is_wrlocked(struct rw_semaphore *sem)
+{
+	bool wrlocked = true;
+
+	if (unlikely(down_read_trylock(sem))) {
+		wrlocked = false;
+		up_read(sem);
+	}
+
+	return wrlocked;
+}
+static inline void verify_osdc_locked(struct ceph_osd_client *osdc)
+{
+	WARN_ON(!rwsem_is_locked(&osdc->lock));
+}
+static inline void verify_osdc_wrlocked(struct ceph_osd_client *osdc)
+{
+	WARN_ON(!rwsem_is_wrlocked(&osdc->lock));
+}
+static inline void verify_osd_locked(struct ceph_osd *osd)
+{
+	struct ceph_osd_client *osdc = osd->o_osdc;
+
+	WARN_ON(!(mutex_is_locked(&osd->lock) &&
+		  rwsem_is_locked(&osdc->lock)) &&
+		!rwsem_is_wrlocked(&osdc->lock));
+}
+static inline void verify_lreq_locked(struct ceph_osd_linger_request *lreq)
+{
+	WARN_ON(!mutex_is_locked(&lreq->lock));
+}
+#else
+static inline void verify_osdc_locked(struct ceph_osd_client *osdc) { }
+static inline void verify_osdc_wrlocked(struct ceph_osd_client *osdc) { }
+static inline void verify_osd_locked(struct ceph_osd *osd) { }
+static inline void verify_lreq_locked(struct ceph_osd_linger_request *lreq) { }
+#endif
+
 /*
  * calculate the mapping of a file extent onto an object, and fill out the
  * request accordingly.  shorten extent as necessary if it crosses an
@@ -144,14 +177,6 @@ osd_req_op_extent_osd_data(struct ceph_osd_request *osd_req,
 }
 EXPORT_SYMBOL(osd_req_op_extent_osd_data);
 
-struct ceph_osd_data *
-osd_req_op_cls_response_data(struct ceph_osd_request *osd_req,
-			unsigned int which)
-{
-	return osd_req_op_data(osd_req, which, cls, response_data);
-}
-EXPORT_SYMBOL(osd_req_op_cls_response_data);	/* ??? */
-
 void osd_req_op_raw_data_in_pages(struct ceph_osd_request *osd_req,
 			unsigned int which, struct page **pages,
 			u64 length, u32 alignment,
@@ -218,6 +243,8 @@ void osd_req_op_cls_request_data_pagelist(
 
 	osd_data = osd_req_op_data(osd_req, which, cls, request_data);
 	ceph_osd_data_pagelist_init(osd_data, pagelist);
+	osd_req->r_ops[which].cls.indata_len += pagelist->length;
+	osd_req->r_ops[which].indata_len += pagelist->length;
 }
 EXPORT_SYMBOL(osd_req_op_cls_request_data_pagelist);
 
@@ -230,6 +257,8 @@ void osd_req_op_cls_request_data_pages(struct ceph_osd_request *osd_req,
 	osd_data = osd_req_op_data(osd_req, which, cls, request_data);
 	ceph_osd_data_pages_init(osd_data, pages, length, alignment,
 				pages_from_pool, own_pages);
+	osd_req->r_ops[which].cls.indata_len += length;
+	osd_req->r_ops[which].indata_len += length;
 }
 EXPORT_SYMBOL(osd_req_op_cls_request_data_pages);
 
@@ -302,14 +331,76 @@ static void osd_req_op_data_release(struct ceph_osd_request *osd_req,
 	case CEPH_OSD_OP_STAT:
 		ceph_osd_data_release(&op->raw_data_in);
 		break;
+	case CEPH_OSD_OP_NOTIFY_ACK:
+		ceph_osd_data_release(&op->notify_ack.request_data);
+		break;
+	case CEPH_OSD_OP_NOTIFY:
+		ceph_osd_data_release(&op->notify.request_data);
+		ceph_osd_data_release(&op->notify.response_data);
+		break;
 	default:
 		break;
 	}
 }
 
 /*
+ * Assumes @t is zero-initialized.
+ */
+static void target_init(struct ceph_osd_request_target *t)
+{
+	ceph_oid_init(&t->base_oid);
+	ceph_oloc_init(&t->base_oloc);
+	ceph_oid_init(&t->target_oid);
+	ceph_oloc_init(&t->target_oloc);
+
+	ceph_osds_init(&t->acting);
+	ceph_osds_init(&t->up);
+	t->size = -1;
+	t->min_size = -1;
+
+	t->osd = CEPH_HOMELESS_OSD;
+}
+
+static void target_copy(struct ceph_osd_request_target *dest,
+			const struct ceph_osd_request_target *src)
+{
+	ceph_oid_copy(&dest->base_oid, &src->base_oid);
+	ceph_oloc_copy(&dest->base_oloc, &src->base_oloc);
+	ceph_oid_copy(&dest->target_oid, &src->target_oid);
+	ceph_oloc_copy(&dest->target_oloc, &src->target_oloc);
+
+	dest->pgid = src->pgid; /* struct */
+	dest->pg_num = src->pg_num;
+	dest->pg_num_mask = src->pg_num_mask;
+	ceph_osds_copy(&dest->acting, &src->acting);
+	ceph_osds_copy(&dest->up, &src->up);
+	dest->size = src->size;
+	dest->min_size = src->min_size;
+	dest->sort_bitwise = src->sort_bitwise;
+
+	dest->flags = src->flags;
+	dest->paused = src->paused;
+
+	dest->osd = src->osd;
+}
+
+static void target_destroy(struct ceph_osd_request_target *t)
+{
+	ceph_oid_destroy(&t->base_oid);
+	ceph_oid_destroy(&t->target_oid);
+}
+
+/*
  * requests
  */
+static void request_release_checks(struct ceph_osd_request *req)
+{
+	WARN_ON(!RB_EMPTY_NODE(&req->r_node));
+	WARN_ON(!RB_EMPTY_NODE(&req->r_mc_node));
+	WARN_ON(!list_empty(&req->r_unsafe_item));
+	WARN_ON(req->r_osd);
+}
+
 static void ceph_osdc_release_request(struct kref *kref)
 {
 	struct ceph_osd_request *req = container_of(kref,
@@ -318,24 +409,19 @@ static void ceph_osdc_release_request(struct kref *kref)
 
 	dout("%s %p (r_request %p r_reply %p)\n", __func__, req,
 	     req->r_request, req->r_reply);
-	WARN_ON(!RB_EMPTY_NODE(&req->r_node));
-	WARN_ON(!list_empty(&req->r_req_lru_item));
-	WARN_ON(!list_empty(&req->r_osd_item));
-	WARN_ON(!list_empty(&req->r_linger_item));
-	WARN_ON(!list_empty(&req->r_linger_osd_item));
-	WARN_ON(req->r_osd);
+	request_release_checks(req);
 
 	if (req->r_request)
 		ceph_msg_put(req->r_request);
-	if (req->r_reply) {
-		ceph_msg_revoke_incoming(req->r_reply);
+	if (req->r_reply)
 		ceph_msg_put(req->r_reply);
-	}
 
 	for (which = 0; which < req->r_num_ops; which++)
 		osd_req_op_data_release(req, which);
 
+	target_destroy(&req->r_t);
 	ceph_put_snap_context(req->r_snapc);
+
 	if (req->r_mempool)
 		mempool_free(req, req->r_osdc->req_mempool);
 	else if (req->r_num_ops <= CEPH_OSD_SLAB_OPS)
@@ -354,12 +440,66 @@ EXPORT_SYMBOL(ceph_osdc_get_request);
 
 void ceph_osdc_put_request(struct ceph_osd_request *req)
 {
-	dout("%s %p (was %d)\n", __func__, req,
-	     atomic_read(&req->r_kref.refcount));
-	kref_put(&req->r_kref, ceph_osdc_release_request);
+	if (req) {
+		dout("%s %p (was %d)\n", __func__, req,
+		     atomic_read(&req->r_kref.refcount));
+		kref_put(&req->r_kref, ceph_osdc_release_request);
+	}
 }
 EXPORT_SYMBOL(ceph_osdc_put_request);
 
+static void request_init(struct ceph_osd_request *req)
+{
+	/* req only, each op is zeroed in _osd_req_op_init() */
+	memset(req, 0, sizeof(*req));
+
+	kref_init(&req->r_kref);
+	init_completion(&req->r_completion);
+	init_completion(&req->r_safe_completion);
+	RB_CLEAR_NODE(&req->r_node);
+	RB_CLEAR_NODE(&req->r_mc_node);
+	INIT_LIST_HEAD(&req->r_unsafe_item);
+
+	target_init(&req->r_t);
+}
+
+/*
+ * This is ugly, but it allows us to reuse linger registration and ping
+ * requests, keeping the structure of the code around send_linger{_ping}()
+ * reasonable.  Setting up a min_nr=2 mempool for each linger request
+ * and dealing with copying ops (this blasts req only, watch op remains
+ * intact) isn't any better.
+ */
+static void request_reinit(struct ceph_osd_request *req)
+{
+	struct ceph_osd_client *osdc = req->r_osdc;
+	bool mempool = req->r_mempool;
+	unsigned int num_ops = req->r_num_ops;
+	u64 snapid = req->r_snapid;
+	struct ceph_snap_context *snapc = req->r_snapc;
+	bool linger = req->r_linger;
+	struct ceph_msg *request_msg = req->r_request;
+	struct ceph_msg *reply_msg = req->r_reply;
+
+	dout("%s req %p\n", __func__, req);
+	WARN_ON(atomic_read(&req->r_kref.refcount) != 1);
+	request_release_checks(req);
+
+	WARN_ON(atomic_read(&request_msg->kref.refcount) != 1);
+	WARN_ON(atomic_read(&reply_msg->kref.refcount) != 1);
+	target_destroy(&req->r_t);
+
+	request_init(req);
+	req->r_osdc = osdc;
+	req->r_mempool = mempool;
+	req->r_num_ops = num_ops;
+	req->r_snapid = snapid;
+	req->r_snapc = snapc;
+	req->r_linger = linger;
+	req->r_request = request_msg;
+	req->r_reply = reply_msg;
+}
+
 struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
 					       struct ceph_snap_context *snapc,
 					       unsigned int num_ops,
@@ -367,8 +507,6 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
 					       gfp_t gfp_flags)
 {
 	struct ceph_osd_request *req;
-	struct ceph_msg *msg;
-	size_t msg_size;
 
 	if (use_mempool) {
 		BUG_ON(num_ops > CEPH_OSD_SLAB_OPS);
@@ -383,73 +521,65 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
 	if (unlikely(!req))
 		return NULL;
 
-	/* req only, each op is zeroed in _osd_req_op_init() */
-	memset(req, 0, sizeof(*req));
-
+	request_init(req);
 	req->r_osdc = osdc;
 	req->r_mempool = use_mempool;
 	req->r_num_ops = num_ops;
+	req->r_snapid = CEPH_NOSNAP;
+	req->r_snapc = ceph_get_snap_context(snapc);
 
-	kref_init(&req->r_kref);
-	init_completion(&req->r_completion);
-	init_completion(&req->r_safe_completion);
-	RB_CLEAR_NODE(&req->r_node);
-	INIT_LIST_HEAD(&req->r_unsafe_item);
-	INIT_LIST_HEAD(&req->r_linger_item);
-	INIT_LIST_HEAD(&req->r_linger_osd_item);
-	INIT_LIST_HEAD(&req->r_req_lru_item);
-	INIT_LIST_HEAD(&req->r_osd_item);
-
-	req->r_base_oloc.pool = -1;
-	req->r_target_oloc.pool = -1;
+	dout("%s req %p\n", __func__, req);
+	return req;
+}
+EXPORT_SYMBOL(ceph_osdc_alloc_request);
 
-	msg_size = OSD_OPREPLY_FRONT_LEN;
-	if (num_ops > CEPH_OSD_SLAB_OPS) {
-		/* ceph_osd_op and rval */
-		msg_size += (num_ops - CEPH_OSD_SLAB_OPS) *
-			    (sizeof(struct ceph_osd_op) + 4);
-	}
+int ceph_osdc_alloc_messages(struct ceph_osd_request *req, gfp_t gfp)
+{
+	struct ceph_osd_client *osdc = req->r_osdc;
+	struct ceph_msg *msg;
+	int msg_size;
 
-	/* create reply message */
-	if (use_mempool)
-		msg = ceph_msgpool_get(&osdc->msgpool_op_reply, 0);
-	else
-		msg = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, msg_size,
-				   gfp_flags, true);
-	if (!msg) {
-		ceph_osdc_put_request(req);
-		return NULL;
-	}
-	req->r_reply = msg;
+	WARN_ON(ceph_oid_empty(&req->r_base_oid));
 
+	/* create request message */
 	msg_size = 4 + 4 + 4; /* client_inc, osdmap_epoch, flags */
 	msg_size += 4 + 4 + 4 + 8; /* mtime, reassert_version */
 	msg_size += 2 + 4 + 8 + 4 + 4; /* oloc */
 	msg_size += 1 + 8 + 4 + 4; /* pgid */
-	msg_size += 4 + CEPH_MAX_OID_NAME_LEN; /* oid */
-	msg_size += 2 + num_ops * sizeof(struct ceph_osd_op);
+	msg_size += 4 + req->r_base_oid.name_len; /* oid */
+	msg_size += 2 + req->r_num_ops * sizeof(struct ceph_osd_op);
 	msg_size += 8; /* snapid */
 	msg_size += 8; /* snap_seq */
-	msg_size += 4 + 8 * (snapc ? snapc->num_snaps : 0); /* snaps */
+	msg_size += 4 + 8 * (req->r_snapc ? req->r_snapc->num_snaps : 0);
 	msg_size += 4; /* retry_attempt */
 
-	/* create request message; allow space for oid */
-	if (use_mempool)
+	if (req->r_mempool)
 		msg = ceph_msgpool_get(&osdc->msgpool_op, 0);
 	else
-		msg = ceph_msg_new(CEPH_MSG_OSD_OP, msg_size, gfp_flags, true);
-	if (!msg) {
-		ceph_osdc_put_request(req);
-		return NULL;
-	}
+		msg = ceph_msg_new(CEPH_MSG_OSD_OP, msg_size, gfp, true);
+	if (!msg)
+		return -ENOMEM;
 
 	memset(msg->front.iov_base, 0, msg->front.iov_len);
-
 	req->r_request = msg;
 
-	return req;
+	/* create reply message */
+	msg_size = OSD_OPREPLY_FRONT_LEN;
+	msg_size += req->r_base_oid.name_len;
+	msg_size += req->r_num_ops * sizeof(struct ceph_osd_op);
+
+	if (req->r_mempool)
+		msg = ceph_msgpool_get(&osdc->msgpool_op_reply, 0);
+	else
+		msg = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, msg_size, gfp, true);
+	if (!msg)
+		return -ENOMEM;
+
+	req->r_reply = msg;
+
+	return 0;
 }
-EXPORT_SYMBOL(ceph_osdc_alloc_request);
+EXPORT_SYMBOL(ceph_osdc_alloc_messages);
 
 static bool osd_req_opcode_valid(u16 opcode)
 {
@@ -587,8 +717,6 @@ void osd_req_op_cls_init(struct ceph_osd_request *osd_req, unsigned int which,
 
 	osd_req_op_cls_request_info_pagelist(osd_req, which, pagelist);
 
-	op->cls.argc = 0;	/* currently unused */
-
 	op->indata_len = payload_len;
 }
 EXPORT_SYMBOL(osd_req_op_cls_init);
@@ -627,21 +755,19 @@ int osd_req_op_xattr_init(struct ceph_osd_request *osd_req, unsigned int which,
 }
 EXPORT_SYMBOL(osd_req_op_xattr_init);
 
-void osd_req_op_watch_init(struct ceph_osd_request *osd_req,
-				unsigned int which, u16 opcode,
-				u64 cookie, u64 version, int flag)
+/*
+ * @watch_opcode: CEPH_OSD_WATCH_OP_*
+ */
+static void osd_req_op_watch_init(struct ceph_osd_request *req, int which,
+				  u64 cookie, u8 watch_opcode)
 {
-	struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which,
-						      opcode, 0);
-
-	BUG_ON(opcode != CEPH_OSD_OP_NOTIFY_ACK && opcode != CEPH_OSD_OP_WATCH);
+	struct ceph_osd_req_op *op;
 
+	op = _osd_req_op_init(req, which, CEPH_OSD_OP_WATCH, 0);
 	op->watch.cookie = cookie;
-	op->watch.ver = version;
-	if (opcode == CEPH_OSD_OP_WATCH && flag)
-		op->watch.flag = (u8)1;
+	op->watch.op = watch_opcode;
+	op->watch.gen = 0;
 }
-EXPORT_SYMBOL(osd_req_op_watch_init);
 
 void osd_req_op_alloc_hint_init(struct ceph_osd_request *osd_req,
 				unsigned int which,
@@ -686,16 +812,9 @@ static void ceph_osdc_msg_data_add(struct ceph_msg *msg,
 	}
 }
 
-static u64 osd_req_encode_op(struct ceph_osd_request *req,
-			      struct ceph_osd_op *dst, unsigned int which)
+static u32 osd_req_encode_op(struct ceph_osd_op *dst,
+			     const struct ceph_osd_req_op *src)
 {
-	struct ceph_osd_req_op *src;
-	struct ceph_osd_data *osd_data;
-	u64 request_data_len = 0;
-	u64 data_length;
-
-	BUG_ON(which >= req->r_num_ops);
-	src = &req->r_ops[which];
 	if (WARN_ON(!osd_req_opcode_valid(src->op))) {
 		pr_err("unrecognized osd opcode %d\n", src->op);
 
@@ -704,57 +823,36 @@ static u64 osd_req_encode_op(struct ceph_osd_request *req,
 
 	switch (src->op) {
 	case CEPH_OSD_OP_STAT:
-		osd_data = &src->raw_data_in;
-		ceph_osdc_msg_data_add(req->r_reply, osd_data);
 		break;
 	case CEPH_OSD_OP_READ:
 	case CEPH_OSD_OP_WRITE:
 	case CEPH_OSD_OP_WRITEFULL:
 	case CEPH_OSD_OP_ZERO:
 	case CEPH_OSD_OP_TRUNCATE:
-		if (src->op == CEPH_OSD_OP_WRITE ||
-		    src->op == CEPH_OSD_OP_WRITEFULL)
-			request_data_len = src->extent.length;
 		dst->extent.offset = cpu_to_le64(src->extent.offset);
 		dst->extent.length = cpu_to_le64(src->extent.length);
 		dst->extent.truncate_size =
 			cpu_to_le64(src->extent.truncate_size);
 		dst->extent.truncate_seq =
 			cpu_to_le32(src->extent.truncate_seq);
-		osd_data = &src->extent.osd_data;
-		if (src->op == CEPH_OSD_OP_WRITE ||
-		    src->op == CEPH_OSD_OP_WRITEFULL)
-			ceph_osdc_msg_data_add(req->r_request, osd_data);
-		else
-			ceph_osdc_msg_data_add(req->r_reply, osd_data);
 		break;
 	case CEPH_OSD_OP_CALL:
 		dst->cls.class_len = src->cls.class_len;
 		dst->cls.method_len = src->cls.method_len;
-		osd_data = &src->cls.request_info;
-		ceph_osdc_msg_data_add(req->r_request, osd_data);
-		BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_PAGELIST);
-		request_data_len = osd_data->pagelist->length;
-
-		osd_data = &src->cls.request_data;
-		data_length = ceph_osd_data_length(osd_data);
-		if (data_length) {
-			BUG_ON(osd_data->type == CEPH_OSD_DATA_TYPE_NONE);
-			dst->cls.indata_len = cpu_to_le32(data_length);
-			ceph_osdc_msg_data_add(req->r_request, osd_data);
-			src->indata_len += data_length;
-			request_data_len += data_length;
-		}
-		osd_data = &src->cls.response_data;
-		ceph_osdc_msg_data_add(req->r_reply, osd_data);
+		dst->cls.indata_len = cpu_to_le32(src->cls.indata_len);
 		break;
 	case CEPH_OSD_OP_STARTSYNC:
 		break;
-	case CEPH_OSD_OP_NOTIFY_ACK:
 	case CEPH_OSD_OP_WATCH:
 		dst->watch.cookie = cpu_to_le64(src->watch.cookie);
-		dst->watch.ver = cpu_to_le64(src->watch.ver);
-		dst->watch.flag = src->watch.flag;
+		dst->watch.ver = cpu_to_le64(0);
+		dst->watch.op = src->watch.op;
+		dst->watch.gen = cpu_to_le32(src->watch.gen);
+		break;
+	case CEPH_OSD_OP_NOTIFY_ACK:
+		break;
+	case CEPH_OSD_OP_NOTIFY:
+		dst->notify.cookie = cpu_to_le64(src->notify.cookie);
 		break;
 	case CEPH_OSD_OP_SETALLOCHINT:
 		dst->alloc_hint.expected_object_size =
@@ -768,9 +866,6 @@ static u64 osd_req_encode_op(struct ceph_osd_request *req,
 		dst->xattr.value_len = cpu_to_le32(src->xattr.value_len);
 		dst->xattr.cmp_op = src->xattr.cmp_op;
 		dst->xattr.cmp_mode = src->xattr.cmp_mode;
-		osd_data = &src->xattr.osd_data;
-		ceph_osdc_msg_data_add(req->r_request, osd_data);
-		request_data_len = osd_data->pagelist->length;
 		break;
 	case CEPH_OSD_OP_CREATE:
 	case CEPH_OSD_OP_DELETE:
@@ -787,7 +882,7 @@ static u64 osd_req_encode_op(struct ceph_osd_request *req,
 	dst->flags = cpu_to_le32(src->flags);
 	dst->payload_len = cpu_to_le32(src->indata_len);
 
-	return request_data_len;
+	return src->indata_len;
 }
 
 /*
@@ -824,17 +919,15 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
 
 	req = ceph_osdc_alloc_request(osdc, snapc, num_ops, use_mempool,
 					GFP_NOFS);
-	if (!req)
-		return ERR_PTR(-ENOMEM);
-
-	req->r_flags = flags;
+	if (!req) {
+		r = -ENOMEM;
+		goto fail;
+	}
 
 	/* calculate max write size */
 	r = calc_layout(layout, off, plen, &objnum, &objoff, &objlen);
-	if (r < 0) {
-		ceph_osdc_put_request(req);
-		return ERR_PTR(r);
-	}
+	if (r)
+		goto fail;
 
 	if (opcode == CEPH_OSD_OP_CREATE || opcode == CEPH_OSD_OP_DELETE) {
 		osd_req_op_init(req, which, opcode, 0);
@@ -854,194 +947,71 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
 				       truncate_size, truncate_seq);
 	}
 
+	req->r_flags = flags;
 	req->r_base_oloc.pool = ceph_file_layout_pg_pool(*layout);
+	ceph_oid_printf(&req->r_base_oid, "%llx.%08llx", vino.ino, objnum);
+
+	req->r_snapid = vino.snap;
+	if (flags & CEPH_OSD_FLAG_WRITE)
+		req->r_data_offset = off;
 
-	snprintf(req->r_base_oid.name, sizeof(req->r_base_oid.name),
-		 "%llx.%08llx", vino.ino, objnum);
-	req->r_base_oid.name_len = strlen(req->r_base_oid.name);
+	r = ceph_osdc_alloc_messages(req, GFP_NOFS);
+	if (r)
+		goto fail;
 
 	return req;
+
+fail:
+	ceph_osdc_put_request(req);
+	return ERR_PTR(r);
 }
 EXPORT_SYMBOL(ceph_osdc_new_request);
 
 /*
  * We keep osd requests in an rbtree, sorted by ->r_tid.
  */
-static void __insert_request(struct ceph_osd_client *osdc,
-			     struct ceph_osd_request *new)
-{
-	struct rb_node **p = &osdc->requests.rb_node;
-	struct rb_node *parent = NULL;
-	struct ceph_osd_request *req = NULL;
-
-	while (*p) {
-		parent = *p;
-		req = rb_entry(parent, struct ceph_osd_request, r_node);
-		if (new->r_tid < req->r_tid)
-			p = &(*p)->rb_left;
-		else if (new->r_tid > req->r_tid)
-			p = &(*p)->rb_right;
-		else
-			BUG();
-	}
-
-	rb_link_node(&new->r_node, parent, p);
-	rb_insert_color(&new->r_node, &osdc->requests);
-}
+DEFINE_RB_FUNCS(request, struct ceph_osd_request, r_tid, r_node)
+DEFINE_RB_FUNCS(request_mc, struct ceph_osd_request, r_tid, r_mc_node)
 
-static struct ceph_osd_request *__lookup_request(struct ceph_osd_client *osdc,
-						 u64 tid)
+static bool osd_homeless(struct ceph_osd *osd)
 {
-	struct ceph_osd_request *req;
-	struct rb_node *n = osdc->requests.rb_node;
-
-	while (n) {
-		req = rb_entry(n, struct ceph_osd_request, r_node);
-		if (tid < req->r_tid)
-			n = n->rb_left;
-		else if (tid > req->r_tid)
-			n = n->rb_right;
-		else
-			return req;
-	}
-	return NULL;
+	return osd->o_osd == CEPH_HOMELESS_OSD;
 }
 
-static struct ceph_osd_request *
-__lookup_request_ge(struct ceph_osd_client *osdc,
-		    u64 tid)
+static bool osd_registered(struct ceph_osd *osd)
 {
-	struct ceph_osd_request *req;
-	struct rb_node *n = osdc->requests.rb_node;
-
-	while (n) {
-		req = rb_entry(n, struct ceph_osd_request, r_node);
-		if (tid < req->r_tid) {
-			if (!n->rb_left)
-				return req;
-			n = n->rb_left;
-		} else if (tid > req->r_tid) {
-			n = n->rb_right;
-		} else {
-			return req;
-		}
-	}
-	return NULL;
-}
-
-static void __kick_linger_request(struct ceph_osd_request *req)
-{
-	struct ceph_osd_client *osdc = req->r_osdc;
-	struct ceph_osd *osd = req->r_osd;
+	verify_osdc_locked(osd->o_osdc);
 
-	/*
-	 * Linger requests need to be resent with a new tid to avoid
-	 * the dup op detection logic on the OSDs.  Achieve this with
-	 * a re-register dance instead of open-coding.
-	 */
-	ceph_osdc_get_request(req);
-	if (!list_empty(&req->r_linger_item))
-		__unregister_linger_request(osdc, req);
-	else
-		__unregister_request(osdc, req);
-	__register_request(osdc, req);
-	ceph_osdc_put_request(req);
-
-	/*
-	 * Unless request has been registered as both normal and
-	 * lingering, __unregister{,_linger}_request clears r_osd.
-	 * However, here we need to preserve r_osd to make sure we
-	 * requeue on the same OSD.
-	 */
-	WARN_ON(req->r_osd || !osd);
-	req->r_osd = osd;
-
-	dout("%s requeueing %p tid %llu\n", __func__, req, req->r_tid);
-	__enqueue_request(req);
+	return !RB_EMPTY_NODE(&osd->o_node);
 }
 
 /*
- * Resubmit requests pending on the given osd.
+ * Assumes @osd is zero-initialized.
  */
-static void __kick_osd_requests(struct ceph_osd_client *osdc,
-				struct ceph_osd *osd)
+static void osd_init(struct ceph_osd *osd)
 {
-	struct ceph_osd_request *req, *nreq;
-	LIST_HEAD(resend);
-	LIST_HEAD(resend_linger);
-	int err;
-
-	dout("%s osd%d\n", __func__, osd->o_osd);
-	err = __reset_osd(osdc, osd);
-	if (err)
-		return;
-
-	/*
-	 * Build up a list of requests to resend by traversing the
-	 * osd's list of requests.  Requests for a given object are
-	 * sent in tid order, and that is also the order they're
-	 * kept on this list.  Therefore all requests that are in
-	 * flight will be found first, followed by all requests that
-	 * have not yet been sent.  And to resend requests while
-	 * preserving this order we will want to put any sent
-	 * requests back on the front of the osd client's unsent
-	 * list.
-	 *
-	 * So we build a separate ordered list of already-sent
-	 * requests for the affected osd and splice it onto the
-	 * front of the osd client's unsent list.  Once we've seen a
-	 * request that has not yet been sent we're done.  Those
-	 * requests are already sitting right where they belong.
-	 */
-	list_for_each_entry(req, &osd->o_requests, r_osd_item) {
-		if (!req->r_sent)
-			break;
-
-		if (!req->r_linger) {
-			dout("%s requeueing %p tid %llu\n", __func__, req,
-			     req->r_tid);
-			list_move_tail(&req->r_req_lru_item, &resend);
-			req->r_flags |= CEPH_OSD_FLAG_RETRY;
-		} else {
-			list_move_tail(&req->r_req_lru_item, &resend_linger);
-		}
-	}
-	list_splice(&resend, &osdc->req_unsent);
-
-	/*
-	 * Both registered and not yet registered linger requests are
-	 * enqueued with a new tid on the same OSD.  We add/move them
-	 * to req_unsent/o_requests at the end to keep things in tid
-	 * order.
-	 */
-	list_for_each_entry_safe(req, nreq, &osd->o_linger_requests,
-				 r_linger_osd_item) {
-		WARN_ON(!list_empty(&req->r_req_lru_item));
-		__kick_linger_request(req);
-	}
-
-	list_for_each_entry_safe(req, nreq, &resend_linger, r_req_lru_item)
-		__kick_linger_request(req);
+	atomic_set(&osd->o_ref, 1);
+	RB_CLEAR_NODE(&osd->o_node);
+	osd->o_requests = RB_ROOT;
+	osd->o_linger_requests = RB_ROOT;
+	INIT_LIST_HEAD(&osd->o_osd_lru);
+	INIT_LIST_HEAD(&osd->o_keepalive_item);
+	osd->o_incarnation = 1;
+	mutex_init(&osd->lock);
 }
 
-/*
- * If the osd connection drops, we need to resubmit all requests.
- */
-static void osd_reset(struct ceph_connection *con)
+static void osd_cleanup(struct ceph_osd *osd)
 {
-	struct ceph_osd *osd = con->private;
-	struct ceph_osd_client *osdc;
-
-	if (!osd)
-		return;
-	dout("osd_reset osd%d\n", osd->o_osd);
-	osdc = osd->o_osdc;
-	down_read(&osdc->map_sem);
-	mutex_lock(&osdc->request_mutex);
-	__kick_osd_requests(osdc, osd);
-	__send_queued(osdc);
-	mutex_unlock(&osdc->request_mutex);
-	up_read(&osdc->map_sem);
+	WARN_ON(!RB_EMPTY_NODE(&osd->o_node));
+	WARN_ON(!RB_EMPTY_ROOT(&osd->o_requests));
+	WARN_ON(!RB_EMPTY_ROOT(&osd->o_linger_requests));
+	WARN_ON(!list_empty(&osd->o_osd_lru));
+	WARN_ON(!list_empty(&osd->o_keepalive_item));
+
+	if (osd->o_auth.authorizer) {
+		WARN_ON(osd_homeless(osd));
+		ceph_auth_destroy_authorizer(osd->o_auth.authorizer);
+	}
 }
 
 /*
@@ -1051,22 +1021,15 @@ static struct ceph_osd *create_osd(struct ceph_osd_client *osdc, int onum)
 {
 	struct ceph_osd *osd;
 
-	osd = kzalloc(sizeof(*osd), GFP_NOFS);
-	if (!osd)
-		return NULL;
+	WARN_ON(onum == CEPH_HOMELESS_OSD);
 
-	atomic_set(&osd->o_ref, 1);
+	osd = kzalloc(sizeof(*osd), GFP_NOIO | __GFP_NOFAIL);
+	osd_init(osd);
 	osd->o_osdc = osdc;
 	osd->o_osd = onum;
-	RB_CLEAR_NODE(&osd->o_node);
-	INIT_LIST_HEAD(&osd->o_requests);
-	INIT_LIST_HEAD(&osd->o_linger_requests);
-	INIT_LIST_HEAD(&osd->o_osd_lru);
-	osd->o_incarnation = 1;
 
 	ceph_con_init(&osd->o_con, osd, &osd_con_ops, &osdc->client->msgr);
 
-	INIT_LIST_HEAD(&osd->o_keepalive_item);
 	return osd;
 }
 
@@ -1087,114 +1050,115 @@ static void put_osd(struct ceph_osd *osd)
 	dout("put_osd %p %d -> %d\n", osd, atomic_read(&osd->o_ref),
 	     atomic_read(&osd->o_ref) - 1);
 	if (atomic_dec_and_test(&osd->o_ref)) {
-		if (osd->o_auth.authorizer)
-			ceph_auth_destroy_authorizer(osd->o_auth.authorizer);
+		osd_cleanup(osd);
 		kfree(osd);
 	}
 }
 
-/*
- * remove an osd from our map
- */
-static void __remove_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)
-{
-	dout("%s %p osd%d\n", __func__, osd, osd->o_osd);
-	WARN_ON(!list_empty(&osd->o_requests));
-	WARN_ON(!list_empty(&osd->o_linger_requests));
-
-	list_del_init(&osd->o_osd_lru);
-	rb_erase(&osd->o_node, &osdc->osds);
-	RB_CLEAR_NODE(&osd->o_node);
-}
-
-static void remove_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)
-{
-	dout("%s %p osd%d\n", __func__, osd, osd->o_osd);
-
-	if (!RB_EMPTY_NODE(&osd->o_node)) {
-		ceph_con_close(&osd->o_con);
-		__remove_osd(osdc, osd);
-		put_osd(osd);
-	}
-}
+DEFINE_RB_FUNCS(osd, struct ceph_osd, o_osd, o_node)
 
-static void remove_all_osds(struct ceph_osd_client *osdc)
+static void __move_osd_to_lru(struct ceph_osd *osd)
 {
-	dout("%s %p\n", __func__, osdc);
-	mutex_lock(&osdc->request_mutex);
-	while (!RB_EMPTY_ROOT(&osdc->osds)) {
-		struct ceph_osd *osd = rb_entry(rb_first(&osdc->osds),
-						struct ceph_osd, o_node);
-		remove_osd(osdc, osd);
-	}
-	mutex_unlock(&osdc->request_mutex);
-}
+	struct ceph_osd_client *osdc = osd->o_osdc;
 
-static void __move_osd_to_lru(struct ceph_osd_client *osdc,
-			      struct ceph_osd *osd)
-{
-	dout("%s %p\n", __func__, osd);
+	dout("%s osd %p osd%d\n", __func__, osd, osd->o_osd);
 	BUG_ON(!list_empty(&osd->o_osd_lru));
 
+	spin_lock(&osdc->osd_lru_lock);
 	list_add_tail(&osd->o_osd_lru, &osdc->osd_lru);
+	spin_unlock(&osdc->osd_lru_lock);
+
 	osd->lru_ttl = jiffies + osdc->client->options->osd_idle_ttl;
 }
 
-static void maybe_move_osd_to_lru(struct ceph_osd_client *osdc,
-				  struct ceph_osd *osd)
+static void maybe_move_osd_to_lru(struct ceph_osd *osd)
 {
-	dout("%s %p\n", __func__, osd);
-
-	if (list_empty(&osd->o_requests) &&
-	    list_empty(&osd->o_linger_requests))
-		__move_osd_to_lru(osdc, osd);
+	if (RB_EMPTY_ROOT(&osd->o_requests) &&
+	    RB_EMPTY_ROOT(&osd->o_linger_requests))
+		__move_osd_to_lru(osd);
 }
 
 static void __remove_osd_from_lru(struct ceph_osd *osd)
 {
-	dout("__remove_osd_from_lru %p\n", osd);
+	struct ceph_osd_client *osdc = osd->o_osdc;
+
+	dout("%s osd %p osd%d\n", __func__, osd, osd->o_osd);
+
+	spin_lock(&osdc->osd_lru_lock);
 	if (!list_empty(&osd->o_osd_lru))
 		list_del_init(&osd->o_osd_lru);
+	spin_unlock(&osdc->osd_lru_lock);
 }
 
-static void remove_old_osds(struct ceph_osd_client *osdc)
+/*
+ * Close the connection and assign any leftover requests to the
+ * homeless session.
+ */
+static void close_osd(struct ceph_osd *osd)
 {
-	struct ceph_osd *osd, *nosd;
+	struct ceph_osd_client *osdc = osd->o_osdc;
+	struct rb_node *n;
 
-	dout("__remove_old_osds %p\n", osdc);
-	mutex_lock(&osdc->request_mutex);
-	list_for_each_entry_safe(osd, nosd, &osdc->osd_lru, o_osd_lru) {
-		if (time_before(jiffies, osd->lru_ttl))
-			break;
-		remove_osd(osdc, osd);
+	verify_osdc_wrlocked(osdc);
+	dout("%s osd %p osd%d\n", __func__, osd, osd->o_osd);
+
+	ceph_con_close(&osd->o_con);
+
+	for (n = rb_first(&osd->o_requests); n; ) {
+		struct ceph_osd_request *req =
+		    rb_entry(n, struct ceph_osd_request, r_node);
+
+		n = rb_next(n); /* unlink_request() */
+
+		dout(" reassigning req %p tid %llu\n", req, req->r_tid);
+		unlink_request(osd, req);
+		link_request(&osdc->homeless_osd, req);
+	}
+	for (n = rb_first(&osd->o_linger_requests); n; ) {
+		struct ceph_osd_linger_request *lreq =
+		    rb_entry(n, struct ceph_osd_linger_request, node);
+
+		n = rb_next(n); /* unlink_linger() */
+
+		dout(" reassigning lreq %p linger_id %llu\n", lreq,
+		     lreq->linger_id);
+		unlink_linger(osd, lreq);
+		link_linger(&osdc->homeless_osd, lreq);
 	}
-	mutex_unlock(&osdc->request_mutex);
+
+	__remove_osd_from_lru(osd);
+	erase_osd(&osdc->osds, osd);
+	put_osd(osd);
 }
 
 /*
  * reset osd connect
  */
-static int __reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)
+static int reopen_osd(struct ceph_osd *osd)
 {
 	struct ceph_entity_addr *peer_addr;
 
-	dout("__reset_osd %p osd%d\n", osd, osd->o_osd);
-	if (list_empty(&osd->o_requests) &&
-	    list_empty(&osd->o_linger_requests)) {
-		remove_osd(osdc, osd);
+	dout("%s osd %p osd%d\n", __func__, osd, osd->o_osd);
+
+	if (RB_EMPTY_ROOT(&osd->o_requests) &&
+	    RB_EMPTY_ROOT(&osd->o_linger_requests)) {
+		close_osd(osd);
 		return -ENODEV;
 	}
 
-	peer_addr = &osdc->osdmap->osd_addr[osd->o_osd];
+	peer_addr = &osd->o_osdc->osdmap->osd_addr[osd->o_osd];
 	if (!memcmp(peer_addr, &osd->o_con.peer_addr, sizeof (*peer_addr)) &&
 			!ceph_con_opened(&osd->o_con)) {
-		struct ceph_osd_request *req;
+		struct rb_node *n;
 
 		dout("osd addr hasn't changed and connection never opened, "
 		     "letting msgr retry\n");
 		/* touch each r_stamp for handle_timeout()'s benfit */
-		list_for_each_entry(req, &osd->o_requests, r_osd_item)
+		for (n = rb_first(&osd->o_requests); n; n = rb_next(n)) {
+			struct ceph_osd_request *req =
+			    rb_entry(n, struct ceph_osd_request, r_node);
 			req->r_stamp = jiffies;
+		}
 
 		return -EAGAIN;
 	}
@@ -1206,455 +1170,1369 @@ static int __reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)
 	return 0;
 }
 
-static void __insert_osd(struct ceph_osd_client *osdc, struct ceph_osd *new)
+static struct ceph_osd *lookup_create_osd(struct ceph_osd_client *osdc, int o,
+					  bool wrlocked)
 {
-	struct rb_node **p = &osdc->osds.rb_node;
-	struct rb_node *parent = NULL;
-	struct ceph_osd *osd = NULL;
+	struct ceph_osd *osd;
 
-	dout("__insert_osd %p osd%d\n", new, new->o_osd);
-	while (*p) {
-		parent = *p;
-		osd = rb_entry(parent, struct ceph_osd, o_node);
-		if (new->o_osd < osd->o_osd)
-			p = &(*p)->rb_left;
-		else if (new->o_osd > osd->o_osd)
-			p = &(*p)->rb_right;
-		else
-			BUG();
+	if (wrlocked)
+		verify_osdc_wrlocked(osdc);
+	else
+		verify_osdc_locked(osdc);
+
+	if (o != CEPH_HOMELESS_OSD)
+		osd = lookup_osd(&osdc->osds, o);
+	else
+		osd = &osdc->homeless_osd;
+	if (!osd) {
+		if (!wrlocked)
+			return ERR_PTR(-EAGAIN);
+
+		osd = create_osd(osdc, o);
+		insert_osd(&osdc->osds, osd);
+		ceph_con_open(&osd->o_con, CEPH_ENTITY_TYPE_OSD, osd->o_osd,
+			      &osdc->osdmap->osd_addr[osd->o_osd]);
 	}
 
-	rb_link_node(&new->o_node, parent, p);
-	rb_insert_color(&new->o_node, &osdc->osds);
+	dout("%s osdc %p osd%d -> osd %p\n", __func__, osdc, o, osd);
+	return osd;
 }
 
-static struct ceph_osd *__lookup_osd(struct ceph_osd_client *osdc, int o)
+/*
+ * Create request <-> OSD session relation.
+ *
+ * @req has to be assigned a tid, @osd may be homeless.
+ */
+static void link_request(struct ceph_osd *osd, struct ceph_osd_request *req)
 {
-	struct ceph_osd *osd;
-	struct rb_node *n = osdc->osds.rb_node;
-
-	while (n) {
-		osd = rb_entry(n, struct ceph_osd, o_node);
-		if (o < osd->o_osd)
-			n = n->rb_left;
-		else if (o > osd->o_osd)
-			n = n->rb_right;
-		else
-			return osd;
-	}
-	return NULL;
+	verify_osd_locked(osd);
+	WARN_ON(!req->r_tid || req->r_osd);
+	dout("%s osd %p osd%d req %p tid %llu\n", __func__, osd, osd->o_osd,
+	     req, req->r_tid);
+
+	if (!osd_homeless(osd))
+		__remove_osd_from_lru(osd);
+	else
+		atomic_inc(&osd->o_osdc->num_homeless);
+
+	get_osd(osd);
+	insert_request(&osd->o_requests, req);
+	req->r_osd = osd;
 }
 
-static void __schedule_osd_timeout(struct ceph_osd_client *osdc)
+static void unlink_request(struct ceph_osd *osd, struct ceph_osd_request *req)
 {
-	schedule_delayed_work(&osdc->timeout_work,
-			      osdc->client->options->osd_keepalive_timeout);
+	verify_osd_locked(osd);
+	WARN_ON(req->r_osd != osd);
+	dout("%s osd %p osd%d req %p tid %llu\n", __func__, osd, osd->o_osd,
+	     req, req->r_tid);
+
+	req->r_osd = NULL;
+	erase_request(&osd->o_requests, req);
+	put_osd(osd);
+
+	if (!osd_homeless(osd))
+		maybe_move_osd_to_lru(osd);
+	else
+		atomic_dec(&osd->o_osdc->num_homeless);
 }
 
-static void __cancel_osd_timeout(struct ceph_osd_client *osdc)
+static bool __pool_full(struct ceph_pg_pool_info *pi)
 {
-	cancel_delayed_work(&osdc->timeout_work);
+	return pi->flags & CEPH_POOL_FLAG_FULL;
 }
 
-/*
- * Register request, assign tid.  If this is the first request, set up
- * the timeout event.
- */
-static void __register_request(struct ceph_osd_client *osdc,
-			       struct ceph_osd_request *req)
+static bool have_pool_full(struct ceph_osd_client *osdc)
 {
-	req->r_tid = ++osdc->last_tid;
-	req->r_request->hdr.tid = cpu_to_le64(req->r_tid);
-	dout("__register_request %p tid %lld\n", req, req->r_tid);
-	__insert_request(osdc, req);
-	ceph_osdc_get_request(req);
-	osdc->num_requests++;
-	if (osdc->num_requests == 1) {
-		dout(" first request, scheduling timeout\n");
-		__schedule_osd_timeout(osdc);
+	struct rb_node *n;
+
+	for (n = rb_first(&osdc->osdmap->pg_pools); n; n = rb_next(n)) {
+		struct ceph_pg_pool_info *pi =
+		    rb_entry(n, struct ceph_pg_pool_info, node);
+
+		if (__pool_full(pi))
+			return true;
 	}
+
+	return false;
+}
+
+static bool pool_full(struct ceph_osd_client *osdc, s64 pool_id)
+{
+	struct ceph_pg_pool_info *pi;
+
+	pi = ceph_pg_pool_by_id(osdc->osdmap, pool_id);
+	if (!pi)
+		return false;
+
+	return __pool_full(pi);
 }
 
 /*
- * called under osdc->request_mutex
+ * Returns whether a request should be blocked from being sent
+ * based on the current osdmap and osd_client settings.
  */
-static void __unregister_request(struct ceph_osd_client *osdc,
-				 struct ceph_osd_request *req)
+static bool target_should_be_paused(struct ceph_osd_client *osdc,
+				    const struct ceph_osd_request_target *t,
+				    struct ceph_pg_pool_info *pi)
 {
-	if (RB_EMPTY_NODE(&req->r_node)) {
-		dout("__unregister_request %p tid %lld not registered\n",
-			req, req->r_tid);
-		return;
+	bool pauserd = ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSERD);
+	bool pausewr = ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSEWR) ||
+		       ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) ||
+		       __pool_full(pi);
+
+	WARN_ON(pi->id != t->base_oloc.pool);
+	return (t->flags & CEPH_OSD_FLAG_READ && pauserd) ||
+	       (t->flags & CEPH_OSD_FLAG_WRITE && pausewr);
+}
+
+enum calc_target_result {
+	CALC_TARGET_NO_ACTION = 0,
+	CALC_TARGET_NEED_RESEND,
+	CALC_TARGET_POOL_DNE,
+};
+
+static enum calc_target_result calc_target(struct ceph_osd_client *osdc,
+					   struct ceph_osd_request_target *t,
+					   u32 *last_force_resend,
+					   bool any_change)
+{
+	struct ceph_pg_pool_info *pi;
+	struct ceph_pg pgid, last_pgid;
+	struct ceph_osds up, acting;
+	bool force_resend = false;
+	bool need_check_tiering = false;
+	bool need_resend = false;
+	bool sort_bitwise = ceph_osdmap_flag(osdc, CEPH_OSDMAP_SORTBITWISE);
+	enum calc_target_result ct_res;
+	int ret;
+
+	pi = ceph_pg_pool_by_id(osdc->osdmap, t->base_oloc.pool);
+	if (!pi) {
+		t->osd = CEPH_HOMELESS_OSD;
+		ct_res = CALC_TARGET_POOL_DNE;
+		goto out;
 	}
 
-	dout("__unregister_request %p tid %lld\n", req, req->r_tid);
-	rb_erase(&req->r_node, &osdc->requests);
-	RB_CLEAR_NODE(&req->r_node);
-	osdc->num_requests--;
+	if (osdc->osdmap->epoch == pi->last_force_request_resend) {
+		if (last_force_resend &&
+		    *last_force_resend < pi->last_force_request_resend) {
+			*last_force_resend = pi->last_force_request_resend;
+			force_resend = true;
+		} else if (!last_force_resend) {
+			force_resend = true;
+		}
+	}
+	if (ceph_oid_empty(&t->target_oid) || force_resend) {
+		ceph_oid_copy(&t->target_oid, &t->base_oid);
+		need_check_tiering = true;
+	}
+	if (ceph_oloc_empty(&t->target_oloc) || force_resend) {
+		ceph_oloc_copy(&t->target_oloc, &t->base_oloc);
+		need_check_tiering = true;
+	}
 
-	if (req->r_osd) {
-		/* make sure the original request isn't in flight. */
-		ceph_msg_revoke(req->r_request);
+	if (need_check_tiering &&
+	    (t->flags & CEPH_OSD_FLAG_IGNORE_OVERLAY) == 0) {
+		if (t->flags & CEPH_OSD_FLAG_READ && pi->read_tier >= 0)
+			t->target_oloc.pool = pi->read_tier;
+		if (t->flags & CEPH_OSD_FLAG_WRITE && pi->write_tier >= 0)
+			t->target_oloc.pool = pi->write_tier;
+	}
 
-		list_del_init(&req->r_osd_item);
-		maybe_move_osd_to_lru(osdc, req->r_osd);
-		if (list_empty(&req->r_linger_osd_item))
-			req->r_osd = NULL;
+	ret = ceph_object_locator_to_pg(osdc->osdmap, &t->target_oid,
+					&t->target_oloc, &pgid);
+	if (ret) {
+		WARN_ON(ret != -ENOENT);
+		t->osd = CEPH_HOMELESS_OSD;
+		ct_res = CALC_TARGET_POOL_DNE;
+		goto out;
+	}
+	last_pgid.pool = pgid.pool;
+	last_pgid.seed = ceph_stable_mod(pgid.seed, t->pg_num, t->pg_num_mask);
+
+	ceph_pg_to_up_acting_osds(osdc->osdmap, &pgid, &up, &acting);
+	if (any_change &&
+	    ceph_is_new_interval(&t->acting,
+				 &acting,
+				 &t->up,
+				 &up,
+				 t->size,
+				 pi->size,
+				 t->min_size,
+				 pi->min_size,
+				 t->pg_num,
+				 pi->pg_num,
+				 t->sort_bitwise,
+				 sort_bitwise,
+				 &last_pgid))
+		force_resend = true;
+
+	if (t->paused && !target_should_be_paused(osdc, t, pi)) {
+		t->paused = false;
+		need_resend = true;
 	}
 
-	list_del_init(&req->r_req_lru_item);
-	ceph_osdc_put_request(req);
+	if (ceph_pg_compare(&t->pgid, &pgid) ||
+	    ceph_osds_changed(&t->acting, &acting, any_change) ||
+	    force_resend) {
+		t->pgid = pgid; /* struct */
+		ceph_osds_copy(&t->acting, &acting);
+		ceph_osds_copy(&t->up, &up);
+		t->size = pi->size;
+		t->min_size = pi->min_size;
+		t->pg_num = pi->pg_num;
+		t->pg_num_mask = pi->pg_num_mask;
+		t->sort_bitwise = sort_bitwise;
+
+		t->osd = acting.primary;
+		need_resend = true;
+	}
+
+	ct_res = need_resend ? CALC_TARGET_NEED_RESEND : CALC_TARGET_NO_ACTION;
+out:
+	dout("%s t %p -> ct_res %d osd %d\n", __func__, t, ct_res, t->osd);
+	return ct_res;
+}
+
+static void setup_request_data(struct ceph_osd_request *req,
+			       struct ceph_msg *msg)
+{
+	u32 data_len = 0;
+	int i;
+
+	if (!list_empty(&msg->data))
+		return;
+
+	WARN_ON(msg->data_length);
+	for (i = 0; i < req->r_num_ops; i++) {
+		struct ceph_osd_req_op *op = &req->r_ops[i];
+
+		switch (op->op) {
+		/* request */
+		case CEPH_OSD_OP_WRITE:
+		case CEPH_OSD_OP_WRITEFULL:
+			WARN_ON(op->indata_len != op->extent.length);
+			ceph_osdc_msg_data_add(msg, &op->extent.osd_data);
+			break;
+		case CEPH_OSD_OP_SETXATTR:
+		case CEPH_OSD_OP_CMPXATTR:
+			WARN_ON(op->indata_len != op->xattr.name_len +
+						  op->xattr.value_len);
+			ceph_osdc_msg_data_add(msg, &op->xattr.osd_data);
+			break;
+		case CEPH_OSD_OP_NOTIFY_ACK:
+			ceph_osdc_msg_data_add(msg,
+					       &op->notify_ack.request_data);
+			break;
+
+		/* reply */
+		case CEPH_OSD_OP_STAT:
+			ceph_osdc_msg_data_add(req->r_reply,
+					       &op->raw_data_in);
+			break;
+		case CEPH_OSD_OP_READ:
+			ceph_osdc_msg_data_add(req->r_reply,
+					       &op->extent.osd_data);
+			break;
+
+		/* both */
+		case CEPH_OSD_OP_CALL:
+			WARN_ON(op->indata_len != op->cls.class_len +
+						  op->cls.method_len +
+						  op->cls.indata_len);
+			ceph_osdc_msg_data_add(msg, &op->cls.request_info);
+			/* optional, can be NONE */
+			ceph_osdc_msg_data_add(msg, &op->cls.request_data);
+			/* optional, can be NONE */
+			ceph_osdc_msg_data_add(req->r_reply,
+					       &op->cls.response_data);
+			break;
+		case CEPH_OSD_OP_NOTIFY:
+			ceph_osdc_msg_data_add(msg,
+					       &op->notify.request_data);
+			ceph_osdc_msg_data_add(req->r_reply,
+					       &op->notify.response_data);
+			break;
+		}
+
+		data_len += op->indata_len;
+	}
+
+	WARN_ON(data_len != msg->data_length);
+}
+
+static void encode_request(struct ceph_osd_request *req, struct ceph_msg *msg)
+{
+	void *p = msg->front.iov_base;
+	void *const end = p + msg->front_alloc_len;
+	u32 data_len = 0;
+	int i;
+
+	if (req->r_flags & CEPH_OSD_FLAG_WRITE) {
+		/* snapshots aren't writeable */
+		WARN_ON(req->r_snapid != CEPH_NOSNAP);
+	} else {
+		WARN_ON(req->r_mtime.tv_sec || req->r_mtime.tv_nsec ||
+			req->r_data_offset || req->r_snapc);
+	}
+
+	setup_request_data(req, msg);
+
+	ceph_encode_32(&p, 1); /* client_inc, always 1 */
+	ceph_encode_32(&p, req->r_osdc->osdmap->epoch);
+	ceph_encode_32(&p, req->r_flags);
+	ceph_encode_timespec(p, &req->r_mtime);
+	p += sizeof(struct ceph_timespec);
+	/* aka reassert_version */
+	memcpy(p, &req->r_replay_version, sizeof(req->r_replay_version));
+	p += sizeof(req->r_replay_version);
+
+	/* oloc */
+	ceph_encode_8(&p, 4);
+	ceph_encode_8(&p, 4);
+	ceph_encode_32(&p, 8 + 4 + 4);
+	ceph_encode_64(&p, req->r_t.target_oloc.pool);
+	ceph_encode_32(&p, -1); /* preferred */
+	ceph_encode_32(&p, 0); /* key len */
+
+	/* pgid */
+	ceph_encode_8(&p, 1);
+	ceph_encode_64(&p, req->r_t.pgid.pool);
+	ceph_encode_32(&p, req->r_t.pgid.seed);
+	ceph_encode_32(&p, -1); /* preferred */
+
+	/* oid */
+	ceph_encode_32(&p, req->r_t.target_oid.name_len);
+	memcpy(p, req->r_t.target_oid.name, req->r_t.target_oid.name_len);
+	p += req->r_t.target_oid.name_len;
 
-	if (osdc->num_requests == 0) {
-		dout(" no requests, canceling timeout\n");
-		__cancel_osd_timeout(osdc);
+	/* ops, can imply data */
+	ceph_encode_16(&p, req->r_num_ops);
+	for (i = 0; i < req->r_num_ops; i++) {
+		data_len += osd_req_encode_op(p, &req->r_ops[i]);
+		p += sizeof(struct ceph_osd_op);
 	}
+
+	ceph_encode_64(&p, req->r_snapid); /* snapid */
+	if (req->r_snapc) {
+		ceph_encode_64(&p, req->r_snapc->seq);
+		ceph_encode_32(&p, req->r_snapc->num_snaps);
+		for (i = 0; i < req->r_snapc->num_snaps; i++)
+			ceph_encode_64(&p, req->r_snapc->snaps[i]);
+	} else {
+		ceph_encode_64(&p, 0); /* snap_seq */
+		ceph_encode_32(&p, 0); /* snaps len */
+	}
+
+	ceph_encode_32(&p, req->r_attempts); /* retry_attempt */
+
+	BUG_ON(p > end);
+	msg->front.iov_len = p - msg->front.iov_base;
+	msg->hdr.version = cpu_to_le16(4); /* MOSDOp v4 */
+	msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
+	msg->hdr.data_len = cpu_to_le32(data_len);
+	/*
+	 * The header "data_off" is a hint to the receiver allowing it
+	 * to align received data into its buffers such that there's no
+	 * need to re-copy it before writing it to disk (direct I/O).
+	 */
+	msg->hdr.data_off = cpu_to_le16(req->r_data_offset);
+
+	dout("%s req %p oid %s oid_len %d front %zu data %u\n", __func__,
+	     req, req->r_t.target_oid.name, req->r_t.target_oid.name_len,
+	     msg->front.iov_len, data_len);
 }
 
 /*
- * Cancel a previously queued request message
+ * @req has to be assigned a tid and registered.
  */
-static void __cancel_request(struct ceph_osd_request *req)
+static void send_request(struct ceph_osd_request *req)
 {
-	if (req->r_sent && req->r_osd) {
+	struct ceph_osd *osd = req->r_osd;
+
+	verify_osd_locked(osd);
+	WARN_ON(osd->o_osd != req->r_t.osd);
+
+	/*
+	 * We may have a previously queued request message hanging
+	 * around.  Cancel it to avoid corrupting the msgr.
+	 */
+	if (req->r_sent)
 		ceph_msg_revoke(req->r_request);
-		req->r_sent = 0;
+
+	req->r_flags |= CEPH_OSD_FLAG_KNOWN_REDIR;
+	if (req->r_attempts)
+		req->r_flags |= CEPH_OSD_FLAG_RETRY;
+	else
+		WARN_ON(req->r_flags & CEPH_OSD_FLAG_RETRY);
+
+	encode_request(req, req->r_request);
+
+	dout("%s req %p tid %llu to pg %llu.%x osd%d flags 0x%x attempt %d\n",
+	     __func__, req, req->r_tid, req->r_t.pgid.pool, req->r_t.pgid.seed,
+	     req->r_t.osd, req->r_flags, req->r_attempts);
+
+	req->r_t.paused = false;
+	req->r_stamp = jiffies;
+	req->r_attempts++;
+
+	req->r_sent = osd->o_incarnation;
+	req->r_request->hdr.tid = cpu_to_le64(req->r_tid);
+	ceph_con_send(&osd->o_con, ceph_msg_get(req->r_request));
+}
+
+static void maybe_request_map(struct ceph_osd_client *osdc)
+{
+	bool continuous = false;
+
+	verify_osdc_locked(osdc);
+	WARN_ON(!osdc->osdmap->epoch);
+
+	if (ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) ||
+	    ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSERD) ||
+	    ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSEWR)) {
+		dout("%s osdc %p continuous\n", __func__, osdc);
+		continuous = true;
+	} else {
+		dout("%s osdc %p onetime\n", __func__, osdc);
 	}
+
+	if (ceph_monc_want_map(&osdc->client->monc, CEPH_SUB_OSDMAP,
+			       osdc->osdmap->epoch + 1, continuous))
+		ceph_monc_renew_subs(&osdc->client->monc);
 }
 
-static void __register_linger_request(struct ceph_osd_client *osdc,
-				    struct ceph_osd_request *req)
+static void send_map_check(struct ceph_osd_request *req);
+
+static void __submit_request(struct ceph_osd_request *req, bool wrlocked)
 {
-	dout("%s %p tid %llu\n", __func__, req, req->r_tid);
-	WARN_ON(!req->r_linger);
+	struct ceph_osd_client *osdc = req->r_osdc;
+	struct ceph_osd *osd;
+	enum calc_target_result ct_res;
+	bool need_send = false;
+	bool promoted = false;
+
+	WARN_ON(req->r_tid || req->r_got_reply);
+	dout("%s req %p wrlocked %d\n", __func__, req, wrlocked);
+
+again:
+	ct_res = calc_target(osdc, &req->r_t, &req->r_last_force_resend, false);
+	if (ct_res == CALC_TARGET_POOL_DNE && !wrlocked)
+		goto promote;
+
+	osd = lookup_create_osd(osdc, req->r_t.osd, wrlocked);
+	if (IS_ERR(osd)) {
+		WARN_ON(PTR_ERR(osd) != -EAGAIN || wrlocked);
+		goto promote;
+	}
+
+	if ((req->r_flags & CEPH_OSD_FLAG_WRITE) &&
+	    ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSEWR)) {
+		dout("req %p pausewr\n", req);
+		req->r_t.paused = true;
+		maybe_request_map(osdc);
+	} else if ((req->r_flags & CEPH_OSD_FLAG_READ) &&
+		   ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSERD)) {
+		dout("req %p pauserd\n", req);
+		req->r_t.paused = true;
+		maybe_request_map(osdc);
+	} else if ((req->r_flags & CEPH_OSD_FLAG_WRITE) &&
+		   !(req->r_flags & (CEPH_OSD_FLAG_FULL_TRY |
+				     CEPH_OSD_FLAG_FULL_FORCE)) &&
+		   (ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) ||
+		    pool_full(osdc, req->r_t.base_oloc.pool))) {
+		dout("req %p full/pool_full\n", req);
+		pr_warn_ratelimited("FULL or reached pool quota\n");
+		req->r_t.paused = true;
+		maybe_request_map(osdc);
+	} else if (!osd_homeless(osd)) {
+		need_send = true;
+	} else {
+		maybe_request_map(osdc);
+	}
+
+	mutex_lock(&osd->lock);
+	/*
+	 * Assign the tid atomically with send_request() to protect
+	 * multiple writes to the same object from racing with each
+	 * other, resulting in out of order ops on the OSDs.
+	 */
+	req->r_tid = atomic64_inc_return(&osdc->last_tid);
+	link_request(osd, req);
+	if (need_send)
+		send_request(req);
+	mutex_unlock(&osd->lock);
 
+	if (ct_res == CALC_TARGET_POOL_DNE)
+		send_map_check(req);
+
+	if (promoted)
+		downgrade_write(&osdc->lock);
+	return;
+
+promote:
+	up_read(&osdc->lock);
+	down_write(&osdc->lock);
+	wrlocked = true;
+	promoted = true;
+	goto again;
+}
+
+static void account_request(struct ceph_osd_request *req)
+{
+	unsigned int mask = CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK;
+
+	if (req->r_flags & CEPH_OSD_FLAG_READ) {
+		WARN_ON(req->r_flags & mask);
+		req->r_flags |= CEPH_OSD_FLAG_ACK;
+	} else if (req->r_flags & CEPH_OSD_FLAG_WRITE)
+		WARN_ON(!(req->r_flags & mask));
+	else
+		WARN_ON(1);
+
+	WARN_ON(req->r_unsafe_callback && (req->r_flags & mask) != mask);
+	atomic_inc(&req->r_osdc->num_requests);
+}
+
+static void submit_request(struct ceph_osd_request *req, bool wrlocked)
+{
 	ceph_osdc_get_request(req);
-	list_add_tail(&req->r_linger_item, &osdc->req_linger);
-	if (req->r_osd)
-		list_add_tail(&req->r_linger_osd_item,
-			      &req->r_osd->o_linger_requests);
+	account_request(req);
+	__submit_request(req, wrlocked);
 }
 
-static void __unregister_linger_request(struct ceph_osd_client *osdc,
-					struct ceph_osd_request *req)
+static void __finish_request(struct ceph_osd_request *req)
 {
-	WARN_ON(!req->r_linger);
+	struct ceph_osd_client *osdc = req->r_osdc;
+	struct ceph_osd *osd = req->r_osd;
 
-	if (list_empty(&req->r_linger_item)) {
-		dout("%s %p tid %llu not registered\n", __func__, req,
-		     req->r_tid);
+	verify_osd_locked(osd);
+	dout("%s req %p tid %llu\n", __func__, req, req->r_tid);
+
+	WARN_ON(lookup_request_mc(&osdc->map_checks, req->r_tid));
+	unlink_request(osd, req);
+	atomic_dec(&osdc->num_requests);
+
+	/*
+	 * If an OSD has failed or returned and a request has been sent
+	 * twice, it's possible to get a reply and end up here while the
+	 * request message is queued for delivery.  We will ignore the
+	 * reply, so not a big deal, but better to try and catch it.
+	 */
+	ceph_msg_revoke(req->r_request);
+	ceph_msg_revoke_incoming(req->r_reply);
+}
+
+static void finish_request(struct ceph_osd_request *req)
+{
+	__finish_request(req);
+	ceph_osdc_put_request(req);
+}
+
+static void __complete_request(struct ceph_osd_request *req)
+{
+	if (req->r_callback)
+		req->r_callback(req);
+	else
+		complete_all(&req->r_completion);
+}
+
+/*
+ * Note that this is open-coded in handle_reply(), which has to deal
+ * with ack vs commit, dup acks, etc.
+ */
+static void complete_request(struct ceph_osd_request *req, int err)
+{
+	dout("%s req %p tid %llu err %d\n", __func__, req, req->r_tid, err);
+
+	req->r_result = err;
+	__finish_request(req);
+	__complete_request(req);
+	complete_all(&req->r_safe_completion);
+	ceph_osdc_put_request(req);
+}
+
+static void cancel_map_check(struct ceph_osd_request *req)
+{
+	struct ceph_osd_client *osdc = req->r_osdc;
+	struct ceph_osd_request *lookup_req;
+
+	verify_osdc_wrlocked(osdc);
+
+	lookup_req = lookup_request_mc(&osdc->map_checks, req->r_tid);
+	if (!lookup_req)
 		return;
+
+	WARN_ON(lookup_req != req);
+	erase_request_mc(&osdc->map_checks, req);
+	ceph_osdc_put_request(req);
+}
+
+static void cancel_request(struct ceph_osd_request *req)
+{
+	dout("%s req %p tid %llu\n", __func__, req, req->r_tid);
+
+	cancel_map_check(req);
+	finish_request(req);
+}
+
+static void check_pool_dne(struct ceph_osd_request *req)
+{
+	struct ceph_osd_client *osdc = req->r_osdc;
+	struct ceph_osdmap *map = osdc->osdmap;
+
+	verify_osdc_wrlocked(osdc);
+	WARN_ON(!map->epoch);
+
+	if (req->r_attempts) {
+		/*
+		 * We sent a request earlier, which means that
+		 * previously the pool existed, and now it does not
+		 * (i.e., it was deleted).
+		 */
+		req->r_map_dne_bound = map->epoch;
+		dout("%s req %p tid %llu pool disappeared\n", __func__, req,
+		     req->r_tid);
+	} else {
+		dout("%s req %p tid %llu map_dne_bound %u have %u\n", __func__,
+		     req, req->r_tid, req->r_map_dne_bound, map->epoch);
+	}
+
+	if (req->r_map_dne_bound) {
+		if (map->epoch >= req->r_map_dne_bound) {
+			/* we had a new enough map */
+			pr_info_ratelimited("tid %llu pool does not exist\n",
+					    req->r_tid);
+			complete_request(req, -ENOENT);
+		}
+	} else {
+		send_map_check(req);
 	}
+}
+
+static void map_check_cb(struct ceph_mon_generic_request *greq)
+{
+	struct ceph_osd_client *osdc = &greq->monc->client->osdc;
+	struct ceph_osd_request *req;
+	u64 tid = greq->private_data;
 
-	dout("%s %p tid %llu\n", __func__, req, req->r_tid);
-	list_del_init(&req->r_linger_item);
+	WARN_ON(greq->result || !greq->u.newest);
 
-	if (req->r_osd) {
-		list_del_init(&req->r_linger_osd_item);
-		maybe_move_osd_to_lru(osdc, req->r_osd);
-		if (list_empty(&req->r_osd_item))
-			req->r_osd = NULL;
+	down_write(&osdc->lock);
+	req = lookup_request_mc(&osdc->map_checks, tid);
+	if (!req) {
+		dout("%s tid %llu dne\n", __func__, tid);
+		goto out_unlock;
 	}
+
+	dout("%s req %p tid %llu map_dne_bound %u newest %llu\n", __func__,
+	     req, req->r_tid, req->r_map_dne_bound, greq->u.newest);
+	if (!req->r_map_dne_bound)
+		req->r_map_dne_bound = greq->u.newest;
+	erase_request_mc(&osdc->map_checks, req);
+	check_pool_dne(req);
+
 	ceph_osdc_put_request(req);
+out_unlock:
+	up_write(&osdc->lock);
 }
 
-void ceph_osdc_set_request_linger(struct ceph_osd_client *osdc,
-				  struct ceph_osd_request *req)
+static void send_map_check(struct ceph_osd_request *req)
 {
-	if (!req->r_linger) {
-		dout("set_request_linger %p\n", req);
-		req->r_linger = 1;
+	struct ceph_osd_client *osdc = req->r_osdc;
+	struct ceph_osd_request *lookup_req;
+	int ret;
+
+	verify_osdc_wrlocked(osdc);
+
+	lookup_req = lookup_request_mc(&osdc->map_checks, req->r_tid);
+	if (lookup_req) {
+		WARN_ON(lookup_req != req);
+		return;
 	}
+
+	ceph_osdc_get_request(req);
+	insert_request_mc(&osdc->map_checks, req);
+	ret = ceph_monc_get_version_async(&osdc->client->monc, "osdmap",
+					  map_check_cb, req->r_tid);
+	WARN_ON(ret);
 }
-EXPORT_SYMBOL(ceph_osdc_set_request_linger);
 
 /*
- * Returns whether a request should be blocked from being sent
- * based on the current osdmap and osd_client settings.
- *
- * Caller should hold map_sem for read.
+ * lingering requests, watch/notify v2 infrastructure
  */
-static bool __req_should_be_paused(struct ceph_osd_client *osdc,
-				   struct ceph_osd_request *req)
+static void linger_release(struct kref *kref)
+{
+	struct ceph_osd_linger_request *lreq =
+	    container_of(kref, struct ceph_osd_linger_request, kref);
+
+	dout("%s lreq %p reg_req %p ping_req %p\n", __func__, lreq,
+	     lreq->reg_req, lreq->ping_req);
+	WARN_ON(!RB_EMPTY_NODE(&lreq->node));
+	WARN_ON(!RB_EMPTY_NODE(&lreq->osdc_node));
+	WARN_ON(!RB_EMPTY_NODE(&lreq->mc_node));
+	WARN_ON(!list_empty(&lreq->scan_item));
+	WARN_ON(!list_empty(&lreq->pending_lworks));
+	WARN_ON(lreq->osd);
+
+	if (lreq->reg_req)
+		ceph_osdc_put_request(lreq->reg_req);
+	if (lreq->ping_req)
+		ceph_osdc_put_request(lreq->ping_req);
+	target_destroy(&lreq->t);
+	kfree(lreq);
+}
+
+static void linger_put(struct ceph_osd_linger_request *lreq)
 {
-	bool pauserd = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSERD);
-	bool pausewr = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSEWR) ||
-		ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL);
-	return (req->r_flags & CEPH_OSD_FLAG_READ && pauserd) ||
-		(req->r_flags & CEPH_OSD_FLAG_WRITE && pausewr);
+	if (lreq)
+		kref_put(&lreq->kref, linger_release);
+}
+
+static struct ceph_osd_linger_request *
+linger_get(struct ceph_osd_linger_request *lreq)
+{
+	kref_get(&lreq->kref);
+	return lreq;
+}
+
+static struct ceph_osd_linger_request *
+linger_alloc(struct ceph_osd_client *osdc)
+{
+	struct ceph_osd_linger_request *lreq;
+
+	lreq = kzalloc(sizeof(*lreq), GFP_NOIO);
+	if (!lreq)
+		return NULL;
+
+	kref_init(&lreq->kref);
+	mutex_init(&lreq->lock);
+	RB_CLEAR_NODE(&lreq->node);
+	RB_CLEAR_NODE(&lreq->osdc_node);
+	RB_CLEAR_NODE(&lreq->mc_node);
+	INIT_LIST_HEAD(&lreq->scan_item);
+	INIT_LIST_HEAD(&lreq->pending_lworks);
+	init_completion(&lreq->reg_commit_wait);
+	init_completion(&lreq->notify_finish_wait);
+
+	lreq->osdc = osdc;
+	target_init(&lreq->t);
+
+	dout("%s lreq %p\n", __func__, lreq);
+	return lreq;
 }
 
+DEFINE_RB_INSDEL_FUNCS(linger, struct ceph_osd_linger_request, linger_id, node)
+DEFINE_RB_FUNCS(linger_osdc, struct ceph_osd_linger_request, linger_id, osdc_node)
+DEFINE_RB_FUNCS(linger_mc, struct ceph_osd_linger_request, linger_id, mc_node)
+
 /*
- * Calculate mapping of a request to a PG.  Takes tiering into account.
+ * Create linger request <-> OSD session relation.
+ *
+ * @lreq has to be registered, @osd may be homeless.
  */
-static int __calc_request_pg(struct ceph_osdmap *osdmap,
-			     struct ceph_osd_request *req,
-			     struct ceph_pg *pg_out)
+static void link_linger(struct ceph_osd *osd,
+			struct ceph_osd_linger_request *lreq)
 {
-	bool need_check_tiering;
+	verify_osd_locked(osd);
+	WARN_ON(!lreq->linger_id || lreq->osd);
+	dout("%s osd %p osd%d lreq %p linger_id %llu\n", __func__, osd,
+	     osd->o_osd, lreq, lreq->linger_id);
 
-	need_check_tiering = false;
-	if (req->r_target_oloc.pool == -1) {
-		req->r_target_oloc = req->r_base_oloc; /* struct */
-		need_check_tiering = true;
+	if (!osd_homeless(osd))
+		__remove_osd_from_lru(osd);
+	else
+		atomic_inc(&osd->o_osdc->num_homeless);
+
+	get_osd(osd);
+	insert_linger(&osd->o_linger_requests, lreq);
+	lreq->osd = osd;
+}
+
+static void unlink_linger(struct ceph_osd *osd,
+			  struct ceph_osd_linger_request *lreq)
+{
+	verify_osd_locked(osd);
+	WARN_ON(lreq->osd != osd);
+	dout("%s osd %p osd%d lreq %p linger_id %llu\n", __func__, osd,
+	     osd->o_osd, lreq, lreq->linger_id);
+
+	lreq->osd = NULL;
+	erase_linger(&osd->o_linger_requests, lreq);
+	put_osd(osd);
+
+	if (!osd_homeless(osd))
+		maybe_move_osd_to_lru(osd);
+	else
+		atomic_dec(&osd->o_osdc->num_homeless);
+}
+
+static bool __linger_registered(struct ceph_osd_linger_request *lreq)
+{
+	verify_osdc_locked(lreq->osdc);
+
+	return !RB_EMPTY_NODE(&lreq->osdc_node);
+}
+
+static bool linger_registered(struct ceph_osd_linger_request *lreq)
+{
+	struct ceph_osd_client *osdc = lreq->osdc;
+	bool registered;
+
+	down_read(&osdc->lock);
+	registered = __linger_registered(lreq);
+	up_read(&osdc->lock);
+
+	return registered;
+}
+
+static void linger_register(struct ceph_osd_linger_request *lreq)
+{
+	struct ceph_osd_client *osdc = lreq->osdc;
+
+	verify_osdc_wrlocked(osdc);
+	WARN_ON(lreq->linger_id);
+
+	linger_get(lreq);
+	lreq->linger_id = ++osdc->last_linger_id;
+	insert_linger_osdc(&osdc->linger_requests, lreq);
+}
+
+static void linger_unregister(struct ceph_osd_linger_request *lreq)
+{
+	struct ceph_osd_client *osdc = lreq->osdc;
+
+	verify_osdc_wrlocked(osdc);
+
+	erase_linger_osdc(&osdc->linger_requests, lreq);
+	linger_put(lreq);
+}
+
+static void cancel_linger_request(struct ceph_osd_request *req)
+{
+	struct ceph_osd_linger_request *lreq = req->r_priv;
+
+	WARN_ON(!req->r_linger);
+	cancel_request(req);
+	linger_put(lreq);
+}
+
+struct linger_work {
+	struct work_struct work;
+	struct ceph_osd_linger_request *lreq;
+	struct list_head pending_item;
+	unsigned long queued_stamp;
+
+	union {
+		struct {
+			u64 notify_id;
+			u64 notifier_id;
+			void *payload; /* points into @msg front */
+			size_t payload_len;
+
+			struct ceph_msg *msg; /* for ceph_msg_put() */
+		} notify;
+		struct {
+			int err;
+		} error;
+	};
+};
+
+static struct linger_work *lwork_alloc(struct ceph_osd_linger_request *lreq,
+				       work_func_t workfn)
+{
+	struct linger_work *lwork;
+
+	lwork = kzalloc(sizeof(*lwork), GFP_NOIO);
+	if (!lwork)
+		return NULL;
+
+	INIT_WORK(&lwork->work, workfn);
+	INIT_LIST_HEAD(&lwork->pending_item);
+	lwork->lreq = linger_get(lreq);
+
+	return lwork;
+}
+
+static void lwork_free(struct linger_work *lwork)
+{
+	struct ceph_osd_linger_request *lreq = lwork->lreq;
+
+	mutex_lock(&lreq->lock);
+	list_del(&lwork->pending_item);
+	mutex_unlock(&lreq->lock);
+
+	linger_put(lreq);
+	kfree(lwork);
+}
+
+static void lwork_queue(struct linger_work *lwork)
+{
+	struct ceph_osd_linger_request *lreq = lwork->lreq;
+	struct ceph_osd_client *osdc = lreq->osdc;
+
+	verify_lreq_locked(lreq);
+	WARN_ON(!list_empty(&lwork->pending_item));
+
+	lwork->queued_stamp = jiffies;
+	list_add_tail(&lwork->pending_item, &lreq->pending_lworks);
+	queue_work(osdc->notify_wq, &lwork->work);
+}
+
+static void do_watch_notify(struct work_struct *w)
+{
+	struct linger_work *lwork = container_of(w, struct linger_work, work);
+	struct ceph_osd_linger_request *lreq = lwork->lreq;
+
+	if (!linger_registered(lreq)) {
+		dout("%s lreq %p not registered\n", __func__, lreq);
+		goto out;
 	}
-	if (req->r_target_oid.name_len == 0) {
-		ceph_oid_copy(&req->r_target_oid, &req->r_base_oid);
-		need_check_tiering = true;
+
+	WARN_ON(!lreq->is_watch);
+	dout("%s lreq %p notify_id %llu notifier_id %llu payload_len %zu\n",
+	     __func__, lreq, lwork->notify.notify_id, lwork->notify.notifier_id,
+	     lwork->notify.payload_len);
+	lreq->wcb(lreq->data, lwork->notify.notify_id, lreq->linger_id,
+		  lwork->notify.notifier_id, lwork->notify.payload,
+		  lwork->notify.payload_len);
+
+out:
+	ceph_msg_put(lwork->notify.msg);
+	lwork_free(lwork);
+}
+
+static void do_watch_error(struct work_struct *w)
+{
+	struct linger_work *lwork = container_of(w, struct linger_work, work);
+	struct ceph_osd_linger_request *lreq = lwork->lreq;
+
+	if (!linger_registered(lreq)) {
+		dout("%s lreq %p not registered\n", __func__, lreq);
+		goto out;
 	}
 
-	if (need_check_tiering &&
-	    (req->r_flags & CEPH_OSD_FLAG_IGNORE_OVERLAY) == 0) {
-		struct ceph_pg_pool_info *pi;
-
-		pi = ceph_pg_pool_by_id(osdmap, req->r_target_oloc.pool);
-		if (pi) {
-			if ((req->r_flags & CEPH_OSD_FLAG_READ) &&
-			    pi->read_tier >= 0)
-				req->r_target_oloc.pool = pi->read_tier;
-			if ((req->r_flags & CEPH_OSD_FLAG_WRITE) &&
-			    pi->write_tier >= 0)
-				req->r_target_oloc.pool = pi->write_tier;
+	dout("%s lreq %p err %d\n", __func__, lreq, lwork->error.err);
+	lreq->errcb(lreq->data, lreq->linger_id, lwork->error.err);
+
+out:
+	lwork_free(lwork);
+}
+
+static void queue_watch_error(struct ceph_osd_linger_request *lreq)
+{
+	struct linger_work *lwork;
+
+	lwork = lwork_alloc(lreq, do_watch_error);
+	if (!lwork) {
+		pr_err("failed to allocate error-lwork\n");
+		return;
+	}
+
+	lwork->error.err = lreq->last_error;
+	lwork_queue(lwork);
+}
+
+static void linger_reg_commit_complete(struct ceph_osd_linger_request *lreq,
+				       int result)
+{
+	if (!completion_done(&lreq->reg_commit_wait)) {
+		lreq->reg_commit_error = (result <= 0 ? result : 0);
+		complete_all(&lreq->reg_commit_wait);
+	}
+}
+
+static void linger_commit_cb(struct ceph_osd_request *req)
+{
+	struct ceph_osd_linger_request *lreq = req->r_priv;
+
+	mutex_lock(&lreq->lock);
+	dout("%s lreq %p linger_id %llu result %d\n", __func__, lreq,
+	     lreq->linger_id, req->r_result);
+	WARN_ON(!__linger_registered(lreq));
+	linger_reg_commit_complete(lreq, req->r_result);
+	lreq->committed = true;
+
+	if (!lreq->is_watch) {
+		struct ceph_osd_data *osd_data =
+		    osd_req_op_data(req, 0, notify, response_data);
+		void *p = page_address(osd_data->pages[0]);
+
+		WARN_ON(req->r_ops[0].op != CEPH_OSD_OP_NOTIFY ||
+			osd_data->type != CEPH_OSD_DATA_TYPE_PAGES);
+
+		/* make note of the notify_id */
+		if (req->r_ops[0].outdata_len >= sizeof(u64)) {
+			lreq->notify_id = ceph_decode_64(&p);
+			dout("lreq %p notify_id %llu\n", lreq,
+			     lreq->notify_id);
+		} else {
+			dout("lreq %p no notify_id\n", lreq);
 		}
-		/* !pi is caught in ceph_oloc_oid_to_pg() */
 	}
 
-	return ceph_oloc_oid_to_pg(osdmap, &req->r_target_oloc,
-				   &req->r_target_oid, pg_out);
+	mutex_unlock(&lreq->lock);
+	linger_put(lreq);
 }
 
-static void __enqueue_request(struct ceph_osd_request *req)
+static int normalize_watch_error(int err)
 {
-	struct ceph_osd_client *osdc = req->r_osdc;
+	/*
+	 * Translate ENOENT -> ENOTCONN so that a delete->disconnection
+	 * notification and a failure to reconnect because we raced with
+	 * the delete appear the same to the user.
+	 */
+	if (err == -ENOENT)
+		err = -ENOTCONN;
+
+	return err;
+}
+
+static void linger_reconnect_cb(struct ceph_osd_request *req)
+{
+	struct ceph_osd_linger_request *lreq = req->r_priv;
+
+	mutex_lock(&lreq->lock);
+	dout("%s lreq %p linger_id %llu result %d last_error %d\n", __func__,
+	     lreq, lreq->linger_id, req->r_result, lreq->last_error);
+	if (req->r_result < 0) {
+		if (!lreq->last_error) {
+			lreq->last_error = normalize_watch_error(req->r_result);
+			queue_watch_error(lreq);
+		}
+	}
+
+	mutex_unlock(&lreq->lock);
+	linger_put(lreq);
+}
+
+static void send_linger(struct ceph_osd_linger_request *lreq)
+{
+	struct ceph_osd_request *req = lreq->reg_req;
+	struct ceph_osd_req_op *op = &req->r_ops[0];
 
-	dout("%s %p tid %llu to osd%d\n", __func__, req, req->r_tid,
-	     req->r_osd ? req->r_osd->o_osd : -1);
+	verify_osdc_wrlocked(req->r_osdc);
+	dout("%s lreq %p linger_id %llu\n", __func__, lreq, lreq->linger_id);
 
-	if (req->r_osd) {
-		__remove_osd_from_lru(req->r_osd);
-		list_add_tail(&req->r_osd_item, &req->r_osd->o_requests);
-		list_move_tail(&req->r_req_lru_item, &osdc->req_unsent);
+	if (req->r_osd)
+		cancel_linger_request(req);
+
+	request_reinit(req);
+	ceph_oid_copy(&req->r_base_oid, &lreq->t.base_oid);
+	ceph_oloc_copy(&req->r_base_oloc, &lreq->t.base_oloc);
+	req->r_flags = lreq->t.flags;
+	req->r_mtime = lreq->mtime;
+
+	mutex_lock(&lreq->lock);
+	if (lreq->is_watch && lreq->committed) {
+		WARN_ON(op->op != CEPH_OSD_OP_WATCH ||
+			op->watch.cookie != lreq->linger_id);
+		op->watch.op = CEPH_OSD_WATCH_OP_RECONNECT;
+		op->watch.gen = ++lreq->register_gen;
+		dout("lreq %p reconnect register_gen %u\n", lreq,
+		     op->watch.gen);
+		req->r_callback = linger_reconnect_cb;
 	} else {
-		list_move_tail(&req->r_req_lru_item, &osdc->req_notarget);
+		if (!lreq->is_watch)
+			lreq->notify_id = 0;
+		else
+			WARN_ON(op->watch.op != CEPH_OSD_WATCH_OP_WATCH);
+		dout("lreq %p register\n", lreq);
+		req->r_callback = linger_commit_cb;
 	}
+	mutex_unlock(&lreq->lock);
+
+	req->r_priv = linger_get(lreq);
+	req->r_linger = true;
+
+	submit_request(req, true);
 }
 
-/*
- * Pick an osd (the first 'up' osd in the pg), allocate the osd struct
- * (as needed), and set the request r_osd appropriately.  If there is
- * no up osd, set r_osd to NULL.  Move the request to the appropriate list
- * (unsent, homeless) or leave on in-flight lru.
- *
- * Return 0 if unchanged, 1 if changed, or negative on error.
- *
- * Caller should hold map_sem for read and request_mutex.
- */
-static int __map_request(struct ceph_osd_client *osdc,
-			 struct ceph_osd_request *req, int force_resend)
+static void linger_ping_cb(struct ceph_osd_request *req)
 {
-	struct ceph_pg pgid;
-	int acting[CEPH_PG_MAX_SIZE];
-	int num, o;
-	int err;
-	bool was_paused;
-
-	dout("map_request %p tid %lld\n", req, req->r_tid);
-
-	err = __calc_request_pg(osdc->osdmap, req, &pgid);
-	if (err) {
-		list_move(&req->r_req_lru_item, &osdc->req_notarget);
-		return err;
-	}
-	req->r_pgid = pgid;
-
-	num = ceph_calc_pg_acting(osdc->osdmap, pgid, acting, &o);
-	if (num < 0)
-		num = 0;
-
-	was_paused = req->r_paused;
-	req->r_paused = __req_should_be_paused(osdc, req);
-	if (was_paused && !req->r_paused)
-		force_resend = 1;
-
-	if ((!force_resend &&
-	     req->r_osd && req->r_osd->o_osd == o &&
-	     req->r_sent >= req->r_osd->o_incarnation &&
-	     req->r_num_pg_osds == num &&
-	     memcmp(req->r_pg_osds, acting, sizeof(acting[0])*num) == 0) ||
-	    (req->r_osd == NULL && o == -1) ||
-	    req->r_paused)
-		return 0;  /* no change */
-
-	dout("map_request tid %llu pgid %lld.%x osd%d (was osd%d)\n",
-	     req->r_tid, pgid.pool, pgid.seed, o,
-	     req->r_osd ? req->r_osd->o_osd : -1);
-
-	/* record full pg acting set */
-	memcpy(req->r_pg_osds, acting, sizeof(acting[0]) * num);
-	req->r_num_pg_osds = num;
-
-	if (req->r_osd) {
-		__cancel_request(req);
-		list_del_init(&req->r_osd_item);
-		list_del_init(&req->r_linger_osd_item);
-		req->r_osd = NULL;
-	}
-
-	req->r_osd = __lookup_osd(osdc, o);
-	if (!req->r_osd && o >= 0) {
-		err = -ENOMEM;
-		req->r_osd = create_osd(osdc, o);
-		if (!req->r_osd) {
-			list_move(&req->r_req_lru_item, &osdc->req_notarget);
-			goto out;
+	struct ceph_osd_linger_request *lreq = req->r_priv;
+
+	mutex_lock(&lreq->lock);
+	dout("%s lreq %p linger_id %llu result %d ping_sent %lu last_error %d\n",
+	     __func__, lreq, lreq->linger_id, req->r_result, lreq->ping_sent,
+	     lreq->last_error);
+	if (lreq->register_gen == req->r_ops[0].watch.gen) {
+		if (!req->r_result) {
+			lreq->watch_valid_thru = lreq->ping_sent;
+		} else if (!lreq->last_error) {
+			lreq->last_error = normalize_watch_error(req->r_result);
+			queue_watch_error(lreq);
 		}
+	} else {
+		dout("lreq %p register_gen %u ignoring old pong %u\n", lreq,
+		     lreq->register_gen, req->r_ops[0].watch.gen);
+	}
+
+	mutex_unlock(&lreq->lock);
+	linger_put(lreq);
+}
 
-		dout("map_request osd %p is osd%d\n", req->r_osd, o);
-		__insert_osd(osdc, req->r_osd);
+static void send_linger_ping(struct ceph_osd_linger_request *lreq)
+{
+	struct ceph_osd_client *osdc = lreq->osdc;
+	struct ceph_osd_request *req = lreq->ping_req;
+	struct ceph_osd_req_op *op = &req->r_ops[0];
 
-		ceph_con_open(&req->r_osd->o_con,
-			      CEPH_ENTITY_TYPE_OSD, o,
-			      &osdc->osdmap->osd_addr[o]);
+	if (ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSERD)) {
+		dout("%s PAUSERD\n", __func__);
+		return;
 	}
 
-	__enqueue_request(req);
-	err = 1;   /* osd or pg changed */
+	lreq->ping_sent = jiffies;
+	dout("%s lreq %p linger_id %llu ping_sent %lu register_gen %u\n",
+	     __func__, lreq, lreq->linger_id, lreq->ping_sent,
+	     lreq->register_gen);
 
-out:
-	return err;
+	if (req->r_osd)
+		cancel_linger_request(req);
+
+	request_reinit(req);
+	target_copy(&req->r_t, &lreq->t);
+
+	WARN_ON(op->op != CEPH_OSD_OP_WATCH ||
+		op->watch.cookie != lreq->linger_id ||
+		op->watch.op != CEPH_OSD_WATCH_OP_PING);
+	op->watch.gen = lreq->register_gen;
+	req->r_callback = linger_ping_cb;
+	req->r_priv = linger_get(lreq);
+	req->r_linger = true;
+
+	ceph_osdc_get_request(req);
+	account_request(req);
+	req->r_tid = atomic64_inc_return(&osdc->last_tid);
+	link_request(lreq->osd, req);
+	send_request(req);
 }
 
-/*
- * caller should hold map_sem (for read) and request_mutex
- */
-static void __send_request(struct ceph_osd_client *osdc,
-			   struct ceph_osd_request *req)
+static void linger_submit(struct ceph_osd_linger_request *lreq)
 {
-	void *p;
+	struct ceph_osd_client *osdc = lreq->osdc;
+	struct ceph_osd *osd;
 
-	dout("send_request %p tid %llu to osd%d flags %d pg %lld.%x\n",
-	     req, req->r_tid, req->r_osd->o_osd, req->r_flags,
-	     (unsigned long long)req->r_pgid.pool, req->r_pgid.seed);
+	calc_target(osdc, &lreq->t, &lreq->last_force_resend, false);
+	osd = lookup_create_osd(osdc, lreq->t.osd, true);
+	link_linger(osd, lreq);
 
-	/* fill in message content that changes each time we send it */
-	put_unaligned_le32(osdc->osdmap->epoch, req->r_request_osdmap_epoch);
-	put_unaligned_le32(req->r_flags, req->r_request_flags);
-	put_unaligned_le64(req->r_target_oloc.pool, req->r_request_pool);
-	p = req->r_request_pgid;
-	ceph_encode_64(&p, req->r_pgid.pool);
-	ceph_encode_32(&p, req->r_pgid.seed);
-	put_unaligned_le64(1, req->r_request_attempts);  /* FIXME */
-	memcpy(req->r_request_reassert_version, &req->r_reassert_version,
-	       sizeof(req->r_reassert_version));
+	send_linger(lreq);
+}
 
-	req->r_stamp = jiffies;
-	list_move_tail(&req->r_req_lru_item, &osdc->req_lru);
+static void cancel_linger_map_check(struct ceph_osd_linger_request *lreq)
+{
+	struct ceph_osd_client *osdc = lreq->osdc;
+	struct ceph_osd_linger_request *lookup_lreq;
 
-	ceph_msg_get(req->r_request); /* send consumes a ref */
+	verify_osdc_wrlocked(osdc);
 
-	req->r_sent = req->r_osd->o_incarnation;
+	lookup_lreq = lookup_linger_mc(&osdc->linger_map_checks,
+				       lreq->linger_id);
+	if (!lookup_lreq)
+		return;
 
-	ceph_con_send(&req->r_osd->o_con, req->r_request);
+	WARN_ON(lookup_lreq != lreq);
+	erase_linger_mc(&osdc->linger_map_checks, lreq);
+	linger_put(lreq);
 }
 
 /*
- * Send any requests in the queue (req_unsent).
+ * @lreq has to be both registered and linked.
  */
-static void __send_queued(struct ceph_osd_client *osdc)
+static void __linger_cancel(struct ceph_osd_linger_request *lreq)
 {
-	struct ceph_osd_request *req, *tmp;
+	if (lreq->is_watch && lreq->ping_req->r_osd)
+		cancel_linger_request(lreq->ping_req);
+	if (lreq->reg_req->r_osd)
+		cancel_linger_request(lreq->reg_req);
+	cancel_linger_map_check(lreq);
+	unlink_linger(lreq->osd, lreq);
+	linger_unregister(lreq);
+}
+
+static void linger_cancel(struct ceph_osd_linger_request *lreq)
+{
+	struct ceph_osd_client *osdc = lreq->osdc;
 
-	dout("__send_queued\n");
-	list_for_each_entry_safe(req, tmp, &osdc->req_unsent, r_req_lru_item)
-		__send_request(osdc, req);
+	down_write(&osdc->lock);
+	if (__linger_registered(lreq))
+		__linger_cancel(lreq);
+	up_write(&osdc->lock);
 }
 
-/*
- * Caller should hold map_sem for read and request_mutex.
- */
-static int __ceph_osdc_start_request(struct ceph_osd_client *osdc,
-				     struct ceph_osd_request *req,
-				     bool nofail)
-{
-	int rc;
-
-	__register_request(osdc, req);
-	req->r_sent = 0;
-	req->r_got_reply = 0;
-	rc = __map_request(osdc, req, 0);
-	if (rc < 0) {
-		if (nofail) {
-			dout("osdc_start_request failed map, "
-				" will retry %lld\n", req->r_tid);
-			rc = 0;
-		} else {
-			__unregister_request(osdc, req);
-		}
-		return rc;
+static void send_linger_map_check(struct ceph_osd_linger_request *lreq);
+
+static void check_linger_pool_dne(struct ceph_osd_linger_request *lreq)
+{
+	struct ceph_osd_client *osdc = lreq->osdc;
+	struct ceph_osdmap *map = osdc->osdmap;
+
+	verify_osdc_wrlocked(osdc);
+	WARN_ON(!map->epoch);
+
+	if (lreq->register_gen) {
+		lreq->map_dne_bound = map->epoch;
+		dout("%s lreq %p linger_id %llu pool disappeared\n", __func__,
+		     lreq, lreq->linger_id);
+	} else {
+		dout("%s lreq %p linger_id %llu map_dne_bound %u have %u\n",
+		     __func__, lreq, lreq->linger_id, lreq->map_dne_bound,
+		     map->epoch);
 	}
 
-	if (req->r_osd == NULL) {
-		dout("send_request %p no up osds in pg\n", req);
-		ceph_monc_request_next_osdmap(&osdc->client->monc);
+	if (lreq->map_dne_bound) {
+		if (map->epoch >= lreq->map_dne_bound) {
+			/* we had a new enough map */
+			pr_info("linger_id %llu pool does not exist\n",
+				lreq->linger_id);
+			linger_reg_commit_complete(lreq, -ENOENT);
+			__linger_cancel(lreq);
+		}
 	} else {
-		__send_queued(osdc);
+		send_linger_map_check(lreq);
 	}
+}
 
-	return 0;
+static void linger_map_check_cb(struct ceph_mon_generic_request *greq)
+{
+	struct ceph_osd_client *osdc = &greq->monc->client->osdc;
+	struct ceph_osd_linger_request *lreq;
+	u64 linger_id = greq->private_data;
+
+	WARN_ON(greq->result || !greq->u.newest);
+
+	down_write(&osdc->lock);
+	lreq = lookup_linger_mc(&osdc->linger_map_checks, linger_id);
+	if (!lreq) {
+		dout("%s linger_id %llu dne\n", __func__, linger_id);
+		goto out_unlock;
+	}
+
+	dout("%s lreq %p linger_id %llu map_dne_bound %u newest %llu\n",
+	     __func__, lreq, lreq->linger_id, lreq->map_dne_bound,
+	     greq->u.newest);
+	if (!lreq->map_dne_bound)
+		lreq->map_dne_bound = greq->u.newest;
+	erase_linger_mc(&osdc->linger_map_checks, lreq);
+	check_linger_pool_dne(lreq);
+
+	linger_put(lreq);
+out_unlock:
+	up_write(&osdc->lock);
+}
+
+static void send_linger_map_check(struct ceph_osd_linger_request *lreq)
+{
+	struct ceph_osd_client *osdc = lreq->osdc;
+	struct ceph_osd_linger_request *lookup_lreq;
+	int ret;
+
+	verify_osdc_wrlocked(osdc);
+
+	lookup_lreq = lookup_linger_mc(&osdc->linger_map_checks,
+				       lreq->linger_id);
+	if (lookup_lreq) {
+		WARN_ON(lookup_lreq != lreq);
+		return;
+	}
+
+	linger_get(lreq);
+	insert_linger_mc(&osdc->linger_map_checks, lreq);
+	ret = ceph_monc_get_version_async(&osdc->client->monc, "osdmap",
+					  linger_map_check_cb, lreq->linger_id);
+	WARN_ON(ret);
+}
+
+static int linger_reg_commit_wait(struct ceph_osd_linger_request *lreq)
+{
+	int ret;
+
+	dout("%s lreq %p linger_id %llu\n", __func__, lreq, lreq->linger_id);
+	ret = wait_for_completion_interruptible(&lreq->reg_commit_wait);
+	return ret ?: lreq->reg_commit_error;
+}
+
+static int linger_notify_finish_wait(struct ceph_osd_linger_request *lreq)
+{
+	int ret;
+
+	dout("%s lreq %p linger_id %llu\n", __func__, lreq, lreq->linger_id);
+	ret = wait_for_completion_interruptible(&lreq->notify_finish_wait);
+	return ret ?: lreq->notify_finish_error;
 }
 
 /*
- * Timeout callback, called every N seconds when 1 or more osd
- * requests has been active for more than N seconds.  When this
- * happens, we ping all OSDs with requests who have timed out to
- * ensure any communications channel reset is detected.  Reset the
- * request timeouts another N seconds in the future as we go.
- * Reschedule the timeout event another N seconds in future (unless
- * there are no open requests).
+ * Timeout callback, called every N seconds.  When 1 or more OSD
+ * requests has been active for more than N seconds, we send a keepalive
+ * (tag + timestamp) to its OSD to ensure any communications channel
+ * reset is detected.
  */
 static void handle_timeout(struct work_struct *work)
 {
 	struct ceph_osd_client *osdc =
 		container_of(work, struct ceph_osd_client, timeout_work.work);
 	struct ceph_options *opts = osdc->client->options;
-	struct ceph_osd_request *req;
-	struct ceph_osd *osd;
-	struct list_head slow_osds;
-	dout("timeout\n");
-	down_read(&osdc->map_sem);
-
-	ceph_monc_request_next_osdmap(&osdc->client->monc);
+	unsigned long cutoff = jiffies - opts->osd_keepalive_timeout;
+	LIST_HEAD(slow_osds);
+	struct rb_node *n, *p;
 
-	mutex_lock(&osdc->request_mutex);
+	dout("%s osdc %p\n", __func__, osdc);
+	down_write(&osdc->lock);
 
 	/*
 	 * ping osds that are a bit slow.  this ensures that if there
 	 * is a break in the TCP connection we will notice, and reopen
 	 * a connection with that osd (from the fault callback).
 	 */
-	INIT_LIST_HEAD(&slow_osds);
-	list_for_each_entry(req, &osdc->req_lru, r_req_lru_item) {
-		if (time_before(jiffies,
-				req->r_stamp + opts->osd_keepalive_timeout))
-			break;
+	for (n = rb_first(&osdc->osds); n; n = rb_next(n)) {
+		struct ceph_osd *osd = rb_entry(n, struct ceph_osd, o_node);
+		bool found = false;
+
+		for (p = rb_first(&osd->o_requests); p; p = rb_next(p)) {
+			struct ceph_osd_request *req =
+			    rb_entry(p, struct ceph_osd_request, r_node);
+
+			if (time_before(req->r_stamp, cutoff)) {
+				dout(" req %p tid %llu on osd%d is laggy\n",
+				     req, req->r_tid, osd->o_osd);
+				found = true;
+			}
+		}
+		for (p = rb_first(&osd->o_linger_requests); p; p = rb_next(p)) {
+			struct ceph_osd_linger_request *lreq =
+			    rb_entry(p, struct ceph_osd_linger_request, node);
+
+			dout(" lreq %p linger_id %llu is served by osd%d\n",
+			     lreq, lreq->linger_id, osd->o_osd);
+			found = true;
+
+			mutex_lock(&lreq->lock);
+			if (lreq->is_watch && lreq->committed && !lreq->last_error)
+				send_linger_ping(lreq);
+			mutex_unlock(&lreq->lock);
+		}
 
-		osd = req->r_osd;
-		BUG_ON(!osd);
-		dout(" tid %llu is slow, will send keepalive on osd%d\n",
-		     req->r_tid, osd->o_osd);
-		list_move_tail(&osd->o_keepalive_item, &slow_osds);
+		if (found)
+			list_move_tail(&osd->o_keepalive_item, &slow_osds);
 	}
+
+	if (atomic_read(&osdc->num_homeless) || !list_empty(&slow_osds))
+		maybe_request_map(osdc);
+
 	while (!list_empty(&slow_osds)) {
-		osd = list_entry(slow_osds.next, struct ceph_osd,
-				 o_keepalive_item);
+		struct ceph_osd *osd = list_first_entry(&slow_osds,
+							struct ceph_osd,
+							o_keepalive_item);
 		list_del_init(&osd->o_keepalive_item);
 		ceph_con_keepalive(&osd->o_con);
 	}
 
-	__schedule_osd_timeout(osdc);
-	__send_queued(osdc);
-	mutex_unlock(&osdc->request_mutex);
-	up_read(&osdc->map_sem);
+	up_write(&osdc->lock);
+	schedule_delayed_work(&osdc->timeout_work,
+			      osdc->client->options->osd_keepalive_timeout);
 }
 
 static void handle_osds_timeout(struct work_struct *work)
@@ -1663,12 +2541,20 @@ static void handle_osds_timeout(struct work_struct *work)
 		container_of(work, struct ceph_osd_client,
 			     osds_timeout_work.work);
 	unsigned long delay = osdc->client->options->osd_idle_ttl / 4;
+	struct ceph_osd *osd, *nosd;
 
-	dout("osds timeout\n");
-	down_read(&osdc->map_sem);
-	remove_old_osds(osdc);
-	up_read(&osdc->map_sem);
+	dout("%s osdc %p\n", __func__, osdc);
+	down_write(&osdc->lock);
+	list_for_each_entry_safe(osd, nosd, &osdc->osd_lru, o_osd_lru) {
+		if (time_before(jiffies, osd->lru_ttl))
+			break;
 
+		WARN_ON(!RB_EMPTY_ROOT(&osd->o_requests));
+		WARN_ON(!RB_EMPTY_ROOT(&osd->o_linger_requests));
+		close_osd(osd);
+	}
+
+	up_write(&osdc->lock);
 	schedule_delayed_work(&osdc->osds_timeout_work,
 			      round_jiffies_relative(delay));
 }
@@ -1776,107 +2662,76 @@ e_inval:
 	goto out;
 }
 
-static void complete_request(struct ceph_osd_request *req)
-{
-	complete_all(&req->r_safe_completion);  /* fsync waiter */
-}
+struct MOSDOpReply {
+	struct ceph_pg pgid;
+	u64 flags;
+	int result;
+	u32 epoch;
+	int num_ops;
+	u32 outdata_len[CEPH_OSD_MAX_OPS];
+	s32 rval[CEPH_OSD_MAX_OPS];
+	int retry_attempt;
+	struct ceph_eversion replay_version;
+	u64 user_version;
+	struct ceph_request_redirect redirect;
+};
 
-/*
- * handle osd op reply.  either call the callback if it is specified,
- * or do the completion to wake up the waiting thread.
- */
-static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg)
+static int decode_MOSDOpReply(const struct ceph_msg *msg, struct MOSDOpReply *m)
 {
-	void *p, *end;
-	struct ceph_osd_request *req;
-	struct ceph_request_redirect redir;
-	u64 tid;
-	int object_len;
-	unsigned int numops;
-	int payload_len, flags;
-	s32 result;
-	s32 retry_attempt;
-	struct ceph_pg pg;
-	int err;
-	u32 reassert_epoch;
-	u64 reassert_version;
-	u32 osdmap_epoch;
-	int already_completed;
-	u32 bytes;
+	void *p = msg->front.iov_base;
+	void *const end = p + msg->front.iov_len;
+	u16 version = le16_to_cpu(msg->hdr.version);
+	struct ceph_eversion bad_replay_version;
 	u8 decode_redir;
-	unsigned int i;
-
-	tid = le64_to_cpu(msg->hdr.tid);
-	dout("handle_reply %p tid %llu\n", msg, tid);
+	u32 len;
+	int ret;
+	int i;
 
-	p = msg->front.iov_base;
-	end = p + msg->front.iov_len;
+	ceph_decode_32_safe(&p, end, len, e_inval);
+	ceph_decode_need(&p, end, len, e_inval);
+	p += len; /* skip oid */
 
-	ceph_decode_need(&p, end, 4, bad);
-	object_len = ceph_decode_32(&p);
-	ceph_decode_need(&p, end, object_len, bad);
-	p += object_len;
+	ret = ceph_decode_pgid(&p, end, &m->pgid);
+	if (ret)
+		return ret;
 
-	err = ceph_decode_pgid(&p, end, &pg);
-	if (err)
-		goto bad;
+	ceph_decode_64_safe(&p, end, m->flags, e_inval);
+	ceph_decode_32_safe(&p, end, m->result, e_inval);
+	ceph_decode_need(&p, end, sizeof(bad_replay_version), e_inval);
+	memcpy(&bad_replay_version, p, sizeof(bad_replay_version));
+	p += sizeof(bad_replay_version);
+	ceph_decode_32_safe(&p, end, m->epoch, e_inval);
 
-	ceph_decode_need(&p, end, 8 + 4 + 4 + 8 + 4, bad);
-	flags = ceph_decode_64(&p);
-	result = ceph_decode_32(&p);
-	reassert_epoch = ceph_decode_32(&p);
-	reassert_version = ceph_decode_64(&p);
-	osdmap_epoch = ceph_decode_32(&p);
-
-	/* lookup */
-	down_read(&osdc->map_sem);
-	mutex_lock(&osdc->request_mutex);
-	req = __lookup_request(osdc, tid);
-	if (req == NULL) {
-		dout("handle_reply tid %llu dne\n", tid);
-		goto bad_mutex;
-	}
-	ceph_osdc_get_request(req);
+	ceph_decode_32_safe(&p, end, m->num_ops, e_inval);
+	if (m->num_ops > ARRAY_SIZE(m->outdata_len))
+		goto e_inval;
 
-	dout("handle_reply %p tid %llu req %p result %d\n", msg, tid,
-	     req, result);
-
-	ceph_decode_need(&p, end, 4, bad_put);
-	numops = ceph_decode_32(&p);
-	if (numops > CEPH_OSD_MAX_OPS)
-		goto bad_put;
-	if (numops != req->r_num_ops)
-		goto bad_put;
-	payload_len = 0;
-	ceph_decode_need(&p, end, numops * sizeof(struct ceph_osd_op), bad_put);
-	for (i = 0; i < numops; i++) {
+	ceph_decode_need(&p, end, m->num_ops * sizeof(struct ceph_osd_op),
+			 e_inval);
+	for (i = 0; i < m->num_ops; i++) {
 		struct ceph_osd_op *op = p;
-		int len;
 
-		len = le32_to_cpu(op->payload_len);
-		req->r_ops[i].outdata_len = len;
-		dout(" op %d has %d bytes\n", i, len);
-		payload_len += len;
+		m->outdata_len[i] = le32_to_cpu(op->payload_len);
 		p += sizeof(*op);
 	}
-	bytes = le32_to_cpu(msg->hdr.data_len);
-	if (payload_len != bytes) {
-		pr_warn("sum of op payload lens %d != data_len %d\n",
-			payload_len, bytes);
-		goto bad_put;
-	}
 
-	ceph_decode_need(&p, end, 4 + numops * 4, bad_put);
-	retry_attempt = ceph_decode_32(&p);
-	for (i = 0; i < numops; i++)
-		req->r_ops[i].rval = ceph_decode_32(&p);
+	ceph_decode_32_safe(&p, end, m->retry_attempt, e_inval);
+	for (i = 0; i < m->num_ops; i++)
+		ceph_decode_32_safe(&p, end, m->rval[i], e_inval);
 
-	if (le16_to_cpu(msg->hdr.version) >= 6) {
-		p += 8 + 4; /* skip replay_version */
-		p += 8; /* skip user_version */
+	if (version >= 5) {
+		ceph_decode_need(&p, end, sizeof(m->replay_version), e_inval);
+		memcpy(&m->replay_version, p, sizeof(m->replay_version));
+		p += sizeof(m->replay_version);
+		ceph_decode_64_safe(&p, end, m->user_version, e_inval);
+	} else {
+		m->replay_version = bad_replay_version; /* struct */
+		m->user_version = le64_to_cpu(m->replay_version.version);
+	}
 
-		if (le16_to_cpu(msg->hdr.version) >= 7)
-			ceph_decode_8_safe(&p, end, decode_redir, bad_put);
+	if (version >= 6) {
+		if (version >= 7)
+			ceph_decode_8_safe(&p, end, decode_redir, e_inval);
 		else
 			decode_redir = 1;
 	} else {
@@ -1884,228 +2739,410 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg)
 	}
 
 	if (decode_redir) {
-		err = ceph_redirect_decode(&p, end, &redir);
-		if (err)
-			goto bad_put;
+		ret = ceph_redirect_decode(&p, end, &m->redirect);
+		if (ret)
+			return ret;
 	} else {
-		redir.oloc.pool = -1;
+		ceph_oloc_init(&m->redirect.oloc);
 	}
 
-	if (redir.oloc.pool != -1) {
-		dout("redirect pool %lld\n", redir.oloc.pool);
+	return 0;
 
-		__unregister_request(osdc, req);
+e_inval:
+	return -EINVAL;
+}
 
-		req->r_target_oloc = redir.oloc; /* struct */
+/*
+ * We are done with @req if
+ *   - @m is a safe reply, or
+ *   - @m is an unsafe reply and we didn't want a safe one
+ */
+static bool done_request(const struct ceph_osd_request *req,
+			 const struct MOSDOpReply *m)
+{
+	return (m->result < 0 ||
+		(m->flags & CEPH_OSD_FLAG_ONDISK) ||
+		!(req->r_flags & CEPH_OSD_FLAG_ONDISK));
+}
 
-		/*
-		 * Start redirect requests with nofail=true.  If
-		 * mapping fails, request will end up on the notarget
-		 * list, waiting for the new osdmap (which can take
-		 * a while), even though the original request mapped
-		 * successfully.  In the future we might want to follow
-		 * original request's nofail setting here.
-		 */
-		err = __ceph_osdc_start_request(osdc, req, true);
-		BUG_ON(err);
+/*
+ * handle osd op reply.  either call the callback if it is specified,
+ * or do the completion to wake up the waiting thread.
+ *
+ * ->r_unsafe_callback is set?	yes			no
+ *
+ * first reply is OK (needed	r_cb/r_completion,	r_cb/r_completion,
+ * any or needed/got safe)	r_safe_completion	r_safe_completion
+ *
+ * first reply is unsafe	r_unsafe_cb(true)	(nothing)
+ *
+ * when we get the safe reply	r_unsafe_cb(false),	r_cb/r_completion,
+ *				r_safe_completion	r_safe_completion
+ */
+static void handle_reply(struct ceph_osd *osd, struct ceph_msg *msg)
+{
+	struct ceph_osd_client *osdc = osd->o_osdc;
+	struct ceph_osd_request *req;
+	struct MOSDOpReply m;
+	u64 tid = le64_to_cpu(msg->hdr.tid);
+	u32 data_len = 0;
+	bool already_acked;
+	int ret;
+	int i;
 
-		goto out_unlock;
-	}
+	dout("%s msg %p tid %llu\n", __func__, msg, tid);
 
-	already_completed = req->r_got_reply;
-	if (!req->r_got_reply) {
-		req->r_result = result;
-		dout("handle_reply result %d bytes %d\n", req->r_result,
-		     bytes);
-		if (req->r_result == 0)
-			req->r_result = bytes;
+	down_read(&osdc->lock);
+	if (!osd_registered(osd)) {
+		dout("%s osd%d unknown\n", __func__, osd->o_osd);
+		goto out_unlock_osdc;
+	}
+	WARN_ON(osd->o_osd != le64_to_cpu(msg->hdr.src.num));
 
-		/* in case this is a write and we need to replay, */
-		req->r_reassert_version.epoch = cpu_to_le32(reassert_epoch);
-		req->r_reassert_version.version = cpu_to_le64(reassert_version);
+	mutex_lock(&osd->lock);
+	req = lookup_request(&osd->o_requests, tid);
+	if (!req) {
+		dout("%s osd%d tid %llu unknown\n", __func__, osd->o_osd, tid);
+		goto out_unlock_session;
+	}
 
-		req->r_got_reply = 1;
-	} else if ((flags & CEPH_OSD_FLAG_ONDISK) == 0) {
-		dout("handle_reply tid %llu dup ack\n", tid);
-		goto out_unlock;
+	ret = decode_MOSDOpReply(msg, &m);
+	if (ret) {
+		pr_err("failed to decode MOSDOpReply for tid %llu: %d\n",
+		       req->r_tid, ret);
+		ceph_msg_dump(msg);
+		goto fail_request;
+	}
+	dout("%s req %p tid %llu flags 0x%llx pgid %llu.%x epoch %u attempt %d v %u'%llu uv %llu\n",
+	     __func__, req, req->r_tid, m.flags, m.pgid.pool, m.pgid.seed,
+	     m.epoch, m.retry_attempt, le32_to_cpu(m.replay_version.epoch),
+	     le64_to_cpu(m.replay_version.version), m.user_version);
+
+	if (m.retry_attempt >= 0) {
+		if (m.retry_attempt != req->r_attempts - 1) {
+			dout("req %p tid %llu retry_attempt %d != %d, ignoring\n",
+			     req, req->r_tid, m.retry_attempt,
+			     req->r_attempts - 1);
+			goto out_unlock_session;
+		}
+	} else {
+		WARN_ON(1); /* MOSDOpReply v4 is assumed */
 	}
 
-	dout("handle_reply tid %llu flags %d\n", tid, flags);
+	if (!ceph_oloc_empty(&m.redirect.oloc)) {
+		dout("req %p tid %llu redirect pool %lld\n", req, req->r_tid,
+		     m.redirect.oloc.pool);
+		unlink_request(osd, req);
+		mutex_unlock(&osd->lock);
+
+		ceph_oloc_copy(&req->r_t.target_oloc, &m.redirect.oloc);
+		req->r_flags |= CEPH_OSD_FLAG_REDIRECTED;
+		req->r_tid = 0;
+		__submit_request(req, false);
+		goto out_unlock_osdc;
+	}
 
-	if (req->r_linger && (flags & CEPH_OSD_FLAG_ONDISK))
-		__register_linger_request(osdc, req);
+	if (m.num_ops != req->r_num_ops) {
+		pr_err("num_ops %d != %d for tid %llu\n", m.num_ops,
+		       req->r_num_ops, req->r_tid);
+		goto fail_request;
+	}
+	for (i = 0; i < req->r_num_ops; i++) {
+		dout(" req %p tid %llu op %d rval %d len %u\n", req,
+		     req->r_tid, i, m.rval[i], m.outdata_len[i]);
+		req->r_ops[i].rval = m.rval[i];
+		req->r_ops[i].outdata_len = m.outdata_len[i];
+		data_len += m.outdata_len[i];
+	}
+	if (data_len != le32_to_cpu(msg->hdr.data_len)) {
+		pr_err("sum of lens %u != %u for tid %llu\n", data_len,
+		       le32_to_cpu(msg->hdr.data_len), req->r_tid);
+		goto fail_request;
+	}
+	dout("%s req %p tid %llu acked %d result %d data_len %u\n", __func__,
+	     req, req->r_tid, req->r_got_reply, m.result, data_len);
+
+	already_acked = req->r_got_reply;
+	if (!already_acked) {
+		req->r_result = m.result ?: data_len;
+		req->r_replay_version = m.replay_version; /* struct */
+		req->r_got_reply = true;
+	} else if (!(m.flags & CEPH_OSD_FLAG_ONDISK)) {
+		dout("req %p tid %llu dup ack\n", req, req->r_tid);
+		goto out_unlock_session;
+	}
 
-	/* either this is a read, or we got the safe response */
-	if (result < 0 ||
-	    (flags & CEPH_OSD_FLAG_ONDISK) ||
-	    ((flags & CEPH_OSD_FLAG_WRITE) == 0))
-		__unregister_request(osdc, req);
+	if (done_request(req, &m)) {
+		__finish_request(req);
+		if (req->r_linger) {
+			WARN_ON(req->r_unsafe_callback);
+			dout("req %p tid %llu cb (locked)\n", req, req->r_tid);
+			__complete_request(req);
+		}
+	}
 
-	mutex_unlock(&osdc->request_mutex);
-	up_read(&osdc->map_sem);
+	mutex_unlock(&osd->lock);
+	up_read(&osdc->lock);
 
-	if (!already_completed) {
-		if (req->r_unsafe_callback &&
-		    result >= 0 && !(flags & CEPH_OSD_FLAG_ONDISK))
+	if (done_request(req, &m)) {
+		if (already_acked && req->r_unsafe_callback) {
+			dout("req %p tid %llu safe-cb\n", req, req->r_tid);
+			req->r_unsafe_callback(req, false);
+		} else if (!req->r_linger) {
+			dout("req %p tid %llu cb\n", req, req->r_tid);
+			__complete_request(req);
+		}
+		if (m.flags & CEPH_OSD_FLAG_ONDISK)
+			complete_all(&req->r_safe_completion);
+		ceph_osdc_put_request(req);
+	} else {
+		if (req->r_unsafe_callback) {
+			dout("req %p tid %llu unsafe-cb\n", req, req->r_tid);
 			req->r_unsafe_callback(req, true);
-		if (req->r_callback)
-			req->r_callback(req, msg);
-		else
-			complete_all(&req->r_completion);
+		} else {
+			WARN_ON(1);
+		}
 	}
 
-	if (flags & CEPH_OSD_FLAG_ONDISK) {
-		if (req->r_unsafe_callback && already_completed)
-			req->r_unsafe_callback(req, false);
-		complete_request(req);
+	return;
+
+fail_request:
+	complete_request(req, -EIO);
+out_unlock_session:
+	mutex_unlock(&osd->lock);
+out_unlock_osdc:
+	up_read(&osdc->lock);
+}
+
+static void set_pool_was_full(struct ceph_osd_client *osdc)
+{
+	struct rb_node *n;
+
+	for (n = rb_first(&osdc->osdmap->pg_pools); n; n = rb_next(n)) {
+		struct ceph_pg_pool_info *pi =
+		    rb_entry(n, struct ceph_pg_pool_info, node);
+
+		pi->was_full = __pool_full(pi);
 	}
+}
 
-out:
-	dout("req=%p req->r_linger=%d\n", req, req->r_linger);
-	ceph_osdc_put_request(req);
-	return;
-out_unlock:
-	mutex_unlock(&osdc->request_mutex);
-	up_read(&osdc->map_sem);
-	goto out;
+static bool pool_cleared_full(struct ceph_osd_client *osdc, s64 pool_id)
+{
+	struct ceph_pg_pool_info *pi;
 
-bad_put:
-	req->r_result = -EIO;
-	__unregister_request(osdc, req);
-	if (req->r_callback)
-		req->r_callback(req, msg);
-	else
-		complete_all(&req->r_completion);
-	complete_request(req);
-	ceph_osdc_put_request(req);
-bad_mutex:
-	mutex_unlock(&osdc->request_mutex);
-	up_read(&osdc->map_sem);
-bad:
-	pr_err("corrupt osd_op_reply got %d %d\n",
-	       (int)msg->front.iov_len, le32_to_cpu(msg->hdr.front_len));
-	ceph_msg_dump(msg);
+	pi = ceph_pg_pool_by_id(osdc->osdmap, pool_id);
+	if (!pi)
+		return false;
+
+	return pi->was_full && !__pool_full(pi);
 }
 
-static void reset_changed_osds(struct ceph_osd_client *osdc)
+static enum calc_target_result
+recalc_linger_target(struct ceph_osd_linger_request *lreq)
 {
-	struct rb_node *p, *n;
+	struct ceph_osd_client *osdc = lreq->osdc;
+	enum calc_target_result ct_res;
 
-	dout("%s %p\n", __func__, osdc);
-	for (p = rb_first(&osdc->osds); p; p = n) {
-		struct ceph_osd *osd = rb_entry(p, struct ceph_osd, o_node);
+	ct_res = calc_target(osdc, &lreq->t, &lreq->last_force_resend, true);
+	if (ct_res == CALC_TARGET_NEED_RESEND) {
+		struct ceph_osd *osd;
 
-		n = rb_next(p);
-		if (!ceph_osd_is_up(osdc->osdmap, osd->o_osd) ||
-		    memcmp(&osd->o_con.peer_addr,
-			   ceph_osd_addr(osdc->osdmap,
-					 osd->o_osd),
-			   sizeof(struct ceph_entity_addr)) != 0)
-			__reset_osd(osdc, osd);
+		osd = lookup_create_osd(osdc, lreq->t.osd, true);
+		if (osd != lreq->osd) {
+			unlink_linger(lreq->osd, lreq);
+			link_linger(osd, lreq);
+		}
 	}
+
+	return ct_res;
 }
 
 /*
- * Requeue requests whose mapping to an OSD has changed.  If requests map to
- * no osd, request a new map.
- *
- * Caller should hold map_sem for read.
+ * Requeue requests whose mapping to an OSD has changed.
  */
-static void kick_requests(struct ceph_osd_client *osdc, bool force_resend,
-			  bool force_resend_writes)
+static void scan_requests(struct ceph_osd *osd,
+			  bool force_resend,
+			  bool cleared_full,
+			  bool check_pool_cleared_full,
+			  struct rb_root *need_resend,
+			  struct list_head *need_resend_linger)
 {
-	struct ceph_osd_request *req, *nreq;
-	struct rb_node *p;
-	int needmap = 0;
-	int err;
-	bool force_resend_req;
+	struct ceph_osd_client *osdc = osd->o_osdc;
+	struct rb_node *n;
+	bool force_resend_writes;
+
+	for (n = rb_first(&osd->o_linger_requests); n; ) {
+		struct ceph_osd_linger_request *lreq =
+		    rb_entry(n, struct ceph_osd_linger_request, node);
+		enum calc_target_result ct_res;
+
+		n = rb_next(n); /* recalc_linger_target() */
+
+		dout("%s lreq %p linger_id %llu\n", __func__, lreq,
+		     lreq->linger_id);
+		ct_res = recalc_linger_target(lreq);
+		switch (ct_res) {
+		case CALC_TARGET_NO_ACTION:
+			force_resend_writes = cleared_full ||
+			    (check_pool_cleared_full &&
+			     pool_cleared_full(osdc, lreq->t.base_oloc.pool));
+			if (!force_resend && !force_resend_writes)
+				break;
+
+			/* fall through */
+		case CALC_TARGET_NEED_RESEND:
+			cancel_linger_map_check(lreq);
+			/*
+			 * scan_requests() for the previous epoch(s)
+			 * may have already added it to the list, since
+			 * it's not unlinked here.
+			 */
+			if (list_empty(&lreq->scan_item))
+				list_add_tail(&lreq->scan_item, need_resend_linger);
+			break;
+		case CALC_TARGET_POOL_DNE:
+			check_linger_pool_dne(lreq);
+			break;
+		}
+	}
 
-	dout("kick_requests %s %s\n", force_resend ? " (force resend)" : "",
-		force_resend_writes ? " (force resend writes)" : "");
-	mutex_lock(&osdc->request_mutex);
-	for (p = rb_first(&osdc->requests); p; ) {
-		req = rb_entry(p, struct ceph_osd_request, r_node);
-		p = rb_next(p);
+	for (n = rb_first(&osd->o_requests); n; ) {
+		struct ceph_osd_request *req =
+		    rb_entry(n, struct ceph_osd_request, r_node);
+		enum calc_target_result ct_res;
+
+		n = rb_next(n); /* unlink_request(), check_pool_dne() */
+
+		dout("%s req %p tid %llu\n", __func__, req, req->r_tid);
+		ct_res = calc_target(osdc, &req->r_t,
+				     &req->r_last_force_resend, false);
+		switch (ct_res) {
+		case CALC_TARGET_NO_ACTION:
+			force_resend_writes = cleared_full ||
+			    (check_pool_cleared_full &&
+			     pool_cleared_full(osdc, req->r_t.base_oloc.pool));
+			if (!force_resend &&
+			    (!(req->r_flags & CEPH_OSD_FLAG_WRITE) ||
+			     !force_resend_writes))
+				break;
+
+			/* fall through */
+		case CALC_TARGET_NEED_RESEND:
+			cancel_map_check(req);
+			unlink_request(osd, req);
+			insert_request(need_resend, req);
+			break;
+		case CALC_TARGET_POOL_DNE:
+			check_pool_dne(req);
+			break;
+		}
+	}
+}
 
+static int handle_one_map(struct ceph_osd_client *osdc,
+			  void *p, void *end, bool incremental,
+			  struct rb_root *need_resend,
+			  struct list_head *need_resend_linger)
+{
+	struct ceph_osdmap *newmap;
+	struct rb_node *n;
+	bool skipped_map = false;
+	bool was_full;
+
+	was_full = ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL);
+	set_pool_was_full(osdc);
+
+	if (incremental)
+		newmap = osdmap_apply_incremental(&p, end, osdc->osdmap);
+	else
+		newmap = ceph_osdmap_decode(&p, end);
+	if (IS_ERR(newmap))
+		return PTR_ERR(newmap);
+
+	if (newmap != osdc->osdmap) {
 		/*
-		 * For linger requests that have not yet been
-		 * registered, move them to the linger list; they'll
-		 * be sent to the osd in the loop below.  Unregister
-		 * the request before re-registering it as a linger
-		 * request to ensure the __map_request() below
-		 * will decide it needs to be sent.
+		 * Preserve ->was_full before destroying the old map.
+		 * For pools that weren't in the old map, ->was_full
+		 * should be false.
 		 */
-		if (req->r_linger && list_empty(&req->r_linger_item)) {
-			dout("%p tid %llu restart on osd%d\n",
-			     req, req->r_tid,
-			     req->r_osd ? req->r_osd->o_osd : -1);
-			ceph_osdc_get_request(req);
-			__unregister_request(osdc, req);
-			__register_linger_request(osdc, req);
-			ceph_osdc_put_request(req);
-			continue;
+		for (n = rb_first(&newmap->pg_pools); n; n = rb_next(n)) {
+			struct ceph_pg_pool_info *pi =
+			    rb_entry(n, struct ceph_pg_pool_info, node);
+			struct ceph_pg_pool_info *old_pi;
+
+			old_pi = ceph_pg_pool_by_id(osdc->osdmap, pi->id);
+			if (old_pi)
+				pi->was_full = old_pi->was_full;
+			else
+				WARN_ON(pi->was_full);
 		}
 
-		force_resend_req = force_resend ||
-			(force_resend_writes &&
-				req->r_flags & CEPH_OSD_FLAG_WRITE);
-		err = __map_request(osdc, req, force_resend_req);
-		if (err < 0)
-			continue;  /* error */
-		if (req->r_osd == NULL) {
-			dout("%p tid %llu maps to no osd\n", req, req->r_tid);
-			needmap++;  /* request a newer map */
-		} else if (err > 0) {
-			if (!req->r_linger) {
-				dout("%p tid %llu requeued on osd%d\n", req,
-				     req->r_tid,
-				     req->r_osd ? req->r_osd->o_osd : -1);
-				req->r_flags |= CEPH_OSD_FLAG_RETRY;
-			}
+		if (osdc->osdmap->epoch &&
+		    osdc->osdmap->epoch + 1 < newmap->epoch) {
+			WARN_ON(incremental);
+			skipped_map = true;
 		}
+
+		ceph_osdmap_destroy(osdc->osdmap);
+		osdc->osdmap = newmap;
 	}
 
-	list_for_each_entry_safe(req, nreq, &osdc->req_linger,
-				 r_linger_item) {
-		dout("linger req=%p req->r_osd=%p\n", req, req->r_osd);
-
-		err = __map_request(osdc, req,
-				    force_resend || force_resend_writes);
-		dout("__map_request returned %d\n", err);
-		if (err < 0)
-			continue;  /* hrm! */
-		if (req->r_osd == NULL || err > 0) {
-			if (req->r_osd == NULL) {
-				dout("lingering %p tid %llu maps to no osd\n",
-				     req, req->r_tid);
-				/*
-				 * A homeless lingering request makes
-				 * no sense, as it's job is to keep
-				 * a particular OSD connection open.
-				 * Request a newer map and kick the
-				 * request, knowing that it won't be
-				 * resent until we actually get a map
-				 * that can tell us where to send it.
-				 */
-				needmap++;
-			}
+	was_full &= !ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL);
+	scan_requests(&osdc->homeless_osd, skipped_map, was_full, true,
+		      need_resend, need_resend_linger);
+
+	for (n = rb_first(&osdc->osds); n; ) {
+		struct ceph_osd *osd = rb_entry(n, struct ceph_osd, o_node);
+
+		n = rb_next(n); /* close_osd() */
+
+		scan_requests(osd, skipped_map, was_full, true, need_resend,
+			      need_resend_linger);
+		if (!ceph_osd_is_up(osdc->osdmap, osd->o_osd) ||
+		    memcmp(&osd->o_con.peer_addr,
+			   ceph_osd_addr(osdc->osdmap, osd->o_osd),
+			   sizeof(struct ceph_entity_addr)))
+			close_osd(osd);
+	}
 
-			dout("kicking lingering %p tid %llu osd%d\n", req,
-			     req->r_tid, req->r_osd ? req->r_osd->o_osd : -1);
-			__register_request(osdc, req);
-			__unregister_linger_request(osdc, req);
+	return 0;
+}
+
+static void kick_requests(struct ceph_osd_client *osdc,
+			  struct rb_root *need_resend,
+			  struct list_head *need_resend_linger)
+{
+	struct ceph_osd_linger_request *lreq, *nlreq;
+	struct rb_node *n;
+
+	for (n = rb_first(need_resend); n; ) {
+		struct ceph_osd_request *req =
+		    rb_entry(n, struct ceph_osd_request, r_node);
+		struct ceph_osd *osd;
+
+		n = rb_next(n);
+		erase_request(need_resend, req); /* before link_request() */
+
+		WARN_ON(req->r_osd);
+		calc_target(osdc, &req->r_t, NULL, false);
+		osd = lookup_create_osd(osdc, req->r_t.osd, true);
+		link_request(osd, req);
+		if (!req->r_linger) {
+			if (!osd_homeless(osd) && !req->r_t.paused)
+				send_request(req);
+		} else {
+			cancel_linger_request(req);
 		}
 	}
-	reset_changed_osds(osdc);
-	mutex_unlock(&osdc->request_mutex);
 
-	if (needmap) {
-		dout("%d requests for down osds, need new map\n", needmap);
-		ceph_monc_request_next_osdmap(&osdc->client->monc);
+	list_for_each_entry_safe(lreq, nlreq, need_resend_linger, scan_item) {
+		if (!osd_homeless(lreq->osd))
+			send_linger(lreq);
+
+		list_del_init(&lreq->scan_item);
 	}
 }
 
-
 /*
  * Process updated osd map.
  *
@@ -2115,27 +3152,31 @@ static void kick_requests(struct ceph_osd_client *osdc, bool force_resend,
  */
 void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg)
 {
-	void *p, *end, *next;
+	void *p = msg->front.iov_base;
+	void *const end = p + msg->front.iov_len;
 	u32 nr_maps, maplen;
 	u32 epoch;
-	struct ceph_osdmap *newmap = NULL, *oldmap;
-	int err;
 	struct ceph_fsid fsid;
-	bool was_full;
+	struct rb_root need_resend = RB_ROOT;
+	LIST_HEAD(need_resend_linger);
+	bool handled_incremental = false;
+	bool was_pauserd, was_pausewr;
+	bool pauserd, pausewr;
+	int err;
 
-	dout("handle_map have %u\n", osdc->osdmap ? osdc->osdmap->epoch : 0);
-	p = msg->front.iov_base;
-	end = p + msg->front.iov_len;
+	dout("%s have %u\n", __func__, osdc->osdmap->epoch);
+	down_write(&osdc->lock);
 
 	/* verify fsid */
 	ceph_decode_need(&p, end, sizeof(fsid), bad);
 	ceph_decode_copy(&p, &fsid, sizeof(fsid));
 	if (ceph_check_fsid(osdc->client, &fsid) < 0)
-		return;
-
-	down_write(&osdc->map_sem);
+		goto bad;
 
-	was_full = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL);
+	was_pauserd = ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSERD);
+	was_pausewr = ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSEWR) ||
+		      ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) ||
+		      have_pool_full(osdc);
 
 	/* incremental maps */
 	ceph_decode_32_safe(&p, end, nr_maps, bad);
@@ -2145,34 +3186,23 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg)
 		epoch = ceph_decode_32(&p);
 		maplen = ceph_decode_32(&p);
 		ceph_decode_need(&p, end, maplen, bad);
-		next = p + maplen;
-		if (osdc->osdmap && osdc->osdmap->epoch+1 == epoch) {
+		if (osdc->osdmap->epoch &&
+		    osdc->osdmap->epoch + 1 == epoch) {
 			dout("applying incremental map %u len %d\n",
 			     epoch, maplen);
-			newmap = osdmap_apply_incremental(&p, next,
-							  osdc->osdmap,
-							  &osdc->client->msgr);
-			if (IS_ERR(newmap)) {
-				err = PTR_ERR(newmap);
+			err = handle_one_map(osdc, p, p + maplen, true,
+					     &need_resend, &need_resend_linger);
+			if (err)
 				goto bad;
-			}
-			BUG_ON(!newmap);
-			if (newmap != osdc->osdmap) {
-				ceph_osdmap_destroy(osdc->osdmap);
-				osdc->osdmap = newmap;
-			}
-			was_full = was_full ||
-				ceph_osdmap_flag(osdc->osdmap,
-						 CEPH_OSDMAP_FULL);
-			kick_requests(osdc, 0, was_full);
+			handled_incremental = true;
 		} else {
 			dout("ignoring incremental map %u len %d\n",
 			     epoch, maplen);
 		}
-		p = next;
+		p += maplen;
 		nr_maps--;
 	}
-	if (newmap)
+	if (handled_incremental)
 		goto done;
 
 	/* full maps */
@@ -2186,455 +3216,647 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg)
 		if (nr_maps > 1) {
 			dout("skipping non-latest full map %u len %d\n",
 			     epoch, maplen);
-		} else if (osdc->osdmap && osdc->osdmap->epoch >= epoch) {
+		} else if (osdc->osdmap->epoch >= epoch) {
 			dout("skipping full map %u len %d, "
 			     "older than our %u\n", epoch, maplen,
 			     osdc->osdmap->epoch);
 		} else {
-			int skipped_map = 0;
-
 			dout("taking full map %u len %d\n", epoch, maplen);
-			newmap = ceph_osdmap_decode(&p, p+maplen);
-			if (IS_ERR(newmap)) {
-				err = PTR_ERR(newmap);
+			err = handle_one_map(osdc, p, p + maplen, false,
+					     &need_resend, &need_resend_linger);
+			if (err)
 				goto bad;
-			}
-			BUG_ON(!newmap);
-			oldmap = osdc->osdmap;
-			osdc->osdmap = newmap;
-			if (oldmap) {
-				if (oldmap->epoch + 1 < newmap->epoch)
-					skipped_map = 1;
-				ceph_osdmap_destroy(oldmap);
-			}
-			was_full = was_full ||
-				ceph_osdmap_flag(osdc->osdmap,
-						 CEPH_OSDMAP_FULL);
-			kick_requests(osdc, skipped_map, was_full);
 		}
 		p += maplen;
 		nr_maps--;
 	}
 
-	if (!osdc->osdmap)
-		goto bad;
 done:
-	downgrade_write(&osdc->map_sem);
-	ceph_monc_got_map(&osdc->client->monc, CEPH_SUB_OSDMAP,
-			  osdc->osdmap->epoch);
-
 	/*
 	 * subscribe to subsequent osdmap updates if full to ensure
 	 * we find out when we are no longer full and stop returning
 	 * ENOSPC.
 	 */
-	if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL) ||
-		ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSERD) ||
-		ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSEWR))
-		ceph_monc_request_next_osdmap(&osdc->client->monc);
-
-	mutex_lock(&osdc->request_mutex);
-	__send_queued(osdc);
-	mutex_unlock(&osdc->request_mutex);
-	up_read(&osdc->map_sem);
+	pauserd = ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSERD);
+	pausewr = ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSEWR) ||
+		  ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) ||
+		  have_pool_full(osdc);
+	if (was_pauserd || was_pausewr || pauserd || pausewr)
+		maybe_request_map(osdc);
+
+	kick_requests(osdc, &need_resend, &need_resend_linger);
+
+	ceph_monc_got_map(&osdc->client->monc, CEPH_SUB_OSDMAP,
+			  osdc->osdmap->epoch);
+	up_write(&osdc->lock);
 	wake_up_all(&osdc->client->auth_wq);
 	return;
 
 bad:
 	pr_err("osdc handle_map corrupt msg\n");
 	ceph_msg_dump(msg);
-	up_write(&osdc->map_sem);
+	up_write(&osdc->lock);
 }
 
 /*
- * watch/notify callback event infrastructure
- *
- * These callbacks are used both for watch and notify operations.
+ * Resubmit requests pending on the given osd.
  */
-static void __release_event(struct kref *kref)
+static void kick_osd_requests(struct ceph_osd *osd)
 {
-	struct ceph_osd_event *event =
-		container_of(kref, struct ceph_osd_event, kref);
+	struct rb_node *n;
 
-	dout("__release_event %p\n", event);
-	kfree(event);
-}
+	for (n = rb_first(&osd->o_requests); n; ) {
+		struct ceph_osd_request *req =
+		    rb_entry(n, struct ceph_osd_request, r_node);
 
-static void get_event(struct ceph_osd_event *event)
-{
-	kref_get(&event->kref);
-}
+		n = rb_next(n); /* cancel_linger_request() */
 
-void ceph_osdc_put_event(struct ceph_osd_event *event)
-{
-	kref_put(&event->kref, __release_event);
+		if (!req->r_linger) {
+			if (!req->r_t.paused)
+				send_request(req);
+		} else {
+			cancel_linger_request(req);
+		}
+	}
+	for (n = rb_first(&osd->o_linger_requests); n; n = rb_next(n)) {
+		struct ceph_osd_linger_request *lreq =
+		    rb_entry(n, struct ceph_osd_linger_request, node);
+
+		send_linger(lreq);
+	}
 }
-EXPORT_SYMBOL(ceph_osdc_put_event);
 
-static void __insert_event(struct ceph_osd_client *osdc,
-			     struct ceph_osd_event *new)
+/*
+ * If the osd connection drops, we need to resubmit all requests.
+ */
+static void osd_fault(struct ceph_connection *con)
 {
-	struct rb_node **p = &osdc->event_tree.rb_node;
-	struct rb_node *parent = NULL;
-	struct ceph_osd_event *event = NULL;
+	struct ceph_osd *osd = con->private;
+	struct ceph_osd_client *osdc = osd->o_osdc;
 
-	while (*p) {
-		parent = *p;
-		event = rb_entry(parent, struct ceph_osd_event, node);
-		if (new->cookie < event->cookie)
-			p = &(*p)->rb_left;
-		else if (new->cookie > event->cookie)
-			p = &(*p)->rb_right;
-		else
-			BUG();
+	dout("%s osd %p osd%d\n", __func__, osd, osd->o_osd);
+
+	down_write(&osdc->lock);
+	if (!osd_registered(osd)) {
+		dout("%s osd%d unknown\n", __func__, osd->o_osd);
+		goto out_unlock;
 	}
 
-	rb_link_node(&new->node, parent, p);
-	rb_insert_color(&new->node, &osdc->event_tree);
+	if (!reopen_osd(osd))
+		kick_osd_requests(osd);
+	maybe_request_map(osdc);
+
+out_unlock:
+	up_write(&osdc->lock);
 }
 
-static struct ceph_osd_event *__find_event(struct ceph_osd_client *osdc,
-					        u64 cookie)
+/*
+ * Process osd watch notifications
+ */
+static void handle_watch_notify(struct ceph_osd_client *osdc,
+				struct ceph_msg *msg)
 {
-	struct rb_node **p = &osdc->event_tree.rb_node;
-	struct rb_node *parent = NULL;
-	struct ceph_osd_event *event = NULL;
+	void *p = msg->front.iov_base;
+	void *const end = p + msg->front.iov_len;
+	struct ceph_osd_linger_request *lreq;
+	struct linger_work *lwork;
+	u8 proto_ver, opcode;
+	u64 cookie, notify_id;
+	u64 notifier_id = 0;
+	s32 return_code = 0;
+	void *payload = NULL;
+	u32 payload_len = 0;
 
-	while (*p) {
-		parent = *p;
-		event = rb_entry(parent, struct ceph_osd_event, node);
-		if (cookie < event->cookie)
-			p = &(*p)->rb_left;
-		else if (cookie > event->cookie)
-			p = &(*p)->rb_right;
-		else
-			return event;
+	ceph_decode_8_safe(&p, end, proto_ver, bad);
+	ceph_decode_8_safe(&p, end, opcode, bad);
+	ceph_decode_64_safe(&p, end, cookie, bad);
+	p += 8; /* skip ver */
+	ceph_decode_64_safe(&p, end, notify_id, bad);
+
+	if (proto_ver >= 1) {
+		ceph_decode_32_safe(&p, end, payload_len, bad);
+		ceph_decode_need(&p, end, payload_len, bad);
+		payload = p;
+		p += payload_len;
 	}
-	return NULL;
-}
 
-static void __remove_event(struct ceph_osd_event *event)
-{
-	struct ceph_osd_client *osdc = event->osdc;
+	if (le16_to_cpu(msg->hdr.version) >= 2)
+		ceph_decode_32_safe(&p, end, return_code, bad);
+
+	if (le16_to_cpu(msg->hdr.version) >= 3)
+		ceph_decode_64_safe(&p, end, notifier_id, bad);
 
-	if (!RB_EMPTY_NODE(&event->node)) {
-		dout("__remove_event removed %p\n", event);
-		rb_erase(&event->node, &osdc->event_tree);
-		ceph_osdc_put_event(event);
+	down_read(&osdc->lock);
+	lreq = lookup_linger_osdc(&osdc->linger_requests, cookie);
+	if (!lreq) {
+		dout("%s opcode %d cookie %llu dne\n", __func__, opcode,
+		     cookie);
+		goto out_unlock_osdc;
+	}
+
+	mutex_lock(&lreq->lock);
+	dout("%s opcode %d cookie %llu lreq %p is_watch %d\n", __func__,
+	     opcode, cookie, lreq, lreq->is_watch);
+	if (opcode == CEPH_WATCH_EVENT_DISCONNECT) {
+		if (!lreq->last_error) {
+			lreq->last_error = -ENOTCONN;
+			queue_watch_error(lreq);
+		}
+	} else if (!lreq->is_watch) {
+		/* CEPH_WATCH_EVENT_NOTIFY_COMPLETE */
+		if (lreq->notify_id && lreq->notify_id != notify_id) {
+			dout("lreq %p notify_id %llu != %llu, ignoring\n", lreq,
+			     lreq->notify_id, notify_id);
+		} else if (!completion_done(&lreq->notify_finish_wait)) {
+			struct ceph_msg_data *data =
+			    list_first_entry_or_null(&msg->data,
+						     struct ceph_msg_data,
+						     links);
+
+			if (data) {
+				if (lreq->preply_pages) {
+					WARN_ON(data->type !=
+							CEPH_MSG_DATA_PAGES);
+					*lreq->preply_pages = data->pages;
+					*lreq->preply_len = data->length;
+				} else {
+					ceph_release_page_vector(data->pages,
+					       calc_pages_for(0, data->length));
+				}
+			}
+			lreq->notify_finish_error = return_code;
+			complete_all(&lreq->notify_finish_wait);
+		}
 	} else {
-		dout("__remove_event didn't remove %p\n", event);
+		/* CEPH_WATCH_EVENT_NOTIFY */
+		lwork = lwork_alloc(lreq, do_watch_notify);
+		if (!lwork) {
+			pr_err("failed to allocate notify-lwork\n");
+			goto out_unlock_lreq;
+		}
+
+		lwork->notify.notify_id = notify_id;
+		lwork->notify.notifier_id = notifier_id;
+		lwork->notify.payload = payload;
+		lwork->notify.payload_len = payload_len;
+		lwork->notify.msg = ceph_msg_get(msg);
+		lwork_queue(lwork);
 	}
+
+out_unlock_lreq:
+	mutex_unlock(&lreq->lock);
+out_unlock_osdc:
+	up_read(&osdc->lock);
+	return;
+
+bad:
+	pr_err("osdc handle_watch_notify corrupt msg\n");
 }
 
-int ceph_osdc_create_event(struct ceph_osd_client *osdc,
-			   void (*event_cb)(u64, u64, u8, void *),
-			   void *data, struct ceph_osd_event **pevent)
+/*
+ * Register request, send initial attempt.
+ */
+int ceph_osdc_start_request(struct ceph_osd_client *osdc,
+			    struct ceph_osd_request *req,
+			    bool nofail)
 {
-	struct ceph_osd_event *event;
+	down_read(&osdc->lock);
+	submit_request(req, false);
+	up_read(&osdc->lock);
 
-	event = kmalloc(sizeof(*event), GFP_NOIO);
-	if (!event)
-		return -ENOMEM;
-
-	dout("create_event %p\n", event);
-	event->cb = event_cb;
-	event->one_shot = 0;
-	event->data = data;
-	event->osdc = osdc;
-	INIT_LIST_HEAD(&event->osd_node);
-	RB_CLEAR_NODE(&event->node);
-	kref_init(&event->kref);   /* one ref for us */
-	kref_get(&event->kref);    /* one ref for the caller */
-
-	spin_lock(&osdc->event_lock);
-	event->cookie = ++osdc->event_count;
-	__insert_event(osdc, event);
-	spin_unlock(&osdc->event_lock);
-
-	*pevent = event;
 	return 0;
 }
-EXPORT_SYMBOL(ceph_osdc_create_event);
+EXPORT_SYMBOL(ceph_osdc_start_request);
 
-void ceph_osdc_cancel_event(struct ceph_osd_event *event)
+/*
+ * Unregister a registered request.  The request is not completed (i.e.
+ * no callbacks or wakeups) - higher layers are supposed to know what
+ * they are canceling.
+ */
+void ceph_osdc_cancel_request(struct ceph_osd_request *req)
 {
-	struct ceph_osd_client *osdc = event->osdc;
+	struct ceph_osd_client *osdc = req->r_osdc;
 
-	dout("cancel_event %p\n", event);
-	spin_lock(&osdc->event_lock);
-	__remove_event(event);
-	spin_unlock(&osdc->event_lock);
-	ceph_osdc_put_event(event); /* caller's */
+	down_write(&osdc->lock);
+	if (req->r_osd)
+		cancel_request(req);
+	up_write(&osdc->lock);
 }
-EXPORT_SYMBOL(ceph_osdc_cancel_event);
-
+EXPORT_SYMBOL(ceph_osdc_cancel_request);
 
-static void do_event_work(struct work_struct *work)
+/*
+ * @timeout: in jiffies, 0 means "wait forever"
+ */
+static int wait_request_timeout(struct ceph_osd_request *req,
+				unsigned long timeout)
 {
-	struct ceph_osd_event_work *event_work =
-		container_of(work, struct ceph_osd_event_work, work);
-	struct ceph_osd_event *event = event_work->event;
-	u64 ver = event_work->ver;
-	u64 notify_id = event_work->notify_id;
-	u8 opcode = event_work->opcode;
+	long left;
+
+	dout("%s req %p tid %llu\n", __func__, req, req->r_tid);
+	left = wait_for_completion_killable_timeout(&req->r_completion,
+						ceph_timeout_jiffies(timeout));
+	if (left <= 0) {
+		left = left ?: -ETIMEDOUT;
+		ceph_osdc_cancel_request(req);
 
-	dout("do_event_work completing %p\n", event);
-	event->cb(ver, notify_id, opcode, event->data);
-	dout("do_event_work completed %p\n", event);
-	ceph_osdc_put_event(event);
-	kfree(event_work);
+		/* kludge - need to to wake ceph_osdc_sync() */
+		complete_all(&req->r_safe_completion);
+	} else {
+		left = req->r_result; /* completed */
+	}
+
+	return left;
 }
 
+/*
+ * wait for a request to complete
+ */
+int ceph_osdc_wait_request(struct ceph_osd_client *osdc,
+			   struct ceph_osd_request *req)
+{
+	return wait_request_timeout(req, 0);
+}
+EXPORT_SYMBOL(ceph_osdc_wait_request);
 
 /*
- * Process osd watch notifications
+ * sync - wait for all in-flight requests to flush.  avoid starvation.
  */
-static void handle_watch_notify(struct ceph_osd_client *osdc,
-				struct ceph_msg *msg)
+void ceph_osdc_sync(struct ceph_osd_client *osdc)
 {
-	void *p, *end;
-	u8 proto_ver;
-	u64 cookie, ver, notify_id;
-	u8 opcode;
-	struct ceph_osd_event *event;
-	struct ceph_osd_event_work *event_work;
+	struct rb_node *n, *p;
+	u64 last_tid = atomic64_read(&osdc->last_tid);
 
-	p = msg->front.iov_base;
-	end = p + msg->front.iov_len;
+again:
+	down_read(&osdc->lock);
+	for (n = rb_first(&osdc->osds); n; n = rb_next(n)) {
+		struct ceph_osd *osd = rb_entry(n, struct ceph_osd, o_node);
 
-	ceph_decode_8_safe(&p, end, proto_ver, bad);
-	ceph_decode_8_safe(&p, end, opcode, bad);
-	ceph_decode_64_safe(&p, end, cookie, bad);
-	ceph_decode_64_safe(&p, end, ver, bad);
-	ceph_decode_64_safe(&p, end, notify_id, bad);
+		mutex_lock(&osd->lock);
+		for (p = rb_first(&osd->o_requests); p; p = rb_next(p)) {
+			struct ceph_osd_request *req =
+			    rb_entry(p, struct ceph_osd_request, r_node);
+
+			if (req->r_tid > last_tid)
+				break;
 
-	spin_lock(&osdc->event_lock);
-	event = __find_event(osdc, cookie);
-	if (event) {
-		BUG_ON(event->one_shot);
-		get_event(event);
-	}
-	spin_unlock(&osdc->event_lock);
-	dout("handle_watch_notify cookie %lld ver %lld event %p\n",
-	     cookie, ver, event);
-	if (event) {
-		event_work = kmalloc(sizeof(*event_work), GFP_NOIO);
-		if (!event_work) {
-			pr_err("couldn't allocate event_work\n");
-			ceph_osdc_put_event(event);
-			return;
+			if (!(req->r_flags & CEPH_OSD_FLAG_WRITE))
+				continue;
+
+			ceph_osdc_get_request(req);
+			mutex_unlock(&osd->lock);
+			up_read(&osdc->lock);
+			dout("%s waiting on req %p tid %llu last_tid %llu\n",
+			     __func__, req, req->r_tid, last_tid);
+			wait_for_completion(&req->r_safe_completion);
+			ceph_osdc_put_request(req);
+			goto again;
 		}
-		INIT_WORK(&event_work->work, do_event_work);
-		event_work->event = event;
-		event_work->ver = ver;
-		event_work->notify_id = notify_id;
-		event_work->opcode = opcode;
 
-		queue_work(osdc->notify_wq, &event_work->work);
+		mutex_unlock(&osd->lock);
 	}
 
-	return;
+	up_read(&osdc->lock);
+	dout("%s done last_tid %llu\n", __func__, last_tid);
+}
+EXPORT_SYMBOL(ceph_osdc_sync);
 
-bad:
-	pr_err("osdc handle_watch_notify corrupt msg\n");
+static struct ceph_osd_request *
+alloc_linger_request(struct ceph_osd_linger_request *lreq)
+{
+	struct ceph_osd_request *req;
+
+	req = ceph_osdc_alloc_request(lreq->osdc, NULL, 1, false, GFP_NOIO);
+	if (!req)
+		return NULL;
+
+	ceph_oid_copy(&req->r_base_oid, &lreq->t.base_oid);
+	ceph_oloc_copy(&req->r_base_oloc, &lreq->t.base_oloc);
+
+	if (ceph_osdc_alloc_messages(req, GFP_NOIO)) {
+		ceph_osdc_put_request(req);
+		return NULL;
+	}
+
+	return req;
 }
 
 /*
- * build new request AND message
- *
+ * Returns a handle, caller owns a ref.
  */
-void ceph_osdc_build_request(struct ceph_osd_request *req, u64 off,
-				struct ceph_snap_context *snapc, u64 snap_id,
-				struct timespec *mtime)
-{
-	struct ceph_msg *msg = req->r_request;
-	void *p;
-	size_t msg_size;
-	int flags = req->r_flags;
-	u64 data_len;
-	unsigned int i;
-
-	req->r_snapid = snap_id;
-	req->r_snapc = ceph_get_snap_context(snapc);
-
-	/* encode request */
-	msg->hdr.version = cpu_to_le16(4);
-
-	p = msg->front.iov_base;
-	ceph_encode_32(&p, 1);   /* client_inc  is always 1 */
-	req->r_request_osdmap_epoch = p;
-	p += 4;
-	req->r_request_flags = p;
-	p += 4;
-	if (req->r_flags & CEPH_OSD_FLAG_WRITE)
-		ceph_encode_timespec(p, mtime);
-	p += sizeof(struct ceph_timespec);
-	req->r_request_reassert_version = p;
-	p += sizeof(struct ceph_eversion); /* will get filled in */
-
-	/* oloc */
-	ceph_encode_8(&p, 4);
-	ceph_encode_8(&p, 4);
-	ceph_encode_32(&p, 8 + 4 + 4);
-	req->r_request_pool = p;
-	p += 8;
-	ceph_encode_32(&p, -1);  /* preferred */
-	ceph_encode_32(&p, 0);   /* key len */
+struct ceph_osd_linger_request *
+ceph_osdc_watch(struct ceph_osd_client *osdc,
+		struct ceph_object_id *oid,
+		struct ceph_object_locator *oloc,
+		rados_watchcb2_t wcb,
+		rados_watcherrcb_t errcb,
+		void *data)
+{
+	struct ceph_osd_linger_request *lreq;
+	int ret;
 
-	ceph_encode_8(&p, 1);
-	req->r_request_pgid = p;
-	p += 8 + 4;
-	ceph_encode_32(&p, -1);  /* preferred */
+	lreq = linger_alloc(osdc);
+	if (!lreq)
+		return ERR_PTR(-ENOMEM);
 
-	/* oid */
-	ceph_encode_32(&p, req->r_base_oid.name_len);
-	memcpy(p, req->r_base_oid.name, req->r_base_oid.name_len);
-	dout("oid '%.*s' len %d\n", req->r_base_oid.name_len,
-	     req->r_base_oid.name, req->r_base_oid.name_len);
-	p += req->r_base_oid.name_len;
-
-	/* ops--can imply data */
-	ceph_encode_16(&p, (u16)req->r_num_ops);
-	data_len = 0;
-	for (i = 0; i < req->r_num_ops; i++) {
-		data_len += osd_req_encode_op(req, p, i);
-		p += sizeof(struct ceph_osd_op);
+	lreq->is_watch = true;
+	lreq->wcb = wcb;
+	lreq->errcb = errcb;
+	lreq->data = data;
+	lreq->watch_valid_thru = jiffies;
+
+	ceph_oid_copy(&lreq->t.base_oid, oid);
+	ceph_oloc_copy(&lreq->t.base_oloc, oloc);
+	lreq->t.flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
+	lreq->mtime = CURRENT_TIME;
+
+	lreq->reg_req = alloc_linger_request(lreq);
+	if (!lreq->reg_req) {
+		ret = -ENOMEM;
+		goto err_put_lreq;
 	}
 
-	/* snaps */
-	ceph_encode_64(&p, req->r_snapid);
-	ceph_encode_64(&p, req->r_snapc ? req->r_snapc->seq : 0);
-	ceph_encode_32(&p, req->r_snapc ? req->r_snapc->num_snaps : 0);
-	if (req->r_snapc) {
-		for (i = 0; i < snapc->num_snaps; i++) {
-			ceph_encode_64(&p, req->r_snapc->snaps[i]);
-		}
+	lreq->ping_req = alloc_linger_request(lreq);
+	if (!lreq->ping_req) {
+		ret = -ENOMEM;
+		goto err_put_lreq;
 	}
 
-	req->r_request_attempts = p;
-	p += 4;
-
-	/* data */
-	if (flags & CEPH_OSD_FLAG_WRITE) {
-		u16 data_off;
-
-		/*
-		 * The header "data_off" is a hint to the receiver
-		 * allowing it to align received data into its
-		 * buffers such that there's no need to re-copy
-		 * it before writing it to disk (direct I/O).
-		 */
-		data_off = (u16) (off & 0xffff);
-		req->r_request->hdr.data_off = cpu_to_le16(data_off);
+	down_write(&osdc->lock);
+	linger_register(lreq); /* before osd_req_op_* */
+	osd_req_op_watch_init(lreq->reg_req, 0, lreq->linger_id,
+			      CEPH_OSD_WATCH_OP_WATCH);
+	osd_req_op_watch_init(lreq->ping_req, 0, lreq->linger_id,
+			      CEPH_OSD_WATCH_OP_PING);
+	linger_submit(lreq);
+	up_write(&osdc->lock);
+
+	ret = linger_reg_commit_wait(lreq);
+	if (ret) {
+		linger_cancel(lreq);
+		goto err_put_lreq;
 	}
-	req->r_request->hdr.data_len = cpu_to_le32(data_len);
 
-	BUG_ON(p > msg->front.iov_base + msg->front.iov_len);
-	msg_size = p - msg->front.iov_base;
-	msg->front.iov_len = msg_size;
-	msg->hdr.front_len = cpu_to_le32(msg_size);
+	return lreq;
 
-	dout("build_request msg_size was %d\n", (int)msg_size);
+err_put_lreq:
+	linger_put(lreq);
+	return ERR_PTR(ret);
 }
-EXPORT_SYMBOL(ceph_osdc_build_request);
+EXPORT_SYMBOL(ceph_osdc_watch);
 
 /*
- * Register request, send initial attempt.
+ * Releases a ref.
+ *
+ * Times out after mount_timeout to preserve rbd unmap behaviour
+ * introduced in 2894e1d76974 ("rbd: timeout watch teardown on unmap
+ * with mount_timeout").
  */
-int ceph_osdc_start_request(struct ceph_osd_client *osdc,
-			    struct ceph_osd_request *req,
-			    bool nofail)
+int ceph_osdc_unwatch(struct ceph_osd_client *osdc,
+		      struct ceph_osd_linger_request *lreq)
 {
-	int rc;
+	struct ceph_options *opts = osdc->client->options;
+	struct ceph_osd_request *req;
+	int ret;
 
-	down_read(&osdc->map_sem);
-	mutex_lock(&osdc->request_mutex);
+	req = ceph_osdc_alloc_request(osdc, NULL, 1, false, GFP_NOIO);
+	if (!req)
+		return -ENOMEM;
 
-	rc = __ceph_osdc_start_request(osdc, req, nofail);
+	ceph_oid_copy(&req->r_base_oid, &lreq->t.base_oid);
+	ceph_oloc_copy(&req->r_base_oloc, &lreq->t.base_oloc);
+	req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
+	req->r_mtime = CURRENT_TIME;
+	osd_req_op_watch_init(req, 0, lreq->linger_id,
+			      CEPH_OSD_WATCH_OP_UNWATCH);
 
-	mutex_unlock(&osdc->request_mutex);
-	up_read(&osdc->map_sem);
+	ret = ceph_osdc_alloc_messages(req, GFP_NOIO);
+	if (ret)
+		goto out_put_req;
 
-	return rc;
+	ceph_osdc_start_request(osdc, req, false);
+	linger_cancel(lreq);
+	linger_put(lreq);
+	ret = wait_request_timeout(req, opts->mount_timeout);
+
+out_put_req:
+	ceph_osdc_put_request(req);
+	return ret;
 }
-EXPORT_SYMBOL(ceph_osdc_start_request);
+EXPORT_SYMBOL(ceph_osdc_unwatch);
 
-/*
- * Unregister a registered request.  The request is not completed (i.e.
- * no callbacks or wakeups) - higher layers are supposed to know what
- * they are canceling.
- */
-void ceph_osdc_cancel_request(struct ceph_osd_request *req)
+static int osd_req_op_notify_ack_init(struct ceph_osd_request *req, int which,
+				      u64 notify_id, u64 cookie, void *payload,
+				      size_t payload_len)
 {
-	struct ceph_osd_client *osdc = req->r_osdc;
+	struct ceph_osd_req_op *op;
+	struct ceph_pagelist *pl;
+	int ret;
+
+	op = _osd_req_op_init(req, which, CEPH_OSD_OP_NOTIFY_ACK, 0);
+
+	pl = kmalloc(sizeof(*pl), GFP_NOIO);
+	if (!pl)
+		return -ENOMEM;
 
-	mutex_lock(&osdc->request_mutex);
-	if (req->r_linger)
-		__unregister_linger_request(osdc, req);
-	__unregister_request(osdc, req);
-	mutex_unlock(&osdc->request_mutex);
+	ceph_pagelist_init(pl);
+	ret = ceph_pagelist_encode_64(pl, notify_id);
+	ret |= ceph_pagelist_encode_64(pl, cookie);
+	if (payload) {
+		ret |= ceph_pagelist_encode_32(pl, payload_len);
+		ret |= ceph_pagelist_append(pl, payload, payload_len);
+	} else {
+		ret |= ceph_pagelist_encode_32(pl, 0);
+	}
+	if (ret) {
+		ceph_pagelist_release(pl);
+		return -ENOMEM;
+	}
 
-	dout("%s %p tid %llu canceled\n", __func__, req, req->r_tid);
+	ceph_osd_data_pagelist_init(&op->notify_ack.request_data, pl);
+	op->indata_len = pl->length;
+	return 0;
 }
-EXPORT_SYMBOL(ceph_osdc_cancel_request);
 
-/*
- * wait for a request to complete
- */
-int ceph_osdc_wait_request(struct ceph_osd_client *osdc,
-			   struct ceph_osd_request *req)
+int ceph_osdc_notify_ack(struct ceph_osd_client *osdc,
+			 struct ceph_object_id *oid,
+			 struct ceph_object_locator *oloc,
+			 u64 notify_id,
+			 u64 cookie,
+			 void *payload,
+			 size_t payload_len)
+{
+	struct ceph_osd_request *req;
+	int ret;
+
+	req = ceph_osdc_alloc_request(osdc, NULL, 1, false, GFP_NOIO);
+	if (!req)
+		return -ENOMEM;
+
+	ceph_oid_copy(&req->r_base_oid, oid);
+	ceph_oloc_copy(&req->r_base_oloc, oloc);
+	req->r_flags = CEPH_OSD_FLAG_READ;
+
+	ret = ceph_osdc_alloc_messages(req, GFP_NOIO);
+	if (ret)
+		goto out_put_req;
+
+	ret = osd_req_op_notify_ack_init(req, 0, notify_id, cookie, payload,
+					 payload_len);
+	if (ret)
+		goto out_put_req;
+
+	ceph_osdc_start_request(osdc, req, false);
+	ret = ceph_osdc_wait_request(osdc, req);
+
+out_put_req:
+	ceph_osdc_put_request(req);
+	return ret;
+}
+EXPORT_SYMBOL(ceph_osdc_notify_ack);
+
+static int osd_req_op_notify_init(struct ceph_osd_request *req, int which,
+				  u64 cookie, u32 prot_ver, u32 timeout,
+				  void *payload, size_t payload_len)
 {
-	int rc;
+	struct ceph_osd_req_op *op;
+	struct ceph_pagelist *pl;
+	int ret;
 
-	dout("%s %p tid %llu\n", __func__, req, req->r_tid);
+	op = _osd_req_op_init(req, which, CEPH_OSD_OP_NOTIFY, 0);
+	op->notify.cookie = cookie;
 
-	rc = wait_for_completion_interruptible(&req->r_completion);
-	if (rc < 0) {
-		dout("%s %p tid %llu interrupted\n", __func__, req, req->r_tid);
-		ceph_osdc_cancel_request(req);
-		complete_request(req);
-		return rc;
+	pl = kmalloc(sizeof(*pl), GFP_NOIO);
+	if (!pl)
+		return -ENOMEM;
+
+	ceph_pagelist_init(pl);
+	ret = ceph_pagelist_encode_32(pl, 1); /* prot_ver */
+	ret |= ceph_pagelist_encode_32(pl, timeout);
+	ret |= ceph_pagelist_encode_32(pl, payload_len);
+	ret |= ceph_pagelist_append(pl, payload, payload_len);
+	if (ret) {
+		ceph_pagelist_release(pl);
+		return -ENOMEM;
 	}
 
-	dout("%s %p tid %llu result %d\n", __func__, req, req->r_tid,
-	     req->r_result);
-	return req->r_result;
+	ceph_osd_data_pagelist_init(&op->notify.request_data, pl);
+	op->indata_len = pl->length;
+	return 0;
 }
-EXPORT_SYMBOL(ceph_osdc_wait_request);
 
 /*
- * sync - wait for all in-flight requests to flush.  avoid starvation.
+ * @timeout: in seconds
+ *
+ * @preply_{pages,len} are initialized both on success and error.
+ * The caller is responsible for:
+ *
+ *     ceph_release_page_vector(reply_pages, calc_pages_for(0, reply_len))
  */
-void ceph_osdc_sync(struct ceph_osd_client *osdc)
+int ceph_osdc_notify(struct ceph_osd_client *osdc,
+		     struct ceph_object_id *oid,
+		     struct ceph_object_locator *oloc,
+		     void *payload,
+		     size_t payload_len,
+		     u32 timeout,
+		     struct page ***preply_pages,
+		     size_t *preply_len)
 {
-	struct ceph_osd_request *req;
-	u64 last_tid, next_tid = 0;
+	struct ceph_osd_linger_request *lreq;
+	struct page **pages;
+	int ret;
 
-	mutex_lock(&osdc->request_mutex);
-	last_tid = osdc->last_tid;
-	while (1) {
-		req = __lookup_request_ge(osdc, next_tid);
-		if (!req)
-			break;
-		if (req->r_tid > last_tid)
-			break;
+	WARN_ON(!timeout);
+	if (preply_pages) {
+		*preply_pages = NULL;
+		*preply_len = 0;
+	}
 
-		next_tid = req->r_tid + 1;
-		if ((req->r_flags & CEPH_OSD_FLAG_WRITE) == 0)
-			continue;
+	lreq = linger_alloc(osdc);
+	if (!lreq)
+		return -ENOMEM;
 
-		ceph_osdc_get_request(req);
-		mutex_unlock(&osdc->request_mutex);
-		dout("sync waiting on tid %llu (last is %llu)\n",
-		     req->r_tid, last_tid);
-		wait_for_completion(&req->r_safe_completion);
-		mutex_lock(&osdc->request_mutex);
-		ceph_osdc_put_request(req);
+	lreq->preply_pages = preply_pages;
+	lreq->preply_len = preply_len;
+
+	ceph_oid_copy(&lreq->t.base_oid, oid);
+	ceph_oloc_copy(&lreq->t.base_oloc, oloc);
+	lreq->t.flags = CEPH_OSD_FLAG_READ;
+
+	lreq->reg_req = alloc_linger_request(lreq);
+	if (!lreq->reg_req) {
+		ret = -ENOMEM;
+		goto out_put_lreq;
 	}
-	mutex_unlock(&osdc->request_mutex);
-	dout("sync done (thru tid %llu)\n", last_tid);
+
+	/* for notify_id */
+	pages = ceph_alloc_page_vector(1, GFP_NOIO);
+	if (IS_ERR(pages)) {
+		ret = PTR_ERR(pages);
+		goto out_put_lreq;
+	}
+
+	down_write(&osdc->lock);
+	linger_register(lreq); /* before osd_req_op_* */
+	ret = osd_req_op_notify_init(lreq->reg_req, 0, lreq->linger_id, 1,
+				     timeout, payload, payload_len);
+	if (ret) {
+		linger_unregister(lreq);
+		up_write(&osdc->lock);
+		ceph_release_page_vector(pages, 1);
+		goto out_put_lreq;
+	}
+	ceph_osd_data_pages_init(osd_req_op_data(lreq->reg_req, 0, notify,
+						 response_data),
+				 pages, PAGE_SIZE, 0, false, true);
+	linger_submit(lreq);
+	up_write(&osdc->lock);
+
+	ret = linger_reg_commit_wait(lreq);
+	if (!ret)
+		ret = linger_notify_finish_wait(lreq);
+	else
+		dout("lreq %p failed to initiate notify %d\n", lreq, ret);
+
+	linger_cancel(lreq);
+out_put_lreq:
+	linger_put(lreq);
+	return ret;
+}
+EXPORT_SYMBOL(ceph_osdc_notify);
+
+/*
+ * Return the number of milliseconds since the watch was last
+ * confirmed, or an error.  If there is an error, the watch is no
+ * longer valid, and should be destroyed with ceph_osdc_unwatch().
+ */
+int ceph_osdc_watch_check(struct ceph_osd_client *osdc,
+			  struct ceph_osd_linger_request *lreq)
+{
+	unsigned long stamp, age;
+	int ret;
+
+	down_read(&osdc->lock);
+	mutex_lock(&lreq->lock);
+	stamp = lreq->watch_valid_thru;
+	if (!list_empty(&lreq->pending_lworks)) {
+		struct linger_work *lwork =
+		    list_first_entry(&lreq->pending_lworks,
+				     struct linger_work,
+				     pending_item);
+
+		if (time_before(lwork->queued_stamp, stamp))
+			stamp = lwork->queued_stamp;
+	}
+	age = jiffies - stamp;
+	dout("%s lreq %p linger_id %llu age %lu last_error %d\n", __func__,
+	     lreq, lreq->linger_id, age, lreq->last_error);
+	/* we are truncating to msecs, so return a safe upper bound */
+	ret = lreq->last_error ?: 1 + jiffies_to_msecs(age);
+
+	mutex_unlock(&lreq->lock);
+	up_read(&osdc->lock);
+	return ret;
 }
-EXPORT_SYMBOL(ceph_osdc_sync);
 
 /*
  * Call all pending notify callbacks - for use after a watch is
@@ -2646,6 +3868,13 @@ void ceph_osdc_flush_notifies(struct ceph_osd_client *osdc)
 }
 EXPORT_SYMBOL(ceph_osdc_flush_notifies);
 
+void ceph_osdc_maybe_request_map(struct ceph_osd_client *osdc)
+{
+	down_read(&osdc->lock);
+	maybe_request_map(osdc);
+	up_read(&osdc->lock);
+}
+EXPORT_SYMBOL(ceph_osdc_maybe_request_map);
 
 /*
  * init, shutdown
@@ -2656,43 +3885,35 @@ int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client)
 
 	dout("init\n");
 	osdc->client = client;
-	osdc->osdmap = NULL;
-	init_rwsem(&osdc->map_sem);
-	init_completion(&osdc->map_waiters);
-	osdc->last_requested_map = 0;
-	mutex_init(&osdc->request_mutex);
-	osdc->last_tid = 0;
+	init_rwsem(&osdc->lock);
 	osdc->osds = RB_ROOT;
 	INIT_LIST_HEAD(&osdc->osd_lru);
-	osdc->requests = RB_ROOT;
-	INIT_LIST_HEAD(&osdc->req_lru);
-	INIT_LIST_HEAD(&osdc->req_unsent);
-	INIT_LIST_HEAD(&osdc->req_notarget);
-	INIT_LIST_HEAD(&osdc->req_linger);
-	osdc->num_requests = 0;
+	spin_lock_init(&osdc->osd_lru_lock);
+	osd_init(&osdc->homeless_osd);
+	osdc->homeless_osd.o_osdc = osdc;
+	osdc->homeless_osd.o_osd = CEPH_HOMELESS_OSD;
+	osdc->linger_requests = RB_ROOT;
+	osdc->map_checks = RB_ROOT;
+	osdc->linger_map_checks = RB_ROOT;
 	INIT_DELAYED_WORK(&osdc->timeout_work, handle_timeout);
 	INIT_DELAYED_WORK(&osdc->osds_timeout_work, handle_osds_timeout);
-	spin_lock_init(&osdc->event_lock);
-	osdc->event_tree = RB_ROOT;
-	osdc->event_count = 0;
-
-	schedule_delayed_work(&osdc->osds_timeout_work,
-	    round_jiffies_relative(osdc->client->options->osd_idle_ttl));
 
 	err = -ENOMEM;
+	osdc->osdmap = ceph_osdmap_alloc();
+	if (!osdc->osdmap)
+		goto out;
+
 	osdc->req_mempool = mempool_create_slab_pool(10,
 						     ceph_osd_request_cache);
 	if (!osdc->req_mempool)
-		goto out;
+		goto out_map;
 
 	err = ceph_msgpool_init(&osdc->msgpool_op, CEPH_MSG_OSD_OP,
-				OSD_OP_FRONT_LEN, 10, true,
-				"osd_op");
+				PAGE_SIZE, 10, true, "osd_op");
 	if (err < 0)
 		goto out_mempool;
 	err = ceph_msgpool_init(&osdc->msgpool_op_reply, CEPH_MSG_OSD_OPREPLY,
-				OSD_OPREPLY_FRONT_LEN, 10, true,
-				"osd_op_reply");
+				PAGE_SIZE, 10, true, "osd_op_reply");
 	if (err < 0)
 		goto out_msgpool;
 
@@ -2701,6 +3922,11 @@ int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client)
 	if (!osdc->notify_wq)
 		goto out_msgpool_reply;
 
+	schedule_delayed_work(&osdc->timeout_work,
+			      osdc->client->options->osd_keepalive_timeout);
+	schedule_delayed_work(&osdc->osds_timeout_work,
+	    round_jiffies_relative(osdc->client->options->osd_idle_ttl));
+
 	return 0;
 
 out_msgpool_reply:
@@ -2709,6 +3935,8 @@ out_msgpool:
 	ceph_msgpool_destroy(&osdc->msgpool_op);
 out_mempool:
 	mempool_destroy(osdc->req_mempool);
+out_map:
+	ceph_osdmap_destroy(osdc->osdmap);
 out:
 	return err;
 }
@@ -2719,11 +3947,25 @@ void ceph_osdc_stop(struct ceph_osd_client *osdc)
 	destroy_workqueue(osdc->notify_wq);
 	cancel_delayed_work_sync(&osdc->timeout_work);
 	cancel_delayed_work_sync(&osdc->osds_timeout_work);
-	if (osdc->osdmap) {
-		ceph_osdmap_destroy(osdc->osdmap);
-		osdc->osdmap = NULL;
+
+	down_write(&osdc->lock);
+	while (!RB_EMPTY_ROOT(&osdc->osds)) {
+		struct ceph_osd *osd = rb_entry(rb_first(&osdc->osds),
+						struct ceph_osd, o_node);
+		close_osd(osd);
 	}
-	remove_all_osds(osdc);
+	up_write(&osdc->lock);
+	WARN_ON(atomic_read(&osdc->homeless_osd.o_ref) != 1);
+	osd_cleanup(&osdc->homeless_osd);
+
+	WARN_ON(!list_empty(&osdc->osd_lru));
+	WARN_ON(!RB_EMPTY_ROOT(&osdc->linger_requests));
+	WARN_ON(!RB_EMPTY_ROOT(&osdc->map_checks));
+	WARN_ON(!RB_EMPTY_ROOT(&osdc->linger_map_checks));
+	WARN_ON(atomic_read(&osdc->num_requests));
+	WARN_ON(atomic_read(&osdc->num_homeless));
+
+	ceph_osdmap_destroy(osdc->osdmap);
 	mempool_destroy(osdc->req_mempool);
 	ceph_msgpool_destroy(&osdc->msgpool_op);
 	ceph_msgpool_destroy(&osdc->msgpool_op_reply);
@@ -2752,15 +3994,12 @@ int ceph_osdc_readpages(struct ceph_osd_client *osdc,
 		return PTR_ERR(req);
 
 	/* it may be a short read due to an object boundary */
-
 	osd_req_op_extent_osd_data_pages(req, 0,
 				pages, *plen, page_align, false, false);
 
 	dout("readpages  final extent is %llu~%llu (%llu bytes align %d)\n",
 	     off, *plen, *plen, page_align);
 
-	ceph_osdc_build_request(req, off, NULL, vino.snap, NULL);
-
 	rc = ceph_osdc_start_request(osdc, req, false);
 	if (!rc)
 		rc = ceph_osdc_wait_request(osdc, req);
@@ -2786,7 +4025,6 @@ int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino,
 	int rc = 0;
 	int page_align = off & ~PAGE_MASK;
 
-	BUG_ON(vino.snap != CEPH_NOSNAP);	/* snapshots aren't writeable */
 	req = ceph_osdc_new_request(osdc, layout, vino, off, &len, 0, 1,
 				    CEPH_OSD_OP_WRITE,
 				    CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE,
@@ -2800,8 +4038,7 @@ int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino,
 				false, false);
 	dout("writepages %llu~%llu (%llu bytes)\n", off, len, len);
 
-	ceph_osdc_build_request(req, off, snapc, CEPH_NOSNAP, mtime);
-
+	req->r_mtime = *mtime;
 	rc = ceph_osdc_start_request(osdc, req, true);
 	if (!rc)
 		rc = ceph_osdc_wait_request(osdc, req);
@@ -2841,19 +4078,15 @@ EXPORT_SYMBOL(ceph_osdc_cleanup);
 static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
 {
 	struct ceph_osd *osd = con->private;
-	struct ceph_osd_client *osdc;
+	struct ceph_osd_client *osdc = osd->o_osdc;
 	int type = le16_to_cpu(msg->hdr.type);
 
-	if (!osd)
-		goto out;
-	osdc = osd->o_osdc;
-
 	switch (type) {
 	case CEPH_MSG_OSD_MAP:
 		ceph_osdc_handle_map(osdc, msg);
 		break;
 	case CEPH_MSG_OSD_OPREPLY:
-		handle_reply(osdc, msg);
+		handle_reply(osd, msg);
 		break;
 	case CEPH_MSG_WATCH_NOTIFY:
 		handle_watch_notify(osdc, msg);
@@ -2863,7 +4096,7 @@ static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
 		pr_err("received unknown message type %d %s\n", type,
 		       ceph_msg_type_name(type));
 	}
-out:
+
 	ceph_msg_put(msg);
 }
 
@@ -2878,21 +4111,27 @@ static struct ceph_msg *get_reply(struct ceph_connection *con,
 {
 	struct ceph_osd *osd = con->private;
 	struct ceph_osd_client *osdc = osd->o_osdc;
-	struct ceph_msg *m;
+	struct ceph_msg *m = NULL;
 	struct ceph_osd_request *req;
 	int front_len = le32_to_cpu(hdr->front_len);
 	int data_len = le32_to_cpu(hdr->data_len);
-	u64 tid;
+	u64 tid = le64_to_cpu(hdr->tid);
 
-	tid = le64_to_cpu(hdr->tid);
-	mutex_lock(&osdc->request_mutex);
-	req = __lookup_request(osdc, tid);
+	down_read(&osdc->lock);
+	if (!osd_registered(osd)) {
+		dout("%s osd%d unknown, skipping\n", __func__, osd->o_osd);
+		*skip = 1;
+		goto out_unlock_osdc;
+	}
+	WARN_ON(osd->o_osd != le64_to_cpu(hdr->src.num));
+
+	mutex_lock(&osd->lock);
+	req = lookup_request(&osd->o_requests, tid);
 	if (!req) {
 		dout("%s osd%d tid %llu unknown, skipping\n", __func__,
 		     osd->o_osd, tid);
-		m = NULL;
 		*skip = 1;
-		goto out;
+		goto out_unlock_session;
 	}
 
 	ceph_msg_revoke_incoming(req->r_reply);
@@ -2904,7 +4143,7 @@ static struct ceph_msg *get_reply(struct ceph_connection *con,
 		m = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, front_len, GFP_NOFS,
 				 false);
 		if (!m)
-			goto out;
+			goto out_unlock_session;
 		ceph_msg_put(req->r_reply);
 		req->r_reply = m;
 	}
@@ -2915,14 +4154,49 @@ static struct ceph_msg *get_reply(struct ceph_connection *con,
 			req->r_reply->data_length);
 		m = NULL;
 		*skip = 1;
-		goto out;
+		goto out_unlock_session;
 	}
 
 	m = ceph_msg_get(req->r_reply);
 	dout("get_reply tid %lld %p\n", tid, m);
 
-out:
-	mutex_unlock(&osdc->request_mutex);
+out_unlock_session:
+	mutex_unlock(&osd->lock);
+out_unlock_osdc:
+	up_read(&osdc->lock);
+	return m;
+}
+
+/*
+ * TODO: switch to a msg-owned pagelist
+ */
+static struct ceph_msg *alloc_msg_with_page_vector(struct ceph_msg_header *hdr)
+{
+	struct ceph_msg *m;
+	int type = le16_to_cpu(hdr->type);
+	u32 front_len = le32_to_cpu(hdr->front_len);
+	u32 data_len = le32_to_cpu(hdr->data_len);
+
+	m = ceph_msg_new(type, front_len, GFP_NOIO, false);
+	if (!m)
+		return NULL;
+
+	if (data_len) {
+		struct page **pages;
+		struct ceph_osd_data osd_data;
+
+		pages = ceph_alloc_page_vector(calc_pages_for(0, data_len),
+					       GFP_NOIO);
+		if (!pages) {
+			ceph_msg_put(m);
+			return NULL;
+		}
+
+		ceph_osd_data_pages_init(&osd_data, pages, data_len, 0, false,
+					 false);
+		ceph_osdc_msg_data_add(m, &osd_data);
+	}
+
 	return m;
 }
 
@@ -2932,18 +4206,17 @@ static struct ceph_msg *alloc_msg(struct ceph_connection *con,
 {
 	struct ceph_osd *osd = con->private;
 	int type = le16_to_cpu(hdr->type);
-	int front = le32_to_cpu(hdr->front_len);
 
 	*skip = 0;
 	switch (type) {
 	case CEPH_MSG_OSD_MAP:
 	case CEPH_MSG_WATCH_NOTIFY:
-		return ceph_msg_new(type, front, GFP_NOFS, false);
+		return alloc_msg_with_page_vector(hdr);
 	case CEPH_MSG_OSD_OPREPLY:
 		return get_reply(con, hdr, skip);
 	default:
-		pr_info("alloc_msg unexpected msg type %d from osd%d\n", type,
-			osd->o_osd);
+		pr_warn("%s osd%d unknown msg type %d, skipping\n", __func__,
+			osd->o_osd, type);
 		*skip = 1;
 		return NULL;
 	}
@@ -3047,5 +4320,5 @@ static const struct ceph_connection_operations osd_con_ops = {
 	.alloc_msg = alloc_msg,
 	.sign_message = osd_sign_message,
 	.check_message_signature = osd_check_message_signature,
-	.fault = osd_reset,
+	.fault = osd_fault,
 };
diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c
index 243574c8cf33..03062bb763b3 100644
--- a/net/ceph/osdmap.c
+++ b/net/ceph/osdmap.c
@@ -380,23 +380,24 @@ bad:
 	return ERR_PTR(err);
 }
 
-/*
- * rbtree of pg_mapping for handling pg_temp (explicit mapping of pgid
- * to a set of osds) and primary_temp (explicit primary setting)
- */
-static int pgid_cmp(struct ceph_pg l, struct ceph_pg r)
+int ceph_pg_compare(const struct ceph_pg *lhs, const struct ceph_pg *rhs)
 {
-	if (l.pool < r.pool)
+	if (lhs->pool < rhs->pool)
 		return -1;
-	if (l.pool > r.pool)
+	if (lhs->pool > rhs->pool)
 		return 1;
-	if (l.seed < r.seed)
+	if (lhs->seed < rhs->seed)
 		return -1;
-	if (l.seed > r.seed)
+	if (lhs->seed > rhs->seed)
 		return 1;
+
 	return 0;
 }
 
+/*
+ * rbtree of pg_mapping for handling pg_temp (explicit mapping of pgid
+ * to a set of osds) and primary_temp (explicit primary setting)
+ */
 static int __insert_pg_mapping(struct ceph_pg_mapping *new,
 			       struct rb_root *root)
 {
@@ -409,7 +410,7 @@ static int __insert_pg_mapping(struct ceph_pg_mapping *new,
 	while (*p) {
 		parent = *p;
 		pg = rb_entry(parent, struct ceph_pg_mapping, node);
-		c = pgid_cmp(new->pgid, pg->pgid);
+		c = ceph_pg_compare(&new->pgid, &pg->pgid);
 		if (c < 0)
 			p = &(*p)->rb_left;
 		else if (c > 0)
@@ -432,7 +433,7 @@ static struct ceph_pg_mapping *__lookup_pg_mapping(struct rb_root *root,
 
 	while (n) {
 		pg = rb_entry(n, struct ceph_pg_mapping, node);
-		c = pgid_cmp(pgid, pg->pgid);
+		c = ceph_pg_compare(&pgid, &pg->pgid);
 		if (c < 0) {
 			n = n->rb_left;
 		} else if (c > 0) {
@@ -596,7 +597,9 @@ static int decode_pool(void **p, void *end, struct ceph_pg_pool_info *pi)
 	*p += 4;  /* skip crash_replay_interval */
 
 	if (ev >= 7)
-		*p += 1;  /* skip min_size */
+		pi->min_size = ceph_decode_8(p);
+	else
+		pi->min_size = pi->size - pi->size / 2;
 
 	if (ev >= 8)
 		*p += 8 + 8;  /* skip quota_max_* */
@@ -616,6 +619,50 @@ static int decode_pool(void **p, void *end, struct ceph_pg_pool_info *pi)
 		pi->write_tier = -1;
 	}
 
+	if (ev >= 10) {
+		/* skip properties */
+		num = ceph_decode_32(p);
+		while (num--) {
+			len = ceph_decode_32(p);
+			*p += len; /* key */
+			len = ceph_decode_32(p);
+			*p += len; /* val */
+		}
+	}
+
+	if (ev >= 11) {
+		/* skip hit_set_params */
+		*p += 1 + 1; /* versions */
+		len = ceph_decode_32(p);
+		*p += len;
+
+		*p += 4; /* skip hit_set_period */
+		*p += 4; /* skip hit_set_count */
+	}
+
+	if (ev >= 12)
+		*p += 4; /* skip stripe_width */
+
+	if (ev >= 13) {
+		*p += 8; /* skip target_max_bytes */
+		*p += 8; /* skip target_max_objects */
+		*p += 4; /* skip cache_target_dirty_ratio_micro */
+		*p += 4; /* skip cache_target_full_ratio_micro */
+		*p += 4; /* skip cache_min_flush_age */
+		*p += 4; /* skip cache_min_evict_age */
+	}
+
+	if (ev >=  14) {
+		/* skip erasure_code_profile */
+		len = ceph_decode_32(p);
+		*p += len;
+	}
+
+	if (ev >= 15)
+		pi->last_force_request_resend = ceph_decode_32(p);
+	else
+		pi->last_force_request_resend = 0;
+
 	/* ignore the rest */
 
 	*p = pool_end;
@@ -660,6 +707,23 @@ bad:
 /*
  * osd map
  */
+struct ceph_osdmap *ceph_osdmap_alloc(void)
+{
+	struct ceph_osdmap *map;
+
+	map = kzalloc(sizeof(*map), GFP_NOIO);
+	if (!map)
+		return NULL;
+
+	map->pg_pools = RB_ROOT;
+	map->pool_max = -1;
+	map->pg_temp = RB_ROOT;
+	map->primary_temp = RB_ROOT;
+	mutex_init(&map->crush_scratch_mutex);
+
+	return map;
+}
+
 void ceph_osdmap_destroy(struct ceph_osdmap *map)
 {
 	dout("osdmap_destroy %p\n", map);
@@ -1183,14 +1247,10 @@ struct ceph_osdmap *ceph_osdmap_decode(void **p, void *end)
 	struct ceph_osdmap *map;
 	int ret;
 
-	map = kzalloc(sizeof(*map), GFP_NOFS);
+	map = ceph_osdmap_alloc();
 	if (!map)
 		return ERR_PTR(-ENOMEM);
 
-	map->pg_temp = RB_ROOT;
-	map->primary_temp = RB_ROOT;
-	mutex_init(&map->crush_scratch_mutex);
-
 	ret = osdmap_decode(p, end, map);
 	if (ret) {
 		ceph_osdmap_destroy(map);
@@ -1204,8 +1264,7 @@ struct ceph_osdmap *ceph_osdmap_decode(void **p, void *end)
  * decode and apply an incremental map update.
  */
 struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
-					     struct ceph_osdmap *map,
-					     struct ceph_messenger *msgr)
+					     struct ceph_osdmap *map)
 {
 	struct crush_map *newcrush = NULL;
 	struct ceph_fsid fsid;
@@ -1381,8 +1440,252 @@ bad:
 	return ERR_PTR(err);
 }
 
+void ceph_oid_copy(struct ceph_object_id *dest,
+		   const struct ceph_object_id *src)
+{
+	WARN_ON(!ceph_oid_empty(dest));
+
+	if (src->name != src->inline_name) {
+		/* very rare, see ceph_object_id definition */
+		dest->name = kmalloc(src->name_len + 1,
+				     GFP_NOIO | __GFP_NOFAIL);
+	}
+
+	memcpy(dest->name, src->name, src->name_len + 1);
+	dest->name_len = src->name_len;
+}
+EXPORT_SYMBOL(ceph_oid_copy);
+
+static __printf(2, 0)
+int oid_printf_vargs(struct ceph_object_id *oid, const char *fmt, va_list ap)
+{
+	int len;
+
+	WARN_ON(!ceph_oid_empty(oid));
+
+	len = vsnprintf(oid->inline_name, sizeof(oid->inline_name), fmt, ap);
+	if (len >= sizeof(oid->inline_name))
+		return len;
+
+	oid->name_len = len;
+	return 0;
+}
+
+/*
+ * If oid doesn't fit into inline buffer, BUG.
+ */
+void ceph_oid_printf(struct ceph_object_id *oid, const char *fmt, ...)
+{
+	va_list ap;
+
+	va_start(ap, fmt);
+	BUG_ON(oid_printf_vargs(oid, fmt, ap));
+	va_end(ap);
+}
+EXPORT_SYMBOL(ceph_oid_printf);
+
+static __printf(3, 0)
+int oid_aprintf_vargs(struct ceph_object_id *oid, gfp_t gfp,
+		      const char *fmt, va_list ap)
+{
+	va_list aq;
+	int len;
+
+	va_copy(aq, ap);
+	len = oid_printf_vargs(oid, fmt, aq);
+	va_end(aq);
+
+	if (len) {
+		char *external_name;
+
+		external_name = kmalloc(len + 1, gfp);
+		if (!external_name)
+			return -ENOMEM;
+
+		oid->name = external_name;
+		WARN_ON(vsnprintf(oid->name, len + 1, fmt, ap) != len);
+		oid->name_len = len;
+	}
+
+	return 0;
+}
+
+/*
+ * If oid doesn't fit into inline buffer, allocate.
+ */
+int ceph_oid_aprintf(struct ceph_object_id *oid, gfp_t gfp,
+		     const char *fmt, ...)
+{
+	va_list ap;
+	int ret;
+
+	va_start(ap, fmt);
+	ret = oid_aprintf_vargs(oid, gfp, fmt, ap);
+	va_end(ap);
+
+	return ret;
+}
+EXPORT_SYMBOL(ceph_oid_aprintf);
+
+void ceph_oid_destroy(struct ceph_object_id *oid)
+{
+	if (oid->name != oid->inline_name)
+		kfree(oid->name);
+}
+EXPORT_SYMBOL(ceph_oid_destroy);
+
+/*
+ * osds only
+ */
+static bool __osds_equal(const struct ceph_osds *lhs,
+			 const struct ceph_osds *rhs)
+{
+	if (lhs->size == rhs->size &&
+	    !memcmp(lhs->osds, rhs->osds, rhs->size * sizeof(rhs->osds[0])))
+		return true;
+
+	return false;
+}
+
+/*
+ * osds + primary
+ */
+static bool osds_equal(const struct ceph_osds *lhs,
+		       const struct ceph_osds *rhs)
+{
+	if (__osds_equal(lhs, rhs) &&
+	    lhs->primary == rhs->primary)
+		return true;
+
+	return false;
+}
+
+static bool osds_valid(const struct ceph_osds *set)
+{
+	/* non-empty set */
+	if (set->size > 0 && set->primary >= 0)
+		return true;
+
+	/* empty can_shift_osds set */
+	if (!set->size && set->primary == -1)
+		return true;
+
+	/* empty !can_shift_osds set - all NONE */
+	if (set->size > 0 && set->primary == -1) {
+		int i;
+
+		for (i = 0; i < set->size; i++) {
+			if (set->osds[i] != CRUSH_ITEM_NONE)
+				break;
+		}
+		if (i == set->size)
+			return true;
+	}
+
+	return false;
+}
+
+void ceph_osds_copy(struct ceph_osds *dest, const struct ceph_osds *src)
+{
+	memcpy(dest->osds, src->osds, src->size * sizeof(src->osds[0]));
+	dest->size = src->size;
+	dest->primary = src->primary;
+}
+
+static bool is_split(const struct ceph_pg *pgid,
+		     u32 old_pg_num,
+		     u32 new_pg_num)
+{
+	int old_bits = calc_bits_of(old_pg_num);
+	int old_mask = (1 << old_bits) - 1;
+	int n;
+
+	WARN_ON(pgid->seed >= old_pg_num);
+	if (new_pg_num <= old_pg_num)
+		return false;
+
+	for (n = 1; ; n++) {
+		int next_bit = n << (old_bits - 1);
+		u32 s = next_bit | pgid->seed;
+
+		if (s < old_pg_num || s == pgid->seed)
+			continue;
+		if (s >= new_pg_num)
+			break;
+
+		s = ceph_stable_mod(s, old_pg_num, old_mask);
+		if (s == pgid->seed)
+			return true;
+	}
+
+	return false;
+}
+
+bool ceph_is_new_interval(const struct ceph_osds *old_acting,
+			  const struct ceph_osds *new_acting,
+			  const struct ceph_osds *old_up,
+			  const struct ceph_osds *new_up,
+			  int old_size,
+			  int new_size,
+			  int old_min_size,
+			  int new_min_size,
+			  u32 old_pg_num,
+			  u32 new_pg_num,
+			  bool old_sort_bitwise,
+			  bool new_sort_bitwise,
+			  const struct ceph_pg *pgid)
+{
+	return !osds_equal(old_acting, new_acting) ||
+	       !osds_equal(old_up, new_up) ||
+	       old_size != new_size ||
+	       old_min_size != new_min_size ||
+	       is_split(pgid, old_pg_num, new_pg_num) ||
+	       old_sort_bitwise != new_sort_bitwise;
+}
+
+static int calc_pg_rank(int osd, const struct ceph_osds *acting)
+{
+	int i;
+
+	for (i = 0; i < acting->size; i++) {
+		if (acting->osds[i] == osd)
+			return i;
+	}
+
+	return -1;
+}
+
+static bool primary_changed(const struct ceph_osds *old_acting,
+			    const struct ceph_osds *new_acting)
+{
+	if (!old_acting->size && !new_acting->size)
+		return false; /* both still empty */
 
+	if (!old_acting->size ^ !new_acting->size)
+		return true; /* was empty, now not, or vice versa */
 
+	if (old_acting->primary != new_acting->primary)
+		return true; /* primary changed */
+
+	if (calc_pg_rank(old_acting->primary, old_acting) !=
+	    calc_pg_rank(new_acting->primary, new_acting))
+		return true;
+
+	return false; /* same primary (tho replicas may have changed) */
+}
+
+bool ceph_osds_changed(const struct ceph_osds *old_acting,
+		       const struct ceph_osds *new_acting,
+		       bool any_change)
+{
+	if (primary_changed(old_acting, new_acting))
+		return true;
+
+	if (any_change && !__osds_equal(old_acting, new_acting))
+		return true;
+
+	return false;
+}
 
 /*
  * calculate file layout from given offset, length.
@@ -1455,30 +1758,71 @@ invalid:
 EXPORT_SYMBOL(ceph_calc_file_object_mapping);
 
 /*
- * Calculate mapping of a (oloc, oid) pair to a PG.  Should only be
- * called with target's (oloc, oid), since tiering isn't taken into
- * account.
+ * Map an object into a PG.
+ *
+ * Should only be called with target_oid and target_oloc (as opposed to
+ * base_oid and base_oloc), since tiering isn't taken into account.
  */
-int ceph_oloc_oid_to_pg(struct ceph_osdmap *osdmap,
-			struct ceph_object_locator *oloc,
-			struct ceph_object_id *oid,
-			struct ceph_pg *pg_out)
+int ceph_object_locator_to_pg(struct ceph_osdmap *osdmap,
+			      struct ceph_object_id *oid,
+			      struct ceph_object_locator *oloc,
+			      struct ceph_pg *raw_pgid)
 {
 	struct ceph_pg_pool_info *pi;
 
-	pi = __lookup_pg_pool(&osdmap->pg_pools, oloc->pool);
+	pi = ceph_pg_pool_by_id(osdmap, oloc->pool);
 	if (!pi)
-		return -EIO;
+		return -ENOENT;
 
-	pg_out->pool = oloc->pool;
-	pg_out->seed = ceph_str_hash(pi->object_hash, oid->name,
-				     oid->name_len);
+	raw_pgid->pool = oloc->pool;
+	raw_pgid->seed = ceph_str_hash(pi->object_hash, oid->name,
+				       oid->name_len);
 
-	dout("%s '%.*s' pgid %llu.%x\n", __func__, oid->name_len, oid->name,
-	     pg_out->pool, pg_out->seed);
+	dout("%s %s -> raw_pgid %llu.%x\n", __func__, oid->name,
+	     raw_pgid->pool, raw_pgid->seed);
 	return 0;
 }
-EXPORT_SYMBOL(ceph_oloc_oid_to_pg);
+EXPORT_SYMBOL(ceph_object_locator_to_pg);
+
+/*
+ * Map a raw PG (full precision ps) into an actual PG.
+ */
+static void raw_pg_to_pg(struct ceph_pg_pool_info *pi,
+			 const struct ceph_pg *raw_pgid,
+			 struct ceph_pg *pgid)
+{
+	pgid->pool = raw_pgid->pool;
+	pgid->seed = ceph_stable_mod(raw_pgid->seed, pi->pg_num,
+				     pi->pg_num_mask);
+}
+
+/*
+ * Map a raw PG (full precision ps) into a placement ps (placement
+ * seed).  Include pool id in that value so that different pools don't
+ * use the same seeds.
+ */
+static u32 raw_pg_to_pps(struct ceph_pg_pool_info *pi,
+			 const struct ceph_pg *raw_pgid)
+{
+	if (pi->flags & CEPH_POOL_FLAG_HASHPSPOOL) {
+		/* hash pool id and seed so that pool PGs do not overlap */
+		return crush_hash32_2(CRUSH_HASH_RJENKINS1,
+				      ceph_stable_mod(raw_pgid->seed,
+						      pi->pgp_num,
+						      pi->pgp_num_mask),
+				      raw_pgid->pool);
+	} else {
+		/*
+		 * legacy behavior: add ps and pool together.  this is
+		 * not a great approach because the PGs from each pool
+		 * will overlap on top of each other: 0.5 == 1.4 ==
+		 * 2.3 == ...
+		 */
+		return ceph_stable_mod(raw_pgid->seed, pi->pgp_num,
+				       pi->pgp_num_mask) +
+		       (unsigned)raw_pgid->pool;
+	}
+}
 
 static int do_crush(struct ceph_osdmap *map, int ruleno, int x,
 		    int *result, int result_max,
@@ -1497,84 +1841,92 @@ static int do_crush(struct ceph_osdmap *map, int ruleno, int x,
 }
 
 /*
- * Calculate raw (crush) set for given pgid.
+ * Calculate raw set (CRUSH output) for given PG.  The result may
+ * contain nonexistent OSDs.  ->primary is undefined for a raw set.
  *
- * Return raw set length, or error.
+ * Placement seed (CRUSH input) is returned through @ppps.
  */
-static int pg_to_raw_osds(struct ceph_osdmap *osdmap,
-			  struct ceph_pg_pool_info *pool,
-			  struct ceph_pg pgid, u32 pps, int *osds)
+static void pg_to_raw_osds(struct ceph_osdmap *osdmap,
+			   struct ceph_pg_pool_info *pi,
+			   const struct ceph_pg *raw_pgid,
+			   struct ceph_osds *raw,
+			   u32 *ppps)
 {
+	u32 pps = raw_pg_to_pps(pi, raw_pgid);
 	int ruleno;
 	int len;
 
-	/* crush */
-	ruleno = crush_find_rule(osdmap->crush, pool->crush_ruleset,
-				 pool->type, pool->size);
+	ceph_osds_init(raw);
+	if (ppps)
+		*ppps = pps;
+
+	ruleno = crush_find_rule(osdmap->crush, pi->crush_ruleset, pi->type,
+				 pi->size);
 	if (ruleno < 0) {
 		pr_err("no crush rule: pool %lld ruleset %d type %d size %d\n",
-		       pgid.pool, pool->crush_ruleset, pool->type,
-		       pool->size);
-		return -ENOENT;
+		       pi->id, pi->crush_ruleset, pi->type, pi->size);
+		return;
 	}
 
-	len = do_crush(osdmap, ruleno, pps, osds,
-		       min_t(int, pool->size, CEPH_PG_MAX_SIZE),
+	len = do_crush(osdmap, ruleno, pps, raw->osds,
+		       min_t(int, pi->size, ARRAY_SIZE(raw->osds)),
 		       osdmap->osd_weight, osdmap->max_osd);
 	if (len < 0) {
 		pr_err("error %d from crush rule %d: pool %lld ruleset %d type %d size %d\n",
-		       len, ruleno, pgid.pool, pool->crush_ruleset,
-		       pool->type, pool->size);
-		return len;
+		       len, ruleno, pi->id, pi->crush_ruleset, pi->type,
+		       pi->size);
+		return;
 	}
 
-	return len;
+	raw->size = len;
 }
 
 /*
- * Given raw set, calculate up set and up primary.
+ * Given raw set, calculate up set and up primary.  By definition of an
+ * up set, the result won't contain nonexistent or down OSDs.
  *
- * Return up set length.  *primary is set to up primary osd id, or -1
- * if up set is empty.
+ * This is done in-place - on return @set is the up set.  If it's
+ * empty, ->primary will remain undefined.
  */
-static int raw_to_up_osds(struct ceph_osdmap *osdmap,
-			  struct ceph_pg_pool_info *pool,
-			  int *osds, int len, int *primary)
+static void raw_to_up_osds(struct ceph_osdmap *osdmap,
+			   struct ceph_pg_pool_info *pi,
+			   struct ceph_osds *set)
 {
-	int up_primary = -1;
 	int i;
 
-	if (ceph_can_shift_osds(pool)) {
+	/* ->primary is undefined for a raw set */
+	BUG_ON(set->primary != -1);
+
+	if (ceph_can_shift_osds(pi)) {
 		int removed = 0;
 
-		for (i = 0; i < len; i++) {
-			if (ceph_osd_is_down(osdmap, osds[i])) {
+		/* shift left */
+		for (i = 0; i < set->size; i++) {
+			if (ceph_osd_is_down(osdmap, set->osds[i])) {
 				removed++;
 				continue;
 			}
 			if (removed)
-				osds[i - removed] = osds[i];
+				set->osds[i - removed] = set->osds[i];
 		}
-
-		len -= removed;
-		if (len > 0)
-			up_primary = osds[0];
+		set->size -= removed;
+		if (set->size > 0)
+			set->primary = set->osds[0];
 	} else {
-		for (i = len - 1; i >= 0; i--) {
-			if (ceph_osd_is_down(osdmap, osds[i]))
-				osds[i] = CRUSH_ITEM_NONE;
+		/* set down/dne devices to NONE */
+		for (i = set->size - 1; i >= 0; i--) {
+			if (ceph_osd_is_down(osdmap, set->osds[i]))
+				set->osds[i] = CRUSH_ITEM_NONE;
 			else
-				up_primary = osds[i];
+				set->primary = set->osds[i];
 		}
 	}
-
-	*primary = up_primary;
-	return len;
 }
 
-static void apply_primary_affinity(struct ceph_osdmap *osdmap, u32 pps,
-				   struct ceph_pg_pool_info *pool,
-				   int *osds, int len, int *primary)
+static void apply_primary_affinity(struct ceph_osdmap *osdmap,
+				   struct ceph_pg_pool_info *pi,
+				   u32 pps,
+				   struct ceph_osds *up)
 {
 	int i;
 	int pos = -1;
@@ -1586,8 +1938,8 @@ static void apply_primary_affinity(struct ceph_osdmap *osdmap, u32 pps,
 	if (!osdmap->osd_primary_affinity)
 		return;
 
-	for (i = 0; i < len; i++) {
-		int osd = osds[i];
+	for (i = 0; i < up->size; i++) {
+		int osd = up->osds[i];
 
 		if (osd != CRUSH_ITEM_NONE &&
 		    osdmap->osd_primary_affinity[osd] !=
@@ -1595,7 +1947,7 @@ static void apply_primary_affinity(struct ceph_osdmap *osdmap, u32 pps,
 			break;
 		}
 	}
-	if (i == len)
+	if (i == up->size)
 		return;
 
 	/*
@@ -1603,8 +1955,8 @@ static void apply_primary_affinity(struct ceph_osdmap *osdmap, u32 pps,
 	 * osd into the hash/rng so that a proportional fraction of an
 	 * osd's pgs get rejected as primary.
 	 */
-	for (i = 0; i < len; i++) {
-		int osd = osds[i];
+	for (i = 0; i < up->size; i++) {
+		int osd = up->osds[i];
 		u32 aff;
 
 		if (osd == CRUSH_ITEM_NONE)
@@ -1629,135 +1981,110 @@ static void apply_primary_affinity(struct ceph_osdmap *osdmap, u32 pps,
 	if (pos < 0)
 		return;
 
-	*primary = osds[pos];
+	up->primary = up->osds[pos];
 
-	if (ceph_can_shift_osds(pool) && pos > 0) {
+	if (ceph_can_shift_osds(pi) && pos > 0) {
 		/* move the new primary to the front */
 		for (i = pos; i > 0; i--)
-			osds[i] = osds[i - 1];
-		osds[0] = *primary;
+			up->osds[i] = up->osds[i - 1];
+		up->osds[0] = up->primary;
 	}
 }
 
 /*
- * Given up set, apply pg_temp and primary_temp mappings.
+ * Get pg_temp and primary_temp mappings for given PG.
  *
- * Return acting set length.  *primary is set to acting primary osd id,
- * or -1 if acting set is empty.
+ * Note that a PG may have none, only pg_temp, only primary_temp or
+ * both pg_temp and primary_temp mappings.  This means @temp isn't
+ * always a valid OSD set on return: in the "only primary_temp" case,
+ * @temp will have its ->primary >= 0 but ->size == 0.
  */
-static int apply_temps(struct ceph_osdmap *osdmap,
-		       struct ceph_pg_pool_info *pool, struct ceph_pg pgid,
-		       int *osds, int len, int *primary)
+static void get_temp_osds(struct ceph_osdmap *osdmap,
+			  struct ceph_pg_pool_info *pi,
+			  const struct ceph_pg *raw_pgid,
+			  struct ceph_osds *temp)
 {
+	struct ceph_pg pgid;
 	struct ceph_pg_mapping *pg;
-	int temp_len;
-	int temp_primary;
 	int i;
 
-	/* raw_pg -> pg */
-	pgid.seed = ceph_stable_mod(pgid.seed, pool->pg_num,
-				    pool->pg_num_mask);
+	raw_pg_to_pg(pi, raw_pgid, &pgid);
+	ceph_osds_init(temp);
 
 	/* pg_temp? */
 	pg = __lookup_pg_mapping(&osdmap->pg_temp, pgid);
 	if (pg) {
-		temp_len = 0;
-		temp_primary = -1;
-
 		for (i = 0; i < pg->pg_temp.len; i++) {
 			if (ceph_osd_is_down(osdmap, pg->pg_temp.osds[i])) {
-				if (ceph_can_shift_osds(pool))
+				if (ceph_can_shift_osds(pi))
 					continue;
-				else
-					osds[temp_len++] = CRUSH_ITEM_NONE;
+
+				temp->osds[temp->size++] = CRUSH_ITEM_NONE;
 			} else {
-				osds[temp_len++] = pg->pg_temp.osds[i];
+				temp->osds[temp->size++] = pg->pg_temp.osds[i];
 			}
 		}
 
 		/* apply pg_temp's primary */
-		for (i = 0; i < temp_len; i++) {
-			if (osds[i] != CRUSH_ITEM_NONE) {
-				temp_primary = osds[i];
+		for (i = 0; i < temp->size; i++) {
+			if (temp->osds[i] != CRUSH_ITEM_NONE) {
+				temp->primary = temp->osds[i];
 				break;
 			}
 		}
-	} else {
-		temp_len = len;
-		temp_primary = *primary;
 	}
 
 	/* primary_temp? */
 	pg = __lookup_pg_mapping(&osdmap->primary_temp, pgid);
 	if (pg)
-		temp_primary = pg->primary_temp.osd;
-
-	*primary = temp_primary;
-	return temp_len;
+		temp->primary = pg->primary_temp.osd;
 }
 
 /*
- * Calculate acting set for given pgid.
+ * Map a PG to its acting set as well as its up set.
  *
- * Return acting set length, or error.  *primary is set to acting
- * primary osd id, or -1 if acting set is empty or on error.
+ * Acting set is used for data mapping purposes, while up set can be
+ * recorded for detecting interval changes and deciding whether to
+ * resend a request.
  */
-int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
-			int *osds, int *primary)
+void ceph_pg_to_up_acting_osds(struct ceph_osdmap *osdmap,
+			       const struct ceph_pg *raw_pgid,
+			       struct ceph_osds *up,
+			       struct ceph_osds *acting)
 {
-	struct ceph_pg_pool_info *pool;
+	struct ceph_pg_pool_info *pi;
 	u32 pps;
-	int len;
 
-	pool = __lookup_pg_pool(&osdmap->pg_pools, pgid.pool);
-	if (!pool) {
-		*primary = -1;
-		return -ENOENT;
+	pi = ceph_pg_pool_by_id(osdmap, raw_pgid->pool);
+	if (!pi) {
+		ceph_osds_init(up);
+		ceph_osds_init(acting);
+		goto out;
 	}
 
-	if (pool->flags & CEPH_POOL_FLAG_HASHPSPOOL) {
-		/* hash pool id and seed so that pool PGs do not overlap */
-		pps = crush_hash32_2(CRUSH_HASH_RJENKINS1,
-				     ceph_stable_mod(pgid.seed, pool->pgp_num,
-						     pool->pgp_num_mask),
-				     pgid.pool);
-	} else {
-		/*
-		 * legacy behavior: add ps and pool together.  this is
-		 * not a great approach because the PGs from each pool
-		 * will overlap on top of each other: 0.5 == 1.4 ==
-		 * 2.3 == ...
-		 */
-		pps = ceph_stable_mod(pgid.seed, pool->pgp_num,
-				      pool->pgp_num_mask) +
-			(unsigned)pgid.pool;
-	}
-
-	len = pg_to_raw_osds(osdmap, pool, pgid, pps, osds);
-	if (len < 0) {
-		*primary = -1;
-		return len;
+	pg_to_raw_osds(osdmap, pi, raw_pgid, up, &pps);
+	raw_to_up_osds(osdmap, pi, up);
+	apply_primary_affinity(osdmap, pi, pps, up);
+	get_temp_osds(osdmap, pi, raw_pgid, acting);
+	if (!acting->size) {
+		memcpy(acting->osds, up->osds, up->size * sizeof(up->osds[0]));
+		acting->size = up->size;
+		if (acting->primary == -1)
+			acting->primary = up->primary;
 	}
-
-	len = raw_to_up_osds(osdmap, pool, osds, len, primary);
-
-	apply_primary_affinity(osdmap, pps, pool, osds, len, primary);
-
-	len = apply_temps(osdmap, pool, pgid, osds, len, primary);
-
-	return len;
+out:
+	WARN_ON(!osds_valid(up) || !osds_valid(acting));
 }
 
 /*
- * Return primary osd for given pgid, or -1 if none.
+ * Return acting primary for given PG, or -1 if none.
  */
-int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, struct ceph_pg pgid)
+int ceph_pg_to_acting_primary(struct ceph_osdmap *osdmap,
+			      const struct ceph_pg *raw_pgid)
 {
-	int osds[CEPH_PG_MAX_SIZE];
-	int primary;
-
-	ceph_calc_pg_acting(osdmap, pgid, osds, &primary);
+	struct ceph_osds up, acting;
 
-	return primary;
+	ceph_pg_to_up_acting_osds(osdmap, raw_pgid, &up, &acting);
+	return acting.primary;
 }
-EXPORT_SYMBOL(ceph_calc_pg_primary);
+EXPORT_SYMBOL(ceph_pg_to_acting_primary);
diff --git a/net/compat.c b/net/compat.c
index 5cfd26a0006f..1cd2ec046164 100644
--- a/net/compat.c
+++ b/net/compat.c
@@ -309,8 +309,8 @@ void scm_detach_fds_compat(struct msghdr *kmsg, struct scm_cookie *scm)
 	__scm_destroy(scm);
 }
 
-static int do_set_attach_filter(struct socket *sock, int level, int optname,
-				char __user *optval, unsigned int optlen)
+/* allocate a 64-bit sock_fprog on the user stack for duration of syscall. */
+struct sock_fprog __user *get_compat_bpf_fprog(char __user *optval)
 {
 	struct compat_sock_fprog __user *fprog32 = (struct compat_sock_fprog __user *)optval;
 	struct sock_fprog __user *kfprog = compat_alloc_user_space(sizeof(struct sock_fprog));
@@ -323,6 +323,19 @@ static int do_set_attach_filter(struct socket *sock, int level, int optname,
 	    __get_user(ptr, &fprog32->filter) ||
 	    __put_user(len, &kfprog->len) ||
 	    __put_user(compat_ptr(ptr), &kfprog->filter))
+		return NULL;
+
+	return kfprog;
+}
+EXPORT_SYMBOL_GPL(get_compat_bpf_fprog);
+
+static int do_set_attach_filter(struct socket *sock, int level, int optname,
+				char __user *optval, unsigned int optlen)
+{
+	struct sock_fprog __user *kfprog;
+
+	kfprog = get_compat_bpf_fprog(optval);
+	if (!kfprog)
 		return -EFAULT;
 
 	return sock_setsockopt(sock, level, optname, (char __user *)kfprog,
@@ -354,7 +367,8 @@ static int do_set_sock_timeout(struct socket *sock, int level,
 static int compat_sock_setsockopt(struct socket *sock, int level, int optname,
 				char __user *optval, unsigned int optlen)
 {
-	if (optname == SO_ATTACH_FILTER)
+	if (optname == SO_ATTACH_FILTER ||
+	    optname == SO_ATTACH_REUSEPORT_CBPF)
 		return do_set_attach_filter(sock, level, optname,
 					    optval, optlen);
 	if (optname == SO_RCVTIMEO || optname == SO_SNDTIMEO)
diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index bdb4013581b1..f4034817d255 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -84,8 +84,8 @@ static const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN]
 	[NETIF_F_FSO_BIT] =              "tx-fcoe-segmentation",
 	[NETIF_F_GSO_GRE_BIT] =		 "tx-gre-segmentation",
 	[NETIF_F_GSO_GRE_CSUM_BIT] =	 "tx-gre-csum-segmentation",
-	[NETIF_F_GSO_IPIP_BIT] =	 "tx-ipip-segmentation",
-	[NETIF_F_GSO_SIT_BIT] =		 "tx-sit-segmentation",
+	[NETIF_F_GSO_IPXIP4_BIT] =	 "tx-ipxip4-segmentation",
+	[NETIF_F_GSO_IPXIP6_BIT] =	 "tx-ipxip6-segmentation",
 	[NETIF_F_GSO_UDP_TUNNEL_BIT] =	 "tx-udp_tnl-segmentation",
 	[NETIF_F_GSO_UDP_TUNNEL_CSUM_BIT] = "tx-udp_tnl-csum-segmentation",
 	[NETIF_F_GSO_PARTIAL_BIT] =	 "tx-gso-partial",
diff --git a/net/core/gen_stats.c b/net/core/gen_stats.c
index f96ee8b9478d..be873e4e3125 100644
--- a/net/core/gen_stats.c
+++ b/net/core/gen_stats.c
@@ -47,6 +47,7 @@ nla_put_failure:
  * @xstats_type: TLV type for backward compatibility xstats TLV
  * @lock: statistics lock
  * @d: dumping handle
+ * @padattr: padding attribute
  *
  * Initializes the dumping handle, grabs the statistic lock and appends
  * an empty TLV header to the socket buffer for use a container for all
@@ -87,6 +88,7 @@ EXPORT_SYMBOL(gnet_stats_start_copy_compat);
  * @type: TLV type for top level statistic TLV
  * @lock: statistics lock
  * @d: dumping handle
+ * @padattr: padding attribute
  *
  * Initializes the dumping handle, grabs the statistic lock and appends
  * an empty TLV header to the socket buffer for use a container for all
diff --git a/net/core/hwbm.c b/net/core/hwbm.c
index 941c28486896..2cab489ae62e 100644
--- a/net/core/hwbm.c
+++ b/net/core/hwbm.c
@@ -55,18 +55,21 @@ int hwbm_pool_add(struct hwbm_pool *bm_pool, unsigned int buf_num, gfp_t gfp)
 	spin_lock_irqsave(&bm_pool->lock, flags);
 	if (bm_pool->buf_num == bm_pool->size) {
 		pr_warn("pool already filled\n");
+		spin_unlock_irqrestore(&bm_pool->lock, flags);
 		return bm_pool->buf_num;
 	}
 
 	if (buf_num + bm_pool->buf_num > bm_pool->size) {
 		pr_warn("cannot allocate %d buffers for pool\n",
 			buf_num);
+		spin_unlock_irqrestore(&bm_pool->lock, flags);
 		return 0;
 	}
 
 	if ((buf_num + bm_pool->buf_num) < bm_pool->buf_num) {
 		pr_warn("Adding %d buffers to the %d current buffers will overflow\n",
 			buf_num,  bm_pool->buf_num);
+		spin_unlock_irqrestore(&bm_pool->lock, flags);
 		return 0;
 	}
 
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index 2b3f76fe65f4..7a0b616557ab 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -24,6 +24,7 @@
 #include <linux/jiffies.h>
 #include <linux/pm_runtime.h>
 #include <linux/of.h>
+#include <linux/of_net.h>
 
 #include "net-sysfs.h"
 
diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index 8604ae245960..8b02df0d354d 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -2245,10 +2245,8 @@ static void spin(struct pktgen_dev *pkt_dev, ktime_t spin_until)
 	hrtimer_set_expires(&t.timer, spin_until);
 
 	remaining = ktime_to_ns(hrtimer_expires_remaining(&t.timer));
-	if (remaining <= 0) {
-		pkt_dev->next_tx = ktime_add_ns(spin_until, pkt_dev->delay);
-		return;
-	}
+	if (remaining <= 0)
+		goto out;
 
 	start_time = ktime_get();
 	if (remaining < 100000) {
@@ -2273,7 +2271,9 @@ static void spin(struct pktgen_dev *pkt_dev, ktime_t spin_until)
 	}
 
 	pkt_dev->idle_acc += ktime_to_ns(ktime_sub(end_time, start_time));
+out:
 	pkt_dev->next_tx = ktime_add_ns(spin_until, pkt_dev->delay);
+	destroy_hrtimer_on_stack(&t.timer);
 }
 
 static inline void set_pkt_overhead(struct pktgen_dev *pkt_dev)
diff --git a/net/dns_resolver/dns_key.c b/net/dns_resolver/dns_key.c
index c79b85eb4d4c..8737412c7b27 100644
--- a/net/dns_resolver/dns_key.c
+++ b/net/dns_resolver/dns_key.c
@@ -281,7 +281,7 @@ static int __init init_dns_resolver(void)
 				GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, cred,
 				(KEY_POS_ALL & ~KEY_POS_SETATTR) |
 				KEY_USR_VIEW | KEY_USR_READ,
-				KEY_ALLOC_NOT_IN_QUOTA, NULL);
+				KEY_ALLOC_NOT_IN_QUOTA, NULL, NULL);
 	if (IS_ERR(keyring)) {
 		ret = PTR_ERR(keyring);
 		goto failed_put_cred;
diff --git a/net/ieee802154/nl802154.c b/net/ieee802154/nl802154.c
index ca207dbf673b..116187b5c267 100644
--- a/net/ieee802154/nl802154.c
+++ b/net/ieee802154/nl802154.c
@@ -1289,8 +1289,8 @@ ieee802154_llsec_parse_dev_addr(struct nlattr *nla,
 				     nl802154_dev_addr_policy))
 		return -EINVAL;
 
-	if (!attrs[NL802154_DEV_ADDR_ATTR_PAN_ID] &&
-	    !attrs[NL802154_DEV_ADDR_ATTR_MODE] &&
+	if (!attrs[NL802154_DEV_ADDR_ATTR_PAN_ID] ||
+	    !attrs[NL802154_DEV_ADDR_ATTR_MODE] ||
 	    !(attrs[NL802154_DEV_ADDR_ATTR_SHORT] ||
 	      attrs[NL802154_DEV_ADDR_ATTR_EXTENDED]))
 		return -EINVAL;
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 2e6e65fc4d20..d39e9e47a26e 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1192,8 +1192,8 @@ int inet_sk_rebuild_header(struct sock *sk)
 }
 EXPORT_SYMBOL(inet_sk_rebuild_header);
 
-static struct sk_buff *inet_gso_segment(struct sk_buff *skb,
-					netdev_features_t features)
+struct sk_buff *inet_gso_segment(struct sk_buff *skb,
+				 netdev_features_t features)
 {
 	bool udpfrag = false, fixedid = false, encap;
 	struct sk_buff *segs = ERR_PTR(-EINVAL);
@@ -1205,24 +1205,6 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb,
 	int ihl;
 	int id;
 
-	if (unlikely(skb_shinfo(skb)->gso_type &
-		     ~(SKB_GSO_TCPV4 |
-		       SKB_GSO_UDP |
-		       SKB_GSO_DODGY |
-		       SKB_GSO_TCP_ECN |
-		       SKB_GSO_GRE |
-		       SKB_GSO_GRE_CSUM |
-		       SKB_GSO_IPIP |
-		       SKB_GSO_SIT |
-		       SKB_GSO_TCPV6 |
-		       SKB_GSO_UDP_TUNNEL |
-		       SKB_GSO_UDP_TUNNEL_CSUM |
-		       SKB_GSO_TCP_FIXEDID |
-		       SKB_GSO_TUNNEL_REMCSUM |
-		       SKB_GSO_PARTIAL |
-		       0)))
-		goto out;
-
 	skb_reset_network_header(skb);
 	nhoff = skb_network_header(skb) - skb_mac_header(skb);
 	if (unlikely(!pskb_may_pull(skb, sizeof(*iph))))
@@ -1298,9 +1280,9 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb,
 out:
 	return segs;
 }
+EXPORT_SYMBOL(inet_gso_segment);
 
-static struct sk_buff **inet_gro_receive(struct sk_buff **head,
-					 struct sk_buff *skb)
+struct sk_buff **inet_gro_receive(struct sk_buff **head, struct sk_buff *skb)
 {
 	const struct net_offload *ops;
 	struct sk_buff **pp = NULL;
@@ -1416,6 +1398,7 @@ out:
 
 	return pp;
 }
+EXPORT_SYMBOL(inet_gro_receive);
 
 static struct sk_buff **ipip_gro_receive(struct sk_buff **head,
 					 struct sk_buff *skb)
@@ -1467,7 +1450,7 @@ int inet_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len)
 	return -EINVAL;
 }
 
-static int inet_gro_complete(struct sk_buff *skb, int nhoff)
+int inet_gro_complete(struct sk_buff *skb, int nhoff)
 {
 	__be16 newlen = htons(skb->len - nhoff);
 	struct iphdr *iph = (struct iphdr *)(skb->data + nhoff);
@@ -1497,11 +1480,12 @@ out_unlock:
 
 	return err;
 }
+EXPORT_SYMBOL(inet_gro_complete);
 
 static int ipip_gro_complete(struct sk_buff *skb, int nhoff)
 {
 	skb->encapsulation = 1;
-	skb_shinfo(skb)->gso_type |= SKB_GSO_IPIP;
+	skb_shinfo(skb)->gso_type |= SKB_GSO_IPXIP4;
 	return inet_gro_complete(skb, nhoff);
 }
 
@@ -1697,6 +1681,14 @@ static __net_init int inet_init_net(struct net *net)
 	 */
 	net->ipv4.ping_group_range.range[0] = make_kgid(&init_user_ns, 1);
 	net->ipv4.ping_group_range.range[1] = make_kgid(&init_user_ns, 0);
+
+	/* Default values for sysctl-controlled parameters.
+	 * We set them here, in case sysctl is not compiled.
+	 */
+	net->ipv4.sysctl_ip_default_ttl = IPDEFTTL;
+	net->ipv4.sysctl_ip_dynaddr = 0;
+	net->ipv4.sysctl_ip_early_demux = 1;
+
 	return 0;
 }
 
diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c
index eeec7d60e5fd..5f9207c039e7 100644
--- a/net/ipv4/fou.c
+++ b/net/ipv4/fou.c
@@ -21,6 +21,7 @@ struct fou {
 	u8 protocol;
 	u8 flags;
 	__be16 port;
+	u8 family;
 	u16 type;
 	struct list_head list;
 	struct rcu_head rcu;
@@ -47,14 +48,17 @@ static inline struct fou *fou_from_sock(struct sock *sk)
 	return sk->sk_user_data;
 }
 
-static int fou_recv_pull(struct sk_buff *skb, size_t len)
+static int fou_recv_pull(struct sk_buff *skb, struct fou *fou, size_t len)
 {
-	struct iphdr *iph = ip_hdr(skb);
-
 	/* Remove 'len' bytes from the packet (UDP header and
 	 * FOU header if present).
 	 */
-	iph->tot_len = htons(ntohs(iph->tot_len) - len);
+	if (fou->family == AF_INET)
+		ip_hdr(skb)->tot_len = htons(ntohs(ip_hdr(skb)->tot_len) - len);
+	else
+		ipv6_hdr(skb)->payload_len =
+		    htons(ntohs(ipv6_hdr(skb)->payload_len) - len);
+
 	__skb_pull(skb, len);
 	skb_postpull_rcsum(skb, udp_hdr(skb), len);
 	skb_reset_transport_header(skb);
@@ -68,7 +72,7 @@ static int fou_udp_recv(struct sock *sk, struct sk_buff *skb)
 	if (!fou)
 		return 1;
 
-	if (fou_recv_pull(skb, sizeof(struct udphdr)))
+	if (fou_recv_pull(skb, fou, sizeof(struct udphdr)))
 		goto drop;
 
 	return -fou->protocol;
@@ -141,7 +145,11 @@ static int gue_udp_recv(struct sock *sk, struct sk_buff *skb)
 
 	hdrlen = sizeof(struct guehdr) + optlen;
 
-	ip_hdr(skb)->tot_len = htons(ntohs(ip_hdr(skb)->tot_len) - len);
+	if (fou->family == AF_INET)
+		ip_hdr(skb)->tot_len = htons(ntohs(ip_hdr(skb)->tot_len) - len);
+	else
+		ipv6_hdr(skb)->payload_len =
+		    htons(ntohs(ipv6_hdr(skb)->payload_len) - len);
 
 	/* Pull csum through the guehdr now . This can be used if
 	 * there is a remote checksum offload.
@@ -426,7 +434,8 @@ static int fou_add_to_port_list(struct net *net, struct fou *fou)
 
 	mutex_lock(&fn->fou_lock);
 	list_for_each_entry(fout, &fn->fou_list, list) {
-		if (fou->port == fout->port) {
+		if (fou->port == fout->port &&
+		    fou->family == fout->family) {
 			mutex_unlock(&fn->fou_lock);
 			return -EALREADY;
 		}
@@ -448,31 +457,13 @@ static void fou_release(struct fou *fou)
 	kfree_rcu(fou, rcu);
 }
 
-static int fou_encap_init(struct sock *sk, struct fou *fou, struct fou_cfg *cfg)
-{
-	udp_sk(sk)->encap_rcv = fou_udp_recv;
-	udp_sk(sk)->gro_receive = fou_gro_receive;
-	udp_sk(sk)->gro_complete = fou_gro_complete;
-	fou_from_sock(sk)->protocol = cfg->protocol;
-
-	return 0;
-}
-
-static int gue_encap_init(struct sock *sk, struct fou *fou, struct fou_cfg *cfg)
-{
-	udp_sk(sk)->encap_rcv = gue_udp_recv;
-	udp_sk(sk)->gro_receive = gue_gro_receive;
-	udp_sk(sk)->gro_complete = gue_gro_complete;
-
-	return 0;
-}
-
 static int fou_create(struct net *net, struct fou_cfg *cfg,
 		      struct socket **sockp)
 {
 	struct socket *sock = NULL;
 	struct fou *fou = NULL;
 	struct sock *sk;
+	struct udp_tunnel_sock_cfg tunnel_cfg;
 	int err;
 
 	/* Open UDP socket */
@@ -489,35 +480,36 @@ static int fou_create(struct net *net, struct fou_cfg *cfg,
 
 	sk = sock->sk;
 
-	fou->flags = cfg->flags;
 	fou->port = cfg->udp_config.local_udp_port;
+	fou->family = cfg->udp_config.family;
+	fou->flags = cfg->flags;
+	fou->type = cfg->type;
+	fou->sock = sock;
+
+	memset(&tunnel_cfg, 0, sizeof(tunnel_cfg));
+	tunnel_cfg.encap_type = 1;
+	tunnel_cfg.sk_user_data = fou;
+	tunnel_cfg.encap_destroy = NULL;
 
 	/* Initial for fou type */
 	switch (cfg->type) {
 	case FOU_ENCAP_DIRECT:
-		err = fou_encap_init(sk, fou, cfg);
-		if (err)
-			goto error;
+		tunnel_cfg.encap_rcv = fou_udp_recv;
+		tunnel_cfg.gro_receive = fou_gro_receive;
+		tunnel_cfg.gro_complete = fou_gro_complete;
+		fou->protocol = cfg->protocol;
 		break;
 	case FOU_ENCAP_GUE:
-		err = gue_encap_init(sk, fou, cfg);
-		if (err)
-			goto error;
+		tunnel_cfg.encap_rcv = gue_udp_recv;
+		tunnel_cfg.gro_receive = gue_gro_receive;
+		tunnel_cfg.gro_complete = gue_gro_complete;
 		break;
 	default:
 		err = -EINVAL;
 		goto error;
 	}
 
-	fou->type = cfg->type;
-
-	udp_sk(sk)->encap_type = 1;
-	udp_encap_enable();
-
-	sk->sk_user_data = fou;
-	fou->sock = sock;
-
-	inet_inc_convert_csum(sk);
+	setup_udp_tunnel_sock(net, sock, &tunnel_cfg);
 
 	sk->sk_allocation = GFP_ATOMIC;
 
@@ -542,12 +534,13 @@ static int fou_destroy(struct net *net, struct fou_cfg *cfg)
 {
 	struct fou_net *fn = net_generic(net, fou_net_id);
 	__be16 port = cfg->udp_config.local_udp_port;
+	u8 family = cfg->udp_config.family;
 	int err = -EINVAL;
 	struct fou *fou;
 
 	mutex_lock(&fn->fou_lock);
 	list_for_each_entry(fou, &fn->fou_list, list) {
-		if (fou->port == port) {
+		if (fou->port == port && fou->family == family) {
 			fou_release(fou);
 			err = 0;
 			break;
@@ -585,8 +578,15 @@ static int parse_nl_config(struct genl_info *info,
 	if (info->attrs[FOU_ATTR_AF]) {
 		u8 family = nla_get_u8(info->attrs[FOU_ATTR_AF]);
 
-		if (family != AF_INET)
-			return -EINVAL;
+		switch (family) {
+		case AF_INET:
+			break;
+		case AF_INET6:
+			cfg->udp_config.ipv6_v6only = 1;
+			break;
+		default:
+			return -EAFNOSUPPORT;
+		}
 
 		cfg->udp_config.family = family;
 	}
@@ -677,6 +677,7 @@ static int fou_nl_cmd_get_port(struct sk_buff *skb, struct genl_info *info)
 	struct fou_cfg cfg;
 	struct fou *fout;
 	__be16 port;
+	u8 family;
 	int ret;
 
 	ret = parse_nl_config(info, &cfg);
@@ -686,6 +687,10 @@ static int fou_nl_cmd_get_port(struct sk_buff *skb, struct genl_info *info)
 	if (port == 0)
 		return -EINVAL;
 
+	family = cfg.udp_config.family;
+	if (family != AF_INET && family != AF_INET6)
+		return -EINVAL;
+
 	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
 	if (!msg)
 		return -ENOMEM;
@@ -693,7 +698,7 @@ static int fou_nl_cmd_get_port(struct sk_buff *skb, struct genl_info *info)
 	ret = -ESRCH;
 	mutex_lock(&fn->fou_lock);
 	list_for_each_entry(fout, &fn->fou_list, list) {
-		if (port == fout->port) {
+		if (port == fout->port && family == fout->family) {
 			ret = fou_dump_info(fout, info->snd_portid,
 					    info->snd_seq, 0, msg,
 					    info->genlhdr->cmd);
@@ -798,6 +803,22 @@ static void fou_build_udp(struct sk_buff *skb, struct ip_tunnel_encap *e,
 	*protocol = IPPROTO_UDP;
 }
 
+int __fou_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e,
+		       u8 *protocol, __be16 *sport, int type)
+{
+	int err;
+
+	err = iptunnel_handle_offloads(skb, type);
+	if (err)
+		return err;
+
+	*sport = e->sport ? : udp_flow_src_port(dev_net(skb->dev),
+						skb, 0, 0, false);
+
+	return 0;
+}
+EXPORT_SYMBOL(__fou_build_header);
+
 int fou_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e,
 		     u8 *protocol, struct flowi4 *fl4)
 {
@@ -806,26 +827,21 @@ int fou_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e,
 	__be16 sport;
 	int err;
 
-	err = iptunnel_handle_offloads(skb, type);
+	err = __fou_build_header(skb, e, protocol, &sport, type);
 	if (err)
 		return err;
 
-	sport = e->sport ? : udp_flow_src_port(dev_net(skb->dev),
-					       skb, 0, 0, false);
 	fou_build_udp(skb, e, fl4, protocol, sport);
 
 	return 0;
 }
 EXPORT_SYMBOL(fou_build_header);
 
-int gue_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e,
-		     u8 *protocol, struct flowi4 *fl4)
+int __gue_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e,
+		       u8 *protocol, __be16 *sport, int type)
 {
-	int type = e->flags & TUNNEL_ENCAP_FLAG_CSUM ? SKB_GSO_UDP_TUNNEL_CSUM :
-						       SKB_GSO_UDP_TUNNEL;
 	struct guehdr *guehdr;
 	size_t hdrlen, optlen = 0;
-	__be16 sport;
 	void *data;
 	bool need_priv = false;
 	int err;
@@ -844,8 +860,8 @@ int gue_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e,
 		return err;
 
 	/* Get source port (based on flow hash) before skb_push */
-	sport = e->sport ? : udp_flow_src_port(dev_net(skb->dev),
-					       skb, 0, 0, false);
+	*sport = e->sport ? : udp_flow_src_port(dev_net(skb->dev),
+						skb, 0, 0, false);
 
 	hdrlen = sizeof(struct guehdr) + optlen;
 
@@ -890,6 +906,22 @@ int gue_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e,
 
 	}
 
+	return 0;
+}
+EXPORT_SYMBOL(__gue_build_header);
+
+int gue_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e,
+		     u8 *protocol, struct flowi4 *fl4)
+{
+	int type = e->flags & TUNNEL_ENCAP_FLAG_CSUM ? SKB_GSO_UDP_TUNNEL_CSUM :
+						       SKB_GSO_UDP_TUNNEL;
+	__be16 sport;
+	int err;
+
+	err = __gue_build_header(skb, e, protocol, &sport, type);
+	if (err)
+		return err;
+
 	fou_build_udp(skb, e, fl4, protocol, sport);
 
 	return 0;
diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c
index e88190a8699a..ecd1e09dbbf1 100644
--- a/net/ipv4/gre_offload.c
+++ b/net/ipv4/gre_offload.c
@@ -26,20 +26,6 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb,
 	int gre_offset, outer_hlen;
 	bool need_csum, ufo;
 
-	if (unlikely(skb_shinfo(skb)->gso_type &
-				~(SKB_GSO_TCPV4 |
-				  SKB_GSO_TCPV6 |
-				  SKB_GSO_UDP |
-				  SKB_GSO_DODGY |
-				  SKB_GSO_TCP_ECN |
-				  SKB_GSO_TCP_FIXEDID |
-				  SKB_GSO_GRE |
-				  SKB_GSO_GRE_CSUM |
-				  SKB_GSO_IPIP |
-				  SKB_GSO_SIT |
-				  SKB_GSO_PARTIAL)))
-		goto out;
-
 	if (!skb->encapsulation)
 		goto out;
 
diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c
index a69ed94bda1b..d8f5e0a269f5 100644
--- a/net/ipv4/ip_tunnel.c
+++ b/net/ipv4/ip_tunnel.c
@@ -443,29 +443,6 @@ drop:
 }
 EXPORT_SYMBOL_GPL(ip_tunnel_rcv);
 
-static int ip_encap_hlen(struct ip_tunnel_encap *e)
-{
-	const struct ip_tunnel_encap_ops *ops;
-	int hlen = -EINVAL;
-
-	if (e->type == TUNNEL_ENCAP_NONE)
-		return 0;
-
-	if (e->type >= MAX_IPTUN_ENCAP_OPS)
-		return -EINVAL;
-
-	rcu_read_lock();
-	ops = rcu_dereference(iptun_encaps[e->type]);
-	if (likely(ops && ops->encap_hlen))
-		hlen = ops->encap_hlen(e);
-	rcu_read_unlock();
-
-	return hlen;
-}
-
-const struct ip_tunnel_encap_ops __rcu *
-		iptun_encaps[MAX_IPTUN_ENCAP_OPS] __read_mostly;
-
 int ip_tunnel_encap_add_ops(const struct ip_tunnel_encap_ops *ops,
 			    unsigned int num)
 {
@@ -519,28 +496,6 @@ int ip_tunnel_encap_setup(struct ip_tunnel *t,
 }
 EXPORT_SYMBOL_GPL(ip_tunnel_encap_setup);
 
-int ip_tunnel_encap(struct sk_buff *skb, struct ip_tunnel *t,
-		    u8 *protocol, struct flowi4 *fl4)
-{
-	const struct ip_tunnel_encap_ops *ops;
-	int ret = -EINVAL;
-
-	if (t->encap.type == TUNNEL_ENCAP_NONE)
-		return 0;
-
-	if (t->encap.type >= MAX_IPTUN_ENCAP_OPS)
-		return -EINVAL;
-
-	rcu_read_lock();
-	ops = rcu_dereference(iptun_encaps[t->encap.type]);
-	if (likely(ops && ops->build_header))
-		ret = ops->build_header(skb, &t->encap, protocol, fl4);
-	rcu_read_unlock();
-
-	return ret;
-}
-EXPORT_SYMBOL(ip_tunnel_encap);
-
 static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb,
 			    struct rtable *rt, __be16 df,
 			    const struct iphdr *inner_iph)
diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c
index 9118b0e640ba..afd6b5968caf 100644
--- a/net/ipv4/ip_tunnel_core.c
+++ b/net/ipv4/ip_tunnel_core.c
@@ -37,6 +37,7 @@
 #include <net/icmp.h>
 #include <net/protocol.h>
 #include <net/ip_tunnels.h>
+#include <net/ip6_tunnel.h>
 #include <net/arp.h>
 #include <net/checksum.h>
 #include <net/dsfield.h>
@@ -47,6 +48,14 @@
 #include <net/rtnetlink.h>
 #include <net/dst_metadata.h>
 
+const struct ip_tunnel_encap_ops __rcu *
+		iptun_encaps[MAX_IPTUN_ENCAP_OPS] __read_mostly;
+EXPORT_SYMBOL(iptun_encaps);
+
+const struct ip6_tnl_encap_ops __rcu *
+		ip6tun_encaps[MAX_IPTUN_ENCAP_OPS] __read_mostly;
+EXPORT_SYMBOL(ip6tun_encaps);
+
 void iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb,
 		   __be32 src, __be32 dst, __u8 proto,
 		   __u8 tos, __u8 ttl, __be16 df, bool xnet)
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index 92827483ee3d..978370132f29 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -219,7 +219,7 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
 	if (unlikely(skb->protocol != htons(ETH_P_IP)))
 		goto tx_error;
 
-	if (iptunnel_handle_offloads(skb, SKB_GSO_IPIP))
+	if (iptunnel_handle_offloads(skb, SKB_GSO_IPXIP4))
 		goto tx_error;
 
 	skb_set_inner_ipproto(skb, IPPROTO_IPIP);
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index bb0419582b8d..1cb67de106fe 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -999,10 +999,6 @@ static __net_init int ipv4_sysctl_init_net(struct net *net)
 	if (!net->ipv4.sysctl_local_reserved_ports)
 		goto err_ports;
 
-	net->ipv4.sysctl_ip_default_ttl = IPDEFTTL;
-	net->ipv4.sysctl_ip_dynaddr = 0;
-	net->ipv4.sysctl_ip_early_demux = 1;
-
 	return 0;
 
 err_ports:
diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c
index 02737b607aa7..5c5964962d0c 100644
--- a/net/ipv4/tcp_offload.c
+++ b/net/ipv4/tcp_offload.c
@@ -83,25 +83,6 @@ struct sk_buff *tcp_gso_segment(struct sk_buff *skb,
 
 	if (skb_gso_ok(skb, features | NETIF_F_GSO_ROBUST)) {
 		/* Packet is from an untrusted source, reset gso_segs. */
-		int type = skb_shinfo(skb)->gso_type;
-
-		if (unlikely(type &
-			     ~(SKB_GSO_TCPV4 |
-			       SKB_GSO_DODGY |
-			       SKB_GSO_TCP_ECN |
-			       SKB_GSO_TCP_FIXEDID |
-			       SKB_GSO_TCPV6 |
-			       SKB_GSO_GRE |
-			       SKB_GSO_GRE_CSUM |
-			       SKB_GSO_IPIP |
-			       SKB_GSO_SIT |
-			       SKB_GSO_UDP_TUNNEL |
-			       SKB_GSO_UDP_TUNNEL_CSUM |
-			       SKB_GSO_TUNNEL_REMCSUM |
-			       0) ||
-			     !(type & (SKB_GSO_TCPV4 |
-				       SKB_GSO_TCPV6))))
-			goto out;
 
 		skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(skb->len, mss);
 
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 2e3ebfe5549e..0ff31d97d485 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1565,7 +1565,7 @@ int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 
 		/* if we're overly short, let UDP handle it */
 		encap_rcv = ACCESS_ONCE(up->encap_rcv);
-		if (skb->len > sizeof(struct udphdr) && encap_rcv) {
+		if (encap_rcv) {
 			int ret;
 
 			/* Verify checksum before giving to encap */
@@ -1618,12 +1618,12 @@ int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 		}
 	}
 
-	if (rcu_access_pointer(sk->sk_filter)) {
-		if (udp_lib_checksum_complete(skb))
+	if (rcu_access_pointer(sk->sk_filter) &&
+	    udp_lib_checksum_complete(skb))
 			goto csum_error;
-		if (sk_filter(sk, skb))
-			goto drop;
-	}
+
+	if (sk_filter(sk, skb))
+		goto drop;
 
 	udp_csum_pull_header(skb);
 	if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
index 6b7459c92bb2..81f253b6ff36 100644
--- a/net/ipv4/udp_offload.c
+++ b/net/ipv4/udp_offload.c
@@ -209,16 +209,6 @@ static struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb,
 
 	if (skb_gso_ok(skb, features | NETIF_F_GSO_ROBUST)) {
 		/* Packet is from an untrusted source, reset gso_segs. */
-		int type = skb_shinfo(skb)->gso_type;
-
-		if (unlikely(type & ~(SKB_GSO_UDP | SKB_GSO_DODGY |
-				      SKB_GSO_UDP_TUNNEL |
-				      SKB_GSO_UDP_TUNNEL_CSUM |
-				      SKB_GSO_TUNNEL_REMCSUM |
-				      SKB_GSO_IPIP |
-				      SKB_GSO_GRE | SKB_GSO_GRE_CSUM) ||
-			     !(type & (SKB_GSO_UDP))))
-			goto out;
 
 		skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(skb->len, mss);
 
diff --git a/net/ipv6/Kconfig b/net/ipv6/Kconfig
index 3f8411328de5..2343e4f2e0bf 100644
--- a/net/ipv6/Kconfig
+++ b/net/ipv6/Kconfig
@@ -232,6 +232,15 @@ config IPV6_GRE
 
 	  Saying M here will produce a module called ip6_gre. If unsure, say N.
 
+config IPV6_FOU
+	tristate
+	default NET_FOU && IPV6
+
+config IPV6_FOU_TUNNEL
+	tristate
+	default NET_FOU_IP_TUNNELS && IPV6_FOU
+	select IPV6_TUNNEL
+
 config IPV6_MULTIPLE_TABLES
 	bool "IPv6: Multiple Routing Tables"
 	select FIB_RULES
diff --git a/net/ipv6/Makefile b/net/ipv6/Makefile
index 5e9d6bf4aaca..6d8ea099213e 100644
--- a/net/ipv6/Makefile
+++ b/net/ipv6/Makefile
@@ -42,6 +42,7 @@ obj-$(CONFIG_IPV6_VTI) += ip6_vti.o
 obj-$(CONFIG_IPV6_SIT) += sit.o
 obj-$(CONFIG_IPV6_TUNNEL) += ip6_tunnel.o
 obj-$(CONFIG_IPV6_GRE) += ip6_gre.o
+obj-$(CONFIG_IPV6_FOU) += fou6.o
 
 obj-y += addrconf_core.o exthdrs_core.o ip6_checksum.o ip6_icmp.o
 obj-$(CONFIG_INET) += output_core.o protocol.o $(ipv6-offload)
diff --git a/net/ipv6/fou6.c b/net/ipv6/fou6.c
new file mode 100644
index 000000000000..9ea249b9451e
--- /dev/null
+++ b/net/ipv6/fou6.c
@@ -0,0 +1,140 @@
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/socket.h>
+#include <linux/skbuff.h>
+#include <linux/ip.h>
+#include <linux/udp.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <net/fou.h>
+#include <net/ip.h>
+#include <net/ip6_tunnel.h>
+#include <net/ip6_checksum.h>
+#include <net/protocol.h>
+#include <net/udp.h>
+#include <net/udp_tunnel.h>
+
+static void fou6_build_udp(struct sk_buff *skb, struct ip_tunnel_encap *e,
+			   struct flowi6 *fl6, u8 *protocol, __be16 sport)
+{
+	struct udphdr *uh;
+
+	skb_push(skb, sizeof(struct udphdr));
+	skb_reset_transport_header(skb);
+
+	uh = udp_hdr(skb);
+
+	uh->dest = e->dport;
+	uh->source = sport;
+	uh->len = htons(skb->len);
+	udp6_set_csum(!(e->flags & TUNNEL_ENCAP_FLAG_CSUM6), skb,
+		      &fl6->saddr, &fl6->daddr, skb->len);
+
+	*protocol = IPPROTO_UDP;
+}
+
+int fou6_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e,
+		      u8 *protocol, struct flowi6 *fl6)
+{
+	__be16 sport;
+	int err;
+	int type = e->flags & TUNNEL_ENCAP_FLAG_CSUM6 ?
+		SKB_GSO_UDP_TUNNEL_CSUM : SKB_GSO_UDP_TUNNEL;
+
+	err = __fou_build_header(skb, e, protocol, &sport, type);
+	if (err)
+		return err;
+
+	fou6_build_udp(skb, e, fl6, protocol, sport);
+
+	return 0;
+}
+EXPORT_SYMBOL(fou6_build_header);
+
+int gue6_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e,
+		      u8 *protocol, struct flowi6 *fl6)
+{
+	__be16 sport;
+	int err;
+	int type = e->flags & TUNNEL_ENCAP_FLAG_CSUM6 ?
+		SKB_GSO_UDP_TUNNEL_CSUM : SKB_GSO_UDP_TUNNEL;
+
+	err = __gue_build_header(skb, e, protocol, &sport, type);
+	if (err)
+		return err;
+
+	fou6_build_udp(skb, e, fl6, protocol, sport);
+
+	return 0;
+}
+EXPORT_SYMBOL(gue6_build_header);
+
+#if IS_ENABLED(CONFIG_IPV6_FOU_TUNNEL)
+
+static const struct ip6_tnl_encap_ops fou_ip6tun_ops = {
+	.encap_hlen = fou_encap_hlen,
+	.build_header = fou6_build_header,
+};
+
+static const struct ip6_tnl_encap_ops gue_ip6tun_ops = {
+	.encap_hlen = gue_encap_hlen,
+	.build_header = gue6_build_header,
+};
+
+static int ip6_tnl_encap_add_fou_ops(void)
+{
+	int ret;
+
+	ret = ip6_tnl_encap_add_ops(&fou_ip6tun_ops, TUNNEL_ENCAP_FOU);
+	if (ret < 0) {
+		pr_err("can't add fou6 ops\n");
+		return ret;
+	}
+
+	ret = ip6_tnl_encap_add_ops(&gue_ip6tun_ops, TUNNEL_ENCAP_GUE);
+	if (ret < 0) {
+		pr_err("can't add gue6 ops\n");
+		ip6_tnl_encap_del_ops(&fou_ip6tun_ops, TUNNEL_ENCAP_FOU);
+		return ret;
+	}
+
+	return 0;
+}
+
+static void ip6_tnl_encap_del_fou_ops(void)
+{
+	ip6_tnl_encap_del_ops(&fou_ip6tun_ops, TUNNEL_ENCAP_FOU);
+	ip6_tnl_encap_del_ops(&gue_ip6tun_ops, TUNNEL_ENCAP_GUE);
+}
+
+#else
+
+static int ip6_tnl_encap_add_fou_ops(void)
+{
+	return 0;
+}
+
+static void ip6_tnl_encap_del_fou_ops(void)
+{
+}
+
+#endif
+
+static int __init fou6_init(void)
+{
+	int ret;
+
+	ret = ip6_tnl_encap_add_fou_ops();
+
+	return ret;
+}
+
+static void __exit fou6_fini(void)
+{
+	ip6_tnl_encap_del_fou_ops();
+}
+
+module_init(fou6_init);
+module_exit(fou6_fini);
+MODULE_AUTHOR("Tom Herbert <therbert@google.com>");
+MODULE_LICENSE("GPL");
diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c
index 4541fa54035e..fdc9de276ab1 100644
--- a/net/ipv6/ip6_gre.c
+++ b/net/ipv6/ip6_gre.c
@@ -712,6 +712,7 @@ static void ip6gre_tnl_link_config(struct ip6_tnl *t, int set_mtu)
 	fl6->daddr = p->raddr;
 	fl6->flowi6_oif = p->link;
 	fl6->flowlabel = 0;
+	fl6->flowi6_proto = IPPROTO_GRE;
 
 	if (!(p->flags&IP6_TNL_F_USE_ORIG_TCLASS))
 		fl6->flowlabel |= IPV6_TCLASS_MASK & p->flowinfo;
@@ -729,7 +730,7 @@ static void ip6gre_tnl_link_config(struct ip6_tnl *t, int set_mtu)
 
 	t->tun_hlen = gre_calc_hlen(t->parms.o_flags);
 
-	t->hlen = t->tun_hlen;
+	t->hlen = t->encap_hlen + t->tun_hlen;
 
 	t_hlen = t->hlen + sizeof(struct ipv6hdr);
 
@@ -1022,13 +1023,13 @@ static int ip6gre_tunnel_init_common(struct net_device *dev)
 	}
 
 	tunnel->tun_hlen = gre_calc_hlen(tunnel->parms.o_flags);
-
-	tunnel->hlen = tunnel->tun_hlen;
-
+	tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen;
 	t_hlen = tunnel->hlen + sizeof(struct ipv6hdr);
 
 	dev->hard_header_len = LL_MAX_HEADER + t_hlen;
 	dev->mtu = ETH_DATA_LEN - t_hlen;
+	if (dev->type == ARPHRD_ETHER)
+		dev->mtu -= ETH_HLEN;
 	if (!(tunnel->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT))
 		dev->mtu -= 8;
 
@@ -1255,6 +1256,8 @@ static int ip6gre_tap_init(struct net_device *dev)
 	if (ret)
 		return ret;
 
+	dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
+
 	tunnel = netdev_priv(dev);
 
 	ip6gre_tnl_link_config(tunnel, 1);
@@ -1288,6 +1291,40 @@ static void ip6gre_tap_setup(struct net_device *dev)
 
 	dev->features |= NETIF_F_NETNS_LOCAL;
 	dev->priv_flags &= ~IFF_TX_SKB_SHARING;
+	dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
+}
+
+static bool ip6gre_netlink_encap_parms(struct nlattr *data[],
+				       struct ip_tunnel_encap *ipencap)
+{
+	bool ret = false;
+
+	memset(ipencap, 0, sizeof(*ipencap));
+
+	if (!data)
+		return ret;
+
+	if (data[IFLA_GRE_ENCAP_TYPE]) {
+		ret = true;
+		ipencap->type = nla_get_u16(data[IFLA_GRE_ENCAP_TYPE]);
+	}
+
+	if (data[IFLA_GRE_ENCAP_FLAGS]) {
+		ret = true;
+		ipencap->flags = nla_get_u16(data[IFLA_GRE_ENCAP_FLAGS]);
+	}
+
+	if (data[IFLA_GRE_ENCAP_SPORT]) {
+		ret = true;
+		ipencap->sport = nla_get_be16(data[IFLA_GRE_ENCAP_SPORT]);
+	}
+
+	if (data[IFLA_GRE_ENCAP_DPORT]) {
+		ret = true;
+		ipencap->dport = nla_get_be16(data[IFLA_GRE_ENCAP_DPORT]);
+	}
+
+	return ret;
 }
 
 static int ip6gre_newlink(struct net *src_net, struct net_device *dev,
@@ -1296,9 +1333,18 @@ static int ip6gre_newlink(struct net *src_net, struct net_device *dev,
 	struct ip6_tnl *nt;
 	struct net *net = dev_net(dev);
 	struct ip6gre_net *ign = net_generic(net, ip6gre_net_id);
+	struct ip_tunnel_encap ipencap;
 	int err;
 
 	nt = netdev_priv(dev);
+
+	if (ip6gre_netlink_encap_parms(data, &ipencap)) {
+		int err = ip6_tnl_encap_setup(nt, &ipencap);
+
+		if (err < 0)
+			return err;
+	}
+
 	ip6gre_netlink_parms(data, &nt->parms);
 
 	if (ip6gre_tunnel_find(net, &nt->parms, dev->type))
@@ -1315,11 +1361,15 @@ static int ip6gre_newlink(struct net *src_net, struct net_device *dev,
 	dev->hw_features	|= GRE6_FEATURES;
 
 	if (!(nt->parms.o_flags & TUNNEL_SEQ)) {
-		/* TCP segmentation offload is not supported when we
-		 * generate output sequences.
+		/* TCP offload with GRE SEQ is not supported, nor
+		 * can we support 2 levels of outer headers requiring
+		 * an update.
 		 */
-		dev->features    |= NETIF_F_GSO_SOFTWARE;
-		dev->hw_features |= NETIF_F_GSO_SOFTWARE;
+		if (!(nt->parms.o_flags & TUNNEL_CSUM) ||
+		    (nt->encap.type == TUNNEL_ENCAP_NONE)) {
+			dev->features    |= NETIF_F_GSO_SOFTWARE;
+			dev->hw_features |= NETIF_F_GSO_SOFTWARE;
+		}
 
 		/* Can use a lockless transmit, unless we generate
 		 * output sequences
@@ -1345,10 +1395,18 @@ static int ip6gre_changelink(struct net_device *dev, struct nlattr *tb[],
 	struct net *net = nt->net;
 	struct ip6gre_net *ign = net_generic(net, ip6gre_net_id);
 	struct __ip6_tnl_parm p;
+	struct ip_tunnel_encap ipencap;
 
 	if (dev == ign->fb_tunnel_dev)
 		return -EINVAL;
 
+	if (ip6gre_netlink_encap_parms(data, &ipencap)) {
+		int err = ip6_tnl_encap_setup(nt, &ipencap);
+
+		if (err < 0)
+			return err;
+	}
+
 	ip6gre_netlink_parms(data, &p);
 
 	t = ip6gre_tunnel_locate(net, &p, 0);
@@ -1400,6 +1458,14 @@ static size_t ip6gre_get_size(const struct net_device *dev)
 		nla_total_size(4) +
 		/* IFLA_GRE_FLAGS */
 		nla_total_size(4) +
+		/* IFLA_GRE_ENCAP_TYPE */
+		nla_total_size(2) +
+		/* IFLA_GRE_ENCAP_FLAGS */
+		nla_total_size(2) +
+		/* IFLA_GRE_ENCAP_SPORT */
+		nla_total_size(2) +
+		/* IFLA_GRE_ENCAP_DPORT */
+		nla_total_size(2) +
 		0;
 }
 
@@ -1422,6 +1488,17 @@ static int ip6gre_fill_info(struct sk_buff *skb, const struct net_device *dev)
 	    nla_put_be32(skb, IFLA_GRE_FLOWINFO, p->flowinfo) ||
 	    nla_put_u32(skb, IFLA_GRE_FLAGS, p->flags))
 		goto nla_put_failure;
+
+	if (nla_put_u16(skb, IFLA_GRE_ENCAP_TYPE,
+			t->encap.type) ||
+	    nla_put_be16(skb, IFLA_GRE_ENCAP_SPORT,
+			 t->encap.sport) ||
+	    nla_put_be16(skb, IFLA_GRE_ENCAP_DPORT,
+			 t->encap.dport) ||
+	    nla_put_u16(skb, IFLA_GRE_ENCAP_FLAGS,
+			t->encap.flags))
+		goto nla_put_failure;
+
 	return 0;
 
 nla_put_failure:
@@ -1440,6 +1517,10 @@ static const struct nla_policy ip6gre_policy[IFLA_GRE_MAX + 1] = {
 	[IFLA_GRE_ENCAP_LIMIT] = { .type = NLA_U8 },
 	[IFLA_GRE_FLOWINFO]    = { .type = NLA_U32 },
 	[IFLA_GRE_FLAGS]       = { .type = NLA_U32 },
+	[IFLA_GRE_ENCAP_TYPE]   = { .type = NLA_U16 },
+	[IFLA_GRE_ENCAP_FLAGS]  = { .type = NLA_U16 },
+	[IFLA_GRE_ENCAP_SPORT]  = { .type = NLA_U16 },
+	[IFLA_GRE_ENCAP_DPORT]  = { .type = NLA_U16 },
 };
 
 static struct rtnl_link_ops ip6gre_link_ops __read_mostly = {
diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c
index f185cbcda114..94611e450ec9 100644
--- a/net/ipv6/ip6_input.c
+++ b/net/ipv6/ip6_input.c
@@ -223,6 +223,7 @@ static int ip6_input_finish(struct net *net, struct sock *sk, struct sk_buff *sk
 	unsigned int nhoff;
 	int nexthdr;
 	bool raw;
+	bool have_final = false;
 
 	/*
 	 *	Parse extension headers
@@ -236,14 +237,27 @@ resubmit:
 	nhoff = IP6CB(skb)->nhoff;
 	nexthdr = skb_network_header(skb)[nhoff];
 
+resubmit_final:
 	raw = raw6_local_deliver(skb, nexthdr);
 	ipprot = rcu_dereference(inet6_protos[nexthdr]);
 	if (ipprot) {
 		int ret;
 
-		if (ipprot->flags & INET6_PROTO_FINAL) {
+		if (have_final) {
+			if (!(ipprot->flags & INET6_PROTO_FINAL)) {
+				/* Once we've seen a final protocol don't
+				 * allow encapsulation on any non-final
+				 * ones. This allows foo in UDP encapsulation
+				 * to work.
+				 */
+				goto discard;
+			}
+		} else if (ipprot->flags & INET6_PROTO_FINAL) {
 			const struct ipv6hdr *hdr;
 
+			/* Only do this once for first final protocol */
+			have_final = true;
+
 			/* Free reference early: we don't need it any more,
 			   and it may hold ip_conntrack module loaded
 			   indefinitely. */
@@ -263,10 +277,21 @@ resubmit:
 			goto discard;
 
 		ret = ipprot->handler(skb);
-		if (ret > 0)
-			goto resubmit;
-		else if (ret == 0)
+		if (ret > 0) {
+			if (ipprot->flags & INET6_PROTO_FINAL) {
+				/* Not an extension header, most likely UDP
+				 * encapsulation. Use return value as nexthdr
+				 * protocol not nhoff (which presumably is
+				 * not set by handler).
+				 */
+				nexthdr = ret;
+				goto resubmit_final;
+			} else {
+				goto resubmit;
+			}
+		} else if (ret == 0) {
 			__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDELIVERS);
+		}
 	} else {
 		if (!raw) {
 			if (xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) {
diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c
index f5eb184e1093..22e90e56b5a9 100644
--- a/net/ipv6/ip6_offload.c
+++ b/net/ipv6/ip6_offload.c
@@ -16,6 +16,7 @@
 
 #include <net/protocol.h>
 #include <net/ipv6.h>
+#include <net/inet_common.h>
 
 #include "ip6_offload.h"
 
@@ -69,24 +70,6 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb,
 	bool encap, udpfrag;
 	int nhoff;
 
-	if (unlikely(skb_shinfo(skb)->gso_type &
-		     ~(SKB_GSO_TCPV4 |
-		       SKB_GSO_UDP |
-		       SKB_GSO_DODGY |
-		       SKB_GSO_TCP_ECN |
-		       SKB_GSO_TCP_FIXEDID |
-		       SKB_GSO_TCPV6 |
-		       SKB_GSO_GRE |
-		       SKB_GSO_GRE_CSUM |
-		       SKB_GSO_IPIP |
-		       SKB_GSO_SIT |
-		       SKB_GSO_UDP_TUNNEL |
-		       SKB_GSO_UDP_TUNNEL_CSUM |
-		       SKB_GSO_TUNNEL_REMCSUM |
-		       SKB_GSO_PARTIAL |
-		       0)))
-		goto out;
-
 	skb_reset_network_header(skb);
 	nhoff = skb_network_header(skb) - skb_mac_header(skb);
 	if (unlikely(!pskb_may_pull(skb, sizeof(*ipv6h))))
@@ -104,7 +87,7 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb,
 	proto = ipv6_gso_pull_exthdrs(skb, ipv6h->nexthdr);
 
 	if (skb->encapsulation &&
-	    skb_shinfo(skb)->gso_type & (SKB_GSO_SIT|SKB_GSO_IPIP))
+	    skb_shinfo(skb)->gso_type & (SKB_GSO_IPXIP4 | SKB_GSO_IPXIP6))
 		udpfrag = proto == IPPROTO_UDP && encap;
 	else
 		udpfrag = proto == IPPROTO_UDP && !skb->encapsulation;
@@ -271,9 +254,11 @@ out:
 	return pp;
 }
 
-static struct sk_buff **sit_gro_receive(struct sk_buff **head,
-					struct sk_buff *skb)
+static struct sk_buff **sit_ip6ip6_gro_receive(struct sk_buff **head,
+					       struct sk_buff *skb)
 {
+	/* Common GRO receive for SIT and IP6IP6 */
+
 	if (NAPI_GRO_CB(skb)->encap_mark) {
 		NAPI_GRO_CB(skb)->flush = 1;
 		return NULL;
@@ -284,6 +269,21 @@ static struct sk_buff **sit_gro_receive(struct sk_buff **head,
 	return ipv6_gro_receive(head, skb);
 }
 
+static struct sk_buff **ip4ip6_gro_receive(struct sk_buff **head,
+					   struct sk_buff *skb)
+{
+	/* Common GRO receive for SIT and IP6IP6 */
+
+	if (NAPI_GRO_CB(skb)->encap_mark) {
+		NAPI_GRO_CB(skb)->flush = 1;
+		return NULL;
+	}
+
+	NAPI_GRO_CB(skb)->encap_mark = 1;
+
+	return inet_gro_receive(head, skb);
+}
+
 static int ipv6_gro_complete(struct sk_buff *skb, int nhoff)
 {
 	const struct net_offload *ops;
@@ -312,10 +312,24 @@ out_unlock:
 static int sit_gro_complete(struct sk_buff *skb, int nhoff)
 {
 	skb->encapsulation = 1;
-	skb_shinfo(skb)->gso_type |= SKB_GSO_SIT;
+	skb_shinfo(skb)->gso_type |= SKB_GSO_IPXIP4;
 	return ipv6_gro_complete(skb, nhoff);
 }
 
+static int ip6ip6_gro_complete(struct sk_buff *skb, int nhoff)
+{
+	skb->encapsulation = 1;
+	skb_shinfo(skb)->gso_type |= SKB_GSO_IPXIP6;
+	return ipv6_gro_complete(skb, nhoff);
+}
+
+static int ip4ip6_gro_complete(struct sk_buff *skb, int nhoff)
+{
+	skb->encapsulation = 1;
+	skb_shinfo(skb)->gso_type |= SKB_GSO_IPXIP6;
+	return inet_gro_complete(skb, nhoff);
+}
+
 static struct packet_offload ipv6_packet_offload __read_mostly = {
 	.type = cpu_to_be16(ETH_P_IPV6),
 	.callbacks = {
@@ -328,11 +342,26 @@ static struct packet_offload ipv6_packet_offload __read_mostly = {
 static const struct net_offload sit_offload = {
 	.callbacks = {
 		.gso_segment	= ipv6_gso_segment,
-		.gro_receive    = sit_gro_receive,
+		.gro_receive    = sit_ip6ip6_gro_receive,
 		.gro_complete   = sit_gro_complete,
 	},
 };
 
+static const struct net_offload ip4ip6_offload = {
+	.callbacks = {
+		.gso_segment	= inet_gso_segment,
+		.gro_receive    = ip4ip6_gro_receive,
+		.gro_complete   = ip4ip6_gro_complete,
+	},
+};
+
+static const struct net_offload ip6ip6_offload = {
+	.callbacks = {
+		.gso_segment	= ipv6_gso_segment,
+		.gro_receive    = sit_ip6ip6_gro_receive,
+		.gro_complete   = ip6ip6_gro_complete,
+	},
+};
 static int __init ipv6_offload_init(void)
 {
 
@@ -344,6 +373,8 @@ static int __init ipv6_offload_init(void)
 	dev_add_offload(&ipv6_packet_offload);
 
 	inet_add_offload(&sit_offload, IPPROTO_IPV6);
+	inet6_add_offload(&ip6ip6_offload, IPPROTO_IPV6);
+	inet6_add_offload(&ip4ip6_offload, IPPROTO_IPIP);
 
 	return 0;
 }
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index cbf127ae7c67..635b8d340cdb 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -1071,17 +1071,12 @@ struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
 					 const struct in6_addr *final_dst)
 {
 	struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
-	int err;
 
 	dst = ip6_sk_dst_check(sk, dst, fl6);
+	if (!dst)
+		dst = ip6_dst_lookup_flow(sk, fl6, final_dst);
 
-	err = ip6_dst_lookup_tail(sock_net(sk), sk, &dst, fl6);
-	if (err)
-		return ERR_PTR(err);
-	if (final_dst)
-		fl6->daddr = *final_dst;
-
-	return xfrm_lookup_route(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
+	return dst;
 }
 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
 
diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c
index e79330f214bd..7b0481e3738f 100644
--- a/net/ipv6/ip6_tunnel.c
+++ b/net/ipv6/ip6_tunnel.c
@@ -1010,7 +1010,8 @@ int ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev, __u8 dsfield,
 	struct dst_entry *dst = NULL, *ndst = NULL;
 	struct net_device *tdev;
 	int mtu;
-	unsigned int max_headroom = sizeof(struct ipv6hdr);
+	unsigned int psh_hlen = sizeof(struct ipv6hdr) + t->encap_hlen;
+	unsigned int max_headroom = psh_hlen;
 	int err = -1;
 
 	/* NBMA tunnel */
@@ -1063,7 +1064,7 @@ int ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev, __u8 dsfield,
 				     t->parms.name);
 		goto tx_err_dst_release;
 	}
-	mtu = dst_mtu(dst) - sizeof(*ipv6h);
+	mtu = dst_mtu(dst) - psh_hlen;
 	if (encap_limit >= 0) {
 		max_headroom += 8;
 		mtu -= 8;
@@ -1119,16 +1120,18 @@ int ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev, __u8 dsfield,
 		ipv6_push_nfrag_opts(skb, &opt.ops, &proto, NULL);
 	}
 
-	if (likely(!skb->encapsulation)) {
-		skb_reset_inner_headers(skb);
-		skb->encapsulation = 1;
-	}
-
+	/* Calculate max headroom for all the headers and adjust
+	 * needed_headroom if necessary.
+	 */
 	max_headroom = LL_RESERVED_SPACE(dst->dev) + sizeof(struct ipv6hdr)
-			+ dst->header_len;
+			+ dst->header_len + t->hlen;
 	if (max_headroom > dev->needed_headroom)
 		dev->needed_headroom = max_headroom;
 
+	err = ip6_tnl_encap(skb, t, &proto, fl6);
+	if (err)
+		return err;
+
 	skb_push(skb, sizeof(struct ipv6hdr));
 	skb_reset_network_header(skb);
 	ipv6h = ipv6_hdr(skb);
@@ -1180,6 +1183,11 @@ ip4ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev)
 	if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK)
 		fl6.flowi6_mark = skb->mark;
 
+	if (iptunnel_handle_offloads(skb, SKB_GSO_IPXIP6))
+		return -1;
+
+	skb_set_inner_ipproto(skb, IPPROTO_IPIP);
+
 	err = ip6_tnl_xmit(skb, dev, dsfield, &fl6, encap_limit, &mtu,
 			   IPPROTO_IPIP);
 	if (err != 0) {
@@ -1234,6 +1242,11 @@ ip6ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev)
 	if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK)
 		fl6.flowi6_mark = skb->mark;
 
+	if (iptunnel_handle_offloads(skb, SKB_GSO_IPXIP6))
+		return -1;
+
+	skb_set_inner_ipproto(skb, IPPROTO_IPV6);
+
 	err = ip6_tnl_xmit(skb, dev, dsfield, &fl6, encap_limit, &mtu,
 			   IPPROTO_IPV6);
 	if (err != 0) {
@@ -1280,6 +1293,7 @@ static void ip6_tnl_link_config(struct ip6_tnl *t)
 	struct net_device *dev = t->dev;
 	struct __ip6_tnl_parm *p = &t->parms;
 	struct flowi6 *fl6 = &t->fl.u.ip6;
+	int t_hlen;
 
 	memcpy(dev->dev_addr, &p->laddr, sizeof(struct in6_addr));
 	memcpy(dev->broadcast, &p->raddr, sizeof(struct in6_addr));
@@ -1303,6 +1317,10 @@ static void ip6_tnl_link_config(struct ip6_tnl *t)
 	else
 		dev->flags &= ~IFF_POINTOPOINT;
 
+	t->tun_hlen = 0;
+	t->hlen = t->encap_hlen + t->tun_hlen;
+	t_hlen = t->hlen + sizeof(struct ipv6hdr);
+
 	if (p->flags & IP6_TNL_F_CAP_XMIT) {
 		int strict = (ipv6_addr_type(&p->raddr) &
 			      (IPV6_ADDR_MULTICAST|IPV6_ADDR_LINKLOCAL));
@@ -1316,9 +1334,9 @@ static void ip6_tnl_link_config(struct ip6_tnl *t)
 
 		if (rt->dst.dev) {
 			dev->hard_header_len = rt->dst.dev->hard_header_len +
-				sizeof(struct ipv6hdr);
+				t_hlen;
 
-			dev->mtu = rt->dst.dev->mtu - sizeof(struct ipv6hdr);
+			dev->mtu = rt->dst.dev->mtu - t_hlen;
 			if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT))
 				dev->mtu -= 8;
 
@@ -1564,6 +1582,59 @@ int ip6_tnl_get_iflink(const struct net_device *dev)
 }
 EXPORT_SYMBOL(ip6_tnl_get_iflink);
 
+int ip6_tnl_encap_add_ops(const struct ip6_tnl_encap_ops *ops,
+			  unsigned int num)
+{
+	if (num >= MAX_IPTUN_ENCAP_OPS)
+		return -ERANGE;
+
+	return !cmpxchg((const struct ip6_tnl_encap_ops **)
+			&ip6tun_encaps[num],
+			NULL, ops) ? 0 : -1;
+}
+EXPORT_SYMBOL(ip6_tnl_encap_add_ops);
+
+int ip6_tnl_encap_del_ops(const struct ip6_tnl_encap_ops *ops,
+			  unsigned int num)
+{
+	int ret;
+
+	if (num >= MAX_IPTUN_ENCAP_OPS)
+		return -ERANGE;
+
+	ret = (cmpxchg((const struct ip6_tnl_encap_ops **)
+		       &ip6tun_encaps[num],
+		       ops, NULL) == ops) ? 0 : -1;
+
+	synchronize_net();
+
+	return ret;
+}
+EXPORT_SYMBOL(ip6_tnl_encap_del_ops);
+
+int ip6_tnl_encap_setup(struct ip6_tnl *t,
+			struct ip_tunnel_encap *ipencap)
+{
+	int hlen;
+
+	memset(&t->encap, 0, sizeof(t->encap));
+
+	hlen = ip6_encap_hlen(ipencap);
+	if (hlen < 0)
+		return hlen;
+
+	t->encap.type = ipencap->type;
+	t->encap.sport = ipencap->sport;
+	t->encap.dport = ipencap->dport;
+	t->encap.flags = ipencap->flags;
+
+	t->encap_hlen = hlen;
+	t->hlen = t->encap_hlen + t->tun_hlen;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(ip6_tnl_encap_setup);
+
 static const struct net_device_ops ip6_tnl_netdev_ops = {
 	.ndo_init	= ip6_tnl_dev_init,
 	.ndo_uninit	= ip6_tnl_dev_uninit,
@@ -1574,6 +1645,11 @@ static const struct net_device_ops ip6_tnl_netdev_ops = {
 	.ndo_get_iflink = ip6_tnl_get_iflink,
 };
 
+#define IPXIPX_FEATURES (NETIF_F_SG |		\
+			 NETIF_F_FRAGLIST |	\
+			 NETIF_F_HIGHDMA |	\
+			 NETIF_F_GSO_SOFTWARE |	\
+			 NETIF_F_HW_CSUM)
 
 /**
  * ip6_tnl_dev_setup - setup virtual tunnel device
@@ -1585,20 +1661,18 @@ static const struct net_device_ops ip6_tnl_netdev_ops = {
 
 static void ip6_tnl_dev_setup(struct net_device *dev)
 {
-	struct ip6_tnl *t;
-
 	dev->netdev_ops = &ip6_tnl_netdev_ops;
 	dev->destructor = ip6_dev_free;
 
 	dev->type = ARPHRD_TUNNEL6;
-	dev->hard_header_len = LL_MAX_HEADER + sizeof(struct ipv6hdr);
-	dev->mtu = ETH_DATA_LEN - sizeof(struct ipv6hdr);
-	t = netdev_priv(dev);
-	if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT))
-		dev->mtu -= 8;
 	dev->flags |= IFF_NOARP;
 	dev->addr_len = sizeof(struct in6_addr);
+	dev->features |= NETIF_F_LLTX;
 	netif_keep_dst(dev);
+
+	dev->features		|= IPXIPX_FEATURES;
+	dev->hw_features	|= IPXIPX_FEATURES;
+
 	/* This perm addr will be used as interface identifier by IPv6 */
 	dev->addr_assign_type = NET_ADDR_RANDOM;
 	eth_random_addr(dev->perm_addr);
@@ -1615,6 +1689,7 @@ ip6_tnl_dev_init_gen(struct net_device *dev)
 {
 	struct ip6_tnl *t = netdev_priv(dev);
 	int ret;
+	int t_hlen;
 
 	t->dev = dev;
 	t->net = dev_net(dev);
@@ -1630,8 +1705,15 @@ ip6_tnl_dev_init_gen(struct net_device *dev)
 	if (ret)
 		goto destroy_dst;
 
-	t->hlen = 0;
 	t->tun_hlen = 0;
+	t->hlen = t->encap_hlen + t->tun_hlen;
+	t_hlen = t->hlen + sizeof(struct ipv6hdr);
+
+	dev->type = ARPHRD_TUNNEL6;
+	dev->hard_header_len = LL_MAX_HEADER + t_hlen;
+	dev->mtu = ETH_DATA_LEN - t_hlen;
+	if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT))
+		dev->mtu -= 8;
 
 	return 0;
 
@@ -1729,13 +1811,55 @@ static void ip6_tnl_netlink_parms(struct nlattr *data[],
 		parms->proto = nla_get_u8(data[IFLA_IPTUN_PROTO]);
 }
 
+static bool ip6_tnl_netlink_encap_parms(struct nlattr *data[],
+					struct ip_tunnel_encap *ipencap)
+{
+	bool ret = false;
+
+	memset(ipencap, 0, sizeof(*ipencap));
+
+	if (!data)
+		return ret;
+
+	if (data[IFLA_IPTUN_ENCAP_TYPE]) {
+		ret = true;
+		ipencap->type = nla_get_u16(data[IFLA_IPTUN_ENCAP_TYPE]);
+	}
+
+	if (data[IFLA_IPTUN_ENCAP_FLAGS]) {
+		ret = true;
+		ipencap->flags = nla_get_u16(data[IFLA_IPTUN_ENCAP_FLAGS]);
+	}
+
+	if (data[IFLA_IPTUN_ENCAP_SPORT]) {
+		ret = true;
+		ipencap->sport = nla_get_be16(data[IFLA_IPTUN_ENCAP_SPORT]);
+	}
+
+	if (data[IFLA_IPTUN_ENCAP_DPORT]) {
+		ret = true;
+		ipencap->dport = nla_get_be16(data[IFLA_IPTUN_ENCAP_DPORT]);
+	}
+
+	return ret;
+}
+
 static int ip6_tnl_newlink(struct net *src_net, struct net_device *dev,
 			   struct nlattr *tb[], struct nlattr *data[])
 {
 	struct net *net = dev_net(dev);
 	struct ip6_tnl *nt, *t;
+	struct ip_tunnel_encap ipencap;
 
 	nt = netdev_priv(dev);
+
+	if (ip6_tnl_netlink_encap_parms(data, &ipencap)) {
+		int err = ip6_tnl_encap_setup(nt, &ipencap);
+
+		if (err < 0)
+			return err;
+	}
+
 	ip6_tnl_netlink_parms(data, &nt->parms);
 
 	t = ip6_tnl_locate(net, &nt->parms, 0);
@@ -1752,10 +1876,17 @@ static int ip6_tnl_changelink(struct net_device *dev, struct nlattr *tb[],
 	struct __ip6_tnl_parm p;
 	struct net *net = t->net;
 	struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id);
+	struct ip_tunnel_encap ipencap;
 
 	if (dev == ip6n->fb_tnl_dev)
 		return -EINVAL;
 
+	if (ip6_tnl_netlink_encap_parms(data, &ipencap)) {
+		int err = ip6_tnl_encap_setup(t, &ipencap);
+
+		if (err < 0)
+			return err;
+	}
 	ip6_tnl_netlink_parms(data, &p);
 
 	t = ip6_tnl_locate(net, &p, 0);
@@ -1796,6 +1927,14 @@ static size_t ip6_tnl_get_size(const struct net_device *dev)
 		nla_total_size(4) +
 		/* IFLA_IPTUN_PROTO */
 		nla_total_size(1) +
+		/* IFLA_IPTUN_ENCAP_TYPE */
+		nla_total_size(2) +
+		/* IFLA_IPTUN_ENCAP_FLAGS */
+		nla_total_size(2) +
+		/* IFLA_IPTUN_ENCAP_SPORT */
+		nla_total_size(2) +
+		/* IFLA_IPTUN_ENCAP_DPORT */
+		nla_total_size(2) +
 		0;
 }
 
@@ -1813,6 +1952,17 @@ static int ip6_tnl_fill_info(struct sk_buff *skb, const struct net_device *dev)
 	    nla_put_u32(skb, IFLA_IPTUN_FLAGS, parm->flags) ||
 	    nla_put_u8(skb, IFLA_IPTUN_PROTO, parm->proto))
 		goto nla_put_failure;
+
+	if (nla_put_u16(skb, IFLA_IPTUN_ENCAP_TYPE,
+			tunnel->encap.type) ||
+	nla_put_be16(skb, IFLA_IPTUN_ENCAP_SPORT,
+		     tunnel->encap.sport) ||
+	nla_put_be16(skb, IFLA_IPTUN_ENCAP_DPORT,
+		     tunnel->encap.dport) ||
+	nla_put_u16(skb, IFLA_IPTUN_ENCAP_FLAGS,
+		    tunnel->encap.flags))
+		goto nla_put_failure;
+
 	return 0;
 
 nla_put_failure:
@@ -1836,6 +1986,10 @@ static const struct nla_policy ip6_tnl_policy[IFLA_IPTUN_MAX + 1] = {
 	[IFLA_IPTUN_FLOWINFO]		= { .type = NLA_U32 },
 	[IFLA_IPTUN_FLAGS]		= { .type = NLA_U32 },
 	[IFLA_IPTUN_PROTO]		= { .type = NLA_U8 },
+	[IFLA_IPTUN_ENCAP_TYPE]		= { .type = NLA_U16 },
+	[IFLA_IPTUN_ENCAP_FLAGS]	= { .type = NLA_U16 },
+	[IFLA_IPTUN_ENCAP_SPORT]	= { .type = NLA_U16 },
+	[IFLA_IPTUN_ENCAP_DPORT]	= { .type = NLA_U16 },
 };
 
 static struct rtnl_link_ops ip6_link_ops __read_mostly = {
diff --git a/net/ipv6/netfilter/nf_dup_ipv6.c b/net/ipv6/netfilter/nf_dup_ipv6.c
index 6989c70ae29f..4a84b5ad9ecb 100644
--- a/net/ipv6/netfilter/nf_dup_ipv6.c
+++ b/net/ipv6/netfilter/nf_dup_ipv6.c
@@ -33,6 +33,7 @@ static bool nf_dup_ipv6_route(struct net *net, struct sk_buff *skb,
 	fl6.daddr = *gw;
 	fl6.flowlabel = (__force __be32)(((iph->flow_lbl[0] & 0xF) << 16) |
 			(iph->flow_lbl[1] << 8) | iph->flow_lbl[2]);
+	fl6.flowi6_flags = FLOWI_FLAG_KNOWN_NH;
 	dst = ip6_route_output(net, NULL, &fl6);
 	if (dst->error) {
 		dst_release(dst);
diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c
index a13d8c114ccb..0a5a255277e5 100644
--- a/net/ipv6/sit.c
+++ b/net/ipv6/sit.c
@@ -913,7 +913,7 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb,
 		goto tx_error;
 	}
 
-	if (iptunnel_handle_offloads(skb, SKB_GSO_SIT)) {
+	if (iptunnel_handle_offloads(skb, SKB_GSO_IPXIP4)) {
 		ip_rt_put(rt);
 		goto tx_error;
 	}
@@ -1000,7 +1000,7 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
 	struct ip_tunnel *tunnel = netdev_priv(dev);
 	const struct iphdr  *tiph = &tunnel->parms.iph;
 
-	if (iptunnel_handle_offloads(skb, SKB_GSO_IPIP))
+	if (iptunnel_handle_offloads(skb, SKB_GSO_IPXIP4))
 		goto tx_error;
 
 	skb_set_inner_ipproto(skb, IPPROTO_IPIP);
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 79e33e02f11a..f36c2d076fce 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1721,7 +1721,9 @@ static void get_tcp6_sock(struct seq_file *seq, struct sock *sp, int i)
 	destp = ntohs(inet->inet_dport);
 	srcp  = ntohs(inet->inet_sport);
 
-	if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
+	if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
+	    icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
+	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
 		timer_active	= 1;
 		timer_expires	= icsk->icsk_timeout;
 	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 2ba6a77a8815..f421c9f23c5b 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -617,7 +617,7 @@ int udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 
 		/* if we're overly short, let UDP handle it */
 		encap_rcv = ACCESS_ONCE(up->encap_rcv);
-		if (skb->len > sizeof(struct udphdr) && encap_rcv) {
+		if (encap_rcv) {
 			int ret;
 
 			/* Verify checksum before giving to encap */
@@ -653,12 +653,12 @@ int udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 		}
 	}
 
-	if (rcu_access_pointer(sk->sk_filter)) {
-		if (udp_lib_checksum_complete(skb))
-			goto csum_error;
-		if (sk_filter(sk, skb))
-			goto drop;
-	}
+	if (rcu_access_pointer(sk->sk_filter) &&
+	    udp_lib_checksum_complete(skb))
+		goto csum_error;
+
+	if (sk_filter(sk, skb))
+		goto drop;
 
 	udp_csum_pull_header(skb);
 	if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c
index 5429f6bcf047..ac858c480f2f 100644
--- a/net/ipv6/udp_offload.c
+++ b/net/ipv6/udp_offload.c
@@ -36,19 +36,6 @@ static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb,
 
 	if (skb_gso_ok(skb, features | NETIF_F_GSO_ROBUST)) {
 		/* Packet is from an untrusted source, reset gso_segs. */
-		int type = skb_shinfo(skb)->gso_type;
-
-		if (unlikely(type & ~(SKB_GSO_UDP |
-				      SKB_GSO_DODGY |
-				      SKB_GSO_UDP_TUNNEL |
-				      SKB_GSO_UDP_TUNNEL_CSUM |
-				      SKB_GSO_TUNNEL_REMCSUM |
-				      SKB_GSO_GRE |
-				      SKB_GSO_GRE_CSUM |
-				      SKB_GSO_IPIP |
-				      SKB_GSO_SIT) ||
-			     !(type & (SKB_GSO_UDP))))
-			goto out;
 
 		skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(skb->len, mss);
 
diff --git a/net/irda/ircomm/ircomm_tty.c b/net/irda/ircomm/ircomm_tty.c
index da126ee6d218..873c4b707d6a 100644
--- a/net/irda/ircomm/ircomm_tty.c
+++ b/net/irda/ircomm/ircomm_tty.c
@@ -220,10 +220,11 @@ static int ircomm_tty_startup(struct ircomm_tty_cb *self)
 	IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return -1;);
 
 	/* Check if already open */
-	if (test_and_set_bit(ASYNCB_INITIALIZED, &self->port.flags)) {
+	if (tty_port_initialized(&self->port)) {
 		pr_debug("%s(), already open so break out!\n", __func__);
 		return 0;
 	}
+	tty_port_set_initialized(&self->port, 1);
 
 	/* Register with IrCOMM */
 	irda_notify_init(&notify);
@@ -257,7 +258,7 @@ static int ircomm_tty_startup(struct ircomm_tty_cb *self)
 
 	return 0;
 err:
-	clear_bit(ASYNCB_INITIALIZED, &self->port.flags);
+	tty_port_set_initialized(&self->port, 0);
 	return ret;
 }
 
@@ -280,8 +281,8 @@ static int ircomm_tty_block_til_ready(struct ircomm_tty_cb *self,
 	 * If non-blocking mode is set, or the port is not enabled,
 	 * then make the check up front and then exit.
 	 */
-	if (test_bit(TTY_IO_ERROR, &tty->flags)) {
-		port->flags |= ASYNC_NORMAL_ACTIVE;
+	if (tty_io_error(tty)) {
+		tty_port_set_active(port, 1);
 		return 0;
 	}
 
@@ -289,7 +290,7 @@ static int ircomm_tty_block_til_ready(struct ircomm_tty_cb *self,
 		/* nonblock mode is set */
 		if (C_BAUD(tty))
 			tty_port_raise_dtr_rts(port);
-		port->flags |= ASYNC_NORMAL_ACTIVE;
+		tty_port_set_active(port, 1);
 		pr_debug("%s(), O_NONBLOCK requested!\n", __func__);
 		return 0;
 	}
@@ -318,13 +319,12 @@ static int ircomm_tty_block_til_ready(struct ircomm_tty_cb *self,
 	spin_unlock_irqrestore(&port->lock, flags);
 
 	while (1) {
-		if (C_BAUD(tty) && test_bit(ASYNCB_INITIALIZED, &port->flags))
+		if (C_BAUD(tty) && tty_port_initialized(port))
 			tty_port_raise_dtr_rts(port);
 
 		set_current_state(TASK_INTERRUPTIBLE);
 
-		if (tty_hung_up_p(filp) ||
-		    !test_bit(ASYNCB_INITIALIZED, &port->flags)) {
+		if (tty_hung_up_p(filp) || !tty_port_initialized(port)) {
 			retval = (port->flags & ASYNC_HUP_NOTIFY) ?
 					-EAGAIN : -ERESTARTSYS;
 			break;
@@ -365,7 +365,7 @@ static int ircomm_tty_block_til_ready(struct ircomm_tty_cb *self,
 		 __FILE__, __LINE__, tty->driver->name, port->count);
 
 	if (!retval)
-		port->flags |= ASYNC_NORMAL_ACTIVE;
+		tty_port_set_active(port, 1);
 
 	return retval;
 }
@@ -876,8 +876,9 @@ static void ircomm_tty_shutdown(struct ircomm_tty_cb *self)
 	IRDA_ASSERT(self != NULL, return;);
 	IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return;);
 
-	if (!test_and_clear_bit(ASYNCB_INITIALIZED, &self->port.flags))
+	if (!tty_port_initialized(&self->port))
 		return;
+	tty_port_set_initialized(&self->port, 0);
 
 	ircomm_tty_detach_cable(self);
 
@@ -925,7 +926,6 @@ static void ircomm_tty_hangup(struct tty_struct *tty)
 	ircomm_tty_shutdown(self);
 
 	spin_lock_irqsave(&port->lock, flags);
-	port->flags &= ~ASYNC_NORMAL_ACTIVE;
 	if (port->tty) {
 		set_bit(TTY_IO_ERROR, &port->tty->flags);
 		tty_kref_put(port->tty);
@@ -933,6 +933,7 @@ static void ircomm_tty_hangup(struct tty_struct *tty)
 	port->tty = NULL;
 	port->count = 0;
 	spin_unlock_irqrestore(&port->lock, flags);
+	tty_port_set_active(port, 0);
 
 	wake_up_interruptible(&port->open_wait);
 }
@@ -999,7 +1000,7 @@ void ircomm_tty_check_modem_status(struct ircomm_tty_cb *self)
 	if (status & IRCOMM_DCE_DELTA_ANY) {
 		/*wake_up_interruptible(&self->delta_msr_wait);*/
 	}
-	if ((self->port.flags & ASYNC_CHECK_CD) && (status & IRCOMM_DELTA_CD)) {
+	if (tty_port_check_carrier(&self->port) && (status & IRCOMM_DELTA_CD)) {
 		pr_debug("%s(), ircomm%d CD now %s...\n", __func__ , self->line,
 			 (status & IRCOMM_CD) ? "on" : "off");
 
@@ -1255,11 +1256,11 @@ static void ircomm_tty_line_info(struct ircomm_tty_cb *self, struct seq_file *m)
 		seq_printf(m, "%cASYNC_CTS_FLOW", sep);
 		sep = '|';
 	}
-	if (self->port.flags & ASYNC_CHECK_CD) {
+	if (tty_port_check_carrier(&self->port)) {
 		seq_printf(m, "%cASYNC_CHECK_CD", sep);
 		sep = '|';
 	}
-	if (self->port.flags & ASYNC_INITIALIZED) {
+	if (tty_port_initialized(&self->port)) {
 		seq_printf(m, "%cASYNC_INITIALIZED", sep);
 		sep = '|';
 	}
@@ -1267,7 +1268,7 @@ static void ircomm_tty_line_info(struct ircomm_tty_cb *self, struct seq_file *m)
 		seq_printf(m, "%cASYNC_LOW_LATENCY", sep);
 		sep = '|';
 	}
-	if (self->port.flags & ASYNC_NORMAL_ACTIVE) {
+	if (tty_port_active(&self->port)) {
 		seq_printf(m, "%cASYNC_NORMAL_ACTIVE", sep);
 		sep = '|';
 	}
diff --git a/net/irda/ircomm/ircomm_tty_attach.c b/net/irda/ircomm/ircomm_tty_attach.c
index 61137f8b5293..0a411019c098 100644
--- a/net/irda/ircomm/ircomm_tty_attach.c
+++ b/net/irda/ircomm/ircomm_tty_attach.c
@@ -968,7 +968,7 @@ static int ircomm_tty_state_ready(struct ircomm_tty_cb *self,
 		ircomm_tty_next_state(self, IRCOMM_TTY_SEARCH);
 		ircomm_tty_start_watchdog_timer(self, 3*HZ);
 
-		if (self->port.flags & ASYNC_CHECK_CD) {
+		if (tty_port_check_carrier(&self->port)) {
 			/* Drop carrier */
 			self->settings.dce = IRCOMM_DELTA_CD;
 			ircomm_tty_check_modem_status(self);
diff --git a/net/irda/ircomm/ircomm_tty_ioctl.c b/net/irda/ircomm/ircomm_tty_ioctl.c
index d3687aaa23de..d4fdf8f7b471 100644
--- a/net/irda/ircomm/ircomm_tty_ioctl.c
+++ b/net/irda/ircomm/ircomm_tty_ioctl.c
@@ -86,21 +86,17 @@ static void ircomm_tty_change_speed(struct ircomm_tty_cb *self,
 	ircomm_param_request(self, IRCOMM_DATA_RATE, FALSE);
 
 	/* CTS flow control flag and modem status interrupts */
+	tty_port_set_cts_flow(&self->port, cflag & CRTSCTS);
 	if (cflag & CRTSCTS) {
-		self->port.flags |= ASYNC_CTS_FLOW;
 		self->settings.flow_control |= IRCOMM_RTS_CTS_IN;
 		/* This got me. Bummer. Jean II */
 		if (self->service_type == IRCOMM_3_WIRE_RAW)
 			net_warn_ratelimited("%s(), enabling RTS/CTS on link that doesn't support it (3-wire-raw)\n",
 					     __func__);
 	} else {
-		self->port.flags &= ~ASYNC_CTS_FLOW;
 		self->settings.flow_control &= ~IRCOMM_RTS_CTS_IN;
 	}
-	if (cflag & CLOCAL)
-		self->port.flags &= ~ASYNC_CHECK_CD;
-	else
-		self->port.flags |= ASYNC_CHECK_CD;
+	tty_port_set_check_carrier(&self->port, ~cflag & CLOCAL);
 #if 0
 	/*
 	 * Set up parity check flag
@@ -166,7 +162,7 @@ void ircomm_tty_set_termios(struct tty_struct *tty,
 	/* Handle transition away from B0 status */
 	if (!(old_termios->c_cflag & CBAUD) && (cflag & CBAUD)) {
 		self->settings.dte |= IRCOMM_DTR;
-		if (!C_CRTSCTS(tty) || !test_bit(TTY_THROTTLED, &tty->flags))
+		if (!C_CRTSCTS(tty) || !tty_throttled(tty))
 			self->settings.dte |= IRCOMM_RTS;
 		ircomm_param_request(self, IRCOMM_DTE, TRUE);
 	}
@@ -190,7 +186,7 @@ int ircomm_tty_tiocmget(struct tty_struct *tty)
 	struct ircomm_tty_cb *self = (struct ircomm_tty_cb *) tty->driver_data;
 	unsigned int result;
 
-	if (tty->flags & (1 << TTY_IO_ERROR))
+	if (tty_io_error(tty))
 		return -EIO;
 
 	result =  ((self->settings.dte & IRCOMM_RTS) ? TIOCM_RTS : 0)
@@ -213,7 +209,7 @@ int ircomm_tty_tiocmset(struct tty_struct *tty,
 {
 	struct ircomm_tty_cb *self = (struct ircomm_tty_cb *) tty->driver_data;
 
-	if (tty->flags & (1 << TTY_IO_ERROR))
+	if (tty_io_error(tty))
 		return -EIO;
 
 	IRDA_ASSERT(self != NULL, return -1;);
@@ -328,7 +324,7 @@ static int ircomm_tty_set_serial_info(struct ircomm_tty_cb *self,
 
  check_and_exit:
 
-	if (self->flags & ASYNC_INITIALIZED) {
+	if (tty_port_initialized(self)) {
 		if (((old_state.flags & ASYNC_SPD_MASK) !=
 		     (self->flags & ASYNC_SPD_MASK)) ||
 		    (old_driver.custom_divisor != driver->custom_divisor)) {
@@ -362,7 +358,7 @@ int ircomm_tty_ioctl(struct tty_struct *tty,
 	if ((cmd != TIOCGSERIAL) && (cmd != TIOCSSERIAL) &&
 	    (cmd != TIOCSERCONFIG) && (cmd != TIOCSERGSTRUCT) &&
 	    (cmd != TIOCMIWAIT) && (cmd != TIOCGICOUNT)) {
-		if (tty->flags & (1 << TTY_IO_ERROR))
+		if (tty_io_error(tty))
 		    return -EIO;
 	}
 
diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c
index 40662d73204f..0b68ba730a06 100644
--- a/net/kcm/kcmsock.c
+++ b/net/kcm/kcmsock.c
@@ -1483,7 +1483,7 @@ static ssize_t kcm_splice_read(struct socket *sock, loff_t *ppos,
 	long timeo;
 	struct kcm_rx_msg *rxm;
 	int err = 0;
-	size_t copied;
+	ssize_t copied;
 	struct sk_buff *skb;
 
 	/* Only support splice for SOCKSEQPACKET */
diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c
index 6edfa9980314..1e40dacaa137 100644
--- a/net/l2tp/l2tp_core.c
+++ b/net/l2tp/l2tp_core.c
@@ -1581,7 +1581,7 @@ int l2tp_tunnel_create(struct net *net, int fd, int version, u32 tunnel_id, u32
 	/* Mark socket as an encapsulation socket. See net/ipv4/udp.c */
 	tunnel->encap = encap;
 	if (encap == L2TP_ENCAPTYPE_UDP) {
-		struct udp_tunnel_sock_cfg udp_cfg;
+		struct udp_tunnel_sock_cfg udp_cfg = { };
 
 		udp_cfg.sk_user_data = tunnel;
 		udp_cfg.encap_type = UDP_ENCAP_L2TPINUDP;
diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c
index c6f5df1bed12..6c54e03fe9c1 100644
--- a/net/l2tp/l2tp_ip6.c
+++ b/net/l2tp/l2tp_ip6.c
@@ -128,6 +128,7 @@ static inline struct sock *l2tp_ip6_bind_lookup(struct net *net,
  */
 static int l2tp_ip6_recv(struct sk_buff *skb)
 {
+	struct net *net = dev_net(skb->dev);
 	struct sock *sk;
 	u32 session_id;
 	u32 tunnel_id;
@@ -154,7 +155,7 @@ static int l2tp_ip6_recv(struct sk_buff *skb)
 	}
 
 	/* Ok, this is a data packet. Lookup the session. */
-	session = l2tp_session_find(&init_net, NULL, session_id);
+	session = l2tp_session_find(net, NULL, session_id);
 	if (session == NULL)
 		goto discard;
 
@@ -188,14 +189,14 @@ pass_up:
 		goto discard;
 
 	tunnel_id = ntohl(*(__be32 *) &skb->data[4]);
-	tunnel = l2tp_tunnel_find(&init_net, tunnel_id);
+	tunnel = l2tp_tunnel_find(net, tunnel_id);
 	if (tunnel != NULL)
 		sk = tunnel->sock;
 	else {
 		struct ipv6hdr *iph = ipv6_hdr(skb);
 
 		read_lock_bh(&l2tp_ip6_lock);
-		sk = __l2tp_ip6_bind_lookup(&init_net, &iph->daddr,
+		sk = __l2tp_ip6_bind_lookup(net, &iph->daddr,
 					    0, tunnel_id);
 		read_unlock_bh(&l2tp_ip6_lock);
 	}
@@ -263,6 +264,7 @@ static int l2tp_ip6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 	struct inet_sock *inet = inet_sk(sk);
 	struct ipv6_pinfo *np = inet6_sk(sk);
 	struct sockaddr_l2tpip6 *addr = (struct sockaddr_l2tpip6 *) uaddr;
+	struct net *net = sock_net(sk);
 	__be32 v4addr = 0;
 	int addr_type;
 	int err;
@@ -286,7 +288,7 @@ static int l2tp_ip6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 
 	err = -EADDRINUSE;
 	read_lock_bh(&l2tp_ip6_lock);
-	if (__l2tp_ip6_bind_lookup(&init_net, &addr->l2tp_addr,
+	if (__l2tp_ip6_bind_lookup(net, &addr->l2tp_addr,
 				   sk->sk_bound_dev_if, addr->l2tp_conn_id))
 		goto out_in_use;
 	read_unlock_bh(&l2tp_ip6_lock);
@@ -456,7 +458,7 @@ static int l2tp_ip6_backlog_recv(struct sock *sk, struct sk_buff *skb)
 	return 0;
 
 drop:
-	IP_INC_STATS(&init_net, IPSTATS_MIB_INDISCARDS);
+	IP_INC_STATS(sock_net(sk), IPSTATS_MIB_INDISCARDS);
 	kfree_skb(skb);
 	return -1;
 }
diff --git a/net/lapb/lapb_in.c b/net/lapb/lapb_in.c
index 5dba899131b3..182470847fcf 100644
--- a/net/lapb/lapb_in.c
+++ b/net/lapb/lapb_in.c
@@ -444,10 +444,9 @@ static void lapb_state3_machine(struct lapb_cb *lapb, struct sk_buff *skb,
 		break;
 
 	case LAPB_FRMR:
-		lapb_dbg(1, "(%p) S3 RX FRMR(%d) %02X %02X %02X %02X %02X\n",
+		lapb_dbg(1, "(%p) S3 RX FRMR(%d) %5ph\n",
 			 lapb->dev, frame->pf,
-			 skb->data[0], skb->data[1], skb->data[2],
-			 skb->data[3], skb->data[4]);
+			 skb->data);
 		lapb_establish_data_link(lapb);
 		lapb_dbg(0, "(%p) S3 -> S1\n", lapb->dev);
 		lapb_requeue_frames(lapb);
diff --git a/net/lapb/lapb_out.c b/net/lapb/lapb_out.c
index ba4d015bd1a6..482c94d9d958 100644
--- a/net/lapb/lapb_out.c
+++ b/net/lapb/lapb_out.c
@@ -148,9 +148,7 @@ void lapb_transmit_buffer(struct lapb_cb *lapb, struct sk_buff *skb, int type)
 		}
 	}
 
-	lapb_dbg(2, "(%p) S%d TX %02X %02X %02X\n",
-		 lapb->dev, lapb->state,
-		 skb->data[0], skb->data[1], skb->data[2]);
+	lapb_dbg(2, "(%p) S%d TX %3ph\n", lapb->dev, lapb->state, skb->data);
 
 	if (!lapb_data_transmit(lapb, skb))
 		kfree_skb(skb);
diff --git a/net/lapb/lapb_subr.c b/net/lapb/lapb_subr.c
index 9d0a426eccbb..3c1914df641f 100644
--- a/net/lapb/lapb_subr.c
+++ b/net/lapb/lapb_subr.c
@@ -113,9 +113,7 @@ int lapb_decode(struct lapb_cb *lapb, struct sk_buff *skb,
 {
 	frame->type = LAPB_ILLEGAL;
 
-	lapb_dbg(2, "(%p) S%d RX %02X %02X %02X\n",
-		 lapb->dev, lapb->state,
-		 skb->data[0], skb->data[1], skb->data[2]);
+	lapb_dbg(2, "(%p) S%d RX %3ph\n", lapb->dev, lapb->state, skb->data);
 
 	/* We always need to look at 2 bytes, sometimes we need
 	 * to look at 3 and those cases are handled below.
@@ -284,10 +282,9 @@ void lapb_transmit_frmr(struct lapb_cb *lapb)
 		dptr++;
 		*dptr++ = lapb->frmr_type;
 
-		lapb_dbg(1, "(%p) S%d TX FRMR %02X %02X %02X %02X %02X\n",
+		lapb_dbg(1, "(%p) S%d TX FRMR %5ph\n",
 			 lapb->dev, lapb->state,
-			 skb->data[1], skb->data[2], skb->data[3],
-			 skb->data[4], skb->data[5]);
+			 &skb->data[1]);
 	} else {
 		dptr    = skb_put(skb, 4);
 		*dptr++ = LAPB_FRMR;
@@ -299,9 +296,8 @@ void lapb_transmit_frmr(struct lapb_cb *lapb)
 		dptr++;
 		*dptr++ = lapb->frmr_type;
 
-		lapb_dbg(1, "(%p) S%d TX FRMR %02X %02X %02X\n",
-			 lapb->dev, lapb->state, skb->data[1],
-			 skb->data[2], skb->data[3]);
+		lapb_dbg(1, "(%p) S%d TX FRMR %3ph\n",
+			 lapb->dev, lapb->state, &skb->data[1]);
 	}
 
 	lapb_transmit_buffer(lapb, skb, LAPB_RESPONSE);
diff --git a/net/mac80211/mesh.c b/net/mac80211/mesh.c
index 4c6404e1ad6e..21b1fdf5d01d 100644
--- a/net/mac80211/mesh.c
+++ b/net/mac80211/mesh.c
@@ -161,6 +161,10 @@ void mesh_sta_cleanup(struct sta_info *sta)
 		del_timer_sync(&sta->mesh->plink_timer);
 	}
 
+	/* make sure no readers can access nexthop sta from here on */
+	mesh_path_flush_by_nexthop(sta);
+	synchronize_net();
+
 	if (changed)
 		ieee80211_mbss_info_change_notify(sdata, changed);
 }
diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h
index c8b8ccc370eb..78b0ef32dddd 100644
--- a/net/mac80211/sta_info.h
+++ b/net/mac80211/sta_info.h
@@ -280,7 +280,7 @@ struct ieee80211_fast_tx {
 	u8 sa_offs, da_offs, pn_offs;
 	u8 band;
 	u8 hdr[30 + 2 + IEEE80211_FAST_XMIT_MAX_IV +
-	       sizeof(rfc1042_header)];
+	       sizeof(rfc1042_header)] __aligned(2);
 
 	struct rcu_head rcu_head;
 };
diff --git a/net/mpls/mpls_gso.c b/net/mpls/mpls_gso.c
index bbcf60465e5c..2055e57ed1c3 100644
--- a/net/mpls/mpls_gso.c
+++ b/net/mpls/mpls_gso.c
@@ -26,15 +26,6 @@ static struct sk_buff *mpls_gso_segment(struct sk_buff *skb,
 	netdev_features_t mpls_features;
 	__be16 mpls_protocol;
 
-	if (unlikely(skb_shinfo(skb)->gso_type &
-				~(SKB_GSO_TCPV4 |
-				  SKB_GSO_TCPV6 |
-				  SKB_GSO_UDP |
-				  SKB_GSO_DODGY |
-				  SKB_GSO_TCP_FIXEDID |
-				  SKB_GSO_TCP_ECN)))
-		goto out;
-
 	/* Setup inner SKB. */
 	mpls_protocol = skb->protocol;
 	skb->protocol = skb->inner_protocol;
@@ -57,7 +48,7 @@ static struct sk_buff *mpls_gso_segment(struct sk_buff *skb,
 	 * skb_mac_gso_segment(), an indirect caller of this function.
 	 */
 	__skb_pull(skb, skb->data - skb_mac_header(skb));
-out:
+
 	return segs;
 }
 
diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c
index 2cb3c626cd43..096a45103f14 100644
--- a/net/netfilter/ipvs/ip_vs_conn.c
+++ b/net/netfilter/ipvs/ip_vs_conn.c
@@ -762,7 +762,7 @@ static int expire_quiescent_template(struct netns_ipvs *ipvs,
  *	If available, return 1, otherwise invalidate this connection
  *	template and return 0.
  */
-int ip_vs_check_template(struct ip_vs_conn *ct)
+int ip_vs_check_template(struct ip_vs_conn *ct, struct ip_vs_dest *cdest)
 {
 	struct ip_vs_dest *dest = ct->dest;
 	struct netns_ipvs *ipvs = ct->ipvs;
@@ -772,7 +772,8 @@ int ip_vs_check_template(struct ip_vs_conn *ct)
 	 */
 	if ((dest == NULL) ||
 	    !(dest->flags & IP_VS_DEST_F_AVAILABLE) ||
-	    expire_quiescent_template(ipvs, dest)) {
+	    expire_quiescent_template(ipvs, dest) ||
+	    (cdest && (dest != cdest))) {
 		IP_VS_DBG_BUF(9, "check_template: dest not available for "
 			      "protocol %s s:%s:%d v:%s:%d "
 			      "-> d:%s:%d\n",
diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c
index 1207f20d24e4..2c1b498a7a27 100644
--- a/net/netfilter/ipvs/ip_vs_core.c
+++ b/net/netfilter/ipvs/ip_vs_core.c
@@ -321,7 +321,7 @@ ip_vs_sched_persist(struct ip_vs_service *svc,
 
 	/* Check if a template already exists */
 	ct = ip_vs_ct_in_get(&param);
-	if (!ct || !ip_vs_check_template(ct)) {
+	if (!ct || !ip_vs_check_template(ct, NULL)) {
 		struct ip_vs_scheduler *sched;
 
 		/*
@@ -1154,7 +1154,8 @@ struct ip_vs_conn *ip_vs_new_conn_out(struct ip_vs_service *svc,
 						  vport, &param) < 0)
 			return NULL;
 		ct = ip_vs_ct_in_get(&param);
-		if (!ct) {
+		/* check if template exists and points to the same dest */
+		if (!ct || !ip_vs_check_template(ct, dest)) {
 			ct = ip_vs_conn_new(&param, dest->af, daddr, dport,
 					    IP_VS_CONN_F_TEMPLATE, dest, 0);
 			if (!ct) {
diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c
index 6d19d2eeaa60..01d3d894de46 100644
--- a/net/netfilter/ipvs/ip_vs_xmit.c
+++ b/net/netfilter/ipvs/ip_vs_xmit.c
@@ -932,17 +932,14 @@ error:
 
 static inline int __tun_gso_type_mask(int encaps_af, int orig_af)
 {
-	if (encaps_af == AF_INET) {
-		if (orig_af == AF_INET)
-			return SKB_GSO_IPIP;
-
-		return SKB_GSO_SIT;
+	switch (encaps_af) {
+	case AF_INET:
+		return SKB_GSO_IPXIP4;
+	case AF_INET6:
+		return SKB_GSO_IPXIP6;
+	default:
+		return 0;
 	}
-
-	/* GSO: we need to provide proper SKB_GSO_ value for IPv6:
-	 * SKB_GSO_SIT/IPV6
-	 */
-	return 0;
 }
 
 /*
diff --git a/net/netfilter/nf_conntrack_ftp.c b/net/netfilter/nf_conntrack_ftp.c
index 883c691ec8d0..19efeba02abb 100644
--- a/net/netfilter/nf_conntrack_ftp.c
+++ b/net/netfilter/nf_conntrack_ftp.c
@@ -632,6 +632,7 @@ static int __init nf_conntrack_ftp_init(void)
 			if (ret) {
 				pr_err("failed to register helper for pf: %d port: %d\n",
 				       ftp[i][j].tuple.src.l3num, ports[i]);
+				ports_c = i;
 				nf_conntrack_ftp_fini();
 				return ret;
 			}
diff --git a/net/netfilter/nf_conntrack_helper.c b/net/netfilter/nf_conntrack_helper.c
index f703adb7e5f7..196cb39649e1 100644
--- a/net/netfilter/nf_conntrack_helper.c
+++ b/net/netfilter/nf_conntrack_helper.c
@@ -361,9 +361,10 @@ EXPORT_SYMBOL_GPL(nf_ct_helper_log);
 
 int nf_conntrack_helper_register(struct nf_conntrack_helper *me)
 {
-	int ret = 0;
-	struct nf_conntrack_helper *cur;
+	struct nf_conntrack_tuple_mask mask = { .src.u.all = htons(0xFFFF) };
 	unsigned int h = helper_hash(&me->tuple);
+	struct nf_conntrack_helper *cur;
+	int ret = 0;
 
 	BUG_ON(me->expect_policy == NULL);
 	BUG_ON(me->expect_class_max >= NF_CT_MAX_EXPECT_CLASSES);
@@ -371,9 +372,7 @@ int nf_conntrack_helper_register(struct nf_conntrack_helper *me)
 
 	mutex_lock(&nf_ct_helper_mutex);
 	hlist_for_each_entry(cur, &nf_ct_helper_hash[h], hnode) {
-		if (strncmp(cur->name, me->name, NF_CT_HELPER_NAME_LEN) == 0 &&
-		    cur->tuple.src.l3num == me->tuple.src.l3num &&
-		    cur->tuple.dst.protonum == me->tuple.dst.protonum) {
+		if (nf_ct_tuple_src_mask_cmp(&cur->tuple, &me->tuple, &mask)) {
 			ret = -EEXIST;
 			goto out;
 		}
diff --git a/net/netfilter/nf_conntrack_irc.c b/net/netfilter/nf_conntrack_irc.c
index 8b6da2719600..f97ac61d2536 100644
--- a/net/netfilter/nf_conntrack_irc.c
+++ b/net/netfilter/nf_conntrack_irc.c
@@ -271,6 +271,7 @@ static int __init nf_conntrack_irc_init(void)
 		if (ret) {
 			pr_err("failed to register helper for pf: %u port: %u\n",
 			       irc[i].tuple.src.l3num, ports[i]);
+			ports_c = i;
 			nf_conntrack_irc_fini();
 			return ret;
 		}
diff --git a/net/netfilter/nf_conntrack_sane.c b/net/netfilter/nf_conntrack_sane.c
index 7523a575f6d1..3fcbaab83b3d 100644
--- a/net/netfilter/nf_conntrack_sane.c
+++ b/net/netfilter/nf_conntrack_sane.c
@@ -223,6 +223,7 @@ static int __init nf_conntrack_sane_init(void)
 			if (ret) {
 				pr_err("failed to register helper for pf: %d port: %d\n",
 				       sane[i][j].tuple.src.l3num, ports[i]);
+				ports_c = i;
 				nf_conntrack_sane_fini();
 				return ret;
 			}
diff --git a/net/netfilter/nf_conntrack_sip.c b/net/netfilter/nf_conntrack_sip.c
index 3e06402739e0..f72ba5587588 100644
--- a/net/netfilter/nf_conntrack_sip.c
+++ b/net/netfilter/nf_conntrack_sip.c
@@ -1669,6 +1669,7 @@ static int __init nf_conntrack_sip_init(void)
 			if (ret) {
 				pr_err("failed to register helper for pf: %u port: %u\n",
 				       sip[i][j].tuple.src.l3num, ports[i]);
+				ports_c = i;
 				nf_conntrack_sip_fini();
 				return ret;
 			}
diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c
index f87e84ebcec3..c026c472ea80 100644
--- a/net/netfilter/nf_conntrack_standalone.c
+++ b/net/netfilter/nf_conntrack_standalone.c
@@ -487,8 +487,6 @@ static struct ctl_table nf_ct_sysctl_table[] = {
 	{ }
 };
 
-#define NET_NF_CONNTRACK_MAX 2089
-
 static struct ctl_table nf_ct_netfilter_table[] = {
 	{
 		.procname	= "nf_conntrack_max",
diff --git a/net/netfilter/nf_conntrack_tftp.c b/net/netfilter/nf_conntrack_tftp.c
index 36f964066461..2e65b5430fba 100644
--- a/net/netfilter/nf_conntrack_tftp.c
+++ b/net/netfilter/nf_conntrack_tftp.c
@@ -142,6 +142,7 @@ static int __init nf_conntrack_tftp_init(void)
 			if (ret) {
 				pr_err("failed to register helper for pf: %u port: %u\n",
 				       tftp[i][j].tuple.src.l3num, ports[i]);
+				ports_c = i;
 				nf_conntrack_tftp_fini();
 				return ret;
 			}
diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c
index 5baa8e24e6ac..b19ad20a705c 100644
--- a/net/netfilter/nf_queue.c
+++ b/net/netfilter/nf_queue.c
@@ -26,23 +26,21 @@
  * Once the queue is registered it must reinject all packets it
  * receives, no matter what.
  */
-static const struct nf_queue_handler __rcu *queue_handler __read_mostly;
 
 /* return EBUSY when somebody else is registered, return EEXIST if the
  * same handler is registered, return 0 in case of success. */
-void nf_register_queue_handler(const struct nf_queue_handler *qh)
+void nf_register_queue_handler(struct net *net, const struct nf_queue_handler *qh)
 {
 	/* should never happen, we only have one queueing backend in kernel */
-	WARN_ON(rcu_access_pointer(queue_handler));
-	rcu_assign_pointer(queue_handler, qh);
+	WARN_ON(rcu_access_pointer(net->nf.queue_handler));
+	rcu_assign_pointer(net->nf.queue_handler, qh);
 }
 EXPORT_SYMBOL(nf_register_queue_handler);
 
 /* The caller must flush their queue before this */
-void nf_unregister_queue_handler(void)
+void nf_unregister_queue_handler(struct net *net)
 {
-	RCU_INIT_POINTER(queue_handler, NULL);
-	synchronize_rcu();
+	RCU_INIT_POINTER(net->nf.queue_handler, NULL);
 }
 EXPORT_SYMBOL(nf_unregister_queue_handler);
 
@@ -103,7 +101,7 @@ void nf_queue_nf_hook_drop(struct net *net, struct nf_hook_ops *ops)
 	const struct nf_queue_handler *qh;
 
 	rcu_read_lock();
-	qh = rcu_dereference(queue_handler);
+	qh = rcu_dereference(net->nf.queue_handler);
 	if (qh)
 		qh->nf_hook_drop(net, ops);
 	rcu_read_unlock();
@@ -122,9 +120,10 @@ int nf_queue(struct sk_buff *skb,
 	struct nf_queue_entry *entry = NULL;
 	const struct nf_afinfo *afinfo;
 	const struct nf_queue_handler *qh;
+	struct net *net = state->net;
 
 	/* QUEUE == DROP if no one is waiting, to be safe. */
-	qh = rcu_dereference(queue_handler);
+	qh = rcu_dereference(net->nf.queue_handler);
 	if (!qh) {
 		status = -ESRCH;
 		goto err;
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 4d292b933b5c..7b7aa871a174 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -2647,6 +2647,8 @@ static int nf_tables_getset(struct net *net, struct sock *nlsk,
 	/* Only accept unspec with dump */
 	if (nfmsg->nfgen_family == NFPROTO_UNSPEC)
 		return -EAFNOSUPPORT;
+	if (!nla[NFTA_SET_TABLE])
+		return -EINVAL;
 
 	set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_NAME]);
 	if (IS_ERR(set))
diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c
index aa93877ab6e2..5d36a0926b4a 100644
--- a/net/netfilter/nfnetlink_queue.c
+++ b/net/netfilter/nfnetlink_queue.c
@@ -557,7 +557,7 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue,
 
 	if (entskb->tstamp.tv64) {
 		struct nfqnl_msg_packet_timestamp ts;
-		struct timespec64 kts = ktime_to_timespec64(skb->tstamp);
+		struct timespec64 kts = ktime_to_timespec64(entskb->tstamp);
 
 		ts.sec = cpu_to_be64(kts.tv_sec);
 		ts.usec = cpu_to_be64(kts.tv_nsec / NSEC_PER_USEC);
@@ -1482,21 +1482,29 @@ static int __net_init nfnl_queue_net_init(struct net *net)
 			 net->nf.proc_netfilter, &nfqnl_file_ops))
 		return -ENOMEM;
 #endif
+	nf_register_queue_handler(net, &nfqh);
 	return 0;
 }
 
 static void __net_exit nfnl_queue_net_exit(struct net *net)
 {
+	nf_unregister_queue_handler(net);
 #ifdef CONFIG_PROC_FS
 	remove_proc_entry("nfnetlink_queue", net->nf.proc_netfilter);
 #endif
 }
 
+static void nfnl_queue_net_exit_batch(struct list_head *net_exit_list)
+{
+	synchronize_rcu();
+}
+
 static struct pernet_operations nfnl_queue_net_ops = {
-	.init	= nfnl_queue_net_init,
-	.exit	= nfnl_queue_net_exit,
-	.id	= &nfnl_queue_net_id,
-	.size	= sizeof(struct nfnl_queue_net),
+	.init		= nfnl_queue_net_init,
+	.exit		= nfnl_queue_net_exit,
+	.exit_batch	= nfnl_queue_net_exit_batch,
+	.id		= &nfnl_queue_net_id,
+	.size		= sizeof(struct nfnl_queue_net),
 };
 
 static int __init nfnetlink_queue_init(void)
@@ -1517,7 +1525,6 @@ static int __init nfnetlink_queue_init(void)
 	}
 
 	register_netdevice_notifier(&nfqnl_dev_notifier);
-	nf_register_queue_handler(&nfqh);
 	return status;
 
 cleanup_netlink_notifier:
@@ -1529,7 +1536,6 @@ out:
 
 static void __exit nfnetlink_queue_fini(void)
 {
-	nf_unregister_queue_handler();
 	unregister_netdevice_notifier(&nfqnl_dev_notifier);
 	nfnetlink_subsys_unregister(&nfqnl_subsys);
 	netlink_unregister_notifier(&nfqnl_rtnl_notifier);
diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c
index c69c892231d7..2675d580c490 100644
--- a/net/netfilter/x_tables.c
+++ b/net/netfilter/x_tables.c
@@ -612,7 +612,7 @@ int xt_compat_check_entry_offsets(const void *base, const char *elems,
 		return -EINVAL;
 
 	if (strcmp(t->u.user.name, XT_STANDARD_TARGET) == 0 &&
-	    target_offset + sizeof(struct compat_xt_standard_target) != next_offset)
+	    COMPAT_XT_ALIGN(target_offset + sizeof(struct compat_xt_standard_target)) != next_offset)
 		return -EINVAL;
 
 	/* compat_xt_entry match has less strict aligment requirements,
@@ -694,7 +694,7 @@ int xt_check_entry_offsets(const void *base,
 		return -EINVAL;
 
 	if (strcmp(t->u.user.name, XT_STANDARD_TARGET) == 0 &&
-	    target_offset + sizeof(struct xt_standard_target) != next_offset)
+	    XT_ALIGN(target_offset + sizeof(struct xt_standard_target)) != next_offset)
 		return -EINVAL;
 
 	return xt_check_entry_match(elems, base + target_offset,
diff --git a/net/netlabel/netlabel_kapi.c b/net/netlabel/netlabel_kapi.c
index 28cddc85b700..1325776daa27 100644
--- a/net/netlabel/netlabel_kapi.c
+++ b/net/netlabel/netlabel_kapi.c
@@ -677,7 +677,7 @@ int netlbl_catmap_setrng(struct netlbl_lsm_catmap **catmap,
 	u32 spot = start;
 
 	while (rc == 0 && spot <= end) {
-		if (((spot & (BITS_PER_LONG - 1)) != 0) &&
+		if (((spot & (BITS_PER_LONG - 1)) == 0) &&
 		    ((end - spot) > BITS_PER_LONG)) {
 			rc = netlbl_catmap_setlong(catmap,
 						   spot,
diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
index 879185fe183f..9a3eb7a0ebf4 100644
--- a/net/openvswitch/actions.c
+++ b/net/openvswitch/actions.c
@@ -137,11 +137,23 @@ static bool is_flow_key_valid(const struct sw_flow_key *key)
 	return !!key->eth.type;
 }
 
+static void update_ethertype(struct sk_buff *skb, struct ethhdr *hdr,
+			     __be16 ethertype)
+{
+	if (skb->ip_summed == CHECKSUM_COMPLETE) {
+		__be16 diff[] = { ~(hdr->h_proto), ethertype };
+
+		skb->csum = ~csum_partial((char *)diff, sizeof(diff),
+					~skb->csum);
+	}
+
+	hdr->h_proto = ethertype;
+}
+
 static int push_mpls(struct sk_buff *skb, struct sw_flow_key *key,
 		     const struct ovs_action_push_mpls *mpls)
 {
 	__be32 *new_mpls_lse;
-	struct ethhdr *hdr;
 
 	/* Networking stack do not allow simultaneous Tunnel and MPLS GSO. */
 	if (skb->encapsulation)
@@ -160,9 +172,7 @@ static int push_mpls(struct sk_buff *skb, struct sw_flow_key *key,
 
 	skb_postpush_rcsum(skb, new_mpls_lse, MPLS_HLEN);
 
-	hdr = eth_hdr(skb);
-	hdr->h_proto = mpls->mpls_ethertype;
-
+	update_ethertype(skb, eth_hdr(skb), mpls->mpls_ethertype);
 	if (!skb->inner_protocol)
 		skb_set_inner_protocol(skb, skb->protocol);
 	skb->protocol = mpls->mpls_ethertype;
@@ -193,7 +203,7 @@ static int pop_mpls(struct sk_buff *skb, struct sw_flow_key *key,
 	 * field correctly in the presence of VLAN tags.
 	 */
 	hdr = (struct ethhdr *)(skb_mpls_header(skb) - ETH_HLEN);
-	hdr->h_proto = ethertype;
+	update_ethertype(skb, hdr, ethertype);
 	if (eth_p_mpls(skb->protocol))
 		skb->protocol = ethertype;
 
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 4040eb92d9c9..9bff6ef16fa7 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -93,6 +93,7 @@
 #include <net/inet_common.h>
 #endif
 #include <linux/bpf.h>
+#include <net/compat.h>
 
 #include "internal.h"
 
@@ -3940,6 +3941,27 @@ static int packet_getsockopt(struct socket *sock, int level, int optname,
 }
 
 
+#ifdef CONFIG_COMPAT
+static int compat_packet_setsockopt(struct socket *sock, int level, int optname,
+				    char __user *optval, unsigned int optlen)
+{
+	struct packet_sock *po = pkt_sk(sock->sk);
+
+	if (level != SOL_PACKET)
+		return -ENOPROTOOPT;
+
+	if (optname == PACKET_FANOUT_DATA &&
+	    po->fanout && po->fanout->type == PACKET_FANOUT_CBPF) {
+		optval = (char __user *)get_compat_bpf_fprog(optval);
+		if (!optval)
+			return -EFAULT;
+		optlen = sizeof(struct sock_fprog);
+	}
+
+	return packet_setsockopt(sock, level, optname, optval, optlen);
+}
+#endif
+
 static int packet_notifier(struct notifier_block *this,
 			   unsigned long msg, void *ptr)
 {
@@ -4416,6 +4438,9 @@ static const struct proto_ops packet_ops = {
 	.shutdown =	sock_no_shutdown,
 	.setsockopt =	packet_setsockopt,
 	.getsockopt =	packet_getsockopt,
+#ifdef CONFIG_COMPAT
+	.compat_setsockopt = compat_packet_setsockopt,
+#endif
 	.sendmsg =	packet_sendmsg,
 	.recvmsg =	packet_recvmsg,
 	.mmap =		packet_mmap,
diff --git a/net/rds/ib_frmr.c b/net/rds/ib_frmr.c
index 93ff038ea9d1..d921adc62765 100644
--- a/net/rds/ib_frmr.c
+++ b/net/rds/ib_frmr.c
@@ -111,7 +111,7 @@ static int rds_ib_post_reg_frmr(struct rds_ib_mr *ibmr)
 		cpu_relax();
 	}
 
-	ret = ib_map_mr_sg_zbva(frmr->mr, ibmr->sg, ibmr->sg_len, PAGE_SIZE);
+	ret = ib_map_mr_sg_zbva(frmr->mr, ibmr->sg, ibmr->sg_len, 0, PAGE_SIZE);
 	if (unlikely(ret != ibmr->sg_len))
 		return ret < 0 ? ret : -EINVAL;
 
diff --git a/net/rds/rds.h b/net/rds/rds.h
index 80256b08eac0..387df5f32e49 100644
--- a/net/rds/rds.h
+++ b/net/rds/rds.h
@@ -74,6 +74,7 @@ enum {
 	RDS_CONN_CONNECTING,
 	RDS_CONN_DISCONNECTING,
 	RDS_CONN_UP,
+	RDS_CONN_RESETTING,
 	RDS_CONN_ERROR,
 };
 
@@ -813,6 +814,7 @@ void rds_connect_worker(struct work_struct *);
 void rds_shutdown_worker(struct work_struct *);
 void rds_send_worker(struct work_struct *);
 void rds_recv_worker(struct work_struct *);
+void rds_connect_path_complete(struct rds_connection *conn, int curr);
 void rds_connect_complete(struct rds_connection *conn);
 
 /* transport.c */
diff --git a/net/rds/recv.c b/net/rds/recv.c
index c0be1ecd11c9..8413f6c99e13 100644
--- a/net/rds/recv.c
+++ b/net/rds/recv.c
@@ -561,5 +561,7 @@ void rds_inc_info_copy(struct rds_incoming *inc,
 		minfo.fport = inc->i_hdr.h_dport;
 	}
 
+	minfo.flags = 0;
+
 	rds_info_copy(iter, &minfo, sizeof(minfo));
 }
diff --git a/net/rds/send.c b/net/rds/send.c
index c9cdb358ea88..b1962f8e30f7 100644
--- a/net/rds/send.c
+++ b/net/rds/send.c
@@ -99,6 +99,7 @@ void rds_send_reset(struct rds_connection *conn)
 	list_splice_init(&conn->c_retrans, &conn->c_send_queue);
 	spin_unlock_irqrestore(&conn->c_lock, flags);
 }
+EXPORT_SYMBOL_GPL(rds_send_reset);
 
 static int acquire_in_xmit(struct rds_connection *conn)
 {
diff --git a/net/rds/tcp.c b/net/rds/tcp.c
index 86187dad1440..74ee126a6fe6 100644
--- a/net/rds/tcp.c
+++ b/net/rds/tcp.c
@@ -126,9 +126,81 @@ void rds_tcp_restore_callbacks(struct socket *sock,
 }
 
 /*
- * This is the only path that sets tc->t_sock.  Send and receive trust that
- * it is set.  The RDS_CONN_UP bit protects those paths from being
- * called while it isn't set.
+ * rds_tcp_reset_callbacks() switches the to the new sock and
+ * returns the existing tc->t_sock.
+ *
+ * The only functions that set tc->t_sock are rds_tcp_set_callbacks
+ * and rds_tcp_reset_callbacks.  Send and receive trust that
+ * it is set.  The absence of RDS_CONN_UP bit protects those paths
+ * from being called while it isn't set.
+ */
+void rds_tcp_reset_callbacks(struct socket *sock,
+			     struct rds_connection *conn)
+{
+	struct rds_tcp_connection *tc = conn->c_transport_data;
+	struct socket *osock = tc->t_sock;
+
+	if (!osock)
+		goto newsock;
+
+	/* Need to resolve a duelling SYN between peers.
+	 * We have an outstanding SYN to this peer, which may
+	 * potentially have transitioned to the RDS_CONN_UP state,
+	 * so we must quiesce any send threads before resetting
+	 * c_transport_data. We quiesce these threads by setting
+	 * c_state to something other than RDS_CONN_UP, and then
+	 * waiting for any existing threads in rds_send_xmit to
+	 * complete release_in_xmit(). (Subsequent threads entering
+	 * rds_send_xmit() will bail on !rds_conn_up().
+	 *
+	 * However an incoming syn-ack at this point would end up
+	 * marking the conn as RDS_CONN_UP, and would again permit
+	 * rds_send_xmi() threads through, so ideally we would
+	 * synchronize on RDS_CONN_UP after lock_sock(), but cannot
+	 * do that: waiting on !RDS_IN_XMIT after lock_sock() may
+	 * end up deadlocking with tcp_sendmsg(), and the RDS_IN_XMIT
+	 * would not get set. As a result, we set c_state to
+	 * RDS_CONN_RESETTTING, to ensure that rds_tcp_state_change
+	 * cannot mark rds_conn_path_up() in the window before lock_sock()
+	 */
+	atomic_set(&conn->c_state, RDS_CONN_RESETTING);
+	wait_event(conn->c_waitq, !test_bit(RDS_IN_XMIT, &conn->c_flags));
+	lock_sock(osock->sk);
+	/* reset receive side state for rds_tcp_data_recv() for osock  */
+	if (tc->t_tinc) {
+		rds_inc_put(&tc->t_tinc->ti_inc);
+		tc->t_tinc = NULL;
+	}
+	tc->t_tinc_hdr_rem = sizeof(struct rds_header);
+	tc->t_tinc_data_rem = 0;
+	tc->t_sock = NULL;
+
+	write_lock_bh(&osock->sk->sk_callback_lock);
+
+	osock->sk->sk_user_data = NULL;
+	osock->sk->sk_data_ready = tc->t_orig_data_ready;
+	osock->sk->sk_write_space = tc->t_orig_write_space;
+	osock->sk->sk_state_change = tc->t_orig_state_change;
+	write_unlock_bh(&osock->sk->sk_callback_lock);
+	release_sock(osock->sk);
+	sock_release(osock);
+newsock:
+	rds_send_reset(conn);
+	lock_sock(sock->sk);
+	write_lock_bh(&sock->sk->sk_callback_lock);
+	tc->t_sock = sock;
+	sock->sk->sk_user_data = conn;
+	sock->sk->sk_data_ready = rds_tcp_data_ready;
+	sock->sk->sk_write_space = rds_tcp_write_space;
+	sock->sk->sk_state_change = rds_tcp_state_change;
+
+	write_unlock_bh(&sock->sk->sk_callback_lock);
+	release_sock(sock->sk);
+}
+
+/* Add tc to rds_tcp_tc_list and set tc->t_sock. See comments
+ * above rds_tcp_reset_callbacks for notes about synchronization
+ * with data path
  */
 void rds_tcp_set_callbacks(struct socket *sock, struct rds_connection *conn)
 {
diff --git a/net/rds/tcp.h b/net/rds/tcp.h
index 41c228300525..ec0602b0dc24 100644
--- a/net/rds/tcp.h
+++ b/net/rds/tcp.h
@@ -50,6 +50,7 @@ struct rds_tcp_statistics {
 void rds_tcp_tune(struct socket *sock);
 void rds_tcp_nonagle(struct socket *sock);
 void rds_tcp_set_callbacks(struct socket *sock, struct rds_connection *conn);
+void rds_tcp_reset_callbacks(struct socket *sock, struct rds_connection *conn);
 void rds_tcp_restore_callbacks(struct socket *sock,
 			       struct rds_tcp_connection *tc);
 u32 rds_tcp_snd_nxt(struct rds_tcp_connection *tc);
diff --git a/net/rds/tcp_connect.c b/net/rds/tcp_connect.c
index 49a3fcfed360..fba13d0305fb 100644
--- a/net/rds/tcp_connect.c
+++ b/net/rds/tcp_connect.c
@@ -43,7 +43,7 @@ void rds_tcp_state_change(struct sock *sk)
 	struct rds_connection *conn;
 	struct rds_tcp_connection *tc;
 
-	read_lock(&sk->sk_callback_lock);
+	read_lock_bh(&sk->sk_callback_lock);
 	conn = sk->sk_user_data;
 	if (!conn) {
 		state_change = sk->sk_state_change;
@@ -60,7 +60,7 @@ void rds_tcp_state_change(struct sock *sk)
 		case TCP_SYN_RECV:
 			break;
 		case TCP_ESTABLISHED:
-			rds_connect_complete(conn);
+			rds_connect_path_complete(conn, RDS_CONN_CONNECTING);
 			break;
 		case TCP_CLOSE_WAIT:
 		case TCP_CLOSE:
@@ -69,7 +69,7 @@ void rds_tcp_state_change(struct sock *sk)
 			break;
 	}
 out:
-	read_unlock(&sk->sk_callback_lock);
+	read_unlock_bh(&sk->sk_callback_lock);
 	state_change(sk);
 }
 
diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c
index be263cdf268b..686b1d03a558 100644
--- a/net/rds/tcp_listen.c
+++ b/net/rds/tcp_listen.c
@@ -78,7 +78,9 @@ int rds_tcp_accept_one(struct socket *sock)
 	struct inet_sock *inet;
 	struct rds_tcp_connection *rs_tcp = NULL;
 	int conn_state;
-	struct sock *nsk;
+
+	if (!sock) /* module unload or netns delete in progress */
+		return -ENETUNREACH;
 
 	ret = sock_create_kern(sock_net(sock->sk), sock->sk->sk_family,
 			       sock->sk->sk_type, sock->sk->sk_protocol,
@@ -129,28 +131,25 @@ int rds_tcp_accept_one(struct socket *sock)
 		 * so we must quiesce any send threads before resetting
 		 * c_transport_data.
 		 */
-		wait_event(conn->c_waitq,
-			   !test_bit(RDS_IN_XMIT, &conn->c_flags));
-		if (ntohl(inet->inet_saddr) < ntohl(inet->inet_daddr)) {
+		if (ntohl(inet->inet_saddr) < ntohl(inet->inet_daddr) ||
+		    !conn->c_outgoing) {
 			goto rst_nsk;
-		} else if (rs_tcp->t_sock) {
-			rds_tcp_restore_callbacks(rs_tcp->t_sock, rs_tcp);
+		} else {
+			rds_tcp_reset_callbacks(new_sock, conn);
 			conn->c_outgoing = 0;
+			/* rds_connect_path_complete() marks RDS_CONN_UP */
+			rds_connect_path_complete(conn, RDS_CONN_DISCONNECTING);
 		}
+	} else {
+		rds_tcp_set_callbacks(new_sock, conn);
+		rds_connect_path_complete(conn, RDS_CONN_CONNECTING);
 	}
-	rds_tcp_set_callbacks(new_sock, conn);
-	rds_connect_complete(conn); /* marks RDS_CONN_UP */
 	new_sock = NULL;
 	ret = 0;
 	goto out;
 rst_nsk:
 	/* reset the newly returned accept sock and bail */
-	nsk = new_sock->sk;
-	rds_tcp_stats_inc(s_tcp_listen_closed_stale);
-	nsk->sk_user_data = NULL;
-	nsk->sk_prot->disconnect(nsk, 0);
-	tcp_done(nsk);
-	new_sock = NULL;
+	kernel_sock_shutdown(new_sock, SHUT_RDWR);
 	ret = 0;
 out:
 	if (rs_tcp)
@@ -166,7 +165,7 @@ void rds_tcp_listen_data_ready(struct sock *sk)
 
 	rdsdebug("listen data ready sk %p\n", sk);
 
-	read_lock(&sk->sk_callback_lock);
+	read_lock_bh(&sk->sk_callback_lock);
 	ready = sk->sk_user_data;
 	if (!ready) { /* check for teardown race */
 		ready = sk->sk_data_ready;
@@ -183,7 +182,7 @@ void rds_tcp_listen_data_ready(struct sock *sk)
 		rds_tcp_accept_work(sk);
 
 out:
-	read_unlock(&sk->sk_callback_lock);
+	read_unlock_bh(&sk->sk_callback_lock);
 	ready(sk);
 }
 
diff --git a/net/rds/tcp_recv.c b/net/rds/tcp_recv.c
index d75d8b56a9e3..c3196f9d070a 100644
--- a/net/rds/tcp_recv.c
+++ b/net/rds/tcp_recv.c
@@ -301,7 +301,7 @@ void rds_tcp_data_ready(struct sock *sk)
 
 	rdsdebug("data ready sk %p\n", sk);
 
-	read_lock(&sk->sk_callback_lock);
+	read_lock_bh(&sk->sk_callback_lock);
 	conn = sk->sk_user_data;
 	if (!conn) { /* check for teardown race */
 		ready = sk->sk_data_ready;
@@ -315,7 +315,7 @@ void rds_tcp_data_ready(struct sock *sk)
 	if (rds_tcp_read_sock(conn, GFP_ATOMIC) == -ENOMEM)
 		queue_delayed_work(rds_wq, &conn->c_recv_w, 0);
 out:
-	read_unlock(&sk->sk_callback_lock);
+	read_unlock_bh(&sk->sk_callback_lock);
 	ready(sk);
 }
 
diff --git a/net/rds/tcp_send.c b/net/rds/tcp_send.c
index 2894e6095e3b..22d0f2020a79 100644
--- a/net/rds/tcp_send.c
+++ b/net/rds/tcp_send.c
@@ -180,7 +180,7 @@ void rds_tcp_write_space(struct sock *sk)
 	struct rds_connection *conn;
 	struct rds_tcp_connection *tc;
 
-	read_lock(&sk->sk_callback_lock);
+	read_lock_bh(&sk->sk_callback_lock);
 	conn = sk->sk_user_data;
 	if (!conn) {
 		write_space = sk->sk_write_space;
@@ -200,7 +200,7 @@ void rds_tcp_write_space(struct sock *sk)
 		queue_delayed_work(rds_wq, &conn->c_send_w, 0);
 
 out:
-	read_unlock(&sk->sk_callback_lock);
+	read_unlock_bh(&sk->sk_callback_lock);
 
 	/*
 	 * write_space is only called when data leaves tcp's send queue if
diff --git a/net/rds/threads.c b/net/rds/threads.c
index 454aa6d23327..4a323045719b 100644
--- a/net/rds/threads.c
+++ b/net/rds/threads.c
@@ -71,9 +71,9 @@
 struct workqueue_struct *rds_wq;
 EXPORT_SYMBOL_GPL(rds_wq);
 
-void rds_connect_complete(struct rds_connection *conn)
+void rds_connect_path_complete(struct rds_connection *conn, int curr)
 {
-	if (!rds_conn_transition(conn, RDS_CONN_CONNECTING, RDS_CONN_UP)) {
+	if (!rds_conn_transition(conn, curr, RDS_CONN_UP)) {
 		printk(KERN_WARNING "%s: Cannot transition to state UP, "
 				"current state is %d\n",
 				__func__,
@@ -90,6 +90,12 @@ void rds_connect_complete(struct rds_connection *conn)
 	queue_delayed_work(rds_wq, &conn->c_send_w, 0);
 	queue_delayed_work(rds_wq, &conn->c_recv_w, 0);
 }
+EXPORT_SYMBOL_GPL(rds_connect_path_complete);
+
+void rds_connect_complete(struct rds_connection *conn)
+{
+	rds_connect_path_complete(conn, RDS_CONN_CONNECTING);
+}
 EXPORT_SYMBOL_GPL(rds_connect_complete);
 
 /*
diff --git a/net/rxrpc/ar-key.c b/net/rxrpc/ar-key.c
index 3fb492eedeb9..1021b4c0bdd2 100644
--- a/net/rxrpc/ar-key.c
+++ b/net/rxrpc/ar-key.c
@@ -965,7 +965,7 @@ int rxrpc_get_server_data_key(struct rxrpc_connection *conn,
 
 	key = key_alloc(&key_type_rxrpc, "x",
 			GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, cred, 0,
-			KEY_ALLOC_NOT_IN_QUOTA);
+			KEY_ALLOC_NOT_IN_QUOTA, NULL);
 	if (IS_ERR(key)) {
 		_leave(" = -ENOMEM [alloc %ld]", PTR_ERR(key));
 		return -ENOMEM;
@@ -1012,7 +1012,7 @@ struct key *rxrpc_get_null_key(const char *keyname)
 
 	key = key_alloc(&key_type_rxrpc, keyname,
 			GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, cred,
-			KEY_POS_SEARCH, KEY_ALLOC_NOT_IN_QUOTA);
+			KEY_POS_SEARCH, KEY_ALLOC_NOT_IN_QUOTA, NULL);
 	if (IS_ERR(key))
 		return key;
 
diff --git a/net/rxrpc/rxkad.c b/net/rxrpc/rxkad.c
index 6b726a046a7d..bab56ed649ba 100644
--- a/net/rxrpc/rxkad.c
+++ b/net/rxrpc/rxkad.c
@@ -1162,9 +1162,7 @@ static int rxkad_init(void)
 	/* pin the cipher we need so that the crypto layer doesn't invoke
 	 * keventd to go get it */
 	rxkad_ci = crypto_alloc_skcipher("pcbc(fcrypt)", 0, CRYPTO_ALG_ASYNC);
-	if (IS_ERR(rxkad_ci))
-		return PTR_ERR(rxkad_ci);
-	return 0;
+	return PTR_ERR_OR_ZERO(rxkad_ci);
 }
 
 /*
diff --git a/net/sched/act_police.c b/net/sched/act_police.c
index 330f14e302e8..c557789765dc 100644
--- a/net/sched/act_police.c
+++ b/net/sched/act_police.c
@@ -38,7 +38,7 @@ struct tcf_police {
 	bool			peak_present;
 };
 #define to_police(pc)	\
-	container_of(pc, struct tcf_police, common)
+	container_of(pc->priv, struct tcf_police, common)
 
 #define POL_TAB_MASK     15
 
@@ -119,14 +119,12 @@ static int tcf_act_police_locate(struct net *net, struct nlattr *nla,
 				 struct nlattr *est, struct tc_action *a,
 				 int ovr, int bind)
 {
-	unsigned int h;
 	int ret = 0, err;
 	struct nlattr *tb[TCA_POLICE_MAX + 1];
 	struct tc_police *parm;
 	struct tcf_police *police;
 	struct qdisc_rate_table *R_tab = NULL, *P_tab = NULL;
 	struct tc_action_net *tn = net_generic(net, police_net_id);
-	struct tcf_hashinfo *hinfo = tn->hinfo;
 	int size;
 
 	if (nla == NULL)
@@ -145,7 +143,7 @@ static int tcf_act_police_locate(struct net *net, struct nlattr *nla,
 
 	if (parm->index) {
 		if (tcf_hash_search(tn, a, parm->index)) {
-			police = to_police(a->priv);
+			police = to_police(a);
 			if (bind) {
 				police->tcf_bindcnt += 1;
 				police->tcf_refcnt += 1;
@@ -156,16 +154,15 @@ static int tcf_act_police_locate(struct net *net, struct nlattr *nla,
 			/* not replacing */
 			return -EEXIST;
 		}
+	} else {
+		ret = tcf_hash_create(tn, parm->index, NULL, a,
+				      sizeof(*police), bind, false);
+		if (ret)
+			return ret;
+		ret = ACT_P_CREATED;
 	}
 
-	police = kzalloc(sizeof(*police), GFP_KERNEL);
-	if (police == NULL)
-		return -ENOMEM;
-	ret = ACT_P_CREATED;
-	police->tcf_refcnt = 1;
-	spin_lock_init(&police->tcf_lock);
-	if (bind)
-		police->tcf_bindcnt = 1;
+	police = to_police(a);
 override:
 	if (parm->rate.rate) {
 		err = -ENOMEM;
@@ -237,14 +234,8 @@ override:
 		return ret;
 
 	police->tcfp_t_c = ktime_get_ns();
-	police->tcf_index = parm->index ? parm->index :
-		tcf_hash_new_index(tn);
-	h = tcf_hash(police->tcf_index, POL_TAB_MASK);
-	spin_lock_bh(&hinfo->lock);
-	hlist_add_head(&police->tcf_head, &hinfo->htab[h]);
-	spin_unlock_bh(&hinfo->lock);
+	tcf_hash_insert(tn, a);
 
-	a->priv = police;
 	return ret;
 
 failure_unlock:
@@ -253,7 +244,7 @@ failure:
 	qdisc_put_rtab(P_tab);
 	qdisc_put_rtab(R_tab);
 	if (ret == ACT_P_CREATED)
-		kfree(police);
+		tcf_hash_cleanup(a, est);
 	return err;
 }
 
@@ -268,6 +259,7 @@ static int tcf_act_police(struct sk_buff *skb, const struct tc_action *a,
 	spin_lock(&police->tcf_lock);
 
 	bstats_update(&police->tcf_bstats, skb);
+	tcf_lastuse_update(&police->tcf_tm);
 
 	if (police->tcfp_ewma_rate &&
 	    police->tcf_rate_est.bps >= police->tcfp_ewma_rate) {
@@ -327,6 +319,7 @@ tcf_act_police_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref)
 		.refcnt = police->tcf_refcnt - ref,
 		.bindcnt = police->tcf_bindcnt - bind,
 	};
+	struct tcf_t t;
 
 	if (police->rate_present)
 		psched_ratecfg_getrate(&opt.rate, &police->rate);
@@ -340,6 +333,13 @@ tcf_act_police_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref)
 	if (police->tcfp_ewma_rate &&
 	    nla_put_u32(skb, TCA_POLICE_AVRATE, police->tcfp_ewma_rate))
 		goto nla_put_failure;
+
+	t.install = jiffies_to_clock_t(jiffies - police->tcf_tm.install);
+	t.lastuse = jiffies_to_clock_t(jiffies - police->tcf_tm.lastuse);
+	t.expires = jiffies_to_clock_t(police->tcf_tm.expires);
+	if (nla_put_64bit(skb, TCA_POLICE_TM, sizeof(t), &t, TCA_POLICE_PAD))
+		goto nla_put_failure;
+
 	return skb->len;
 
 nla_put_failure:
diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index 730aacafc22d..b3b7978f4182 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -171,7 +171,7 @@ static void fl_hw_destroy_filter(struct tcf_proto *tp, unsigned long cookie)
 	struct tc_cls_flower_offload offload = {0};
 	struct tc_to_netdev tc;
 
-	if (!tc_should_offload(dev, 0))
+	if (!tc_should_offload(dev, tp, 0))
 		return;
 
 	offload.command = TC_CLSFLOWER_DESTROY;
@@ -194,7 +194,7 @@ static void fl_hw_replace_filter(struct tcf_proto *tp,
 	struct tc_cls_flower_offload offload = {0};
 	struct tc_to_netdev tc;
 
-	if (!tc_should_offload(dev, flags))
+	if (!tc_should_offload(dev, tp, flags))
 		return;
 
 	offload.command = TC_CLSFLOWER_REPLACE;
@@ -216,7 +216,7 @@ static void fl_hw_update_stats(struct tcf_proto *tp, struct cls_fl_filter *f)
 	struct tc_cls_flower_offload offload = {0};
 	struct tc_to_netdev tc;
 
-	if (!tc_should_offload(dev, 0))
+	if (!tc_should_offload(dev, tp, 0))
 		return;
 
 	offload.command = TC_CLSFLOWER_STATS;
diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c
index 079b43b3c5d2..ffe593efe930 100644
--- a/net/sched/cls_u32.c
+++ b/net/sched/cls_u32.c
@@ -440,7 +440,7 @@ static void u32_remove_hw_knode(struct tcf_proto *tp, u32 handle)
 	offload.type = TC_SETUP_CLSU32;
 	offload.cls_u32 = &u32_offload;
 
-	if (tc_should_offload(dev, 0)) {
+	if (tc_should_offload(dev, tp, 0)) {
 		offload.cls_u32->command = TC_CLSU32_DELETE_KNODE;
 		offload.cls_u32->knode.handle = handle;
 		dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle,
@@ -457,20 +457,21 @@ static int u32_replace_hw_hnode(struct tcf_proto *tp,
 	struct tc_to_netdev offload;
 	int err;
 
+	if (!tc_should_offload(dev, tp, flags))
+		return tc_skip_sw(flags) ? -EINVAL : 0;
+
 	offload.type = TC_SETUP_CLSU32;
 	offload.cls_u32 = &u32_offload;
 
-	if (tc_should_offload(dev, flags)) {
-		offload.cls_u32->command = TC_CLSU32_NEW_HNODE;
-		offload.cls_u32->hnode.divisor = h->divisor;
-		offload.cls_u32->hnode.handle = h->handle;
-		offload.cls_u32->hnode.prio = h->prio;
+	offload.cls_u32->command = TC_CLSU32_NEW_HNODE;
+	offload.cls_u32->hnode.divisor = h->divisor;
+	offload.cls_u32->hnode.handle = h->handle;
+	offload.cls_u32->hnode.prio = h->prio;
 
-		err = dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle,
-						    tp->protocol, &offload);
-		if (tc_skip_sw(flags))
-			return err;
-	}
+	err = dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle,
+					    tp->protocol, &offload);
+	if (tc_skip_sw(flags))
+		return err;
 
 	return 0;
 }
@@ -484,7 +485,7 @@ static void u32_clear_hw_hnode(struct tcf_proto *tp, struct tc_u_hnode *h)
 	offload.type = TC_SETUP_CLSU32;
 	offload.cls_u32 = &u32_offload;
 
-	if (tc_should_offload(dev, 0)) {
+	if (tc_should_offload(dev, tp, 0)) {
 		offload.cls_u32->command = TC_CLSU32_DELETE_HNODE;
 		offload.cls_u32->hnode.divisor = h->divisor;
 		offload.cls_u32->hnode.handle = h->handle;
@@ -507,27 +508,28 @@ static int u32_replace_hw_knode(struct tcf_proto *tp,
 	offload.type = TC_SETUP_CLSU32;
 	offload.cls_u32 = &u32_offload;
 
-	if (tc_should_offload(dev, flags)) {
-		offload.cls_u32->command = TC_CLSU32_REPLACE_KNODE;
-		offload.cls_u32->knode.handle = n->handle;
-		offload.cls_u32->knode.fshift = n->fshift;
+	if (!tc_should_offload(dev, tp, flags))
+		return tc_skip_sw(flags) ? -EINVAL : 0;
+
+	offload.cls_u32->command = TC_CLSU32_REPLACE_KNODE;
+	offload.cls_u32->knode.handle = n->handle;
+	offload.cls_u32->knode.fshift = n->fshift;
 #ifdef CONFIG_CLS_U32_MARK
-		offload.cls_u32->knode.val = n->val;
-		offload.cls_u32->knode.mask = n->mask;
+	offload.cls_u32->knode.val = n->val;
+	offload.cls_u32->knode.mask = n->mask;
 #else
-		offload.cls_u32->knode.val = 0;
-		offload.cls_u32->knode.mask = 0;
+	offload.cls_u32->knode.val = 0;
+	offload.cls_u32->knode.mask = 0;
 #endif
-		offload.cls_u32->knode.sel = &n->sel;
-		offload.cls_u32->knode.exts = &n->exts;
-		if (n->ht_down)
-			offload.cls_u32->knode.link_handle = n->ht_down->handle;
-
-		err = dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle,
-						    tp->protocol, &offload);
-		if (tc_skip_sw(flags))
-			return err;
-	}
+	offload.cls_u32->knode.sel = &n->sel;
+	offload.cls_u32->knode.exts = &n->exts;
+	if (n->ht_down)
+		offload.cls_u32->knode.link_handle = n->ht_down->handle;
+
+	err = dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle,
+					    tp->protocol, &offload);
+	if (tc_skip_sw(flags))
+		return err;
 
 	return 0;
 }
@@ -863,7 +865,7 @@ static int u32_change(struct net *net, struct sk_buff *in_skb,
 	if (tb[TCA_U32_FLAGS]) {
 		flags = nla_get_u32(tb[TCA_U32_FLAGS]);
 		if (!tc_flags_valid(flags))
-			return err;
+			return -EINVAL;
 	}
 
 	n = (struct tc_u_knode *)*arg;
@@ -921,11 +923,17 @@ static int u32_change(struct net *net, struct sk_buff *in_skb,
 		ht->divisor = divisor;
 		ht->handle = handle;
 		ht->prio = tp->prio;
+
+		err = u32_replace_hw_hnode(tp, ht, flags);
+		if (err) {
+			kfree(ht);
+			return err;
+		}
+
 		RCU_INIT_POINTER(ht->next, tp_c->hlist);
 		rcu_assign_pointer(tp_c->hlist, ht);
 		*arg = (unsigned long)ht;
 
-		u32_replace_hw_hnode(tp, ht, flags);
 		return 0;
 	}
 
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index 64f71a2155f3..ddf047df5361 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -607,6 +607,10 @@ void qdisc_watchdog_schedule_ns(struct qdisc_watchdog *wd, u64 expires, bool thr
 	if (throttle)
 		qdisc_throttled(wd->qdisc);
 
+	if (wd->last_expires == expires)
+		return;
+
+	wd->last_expires = expires;
 	hrtimer_start(&wd->timer,
 		      ns_to_ktime(expires),
 		      HRTIMER_MODE_ABS_PINNED);
diff --git a/net/sched/sch_drr.c b/net/sched/sch_drr.c
index a63e879e8975..bf8af2c43c2c 100644
--- a/net/sched/sch_drr.c
+++ b/net/sched/sch_drr.c
@@ -375,6 +375,7 @@ static int drr_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 		cl->deficit = cl->quantum;
 	}
 
+	qdisc_qstats_backlog_inc(sch, skb);
 	sch->q.qlen++;
 	return err;
 }
@@ -407,6 +408,7 @@ static struct sk_buff *drr_dequeue(struct Qdisc *sch)
 
 			bstats_update(&cl->bstats, skb);
 			qdisc_bstats_update(sch, skb);
+			qdisc_qstats_backlog_dec(sch, skb);
 			sch->q.qlen--;
 			return skb;
 		}
@@ -428,6 +430,7 @@ static unsigned int drr_drop(struct Qdisc *sch)
 		if (cl->qdisc->ops->drop) {
 			len = cl->qdisc->ops->drop(cl->qdisc);
 			if (len > 0) {
+				sch->qstats.backlog -= len;
 				sch->q.qlen--;
 				if (cl->qdisc->q.qlen == 0)
 					list_del(&cl->alist);
@@ -463,6 +466,7 @@ static void drr_reset_qdisc(struct Qdisc *sch)
 			qdisc_reset(cl->qdisc);
 		}
 	}
+	sch->qstats.backlog = 0;
 	sch->q.qlen = 0;
 }
 
diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c
index 6883a8971562..da250b2e06ae 100644
--- a/net/sched/sch_fq_codel.c
+++ b/net/sched/sch_fq_codel.c
@@ -199,6 +199,7 @@ static int fq_codel_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 	unsigned int idx, prev_backlog, prev_qlen;
 	struct fq_codel_flow *flow;
 	int uninitialized_var(ret);
+	unsigned int pkt_len;
 	bool memory_limited;
 
 	idx = fq_codel_classify(skb, sch, &ret);
@@ -230,6 +231,8 @@ static int fq_codel_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 	prev_backlog = sch->qstats.backlog;
 	prev_qlen = sch->q.qlen;
 
+	/* save this packet length as it might be dropped by fq_codel_drop() */
+	pkt_len = qdisc_pkt_len(skb);
 	/* fq_codel_drop() is quite expensive, as it performs a linear search
 	 * in q->backlogs[] to find a fat flow.
 	 * So instead of dropping a single packet, drop half of its backlog
@@ -237,14 +240,23 @@ static int fq_codel_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 	 */
 	ret = fq_codel_drop(sch, q->drop_batch_size);
 
-	q->drop_overlimit += prev_qlen - sch->q.qlen;
+	prev_qlen -= sch->q.qlen;
+	prev_backlog -= sch->qstats.backlog;
+	q->drop_overlimit += prev_qlen;
 	if (memory_limited)
-		q->drop_overmemory += prev_qlen - sch->q.qlen;
-	/* As we dropped packet(s), better let upper stack know this */
-	qdisc_tree_reduce_backlog(sch, prev_qlen - sch->q.qlen,
-				  prev_backlog - sch->qstats.backlog);
+		q->drop_overmemory += prev_qlen;
 
-	return ret == idx ? NET_XMIT_CN : NET_XMIT_SUCCESS;
+	/* As we dropped packet(s), better let upper stack know this.
+	 * If we dropped a packet for this flow, return NET_XMIT_CN,
+	 * but in this case, our parents wont increase their backlogs.
+	 */
+	if (ret == idx) {
+		qdisc_tree_reduce_backlog(sch, prev_qlen - 1,
+					  prev_backlog - pkt_len);
+		return NET_XMIT_CN;
+	}
+	qdisc_tree_reduce_backlog(sch, prev_qlen, prev_backlog);
+	return NET_XMIT_SUCCESS;
 }
 
 /* This is the specific function called from codel_dequeue()
@@ -649,7 +661,7 @@ static int fq_codel_dump_class_stats(struct Qdisc *sch, unsigned long cl,
 		qs.backlog = q->backlogs[idx];
 		qs.drops = flow->dropped;
 	}
-	if (gnet_stats_copy_queue(d, NULL, &qs, 0) < 0)
+	if (gnet_stats_copy_queue(d, NULL, &qs, qs.qlen) < 0)
 		return -1;
 	if (idx < q->flows_cnt)
 		return gnet_stats_copy_app(d, &xstats, sizeof(xstats));
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 269dd71b3828..f9e0e9c03d0a 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -49,6 +49,7 @@ static inline int dev_requeue_skb(struct sk_buff *skb, struct Qdisc *q)
 {
 	q->gso_skb = skb;
 	q->qstats.requeues++;
+	qdisc_qstats_backlog_inc(q, skb);
 	q->q.qlen++;	/* it's still part of the queue */
 	__netif_schedule(q);
 
@@ -92,6 +93,7 @@ static struct sk_buff *dequeue_skb(struct Qdisc *q, bool *validate,
 		txq = skb_get_tx_queue(txq->dev, skb);
 		if (!netif_xmit_frozen_or_stopped(txq)) {
 			q->gso_skb = NULL;
+			qdisc_qstats_backlog_dec(q, skb);
 			q->q.qlen--;
 		} else
 			skb = NULL;
diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c
index d783d7cc3348..1ac9f9f03fe3 100644
--- a/net/sched/sch_hfsc.c
+++ b/net/sched/sch_hfsc.c
@@ -1529,6 +1529,7 @@ hfsc_reset_qdisc(struct Qdisc *sch)
 	q->eligible = RB_ROOT;
 	INIT_LIST_HEAD(&q->droplist);
 	qdisc_watchdog_cancel(&q->watchdog);
+	sch->qstats.backlog = 0;
 	sch->q.qlen = 0;
 }
 
@@ -1559,14 +1560,6 @@ hfsc_dump_qdisc(struct Qdisc *sch, struct sk_buff *skb)
 	struct hfsc_sched *q = qdisc_priv(sch);
 	unsigned char *b = skb_tail_pointer(skb);
 	struct tc_hfsc_qopt qopt;
-	struct hfsc_class *cl;
-	unsigned int i;
-
-	sch->qstats.backlog = 0;
-	for (i = 0; i < q->clhash.hashsize; i++) {
-		hlist_for_each_entry(cl, &q->clhash.hash[i], cl_common.hnode)
-			sch->qstats.backlog += cl->qdisc->qstats.backlog;
-	}
 
 	qopt.defcls = q->defcls;
 	if (nla_put(skb, TCA_OPTIONS, sizeof(qopt), &qopt))
@@ -1604,6 +1597,7 @@ hfsc_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 	if (cl->qdisc->q.qlen == 1)
 		set_active(cl, qdisc_pkt_len(skb));
 
+	qdisc_qstats_backlog_inc(sch, skb);
 	sch->q.qlen++;
 
 	return NET_XMIT_SUCCESS;
@@ -1672,6 +1666,7 @@ hfsc_dequeue(struct Qdisc *sch)
 
 	qdisc_unthrottled(sch);
 	qdisc_bstats_update(sch, skb);
+	qdisc_qstats_backlog_dec(sch, skb);
 	sch->q.qlen--;
 
 	return skb;
@@ -1695,6 +1690,7 @@ hfsc_drop(struct Qdisc *sch)
 			}
 			cl->qstats.drops++;
 			qdisc_qstats_drop(sch);
+			sch->qstats.backlog -= len;
 			sch->q.qlen--;
 			return len;
 		}
diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c
index f6bf5818ed4d..d4b4218af6b1 100644
--- a/net/sched/sch_htb.c
+++ b/net/sched/sch_htb.c
@@ -928,17 +928,10 @@ ok:
 		}
 	}
 	qdisc_qstats_overlimit(sch);
-	if (likely(next_event > q->now)) {
-		if (!test_bit(__QDISC_STATE_DEACTIVATED,
-			      &qdisc_root_sleeping(q->watchdog.qdisc)->state)) {
-			ktime_t time = ns_to_ktime(next_event);
-			qdisc_throttled(q->watchdog.qdisc);
-			hrtimer_start(&q->watchdog.timer, time,
-				      HRTIMER_MODE_ABS_PINNED);
-		}
-	} else {
+	if (likely(next_event > q->now))
+		qdisc_watchdog_schedule_ns(&q->watchdog, next_event, true);
+	else
 		schedule_work(&q->work);
-	}
 fin:
 	return skb;
 }
diff --git a/net/sched/sch_ingress.c b/net/sched/sch_ingress.c
index 10adbc617905..8fe6999b642a 100644
--- a/net/sched/sch_ingress.c
+++ b/net/sched/sch_ingress.c
@@ -27,6 +27,11 @@ static unsigned long ingress_get(struct Qdisc *sch, u32 classid)
 	return TC_H_MIN(classid) + 1;
 }
 
+static bool ingress_cl_offload(u32 classid)
+{
+	return true;
+}
+
 static unsigned long ingress_bind_filter(struct Qdisc *sch,
 					 unsigned long parent, u32 classid)
 {
@@ -86,6 +91,7 @@ static const struct Qdisc_class_ops ingress_class_ops = {
 	.put		=	ingress_put,
 	.walk		=	ingress_walk,
 	.tcf_chain	=	ingress_find_tcf,
+	.tcf_cl_offload	=	ingress_cl_offload,
 	.bind_tcf	=	ingress_bind_filter,
 	.unbind_tcf	=	ingress_put,
 };
@@ -110,6 +116,11 @@ static unsigned long clsact_get(struct Qdisc *sch, u32 classid)
 	}
 }
 
+static bool clsact_cl_offload(u32 classid)
+{
+	return TC_H_MIN(classid) == TC_H_MIN(TC_H_MIN_INGRESS);
+}
+
 static unsigned long clsact_bind_filter(struct Qdisc *sch,
 					unsigned long parent, u32 classid)
 {
@@ -158,6 +169,7 @@ static const struct Qdisc_class_ops clsact_class_ops = {
 	.put		=	ingress_put,
 	.walk		=	ingress_walk,
 	.tcf_chain	=	clsact_find_tcf,
+	.tcf_cl_offload	=	clsact_cl_offload,
 	.bind_tcf	=	clsact_bind_filter,
 	.unbind_tcf	=	ingress_put,
 };
diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c
index fee1b15506b2..4b0a82191bc4 100644
--- a/net/sched/sch_prio.c
+++ b/net/sched/sch_prio.c
@@ -85,6 +85,7 @@ prio_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 
 	ret = qdisc_enqueue(skb, qdisc);
 	if (ret == NET_XMIT_SUCCESS) {
+		qdisc_qstats_backlog_inc(sch, skb);
 		sch->q.qlen++;
 		return NET_XMIT_SUCCESS;
 	}
@@ -117,6 +118,7 @@ static struct sk_buff *prio_dequeue(struct Qdisc *sch)
 		struct sk_buff *skb = qdisc_dequeue_peeked(qdisc);
 		if (skb) {
 			qdisc_bstats_update(sch, skb);
+			qdisc_qstats_backlog_dec(sch, skb);
 			sch->q.qlen--;
 			return skb;
 		}
@@ -135,6 +137,7 @@ static unsigned int prio_drop(struct Qdisc *sch)
 	for (prio = q->bands-1; prio >= 0; prio--) {
 		qdisc = q->queues[prio];
 		if (qdisc->ops->drop && (len = qdisc->ops->drop(qdisc)) != 0) {
+			sch->qstats.backlog -= len;
 			sch->q.qlen--;
 			return len;
 		}
@@ -151,6 +154,7 @@ prio_reset(struct Qdisc *sch)
 
 	for (prio = 0; prio < q->bands; prio++)
 		qdisc_reset(q->queues[prio]);
+	sch->qstats.backlog = 0;
 	sch->q.qlen = 0;
 }
 
diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c
index 8d2d8d953432..f18857febdad 100644
--- a/net/sched/sch_qfq.c
+++ b/net/sched/sch_qfq.c
@@ -1235,8 +1235,10 @@ static int qfq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 			 cl->agg->lmax, qdisc_pkt_len(skb), cl->common.classid);
 		err = qfq_change_agg(sch, cl, cl->agg->class_weight,
 				     qdisc_pkt_len(skb));
-		if (err)
-			return err;
+		if (err) {
+			cl->qstats.drops++;
+			return qdisc_drop(skb, sch);
+		}
 	}
 
 	err = qdisc_enqueue(skb, cl->qdisc);
diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c
index 8c0508c0e287..91578bdd378c 100644
--- a/net/sched/sch_red.c
+++ b/net/sched/sch_red.c
@@ -97,6 +97,7 @@ static int red_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 
 	ret = qdisc_enqueue(skb, child);
 	if (likely(ret == NET_XMIT_SUCCESS)) {
+		qdisc_qstats_backlog_inc(sch, skb);
 		sch->q.qlen++;
 	} else if (net_xmit_drop_count(ret)) {
 		q->stats.pdrop++;
@@ -118,6 +119,7 @@ static struct sk_buff *red_dequeue(struct Qdisc *sch)
 	skb = child->dequeue(child);
 	if (skb) {
 		qdisc_bstats_update(sch, skb);
+		qdisc_qstats_backlog_dec(sch, skb);
 		sch->q.qlen--;
 	} else {
 		if (!red_is_idling(&q->vars))
@@ -143,6 +145,7 @@ static unsigned int red_drop(struct Qdisc *sch)
 	if (child->ops->drop && (len = child->ops->drop(child)) > 0) {
 		q->stats.other++;
 		qdisc_qstats_drop(sch);
+		sch->qstats.backlog -= len;
 		sch->q.qlen--;
 		return len;
 	}
@@ -158,6 +161,7 @@ static void red_reset(struct Qdisc *sch)
 	struct red_sched_data *q = qdisc_priv(sch);
 
 	qdisc_reset(q->qdisc);
+	sch->qstats.backlog = 0;
 	sch->q.qlen = 0;
 	red_restart(&q->vars);
 }
diff --git a/net/sched/sch_tbf.c b/net/sched/sch_tbf.c
index 83b90b584fae..3161e491990b 100644
--- a/net/sched/sch_tbf.c
+++ b/net/sched/sch_tbf.c
@@ -207,6 +207,7 @@ static int tbf_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 		return ret;
 	}
 
+	qdisc_qstats_backlog_inc(sch, skb);
 	sch->q.qlen++;
 	return NET_XMIT_SUCCESS;
 }
@@ -217,6 +218,7 @@ static unsigned int tbf_drop(struct Qdisc *sch)
 	unsigned int len = 0;
 
 	if (q->qdisc->ops->drop && (len = q->qdisc->ops->drop(q->qdisc)) != 0) {
+		sch->qstats.backlog -= len;
 		sch->q.qlen--;
 		qdisc_qstats_drop(sch);
 	}
@@ -263,6 +265,7 @@ static struct sk_buff *tbf_dequeue(struct Qdisc *sch)
 			q->t_c = now;
 			q->tokens = toks;
 			q->ptokens = ptoks;
+			qdisc_qstats_backlog_dec(sch, skb);
 			sch->q.qlen--;
 			qdisc_unthrottled(sch);
 			qdisc_bstats_update(sch, skb);
@@ -294,6 +297,7 @@ static void tbf_reset(struct Qdisc *sch)
 	struct tbf_sched_data *q = qdisc_priv(sch);
 
 	qdisc_reset(q->qdisc);
+	sch->qstats.backlog = 0;
 	sch->q.qlen = 0;
 	q->t_c = ktime_get_ns();
 	q->tokens = q->buffer;
diff --git a/net/sctp/sctp_diag.c b/net/sctp/sctp_diag.c
index 8e3e769dc9ea..1ce724b87618 100644
--- a/net/sctp/sctp_diag.c
+++ b/net/sctp/sctp_diag.c
@@ -356,6 +356,9 @@ static int sctp_ep_dump(struct sctp_endpoint *ep, void *p)
 	if (cb->args[4] < cb->args[1])
 		goto next;
 
+	if ((r->idiag_states & ~TCPF_LISTEN) && !list_empty(&ep->asocs))
+		goto next;
+
 	if (r->sdiag_family != AF_UNSPEC &&
 	    sk->sk_family != r->sdiag_family)
 		goto next;
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 777d0324594a..67154b848aa9 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -4220,6 +4220,7 @@ int sctp_get_sctp_info(struct sock *sk, struct sctp_association *asoc,
 		info->sctpi_s_disable_fragments = sp->disable_fragments;
 		info->sctpi_s_v4mapped = sp->v4mapped;
 		info->sctpi_s_frag_interleave = sp->frag_interleave;
+		info->sctpi_s_type = sp->type;
 
 		return 0;
 	}
diff --git a/net/socket.c b/net/socket.c
index e7793f5601ae..a1bd16106625 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -2168,7 +2168,8 @@ int __sys_recvmmsg(int fd, struct mmsghdr __user *mmsg, unsigned int vlen,
 	struct mmsghdr __user *entry;
 	struct compat_mmsghdr __user *compat_entry;
 	struct msghdr msg_sys;
-	struct timespec end_time;
+	struct timespec64 end_time;
+	struct timespec64 timeout64;
 
 	if (timeout &&
 	    poll_select_set_timeout(&end_time, timeout->tv_sec,
@@ -2220,8 +2221,9 @@ int __sys_recvmmsg(int fd, struct mmsghdr __user *mmsg, unsigned int vlen,
 			flags |= MSG_DONTWAIT;
 
 		if (timeout) {
-			ktime_get_ts(timeout);
-			*timeout = timespec_sub(end_time, *timeout);
+			ktime_get_ts64(&timeout64);
+			*timeout = timespec64_to_timespec(
+					timespec64_sub(end_time, timeout64));
 			if (timeout->tv_sec < 0) {
 				timeout->tv_sec = timeout->tv_nsec = 0;
 				break;
diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c
index 02f53674dc39..040ff627c18a 100644
--- a/net/sunrpc/auth.c
+++ b/net/sunrpc/auth.c
@@ -543,7 +543,7 @@ rpcauth_cache_enforce_limit(void)
  */
 struct rpc_cred *
 rpcauth_lookup_credcache(struct rpc_auth *auth, struct auth_cred * acred,
-		int flags)
+		int flags, gfp_t gfp)
 {
 	LIST_HEAD(free);
 	struct rpc_cred_cache *cache = auth->au_credcache;
@@ -580,7 +580,7 @@ rpcauth_lookup_credcache(struct rpc_auth *auth, struct auth_cred * acred,
 	if (flags & RPCAUTH_LOOKUP_RCU)
 		return ERR_PTR(-ECHILD);
 
-	new = auth->au_ops->crcreate(auth, acred, flags);
+	new = auth->au_ops->crcreate(auth, acred, flags, gfp);
 	if (IS_ERR(new)) {
 		cred = new;
 		goto out;
@@ -703,8 +703,7 @@ rpcauth_bindcred(struct rpc_task *task, struct rpc_cred *cred, int flags)
 		new = rpcauth_bind_new_cred(task, lookupflags);
 	if (IS_ERR(new))
 		return PTR_ERR(new);
-	if (req->rq_cred != NULL)
-		put_rpccred(req->rq_cred);
+	put_rpccred(req->rq_cred);
 	req->rq_cred = new;
 	return 0;
 }
@@ -712,6 +711,8 @@ rpcauth_bindcred(struct rpc_task *task, struct rpc_cred *cred, int flags)
 void
 put_rpccred(struct rpc_cred *cred)
 {
+	if (cred == NULL)
+		return;
 	/* Fast path for unhashed credentials */
 	if (test_bit(RPCAUTH_CRED_HASHED, &cred->cr_flags) == 0) {
 		if (atomic_dec_and_test(&cred->cr_count))
diff --git a/net/sunrpc/auth_generic.c b/net/sunrpc/auth_generic.c
index 41248b1820c7..54dd3fdead54 100644
--- a/net/sunrpc/auth_generic.c
+++ b/net/sunrpc/auth_generic.c
@@ -38,6 +38,13 @@ struct rpc_cred *rpc_lookup_cred(void)
 }
 EXPORT_SYMBOL_GPL(rpc_lookup_cred);
 
+struct rpc_cred *
+rpc_lookup_generic_cred(struct auth_cred *acred, int flags, gfp_t gfp)
+{
+	return rpcauth_lookup_credcache(&generic_auth, acred, flags, gfp);
+}
+EXPORT_SYMBOL_GPL(rpc_lookup_generic_cred);
+
 struct rpc_cred *rpc_lookup_cred_nonblock(void)
 {
 	return rpcauth_lookupcred(&generic_auth, RPCAUTH_LOOKUP_RCU);
@@ -77,15 +84,15 @@ static struct rpc_cred *generic_bind_cred(struct rpc_task *task,
 static struct rpc_cred *
 generic_lookup_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags)
 {
-	return rpcauth_lookup_credcache(&generic_auth, acred, flags);
+	return rpcauth_lookup_credcache(&generic_auth, acred, flags, GFP_KERNEL);
 }
 
 static struct rpc_cred *
-generic_create_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags)
+generic_create_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags, gfp_t gfp)
 {
 	struct generic_cred *gcred;
 
-	gcred = kmalloc(sizeof(*gcred), GFP_KERNEL);
+	gcred = kmalloc(sizeof(*gcred), gfp);
 	if (gcred == NULL)
 		return ERR_PTR(-ENOMEM);
 
diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c
index 15612ffa8d57..e64ae93d5b4f 100644
--- a/net/sunrpc/auth_gss/auth_gss.c
+++ b/net/sunrpc/auth_gss/auth_gss.c
@@ -1299,11 +1299,11 @@ gss_destroy_cred(struct rpc_cred *cred)
 static struct rpc_cred *
 gss_lookup_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags)
 {
-	return rpcauth_lookup_credcache(auth, acred, flags);
+	return rpcauth_lookup_credcache(auth, acred, flags, GFP_NOFS);
 }
 
 static struct rpc_cred *
-gss_create_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags)
+gss_create_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags, gfp_t gfp)
 {
 	struct gss_auth *gss_auth = container_of(auth, struct gss_auth, rpc_auth);
 	struct gss_cred	*cred = NULL;
@@ -1313,7 +1313,7 @@ gss_create_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags)
 		__func__, from_kuid(&init_user_ns, acred->uid),
 		auth->au_flavor);
 
-	if (!(cred = kzalloc(sizeof(*cred), GFP_NOFS)))
+	if (!(cred = kzalloc(sizeof(*cred), gfp)))
 		goto out_err;
 
 	rpcauth_init_cred(&cred->gc_base, acred, auth, &gss_credops);
diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c
index 1095be9c80ab..e085f5ae1548 100644
--- a/net/sunrpc/auth_gss/svcauth_gss.c
+++ b/net/sunrpc/auth_gss/svcauth_gss.c
@@ -569,10 +569,9 @@ gss_svc_searchbyctx(struct cache_detail *cd, struct xdr_netobj *handle)
 	struct rsc *found;
 
 	memset(&rsci, 0, sizeof(rsci));
-	if (dup_to_netobj(&rsci.handle, handle->data, handle->len))
-		return NULL;
+	rsci.handle.data = handle->data;
+	rsci.handle.len = handle->len;
 	found = rsc_lookup(cd, &rsci);
-	rsc_free(&rsci);
 	if (!found)
 		return NULL;
 	if (cache_check(cd, &found->h, NULL))
@@ -857,8 +856,8 @@ unwrap_integ_data(struct svc_rqst *rqstp, struct xdr_buf *buf, u32 seq, struct g
 		goto out;
 	if (svc_getnl(&buf->head[0]) != seq)
 		goto out;
-	/* trim off the mic at the end before returning */
-	xdr_buf_trim(buf, mic.len + 4);
+	/* trim off the mic and padding at the end before returning */
+	xdr_buf_trim(buf, round_up_to_quad(mic.len) + 4);
 	stat = 0;
 out:
 	kfree(mic.data);
diff --git a/net/sunrpc/auth_unix.c b/net/sunrpc/auth_unix.c
index 0d3dd364c22f..9f65452b7cbc 100644
--- a/net/sunrpc/auth_unix.c
+++ b/net/sunrpc/auth_unix.c
@@ -52,11 +52,11 @@ unx_destroy(struct rpc_auth *auth)
 static struct rpc_cred *
 unx_lookup_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags)
 {
-	return rpcauth_lookup_credcache(auth, acred, flags);
+	return rpcauth_lookup_credcache(auth, acred, flags, GFP_NOFS);
 }
 
 static struct rpc_cred *
-unx_create_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags)
+unx_create_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags, gfp_t gfp)
 {
 	struct unx_cred	*cred;
 	unsigned int groups = 0;
@@ -66,7 +66,7 @@ unx_create_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags)
 			from_kuid(&init_user_ns, acred->uid),
 			from_kgid(&init_user_ns, acred->gid));
 
-	if (!(cred = kmalloc(sizeof(*cred), GFP_NOFS)))
+	if (!(cred = kmalloc(sizeof(*cred), gfp)))
 		return ERR_PTR(-ENOMEM);
 
 	rpcauth_init_cred(&cred->uc_base, acred, auth, &unix_credops);
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 7e0c9bf22df8..2808d550d273 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -446,16 +446,27 @@ out_no_rpciod:
 	return ERR_PTR(err);
 }
 
-struct rpc_clnt *rpc_create_xprt(struct rpc_create_args *args,
+static struct rpc_clnt *rpc_create_xprt(struct rpc_create_args *args,
 					struct rpc_xprt *xprt)
 {
 	struct rpc_clnt *clnt = NULL;
 	struct rpc_xprt_switch *xps;
 
-	xps = xprt_switch_alloc(xprt, GFP_KERNEL);
-	if (xps == NULL)
-		return ERR_PTR(-ENOMEM);
-
+	if (args->bc_xprt && args->bc_xprt->xpt_bc_xps) {
+		WARN_ON(args->protocol != XPRT_TRANSPORT_BC_TCP);
+		xps = args->bc_xprt->xpt_bc_xps;
+		xprt_switch_get(xps);
+	} else {
+		xps = xprt_switch_alloc(xprt, GFP_KERNEL);
+		if (xps == NULL) {
+			xprt_put(xprt);
+			return ERR_PTR(-ENOMEM);
+		}
+		if (xprt->bc_xprt) {
+			xprt_switch_get(xps);
+			xprt->bc_xprt->xpt_bc_xps = xps;
+		}
+	}
 	clnt = rpc_new_client(args, xps, xprt, NULL);
 	if (IS_ERR(clnt))
 		return clnt;
@@ -483,7 +494,6 @@ struct rpc_clnt *rpc_create_xprt(struct rpc_create_args *args,
 
 	return clnt;
 }
-EXPORT_SYMBOL_GPL(rpc_create_xprt);
 
 /**
  * rpc_create - create an RPC client and transport with one call
@@ -509,6 +519,15 @@ struct rpc_clnt *rpc_create(struct rpc_create_args *args)
 	};
 	char servername[48];
 
+	if (args->bc_xprt) {
+		WARN_ON(args->protocol != XPRT_TRANSPORT_BC_TCP);
+		xprt = args->bc_xprt->xpt_bc_xprt;
+		if (xprt) {
+			xprt_get(xprt);
+			return rpc_create_xprt(args, xprt);
+		}
+	}
+
 	if (args->flags & RPC_CLNT_CREATE_INFINITE_SLOTS)
 		xprtargs.flags |= XPRT_CREATE_INFINITE_SLOTS;
 	if (args->flags & RPC_CLNT_CREATE_NO_IDLE_TIMEOUT)
@@ -1414,6 +1433,23 @@ size_t rpc_max_payload(struct rpc_clnt *clnt)
 EXPORT_SYMBOL_GPL(rpc_max_payload);
 
 /**
+ * rpc_max_bc_payload - Get maximum backchannel payload size, in bytes
+ * @clnt: RPC client to query
+ */
+size_t rpc_max_bc_payload(struct rpc_clnt *clnt)
+{
+	struct rpc_xprt *xprt;
+	size_t ret;
+
+	rcu_read_lock();
+	xprt = rcu_dereference(clnt->cl_xprt);
+	ret = xprt->ops->bc_maxpayload(xprt);
+	rcu_read_unlock();
+	return ret;
+}
+EXPORT_SYMBOL_GPL(rpc_max_bc_payload);
+
+/**
  * rpc_get_timeout - Get timeout for transport in units of HZ
  * @clnt: RPC client to query
  */
diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c
index 7422f28818b2..4f01f63102ee 100644
--- a/net/sunrpc/svc_xprt.c
+++ b/net/sunrpc/svc_xprt.c
@@ -136,6 +136,8 @@ static void svc_xprt_free(struct kref *kref)
 	/* See comment on corresponding get in xs_setup_bc_tcp(): */
 	if (xprt->xpt_bc_xprt)
 		xprt_put(xprt->xpt_bc_xprt);
+	if (xprt->xpt_bc_xps)
+		xprt_switch_put(xprt->xpt_bc_xps);
 	xprt->xpt_ops->xpo_free(xprt);
 	module_put(owner);
 }
@@ -244,13 +246,12 @@ void svc_add_new_perm_xprt(struct svc_serv *serv, struct svc_xprt *new)
 	svc_xprt_received(new);
 }
 
-int svc_create_xprt(struct svc_serv *serv, const char *xprt_name,
+int _svc_create_xprt(struct svc_serv *serv, const char *xprt_name,
 		    struct net *net, const int family,
 		    const unsigned short port, int flags)
 {
 	struct svc_xprt_class *xcl;
 
-	dprintk("svc: creating transport %s[%d]\n", xprt_name, port);
 	spin_lock(&svc_xprt_class_lock);
 	list_for_each_entry(xcl, &svc_xprt_class_list, xcl_list) {
 		struct svc_xprt *newxprt;
@@ -274,12 +275,28 @@ int svc_create_xprt(struct svc_serv *serv, const char *xprt_name,
 	}
  err:
 	spin_unlock(&svc_xprt_class_lock);
-	dprintk("svc: transport %s not found\n", xprt_name);
-
 	/* This errno is exposed to user space.  Provide a reasonable
 	 * perror msg for a bad transport. */
 	return -EPROTONOSUPPORT;
 }
+
+int svc_create_xprt(struct svc_serv *serv, const char *xprt_name,
+		    struct net *net, const int family,
+		    const unsigned short port, int flags)
+{
+	int err;
+
+	dprintk("svc: creating transport %s[%d]\n", xprt_name, port);
+	err = _svc_create_xprt(serv, xprt_name, net, family, port, flags);
+	if (err == -EPROTONOSUPPORT) {
+		request_module("svc%s", xprt_name);
+		err = _svc_create_xprt(serv, xprt_name, net, family, port, flags);
+	}
+	if (err)
+		dprintk("svc: transport %s not found, err %d\n",
+			xprt_name, err);
+	return err;
+}
 EXPORT_SYMBOL_GPL(svc_create_xprt);
 
 /*
diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c
index 6bdb3865212d..c4f3cc0c0775 100644
--- a/net/sunrpc/xdr.c
+++ b/net/sunrpc/xdr.c
@@ -797,6 +797,8 @@ void xdr_init_decode(struct xdr_stream *xdr, struct xdr_buf *buf, __be32 *p)
 		xdr_set_iov(xdr, buf->head, buf->len);
 	else if (buf->page_len != 0)
 		xdr_set_page_base(xdr, 0, buf->len);
+	else
+		xdr_set_iov(xdr, buf->head, buf->len);
 	if (p != NULL && p > xdr->p && xdr->end >= p) {
 		xdr->nwords -= p - xdr->p;
 		xdr->p = p;
diff --git a/net/sunrpc/xprtrdma/backchannel.c b/net/sunrpc/xprtrdma/backchannel.c
index 2dcd7640eeb5..87762d976b63 100644
--- a/net/sunrpc/xprtrdma/backchannel.c
+++ b/net/sunrpc/xprtrdma/backchannel.c
@@ -192,6 +192,22 @@ int xprt_rdma_bc_up(struct svc_serv *serv, struct net *net)
 }
 
 /**
+ * xprt_rdma_bc_maxpayload - Return maximum backchannel message size
+ * @xprt: transport
+ *
+ * Returns maximum size, in bytes, of a backchannel message
+ */
+size_t xprt_rdma_bc_maxpayload(struct rpc_xprt *xprt)
+{
+	struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
+	struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data;
+	size_t maxmsg;
+
+	maxmsg = min_t(unsigned int, cdata->inline_rsize, cdata->inline_wsize);
+	return maxmsg - RPCRDMA_HDRLEN_MIN;
+}
+
+/**
  * rpcrdma_bc_marshal_reply - Send backwards direction reply
  * @rqst: buffer containing RPC reply data
  *
diff --git a/net/sunrpc/xprtrdma/fmr_ops.c b/net/sunrpc/xprtrdma/fmr_ops.c
index b289e106540b..6326ebe8b595 100644
--- a/net/sunrpc/xprtrdma/fmr_ops.c
+++ b/net/sunrpc/xprtrdma/fmr_ops.c
@@ -35,10 +35,71 @@
 /* Maximum scatter/gather per FMR */
 #define RPCRDMA_MAX_FMR_SGES	(64)
 
+static struct workqueue_struct *fmr_recovery_wq;
+
+#define FMR_RECOVERY_WQ_FLAGS		(WQ_UNBOUND)
+
+int
+fmr_alloc_recovery_wq(void)
+{
+	fmr_recovery_wq = alloc_workqueue("fmr_recovery", WQ_UNBOUND, 0);
+	return !fmr_recovery_wq ? -ENOMEM : 0;
+}
+
+void
+fmr_destroy_recovery_wq(void)
+{
+	struct workqueue_struct *wq;
+
+	if (!fmr_recovery_wq)
+		return;
+
+	wq = fmr_recovery_wq;
+	fmr_recovery_wq = NULL;
+	destroy_workqueue(wq);
+}
+
+static int
+__fmr_unmap(struct rpcrdma_mw *mw)
+{
+	LIST_HEAD(l);
+
+	list_add(&mw->fmr.fmr->list, &l);
+	return ib_unmap_fmr(&l);
+}
+
+/* Deferred reset of a single FMR. Generate a fresh rkey by
+ * replacing the MR. There's no recovery if this fails.
+ */
+static void
+__fmr_recovery_worker(struct work_struct *work)
+{
+	struct rpcrdma_mw *mw = container_of(work, struct rpcrdma_mw,
+					    mw_work);
+	struct rpcrdma_xprt *r_xprt = mw->mw_xprt;
+
+	__fmr_unmap(mw);
+	rpcrdma_put_mw(r_xprt, mw);
+	return;
+}
+
+/* A broken MR was discovered in a context that can't sleep.
+ * Defer recovery to the recovery worker.
+ */
+static void
+__fmr_queue_recovery(struct rpcrdma_mw *mw)
+{
+	INIT_WORK(&mw->mw_work, __fmr_recovery_worker);
+	queue_work(fmr_recovery_wq, &mw->mw_work);
+}
+
 static int
 fmr_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep,
 	    struct rpcrdma_create_data_internal *cdata)
 {
+	rpcrdma_set_max_header_sizes(ia, cdata, max_t(unsigned int, 1,
+						      RPCRDMA_MAX_DATA_SEGS /
+						      RPCRDMA_MAX_FMR_SGES));
 	return 0;
 }
 
@@ -48,7 +109,7 @@ static size_t
 fmr_op_maxpages(struct rpcrdma_xprt *r_xprt)
 {
 	return min_t(unsigned int, RPCRDMA_MAX_DATA_SEGS,
-		     rpcrdma_max_segments(r_xprt) * RPCRDMA_MAX_FMR_SGES);
+		     RPCRDMA_MAX_HDR_SEGS * RPCRDMA_MAX_FMR_SGES);
 }
 
 static int
@@ -89,6 +150,7 @@ fmr_op_init(struct rpcrdma_xprt *r_xprt)
 		if (IS_ERR(r->fmr.fmr))
 			goto out_fmr_err;
 
+		r->mw_xprt = r_xprt;
 		list_add(&r->mw_list, &buf->rb_mws);
 		list_add(&r->mw_all, &buf->rb_all);
 	}
@@ -104,15 +166,6 @@ out:
 	return rc;
 }
 
-static int
-__fmr_unmap(struct rpcrdma_mw *r)
-{
-	LIST_HEAD(l);
-
-	list_add(&r->fmr.fmr->list, &l);
-	return ib_unmap_fmr(&l);
-}
-
 /* Use the ib_map_phys_fmr() verb to register a memory region
  * for remote access via RDMA READ or RDMA WRITE.
  */
@@ -183,15 +236,10 @@ static void
 __fmr_dma_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg)
 {
 	struct ib_device *device = r_xprt->rx_ia.ri_device;
-	struct rpcrdma_mw *mw = seg->rl_mw;
 	int nsegs = seg->mr_nsegs;
 
-	seg->rl_mw = NULL;
-
 	while (nsegs--)
 		rpcrdma_unmap_one(device, seg++);
-
-	rpcrdma_put_mw(r_xprt, mw);
 }
 
 /* Invalidate all memory regions that were registered for "req".
@@ -234,42 +282,50 @@ fmr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
 		seg = &req->rl_segments[i];
 
 		__fmr_dma_unmap(r_xprt, seg);
+		rpcrdma_put_mw(r_xprt, seg->rl_mw);
 
 		i += seg->mr_nsegs;
 		seg->mr_nsegs = 0;
+		seg->rl_mw = NULL;
 	}
 
 	req->rl_nchunks = 0;
 }
 
-/* Use the ib_unmap_fmr() verb to prevent further remote
- * access via RDMA READ or RDMA WRITE.
+/* Use a slow, safe mechanism to invalidate all memory regions
+ * that were registered for "req".
+ *
+ * In the asynchronous case, DMA unmapping occurs first here
+ * because the rpcrdma_mr_seg is released immediately after this
+ * call. It's contents won't be available in __fmr_dma_unmap later.
+ * FIXME.
  */
-static int
-fmr_op_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg)
+static void
+fmr_op_unmap_safe(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
+		  bool sync)
 {
-	struct rpcrdma_ia *ia = &r_xprt->rx_ia;
-	struct rpcrdma_mr_seg *seg1 = seg;
-	struct rpcrdma_mw *mw = seg1->rl_mw;
-	int rc, nsegs = seg->mr_nsegs;
+	struct rpcrdma_mr_seg *seg;
+	struct rpcrdma_mw *mw;
+	unsigned int i;
 
-	dprintk("RPC:       %s: FMR %p\n", __func__, mw);
+	for (i = 0; req->rl_nchunks; req->rl_nchunks--) {
+		seg = &req->rl_segments[i];
+		mw = seg->rl_mw;
 
-	seg1->rl_mw = NULL;
-	while (seg1->mr_nsegs--)
-		rpcrdma_unmap_one(ia->ri_device, seg++);
-	rc = __fmr_unmap(mw);
-	if (rc)
-		goto out_err;
-	rpcrdma_put_mw(r_xprt, mw);
-	return nsegs;
+		if (sync) {
+			/* ORDER */
+			__fmr_unmap(mw);
+			__fmr_dma_unmap(r_xprt, seg);
+			rpcrdma_put_mw(r_xprt, mw);
+		} else {
+			__fmr_dma_unmap(r_xprt, seg);
+			__fmr_queue_recovery(mw);
+		}
 
-out_err:
-	/* The FMR is abandoned, but remains in rb_all. fmr_op_destroy
-	 * will attempt to release it when the transport is destroyed.
-	 */
-	dprintk("RPC:       %s: ib_unmap_fmr status %i\n", __func__, rc);
-	return nsegs;
+		i += seg->mr_nsegs;
+		seg->mr_nsegs = 0;
+		seg->rl_mw = NULL;
+	}
 }
 
 static void
@@ -295,7 +351,7 @@ fmr_op_destroy(struct rpcrdma_buffer *buf)
 const struct rpcrdma_memreg_ops rpcrdma_fmr_memreg_ops = {
 	.ro_map				= fmr_op_map,
 	.ro_unmap_sync			= fmr_op_unmap_sync,
-	.ro_unmap			= fmr_op_unmap,
+	.ro_unmap_safe			= fmr_op_unmap_safe,
 	.ro_open			= fmr_op_open,
 	.ro_maxpages			= fmr_op_maxpages,
 	.ro_init			= fmr_op_init,
diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c
index c250924a9fd3..c0947544babe 100644
--- a/net/sunrpc/xprtrdma/frwr_ops.c
+++ b/net/sunrpc/xprtrdma/frwr_ops.c
@@ -98,6 +98,47 @@ frwr_destroy_recovery_wq(void)
 	destroy_workqueue(wq);
 }
 
+static int
+__frwr_reset_mr(struct rpcrdma_ia *ia, struct rpcrdma_mw *r)
+{
+	struct rpcrdma_frmr *f = &r->frmr;
+	int rc;
+
+	rc = ib_dereg_mr(f->fr_mr);
+	if (rc) {
+		pr_warn("rpcrdma: ib_dereg_mr status %d, frwr %p orphaned\n",
+			rc, r);
+		return rc;
+	}
+
+	f->fr_mr = ib_alloc_mr(ia->ri_pd, IB_MR_TYPE_MEM_REG,
+			       ia->ri_max_frmr_depth);
+	if (IS_ERR(f->fr_mr)) {
+		pr_warn("rpcrdma: ib_alloc_mr status %ld, frwr %p orphaned\n",
+			PTR_ERR(f->fr_mr), r);
+		return PTR_ERR(f->fr_mr);
+	}
+
+	dprintk("RPC:       %s: recovered FRMR %p\n", __func__, r);
+	f->fr_state = FRMR_IS_INVALID;
+	return 0;
+}
+
+static void
+__frwr_reset_and_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mw *mw)
+{
+	struct rpcrdma_ia *ia = &r_xprt->rx_ia;
+	struct rpcrdma_frmr *f = &mw->frmr;
+	int rc;
+
+	rc = __frwr_reset_mr(ia, mw);
+	ib_dma_unmap_sg(ia->ri_device, f->fr_sg, f->fr_nents, f->fr_dir);
+	if (rc)
+		return;
+
+	rpcrdma_put_mw(r_xprt, mw);
+}
+
 /* Deferred reset of a single FRMR. Generate a fresh rkey by
  * replacing the MR.
  *
@@ -109,26 +150,10 @@ static void
 __frwr_recovery_worker(struct work_struct *work)
 {
 	struct rpcrdma_mw *r = container_of(work, struct rpcrdma_mw,
-					    frmr.fr_work);
-	struct rpcrdma_xprt *r_xprt = r->frmr.fr_xprt;
-	unsigned int depth = r_xprt->rx_ia.ri_max_frmr_depth;
-	struct ib_pd *pd = r_xprt->rx_ia.ri_pd;
-
-	if (ib_dereg_mr(r->frmr.fr_mr))
-		goto out_fail;
+					    mw_work);
 
-	r->frmr.fr_mr = ib_alloc_mr(pd, IB_MR_TYPE_MEM_REG, depth);
-	if (IS_ERR(r->frmr.fr_mr))
-		goto out_fail;
-
-	dprintk("RPC:       %s: recovered FRMR %p\n", __func__, r);
-	r->frmr.fr_state = FRMR_IS_INVALID;
-	rpcrdma_put_mw(r_xprt, r);
+	__frwr_reset_and_unmap(r->mw_xprt, r);
 	return;
-
-out_fail:
-	pr_warn("RPC:       %s: FRMR %p unrecovered\n",
-		__func__, r);
 }
 
 /* A broken MR was discovered in a context that can't sleep.
@@ -137,8 +162,8 @@ out_fail:
 static void
 __frwr_queue_recovery(struct rpcrdma_mw *r)
 {
-	INIT_WORK(&r->frmr.fr_work, __frwr_recovery_worker);
-	queue_work(frwr_recovery_wq, &r->frmr.fr_work);
+	INIT_WORK(&r->mw_work, __frwr_recovery_worker);
+	queue_work(frwr_recovery_wq, &r->mw_work);
 }
 
 static int
@@ -152,11 +177,11 @@ __frwr_init(struct rpcrdma_mw *r, struct ib_pd *pd, struct ib_device *device,
 	if (IS_ERR(f->fr_mr))
 		goto out_mr_err;
 
-	f->sg = kcalloc(depth, sizeof(*f->sg), GFP_KERNEL);
-	if (!f->sg)
+	f->fr_sg = kcalloc(depth, sizeof(*f->fr_sg), GFP_KERNEL);
+	if (!f->fr_sg)
 		goto out_list_err;
 
-	sg_init_table(f->sg, depth);
+	sg_init_table(f->fr_sg, depth);
 
 	init_completion(&f->fr_linv_done);
 
@@ -185,7 +210,7 @@ __frwr_release(struct rpcrdma_mw *r)
 	if (rc)
 		dprintk("RPC:       %s: ib_dereg_mr status %i\n",
 			__func__, rc);
-	kfree(r->frmr.sg);
+	kfree(r->frmr.fr_sg);
 }
 
 static int
@@ -231,6 +256,9 @@ frwr_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep,
 					       depth;
 	}
 
+	rpcrdma_set_max_header_sizes(ia, cdata, max_t(unsigned int, 1,
+						      RPCRDMA_MAX_DATA_SEGS /
+						      ia->ri_max_frmr_depth));
 	return 0;
 }
 
@@ -243,7 +271,7 @@ frwr_op_maxpages(struct rpcrdma_xprt *r_xprt)
 	struct rpcrdma_ia *ia = &r_xprt->rx_ia;
 
 	return min_t(unsigned int, RPCRDMA_MAX_DATA_SEGS,
-		     rpcrdma_max_segments(r_xprt) * ia->ri_max_frmr_depth);
+		     RPCRDMA_MAX_HDR_SEGS * ia->ri_max_frmr_depth);
 }
 
 static void
@@ -350,9 +378,9 @@ frwr_op_init(struct rpcrdma_xprt *r_xprt)
 			return rc;
 		}
 
+		r->mw_xprt = r_xprt;
 		list_add(&r->mw_list, &buf->rb_mws);
 		list_add(&r->mw_all, &buf->rb_all);
-		r->frmr.fr_xprt = r_xprt;
 	}
 
 	return 0;
@@ -396,12 +424,12 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
 
 	for (i = 0; i < nsegs;) {
 		if (seg->mr_page)
-			sg_set_page(&frmr->sg[i],
+			sg_set_page(&frmr->fr_sg[i],
 				    seg->mr_page,
 				    seg->mr_len,
 				    offset_in_page(seg->mr_offset));
 		else
-			sg_set_buf(&frmr->sg[i], seg->mr_offset,
+			sg_set_buf(&frmr->fr_sg[i], seg->mr_offset,
 				   seg->mr_len);
 
 		++seg;
@@ -412,25 +440,26 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
 		    offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
 			break;
 	}
-	frmr->sg_nents = i;
+	frmr->fr_nents = i;
+	frmr->fr_dir = direction;
 
-	dma_nents = ib_dma_map_sg(device, frmr->sg, frmr->sg_nents, direction);
+	dma_nents = ib_dma_map_sg(device, frmr->fr_sg, frmr->fr_nents, direction);
 	if (!dma_nents) {
 		pr_err("RPC:       %s: failed to dma map sg %p sg_nents %u\n",
-		       __func__, frmr->sg, frmr->sg_nents);
+		       __func__, frmr->fr_sg, frmr->fr_nents);
 		return -ENOMEM;
 	}
 
-	n = ib_map_mr_sg(mr, frmr->sg, frmr->sg_nents, PAGE_SIZE);
-	if (unlikely(n != frmr->sg_nents)) {
+	n = ib_map_mr_sg(mr, frmr->fr_sg, frmr->fr_nents, NULL, PAGE_SIZE);
+	if (unlikely(n != frmr->fr_nents)) {
 		pr_err("RPC:       %s: failed to map mr %p (%u/%u)\n",
-		       __func__, frmr->fr_mr, n, frmr->sg_nents);
+		       __func__, frmr->fr_mr, n, frmr->fr_nents);
 		rc = n < 0 ? n : -EINVAL;
 		goto out_senderr;
 	}
 
 	dprintk("RPC:       %s: Using frmr %p to map %u segments (%u bytes)\n",
-		__func__, mw, frmr->sg_nents, mr->length);
+		__func__, mw, frmr->fr_nents, mr->length);
 
 	key = (u8)(mr->rkey & 0x000000FF);
 	ib_update_fast_reg_key(mr, ++key);
@@ -452,18 +481,16 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
 	if (rc)
 		goto out_senderr;
 
-	seg1->mr_dir = direction;
 	seg1->rl_mw = mw;
 	seg1->mr_rkey = mr->rkey;
 	seg1->mr_base = mr->iova;
-	seg1->mr_nsegs = frmr->sg_nents;
+	seg1->mr_nsegs = frmr->fr_nents;
 	seg1->mr_len = mr->length;
 
-	return frmr->sg_nents;
+	return frmr->fr_nents;
 
 out_senderr:
 	dprintk("RPC:       %s: ib_post_send status %i\n", __func__, rc);
-	ib_dma_unmap_sg(device, frmr->sg, dma_nents, direction);
 	__frwr_queue_recovery(mw);
 	return rc;
 }
@@ -487,24 +514,6 @@ __frwr_prepare_linv_wr(struct rpcrdma_mr_seg *seg)
 	return invalidate_wr;
 }
 
-static void
-__frwr_dma_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
-		 int rc)
-{
-	struct ib_device *device = r_xprt->rx_ia.ri_device;
-	struct rpcrdma_mw *mw = seg->rl_mw;
-	struct rpcrdma_frmr *f = &mw->frmr;
-
-	seg->rl_mw = NULL;
-
-	ib_dma_unmap_sg(device, f->sg, f->sg_nents, seg->mr_dir);
-
-	if (!rc)
-		rpcrdma_put_mw(r_xprt, mw);
-	else
-		__frwr_queue_recovery(mw);
-}
-
 /* Invalidate all memory regions that were registered for "req".
  *
  * Sleeps until it is safe for the host CPU to access the
@@ -518,6 +527,7 @@ frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
 	struct rpcrdma_mr_seg *seg;
 	unsigned int i, nchunks;
 	struct rpcrdma_frmr *f;
+	struct rpcrdma_mw *mw;
 	int rc;
 
 	dprintk("RPC:       %s: req %p\n", __func__, req);
@@ -558,11 +568,8 @@ frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
 	 * unless ri_id->qp is a valid pointer.
 	 */
 	rc = ib_post_send(ia->ri_id->qp, invalidate_wrs, &bad_wr);
-	if (rc) {
-		pr_warn("%s: ib_post_send failed %i\n", __func__, rc);
-		rdma_disconnect(ia->ri_id);
-		goto unmap;
-	}
+	if (rc)
+		goto reset_mrs;
 
 	wait_for_completion(&f->fr_linv_done);
 
@@ -572,56 +579,65 @@ frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
 unmap:
 	for (i = 0, nchunks = req->rl_nchunks; nchunks; nchunks--) {
 		seg = &req->rl_segments[i];
+		mw = seg->rl_mw;
+		seg->rl_mw = NULL;
 
-		__frwr_dma_unmap(r_xprt, seg, rc);
+		ib_dma_unmap_sg(ia->ri_device, f->fr_sg, f->fr_nents,
+				f->fr_dir);
+		rpcrdma_put_mw(r_xprt, mw);
 
 		i += seg->mr_nsegs;
 		seg->mr_nsegs = 0;
 	}
 
 	req->rl_nchunks = 0;
-}
+	return;
 
-/* Post a LOCAL_INV Work Request to prevent further remote access
- * via RDMA READ or RDMA WRITE.
- */
-static int
-frwr_op_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg)
-{
-	struct rpcrdma_mr_seg *seg1 = seg;
-	struct rpcrdma_ia *ia = &r_xprt->rx_ia;
-	struct rpcrdma_mw *mw = seg1->rl_mw;
-	struct rpcrdma_frmr *frmr = &mw->frmr;
-	struct ib_send_wr *invalidate_wr, *bad_wr;
-	int rc, nsegs = seg->mr_nsegs;
+reset_mrs:
+	pr_warn("%s: ib_post_send failed %i\n", __func__, rc);
 
-	dprintk("RPC:       %s: FRMR %p\n", __func__, mw);
+	/* Find and reset the MRs in the LOCAL_INV WRs that did not
+	 * get posted. This is synchronous, and slow.
+	 */
+	for (i = 0, nchunks = req->rl_nchunks; nchunks; nchunks--) {
+		seg = &req->rl_segments[i];
+		mw = seg->rl_mw;
+		f = &mw->frmr;
 
-	seg1->rl_mw = NULL;
-	frmr->fr_state = FRMR_IS_INVALID;
-	invalidate_wr = &mw->frmr.fr_invwr;
+		if (mw->frmr.fr_mr->rkey == bad_wr->ex.invalidate_rkey) {
+			__frwr_reset_mr(ia, mw);
+			bad_wr = bad_wr->next;
+		}
 
-	memset(invalidate_wr, 0, sizeof(*invalidate_wr));
-	frmr->fr_cqe.done = frwr_wc_localinv;
-	invalidate_wr->wr_cqe = &frmr->fr_cqe;
-	invalidate_wr->opcode = IB_WR_LOCAL_INV;
-	invalidate_wr->ex.invalidate_rkey = frmr->fr_mr->rkey;
-	DECR_CQCOUNT(&r_xprt->rx_ep);
+		i += seg->mr_nsegs;
+	}
+	goto unmap;
+}
 
-	ib_dma_unmap_sg(ia->ri_device, frmr->sg, frmr->sg_nents, seg1->mr_dir);
-	read_lock(&ia->ri_qplock);
-	rc = ib_post_send(ia->ri_id->qp, invalidate_wr, &bad_wr);
-	read_unlock(&ia->ri_qplock);
-	if (rc)
-		goto out_err;
+/* Use a slow, safe mechanism to invalidate all memory regions
+ * that were registered for "req".
+ */
+static void
+frwr_op_unmap_safe(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
+		   bool sync)
+{
+	struct rpcrdma_mr_seg *seg;
+	struct rpcrdma_mw *mw;
+	unsigned int i;
 
-	rpcrdma_put_mw(r_xprt, mw);
-	return nsegs;
+	for (i = 0; req->rl_nchunks; req->rl_nchunks--) {
+		seg = &req->rl_segments[i];
+		mw = seg->rl_mw;
 
-out_err:
-	dprintk("RPC:       %s: ib_post_send status %i\n", __func__, rc);
-	__frwr_queue_recovery(mw);
-	return nsegs;
+		if (sync)
+			__frwr_reset_and_unmap(r_xprt, mw);
+		else
+			__frwr_queue_recovery(mw);
+
+		i += seg->mr_nsegs;
+		seg->mr_nsegs = 0;
+		seg->rl_mw = NULL;
+	}
 }
 
 static void
@@ -643,7 +659,7 @@ frwr_op_destroy(struct rpcrdma_buffer *buf)
 const struct rpcrdma_memreg_ops rpcrdma_frwr_memreg_ops = {
 	.ro_map				= frwr_op_map,
 	.ro_unmap_sync			= frwr_op_unmap_sync,
-	.ro_unmap			= frwr_op_unmap,
+	.ro_unmap_safe			= frwr_op_unmap_safe,
 	.ro_open			= frwr_op_open,
 	.ro_maxpages			= frwr_op_maxpages,
 	.ro_init			= frwr_op_init,
diff --git a/net/sunrpc/xprtrdma/physical_ops.c b/net/sunrpc/xprtrdma/physical_ops.c
index 481b9b6f4a15..3750596cc432 100644
--- a/net/sunrpc/xprtrdma/physical_ops.c
+++ b/net/sunrpc/xprtrdma/physical_ops.c
@@ -36,8 +36,11 @@ physical_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep,
 		       __func__, PTR_ERR(mr));
 		return -ENOMEM;
 	}
-
 	ia->ri_dma_mr = mr;
+
+	rpcrdma_set_max_header_sizes(ia, cdata, min_t(unsigned int,
+						      RPCRDMA_MAX_DATA_SEGS,
+						      RPCRDMA_MAX_HDR_SEGS));
 	return 0;
 }
 
@@ -47,7 +50,7 @@ static size_t
 physical_op_maxpages(struct rpcrdma_xprt *r_xprt)
 {
 	return min_t(unsigned int, RPCRDMA_MAX_DATA_SEGS,
-		     rpcrdma_max_segments(r_xprt));
+		     RPCRDMA_MAX_HDR_SEGS);
 }
 
 static int
@@ -71,17 +74,6 @@ physical_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
 	return 1;
 }
 
-/* Unmap a memory region, but leave it registered.
- */
-static int
-physical_op_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg)
-{
-	struct rpcrdma_ia *ia = &r_xprt->rx_ia;
-
-	rpcrdma_unmap_one(ia->ri_device, seg);
-	return 1;
-}
-
 /* DMA unmap all memory regions that were mapped for "req".
  */
 static void
@@ -94,6 +86,25 @@ physical_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
 		rpcrdma_unmap_one(device, &req->rl_segments[i++]);
 }
 
+/* Use a slow, safe mechanism to invalidate all memory regions
+ * that were registered for "req".
+ *
+ * For physical memory registration, there is no good way to
+ * fence a single MR that has been advertised to the server. The
+ * client has already handed the server an R_key that cannot be
+ * invalidated and is shared by all MRs on this connection.
+ * Tearing down the PD might be the only safe choice, but it's
+ * not clear that a freshly acquired DMA R_key would be different
+ * than the one used by the PD that was just destroyed.
+ * FIXME.
+ */
+static void
+physical_op_unmap_safe(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
+		       bool sync)
+{
+	physical_op_unmap_sync(r_xprt, req);
+}
+
 static void
 physical_op_destroy(struct rpcrdma_buffer *buf)
 {
@@ -102,7 +113,7 @@ physical_op_destroy(struct rpcrdma_buffer *buf)
 const struct rpcrdma_memreg_ops rpcrdma_physical_memreg_ops = {
 	.ro_map				= physical_op_map,
 	.ro_unmap_sync			= physical_op_unmap_sync,
-	.ro_unmap			= physical_op_unmap,
+	.ro_unmap_safe			= physical_op_unmap_safe,
 	.ro_open			= physical_op_open,
 	.ro_maxpages			= physical_op_maxpages,
 	.ro_init			= physical_op_init,
diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
index 888823bb6dae..35a81096e83d 100644
--- a/net/sunrpc/xprtrdma/rpc_rdma.c
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -61,26 +61,84 @@ enum rpcrdma_chunktype {
 	rpcrdma_replych
 };
 
-#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
 static const char transfertypes[][12] = {
-	"pure inline",	/* no chunks */
-	" read chunk",	/* some argument via rdma read */
-	"*read chunk",	/* entire request via rdma read */
-	"write chunk",	/* some result via rdma write */
+	"inline",	/* no chunks */
+	"read list",	/* some argument via rdma read */
+	"*read list",	/* entire request via rdma read */
+	"write list",	/* some result via rdma write */
 	"reply chunk"	/* entire reply via rdma write */
 };
-#endif
+
+/* Returns size of largest RPC-over-RDMA header in a Call message
+ *
+ * The largest Call header contains a full-size Read list and a
+ * minimal Reply chunk.
+ */
+static unsigned int rpcrdma_max_call_header_size(unsigned int maxsegs)
+{
+	unsigned int size;
+
+	/* Fixed header fields and list discriminators */
+	size = RPCRDMA_HDRLEN_MIN;
+
+	/* Maximum Read list size */
+	maxsegs += 2;	/* segment for head and tail buffers */
+	size = maxsegs * sizeof(struct rpcrdma_read_chunk);
+
+	/* Minimal Read chunk size */
+	size += sizeof(__be32);	/* segment count */
+	size += sizeof(struct rpcrdma_segment);
+	size += sizeof(__be32);	/* list discriminator */
+
+	dprintk("RPC:       %s: max call header size = %u\n",
+		__func__, size);
+	return size;
+}
+
+/* Returns size of largest RPC-over-RDMA header in a Reply message
+ *
+ * There is only one Write list or one Reply chunk per Reply
+ * message.  The larger list is the Write list.
+ */
+static unsigned int rpcrdma_max_reply_header_size(unsigned int maxsegs)
+{
+	unsigned int size;
+
+	/* Fixed header fields and list discriminators */
+	size = RPCRDMA_HDRLEN_MIN;
+
+	/* Maximum Write list size */
+	maxsegs += 2;	/* segment for head and tail buffers */
+	size = sizeof(__be32);		/* segment count */
+	size += maxsegs * sizeof(struct rpcrdma_segment);
+	size += sizeof(__be32);	/* list discriminator */
+
+	dprintk("RPC:       %s: max reply header size = %u\n",
+		__func__, size);
+	return size;
+}
+
+void rpcrdma_set_max_header_sizes(struct rpcrdma_ia *ia,
+				  struct rpcrdma_create_data_internal *cdata,
+				  unsigned int maxsegs)
+{
+	ia->ri_max_inline_write = cdata->inline_wsize -
+				  rpcrdma_max_call_header_size(maxsegs);
+	ia->ri_max_inline_read = cdata->inline_rsize -
+				 rpcrdma_max_reply_header_size(maxsegs);
+}
 
 /* The client can send a request inline as long as the RPCRDMA header
  * plus the RPC call fit under the transport's inline limit. If the
  * combined call message size exceeds that limit, the client must use
  * the read chunk list for this operation.
  */
-static bool rpcrdma_args_inline(struct rpc_rqst *rqst)
+static bool rpcrdma_args_inline(struct rpcrdma_xprt *r_xprt,
+				struct rpc_rqst *rqst)
 {
-	unsigned int callsize = RPCRDMA_HDRLEN_MIN + rqst->rq_snd_buf.len;
+	struct rpcrdma_ia *ia = &r_xprt->rx_ia;
 
-	return callsize <= RPCRDMA_INLINE_WRITE_THRESHOLD(rqst);
+	return rqst->rq_snd_buf.len <= ia->ri_max_inline_write;
 }
 
 /* The client can't know how large the actual reply will be. Thus it
@@ -89,11 +147,12 @@ static bool rpcrdma_args_inline(struct rpc_rqst *rqst)
  * limit, the client must provide a write list or a reply chunk for
  * this request.
  */
-static bool rpcrdma_results_inline(struct rpc_rqst *rqst)
+static bool rpcrdma_results_inline(struct rpcrdma_xprt *r_xprt,
+				   struct rpc_rqst *rqst)
 {
-	unsigned int repsize = RPCRDMA_HDRLEN_MIN + rqst->rq_rcv_buf.buflen;
+	struct rpcrdma_ia *ia = &r_xprt->rx_ia;
 
-	return repsize <= RPCRDMA_INLINE_READ_THRESHOLD(rqst);
+	return rqst->rq_rcv_buf.buflen <= ia->ri_max_inline_read;
 }
 
 static int
@@ -226,23 +285,16 @@ rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos,
 	return n;
 }
 
-/*
- * Create read/write chunk lists, and reply chunks, for RDMA
- *
- *   Assume check against THRESHOLD has been done, and chunks are required.
- *   Assume only encoding one list entry for read|write chunks. The NFSv3
- *     protocol is simple enough to allow this as it only has a single "bulk
- *     result" in each procedure - complicated NFSv4 COMPOUNDs are not. (The
- *     RDMA/Sessions NFSv4 proposal addresses this for future v4 revs.)
- *
- * When used for a single reply chunk (which is a special write
- * chunk used for the entire reply, rather than just the data), it
- * is used primarily for READDIR and READLINK which would otherwise
- * be severely size-limited by a small rdma inline read max. The server
- * response will come back as an RDMA Write, followed by a message
- * of type RDMA_NOMSG carrying the xid and length. As a result, reply
- * chunks do not provide data alignment, however they do not require
- * "fixup" (moving the response to the upper layer buffer) either.
+static inline __be32 *
+xdr_encode_rdma_segment(__be32 *iptr, struct rpcrdma_mr_seg *seg)
+{
+	*iptr++ = cpu_to_be32(seg->mr_rkey);
+	*iptr++ = cpu_to_be32(seg->mr_len);
+	return xdr_encode_hyper(iptr, seg->mr_base);
+}
+
+/* XDR-encode the Read list. Supports encoding a list of read
+ * segments that belong to a single read chunk.
  *
  * Encoding key for single-list chunks (HLOO = Handle32 Length32 Offset64):
  *
@@ -250,131 +302,190 @@ rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos,
  *   N elements, position P (same P for all chunks of same arg!):
  *    1 - PHLOO - 1 - PHLOO - ... - 1 - PHLOO - 0
  *
+ * Returns a pointer to the XDR word in the RDMA header following
+ * the end of the Read list, or an error pointer.
+ */
+static __be32 *
+rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt,
+			 struct rpcrdma_req *req, struct rpc_rqst *rqst,
+			 __be32 *iptr, enum rpcrdma_chunktype rtype)
+{
+	struct rpcrdma_mr_seg *seg = req->rl_nextseg;
+	unsigned int pos;
+	int n, nsegs;
+
+	if (rtype == rpcrdma_noch) {
+		*iptr++ = xdr_zero;	/* item not present */
+		return iptr;
+	}
+
+	pos = rqst->rq_snd_buf.head[0].iov_len;
+	if (rtype == rpcrdma_areadch)
+		pos = 0;
+	nsegs = rpcrdma_convert_iovs(&rqst->rq_snd_buf, pos, rtype, seg,
+				     RPCRDMA_MAX_SEGS - req->rl_nchunks);
+	if (nsegs < 0)
+		return ERR_PTR(nsegs);
+
+	do {
+		n = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs, false);
+		if (n <= 0)
+			return ERR_PTR(n);
+
+		*iptr++ = xdr_one;	/* item present */
+
+		/* All read segments in this chunk
+		 * have the same "position".
+		 */
+		*iptr++ = cpu_to_be32(pos);
+		iptr = xdr_encode_rdma_segment(iptr, seg);
+
+		dprintk("RPC: %5u %s: read segment pos %u "
+			"%d@0x%016llx:0x%08x (%s)\n",
+			rqst->rq_task->tk_pid, __func__, pos,
+			seg->mr_len, (unsigned long long)seg->mr_base,
+			seg->mr_rkey, n < nsegs ? "more" : "last");
+
+		r_xprt->rx_stats.read_chunk_count++;
+		req->rl_nchunks++;
+		seg += n;
+		nsegs -= n;
+	} while (nsegs);
+	req->rl_nextseg = seg;
+
+	/* Finish Read list */
+	*iptr++ = xdr_zero;	/* Next item not present */
+	return iptr;
+}
+
+/* XDR-encode the Write list. Supports encoding a list containing
+ * one array of plain segments that belong to a single write chunk.
+ *
+ * Encoding key for single-list chunks (HLOO = Handle32 Length32 Offset64):
+ *
  *  Write chunklist (a list of (one) counted array):
  *   N elements:
  *    1 - N - HLOO - HLOO - ... - HLOO - 0
  *
+ * Returns a pointer to the XDR word in the RDMA header following
+ * the end of the Write list, or an error pointer.
+ */
+static __be32 *
+rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
+			  struct rpc_rqst *rqst, __be32 *iptr,
+			  enum rpcrdma_chunktype wtype)
+{
+	struct rpcrdma_mr_seg *seg = req->rl_nextseg;
+	int n, nsegs, nchunks;
+	__be32 *segcount;
+
+	if (wtype != rpcrdma_writech) {
+		*iptr++ = xdr_zero;	/* no Write list present */
+		return iptr;
+	}
+
+	nsegs = rpcrdma_convert_iovs(&rqst->rq_rcv_buf,
+				     rqst->rq_rcv_buf.head[0].iov_len,
+				     wtype, seg,
+				     RPCRDMA_MAX_SEGS - req->rl_nchunks);
+	if (nsegs < 0)
+		return ERR_PTR(nsegs);
+
+	*iptr++ = xdr_one;	/* Write list present */
+	segcount = iptr++;	/* save location of segment count */
+
+	nchunks = 0;
+	do {
+		n = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs, true);
+		if (n <= 0)
+			return ERR_PTR(n);
+
+		iptr = xdr_encode_rdma_segment(iptr, seg);
+
+		dprintk("RPC: %5u %s: write segment "
+			"%d@0x016%llx:0x%08x (%s)\n",
+			rqst->rq_task->tk_pid, __func__,
+			seg->mr_len, (unsigned long long)seg->mr_base,
+			seg->mr_rkey, n < nsegs ? "more" : "last");
+
+		r_xprt->rx_stats.write_chunk_count++;
+		r_xprt->rx_stats.total_rdma_request += seg->mr_len;
+		req->rl_nchunks++;
+		nchunks++;
+		seg   += n;
+		nsegs -= n;
+	} while (nsegs);
+	req->rl_nextseg = seg;
+
+	/* Update count of segments in this Write chunk */
+	*segcount = cpu_to_be32(nchunks);
+
+	/* Finish Write list */
+	*iptr++ = xdr_zero;	/* Next item not present */
+	return iptr;
+}
+
+/* XDR-encode the Reply chunk. Supports encoding an array of plain
+ * segments that belong to a single write (reply) chunk.
+ *
+ * Encoding key for single-list chunks (HLOO = Handle32 Length32 Offset64):
+ *
  *  Reply chunk (a counted array):
  *   N elements:
  *    1 - N - HLOO - HLOO - ... - HLOO
  *
- * Returns positive RPC/RDMA header size, or negative errno.
+ * Returns a pointer to the XDR word in the RDMA header following
+ * the end of the Reply chunk, or an error pointer.
  */
-
-static ssize_t
-rpcrdma_create_chunks(struct rpc_rqst *rqst, struct xdr_buf *target,
-		struct rpcrdma_msg *headerp, enum rpcrdma_chunktype type)
+static __be32 *
+rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt,
+			   struct rpcrdma_req *req, struct rpc_rqst *rqst,
+			   __be32 *iptr, enum rpcrdma_chunktype wtype)
 {
-	struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
-	struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(rqst->rq_xprt);
-	int n, nsegs, nchunks = 0;
-	unsigned int pos;
-	struct rpcrdma_mr_seg *seg = req->rl_segments;
-	struct rpcrdma_read_chunk *cur_rchunk = NULL;
-	struct rpcrdma_write_array *warray = NULL;
-	struct rpcrdma_write_chunk *cur_wchunk = NULL;
-	__be32 *iptr = headerp->rm_body.rm_chunks;
-	int (*map)(struct rpcrdma_xprt *, struct rpcrdma_mr_seg *, int, bool);
-
-	if (type == rpcrdma_readch || type == rpcrdma_areadch) {
-		/* a read chunk - server will RDMA Read our memory */
-		cur_rchunk = (struct rpcrdma_read_chunk *) iptr;
-	} else {
-		/* a write or reply chunk - server will RDMA Write our memory */
-		*iptr++ = xdr_zero;	/* encode a NULL read chunk list */
-		if (type == rpcrdma_replych)
-			*iptr++ = xdr_zero;	/* a NULL write chunk list */
-		warray = (struct rpcrdma_write_array *) iptr;
-		cur_wchunk = (struct rpcrdma_write_chunk *) (warray + 1);
-	}
+	struct rpcrdma_mr_seg *seg = req->rl_nextseg;
+	int n, nsegs, nchunks;
+	__be32 *segcount;
 
-	if (type == rpcrdma_replych || type == rpcrdma_areadch)
-		pos = 0;
-	else
-		pos = target->head[0].iov_len;
+	if (wtype != rpcrdma_replych) {
+		*iptr++ = xdr_zero;	/* no Reply chunk present */
+		return iptr;
+	}
 
-	nsegs = rpcrdma_convert_iovs(target, pos, type, seg, RPCRDMA_MAX_SEGS);
+	nsegs = rpcrdma_convert_iovs(&rqst->rq_rcv_buf, 0, wtype, seg,
+				     RPCRDMA_MAX_SEGS - req->rl_nchunks);
 	if (nsegs < 0)
-		return nsegs;
+		return ERR_PTR(nsegs);
 
-	map = r_xprt->rx_ia.ri_ops->ro_map;
+	*iptr++ = xdr_one;	/* Reply chunk present */
+	segcount = iptr++;	/* save location of segment count */
+
+	nchunks = 0;
 	do {
-		n = map(r_xprt, seg, nsegs, cur_wchunk != NULL);
+		n = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs, true);
 		if (n <= 0)
-			goto out;
-		if (cur_rchunk) {	/* read */
-			cur_rchunk->rc_discrim = xdr_one;
-			/* all read chunks have the same "position" */
-			cur_rchunk->rc_position = cpu_to_be32(pos);
-			cur_rchunk->rc_target.rs_handle =
-						cpu_to_be32(seg->mr_rkey);
-			cur_rchunk->rc_target.rs_length =
-						cpu_to_be32(seg->mr_len);
-			xdr_encode_hyper(
-					(__be32 *)&cur_rchunk->rc_target.rs_offset,
-					seg->mr_base);
-			dprintk("RPC:       %s: read chunk "
-				"elem %d@0x%llx:0x%x pos %u (%s)\n", __func__,
-				seg->mr_len, (unsigned long long)seg->mr_base,
-				seg->mr_rkey, pos, n < nsegs ? "more" : "last");
-			cur_rchunk++;
-			r_xprt->rx_stats.read_chunk_count++;
-		} else {		/* write/reply */
-			cur_wchunk->wc_target.rs_handle =
-						cpu_to_be32(seg->mr_rkey);
-			cur_wchunk->wc_target.rs_length =
-						cpu_to_be32(seg->mr_len);
-			xdr_encode_hyper(
-					(__be32 *)&cur_wchunk->wc_target.rs_offset,
-					seg->mr_base);
-			dprintk("RPC:       %s: %s chunk "
-				"elem %d@0x%llx:0x%x (%s)\n", __func__,
-				(type == rpcrdma_replych) ? "reply" : "write",
-				seg->mr_len, (unsigned long long)seg->mr_base,
-				seg->mr_rkey, n < nsegs ? "more" : "last");
-			cur_wchunk++;
-			if (type == rpcrdma_replych)
-				r_xprt->rx_stats.reply_chunk_count++;
-			else
-				r_xprt->rx_stats.write_chunk_count++;
-			r_xprt->rx_stats.total_rdma_request += seg->mr_len;
-		}
+			return ERR_PTR(n);
+
+		iptr = xdr_encode_rdma_segment(iptr, seg);
+
+		dprintk("RPC: %5u %s: reply segment "
+			"%d@0x%016llx:0x%08x (%s)\n",
+			rqst->rq_task->tk_pid, __func__,
+			seg->mr_len, (unsigned long long)seg->mr_base,
+			seg->mr_rkey, n < nsegs ? "more" : "last");
+
+		r_xprt->rx_stats.reply_chunk_count++;
+		r_xprt->rx_stats.total_rdma_request += seg->mr_len;
+		req->rl_nchunks++;
 		nchunks++;
 		seg   += n;
 		nsegs -= n;
 	} while (nsegs);
+	req->rl_nextseg = seg;
 
-	/* success. all failures return above */
-	req->rl_nchunks = nchunks;
-
-	/*
-	 * finish off header. If write, marshal discrim and nchunks.
-	 */
-	if (cur_rchunk) {
-		iptr = (__be32 *) cur_rchunk;
-		*iptr++ = xdr_zero;	/* finish the read chunk list */
-		*iptr++ = xdr_zero;	/* encode a NULL write chunk list */
-		*iptr++ = xdr_zero;	/* encode a NULL reply chunk */
-	} else {
-		warray->wc_discrim = xdr_one;
-		warray->wc_nchunks = cpu_to_be32(nchunks);
-		iptr = (__be32 *) cur_wchunk;
-		if (type == rpcrdma_writech) {
-			*iptr++ = xdr_zero; /* finish the write chunk list */
-			*iptr++ = xdr_zero; /* encode a NULL reply chunk */
-		}
-	}
-
-	/*
-	 * Return header size.
-	 */
-	return (unsigned char *)iptr - (unsigned char *)headerp;
+	/* Update count of segments in the Reply chunk */
+	*segcount = cpu_to_be32(nchunks);
 
-out:
-	for (pos = 0; nchunks--;)
-		pos += r_xprt->rx_ia.ri_ops->ro_unmap(r_xprt,
-						      &req->rl_segments[pos]);
-	return n;
+	return iptr;
 }
 
 /*
@@ -440,13 +551,10 @@ static void rpcrdma_inline_pullup(struct rpc_rqst *rqst)
  * Marshal a request: the primary job of this routine is to choose
  * the transfer modes. See comments below.
  *
- * Uses multiple RDMA IOVs for a request:
- *  [0] -- RPC RDMA header, which uses memory from the *start* of the
- *         preregistered buffer that already holds the RPC data in
- *         its middle.
- *  [1] -- the RPC header/data, marshaled by RPC and the NFS protocol.
- *  [2] -- optional padding.
- *  [3] -- if padded, header only in [1] and data here.
+ * Prepares up to two IOVs per Call message:
+ *
+ *  [0] -- RPC RDMA header
+ *  [1] -- the RPC header/data
  *
  * Returns zero on success, otherwise a negative errno.
  */
@@ -457,24 +565,17 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
 	struct rpc_xprt *xprt = rqst->rq_xprt;
 	struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
 	struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
-	char *base;
-	size_t rpclen;
-	ssize_t hdrlen;
 	enum rpcrdma_chunktype rtype, wtype;
 	struct rpcrdma_msg *headerp;
+	ssize_t hdrlen;
+	size_t rpclen;
+	__be32 *iptr;
 
 #if defined(CONFIG_SUNRPC_BACKCHANNEL)
 	if (test_bit(RPC_BC_PA_IN_USE, &rqst->rq_bc_pa_state))
 		return rpcrdma_bc_marshal_reply(rqst);
 #endif
 
-	/*
-	 * rpclen gets amount of data in first buffer, which is the
-	 * pre-registered buffer.
-	 */
-	base = rqst->rq_svec[0].iov_base;
-	rpclen = rqst->rq_svec[0].iov_len;
-
 	headerp = rdmab_to_msg(req->rl_rdmabuf);
 	/* don't byte-swap XID, it's already done in request */
 	headerp->rm_xid = rqst->rq_xid;
@@ -485,15 +586,16 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
 	/*
 	 * Chunks needed for results?
 	 *
-	 * o Read ops return data as write chunk(s), header as inline.
 	 * o If the expected result is under the inline threshold, all ops
 	 *   return as inline.
+	 * o Large read ops return data as write chunk(s), header as
+	 *   inline.
 	 * o Large non-read ops return as a single reply chunk.
 	 */
-	if (rqst->rq_rcv_buf.flags & XDRBUF_READ)
-		wtype = rpcrdma_writech;
-	else if (rpcrdma_results_inline(rqst))
+	if (rpcrdma_results_inline(r_xprt, rqst))
 		wtype = rpcrdma_noch;
+	else if (rqst->rq_rcv_buf.flags & XDRBUF_READ)
+		wtype = rpcrdma_writech;
 	else
 		wtype = rpcrdma_replych;
 
@@ -511,10 +613,14 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
 	 * that both has a data payload, and whose non-data arguments
 	 * by themselves are larger than the inline threshold.
 	 */
-	if (rpcrdma_args_inline(rqst)) {
+	if (rpcrdma_args_inline(r_xprt, rqst)) {
 		rtype = rpcrdma_noch;
+		rpcrdma_inline_pullup(rqst);
+		rpclen = rqst->rq_svec[0].iov_len;
 	} else if (rqst->rq_snd_buf.flags & XDRBUF_WRITE) {
 		rtype = rpcrdma_readch;
+		rpclen = rqst->rq_svec[0].iov_len;
+		rpclen += rpcrdma_tail_pullup(&rqst->rq_snd_buf);
 	} else {
 		r_xprt->rx_stats.nomsg_call_count++;
 		headerp->rm_type = htonl(RDMA_NOMSG);
@@ -522,57 +628,50 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
 		rpclen = 0;
 	}
 
-	/* The following simplification is not true forever */
-	if (rtype != rpcrdma_noch && wtype == rpcrdma_replych)
-		wtype = rpcrdma_noch;
-	if (rtype != rpcrdma_noch && wtype != rpcrdma_noch) {
-		dprintk("RPC:       %s: cannot marshal multiple chunk lists\n",
-			__func__);
-		return -EIO;
-	}
-
-	hdrlen = RPCRDMA_HDRLEN_MIN;
-
-	/*
-	 * Pull up any extra send data into the preregistered buffer.
-	 * When padding is in use and applies to the transfer, insert
-	 * it and change the message type.
+	/* This implementation supports the following combinations
+	 * of chunk lists in one RPC-over-RDMA Call message:
+	 *
+	 *   - Read list
+	 *   - Write list
+	 *   - Reply chunk
+	 *   - Read list + Reply chunk
+	 *
+	 * It might not yet support the following combinations:
+	 *
+	 *   - Read list + Write list
+	 *
+	 * It does not support the following combinations:
+	 *
+	 *   - Write list + Reply chunk
+	 *   - Read list + Write list + Reply chunk
+	 *
+	 * This implementation supports only a single chunk in each
+	 * Read or Write list. Thus for example the client cannot
+	 * send a Call message with a Position Zero Read chunk and a
+	 * regular Read chunk at the same time.
 	 */
-	if (rtype == rpcrdma_noch) {
-
-		rpcrdma_inline_pullup(rqst);
-
-		headerp->rm_body.rm_nochunks.rm_empty[0] = xdr_zero;
-		headerp->rm_body.rm_nochunks.rm_empty[1] = xdr_zero;
-		headerp->rm_body.rm_nochunks.rm_empty[2] = xdr_zero;
-		/* new length after pullup */
-		rpclen = rqst->rq_svec[0].iov_len;
-	} else if (rtype == rpcrdma_readch)
-		rpclen += rpcrdma_tail_pullup(&rqst->rq_snd_buf);
-	if (rtype != rpcrdma_noch) {
-		hdrlen = rpcrdma_create_chunks(rqst, &rqst->rq_snd_buf,
-					       headerp, rtype);
-		wtype = rtype;	/* simplify dprintk */
-
-	} else if (wtype != rpcrdma_noch) {
-		hdrlen = rpcrdma_create_chunks(rqst, &rqst->rq_rcv_buf,
-					       headerp, wtype);
-	}
-	if (hdrlen < 0)
-		return hdrlen;
+	req->rl_nchunks = 0;
+	req->rl_nextseg = req->rl_segments;
+	iptr = headerp->rm_body.rm_chunks;
+	iptr = rpcrdma_encode_read_list(r_xprt, req, rqst, iptr, rtype);
+	if (IS_ERR(iptr))
+		goto out_unmap;
+	iptr = rpcrdma_encode_write_list(r_xprt, req, rqst, iptr, wtype);
+	if (IS_ERR(iptr))
+		goto out_unmap;
+	iptr = rpcrdma_encode_reply_chunk(r_xprt, req, rqst, iptr, wtype);
+	if (IS_ERR(iptr))
+		goto out_unmap;
+	hdrlen = (unsigned char *)iptr - (unsigned char *)headerp;
+
+	if (hdrlen + rpclen > RPCRDMA_INLINE_WRITE_THRESHOLD(rqst))
+		goto out_overflow;
+
+	dprintk("RPC: %5u %s: %s/%s: hdrlen %zd rpclen %zd\n",
+		rqst->rq_task->tk_pid, __func__,
+		transfertypes[rtype], transfertypes[wtype],
+		hdrlen, rpclen);
 
-	dprintk("RPC:       %s: %s: hdrlen %zd rpclen %zd"
-		" headerp 0x%p base 0x%p lkey 0x%x\n",
-		__func__, transfertypes[wtype], hdrlen, rpclen,
-		headerp, base, rdmab_lkey(req->rl_rdmabuf));
-
-	/*
-	 * initialize send_iov's - normally only two: rdma chunk header and
-	 * single preregistered RPC header buffer, but if padding is present,
-	 * then use a preregistered (and zeroed) pad buffer between the RPC
-	 * header and any write data. In all non-rdma cases, any following
-	 * data has been copied into the RPC header buffer.
-	 */
 	req->rl_send_iov[0].addr = rdmab_addr(req->rl_rdmabuf);
 	req->rl_send_iov[0].length = hdrlen;
 	req->rl_send_iov[0].lkey = rdmab_lkey(req->rl_rdmabuf);
@@ -587,6 +686,18 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
 
 	req->rl_niovs = 2;
 	return 0;
+
+out_overflow:
+	pr_err("rpcrdma: send overflow: hdrlen %zd rpclen %zu %s/%s\n",
+		hdrlen, rpclen, transfertypes[rtype], transfertypes[wtype]);
+	/* Terminate this RPC. Chunks registered above will be
+	 * released by xprt_release -> xprt_rmda_free .
+	 */
+	return -EIO;
+
+out_unmap:
+	r_xprt->rx_ia.ri_ops->ro_unmap_safe(r_xprt, req, false);
+	return PTR_ERR(iptr);
 }
 
 /*
diff --git a/net/sunrpc/xprtrdma/svc_rdma_marshal.c b/net/sunrpc/xprtrdma/svc_rdma_marshal.c
index 765bca47c74d..0ba9887f3e22 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_marshal.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_marshal.c
@@ -145,19 +145,32 @@ static __be32 *decode_reply_array(__be32 *va, __be32 *vaend)
 	return (__be32 *)&ary->wc_array[nchunks];
 }
 
-int svc_rdma_xdr_decode_req(struct rpcrdma_msg *rmsgp, struct svc_rqst *rqstp)
+/**
+ * svc_rdma_xdr_decode_req - Parse incoming RPC-over-RDMA header
+ * @rq_arg: Receive buffer
+ *
+ * On entry, xdr->head[0].iov_base points to first byte in the
+ * RPC-over-RDMA header.
+ *
+ * On successful exit, head[0] points to first byte past the
+ * RPC-over-RDMA header. For RDMA_MSG, this is the RPC message.
+ * The length of the RPC-over-RDMA header is returned.
+ */
+int svc_rdma_xdr_decode_req(struct xdr_buf *rq_arg)
 {
+	struct rpcrdma_msg *rmsgp;
 	__be32 *va, *vaend;
 	unsigned int len;
 	u32 hdr_len;
 
 	/* Verify that there's enough bytes for header + something */
-	if (rqstp->rq_arg.len <= RPCRDMA_HDRLEN_ERR) {
+	if (rq_arg->len <= RPCRDMA_HDRLEN_ERR) {
 		dprintk("svcrdma: header too short = %d\n",
-			rqstp->rq_arg.len);
+			rq_arg->len);
 		return -EINVAL;
 	}
 
+	rmsgp = (struct rpcrdma_msg *)rq_arg->head[0].iov_base;
 	if (rmsgp->rm_vers != rpcrdma_version) {
 		dprintk("%s: bad version %u\n", __func__,
 			be32_to_cpu(rmsgp->rm_vers));
@@ -189,10 +202,10 @@ int svc_rdma_xdr_decode_req(struct rpcrdma_msg *rmsgp, struct svc_rqst *rqstp)
 			be32_to_cpu(rmsgp->rm_body.rm_padded.rm_thresh);
 
 		va = &rmsgp->rm_body.rm_padded.rm_pempty[4];
-		rqstp->rq_arg.head[0].iov_base = va;
+		rq_arg->head[0].iov_base = va;
 		len = (u32)((unsigned long)va - (unsigned long)rmsgp);
-		rqstp->rq_arg.head[0].iov_len -= len;
-		if (len > rqstp->rq_arg.len)
+		rq_arg->head[0].iov_len -= len;
+		if (len > rq_arg->len)
 			return -EINVAL;
 		return len;
 	default:
@@ -205,7 +218,7 @@ int svc_rdma_xdr_decode_req(struct rpcrdma_msg *rmsgp, struct svc_rqst *rqstp)
 	 * chunk list and a reply chunk list.
 	 */
 	va = &rmsgp->rm_body.rm_chunks[0];
-	vaend = (__be32 *)((unsigned long)rmsgp + rqstp->rq_arg.len);
+	vaend = (__be32 *)((unsigned long)rmsgp + rq_arg->len);
 	va = decode_read_list(va, vaend);
 	if (!va) {
 		dprintk("svcrdma: failed to decode read list\n");
@@ -222,10 +235,9 @@ int svc_rdma_xdr_decode_req(struct rpcrdma_msg *rmsgp, struct svc_rqst *rqstp)
 		return -EINVAL;
 	}
 
-	rqstp->rq_arg.head[0].iov_base = va;
+	rq_arg->head[0].iov_base = va;
 	hdr_len = (unsigned long)va - (unsigned long)rmsgp;
-	rqstp->rq_arg.head[0].iov_len -= hdr_len;
-
+	rq_arg->head[0].iov_len -= hdr_len;
 	return hdr_len;
 }
 
diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
index 3b24a646eb46..2c25606f2561 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
@@ -281,7 +281,7 @@ int rdma_read_chunk_frmr(struct svcxprt_rdma *xprt,
 	}
 	atomic_inc(&xprt->sc_dma_used);
 
-	n = ib_map_mr_sg(frmr->mr, frmr->sg, frmr->sg_nents, PAGE_SIZE);
+	n = ib_map_mr_sg(frmr->mr, frmr->sg, frmr->sg_nents, NULL, PAGE_SIZE);
 	if (unlikely(n != frmr->sg_nents)) {
 		pr_err("svcrdma: failed to map mr %p (%d/%d elements)\n",
 		       frmr->mr, n, frmr->sg_nents);
@@ -447,10 +447,8 @@ static int rdma_read_chunks(struct svcxprt_rdma *xprt,
 	head->arg.len = rqstp->rq_arg.len;
 	head->arg.buflen = rqstp->rq_arg.buflen;
 
-	ch = (struct rpcrdma_read_chunk *)&rmsgp->rm_body.rm_chunks[0];
-	position = be32_to_cpu(ch->rc_position);
-
 	/* RDMA_NOMSG: RDMA READ data should land just after RDMA RECV data */
+	position = be32_to_cpu(ch->rc_position);
 	if (position == 0) {
 		head->arg.pages = &head->pages[0];
 		page_offset = head->byte_len;
@@ -488,7 +486,7 @@ static int rdma_read_chunks(struct svcxprt_rdma *xprt,
 	if (page_offset & 3) {
 		u32 pad = 4 - (page_offset & 3);
 
-		head->arg.page_len += pad;
+		head->arg.tail[0].iov_len += pad;
 		head->arg.len += pad;
 		head->arg.buflen += pad;
 		page_offset += pad;
@@ -510,11 +508,10 @@ static int rdma_read_chunks(struct svcxprt_rdma *xprt,
 	return ret;
 }
 
-static int rdma_read_complete(struct svc_rqst *rqstp,
-			      struct svc_rdma_op_ctxt *head)
+static void rdma_read_complete(struct svc_rqst *rqstp,
+			       struct svc_rdma_op_ctxt *head)
 {
 	int page_no;
-	int ret;
 
 	/* Copy RPC pages */
 	for (page_no = 0; page_no < head->count; page_no++) {
@@ -550,23 +547,6 @@ static int rdma_read_complete(struct svc_rqst *rqstp,
 	rqstp->rq_arg.tail[0] = head->arg.tail[0];
 	rqstp->rq_arg.len = head->arg.len;
 	rqstp->rq_arg.buflen = head->arg.buflen;
-
-	/* Free the context */
-	svc_rdma_put_context(head, 0);
-
-	/* XXX: What should this be? */
-	rqstp->rq_prot = IPPROTO_MAX;
-	svc_xprt_copy_addrs(rqstp, rqstp->rq_xprt);
-
-	ret = rqstp->rq_arg.head[0].iov_len
-		+ rqstp->rq_arg.page_len
-		+ rqstp->rq_arg.tail[0].iov_len;
-	dprintk("svcrdma: deferred read ret=%d, rq_arg.len=%u, "
-		"rq_arg.head[0].iov_base=%p, rq_arg.head[0].iov_len=%zu\n",
-		ret, rqstp->rq_arg.len,	rqstp->rq_arg.head[0].iov_base,
-		rqstp->rq_arg.head[0].iov_len);
-
-	return ret;
 }
 
 /* By convention, backchannel calls arrive via rdma_msg type
@@ -624,7 +604,8 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp)
 				  dto_q);
 		list_del_init(&ctxt->dto_q);
 		spin_unlock_bh(&rdma_xprt->sc_rq_dto_lock);
-		return rdma_read_complete(rqstp, ctxt);
+		rdma_read_complete(rqstp, ctxt);
+		goto complete;
 	} else if (!list_empty(&rdma_xprt->sc_rq_dto_q)) {
 		ctxt = list_entry(rdma_xprt->sc_rq_dto_q.next,
 				  struct svc_rdma_op_ctxt,
@@ -655,7 +636,7 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp)
 
 	/* Decode the RDMA header. */
 	rmsgp = (struct rpcrdma_msg *)rqstp->rq_arg.head[0].iov_base;
-	ret = svc_rdma_xdr_decode_req(rmsgp, rqstp);
+	ret = svc_rdma_xdr_decode_req(&rqstp->rq_arg);
 	if (ret < 0)
 		goto out_err;
 	if (ret == 0)
@@ -682,6 +663,7 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp)
 		return 0;
 	}
 
+complete:
 	ret = rqstp->rq_arg.head[0].iov_len
 		+ rqstp->rq_arg.page_len
 		+ rqstp->rq_arg.tail[0].iov_len;
diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
index 4f1b1c4f45f9..54d533300620 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
@@ -463,25 +463,21 @@ static int send_reply(struct svcxprt_rdma *rdma,
 		      struct svc_rqst *rqstp,
 		      struct page *page,
 		      struct rpcrdma_msg *rdma_resp,
-		      struct svc_rdma_op_ctxt *ctxt,
 		      struct svc_rdma_req_map *vec,
 		      int byte_count)
 {
+	struct svc_rdma_op_ctxt *ctxt;
 	struct ib_send_wr send_wr;
 	u32 xdr_off;
 	int sge_no;
 	int sge_bytes;
 	int page_no;
 	int pages;
-	int ret;
-
-	ret = svc_rdma_repost_recv(rdma, GFP_KERNEL);
-	if (ret) {
-		svc_rdma_put_context(ctxt, 0);
-		return -ENOTCONN;
-	}
+	int ret = -EIO;
 
 	/* Prepare the context */
+	ctxt = svc_rdma_get_context(rdma);
+	ctxt->direction = DMA_TO_DEVICE;
 	ctxt->pages[0] = page;
 	ctxt->count = 1;
 
@@ -565,8 +561,7 @@ static int send_reply(struct svcxprt_rdma *rdma,
  err:
 	svc_rdma_unmap_dma(ctxt);
 	svc_rdma_put_context(ctxt, 1);
-	pr_err("svcrdma: failed to send reply, rc=%d\n", ret);
-	return -EIO;
+	return ret;
 }
 
 void svc_rdma_prep_reply_hdr(struct svc_rqst *rqstp)
@@ -585,7 +580,6 @@ int svc_rdma_sendto(struct svc_rqst *rqstp)
 	int ret;
 	int inline_bytes;
 	struct page *res_page;
-	struct svc_rdma_op_ctxt *ctxt;
 	struct svc_rdma_req_map *vec;
 
 	dprintk("svcrdma: sending response for rqstp=%p\n", rqstp);
@@ -598,8 +592,6 @@ int svc_rdma_sendto(struct svc_rqst *rqstp)
 	rp_ary = svc_rdma_get_reply_array(rdma_argp, wr_ary);
 
 	/* Build an req vec for the XDR */
-	ctxt = svc_rdma_get_context(rdma);
-	ctxt->direction = DMA_TO_DEVICE;
 	vec = svc_rdma_get_req_map(rdma);
 	ret = svc_rdma_map_xdr(rdma, &rqstp->rq_res, vec, wr_ary != NULL);
 	if (ret)
@@ -635,7 +627,12 @@ int svc_rdma_sendto(struct svc_rqst *rqstp)
 		inline_bytes -= ret;
 	}
 
-	ret = send_reply(rdma, rqstp, res_page, rdma_resp, ctxt, vec,
+	/* Post a fresh Receive buffer _before_ sending the reply */
+	ret = svc_rdma_post_recv(rdma, GFP_KERNEL);
+	if (ret)
+		goto err1;
+
+	ret = send_reply(rdma, rqstp, res_page, rdma_resp, vec,
 			 inline_bytes);
 	if (ret < 0)
 		goto err1;
@@ -648,7 +645,8 @@ int svc_rdma_sendto(struct svc_rqst *rqstp)
 	put_page(res_page);
  err0:
 	svc_rdma_put_req_map(rdma, vec);
-	svc_rdma_put_context(ctxt, 0);
+	pr_err("svcrdma: Could not send reply, err=%d. Closing transport.\n",
+	       ret);
 	set_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags);
 	return -ENOTCONN;
 }
diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
index 90668969d559..dd9440137834 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
@@ -789,7 +789,7 @@ static struct svc_xprt *svc_rdma_create(struct svc_serv *serv,
 	int ret;
 
 	dprintk("svcrdma: Creating RDMA socket\n");
-	if (sa->sa_family != AF_INET) {
+	if ((sa->sa_family != AF_INET) && (sa->sa_family != AF_INET6)) {
 		dprintk("svcrdma: Address family %d is not supported.\n", sa->sa_family);
 		return ERR_PTR(-EAFNOSUPPORT);
 	}
@@ -805,6 +805,16 @@ static struct svc_xprt *svc_rdma_create(struct svc_serv *serv,
 		goto err0;
 	}
 
+	/* Allow both IPv4 and IPv6 sockets to bind a single port
+	 * at the same time.
+	 */
+#if IS_ENABLED(CONFIG_IPV6)
+	ret = rdma_set_afonly(listen_id, 1);
+	if (ret) {
+		dprintk("svcrdma: rdma_set_afonly failed = %d\n", ret);
+		goto err1;
+	}
+#endif
 	ret = rdma_bind_addr(listen_id, sa);
 	if (ret) {
 		dprintk("svcrdma: rdma_bind_addr failed = %d\n", ret);
@@ -1073,7 +1083,7 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
 		newxprt->sc_dev_caps |= SVCRDMA_DEVCAP_READ_W_INV;
 
 	/* Post receive buffers */
-	for (i = 0; i < newxprt->sc_rq_depth; i++) {
+	for (i = 0; i < newxprt->sc_max_requests; i++) {
 		ret = svc_rdma_post_recv(newxprt, GFP_KERNEL);
 		if (ret) {
 			dprintk("svcrdma: failure posting receive buffers\n");
@@ -1170,6 +1180,9 @@ static void __svc_rdma_free(struct work_struct *work)
 
 	dprintk("svcrdma: %s(%p)\n", __func__, rdma);
 
+	if (rdma->sc_qp && !IS_ERR(rdma->sc_qp))
+		ib_drain_qp(rdma->sc_qp);
+
 	/* We should only be called from kref_put */
 	if (atomic_read(&xprt->xpt_ref.refcount) != 0)
 		pr_err("svcrdma: sc_xprt still in use? (%d)\n",
diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
index b1b009f10ea3..99d2e5b72726 100644
--- a/net/sunrpc/xprtrdma/transport.c
+++ b/net/sunrpc/xprtrdma/transport.c
@@ -73,6 +73,8 @@ static unsigned int xprt_rdma_memreg_strategy = RPCRDMA_FRMR;
 
 static unsigned int min_slot_table_size = RPCRDMA_MIN_SLOT_TABLE;
 static unsigned int max_slot_table_size = RPCRDMA_MAX_SLOT_TABLE;
+static unsigned int min_inline_size = RPCRDMA_MIN_INLINE;
+static unsigned int max_inline_size = RPCRDMA_MAX_INLINE;
 static unsigned int zero;
 static unsigned int max_padding = PAGE_SIZE;
 static unsigned int min_memreg = RPCRDMA_BOUNCEBUFFERS;
@@ -96,6 +98,8 @@ static struct ctl_table xr_tunables_table[] = {
 		.maxlen		= sizeof(unsigned int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec,
+		.extra1		= &min_inline_size,
+		.extra2		= &max_inline_size,
 	},
 	{
 		.procname	= "rdma_max_inline_write",
@@ -103,6 +107,8 @@ static struct ctl_table xr_tunables_table[] = {
 		.maxlen		= sizeof(unsigned int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec,
+		.extra1		= &min_inline_size,
+		.extra2		= &max_inline_size,
 	},
 	{
 		.procname	= "rdma_inline_write_padding",
@@ -508,6 +514,7 @@ xprt_rdma_allocate(struct rpc_task *task, size_t size)
 out:
 	dprintk("RPC:       %s: size %zd, request 0x%p\n", __func__, size, req);
 	req->rl_connect_cookie = 0;	/* our reserved value */
+	req->rl_task = task;
 	return req->rl_sendbuf->rg_base;
 
 out_rdmabuf:
@@ -564,7 +571,6 @@ xprt_rdma_free(void *buffer)
 	struct rpcrdma_req *req;
 	struct rpcrdma_xprt *r_xprt;
 	struct rpcrdma_regbuf *rb;
-	int i;
 
 	if (buffer == NULL)
 		return;
@@ -578,11 +584,8 @@ xprt_rdma_free(void *buffer)
 
 	dprintk("RPC:       %s: called on 0x%p\n", __func__, req->rl_reply);
 
-	for (i = 0; req->rl_nchunks;) {
-		--req->rl_nchunks;
-		i += r_xprt->rx_ia.ri_ops->ro_unmap(r_xprt,
-						    &req->rl_segments[i]);
-	}
+	r_xprt->rx_ia.ri_ops->ro_unmap_safe(r_xprt, req,
+					    !RPC_IS_ASYNC(req->rl_task));
 
 	rpcrdma_buffer_put(req);
 }
@@ -707,6 +710,7 @@ static struct rpc_xprt_ops xprt_rdma_procs = {
 #if defined(CONFIG_SUNRPC_BACKCHANNEL)
 	.bc_setup		= xprt_rdma_bc_setup,
 	.bc_up			= xprt_rdma_bc_up,
+	.bc_maxpayload		= xprt_rdma_bc_maxpayload,
 	.bc_free_rqst		= xprt_rdma_bc_free_rqst,
 	.bc_destroy		= xprt_rdma_bc_destroy,
 #endif
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index f5ed9f982cd7..b044d98a1370 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -203,15 +203,6 @@ out_fail:
 	goto out_schedule;
 }
 
-static void
-rpcrdma_flush_cqs(struct rpcrdma_ep *ep)
-{
-	struct ib_wc wc;
-
-	while (ib_poll_cq(ep->rep_attr.recv_cq, 1, &wc) > 0)
-		rpcrdma_receive_wc(NULL, &wc);
-}
-
 static int
 rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event)
 {
@@ -374,23 +365,6 @@ out:
 }
 
 /*
- * Drain any cq, prior to teardown.
- */
-static void
-rpcrdma_clean_cq(struct ib_cq *cq)
-{
-	struct ib_wc wc;
-	int count = 0;
-
-	while (1 == ib_poll_cq(cq, 1, &wc))
-		++count;
-
-	if (count)
-		dprintk("RPC:       %s: flushed %d events (last 0x%x)\n",
-			__func__, count, wc.opcode);
-}
-
-/*
  * Exported functions.
  */
 
@@ -459,7 +433,6 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
 	dprintk("RPC:       %s: memory registration strategy is '%s'\n",
 		__func__, ia->ri_ops->ro_displayname);
 
-	rwlock_init(&ia->ri_qplock);
 	return 0;
 
 out3:
@@ -515,7 +488,7 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
 			__func__);
 		return -ENOMEM;
 	}
-	max_qp_wr = ia->ri_device->attrs.max_qp_wr - RPCRDMA_BACKWARD_WRS;
+	max_qp_wr = ia->ri_device->attrs.max_qp_wr - RPCRDMA_BACKWARD_WRS - 1;
 
 	/* check provider's send/recv wr limits */
 	if (cdata->max_requests > max_qp_wr)
@@ -526,11 +499,13 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
 	ep->rep_attr.srq = NULL;
 	ep->rep_attr.cap.max_send_wr = cdata->max_requests;
 	ep->rep_attr.cap.max_send_wr += RPCRDMA_BACKWARD_WRS;
+	ep->rep_attr.cap.max_send_wr += 1;	/* drain cqe */
 	rc = ia->ri_ops->ro_open(ia, ep, cdata);
 	if (rc)
 		return rc;
 	ep->rep_attr.cap.max_recv_wr = cdata->max_requests;
 	ep->rep_attr.cap.max_recv_wr += RPCRDMA_BACKWARD_WRS;
+	ep->rep_attr.cap.max_recv_wr += 1;	/* drain cqe */
 	ep->rep_attr.cap.max_send_sge = RPCRDMA_MAX_IOVS;
 	ep->rep_attr.cap.max_recv_sge = 1;
 	ep->rep_attr.cap.max_inline_data = 0;
@@ -578,6 +553,7 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
 	ep->rep_attr.recv_cq = recvcq;
 
 	/* Initialize cma parameters */
+	memset(&ep->rep_remote_cma, 0, sizeof(ep->rep_remote_cma));
 
 	/* RPC/RDMA does not use private data */
 	ep->rep_remote_cma.private_data = NULL;
@@ -591,7 +567,16 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
 		ep->rep_remote_cma.responder_resources =
 						ia->ri_device->attrs.max_qp_rd_atom;
 
-	ep->rep_remote_cma.retry_count = 7;
+	/* Limit transport retries so client can detect server
+	 * GID changes quickly. RPC layer handles re-establishing
+	 * transport connection and retransmission.
+	 */
+	ep->rep_remote_cma.retry_count = 6;
+
+	/* RPC-over-RDMA handles its own flow control. In addition,
+	 * make all RNR NAKs visible so we know that RPC-over-RDMA
+	 * flow control is working correctly (no NAKs should be seen).
+	 */
 	ep->rep_remote_cma.flow_control = 0;
 	ep->rep_remote_cma.rnr_retry_count = 0;
 
@@ -622,13 +607,8 @@ rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
 
 	cancel_delayed_work_sync(&ep->rep_connect_worker);
 
-	if (ia->ri_id->qp)
-		rpcrdma_ep_disconnect(ep, ia);
-
-	rpcrdma_clean_cq(ep->rep_attr.recv_cq);
-	rpcrdma_clean_cq(ep->rep_attr.send_cq);
-
 	if (ia->ri_id->qp) {
+		rpcrdma_ep_disconnect(ep, ia);
 		rdma_destroy_qp(ia->ri_id);
 		ia->ri_id->qp = NULL;
 	}
@@ -659,7 +639,6 @@ retry:
 		dprintk("RPC:       %s: reconnecting...\n", __func__);
 
 		rpcrdma_ep_disconnect(ep, ia);
-		rpcrdma_flush_cqs(ep);
 
 		xprt = container_of(ia, struct rpcrdma_xprt, rx_ia);
 		id = rpcrdma_create_id(xprt, ia,
@@ -692,10 +671,8 @@ retry:
 			goto out;
 		}
 
-		write_lock(&ia->ri_qplock);
 		old = ia->ri_id;
 		ia->ri_id = id;
-		write_unlock(&ia->ri_qplock);
 
 		rdma_destroy_qp(old);
 		rpcrdma_destroy_id(old);
@@ -785,7 +762,6 @@ rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
 {
 	int rc;
 
-	rpcrdma_flush_cqs(ep);
 	rc = rdma_disconnect(ia->ri_id);
 	if (!rc) {
 		/* returns without wait if not connected */
@@ -797,6 +773,8 @@ rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
 		dprintk("RPC:       %s: rdma_disconnect %i\n", __func__, rc);
 		ep->rep_connected = rc;
 	}
+
+	ib_drain_qp(ia->ri_id->qp);
 }
 
 struct rpcrdma_req *
@@ -1271,25 +1249,3 @@ out_rc:
 	rpcrdma_recv_buffer_put(rep);
 	return rc;
 }
-
-/* How many chunk list items fit within our inline buffers?
- */
-unsigned int
-rpcrdma_max_segments(struct rpcrdma_xprt *r_xprt)
-{
-	struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data;
-	int bytes, segments;
-
-	bytes = min_t(unsigned int, cdata->inline_wsize, cdata->inline_rsize);
-	bytes -= RPCRDMA_HDRLEN_MIN;
-	if (bytes < sizeof(struct rpcrdma_segment) * 2) {
-		pr_warn("RPC:       %s: inline threshold too small\n",
-			__func__);
-		return 0;
-	}
-
-	segments = 1 << (fls(bytes / sizeof(struct rpcrdma_segment)) - 1);
-	dprintk("RPC:       %s: max chunk list size = %d segments\n",
-		__func__, segments);
-	return segments;
-}
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index 2ebc743cb96f..95cdc66225ee 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -65,7 +65,6 @@
  */
 struct rpcrdma_ia {
 	const struct rpcrdma_memreg_ops	*ri_ops;
-	rwlock_t		ri_qplock;
 	struct ib_device	*ri_device;
 	struct rdma_cm_id 	*ri_id;
 	struct ib_pd		*ri_pd;
@@ -73,6 +72,8 @@ struct rpcrdma_ia {
 	struct completion	ri_done;
 	int			ri_async_rc;
 	unsigned int		ri_max_frmr_depth;
+	unsigned int		ri_max_inline_write;
+	unsigned int		ri_max_inline_read;
 	struct ib_qp_attr	ri_qp_attr;
 	struct ib_qp_init_attr	ri_qp_init_attr;
 };
@@ -144,6 +145,26 @@ rdmab_to_msg(struct rpcrdma_regbuf *rb)
 
 #define RPCRDMA_DEF_GFP		(GFP_NOIO | __GFP_NOWARN)
 
+/* To ensure a transport can always make forward progress,
+ * the number of RDMA segments allowed in header chunk lists
+ * is capped at 8. This prevents less-capable devices and
+ * memory registrations from overrunning the Send buffer
+ * while building chunk lists.
+ *
+ * Elements of the Read list take up more room than the
+ * Write list or Reply chunk. 8 read segments means the Read
+ * list (or Write list or Reply chunk) cannot consume more
+ * than
+ *
+ * ((8 + 2) * read segment size) + 1 XDR words, or 244 bytes.
+ *
+ * And the fixed part of the header is another 24 bytes.
+ *
+ * The smallest inline threshold is 1024 bytes, ensuring that
+ * at least 750 bytes are available for RPC messages.
+ */
+#define RPCRDMA_MAX_HDR_SEGS	(8)
+
 /*
  * struct rpcrdma_rep -- this structure encapsulates state required to recv
  * and complete a reply, asychronously. It needs several pieces of
@@ -162,7 +183,9 @@ rdmab_to_msg(struct rpcrdma_regbuf *rb)
  */
 
 #define RPCRDMA_MAX_DATA_SEGS	((1 * 1024 * 1024) / PAGE_SIZE)
-#define RPCRDMA_MAX_SEGS 	(RPCRDMA_MAX_DATA_SEGS + 2) /* head+tail = 2 */
+
+/* data segments + head/tail for Call + head/tail for Reply */
+#define RPCRDMA_MAX_SEGS 	(RPCRDMA_MAX_DATA_SEGS + 4)
 
 struct rpcrdma_buffer;
 
@@ -198,14 +221,13 @@ enum rpcrdma_frmr_state {
 };
 
 struct rpcrdma_frmr {
-	struct scatterlist		*sg;
-	int				sg_nents;
+	struct scatterlist		*fr_sg;
+	int				fr_nents;
+	enum dma_data_direction		fr_dir;
 	struct ib_mr			*fr_mr;
 	struct ib_cqe			fr_cqe;
 	enum rpcrdma_frmr_state		fr_state;
 	struct completion		fr_linv_done;
-	struct work_struct		fr_work;
-	struct rpcrdma_xprt		*fr_xprt;
 	union {
 		struct ib_reg_wr	fr_regwr;
 		struct ib_send_wr	fr_invwr;
@@ -222,6 +244,8 @@ struct rpcrdma_mw {
 		struct rpcrdma_fmr	fmr;
 		struct rpcrdma_frmr	frmr;
 	};
+	struct work_struct	mw_work;
+	struct rpcrdma_xprt	*mw_xprt;
 	struct list_head	mw_list;
 	struct list_head	mw_all;
 };
@@ -270,12 +294,14 @@ struct rpcrdma_req {
 	unsigned int		rl_niovs;
 	unsigned int		rl_nchunks;
 	unsigned int		rl_connect_cookie;
+	struct rpc_task		*rl_task;
 	struct rpcrdma_buffer	*rl_buffer;
 	struct rpcrdma_rep	*rl_reply;/* holder for reply buffer */
 	struct ib_sge		rl_send_iov[RPCRDMA_MAX_IOVS];
 	struct rpcrdma_regbuf	*rl_rdmabuf;
 	struct rpcrdma_regbuf	*rl_sendbuf;
 	struct rpcrdma_mr_seg	rl_segments[RPCRDMA_MAX_SEGS];
+	struct rpcrdma_mr_seg	*rl_nextseg;
 
 	struct ib_cqe		rl_cqe;
 	struct list_head	rl_all;
@@ -372,8 +398,8 @@ struct rpcrdma_memreg_ops {
 				  struct rpcrdma_mr_seg *, int, bool);
 	void		(*ro_unmap_sync)(struct rpcrdma_xprt *,
 					 struct rpcrdma_req *);
-	int		(*ro_unmap)(struct rpcrdma_xprt *,
-				    struct rpcrdma_mr_seg *);
+	void		(*ro_unmap_safe)(struct rpcrdma_xprt *,
+					 struct rpcrdma_req *, bool);
 	int		(*ro_open)(struct rpcrdma_ia *,
 				   struct rpcrdma_ep *,
 				   struct rpcrdma_create_data_internal *);
@@ -456,7 +482,6 @@ struct rpcrdma_regbuf *rpcrdma_alloc_regbuf(struct rpcrdma_ia *,
 void rpcrdma_free_regbuf(struct rpcrdma_ia *,
 			 struct rpcrdma_regbuf *);
 
-unsigned int rpcrdma_max_segments(struct rpcrdma_xprt *);
 int rpcrdma_ep_post_extra_recv(struct rpcrdma_xprt *, unsigned int);
 
 int frwr_alloc_recovery_wq(void);
@@ -519,6 +544,9 @@ void rpcrdma_reply_handler(struct rpcrdma_rep *);
  * RPC/RDMA protocol calls - xprtrdma/rpc_rdma.c
  */
 int rpcrdma_marshal_req(struct rpc_rqst *);
+void rpcrdma_set_max_header_sizes(struct rpcrdma_ia *,
+				  struct rpcrdma_create_data_internal *,
+				  unsigned int);
 
 /* RPC/RDMA module init - xprtrdma/transport.c
  */
@@ -534,6 +562,7 @@ void xprt_rdma_cleanup(void);
 #if defined(CONFIG_SUNRPC_BACKCHANNEL)
 int xprt_rdma_bc_setup(struct rpc_xprt *, unsigned int);
 int xprt_rdma_bc_up(struct svc_serv *, struct net *);
+size_t xprt_rdma_bc_maxpayload(struct rpc_xprt *);
 int rpcrdma_bc_post_recv(struct rpcrdma_xprt *, unsigned int);
 void rpcrdma_bc_receive_call(struct rpcrdma_xprt *, struct rpcrdma_rep *);
 int rpcrdma_bc_marshal_reply(struct rpc_rqst *);
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index b90c5397b5e1..7e2b2fa189c3 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -1364,6 +1364,11 @@ static int xs_tcp_bc_up(struct svc_serv *serv, struct net *net)
 		return ret;
 	return 0;
 }
+
+static size_t xs_tcp_bc_maxpayload(struct rpc_xprt *xprt)
+{
+	return PAGE_SIZE;
+}
 #else
 static inline int _xs_tcp_read_data(struct rpc_xprt *xprt,
 					struct xdr_skb_reader *desc)
@@ -2661,6 +2666,7 @@ static struct rpc_xprt_ops xs_tcp_ops = {
 #ifdef CONFIG_SUNRPC_BACKCHANNEL
 	.bc_setup		= xprt_setup_bc,
 	.bc_up			= xs_tcp_bc_up,
+	.bc_maxpayload		= xs_tcp_bc_maxpayload,
 	.bc_free_rqst		= xprt_free_bc_rqst,
 	.bc_destroy		= xprt_destroy_bc,
 #endif
@@ -3051,6 +3057,7 @@ static struct rpc_xprt *xs_setup_bc_tcp(struct xprt_create *args)
 		return xprt;
 
 	args->bc_xprt->xpt_bc_xprt = NULL;
+	args->bc_xprt->xpt_bc_xps = NULL;
 	xprt_put(xprt);
 	ret = ERR_PTR(-EINVAL);
 out_err:
diff --git a/net/tipc/netlink_compat.c b/net/tipc/netlink_compat.c
index 4dfc5c14f8c3..3ad9fab1985f 100644
--- a/net/tipc/netlink_compat.c
+++ b/net/tipc/netlink_compat.c
@@ -346,9 +346,15 @@ static int tipc_nl_compat_bearer_dump(struct tipc_nl_compat_msg *msg,
 				      struct nlattr **attrs)
 {
 	struct nlattr *bearer[TIPC_NLA_BEARER_MAX + 1];
+	int err;
+
+	if (!attrs[TIPC_NLA_BEARER])
+		return -EINVAL;
 
-	nla_parse_nested(bearer, TIPC_NLA_BEARER_MAX, attrs[TIPC_NLA_BEARER],
-			 NULL);
+	err = nla_parse_nested(bearer, TIPC_NLA_BEARER_MAX,
+			       attrs[TIPC_NLA_BEARER], NULL);
+	if (err)
+		return err;
 
 	return tipc_add_tlv(msg->rep, TIPC_TLV_BEARER_NAME,
 			    nla_data(bearer[TIPC_NLA_BEARER_NAME]),
@@ -460,14 +466,31 @@ static int tipc_nl_compat_link_stat_dump(struct tipc_nl_compat_msg *msg,
 	struct nlattr *link[TIPC_NLA_LINK_MAX + 1];
 	struct nlattr *prop[TIPC_NLA_PROP_MAX + 1];
 	struct nlattr *stats[TIPC_NLA_STATS_MAX + 1];
+	int err;
 
-	nla_parse_nested(link, TIPC_NLA_LINK_MAX, attrs[TIPC_NLA_LINK], NULL);
+	if (!attrs[TIPC_NLA_LINK])
+		return -EINVAL;
 
-	nla_parse_nested(prop, TIPC_NLA_PROP_MAX, link[TIPC_NLA_LINK_PROP],
-			 NULL);
+	err = nla_parse_nested(link, TIPC_NLA_LINK_MAX, attrs[TIPC_NLA_LINK],
+			       NULL);
+	if (err)
+		return err;
+
+	if (!link[TIPC_NLA_LINK_PROP])
+		return -EINVAL;
 
-	nla_parse_nested(stats, TIPC_NLA_STATS_MAX, link[TIPC_NLA_LINK_STATS],
-			 NULL);
+	err = nla_parse_nested(prop, TIPC_NLA_PROP_MAX,
+			       link[TIPC_NLA_LINK_PROP], NULL);
+	if (err)
+		return err;
+
+	if (!link[TIPC_NLA_LINK_STATS])
+		return -EINVAL;
+
+	err = nla_parse_nested(stats, TIPC_NLA_STATS_MAX,
+			       link[TIPC_NLA_LINK_STATS], NULL);
+	if (err)
+		return err;
 
 	name = (char *)TLV_DATA(msg->req);
 	if (strcmp(name, nla_data(link[TIPC_NLA_LINK_NAME])) != 0)
@@ -569,12 +592,20 @@ static int tipc_nl_compat_link_dump(struct tipc_nl_compat_msg *msg,
 {
 	struct nlattr *link[TIPC_NLA_LINK_MAX + 1];
 	struct tipc_link_info link_info;
+	int err;
 
-	nla_parse_nested(link, TIPC_NLA_LINK_MAX, attrs[TIPC_NLA_LINK], NULL);
+	if (!attrs[TIPC_NLA_LINK])
+		return -EINVAL;
+
+	err = nla_parse_nested(link, TIPC_NLA_LINK_MAX, attrs[TIPC_NLA_LINK],
+			       NULL);
+	if (err)
+		return err;
 
 	link_info.dest = nla_get_flag(link[TIPC_NLA_LINK_DEST]);
 	link_info.up = htonl(nla_get_flag(link[TIPC_NLA_LINK_UP]));
-	strcpy(link_info.str, nla_data(link[TIPC_NLA_LINK_NAME]));
+	nla_strlcpy(link_info.str, nla_data(link[TIPC_NLA_LINK_NAME]),
+		    TIPC_MAX_LINK_NAME);
 
 	return tipc_add_tlv(msg->rep, TIPC_TLV_LINK_INFO,
 			    &link_info, sizeof(link_info));
@@ -758,12 +789,23 @@ static int tipc_nl_compat_name_table_dump(struct tipc_nl_compat_msg *msg,
 	u32 node, depth, type, lowbound, upbound;
 	static const char * const scope_str[] = {"", " zone", " cluster",
 						 " node"};
+	int err;
 
-	nla_parse_nested(nt, TIPC_NLA_NAME_TABLE_MAX,
-			 attrs[TIPC_NLA_NAME_TABLE], NULL);
+	if (!attrs[TIPC_NLA_NAME_TABLE])
+		return -EINVAL;
 
-	nla_parse_nested(publ, TIPC_NLA_PUBL_MAX, nt[TIPC_NLA_NAME_TABLE_PUBL],
-			 NULL);
+	err = nla_parse_nested(nt, TIPC_NLA_NAME_TABLE_MAX,
+			       attrs[TIPC_NLA_NAME_TABLE], NULL);
+	if (err)
+		return err;
+
+	if (!nt[TIPC_NLA_NAME_TABLE_PUBL])
+		return -EINVAL;
+
+	err = nla_parse_nested(publ, TIPC_NLA_PUBL_MAX,
+			       nt[TIPC_NLA_NAME_TABLE_PUBL], NULL);
+	if (err)
+		return err;
 
 	ntq = (struct tipc_name_table_query *)TLV_DATA(msg->req);
 
@@ -815,8 +857,15 @@ static int __tipc_nl_compat_publ_dump(struct tipc_nl_compat_msg *msg,
 {
 	u32 type, lower, upper;
 	struct nlattr *publ[TIPC_NLA_PUBL_MAX + 1];
+	int err;
 
-	nla_parse_nested(publ, TIPC_NLA_PUBL_MAX, attrs[TIPC_NLA_PUBL], NULL);
+	if (!attrs[TIPC_NLA_PUBL])
+		return -EINVAL;
+
+	err = nla_parse_nested(publ, TIPC_NLA_PUBL_MAX, attrs[TIPC_NLA_PUBL],
+			       NULL);
+	if (err)
+		return err;
 
 	type = nla_get_u32(publ[TIPC_NLA_PUBL_TYPE]);
 	lower = nla_get_u32(publ[TIPC_NLA_PUBL_LOWER]);
@@ -876,7 +925,13 @@ static int tipc_nl_compat_sk_dump(struct tipc_nl_compat_msg *msg,
 	u32 sock_ref;
 	struct nlattr *sock[TIPC_NLA_SOCK_MAX + 1];
 
-	nla_parse_nested(sock, TIPC_NLA_SOCK_MAX, attrs[TIPC_NLA_SOCK], NULL);
+	if (!attrs[TIPC_NLA_SOCK])
+		return -EINVAL;
+
+	err = nla_parse_nested(sock, TIPC_NLA_SOCK_MAX, attrs[TIPC_NLA_SOCK],
+			       NULL);
+	if (err)
+		return err;
 
 	sock_ref = nla_get_u32(sock[TIPC_NLA_SOCK_REF]);
 	tipc_tlv_sprintf(msg->rep, "%u:", sock_ref);
@@ -917,9 +972,15 @@ static int tipc_nl_compat_media_dump(struct tipc_nl_compat_msg *msg,
 				     struct nlattr **attrs)
 {
 	struct nlattr *media[TIPC_NLA_MEDIA_MAX + 1];
+	int err;
+
+	if (!attrs[TIPC_NLA_MEDIA])
+		return -EINVAL;
 
-	nla_parse_nested(media, TIPC_NLA_MEDIA_MAX, attrs[TIPC_NLA_MEDIA],
-			 NULL);
+	err = nla_parse_nested(media, TIPC_NLA_MEDIA_MAX, attrs[TIPC_NLA_MEDIA],
+			       NULL);
+	if (err)
+		return err;
 
 	return tipc_add_tlv(msg->rep, TIPC_TLV_MEDIA_NAME,
 			    nla_data(media[TIPC_NLA_MEDIA_NAME]),
@@ -931,8 +992,15 @@ static int tipc_nl_compat_node_dump(struct tipc_nl_compat_msg *msg,
 {
 	struct tipc_node_info node_info;
 	struct nlattr *node[TIPC_NLA_NODE_MAX + 1];
+	int err;
 
-	nla_parse_nested(node, TIPC_NLA_NODE_MAX, attrs[TIPC_NLA_NODE], NULL);
+	if (!attrs[TIPC_NLA_NODE])
+		return -EINVAL;
+
+	err = nla_parse_nested(node, TIPC_NLA_NODE_MAX, attrs[TIPC_NLA_NODE],
+			       NULL);
+	if (err)
+		return err;
 
 	node_info.addr = htonl(nla_get_u32(node[TIPC_NLA_NODE_ADDR]));
 	node_info.up = htonl(nla_get_flag(node[TIPC_NLA_NODE_UP]));
@@ -971,8 +1039,16 @@ static int tipc_nl_compat_net_dump(struct tipc_nl_compat_msg *msg,
 {
 	__be32 id;
 	struct nlattr *net[TIPC_NLA_NET_MAX + 1];
+	int err;
+
+	if (!attrs[TIPC_NLA_NET])
+		return -EINVAL;
+
+	err = nla_parse_nested(net, TIPC_NLA_NET_MAX, attrs[TIPC_NLA_NET],
+			       NULL);
+	if (err)
+		return err;
 
-	nla_parse_nested(net, TIPC_NLA_NET_MAX, attrs[TIPC_NLA_NET], NULL);
 	id = htonl(nla_get_u32(net[TIPC_NLA_NET_ID]));
 
 	return tipc_add_tlv(msg->rep, TIPC_TLV_UNSIGNED, &id, sizeof(id));
diff --git a/net/tipc/server.c b/net/tipc/server.c
index 7a0af2dc0406..272d20a795d5 100644
--- a/net/tipc/server.c
+++ b/net/tipc/server.c
@@ -138,28 +138,28 @@ static void sock_data_ready(struct sock *sk)
 {
 	struct tipc_conn *con;
 
-	read_lock(&sk->sk_callback_lock);
+	read_lock_bh(&sk->sk_callback_lock);
 	con = sock2con(sk);
 	if (con && test_bit(CF_CONNECTED, &con->flags)) {
 		conn_get(con);
 		if (!queue_work(con->server->rcv_wq, &con->rwork))
 			conn_put(con);
 	}
-	read_unlock(&sk->sk_callback_lock);
+	read_unlock_bh(&sk->sk_callback_lock);
 }
 
 static void sock_write_space(struct sock *sk)
 {
 	struct tipc_conn *con;
 
-	read_lock(&sk->sk_callback_lock);
+	read_lock_bh(&sk->sk_callback_lock);
 	con = sock2con(sk);
 	if (con && test_bit(CF_CONNECTED, &con->flags)) {
 		conn_get(con);
 		if (!queue_work(con->server->send_wq, &con->swork))
 			conn_put(con);
 	}
-	read_unlock(&sk->sk_callback_lock);
+	read_unlock_bh(&sk->sk_callback_lock);
 }
 
 static void tipc_register_callbacks(struct socket *sock, struct tipc_conn *con)
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 80aa6a3e6817..735362c26c8e 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -315,7 +315,7 @@ static struct sock *unix_find_socket_byinode(struct inode *i)
 		    &unix_socket_table[i->i_ino & (UNIX_HASH_SIZE - 1)]) {
 		struct dentry *dentry = unix_sk(s)->path.dentry;
 
-		if (dentry && d_backing_inode(dentry) == i) {
+		if (dentry && d_real_inode(dentry) == i) {
 			sock_hold(s);
 			goto found;
 		}
@@ -911,7 +911,7 @@ static struct sock *unix_find_other(struct net *net,
 		err = kern_path(sunname->sun_path, LOOKUP_FOLLOW, &path);
 		if (err)
 			goto fail;
-		inode = d_backing_inode(path.dentry);
+		inode = d_real_inode(path.dentry);
 		err = inode_permission(inode, MAY_WRITE);
 		if (err)
 			goto put_fail;
@@ -1048,7 +1048,7 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
 			goto out_up;
 		}
 		addr->hash = UNIX_HASH_SIZE;
-		hash = d_backing_inode(dentry)->i_ino & (UNIX_HASH_SIZE - 1);
+		hash = d_real_inode(dentry)->i_ino & (UNIX_HASH_SIZE - 1);
 		spin_lock(&unix_table_lock);
 		u->path = u_path;
 		list = &unix_socket_table[hash];
diff --git a/net/wireless/core.c b/net/wireless/core.c
index d25c82bc1bbe..ecca3896b9f7 100644
--- a/net/wireless/core.c
+++ b/net/wireless/core.c
@@ -363,8 +363,6 @@ struct wiphy *wiphy_new_nm(const struct cfg80211_ops *ops, int sizeof_priv,
 	WARN_ON(ops->remain_on_channel && !ops->cancel_remain_on_channel);
 	WARN_ON(ops->tdls_channel_switch && !ops->tdls_cancel_channel_switch);
 	WARN_ON(ops->add_tx_ts && !ops->del_tx_ts);
-	WARN_ON(ops->set_tx_power && !ops->get_tx_power);
-	WARN_ON(ops->set_antenna && !ops->get_antenna);
 
 	alloc_size = sizeof(*rdev) + sizeof_priv;
 
diff --git a/net/wireless/util.c b/net/wireless/util.c
index 219bd197039e..4e809e978b7d 100644
--- a/net/wireless/util.c
+++ b/net/wireless/util.c
@@ -651,7 +651,7 @@ __frame_add_frag(struct sk_buff *skb, struct page *page,
 	struct skb_shared_info *sh = skb_shinfo(skb);
 	int page_offset;
 
-	atomic_inc(&page->_count);
+	page_ref_inc(page);
 	page_offset = ptr - page_address(page);
 	skb_add_rx_frag(skb, sh->nr_frags, page, page_offset, len, size);
 }
diff --git a/net/wireless/wext-core.c b/net/wireless/wext-core.c
index 6250b1cfcde5..dbb2738e356a 100644
--- a/net/wireless/wext-core.c
+++ b/net/wireless/wext-core.c
@@ -958,8 +958,29 @@ static int wireless_process_ioctl(struct net *net, struct ifreq *ifr,
 			return private(dev, iwr, cmd, info, handler);
 	}
 	/* Old driver API : call driver ioctl handler */
-	if (dev->netdev_ops->ndo_do_ioctl)
-		return dev->netdev_ops->ndo_do_ioctl(dev, ifr, cmd);
+	if (dev->netdev_ops->ndo_do_ioctl) {
+#ifdef CONFIG_COMPAT
+		if (info->flags & IW_REQUEST_FLAG_COMPAT) {
+			int ret = 0;
+			struct iwreq iwr_lcl;
+			struct compat_iw_point *iwp_compat = (void *) &iwr->u.data;
+
+			memcpy(&iwr_lcl, iwr, sizeof(struct iwreq));
+			iwr_lcl.u.data.pointer = compat_ptr(iwp_compat->pointer);
+			iwr_lcl.u.data.length = iwp_compat->length;
+			iwr_lcl.u.data.flags = iwp_compat->flags;
+
+			ret = dev->netdev_ops->ndo_do_ioctl(dev, (void *) &iwr_lcl, cmd);
+
+			iwp_compat->pointer = ptr_to_compat(iwr_lcl.u.data.pointer);
+			iwp_compat->length = iwr_lcl.u.data.length;
+			iwp_compat->flags = iwr_lcl.u.data.flags;
+
+			return ret;
+		} else
+#endif
+			return dev->netdev_ops->ndo_do_ioctl(dev, ifr, cmd);
+	}
 	return -EOPNOTSUPP;
 }