summaryrefslogtreecommitdiffstats
path: root/net
diff options
context:
space:
mode:
Diffstat (limited to 'net')
-rw-r--r--net/8021q/vlan.c5
-rw-r--r--net/8021q/vlan.h2
-rw-r--r--net/8021q/vlan_dev.c20
-rw-r--r--net/9p/client.c8
-rw-r--r--net/atm/signaling.c2
-rw-r--r--net/atm/svc.c4
-rw-r--r--net/batman-adv/bat_iv_ogm.c9
-rw-r--r--net/batman-adv/bat_v.c66
-rw-r--r--net/batman-adv/bat_v_elp.c31
-rw-r--r--net/batman-adv/bat_v_elp.h2
-rw-r--r--net/batman-adv/originator.c6
-rw-r--r--net/batman-adv/routing.c4
-rw-r--r--net/bridge/br_fdb.c2
-rw-r--r--net/ceph/ceph_common.c2
-rw-r--r--net/ceph/ceph_strings.c16
-rw-r--r--net/ceph/debugfs.c147
-rw-r--r--net/ceph/mon_client.c393
-rw-r--r--net/ceph/osd_client.c4031
-rw-r--r--net/ceph/osdmap.c651
-rw-r--r--net/compat.c20
-rw-r--r--net/core/ethtool.c4
-rw-r--r--net/core/gen_stats.c2
-rw-r--r--net/core/hwbm.c3
-rw-r--r--net/core/net-sysfs.c1
-rw-r--r--net/core/pktgen.c8
-rw-r--r--net/dns_resolver/dns_key.c2
-rw-r--r--net/ieee802154/nl802154.c4
-rw-r--r--net/ipv4/af_inet.c40
-rw-r--r--net/ipv4/fou.c144
-rw-r--r--net/ipv4/gre_offload.c14
-rw-r--r--net/ipv4/ip_tunnel.c45
-rw-r--r--net/ipv4/ip_tunnel_core.c9
-rw-r--r--net/ipv4/ipip.c2
-rw-r--r--net/ipv4/sysctl_net_ipv4.c4
-rw-r--r--net/ipv4/tcp_offload.c19
-rw-r--r--net/ipv4/udp.c12
-rw-r--r--net/ipv4/udp_offload.c10
-rw-r--r--net/ipv6/Kconfig9
-rw-r--r--net/ipv6/Makefile1
-rw-r--r--net/ipv6/fou6.c140
-rw-r--r--net/ipv6/ip6_gre.c97
-rw-r--r--net/ipv6/ip6_input.c33
-rw-r--r--net/ipv6/ip6_offload.c77
-rw-r--r--net/ipv6/ip6_output.c11
-rw-r--r--net/ipv6/ip6_tunnel.c190
-rw-r--r--net/ipv6/netfilter/nf_dup_ipv6.c1
-rw-r--r--net/ipv6/sit.c4
-rw-r--r--net/ipv6/tcp_ipv6.c4
-rw-r--r--net/ipv6/udp.c14
-rw-r--r--net/ipv6/udp_offload.c13
-rw-r--r--net/irda/ircomm/ircomm_tty.c31
-rw-r--r--net/irda/ircomm/ircomm_tty_attach.c2
-rw-r--r--net/irda/ircomm/ircomm_tty_ioctl.c18
-rw-r--r--net/kcm/kcmsock.c2
-rw-r--r--net/l2tp/l2tp_core.c2
-rw-r--r--net/l2tp/l2tp_ip6.c12
-rw-r--r--net/lapb/lapb_in.c5
-rw-r--r--net/lapb/lapb_out.c4
-rw-r--r--net/lapb/lapb_subr.c14
-rw-r--r--net/mac80211/mesh.c4
-rw-r--r--net/mac80211/sta_info.h2
-rw-r--r--net/mpls/mpls_gso.c11
-rw-r--r--net/netfilter/ipvs/ip_vs_conn.c5
-rw-r--r--net/netfilter/ipvs/ip_vs_core.c5
-rw-r--r--net/netfilter/ipvs/ip_vs_xmit.c17
-rw-r--r--net/netfilter/nf_conntrack_ftp.c1
-rw-r--r--net/netfilter/nf_conntrack_helper.c9
-rw-r--r--net/netfilter/nf_conntrack_irc.c1
-rw-r--r--net/netfilter/nf_conntrack_sane.c1
-rw-r--r--net/netfilter/nf_conntrack_sip.c1
-rw-r--r--net/netfilter/nf_conntrack_standalone.c2
-rw-r--r--net/netfilter/nf_conntrack_tftp.c1
-rw-r--r--net/netfilter/nf_queue.c17
-rw-r--r--net/netfilter/nf_tables_api.c2
-rw-r--r--net/netfilter/nfnetlink_queue.c20
-rw-r--r--net/netfilter/x_tables.c4
-rw-r--r--net/netlabel/netlabel_kapi.c2
-rw-r--r--net/openvswitch/actions.c20
-rw-r--r--net/packet/af_packet.c25
-rw-r--r--net/rds/ib_frmr.c2
-rw-r--r--net/rds/rds.h2
-rw-r--r--net/rds/recv.c2
-rw-r--r--net/rds/send.c1
-rw-r--r--net/rds/tcp.c78
-rw-r--r--net/rds/tcp.h1
-rw-r--r--net/rds/tcp_connect.c6
-rw-r--r--net/rds/tcp_listen.c31
-rw-r--r--net/rds/tcp_recv.c4
-rw-r--r--net/rds/tcp_send.c4
-rw-r--r--net/rds/threads.c10
-rw-r--r--net/rxrpc/ar-key.c4
-rw-r--r--net/rxrpc/rxkad.c4
-rw-r--r--net/sched/act_police.c40
-rw-r--r--net/sched/cls_flower.c6
-rw-r--r--net/sched/cls_u32.c72
-rw-r--r--net/sched/sch_api.c4
-rw-r--r--net/sched/sch_drr.c4
-rw-r--r--net/sched/sch_fq_codel.c26
-rw-r--r--net/sched/sch_generic.c2
-rw-r--r--net/sched/sch_hfsc.c12
-rw-r--r--net/sched/sch_htb.c13
-rw-r--r--net/sched/sch_ingress.c12
-rw-r--r--net/sched/sch_prio.c4
-rw-r--r--net/sched/sch_qfq.c6
-rw-r--r--net/sched/sch_red.c4
-rw-r--r--net/sched/sch_tbf.c4
-rw-r--r--net/sctp/sctp_diag.c3
-rw-r--r--net/sctp/socket.c1
-rw-r--r--net/socket.c8
-rw-r--r--net/sunrpc/auth.c9
-rw-r--r--net/sunrpc/auth_generic.c13
-rw-r--r--net/sunrpc/auth_gss/auth_gss.c6
-rw-r--r--net/sunrpc/auth_gss/svcauth_gss.c9
-rw-r--r--net/sunrpc/auth_unix.c6
-rw-r--r--net/sunrpc/clnt.c48
-rw-r--r--net/sunrpc/svc_xprt.c25
-rw-r--r--net/sunrpc/xdr.c2
-rw-r--r--net/sunrpc/xprtrdma/backchannel.c16
-rw-r--r--net/sunrpc/xprtrdma/fmr_ops.c134
-rw-r--r--net/sunrpc/xprtrdma/frwr_ops.c214
-rw-r--r--net/sunrpc/xprtrdma/physical_ops.c39
-rw-r--r--net/sunrpc/xprtrdma/rpc_rdma.c517
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma_marshal.c32
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma_recvfrom.c36
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma_sendto.c28
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma_transport.c17
-rw-r--r--net/sunrpc/xprtrdma/transport.c16
-rw-r--r--net/sunrpc/xprtrdma/verbs.c78
-rw-r--r--net/sunrpc/xprtrdma/xprt_rdma.h47
-rw-r--r--net/sunrpc/xprtsock.c7
-rw-r--r--net/tipc/netlink_compat.c114
-rw-r--r--net/tipc/server.c8
-rw-r--r--net/unix/af_unix.c6
-rw-r--r--net/wireless/core.c2
-rw-r--r--net/wireless/util.c2
-rw-r--r--net/wireless/wext-core.c25
136 files changed, 5584 insertions, 2820 deletions
diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c
index a1e273af6fc8..82a116ba590e 100644
--- a/net/8021q/vlan.c
+++ b/net/8021q/vlan.c
@@ -290,6 +290,10 @@ static void vlan_sync_address(struct net_device *dev,
if (ether_addr_equal(vlan->real_dev_addr, dev->dev_addr))
return;
+ /* vlan continues to inherit address of lower device */
+ if (vlan_dev_inherit_address(vlandev, dev))
+ goto out;
+
/* vlan address was different from the old address and is equal to
* the new address */
if (!ether_addr_equal(vlandev->dev_addr, vlan->real_dev_addr) &&
@@ -302,6 +306,7 @@ static void vlan_sync_address(struct net_device *dev,
!ether_addr_equal(vlandev->dev_addr, dev->dev_addr))
dev_uc_add(dev, vlandev->dev_addr);
+out:
ether_addr_copy(vlan->real_dev_addr, dev->dev_addr);
}
diff --git a/net/8021q/vlan.h b/net/8021q/vlan.h
index 9d010a09ab98..cc1557978066 100644
--- a/net/8021q/vlan.h
+++ b/net/8021q/vlan.h
@@ -109,6 +109,8 @@ int vlan_check_real_dev(struct net_device *real_dev,
void vlan_setup(struct net_device *dev);
int register_vlan_dev(struct net_device *dev);
void unregister_vlan_dev(struct net_device *dev, struct list_head *head);
+bool vlan_dev_inherit_address(struct net_device *dev,
+ struct net_device *real_dev);
static inline u32 vlan_get_ingress_priority(struct net_device *dev,
u16 vlan_tci)
diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c
index e7e62570bdb8..86ae75b77390 100644
--- a/net/8021q/vlan_dev.c
+++ b/net/8021q/vlan_dev.c
@@ -245,6 +245,17 @@ void vlan_dev_get_realdev_name(const struct net_device *dev, char *result)
strncpy(result, vlan_dev_priv(dev)->real_dev->name, 23);
}
+bool vlan_dev_inherit_address(struct net_device *dev,
+ struct net_device *real_dev)
+{
+ if (dev->addr_assign_type != NET_ADDR_STOLEN)
+ return false;
+
+ ether_addr_copy(dev->dev_addr, real_dev->dev_addr);
+ call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
+ return true;
+}
+
static int vlan_dev_open(struct net_device *dev)
{
struct vlan_dev_priv *vlan = vlan_dev_priv(dev);
@@ -255,7 +266,8 @@ static int vlan_dev_open(struct net_device *dev)
!(vlan->flags & VLAN_FLAG_LOOSE_BINDING))
return -ENETDOWN;
- if (!ether_addr_equal(dev->dev_addr, real_dev->dev_addr)) {
+ if (!ether_addr_equal(dev->dev_addr, real_dev->dev_addr) &&
+ !vlan_dev_inherit_address(dev, real_dev)) {
err = dev_uc_add(real_dev, dev->dev_addr);
if (err < 0)
goto out;
@@ -560,8 +572,10 @@ static int vlan_dev_init(struct net_device *dev)
/* ipv6 shared card related stuff */
dev->dev_id = real_dev->dev_id;
- if (is_zero_ether_addr(dev->dev_addr))
- eth_hw_addr_inherit(dev, real_dev);
+ if (is_zero_ether_addr(dev->dev_addr)) {
+ ether_addr_copy(dev->dev_addr, real_dev->dev_addr);
+ dev->addr_assign_type = NET_ADDR_STOLEN;
+ }
if (is_zero_ether_addr(dev->broadcast))
memcpy(dev->broadcast, real_dev->broadcast, dev->addr_len);
diff --git a/net/9p/client.c b/net/9p/client.c
index ea79ee9a7348..3fc94a49ccd5 100644
--- a/net/9p/client.c
+++ b/net/9p/client.c
@@ -518,10 +518,10 @@ static int p9_check_errors(struct p9_client *c, struct p9_req_t *req)
if (err)
goto out_err;
- if (p9_is_proto_dotu(c))
+ if (p9_is_proto_dotu(c) && ecode < 512)
err = -ecode;
- if (!err || !IS_ERR_VALUE(err)) {
+ if (!err) {
err = p9_errstr2errno(ename, strlen(ename));
p9_debug(P9_DEBUG_9P, "<<< RERROR (%d) %s\n",
@@ -605,10 +605,10 @@ static int p9_check_zc_errors(struct p9_client *c, struct p9_req_t *req,
if (err)
goto out_err;
- if (p9_is_proto_dotu(c))
+ if (p9_is_proto_dotu(c) && ecode < 512)
err = -ecode;
- if (!err || !IS_ERR_VALUE(err)) {
+ if (!err) {
err = p9_errstr2errno(ename, strlen(ename));
p9_debug(P9_DEBUG_9P, "<<< RERROR (%d) %s\n",
diff --git a/net/atm/signaling.c b/net/atm/signaling.c
index 4fd6af47383a..adb6e3d21b1e 100644
--- a/net/atm/signaling.c
+++ b/net/atm/signaling.c
@@ -124,7 +124,7 @@ as_indicate_complete:
break;
case as_addparty:
case as_dropparty:
- sk->sk_err_soft = msg->reply;
+ sk->sk_err_soft = -msg->reply;
/* < 0 failure, otherwise ep_ref */
clear_bit(ATM_VF_WAITING, &vcc->flags);
break;
diff --git a/net/atm/svc.c b/net/atm/svc.c
index 3fa0a9ee98d1..878563a8354d 100644
--- a/net/atm/svc.c
+++ b/net/atm/svc.c
@@ -546,7 +546,7 @@ static int svc_addparty(struct socket *sock, struct sockaddr *sockaddr,
schedule();
}
finish_wait(sk_sleep(sk), &wait);
- error = xchg(&sk->sk_err_soft, 0);
+ error = -xchg(&sk->sk_err_soft, 0);
out:
release_sock(sk);
return error;
@@ -573,7 +573,7 @@ static int svc_dropparty(struct socket *sock, int ep_ref)
error = -EUNATCH;
goto out;
}
- error = xchg(&sk->sk_err_soft, 0);
+ error = -xchg(&sk->sk_err_soft, 0);
out:
release_sock(sk);
return error;
diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c
index 7f98a9d39883..ce2f203048d3 100644
--- a/net/batman-adv/bat_iv_ogm.c
+++ b/net/batman-adv/bat_iv_ogm.c
@@ -157,10 +157,8 @@ static int batadv_iv_ogm_orig_add_if(struct batadv_orig_node *orig_node,
orig_node->bat_iv.bcast_own = data_ptr;
data_ptr = kmalloc_array(max_if_num, sizeof(u8), GFP_ATOMIC);
- if (!data_ptr) {
- kfree(orig_node->bat_iv.bcast_own);
+ if (!data_ptr)
goto unlock;
- }
memcpy(data_ptr, orig_node->bat_iv.bcast_own_sum,
(max_if_num - 1) * sizeof(u8));
@@ -1182,9 +1180,10 @@ static bool batadv_iv_ogm_calc_tq(struct batadv_orig_node *orig_node,
u8 total_count;
u8 orig_eq_count, neigh_rq_count, neigh_rq_inv, tq_own;
unsigned int neigh_rq_inv_cube, neigh_rq_max_cube;
- int tq_asym_penalty, inv_asym_penalty, if_num;
+ int if_num;
+ unsigned int tq_asym_penalty, inv_asym_penalty;
unsigned int combined_tq;
- int tq_iface_penalty;
+ unsigned int tq_iface_penalty;
bool ret = false;
/* find corresponding one hop neighbor */
diff --git a/net/batman-adv/bat_v.c b/net/batman-adv/bat_v.c
index 3ff8bd1b7bdc..0a12e5cdd65d 100644
--- a/net/batman-adv/bat_v.c
+++ b/net/batman-adv/bat_v.c
@@ -27,6 +27,7 @@
#include <linux/rculist.h>
#include <linux/rcupdate.h>
#include <linux/seq_file.h>
+#include <linux/stddef.h>
#include <linux/types.h>
#include <linux/workqueue.h>
@@ -39,6 +40,16 @@
static void batadv_v_iface_activate(struct batadv_hard_iface *hard_iface)
{
+ struct batadv_priv *bat_priv = netdev_priv(hard_iface->soft_iface);
+ struct batadv_hard_iface *primary_if;
+
+ primary_if = batadv_primary_if_get_selected(bat_priv);
+
+ if (primary_if) {
+ batadv_v_elp_iface_activate(primary_if, hard_iface);
+ batadv_hardif_put(primary_if);
+ }
+
/* B.A.T.M.A.N. V does not use any queuing mechanism, therefore it can
* set the interface as ACTIVE right away, without any risk of race
* condition
@@ -72,16 +83,34 @@ static void batadv_v_iface_disable(struct batadv_hard_iface *hard_iface)
batadv_v_elp_iface_disable(hard_iface);
}
-static void batadv_v_iface_update_mac(struct batadv_hard_iface *hard_iface)
-{
-}
-
static void batadv_v_primary_iface_set(struct batadv_hard_iface *hard_iface)
{
batadv_v_elp_primary_iface_set(hard_iface);
batadv_v_ogm_primary_iface_set(hard_iface);
}
+/**
+ * batadv_v_iface_update_mac - react to hard-interface MAC address change
+ * @hard_iface: the modified interface
+ *
+ * If the modified interface is the primary one, update the originator
+ * address in the ELP and OGM messages to reflect the new MAC address.
+ */
+static void batadv_v_iface_update_mac(struct batadv_hard_iface *hard_iface)
+{
+ struct batadv_priv *bat_priv = netdev_priv(hard_iface->soft_iface);
+ struct batadv_hard_iface *primary_if;
+
+ primary_if = batadv_primary_if_get_selected(bat_priv);
+ if (primary_if != hard_iface)
+ goto out;
+
+ batadv_v_primary_iface_set(hard_iface);
+out:
+ if (primary_if)
+ batadv_hardif_put(primary_if);
+}
+
static void
batadv_v_hardif_neigh_init(struct batadv_hardif_neigh_node *hardif_neigh)
{
@@ -255,14 +284,23 @@ static int batadv_v_neigh_cmp(struct batadv_neigh_node *neigh1,
struct batadv_hard_iface *if_outgoing2)
{
struct batadv_neigh_ifinfo *ifinfo1, *ifinfo2;
+ int ret = 0;
ifinfo1 = batadv_neigh_ifinfo_get(neigh1, if_outgoing1);
+ if (WARN_ON(!ifinfo1))
+ goto err_ifinfo1;
+
ifinfo2 = batadv_neigh_ifinfo_get(neigh2, if_outgoing2);
+ if (WARN_ON(!ifinfo2))
+ goto err_ifinfo2;
- if (WARN_ON(!ifinfo1 || !ifinfo2))
- return 0;
+ ret = ifinfo1->bat_v.throughput - ifinfo2->bat_v.throughput;
- return ifinfo1->bat_v.throughput - ifinfo2->bat_v.throughput;
+ batadv_neigh_ifinfo_put(ifinfo2);
+err_ifinfo2:
+ batadv_neigh_ifinfo_put(ifinfo1);
+err_ifinfo1:
+ return ret;
}
static bool batadv_v_neigh_is_sob(struct batadv_neigh_node *neigh1,
@@ -272,14 +310,26 @@ static bool batadv_v_neigh_is_sob(struct batadv_neigh_node *neigh1,
{
struct batadv_neigh_ifinfo *ifinfo1, *ifinfo2;
u32 threshold;
+ bool ret = false;
ifinfo1 = batadv_neigh_ifinfo_get(neigh1, if_outgoing1);
+ if (WARN_ON(!ifinfo1))
+ goto err_ifinfo1;
+
ifinfo2 = batadv_neigh_ifinfo_get(neigh2, if_outgoing2);
+ if (WARN_ON(!ifinfo2))
+ goto err_ifinfo2;
threshold = ifinfo1->bat_v.throughput / 4;
threshold = ifinfo1->bat_v.throughput - threshold;
- return ifinfo2->bat_v.throughput > threshold;
+ ret = ifinfo2->bat_v.throughput > threshold;
+
+ batadv_neigh_ifinfo_put(ifinfo2);
+err_ifinfo2:
+ batadv_neigh_ifinfo_put(ifinfo1);
+err_ifinfo1:
+ return ret;
}
static struct batadv_algo_ops batadv_batman_v __read_mostly = {
diff --git a/net/batman-adv/bat_v_elp.c b/net/batman-adv/bat_v_elp.c
index 3844e7efd0b0..df42eb1365a0 100644
--- a/net/batman-adv/bat_v_elp.c
+++ b/net/batman-adv/bat_v_elp.c
@@ -377,6 +377,27 @@ void batadv_v_elp_iface_disable(struct batadv_hard_iface *hard_iface)
}
/**
+ * batadv_v_elp_iface_activate - update the ELP buffer belonging to the given
+ * hard-interface
+ * @primary_iface: the new primary interface
+ * @hard_iface: interface holding the to-be-updated buffer
+ */
+void batadv_v_elp_iface_activate(struct batadv_hard_iface *primary_iface,
+ struct batadv_hard_iface *hard_iface)
+{
+ struct batadv_elp_packet *elp_packet;
+ struct sk_buff *skb;
+
+ if (!hard_iface->bat_v.elp_skb)
+ return;
+
+ skb = hard_iface->bat_v.elp_skb;
+ elp_packet = (struct batadv_elp_packet *)skb->data;
+ ether_addr_copy(elp_packet->orig,
+ primary_iface->net_dev->dev_addr);
+}
+
+/**
* batadv_v_elp_primary_iface_set - change internal data to reflect the new
* primary interface
* @primary_iface: the new primary interface
@@ -384,8 +405,6 @@ void batadv_v_elp_iface_disable(struct batadv_hard_iface *hard_iface)
void batadv_v_elp_primary_iface_set(struct batadv_hard_iface *primary_iface)
{
struct batadv_hard_iface *hard_iface;
- struct batadv_elp_packet *elp_packet;
- struct sk_buff *skb;
/* update orig field of every elp iface belonging to this mesh */
rcu_read_lock();
@@ -393,13 +412,7 @@ void batadv_v_elp_primary_iface_set(struct batadv_hard_iface *primary_iface)
if (primary_iface->soft_iface != hard_iface->soft_iface)
continue;
- if (!hard_iface->bat_v.elp_skb)
- continue;
-
- skb = hard_iface->bat_v.elp_skb;
- elp_packet = (struct batadv_elp_packet *)skb->data;
- ether_addr_copy(elp_packet->orig,
- primary_iface->net_dev->dev_addr);
+ batadv_v_elp_iface_activate(primary_iface, hard_iface);
}
rcu_read_unlock();
}
diff --git a/net/batman-adv/bat_v_elp.h b/net/batman-adv/bat_v_elp.h
index e95f1bca0785..cc130b2d05e5 100644
--- a/net/batman-adv/bat_v_elp.h
+++ b/net/batman-adv/bat_v_elp.h
@@ -25,6 +25,8 @@ struct work_struct;
int batadv_v_elp_iface_enable(struct batadv_hard_iface *hard_iface);
void batadv_v_elp_iface_disable(struct batadv_hard_iface *hard_iface);
+void batadv_v_elp_iface_activate(struct batadv_hard_iface *primary_iface,
+ struct batadv_hard_iface *hard_iface);
void batadv_v_elp_primary_iface_set(struct batadv_hard_iface *primary_iface);
int batadv_v_elp_packet_recv(struct sk_buff *skb,
struct batadv_hard_iface *if_incoming);
diff --git a/net/batman-adv/originator.c b/net/batman-adv/originator.c
index 1ff4ee473966..7f51bc2c06eb 100644
--- a/net/batman-adv/originator.c
+++ b/net/batman-adv/originator.c
@@ -619,6 +619,8 @@ batadv_neigh_node_new(struct batadv_orig_node *orig_node,
struct batadv_neigh_node *neigh_node;
struct batadv_hardif_neigh_node *hardif_neigh = NULL;
+ spin_lock_bh(&orig_node->neigh_list_lock);
+
neigh_node = batadv_neigh_node_get(orig_node, hard_iface, neigh_addr);
if (neigh_node)
goto out;
@@ -650,15 +652,15 @@ batadv_neigh_node_new(struct batadv_orig_node *orig_node,
kref_init(&neigh_node->refcount);
kref_get(&neigh_node->refcount);
- spin_lock_bh(&orig_node->neigh_list_lock);
hlist_add_head_rcu(&neigh_node->list, &orig_node->neigh_list);
- spin_unlock_bh(&orig_node->neigh_list_lock);
batadv_dbg(BATADV_DBG_BATMAN, orig_node->bat_priv,
"Creating new neighbor %pM for orig_node %pM on interface %s\n",
neigh_addr, orig_node->orig, hard_iface->net_dev->name);
out:
+ spin_unlock_bh(&orig_node->neigh_list_lock);
+
if (hardif_neigh)
batadv_hardif_neigh_put(hardif_neigh);
return neigh_node;
diff --git a/net/batman-adv/routing.c b/net/batman-adv/routing.c
index ae850f2d11cb..e3857ed4057f 100644
--- a/net/batman-adv/routing.c
+++ b/net/batman-adv/routing.c
@@ -601,6 +601,7 @@ static int batadv_route_unicast_packet(struct sk_buff *skb,
struct batadv_unicast_packet *unicast_packet;
struct ethhdr *ethhdr = eth_hdr(skb);
int res, hdr_len, ret = NET_RX_DROP;
+ unsigned int len;
unicast_packet = (struct batadv_unicast_packet *)skb->data;
@@ -641,6 +642,7 @@ static int batadv_route_unicast_packet(struct sk_buff *skb,
if (hdr_len > 0)
batadv_skb_set_priority(skb, hdr_len);
+ len = skb->len;
res = batadv_send_skb_to_orig(skb, orig_node, recv_if);
/* translate transmit result into receive result */
@@ -648,7 +650,7 @@ static int batadv_route_unicast_packet(struct sk_buff *skb,
/* skb was transmitted and consumed */
batadv_inc_counter(bat_priv, BATADV_CNT_FORWARD);
batadv_add_counter(bat_priv, BATADV_CNT_FORWARD_BYTES,
- skb->len + ETH_HLEN);
+ len + ETH_HLEN);
ret = NET_RX_SUCCESS;
} else if (res == NET_XMIT_POLICED) {
diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c
index dcea4f4c62b3..c18080ad4085 100644
--- a/net/bridge/br_fdb.c
+++ b/net/bridge/br_fdb.c
@@ -279,6 +279,8 @@ void br_fdb_change_mac_address(struct net_bridge *br, const u8 *newaddr)
* change from under us.
*/
list_for_each_entry(v, &vg->vlan_list, vlist) {
+ if (!br_vlan_should_use(v))
+ continue;
f = __br_fdb_get(br, br->dev->dev_addr, v->vid);
if (f && f->is_local && !f->dst)
fdb_delete_local(br, NULL, f);
diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c
index dcc18c6f7cf9..55d2bfee16d7 100644
--- a/net/ceph/ceph_common.c
+++ b/net/ceph/ceph_common.c
@@ -651,7 +651,7 @@ EXPORT_SYMBOL(ceph_destroy_client);
/*
* true if we have the mon map (and have thus joined the cluster)
*/
-static int have_mon_and_osd_map(struct ceph_client *client)
+static bool have_mon_and_osd_map(struct ceph_client *client)
{
return client->monc.monmap && client->monc.monmap->epoch &&
client->osdc.osdmap && client->osdc.osdmap->epoch;
diff --git a/net/ceph/ceph_strings.c b/net/ceph/ceph_strings.c
index 139a9cb19b0c..3773a4fa11e3 100644
--- a/net/ceph/ceph_strings.c
+++ b/net/ceph/ceph_strings.c
@@ -27,6 +27,22 @@ __CEPH_FORALL_OSD_OPS(GENERATE_CASE)
}
}
+const char *ceph_osd_watch_op_name(int o)
+{
+ switch (o) {
+ case CEPH_OSD_WATCH_OP_UNWATCH:
+ return "unwatch";
+ case CEPH_OSD_WATCH_OP_WATCH:
+ return "watch";
+ case CEPH_OSD_WATCH_OP_RECONNECT:
+ return "reconnect";
+ case CEPH_OSD_WATCH_OP_PING:
+ return "ping";
+ default:
+ return "???";
+ }
+}
+
const char *ceph_osd_state_name(int s)
{
switch (s) {
diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c
index b902fbc7863e..e77b04ca7802 100644
--- a/net/ceph/debugfs.c
+++ b/net/ceph/debugfs.c
@@ -54,24 +54,25 @@ static int osdmap_show(struct seq_file *s, void *p)
{
int i;
struct ceph_client *client = s->private;
- struct ceph_osdmap *map = client->osdc.osdmap;
+ struct ceph_osd_client *osdc = &client->osdc;
+ struct ceph_osdmap *map = osdc->osdmap;
struct rb_node *n;
if (map == NULL)
return 0;
- seq_printf(s, "epoch %d\n", map->epoch);
- seq_printf(s, "flags%s%s\n",
- (map->flags & CEPH_OSDMAP_NEARFULL) ? " NEARFULL" : "",
- (map->flags & CEPH_OSDMAP_FULL) ? " FULL" : "");
+ down_read(&osdc->lock);
+ seq_printf(s, "epoch %d flags 0x%x\n", map->epoch, map->flags);
for (n = rb_first(&map->pg_pools); n; n = rb_next(n)) {
- struct ceph_pg_pool_info *pool =
+ struct ceph_pg_pool_info *pi =
rb_entry(n, struct ceph_pg_pool_info, node);
- seq_printf(s, "pool %lld pg_num %u (%d) read_tier %lld write_tier %lld\n",
- pool->id, pool->pg_num, pool->pg_num_mask,
- pool->read_tier, pool->write_tier);
+ seq_printf(s, "pool %lld '%s' type %d size %d min_size %d pg_num %u pg_num_mask %d flags 0x%llx lfor %u read_tier %lld write_tier %lld\n",
+ pi->id, pi->name, pi->type, pi->size, pi->min_size,
+ pi->pg_num, pi->pg_num_mask, pi->flags,
+ pi->last_force_request_resend, pi->read_tier,
+ pi->write_tier);
}
for (i = 0; i < map->max_osd; i++) {
struct ceph_entity_addr *addr = &map->osd_addr[i];
@@ -103,6 +104,7 @@ static int osdmap_show(struct seq_file *s, void *p)
pg->pgid.seed, pg->primary_temp.osd);
}
+ up_read(&osdc->lock);
return 0;
}
@@ -126,6 +128,7 @@ static int monc_show(struct seq_file *s, void *p)
CEPH_SUBSCRIBE_ONETIME ? "" : "+"));
seq_putc(s, '\n');
}
+ seq_printf(s, "fs_cluster_id %d\n", monc->fs_cluster_id);
for (rp = rb_first(&monc->generic_request_tree); rp; rp = rb_next(rp)) {
__u16 op;
@@ -143,43 +146,113 @@ static int monc_show(struct seq_file *s, void *p)
return 0;
}
-static int osdc_show(struct seq_file *s, void *pp)
+static void dump_target(struct seq_file *s, struct ceph_osd_request_target *t)
{
- struct ceph_client *client = s->private;
- struct ceph_osd_client *osdc = &client->osdc;
- struct rb_node *p;
+ int i;
- mutex_lock(&osdc->request_mutex);
- for (p = rb_first(&osdc->requests); p; p = rb_next(p)) {
- struct ceph_osd_request *req;
- unsigned int i;
- int opcode;
+ seq_printf(s, "osd%d\t%llu.%x\t[", t->osd, t->pgid.pool, t->pgid.seed);
+ for (i = 0; i < t->up.size; i++)
+ seq_printf(s, "%s%d", (!i ? "" : ","), t->up.osds[i]);
+ seq_printf(s, "]/%d\t[", t->up.primary);
+ for (i = 0; i < t->acting.size; i++)
+ seq_printf(s, "%s%d", (!i ? "" : ","), t->acting.osds[i]);
+ seq_printf(s, "]/%d\t%*pE\t0x%x", t->acting.primary,
+ t->target_oid.name_len, t->target_oid.name, t->flags);
+ if (t->paused)
+ seq_puts(s, "\tP");
+}
- req = rb_entry(p, struct ceph_osd_request, r_node);
+static void dump_request(struct seq_file *s, struct ceph_osd_request *req)
+{
+ int i;
- seq_printf(s, "%lld\tosd%d\t%lld.%x\t", req->r_tid,
- req->r_osd ? req->r_osd->o_osd : -1,
- req->r_pgid.pool, req->r_pgid.seed);
+ seq_printf(s, "%llu\t", req->r_tid);
+ dump_target(s, &req->r_t);
- seq_printf(s, "%.*s", req->r_base_oid.name_len,
- req->r_base_oid.name);
+ seq_printf(s, "\t%d\t%u'%llu", req->r_attempts,
+ le32_to_cpu(req->r_replay_version.epoch),
+ le64_to_cpu(req->r_replay_version.version));
- if (req->r_reassert_version.epoch)
- seq_printf(s, "\t%u'%llu",
- (unsigned int)le32_to_cpu(req->r_reassert_version.epoch),
- le64_to_cpu(req->r_reassert_version.version));
- else
- seq_printf(s, "\t");
+ for (i = 0; i < req->r_num_ops; i++) {
+ struct ceph_osd_req_op *op = &req->r_ops[i];
+
+ seq_printf(s, "%s%s", (i == 0 ? "\t" : ","),
+ ceph_osd_op_name(op->op));
+ if (op->op == CEPH_OSD_OP_WATCH)
+ seq_printf(s, "-%s",
+ ceph_osd_watch_op_name(op->watch.op));
+ }
+
+ seq_putc(s, '\n');
+}
+
+static void dump_requests(struct seq_file *s, struct ceph_osd *osd)
+{
+ struct rb_node *n;
+
+ mutex_lock(&osd->lock);
+ for (n = rb_first(&osd->o_requests); n; n = rb_next(n)) {
+ struct ceph_osd_request *req =
+ rb_entry(n, struct ceph_osd_request, r_node);
+
+ dump_request(s, req);
+ }
+
+ mutex_unlock(&osd->lock);
+}
- for (i = 0; i < req->r_num_ops; i++) {
- opcode = req->r_ops[i].op;
- seq_printf(s, "%s%s", (i == 0 ? "\t" : ","),
- ceph_osd_op_name(opcode));
- }
+static void dump_linger_request(struct seq_file *s,
+ struct ceph_osd_linger_request *lreq)
+{
+ seq_printf(s, "%llu\t", lreq->linger_id);
+ dump_target(s, &lreq->t);
+
+ seq_printf(s, "\t%u\t%s%s/%d\n", lreq->register_gen,
+ lreq->is_watch ? "W" : "N", lreq->committed ? "C" : "",
+ lreq->last_error);
+}
+
+static void dump_linger_requests(struct seq_file *s, struct ceph_osd *osd)
+{
+ struct rb_node *n;
+
+ mutex_lock(&osd->lock);
+ for (n = rb_first(&osd->o_linger_requests); n; n = rb_next(n)) {
+ struct ceph_osd_linger_request *lreq =
+ rb_entry(n, struct ceph_osd_linger_request, node);
+
+ dump_linger_request(s, lreq);
+ }
+
+ mutex_unlock(&osd->lock);
+}
- seq_printf(s, "\n");
+static int osdc_show(struct seq_file *s, void *pp)
+{
+ struct ceph_client *client = s->private;
+ struct ceph_osd_client *osdc = &client->osdc;
+ struct rb_node *n;
+
+ down_read(&osdc->lock);
+ seq_printf(s, "REQUESTS %d homeless %d\n",
+ atomic_read(&osdc->num_requests),
+ atomic_read(&osdc->num_homeless));
+ for (n = rb_first(&osdc->osds); n; n = rb_next(n)) {
+ struct ceph_osd *osd = rb_entry(n, struct ceph_osd, o_node);
+
+ dump_requests(s, osd);
}
- mutex_unlock(&osdc->request_mutex);
+ dump_requests(s, &osdc->homeless_osd);
+
+ seq_puts(s, "LINGER REQUESTS\n");
+ for (n = rb_first(&osdc->osds); n; n = rb_next(n)) {
+ struct ceph_osd *osd = rb_entry(n, struct ceph_osd, o_node);
+
+ dump_linger_requests(s, osd);
+ }
+ dump_linger_requests(s, &osdc->homeless_osd);
+
+ up_read(&osdc->lock);
return 0;
}
diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c
index cf638c009cfa..37c38a7fb5c5 100644
--- a/net/ceph/mon_client.c
+++ b/net/ceph/mon_client.c
@@ -260,20 +260,26 @@ static void __send_subscribe(struct ceph_mon_client *monc)
BUG_ON(num < 1); /* monmap sub is always there */
ceph_encode_32(&p, num);
for (i = 0; i < ARRAY_SIZE(monc->subs); i++) {
- const char *s = ceph_sub_str[i];
+ char buf[32];
+ int len;
if (!monc->subs[i].want)
continue;
- dout("%s %s start %llu flags 0x%x\n", __func__, s,
+ len = sprintf(buf, "%s", ceph_sub_str[i]);
+ if (i == CEPH_SUB_MDSMAP &&
+ monc->fs_cluster_id != CEPH_FS_CLUSTER_ID_NONE)
+ len += sprintf(buf + len, ".%d", monc->fs_cluster_id);
+
+ dout("%s %s start %llu flags 0x%x\n", __func__, buf,
le64_to_cpu(monc->subs[i].item.start),
monc->subs[i].item.flags);
- ceph_encode_string(&p, end, s, strlen(s));
+ ceph_encode_string(&p, end, buf, len);
memcpy(p, &monc->subs[i].item, sizeof(monc->subs[i].item));
p += sizeof(monc->subs[i].item);
}
- BUG_ON(p != (end - 35 - (ARRAY_SIZE(monc->subs) - num) * 19));
+ BUG_ON(p > end);
msg->front.iov_len = p - msg->front.iov_base;
msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
ceph_msg_revoke(msg);
@@ -376,19 +382,13 @@ void ceph_monc_got_map(struct ceph_mon_client *monc, int sub, u32 epoch)
}
EXPORT_SYMBOL(ceph_monc_got_map);
-/*
- * Register interest in the next osdmap
- */
-void ceph_monc_request_next_osdmap(struct ceph_mon_client *monc)
+void ceph_monc_renew_subs(struct ceph_mon_client *monc)
{
- dout("%s have %u\n", __func__, monc->subs[CEPH_SUB_OSDMAP].have);
mutex_lock(&monc->mutex);
- if (__ceph_monc_want_map(monc, CEPH_SUB_OSDMAP,
- monc->subs[CEPH_SUB_OSDMAP].have + 1, false))
- __send_subscribe(monc);
+ __send_subscribe(monc);
mutex_unlock(&monc->mutex);
}
-EXPORT_SYMBOL(ceph_monc_request_next_osdmap);
+EXPORT_SYMBOL(ceph_monc_renew_subs);
/*
* Wait for an osdmap with a given epoch.
@@ -478,51 +478,17 @@ out:
/*
* generic requests (currently statfs, mon_get_version)
*/
-static struct ceph_mon_generic_request *__lookup_generic_req(
- struct ceph_mon_client *monc, u64 tid)
-{
- struct ceph_mon_generic_request *req;
- struct rb_node *n = monc->generic_request_tree.rb_node;
-
- while (n) {
- req = rb_entry(n, struct ceph_mon_generic_request, node);
- if (tid < req->tid)
- n = n->rb_left;
- else if (tid > req->tid)
- n = n->rb_right;
- else
- return req;
- }
- return NULL;
-}
-
-static void __insert_generic_request(struct ceph_mon_client *monc,
- struct ceph_mon_generic_request *new)
-{
- struct rb_node **p = &monc->generic_request_tree.rb_node;
- struct rb_node *parent = NULL;
- struct ceph_mon_generic_request *req = NULL;
-
- while (*p) {
- parent = *p;
- req = rb_entry(parent, struct ceph_mon_generic_request, node);
- if (new->tid < req->tid)
- p = &(*p)->rb_left;
- else if (new->tid > req->tid)
- p = &(*p)->rb_right;
- else
- BUG();
- }
-
- rb_link_node(&new->node, parent, p);
- rb_insert_color(&new->node, &monc->generic_request_tree);
-}
+DEFINE_RB_FUNCS(generic_request, struct ceph_mon_generic_request, tid, node)
static void release_generic_request(struct kref *kref)
{
struct ceph_mon_generic_request *req =
container_of(kref, struct ceph_mon_generic_request, kref);
+ dout("%s greq %p request %p reply %p\n", __func__, req, req->request,
+ req->reply);
+ WARN_ON(!RB_EMPTY_NODE(&req->node));
+
if (req->reply)
ceph_msg_put(req->reply);
if (req->request)
@@ -533,7 +499,8 @@ static void release_generic_request(struct kref *kref)
static void put_generic_request(struct ceph_mon_generic_request *req)
{
- kref_put(&req->kref, release_generic_request);
+ if (req)
+ kref_put(&req->kref, release_generic_request);
}
static void get_generic_request(struct ceph_mon_generic_request *req)
@@ -541,6 +508,103 @@ static void get_generic_request(struct ceph_mon_generic_request *req)
kref_get(&req->kref);
}
+static struct ceph_mon_generic_request *
+alloc_generic_request(struct ceph_mon_client *monc, gfp_t gfp)
+{
+ struct ceph_mon_generic_request *req;
+
+ req = kzalloc(sizeof(*req), gfp);
+ if (!req)
+ return NULL;
+
+ req->monc = monc;
+ kref_init(&req->kref);
+ RB_CLEAR_NODE(&req->node);
+ init_completion(&req->completion);
+
+ dout("%s greq %p\n", __func__, req);
+ return req;
+}
+
+static void register_generic_request(struct ceph_mon_generic_request *req)
+{
+ struct ceph_mon_client *monc = req->monc;
+
+ WARN_ON(req->tid);
+
+ get_generic_request(req);
+ req->tid = ++monc->last_tid;
+ insert_generic_request(&monc->generic_request_tree, req);
+}
+
+static void send_generic_request(struct ceph_mon_client *monc,
+ struct ceph_mon_generic_request *req)
+{
+ WARN_ON(!req->tid);
+
+ dout("%s greq %p tid %llu\n", __func__, req, req->tid);
+ req->request->hdr.tid = cpu_to_le64(req->tid);
+ ceph_con_send(&monc->con, ceph_msg_get(req->request));
+}
+
+static void __finish_generic_request(struct ceph_mon_generic_request *req)
+{
+ struct ceph_mon_client *monc = req->monc;
+
+ dout("%s greq %p tid %llu\n", __func__, req, req->tid);
+ erase_generic_request(&monc->generic_request_tree, req);
+
+ ceph_msg_revoke(req->request);
+ ceph_msg_revoke_incoming(req->reply);
+}
+
+static void finish_generic_request(struct ceph_mon_generic_request *req)
+{
+ __finish_generic_request(req);
+ put_generic_request(req);
+}
+
+static void complete_generic_request(struct ceph_mon_generic_request *req)
+{
+ if (req->complete_cb)
+ req->complete_cb(req);
+ else
+ complete_all(&req->completion);
+ put_generic_request(req);
+}
+
+void cancel_generic_request(struct ceph_mon_generic_request *req)
+{
+ struct ceph_mon_client *monc = req->monc;
+ struct ceph_mon_generic_request *lookup_req;
+
+ dout("%s greq %p tid %llu\n", __func__, req, req->tid);
+
+ mutex_lock(&monc->mutex);
+ lookup_req = lookup_generic_request(&monc->generic_request_tree,
+ req->tid);
+ if (lookup_req) {
+ WARN_ON(lookup_req != req);
+ finish_generic_request(req);
+ }
+
+ mutex_unlock(&monc->mutex);
+}
+
+static int wait_generic_request(struct ceph_mon_generic_request *req)
+{
+ int ret;
+
+ dout("%s greq %p tid %llu\n", __func__, req, req->tid);
+ ret = wait_for_completion_interruptible(&req->completion);
+ if (ret)
+ cancel_generic_request(req);
+ else
+ ret = req->result; /* completed */
+
+ return ret;
+}
+
static struct ceph_msg *get_generic_reply(struct ceph_connection *con,
struct ceph_msg_header *hdr,
int *skip)
@@ -551,7 +615,7 @@ static struct ceph_msg *get_generic_reply(struct ceph_connection *con,
struct ceph_msg *m;
mutex_lock(&monc->mutex);
- req = __lookup_generic_req(monc, tid);
+ req = lookup_generic_request(&monc->generic_request_tree, tid);
if (!req) {
dout("get_generic_reply %lld dne\n", tid);
*skip = 1;
@@ -570,42 +634,6 @@ static struct ceph_msg *get_generic_reply(struct ceph_connection *con,
return m;
}
-static int __do_generic_request(struct ceph_mon_client *monc, u64 tid,
- struct ceph_mon_generic_request *req)
-{
- int err;
-
- /* register request */
- req->tid = tid != 0 ? tid : ++monc->last_tid;
- req->request->hdr.tid = cpu_to_le64(req->tid);
- __insert_generic_request(monc, req);
- monc->num_generic_requests++;
- ceph_con_send(&monc->con, ceph_msg_get(req->request));
- mutex_unlock(&monc->mutex);
-
- err = wait_for_completion_interruptible(&req->completion);
-
- mutex_lock(&monc->mutex);
- rb_erase(&req->node, &monc->generic_request_tree);
- monc->num_generic_requests--;
-
- if (!err)
- err = req->result;
- return err;
-}
-
-static int do_generic_request(struct ceph_mon_client *monc,
- struct ceph_mon_generic_request *req)
-{
- int err;
-
- mutex_lock(&monc->mutex);
- err = __do_generic_request(monc, 0, req);
- mutex_unlock(&monc->mutex);
-
- return err;
-}
-
/*
* statfs
*/
@@ -616,22 +644,24 @@ static void handle_statfs_reply(struct ceph_mon_client *monc,
struct ceph_mon_statfs_reply *reply = msg->front.iov_base;
u64 tid = le64_to_cpu(msg->hdr.tid);
+ dout("%s msg %p tid %llu\n", __func__, msg, tid);
+
if (msg->front.iov_len != sizeof(*reply))
goto bad;
- dout("handle_statfs_reply %p tid %llu\n", msg, tid);
mutex_lock(&monc->mutex);
- req = __lookup_generic_req(monc, tid);
- if (req) {
- *(struct ceph_statfs *)req->buf = reply->st;
- req->result = 0;
- get_generic_request(req);
+ req = lookup_generic_request(&monc->generic_request_tree, tid);
+ if (!req) {
+ mutex_unlock(&monc->mutex);
+ return;
}
+
+ req->result = 0;
+ *req->u.st = reply->st; /* struct */
+ __finish_generic_request(req);
mutex_unlock(&monc->mutex);
- if (req) {
- complete_all(&req->completion);
- put_generic_request(req);
- }
+
+ complete_generic_request(req);
return;
bad:
@@ -646,38 +676,38 @@ int ceph_monc_do_statfs(struct ceph_mon_client *monc, struct ceph_statfs *buf)
{
struct ceph_mon_generic_request *req;
struct ceph_mon_statfs *h;
- int err;
+ int ret = -ENOMEM;
- req = kzalloc(sizeof(*req), GFP_NOFS);
+ req = alloc_generic_request(monc, GFP_NOFS);
if (!req)
- return -ENOMEM;
-
- kref_init(&req->kref);
- req->buf = buf;
- init_completion(&req->completion);
+ goto out;
- err = -ENOMEM;
req->request = ceph_msg_new(CEPH_MSG_STATFS, sizeof(*h), GFP_NOFS,
true);
if (!req->request)
goto out;
- req->reply = ceph_msg_new(CEPH_MSG_STATFS_REPLY, 1024, GFP_NOFS,
- true);
+
+ req->reply = ceph_msg_new(CEPH_MSG_STATFS_REPLY, 64, GFP_NOFS, true);
if (!req->reply)
goto out;
+ req->u.st = buf;
+
+ mutex_lock(&monc->mutex);
+ register_generic_request(req);
/* fill out request */
h = req->request->front.iov_base;
h->monhdr.have_version = 0;
h->monhdr.session_mon = cpu_to_le16(-1);
h->monhdr.session_mon_tid = 0;
h->fsid = monc->monmap->fsid;
+ send_generic_request(monc, req);
+ mutex_unlock(&monc->mutex);
- err = do_generic_request(monc, req);
-
+ ret = wait_generic_request(req);
out:
put_generic_request(req);
- return err;
+ return ret;
}
EXPORT_SYMBOL(ceph_monc_do_statfs);
@@ -690,7 +720,7 @@ static void handle_get_version_reply(struct ceph_mon_client *monc,
void *end = p + msg->front_alloc_len;
u64 handle;
- dout("%s %p tid %llu\n", __func__, msg, tid);
+ dout("%s msg %p tid %llu\n", __func__, msg, tid);
ceph_decode_need(&p, end, 2*sizeof(u64), bad);
handle = ceph_decode_64(&p);
@@ -698,77 +728,111 @@ static void handle_get_version_reply(struct ceph_mon_client *monc,
goto bad;
mutex_lock(&monc->mutex);
- req = __lookup_generic_req(monc, handle);
- if (req) {
- *(u64 *)req->buf = ceph_decode_64(&p);
- req->result = 0;
- get_generic_request(req);
+ req = lookup_generic_request(&monc->generic_request_tree, handle);
+ if (!req) {
+ mutex_unlock(&monc->mutex);
+ return;
}
+
+ req->result = 0;
+ req->u.newest = ceph_decode_64(&p);
+ __finish_generic_request(req);
mutex_unlock(&monc->mutex);
- if (req) {
- complete_all(&req->completion);
- put_generic_request(req);
- }
+ complete_generic_request(req);
return;
+
bad:
pr_err("corrupt mon_get_version reply, tid %llu\n", tid);
ceph_msg_dump(msg);
}
-/*
- * Send MMonGetVersion and wait for the reply.
- *
- * @what: one of "mdsmap", "osdmap" or "monmap"
- */
-int ceph_monc_do_get_version(struct ceph_mon_client *monc, const char *what,
- u64 *newest)
+static struct ceph_mon_generic_request *
+__ceph_monc_get_version(struct ceph_mon_client *monc, const char *what,
+ ceph_monc_callback_t cb, u64 private_data)
{
struct ceph_mon_generic_request *req;
- void *p, *end;
- u64 tid;
- int err;
- req = kzalloc(sizeof(*req), GFP_NOFS);
+ req = alloc_generic_request(monc, GFP_NOIO);
if (!req)
- return -ENOMEM;
-
- kref_init(&req->kref);
- req->buf = newest;
- init_completion(&req->completion);
+ goto err_put_req;
req->request = ceph_msg_new(CEPH_MSG_MON_GET_VERSION,
sizeof(u64) + sizeof(u32) + strlen(what),
- GFP_NOFS, true);
- if (!req->request) {
- err = -ENOMEM;
- goto out;
- }
+ GFP_NOIO, true);
+ if (!req->request)
+ goto err_put_req;
- req->reply = ceph_msg_new(CEPH_MSG_MON_GET_VERSION_REPLY, 1024,
- GFP_NOFS, true);
- if (!req->reply) {
- err = -ENOMEM;
- goto out;
- }
+ req->reply = ceph_msg_new(CEPH_MSG_MON_GET_VERSION_REPLY, 32, GFP_NOIO,
+ true);
+ if (!req->reply)
+ goto err_put_req;
- p = req->request->front.iov_base;
- end = p + req->request->front_alloc_len;
+ req->complete_cb = cb;
+ req->private_data = private_data;
- /* fill out request */
mutex_lock(&monc->mutex);
- tid = ++monc->last_tid;
- ceph_encode_64(&p, tid); /* handle */
- ceph_encode_string(&p, end, what, strlen(what));
+ register_generic_request(req);
+ {
+ void *p = req->request->front.iov_base;
+ void *const end = p + req->request->front_alloc_len;
+
+ ceph_encode_64(&p, req->tid); /* handle */
+ ceph_encode_string(&p, end, what, strlen(what));
+ WARN_ON(p != end);
+ }
+ send_generic_request(monc, req);
+ mutex_unlock(&monc->mutex);
- err = __do_generic_request(monc, tid, req);
+ return req;
- mutex_unlock(&monc->mutex);
-out:
+err_put_req:
put_generic_request(req);
- return err;
+ return ERR_PTR(-ENOMEM);
+}
+
+/*
+ * Send MMonGetVersion and wait for the reply.
+ *
+ * @what: one of "mdsmap", "osdmap" or "monmap"
+ */
+int ceph_monc_get_version(struct ceph_mon_client *monc, const char *what,
+ u64 *newest)
+{
+ struct ceph_mon_generic_request *req;
+ int ret;
+
+ req = __ceph_monc_get_version(monc, what, NULL, 0);
+ if (IS_ERR(req))
+ return PTR_ERR(req);
+
+ ret = wait_generic_request(req);
+ if (!ret)
+ *newest = req->u.newest;
+
+ put_generic_request(req);
+ return ret;
}
-EXPORT_SYMBOL(ceph_monc_do_get_version);
+EXPORT_SYMBOL(ceph_monc_get_version);
+
+/*
+ * Send MMonGetVersion,
+ *
+ * @what: one of "mdsmap", "osdmap" or "monmap"
+ */
+int ceph_monc_get_version_async(struct ceph_mon_client *monc, const char *what,
+ ceph_monc_callback_t cb, u64 private_data)
+{
+ struct ceph_mon_generic_request *req;
+
+ req = __ceph_monc_get_version(monc, what, cb, private_data);
+ if (IS_ERR(req))
+ return PTR_ERR(req);
+
+ put_generic_request(req);
+ return 0;
+}
+EXPORT_SYMBOL(ceph_monc_get_version_async);
/*
* Resend pending generic requests.
@@ -890,7 +954,7 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl)
if (!monc->m_subscribe_ack)
goto out_auth;
- monc->m_subscribe = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE, 96, GFP_NOFS,
+ monc->m_subscribe = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE, 128, GFP_NOFS,
true);
if (!monc->m_subscribe)
goto out_subscribe_ack;
@@ -914,9 +978,10 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl)
INIT_DELAYED_WORK(&monc->delayed_work, delayed_work);
monc->generic_request_tree = RB_ROOT;
- monc->num_generic_requests = 0;
monc->last_tid = 0;
+ monc->fs_cluster_id = CEPH_FS_CLUSTER_ID_NONE;
+
return 0;
out_auth_reply:
@@ -954,6 +1019,8 @@ void ceph_monc_stop(struct ceph_mon_client *monc)
ceph_auth_destroy(monc->auth);
+ WARN_ON(!RB_EMPTY_ROOT(&monc->generic_request_tree));
+
ceph_msg_put(monc->m_auth);
ceph_msg_put(monc->m_auth_reply);
ceph_msg_put(monc->m_subscribe);
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index 40a53a70efdf..89469592076c 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -19,25 +19,12 @@
#include <linux/ceph/auth.h>
#include <linux/ceph/pagelist.h>
-#define OSD_OP_FRONT_LEN 4096
#define OSD_OPREPLY_FRONT_LEN 512
static struct kmem_cache *ceph_osd_request_cache;
static const struct ceph_connection_operations osd_con_ops;
-static void __send_queued(struct ceph_osd_client *osdc);
-static int __reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd);
-static void __register_request(struct ceph_osd_client *osdc,
- struct ceph_osd_request *req);
-static void __unregister_request(struct ceph_osd_client *osdc,
- struct ceph_osd_request *req);
-static void __unregister_linger_request(struct ceph_osd_client *osdc,
- struct ceph_osd_request *req);
-static void __enqueue_request(struct ceph_osd_request *req);
-static void __send_request(struct ceph_osd_client *osdc,
- struct ceph_osd_request *req);
-
/*
* Implement client access to distributed object storage cluster.
*
@@ -56,6 +43,52 @@ static void __send_request(struct ceph_osd_client *osdc,
* channel with an OSD is reset.
*/
+static void link_request(struct ceph_osd *osd, struct ceph_osd_request *req);
+static void unlink_request(struct ceph_osd *osd, struct ceph_osd_request *req);
+static void link_linger(struct ceph_osd *osd,
+ struct ceph_osd_linger_request *lreq);
+static void unlink_linger(struct ceph_osd *osd,
+ struct ceph_osd_linger_request *lreq);
+
+#if 1
+static inline bool rwsem_is_wrlocked(struct rw_semaphore *sem)
+{
+ bool wrlocked = true;
+
+ if (unlikely(down_read_trylock(sem))) {
+ wrlocked = false;
+ up_read(sem);
+ }
+
+ return wrlocked;
+}
+static inline void verify_osdc_locked(struct ceph_osd_client *osdc)
+{
+ WARN_ON(!rwsem_is_locked(&osdc->lock));
+}
+static inline void verify_osdc_wrlocked(struct ceph_osd_client *osdc)
+{
+ WARN_ON(!rwsem_is_wrlocked(&osdc->lock));
+}
+static inline void verify_osd_locked(struct ceph_osd *osd)
+{
+ struct ceph_osd_client *osdc = osd->o_osdc;
+
+ WARN_ON(!(mutex_is_locked(&osd->lock) &&
+ rwsem_is_locked(&osdc->lock)) &&
+ !rwsem_is_wrlocked(&osdc->lock));
+}
+static inline void verify_lreq_locked(struct ceph_osd_linger_request *lreq)
+{
+ WARN_ON(!mutex_is_locked(&lreq->lock));
+}
+#else
+static inline void verify_osdc_locked(struct ceph_osd_client *osdc) { }
+static inline void verify_osdc_wrlocked(struct ceph_osd_client *osdc) { }
+static inline void verify_osd_locked(struct ceph_osd *osd) { }
+static inline void verify_lreq_locked(struct ceph_osd_linger_request *lreq) { }
+#endif
+
/*
* calculate the mapping of a file extent onto an object, and fill out the
* request accordingly. shorten extent as necessary if it crosses an
@@ -144,14 +177,6 @@ osd_req_op_extent_osd_data(struct ceph_osd_request *osd_req,
}
EXPORT_SYMBOL(osd_req_op_extent_osd_data);
-struct ceph_osd_data *
-osd_req_op_cls_response_data(struct ceph_osd_request *osd_req,
- unsigned int which)
-{
- return osd_req_op_data(osd_req, which, cls, response_data);
-}
-EXPORT_SYMBOL(osd_req_op_cls_response_data); /* ??? */
-
void osd_req_op_raw_data_in_pages(struct ceph_osd_request *osd_req,
unsigned int which, struct page **pages,
u64 length, u32 alignment,
@@ -218,6 +243,8 @@ void osd_req_op_cls_request_data_pagelist(
osd_data = osd_req_op_data(osd_req, which, cls, request_data);
ceph_osd_data_pagelist_init(osd_data, pagelist);
+ osd_req->r_ops[which].cls.indata_len += pagelist->length;
+ osd_req->r_ops[which].indata_len += pagelist->length;
}
EXPORT_SYMBOL(osd_req_op_cls_request_data_pagelist);
@@ -230,6 +257,8 @@ void osd_req_op_cls_request_data_pages(struct ceph_osd_request *osd_req,
osd_data = osd_req_op_data(osd_req, which, cls, request_data);
ceph_osd_data_pages_init(osd_data, pages, length, alignment,
pages_from_pool, own_pages);
+ osd_req->r_ops[which].cls.indata_len += length;
+ osd_req->r_ops[which].indata_len += length;
}
EXPORT_SYMBOL(osd_req_op_cls_request_data_pages);
@@ -302,14 +331,76 @@ static void osd_req_op_data_release(struct ceph_osd_request *osd_req,
case CEPH_OSD_OP_STAT:
ceph_osd_data_release(&op->raw_data_in);
break;
+ case CEPH_OSD_OP_NOTIFY_ACK:
+ ceph_osd_data_release(&op->notify_ack.request_data);
+ break;
+ case CEPH_OSD_OP_NOTIFY:
+ ceph_osd_data_release(&op->notify.request_data);
+ ceph_osd_data_release(&op->notify.response_data);
+ break;
default:
break;
}
}
/*
+ * Assumes @t is zero-initialized.
+ */
+static void target_init(struct ceph_osd_request_target *t)
+{
+ ceph_oid_init(&t->base_oid);
+ ceph_oloc_init(&t->base_oloc);
+ ceph_oid_init(&t->target_oid);
+ ceph_oloc_init(&t->target_oloc);
+
+ ceph_osds_init(&t->acting);
+ ceph_osds_init(&t->up);
+ t->size = -1;
+ t->min_size = -1;
+
+ t->osd = CEPH_HOMELESS_OSD;
+}
+
+static void target_copy(struct ceph_osd_request_target *dest,
+ const struct ceph_osd_request_target *src)
+{
+ ceph_oid_copy(&dest->base_oid, &src->base_oid);
+ ceph_oloc_copy(&dest->base_oloc, &src->base_oloc);
+ ceph_oid_copy(&dest->target_oid, &src->target_oid);
+ ceph_oloc_copy(&dest->target_oloc, &src->target_oloc);
+
+ dest->pgid = src->pgid; /* struct */
+ dest->pg_num = src->pg_num;
+ dest->pg_num_mask = src->pg_num_mask;
+ ceph_osds_copy(&dest->acting, &src->acting);
+ ceph_osds_copy(&dest->up, &src->up);
+ dest->size = src->size;
+ dest->min_size = src->min_size;
+ dest->sort_bitwise = src->sort_bitwise;
+
+ dest->flags = src->flags;
+ dest->paused = src->paused;
+
+ dest->osd = src->osd;
+}
+
+static void target_destroy(struct ceph_osd_request_target *t)
+{
+ ceph_oid_destroy(&t->base_oid);
+ ceph_oid_destroy(&t->target_oid);
+}
+
+/*
* requests
*/
+static void request_release_checks(struct ceph_osd_request *req)
+{
+ WARN_ON(!RB_EMPTY_NODE(&req->r_node));
+ WARN_ON(!RB_EMPTY_NODE(&req->r_mc_node));
+ WARN_ON(!list_empty(&req->r_unsafe_item));
+ WARN_ON(req->r_osd);
+}
+
static void ceph_osdc_release_request(struct kref *kref)
{
struct ceph_osd_request *req = container_of(kref,
@@ -318,24 +409,19 @@ static void ceph_osdc_release_request(struct kref *kref)
dout("%s %p (r_request %p r_reply %p)\n", __func__, req,
req->r_request, req->r_reply);
- WARN_ON(!RB_EMPTY_NODE(&req->r_node));
- WARN_ON(!list_empty(&req->r_req_lru_item));
- WARN_ON(!list_empty(&req->r_osd_item));
- WARN_ON(!list_empty(&req->r_linger_item));
- WARN_ON(!list_empty(&req->r_linger_osd_item));
- WARN_ON(req->r_osd);
+ request_release_checks(req);
if (req->r_request)
ceph_msg_put(req->r_request);
- if (req->r_reply) {
- ceph_msg_revoke_incoming(req->r_reply);
+ if (req->r_reply)
ceph_msg_put(req->r_reply);
- }
for (which = 0; which < req->r_num_ops; which++)
osd_req_op_data_release(req, which);
+ target_destroy(&req->r_t);
ceph_put_snap_context(req->r_snapc);
+
if (req->r_mempool)
mempool_free(req, req->r_osdc->req_mempool);
else if (req->r_num_ops <= CEPH_OSD_SLAB_OPS)
@@ -354,12 +440,66 @@ EXPORT_SYMBOL(ceph_osdc_get_request);
void ceph_osdc_put_request(struct ceph_osd_request *req)
{
- dout("%s %p (was %d)\n", __func__, req,
- atomic_read(&req->r_kref.refcount));
- kref_put(&req->r_kref, ceph_osdc_release_request);
+ if (req) {
+ dout("%s %p (was %d)\n", __func__, req,
+ atomic_read(&req->r_kref.refcount));
+ kref_put(&req->r_kref, ceph_osdc_release_request);
+ }
}
EXPORT_SYMBOL(ceph_osdc_put_request);
+static void request_init(struct ceph_osd_request *req)
+{
+ /* req only, each op is zeroed in _osd_req_op_init() */
+ memset(req, 0, sizeof(*req));
+
+ kref_init(&req->r_kref);
+ init_completion(&req->r_completion);
+ init_completion(&req->r_safe_completion);
+ RB_CLEAR_NODE(&req->r_node);
+ RB_CLEAR_NODE(&req->r_mc_node);
+ INIT_LIST_HEAD(&req->r_unsafe_item);
+
+ target_init(&req->r_t);
+}
+
+/*
+ * This is ugly, but it allows us to reuse linger registration and ping
+ * requests, keeping the structure of the code around send_linger{_ping}()
+ * reasonable. Setting up a min_nr=2 mempool for each linger request
+ * and dealing with copying ops (this blasts req only, watch op remains
+ * intact) isn't any better.
+ */
+static void request_reinit(struct ceph_osd_request *req)
+{
+ struct ceph_osd_client *osdc = req->r_osdc;
+ bool mempool = req->r_mempool;
+ unsigned int num_ops = req->r_num_ops;
+ u64 snapid = req->r_snapid;
+ struct ceph_snap_context *snapc = req->r_snapc;
+ bool linger = req->r_linger;
+ struct ceph_msg *request_msg = req->r_request;
+ struct ceph_msg *reply_msg = req->r_reply;
+
+ dout("%s req %p\n", __func__, req);
+ WARN_ON(atomic_read(&req->r_kref.refcount) != 1);
+ request_release_checks(req);
+
+ WARN_ON(atomic_read(&request_msg->kref.refcount) != 1);
+ WARN_ON(atomic_read(&reply_msg->kref.refcount) != 1);
+ target_destroy(&req->r_t);
+
+ request_init(req);
+ req->r_osdc = osdc;
+ req->r_mempool = mempool;
+ req->r_num_ops = num_ops;
+ req->r_snapid = snapid;
+ req->r_snapc = snapc;
+ req->r_linger = linger;
+ req->r_request = request_msg;
+ req->r_reply = reply_msg;
+}
+
struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
struct ceph_snap_context *snapc,
unsigned int num_ops,
@@ -367,8 +507,6 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
gfp_t gfp_flags)
{
struct ceph_osd_request *req;
- struct ceph_msg *msg;
- size_t msg_size;
if (use_mempool) {
BUG_ON(num_ops > CEPH_OSD_SLAB_OPS);
@@ -383,73 +521,65 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
if (unlikely(!req))
return NULL;
- /* req only, each op is zeroed in _osd_req_op_init() */
- memset(req, 0, sizeof(*req));
-
+ request_init(req);
req->r_osdc = osdc;
req->r_mempool = use_mempool;
req->r_num_ops = num_ops;
+ req->r_snapid = CEPH_NOSNAP;
+ req->r_snapc = ceph_get_snap_context(snapc);
- kref_init(&req->r_kref);
- init_completion(&req->r_completion);
- init_completion(&req->r_safe_completion);
- RB_CLEAR_NODE(&req->r_node);
- INIT_LIST_HEAD(&req->r_unsafe_item);
- INIT_LIST_HEAD(&req->r_linger_item);
- INIT_LIST_HEAD(&req->r_linger_osd_item);
- INIT_LIST_HEAD(&req->r_req_lru_item);
- INIT_LIST_HEAD(&req->r_osd_item);
-
- req->r_base_oloc.pool = -1;
- req->r_target_oloc.pool = -1;
+ dout("%s req %p\n", __func__, req);
+ return req;
+}
+EXPORT_SYMBOL(ceph_osdc_alloc_request);
- msg_size = OSD_OPREPLY_FRONT_LEN;
- if (num_ops > CEPH_OSD_SLAB_OPS) {
- /* ceph_osd_op and rval */
- msg_size += (num_ops - CEPH_OSD_SLAB_OPS) *
- (sizeof(struct ceph_osd_op) + 4);
- }
+int ceph_osdc_alloc_messages(struct ceph_osd_request *req, gfp_t gfp)
+{
+ struct ceph_osd_client *osdc = req->r_osdc;
+ struct ceph_msg *msg;
+ int msg_size;
- /* create reply message */
- if (use_mempool)
- msg = ceph_msgpool_get(&osdc->msgpool_op_reply, 0);
- else
- msg = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, msg_size,
- gfp_flags, true);
- if (!msg) {
- ceph_osdc_put_request(req);
- return NULL;
- }
- req->r_reply = msg;
+ WARN_ON(ceph_oid_empty(&req->r_base_oid));
+ /* create request message */
msg_size = 4 + 4 + 4; /* client_inc, osdmap_epoch, flags */
msg_size += 4 + 4 + 4 + 8; /* mtime, reassert_version */
msg_size += 2 + 4 + 8 + 4 + 4; /* oloc */
msg_size += 1 + 8 + 4 + 4; /* pgid */
- msg_size += 4 + CEPH_MAX_OID_NAME_LEN; /* oid */
- msg_size += 2 + num_ops * sizeof(struct ceph_osd_op);
+ msg_size += 4 + req->r_base_oid.name_len; /* oid */
+ msg_size += 2 + req->r_num_ops * sizeof(struct ceph_osd_op);
msg_size += 8; /* snapid */
msg_size += 8; /* snap_seq */
- msg_size += 4 + 8 * (snapc ? snapc->num_snaps : 0); /* snaps */
+ msg_size += 4 + 8 * (req->r_snapc ? req->r_snapc->num_snaps : 0);
msg_size += 4; /* retry_attempt */
- /* create request message; allow space for oid */
- if (use_mempool)
+ if (req->r_mempool)
msg = ceph_msgpool_get(&osdc->msgpool_op, 0);
else
- msg = ceph_msg_new(CEPH_MSG_OSD_OP, msg_size, gfp_flags, true);
- if (!msg) {
- ceph_osdc_put_request(req);
- return NULL;
- }
+ msg = ceph_msg_new(CEPH_MSG_OSD_OP, msg_size, gfp, true);
+ if (!msg)
+ return -ENOMEM;
memset(msg->front.iov_base, 0, msg->front.iov_len);
-
req->r_request = msg;
- return req;
+ /* create reply message */
+ msg_size = OSD_OPREPLY_FRONT_LEN;
+ msg_size += req->r_base_oid.name_len;
+ msg_size += req->r_num_ops * sizeof(struct ceph_osd_op);
+
+ if (req->r_mempool)
+ msg = ceph_msgpool_get(&osdc->msgpool_op_reply, 0);
+ else
+ msg = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, msg_size, gfp, true);
+ if (!msg)
+ return -ENOMEM;
+
+ req->r_reply = msg;
+
+ return 0;
}
-EXPORT_SYMBOL(ceph_osdc_alloc_request);
+EXPORT_SYMBOL(ceph_osdc_alloc_messages);
static bool osd_req_opcode_valid(u16 opcode)
{
@@ -587,8 +717,6 @@ void osd_req_op_cls_init(struct ceph_osd_request *osd_req, unsigned int which,
osd_req_op_cls_request_info_pagelist(osd_req, which, pagelist);
- op->cls.argc = 0; /* currently unused */
-
op->indata_len = payload_len;
}
EXPORT_SYMBOL(osd_req_op_cls_init);
@@ -627,21 +755,19 @@ int osd_req_op_xattr_init(struct ceph_osd_request *osd_req, unsigned int which,
}
EXPORT_SYMBOL(osd_req_op_xattr_init);
-void osd_req_op_watch_init(struct ceph_osd_request *osd_req,
- unsigned int which, u16 opcode,
- u64 cookie, u64 version, int flag)
+/*
+ * @watch_opcode: CEPH_OSD_WATCH_OP_*
+ */
+static void osd_req_op_watch_init(struct ceph_osd_request *req, int which,
+ u64 cookie, u8 watch_opcode)
{
- struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which,
- opcode, 0);
-
- BUG_ON(opcode != CEPH_OSD_OP_NOTIFY_ACK && opcode != CEPH_OSD_OP_WATCH);
+ struct ceph_osd_req_op *op;
+ op = _osd_req_op_init(req, which, CEPH_OSD_OP_WATCH, 0);
op->watch.cookie = cookie;
- op->watch.ver = version;
- if (opcode == CEPH_OSD_OP_WATCH && flag)
- op->watch.flag = (u8)1;
+ op->watch.op = watch_opcode;
+ op->watch.gen = 0;
}
-EXPORT_SYMBOL(osd_req_op_watch_init);
void osd_req_op_alloc_hint_init(struct ceph_osd_request *osd_req,
unsigned int which,
@@ -686,16 +812,9 @@ static void ceph_osdc_msg_data_add(struct ceph_msg *msg,
}
}
-static u64 osd_req_encode_op(struct ceph_osd_request *req,
- struct ceph_osd_op *dst, unsigned int which)
+static u32 osd_req_encode_op(struct ceph_osd_op *dst,
+ const struct ceph_osd_req_op *src)
{
- struct ceph_osd_req_op *src;
- struct ceph_osd_data *osd_data;
- u64 request_data_len = 0;
- u64 data_length;
-
- BUG_ON(which >= req->r_num_ops);
- src = &req->r_ops[which];
if (WARN_ON(!osd_req_opcode_valid(src->op))) {
pr_err("unrecognized osd opcode %d\n", src->op);
@@ -704,57 +823,36 @@ static u64 osd_req_encode_op(struct ceph_osd_request *req,
switch (src->op) {
case CEPH_OSD_OP_STAT:
- osd_data = &src->raw_data_in;
- ceph_osdc_msg_data_add(req->r_reply, osd_data);
break;
case CEPH_OSD_OP_READ:
case CEPH_OSD_OP_WRITE:
case CEPH_OSD_OP_WRITEFULL:
case CEPH_OSD_OP_ZERO:
case CEPH_OSD_OP_TRUNCATE:
- if (src->op == CEPH_OSD_OP_WRITE ||
- src->op == CEPH_OSD_OP_WRITEFULL)
- request_data_len = src->extent.length;
dst->extent.offset = cpu_to_le64(src->extent.offset);
dst->extent.length = cpu_to_le64(src->extent.length);
dst->extent.truncate_size =
cpu_to_le64(src->extent.truncate_size);
dst->extent.truncate_seq =
cpu_to_le32(src->extent.truncate_seq);
- osd_data = &src->extent.osd_data;
- if (src->op == CEPH_OSD_OP_WRITE ||
- src->op == CEPH_OSD_OP_WRITEFULL)
- ceph_osdc_msg_data_add(req->r_request, osd_data);
- else
- ceph_osdc_msg_data_add(req->r_reply, osd_data);
break;
case CEPH_OSD_OP_CALL:
dst->cls.class_len = src->cls.class_len;
dst->cls.method_len = src->cls.method_len;
- osd_data = &src->cls.request_info;
- ceph_osdc_msg_data_add(req->r_request, osd_data);
- BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_PAGELIST);
- request_data_len = osd_data->pagelist->length;
-
- osd_data = &src->cls.request_data;
- data_length = ceph_osd_data_length(osd_data);
- if (data_length) {
- BUG_ON(osd_data->type == CEPH_OSD_DATA_TYPE_NONE);
- dst->cls.indata_len = cpu_to_le32(data_length);
- ceph_osdc_msg_data_add(req->r_request, osd_data);
- src->indata_len += data_length;
- request_data_len += data_length;
- }
- osd_data = &src->cls.response_data;
- ceph_osdc_msg_data_add(req->r_reply, osd_data);
+ dst->cls.indata_len = cpu_to_le32(src->cls.indata_len);
break;
case CEPH_OSD_OP_STARTSYNC:
break;
- case CEPH_OSD_OP_NOTIFY_ACK:
case CEPH_OSD_OP_WATCH:
dst->watch.cookie = cpu_to_le64(src->watch.cookie);
- dst->watch.ver = cpu_to_le64(src->watch.ver);
- dst->watch.flag = src->watch.flag;
+ dst->watch.ver = cpu_to_le64(0);
+ dst->watch.op = src->watch.op;
+ dst->watch.gen = cpu_to_le32(src->watch.gen);
+ break;
+ case CEPH_OSD_OP_NOTIFY_ACK:
+ break;
+ case CEPH_OSD_OP_NOTIFY:
+ dst->notify.cookie = cpu_to_le64(src->notify.cookie);
break;
case CEPH_OSD_OP_SETALLOCHINT:
dst->alloc_hint.expected_object_size =
@@ -768,9 +866,6 @@ static u64 osd_req_encode_op(struct ceph_osd_request *req,
dst->xattr.value_len = cpu_to_le32(src->xattr.value_len);
dst->xattr.cmp_op = src->xattr.cmp_op;
dst->xattr.cmp_mode = src->xattr.cmp_mode;
- osd_data = &src->xattr.osd_data;
- ceph_osdc_msg_data_add(req->r_request, osd_data);
- request_data_len = osd_data->pagelist->length;
break;
case CEPH_OSD_OP_CREATE:
case CEPH_OSD_OP_DELETE:
@@ -787,7 +882,7 @@ static u64 osd_req_encode_op(struct ceph_osd_request *req,
dst->flags = cpu_to_le32(src->flags);
dst->payload_len = cpu_to_le32(src->indata_len);
- return request_data_len;
+ return src->indata_len;
}
/*
@@ -824,17 +919,15 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
req = ceph_osdc_alloc_request(osdc, snapc, num_ops, use_mempool,
GFP_NOFS);
- if (!req)
- return ERR_PTR(-ENOMEM);
-
- req->r_flags = flags;
+ if (!req) {
+ r = -ENOMEM;
+ goto fail;
+ }
/* calculate max write size */
r = calc_layout(layout, off, plen, &objnum, &objoff, &objlen);
- if (r < 0) {
- ceph_osdc_put_request(req);
- return ERR_PTR(r);
- }
+ if (r)
+ goto fail;
if (opcode == CEPH_OSD_OP_CREATE || opcode == CEPH_OSD_OP_DELETE) {
osd_req_op_init(req, which, opcode, 0);
@@ -854,194 +947,71 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
truncate_size, truncate_seq);
}
+ req->r_flags = flags;
req->r_base_oloc.pool = ceph_file_layout_pg_pool(*layout);
+ ceph_oid_printf(&req->r_base_oid, "%llx.%08llx", vino.ino, objnum);
+
+ req->r_snapid = vino.snap;
+ if (flags & CEPH_OSD_FLAG_WRITE)
+ req->r_data_offset = off;
- snprintf(req->r_base_oid.name, sizeof(req->r_base_oid.name),
- "%llx.%08llx", vino.ino, objnum);
- req->r_base_oid.name_len = strlen(req->r_base_oid.name);
+ r = ceph_osdc_alloc_messages(req, GFP_NOFS);
+ if (r)
+ goto fail;
return req;
+
+fail:
+ ceph_osdc_put_request(req);
+ return ERR_PTR(r);
}
EXPORT_SYMBOL(ceph_osdc_new_request);
/*
* We keep osd requests in an rbtree, sorted by ->r_tid.
*/
-static void __insert_request(struct ceph_osd_client *osdc,
- struct ceph_osd_request *new)
-{
- struct rb_node **p = &osdc->requests.rb_node;
- struct rb_node *parent = NULL;
- struct ceph_osd_request *req = NULL;
-
- while (*p) {
- parent = *p;
- req = rb_entry(parent, struct ceph_osd_request, r_node);
- if (new->r_tid < req->r_tid)
- p = &(*p)->rb_left;
- else if (new->r_tid > req->r_tid)
- p = &(*p)->rb_right;
- else
- BUG();
- }
-
- rb_link_node(&new->r_node, parent, p);
- rb_insert_color(&new->r_node, &osdc->requests);
-}
+DEFINE_RB_FUNCS(request, struct ceph_osd_request, r_tid, r_node)
+DEFINE_RB_FUNCS(request_mc, struct ceph_osd_request, r_tid, r_mc_node)
-static struct ceph_osd_request *__lookup_request(struct ceph_osd_client *osdc,
- u64 tid)
+static bool osd_homeless(struct ceph_osd *osd)
{
- struct ceph_osd_request *req;
- struct rb_node *n = osdc->requests.rb_node;
-
- while (n) {
- req = rb_entry(n, struct ceph_osd_request, r_node);
- if (tid < req->r_tid)
- n = n->rb_left;
- else if (tid > req->r_tid)
- n = n->rb_right;
- else
- return req;
- }
- return NULL;
+ return osd->o_osd == CEPH_HOMELESS_OSD;
}
-static struct ceph_osd_request *
-__lookup_request_ge(struct ceph_osd_client *osdc,
- u64 tid)
+static bool osd_registered(struct ceph_osd *osd)
{
- struct ceph_osd_request *req;
- struct rb_node *n = osdc->requests.rb_node;
-
- while (n) {
- req = rb_entry(n, struct ceph_osd_request, r_node);
- if (tid < req->r_tid) {
- if (!n->rb_left)
- return req;
- n = n->rb_left;
- } else if (tid > req->r_tid) {
- n = n->rb_right;
- } else {
- return req;
- }
- }
- return NULL;
-}
-
-static void __kick_linger_request(struct ceph_osd_request *req)
-{
- struct ceph_osd_client *osdc = req->r_osdc;
- struct ceph_osd *osd = req->r_osd;
+ verify_osdc_locked(osd->o_osdc);
- /*
- * Linger requests need to be resent with a new tid to avoid
- * the dup op detection logic on the OSDs. Achieve this with
- * a re-register dance instead of open-coding.
- */
- ceph_osdc_get_request(req);
- if (!list_empty(&req->r_linger_item))
- __unregister_linger_request(osdc, req);
- else
- __unregister_request(osdc, req);
- __register_request(osdc, req);
- ceph_osdc_put_request(req);
-
- /*
- * Unless request has been registered as both normal and
- * lingering, __unregister{,_linger}_request clears r_osd.
- * However, here we need to preserve r_osd to make sure we
- * requeue on the same OSD.
- */
- WARN_ON(req->r_osd || !osd);
- req->r_osd = osd;
-
- dout("%s requeueing %p tid %llu\n", __func__, req, req->r_tid);
- __enqueue_request(req);
+ return !RB_EMPTY_NODE(&osd->o_node);
}
/*
- * Resubmit requests pending on the given osd.
+ * Assumes @osd is zero-initialized.
*/
-static void __kick_osd_requests(struct ceph_osd_client *osdc,
- struct ceph_osd *osd)
+static void osd_init(struct ceph_osd *osd)
{
- struct ceph_osd_request *req, *nreq;
- LIST_HEAD(resend);
- LIST_HEAD(resend_linger);
- int err;
-
- dout("%s osd%d\n", __func__, osd->o_osd);
- err = __reset_osd(osdc, osd);
- if (err)
- return;
-
- /*
- * Build up a list of requests to resend by traversing the
- * osd's list of requests. Requests for a given object are
- * sent in tid order, and that is also the order they're
- * kept on this list. Therefore all requests that are in
- * flight will be found first, followed by all requests that
- * have not yet been sent. And to resend requests while
- * preserving this order we will want to put any sent
- * requests back on the front of the osd client's unsent
- * list.
- *
- * So we build a separate ordered list of already-sent
- * requests for the affected osd and splice it onto the
- * front of the osd client's unsent list. Once we've seen a
- * request that has not yet been sent we're done. Those
- * requests are already sitting right where they belong.
- */
- list_for_each_entry(req, &osd->o_requests, r_osd_item) {
- if (!req->r_sent)
- break;
-
- if (!req->r_linger) {
- dout("%s requeueing %p tid %llu\n", __func__, req,
- req->r_tid);
- list_move_tail(&req->r_req_lru_item, &resend);
- req->r_flags |= CEPH_OSD_FLAG_RETRY;
- } else {
- list_move_tail(&req->r_req_lru_item, &resend_linger);
- }
- }
- list_splice(&resend, &osdc->req_unsent);
-
- /*
- * Both registered and not yet registered linger requests are
- * enqueued with a new tid on the same OSD. We add/move them
- * to req_unsent/o_requests at the end to keep things in tid
- * order.
- */
- list_for_each_entry_safe(req, nreq, &osd->o_linger_requests,
- r_linger_osd_item) {
- WARN_ON(!list_empty(&req->r_req_lru_item));
- __kick_linger_request(req);
- }
-
- list_for_each_entry_safe(req, nreq, &resend_linger, r_req_lru_item)
- __kick_linger_request(req);
+ atomic_set(&osd->o_ref, 1);
+ RB_CLEAR_NODE(&osd->o_node);
+ osd->o_requests = RB_ROOT;
+ osd->o_linger_requests = RB_ROOT;
+ INIT_LIST_HEAD(&osd->o_osd_lru);
+ INIT_LIST_HEAD(&osd->o_keepalive_item);
+ osd->o_incarnation = 1;
+ mutex_init(&osd->lock);
}
-/*
- * If the osd connection drops, we need to resubmit all requests.
- */
-static void osd_reset(struct ceph_connection *con)
+static void osd_cleanup(struct ceph_osd *osd)
{
- struct ceph_osd *osd = con->private;
- struct ceph_osd_client *osdc;
-
- if (!osd)
- return;
- dout("osd_reset osd%d\n", osd->o_osd);
- osdc = osd->o_osdc;
- down_read(&osdc->map_sem);
- mutex_lock(&osdc->request_mutex);
- __kick_osd_requests(osdc, osd);
- __send_queued(osdc);
- mutex_unlock(&osdc->request_mutex);
- up_read(&osdc->map_sem);
+ WARN_ON(!RB_EMPTY_NODE(&osd->o_node));
+ WARN_ON(!RB_EMPTY_ROOT(&osd->o_requests));
+ WARN_ON(!RB_EMPTY_ROOT(&osd->o_linger_requests));
+ WARN_ON(!list_empty(&osd->o_osd_lru));
+ WARN_ON(!list_empty(&osd->o_keepalive_item));
+
+ if (osd->o_auth.authorizer) {
+ WARN_ON(osd_homeless(osd));
+ ceph_auth_destroy_authorizer(osd->o_auth.authorizer);
+ }
}
/*
@@ -1051,22 +1021,15 @@ static struct ceph_osd *create_osd(struct ceph_osd_client *osdc, int onum)
{
struct ceph_osd *osd;
- osd = kzalloc(sizeof(*osd), GFP_NOFS);
- if (!osd)
- return NULL;
+ WARN_ON(onum == CEPH_HOMELESS_OSD);
- atomic_set(&osd->o_ref, 1);
+ osd = kzalloc(sizeof(*osd), GFP_NOIO | __GFP_NOFAIL);
+ osd_init(osd);
osd->o_osdc = osdc;
osd->o_osd = onum;
- RB_CLEAR_NODE(&osd->o_node);
- INIT_LIST_HEAD(&osd->o_requests);
- INIT_LIST_HEAD(&osd->o_linger_requests);
- INIT_LIST_HEAD(&osd->o_osd_lru);
- osd->o_incarnation = 1;
ceph_con_init(&osd->o_con, osd, &osd_con_ops, &osdc->client->msgr);
- INIT_LIST_HEAD(&osd->o_keepalive_item);
return osd;
}
@@ -1087,114 +1050,115 @@ static void put_osd(struct ceph_osd *osd)
dout("put_osd %p %d -> %d\n", osd, atomic_read(&osd->o_ref),
atomic_read(&osd->o_ref) - 1);
if (atomic_dec_and_test(&osd->o_ref)) {
- if (osd->o_auth.authorizer)
- ceph_auth_destroy_authorizer(osd->o_auth.authorizer);
+ osd_cleanup(osd);
kfree(osd);
}
}
-/*
- * remove an osd from our map
- */
-static void __remove_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)
-{
- dout("%s %p osd%d\n", __func__, osd, osd->o_osd);
- WARN_ON(!list_empty(&osd->o_requests));
- WARN_ON(!list_empty(&osd->o_linger_requests));
-
- list_del_init(&osd->o_osd_lru);
- rb_erase(&osd->o_node, &osdc->osds);
- RB_CLEAR_NODE(&osd->o_node);
-}
-
-static void remove_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)
-{
- dout("%s %p osd%d\n", __func__, osd, osd->o_osd);
-
- if (!RB_EMPTY_NODE(&osd->o_node)) {
- ceph_con_close(&osd->o_con);
- __remove_osd(osdc, osd);
- put_osd(osd);
- }
-}
+DEFINE_RB_FUNCS(osd, struct ceph_osd, o_osd, o_node)
-static void remove_all_osds(struct ceph_osd_client *osdc)
+static void __move_osd_to_lru(struct ceph_osd *osd)
{
- dout("%s %p\n", __func__, osdc);
- mutex_lock(&osdc->request_mutex);
- while (!RB_EMPTY_ROOT(&osdc->osds)) {
- struct ceph_osd *osd = rb_entry(rb_first(&osdc->osds),
- struct ceph_osd, o_node);
- remove_osd(osdc, osd);
- }
- mutex_unlock(&osdc->request_mutex);
-}
+ struct ceph_osd_client *osdc = osd->o_osdc;
-static void __move_osd_to_lru(struct ceph_osd_client *osdc,
- struct ceph_osd *osd)
-{
- dout("%s %p\n", __func__, osd);
+ dout("%s osd %p osd%d\n", __func__, osd, osd->o_osd);
BUG_ON(!list_empty(&osd->o_osd_lru));
+ spin_lock(&osdc->osd_lru_lock);
list_add_tail(&osd->o_osd_lru, &osdc->osd_lru);
+ spin_unlock(&osdc->osd_lru_lock);
+
osd->lru_ttl = jiffies + osdc->client->options->osd_idle_ttl;
}
-static void maybe_move_osd_to_lru(struct ceph_osd_client *osdc,
- struct ceph_osd *osd)
+static void maybe_move_osd_to_lru(struct ceph_osd *osd)
{
- dout("%s %p\n", __func__, osd);
-
- if (list_empty(&osd->o_requests) &&
- list_empty(&osd->o_linger_requests))
- __move_osd_to_lru(osdc, osd);
+ if (RB_EMPTY_ROOT(&osd->o_requests) &&
+ RB_EMPTY_ROOT(&osd->o_linger_requests))
+ __move_osd_to_lru(osd);
}
static void __remove_osd_from_lru(struct ceph_osd *osd)
{
- dout("__remove_osd_from_lru %p\n", osd);
+ struct ceph_osd_client *osdc = osd->o_osdc;
+
+ dout("%s osd %p osd%d\n", __func__, osd, osd->o_osd);
+
+ spin_lock(&osdc->osd_lru_lock);
if (!list_empty(&osd->o_osd_lru))
list_del_init(&osd->o_osd_lru);
+ spin_unlock(&osdc->osd_lru_lock);
}
-static void remove_old_osds(struct ceph_osd_client *osdc)
+/*
+ * Close the connection and assign any leftover requests to the
+ * homeless session.
+ */
+static void close_osd(struct ceph_osd *osd)
{
- struct ceph_osd *osd, *nosd;
+ struct ceph_osd_client *osdc = osd->o_osdc;
+ struct rb_node *n;
- dout("__remove_old_osds %p\n", osdc);
- mutex_lock(&osdc->request_mutex);
- list_for_each_entry_safe(osd, nosd, &osdc->osd_lru, o_osd_lru) {
- if (time_before(jiffies, osd->lru_ttl))
- break;
- remove_osd(osdc, osd);
+ verify_osdc_wrlocked(osdc);
+ dout("%s osd %p osd%d\n", __func__, osd, osd->o_osd);
+
+ ceph_con_close(&osd->o_con);
+
+ for (n = rb_first(&osd->o_requests); n; ) {
+ struct ceph_osd_request *req =
+ rb_entry(n, struct ceph_osd_request, r_node);
+
+ n = rb_next(n); /* unlink_request() */
+
+ dout(" reassigning req %p tid %llu\n", req, req->r_tid);
+ unlink_request(osd, req);
+ link_request(&osdc->homeless_osd, req);
+ }
+ for (n = rb_first(&osd->o_linger_requests); n; ) {
+ struct ceph_osd_linger_request *lreq =
+ rb_entry(n, struct ceph_osd_linger_request, node);
+
+ n = rb_next(n); /* unlink_linger() */
+
+ dout(" reassigning lreq %p linger_id %llu\n", lreq,
+ lreq->linger_id);
+ unlink_linger(osd, lreq);
+ link_linger(&osdc->homeless_osd, lreq);
}
- mutex_unlock(&osdc->request_mutex);
+
+ __remove_osd_from_lru(osd);
+ erase_osd(&osdc->osds, osd);
+ put_osd(osd);
}
/*
* reset osd connect
*/
-static int __reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)
+static int reopen_osd(struct ceph_osd *osd)
{
struct ceph_entity_addr *peer_addr;
- dout("__reset_osd %p osd%d\n", osd, osd->o_osd);
- if (list_empty(&osd->o_requests) &&
- list_empty(&osd->o_linger_requests)) {
- remove_osd(osdc, osd);
+ dout("%s osd %p osd%d\n", __func__, osd, osd->o_osd);
+
+ if (RB_EMPTY_ROOT(&osd->o_requests) &&
+ RB_EMPTY_ROOT(&osd->o_linger_requests)) {
+ close_osd(osd);
return -ENODEV;
}
- peer_addr = &osdc->osdmap->osd_addr[osd->o_osd];
+ peer_addr = &osd->o_osdc->osdmap->osd_addr[osd->o_osd];
if (!memcmp(peer_addr, &osd->o_con.peer_addr, sizeof (*peer_addr)) &&
!ceph_con_opened(&osd->o_con)) {
- struct ceph_osd_request *req;
+ struct rb_node *n;
dout("osd addr hasn't changed and connection never opened, "
"letting msgr retry\n");
/* touch each r_stamp for handle_timeout()'s benfit */
- list_for_each_entry(req, &osd->o_requests, r_osd_item)
+ for (n = rb_first(&osd->o_requests); n; n = rb_next(n)) {
+ struct ceph_osd_request *req =
+ rb_entry(n, struct ceph_osd_request, r_node);
req->r_stamp = jiffies;
+ }
return -EAGAIN;
}
@@ -1206,455 +1170,1369 @@ static int __reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)
return 0;
}
-static void __insert_osd(struct ceph_osd_client *osdc, struct ceph_osd *new)
+static struct ceph_osd *lookup_create_osd(struct ceph_osd_client *osdc, int o,
+ bool wrlocked)
{
- struct rb_node **p = &osdc->osds.rb_node;
- struct rb_node *parent = NULL;
- struct ceph_osd *osd = NULL;
+ struct ceph_osd *osd;
- dout("__insert_osd %p osd%d\n", new, new->o_osd);
- while (*p) {
- parent = *p;
- osd = rb_entry(parent, struct ceph_osd, o_node);
- if (new->o_osd < osd->o_osd)
- p = &(*p)->rb_left;
- else if (new->o_osd > osd->o_osd)
- p = &(*p)->rb_right;
- else
- BUG();
+ if (wrlocked)
+ verify_osdc_wrlocked(osdc);
+ else
+ verify_osdc_locked(osdc);
+
+ if (o != CEPH_HOMELESS_OSD)
+ osd = lookup_osd(&osdc->osds, o);
+ else
+ osd = &osdc->homeless_osd;
+ if (!osd) {
+ if (!wrlocked)
+ return ERR_PTR(-EAGAIN);
+
+ osd = create_osd(osdc, o);
+ insert_osd(&osdc->osds, osd);
+ ceph_con_open(&osd->o_con, CEPH_ENTITY_TYPE_OSD, osd->o_osd,
+ &osdc->osdmap->osd_addr[osd->o_osd]);
}
- rb_link_node(&new->o_node, parent, p);
- rb_insert_color(&new->o_node, &osdc->osds);
+ dout("%s osdc %p osd%d -> osd %p\n", __func__, osdc, o, osd);
+ return osd;
}
-static struct ceph_osd *__lookup_osd(struct ceph_osd_client *osdc, int o)
+/*
+ * Create request <-> OSD session relation.
+ *
+ * @req has to be assigned a tid, @osd may be homeless.
+ */
+static void link_request(struct ceph_osd *osd, struct ceph_osd_request *req)
{
- struct ceph_osd *osd;
- struct rb_node *n = osdc->osds.rb_node;
-
- while (n) {
- osd = rb_entry(n, struct ceph_osd, o_node);
- if (o < osd->o_osd)
- n = n->rb_left;
- else if (o > osd->o_osd)
- n = n->rb_right;
- else
- return osd;
- }
- return NULL;
+ verify_osd_locked(osd);
+ WARN_ON(!req->r_tid || req->r_osd);
+ dout("%s osd %p osd%d req %p tid %llu\n", __func__, osd, osd->o_osd,
+ req, req->r_tid);
+
+ if (!osd_homeless(osd))
+ __remove_osd_from_lru(osd);
+ else
+ atomic_inc(&osd->o_osdc->num_homeless);
+
+ get_osd(osd);
+ insert_request(&osd->o_requests, req);
+ req->r_osd = osd;
}
-static void __schedule_osd_timeout(struct ceph_osd_client *osdc)
+static void unlink_request(struct ceph_osd *osd, struct ceph_osd_request *req)
{
- schedule_delayed_work(&osdc->timeout_work,
- osdc->client->options->osd_keepalive_timeout);
+ verify_osd_locked(osd);
+ WARN_ON(req->r_osd != osd);
+ dout("%s osd %p osd%d req %p tid %llu\n", __func__, osd, osd->o_osd,
+ req, req->r_tid);
+
+ req->r_osd = NULL;
+ erase_request(&osd->o_requests, req);
+ put_osd(osd);
+
+ if (!osd_homeless(osd))
+ maybe_move_osd_to_lru(osd);
+ else
+ atomic_dec(&osd->o_osdc->num_homeless);
}
-static void __cancel_osd_timeout(struct ceph_osd_client *osdc)
+static bool __pool_full(struct ceph_pg_pool_info *pi)
{
- cancel_delayed_work(&osdc->timeout_work);
+ return pi->flags & CEPH_POOL_FLAG_FULL;
}
-/*
- * Register request, assign tid. If this is the first request, set up
- * the timeout event.
- */
-static void __register_request(struct ceph_osd_client *osdc,
- struct ceph_osd_request *req)
+static bool have_pool_full(struct ceph_osd_client *osdc)
{
- req->r_tid = ++osdc->last_tid;
- req->r_request->hdr.tid = cpu_to_le64(req->r_tid);
- dout("__register_request %p tid %lld\n", req, req->r_tid);
- __insert_request(osdc, req);
- ceph_osdc_get_request(req);
- osdc->num_requests++;
- if (osdc->num_requests == 1) {
- dout(" first request, scheduling timeout\n");
- __schedule_osd_timeout(osdc);
+ struct rb_node *n;
+
+ for (n = rb_first(&osdc->osdmap->pg_pools); n; n = rb_next(n)) {
+ struct ceph_pg_pool_info *pi =
+ rb_entry(n, struct ceph_pg_pool_info, node);
+
+ if (__pool_full(pi))
+ return true;
}
+
+ return false;
+}
+
+static bool pool_full(struct ceph_osd_client *osdc, s64 pool_id)
+{
+ struct ceph_pg_pool_info *pi;
+
+ pi = ceph_pg_pool_by_id(osdc->osdmap, pool_id);
+ if (!pi)
+ return false;
+
+ return __pool_full(pi);
}
/*
- * called under osdc->request_mutex
+ * Returns whether a request should be blocked from being sent
+ * based on the current osdmap and osd_client settings.
*/
-static void __unregister_request(struct ceph_osd_client *osdc,
- struct ceph_osd_request *req)
+static bool target_should_be_paused(struct ceph_osd_client *osdc,
+ const struct ceph_osd_request_target *t,
+ struct ceph_pg_pool_info *pi)
{
- if (RB_EMPTY_NODE(&req->r_node)) {
- dout("__unregister_request %p tid %lld not registered\n",
- req, req->r_tid);
- return;
+ bool pauserd = ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSERD);
+ bool pausewr = ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSEWR) ||
+ ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) ||
+ __pool_full(pi);
+
+ WARN_ON(pi->id != t->base_oloc.pool);
+ return (t->flags & CEPH_OSD_FLAG_READ && pauserd) ||
+ (t->flags & CEPH_OSD_FLAG_WRITE && pausewr);
+}
+
+enum calc_target_result {
+ CALC_TARGET_NO_ACTION = 0,
+ CALC_TARGET_NEED_RESEND,
+ CALC_TARGET_POOL_DNE,
+};
+
+static enum calc_target_result calc_target(struct ceph_osd_client *osdc,
+ struct ceph_osd_request_target *t,
+ u32 *last_force_resend,
+ bool any_change)
+{
+ struct ceph_pg_pool_info *pi;
+ struct ceph_pg pgid, last_pgid;
+ struct ceph_osds up, acting;
+ bool force_resend = false;
+ bool need_check_tiering = false;
+ bool need_resend = false;
+ bool sort_bitwise = ceph_osdmap_flag(osdc, CEPH_OSDMAP_SORTBITWISE);
+ enum calc_target_result ct_res;
+ int ret;
+
+ pi = ceph_pg_pool_by_id(osdc->osdmap, t->base_oloc.pool);
+ if (!pi) {
+ t->osd = CEPH_HOMELESS_OSD;
+ ct_res = CALC_TARGET_POOL_DNE;
+ goto out;
}
- dout("__unregister_request %p tid %lld\n", req, req->r_tid);
- rb_erase(&req->r_node, &osdc->requests);
- RB_CLEAR_NODE(&req->r_node);
- osdc->num_requests--;
+ if (osdc->osdmap->epoch == pi->last_force_request_resend) {
+ if (last_force_resend &&
+ *last_force_resend < pi->last_force_request_resend) {
+ *last_force_resend = pi->last_force_request_resend;
+ force_resend = true;
+ } else if (!last_force_resend) {
+ force_resend = true;
+ }
+ }
+ if (ceph_oid_empty(&t->target_oid) || force_resend) {
+ ceph_oid_copy(&t->target_oid, &t->base_oid);
+ need_check_tiering = true;
+ }
+ if (ceph_oloc_empty(&t->target_oloc) || force_resend) {
+ ceph_oloc_copy(&t->target_oloc, &t->base_oloc);
+ need_check_tiering = true;
+ }
- if (req->r_osd) {
- /* make sure the original request isn't in flight. */
- ceph_msg_revoke(req->r_request);
+ if (need_check_tiering &&
+ (t->flags & CEPH_OSD_FLAG_IGNORE_OVERLAY) == 0) {
+ if (t->flags & CEPH_OSD_FLAG_READ && pi->read_tier >= 0)
+ t->target_oloc.pool = pi->read_tier;
+ if (t->flags & CEPH_OSD_FLAG_WRITE && pi->write_tier >= 0)
+ t->target_oloc.pool = pi->write_tier;
+ }
- list_del_init(&req->r_osd_item);
- maybe_move_osd_to_lru(osdc, req->r_osd);
- if (list_empty(&req->r_linger_osd_item))
- req->r_osd = NULL;
+ ret = ceph_object_locator_to_pg(osdc->osdmap, &t->target_oid,
+ &t->target_oloc, &pgid);
+ if (ret) {
+ WARN_ON(ret != -ENOENT);
+ t->osd = CEPH_HOMELESS_OSD;
+ ct_res = CALC_TARGET_POOL_DNE;
+ goto out;
+ }
+ last_pgid.pool = pgid.pool;
+ last_pgid.seed = ceph_stable_mod(pgid.seed, t->pg_num, t->pg_num_mask);
+
+ ceph_pg_to_up_acting_osds(osdc->osdmap, &pgid, &up, &acting);
+ if (any_change &&
+ ceph_is_new_interval(&t->acting,
+ &acting,
+ &t->up,
+ &up,
+ t->size,
+ pi->size,
+ t->min_size,
+ pi->min_size,
+ t->pg_num,
+ pi->pg_num,
+ t->sort_bitwise,
+ sort_bitwise,
+ &last_pgid))
+ force_resend = true;
+
+ if (t->paused && !target_should_be_paused(osdc, t, pi)) {
+ t->paused = false;
+ need_resend = true;
}
- list_del_init(&req->r_req_lru_item);
- ceph_osdc_put_request(req);
+ if (ceph_pg_compare(&t->pgid, &pgid) ||
+ ceph_osds_changed(&t->acting, &acting, any_change) ||
+ force_resend) {
+ t->pgid = pgid; /* struct */
+ ceph_osds_copy(&t->acting, &acting);
+ ceph_osds_copy(&t->up, &up);
+ t->size = pi->size;
+ t->min_size = pi->min_size;
+ t->pg_num = pi->pg_num;
+ t->pg_num_mask = pi->pg_num_mask;
+ t->sort_bitwise = sort_bitwise;
+
+ t->osd = acting.primary;
+ need_resend = true;
+ }
+
+ ct_res = need_resend ? CALC_TARGET_NEED_RESEND : CALC_TARGET_NO_ACTION;
+out:
+ dout("%s t %p -> ct_res %d osd %d\n", __func__, t, ct_res, t->osd);
+ return ct_res;
+}
+
+static void setup_request_data(struct ceph_osd_request *req,
+ struct ceph_msg *msg)
+{
+ u32 data_len = 0;
+ int i;
+
+ if (!list_empty(&msg->data))
+ return;
+
+ WARN_ON(msg->data_length);
+ for (i = 0; i < req->r_num_ops; i++) {
+ struct ceph_osd_req_op *op = &req->r_ops[i];
+
+ switch (op->op) {
+ /* request */
+ case CEPH_OSD_OP_WRITE:
+ case CEPH_OSD_OP_WRITEFULL:
+ WARN_ON(op->indata_len != op->extent.length);
+ ceph_osdc_msg_data_add(msg, &op->extent.osd_data);
+ break;
+ case CEPH_OSD_OP_SETXATTR:
+ case CEPH_OSD_OP_CMPXATTR:
+ WARN_ON(op->indata_len != op->xattr.name_len +
+ op->xattr.value_len);
+ ceph_osdc_msg_data_add(msg, &op->xattr.osd_data);
+ break;
+ case CEPH_OSD_OP_NOTIFY_ACK:
+ ceph_osdc_msg_data_add(msg,
+ &op->notify_ack.request_data);
+ break;
+
+ /* reply */
+ case CEPH_OSD_OP_STAT:
+ ceph_osdc_msg_data_add(req->r_reply,
+ &op->raw_data_in);
+ break;
+ case CEPH_OSD_OP_READ:
+ ceph_osdc_msg_data_add(req->r_reply,
+ &op->extent.osd_data);
+ break;
+
+ /* both */
+ case CEPH_OSD_OP_CALL:
+ WARN_ON(op->indata_len != op->cls.class_len +
+ op->cls.method_len +
+ op->cls.indata_len);
+ ceph_osdc_msg_data_add(msg, &op->cls.request_info);
+ /* optional, can be NONE */
+ ceph_osdc_msg_data_add(msg, &op->cls.request_data);
+ /* optional, can be NONE */
+ ceph_osdc_msg_data_add(req->r_reply,
+ &op->cls.response_data);
+ break;
+ case CEPH_OSD_OP_NOTIFY:
+ ceph_osdc_msg_data_add(msg,
+ &op->notify.request_data);
+ ceph_osdc_msg_data_add(req->r_reply,
+ &op->notify.response_data);
+ break;
+ }
+
+ data_len += op->indata_len;
+ }
+
+ WARN_ON(data_len != msg->data_length);
+}
+
+static void encode_request(struct ceph_osd_request *req, struct ceph_msg *msg)
+{
+ void *p = msg->front.iov_base;
+ void *const end = p + msg->front_alloc_len;
+ u32 data_len = 0;
+ int i;
+
+ if (req->r_flags & CEPH_OSD_FLAG_WRITE) {
+ /* snapshots aren't writeable */
+ WARN_ON(req->r_snapid != CEPH_NOSNAP);
+ } else {
+ WARN_ON(req->r_mtime.tv_sec || req->r_mtime.tv_nsec ||
+ req->r_data_offset || req->r_snapc);
+ }
+
+ setup_request_data(req, msg);
+
+ ceph_encode_32(&p, 1); /* client_inc, always 1 */
+ ceph_encode_32(&p, req->r_osdc->osdmap->epoch);
+ ceph_encode_32(&p, req->r_flags);
+ ceph_encode_timespec(p, &req->r_mtime);
+ p += sizeof(struct ceph_timespec);
+ /* aka reassert_version */
+ memcpy(p, &req->r_replay_version, sizeof(req->r_replay_version));
+ p += sizeof(req->r_replay_version);
+
+ /* oloc */
+ ceph_encode_8(&p, 4);
+ ceph_encode_8(&p, 4);
+ ceph_encode_32(&p, 8 + 4 + 4);
+ ceph_encode_64(&p, req->r_t.target_oloc.pool);
+ ceph_encode_32(&p, -1); /* preferred */
+ ceph_encode_32(&p, 0); /* key len */
+
+ /* pgid */
+ ceph_encode_8(&p, 1);
+ ceph_encode_64(&p, req->r_t.pgid.pool);
+ ceph_encode_32(&p, req->r_t.pgid.seed);
+ ceph_encode_32(&p, -1); /* preferred */
+
+ /* oid */
+ ceph_encode_32(&p, req->r_t.target_oid.name_len);
+ memcpy(p, req->r_t.target_oid.name, req->r_t.target_oid.name_len);
+ p += req->r_t.target_oid.name_len;
- if (osdc->num_requests == 0) {
- dout(" no requests, canceling timeout\n");
- __cancel_osd_timeout(osdc);
+ /* ops, can imply data */
+ ceph_encode_16(&p, req->r_num_ops);
+ for (i = 0; i < req->r_num_ops; i++) {
+ data_len += osd_req_encode_op(p, &req->r_ops[i]);
+ p += sizeof(struct ceph_osd_op);
}
+
+ ceph_encode_64(&p, req->r_snapid); /* snapid */
+ if (req->r_snapc) {
+ ceph_encode_64(&p, req->r_snapc->seq);
+ ceph_encode_32(&p, req->r_snapc->num_snaps);
+ for (i = 0; i < req->r_snapc->num_snaps; i++)
+ ceph_encode_64(&p, req->r_snapc->snaps[i]);
+ } else {
+ ceph_encode_64(&p, 0); /* snap_seq */
+ ceph_encode_32(&p, 0); /* snaps len */
+ }
+
+ ceph_encode_32(&p, req->r_attempts); /* retry_attempt */
+
+ BUG_ON(p > end);
+ msg->front.iov_len = p - msg->front.iov_base;
+ msg->hdr.version = cpu_to_le16(4); /* MOSDOp v4 */
+ msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
+ msg->hdr.data_len = cpu_to_le32(data_len);
+ /*
+ * The header "data_off" is a hint to the receiver allowing it
+ * to align received data into its buffers such that there's no
+ * need to re-copy it before writing it to disk (direct I/O).
+ */
+ msg->hdr.data_off = cpu_to_le16(req->r_data_offset);
+
+ dout("%s req %p oid %s oid_len %d front %zu data %u\n", __func__,
+ req, req->r_t.target_oid.name, req->r_t.target_oid.name_len,
+ msg->front.iov_len, data_len);
}
/*
- * Cancel a previously queued request message
+ * @req has to be assigned a tid and registered.
*/
-static void __cancel_request(struct ceph_osd_request *req)
+static void send_request(struct ceph_osd_request *req)
{
- if (req->r_sent && req->r_osd) {
+ struct ceph_osd *osd = req->r_osd;
+
+ verify_osd_locked(osd);
+ WARN_ON(osd->o_osd != req->r_t.osd);
+
+ /*
+ * We may have a previously queued request message hanging
+ * around. Cancel it to avoid corrupting the msgr.
+ */
+ if (req->r_sent)
ceph_msg_revoke(req->r_request);
- req->r_sent = 0;
+
+ req->r_flags |= CEPH_OSD_FLAG_KNOWN_REDIR;
+ if (req->r_attempts)
+ req->r_flags |= CEPH_OSD_FLAG_RETRY;
+ else
+ WARN_ON(req->r_flags & CEPH_OSD_FLAG_RETRY);
+
+ encode_request(req, req->r_request);
+
+ dout("%s req %p tid %llu to pg %llu.%x osd%d flags 0x%x attempt %d\n",
+ __func__, req, req->r_tid, req->r_t.pgid.pool, req->r_t.pgid.seed,
+ req->r_t.osd, req->r_flags, req->r_attempts);
+
+ req->r_t.paused = false;
+ req->r_stamp = jiffies;
+ req->r_attempts++;
+
+ req->r_sent = osd->o_incarnation;
+ req->r_request->hdr.tid = cpu_to_le64(req->r_tid);
+ ceph_con_send(&osd->o_con, ceph_msg_get(req->r_request));
+}
+
+static void maybe_request_map(struct ceph_osd_client *osdc)
+{
+ bool continuous = false;
+
+ verify_osdc_locked(osdc);
+ WARN_ON(!osdc->osdmap->epoch);
+
+ if (ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) ||
+ ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSERD) ||
+ ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSEWR)) {
+ dout("%s osdc %p continuous\n", __func__, osdc);
+ continuous = true;
+ } else {
+ dout("%s osdc %p onetime\n", __func__, osdc);
}
+
+ if (ceph_monc_want_map(&osdc->client->monc, CEPH_SUB_OSDMAP,
+ osdc->osdmap->epoch + 1, continuous))
+ ceph_monc_renew_subs(&osdc->client->monc);
}
-static void __register_linger_request(struct ceph_osd_client *osdc,
- struct ceph_osd_request *req)
+static void send_map_check(struct ceph_osd_request *req);
+
+static void __submit_request(struct ceph_osd_request *req, bool wrlocked)
{
- dout("%s %p tid %llu\n", __func__, req, req->r_tid);
- WARN_ON(!req->r_linger);
+ struct ceph_osd_client *osdc = req->r_osdc;
+ struct ceph_osd *osd;
+ enum calc_target_result ct_res;
+ bool need_send = false;
+ bool promoted = false;
+
+ WARN_ON(req->r_tid || req->r_got_reply);
+ dout("%s req %p wrlocked %d\n", __func__, req, wrlocked);
+
+again:
+ ct_res = calc_target(osdc, &req->r_t, &req->r_last_force_resend, false);
+ if (ct_res == CALC_TARGET_POOL_DNE && !wrlocked)
+ goto promote;
+
+ osd = lookup_create_osd(osdc, req->r_t.osd, wrlocked);
+ if (IS_ERR(osd)) {
+ WARN_ON(PTR_ERR(osd) != -EAGAIN || wrlocked);
+ goto promote;
+ }
+
+ if ((req->r_flags & CEPH_OSD_FLAG_WRITE) &&
+ ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSEWR)) {
+ dout("req %p pausewr\n", req);
+ req->r_t.paused = true;
+ maybe_request_map(osdc);
+ } else if ((req->r_flags & CEPH_OSD_FLAG_READ) &&
+ ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSERD)) {
+ dout("req %p pauserd\n", req);
+ req->r_t.paused = true;
+ maybe_request_map(osdc);
+ } else if ((req->r_flags & CEPH_OSD_FLAG_WRITE) &&
+ !(req->r_flags & (CEPH_OSD_FLAG_FULL_TRY |
+ CEPH_OSD_FLAG_FULL_FORCE)) &&
+ (ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) ||
+ pool_full(osdc, req->r_t.base_oloc.pool))) {
+ dout("req %p full/pool_full\n", req);
+ pr_warn_ratelimited("FULL or reached pool quota\n");
+ req->r_t.paused = true;
+ maybe_request_map(osdc);
+ } else if (!osd_homeless(osd)) {
+ need_send = true;
+ } else {
+ maybe_request_map(osdc);
+ }
+
+ mutex_lock(&osd->lock);
+ /*
+ * Assign the tid atomically with send_request() to protect
+ * multiple writes to the same object from racing with each
+ * other, resulting in out of order ops on the OSDs.
+ */
+ req->r_tid = atomic64_inc_return(&osdc->last_tid);
+ link_request(osd, req);
+ if (need_send)
+ send_request(req);
+ mutex_unlock(&osd->lock);
+ if (ct_res == CALC_TARGET_POOL_DNE)
+ send_map_check(req);
+
+ if (promoted)
+ downgrade_write(&osdc->lock);
+ return;
+
+promote:
+ up_read(&osdc->lock);
+ down_write(&osdc->lock);
+ wrlocked = true;
+ promoted = true;
+ goto again;
+}
+
+static void account_request(struct ceph_osd_request *req)
+{
+ unsigned int mask = CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK;
+
+ if (req->r_flags & CEPH_OSD_FLAG_READ) {
+ WARN_ON(req->r_flags & mask);
+ req->r_flags |= CEPH_OSD_FLAG_ACK;
+ } else if (req->r_flags & CEPH_OSD_FLAG_WRITE)
+ WARN_ON(!(req->r_flags & mask));
+ else
+ WARN_ON(1);
+
+ WARN_ON(req->r_unsafe_callback && (req->r_flags & mask) != mask);
+ atomic_inc(&req->r_osdc->num_requests);
+}
+
+static void submit_request(struct ceph_osd_request *req, bool wrlocked)
+{
ceph_osdc_get_request(req);
- list_add_tail(&req->r_linger_item, &osdc->req_linger);
- if (req->r_osd)
- list_add_tail(&req->r_linger_osd_item,
- &req->r_osd->o_linger_requests);
+ account_request(req);
+ __submit_request(req, wrlocked);
}
-static void __unregister_linger_request(struct ceph_osd_client *osdc,
- struct ceph_osd_request *req)
+static void __finish_request(struct ceph_osd_request *req)
{
- WARN_ON(!req->r_linger);
+ struct ceph_osd_client *osdc = req->r_osdc;
+ struct ceph_osd *osd = req->r_osd;
- if (list_empty(&req->r_linger_item)) {
- dout("%s %p tid %llu not registered\n", __func__, req,
- req->r_tid);
+ verify_osd_locked(osd);
+ dout("%s req %p tid %llu\n", __func__, req, req->r_tid);
+
+ WARN_ON(lookup_request_mc(&osdc->map_checks, req->r_tid));
+ unlink_request(osd, req);
+ atomic_dec(&osdc->num_requests);
+
+ /*
+ * If an OSD has failed or returned and a request has been sent
+ * twice, it's possible to get a reply and end up here while the
+ * request message is queued for delivery. We will ignore the
+ * reply, so not a big deal, but better to try and catch it.
+ */
+ ceph_msg_revoke(req->r_request);
+ ceph_msg_revoke_incoming(req->r_reply);
+}
+
+static void finish_request(struct ceph_osd_request *req)
+{
+ __finish_request(req);
+ ceph_osdc_put_request(req);
+}
+
+static void __complete_request(struct ceph_osd_request *req)
+{
+ if (req->r_callback)
+ req->r_callback(req);
+ else
+ complete_all(&req->r_completion);
+}
+
+/*
+ * Note that this is open-coded in handle_reply(), which has to deal
+ * with ack vs commit, dup acks, etc.
+ */
+static void complete_request(struct ceph_osd_request *req, int err)
+{
+ dout("%s req %p tid %llu err %d\n", __func__, req, req->r_tid, err);
+
+ req->r_result = err;
+ __finish_request(req);
+ __complete_request(req);
+ complete_all(&req->r_safe_completion);
+ ceph_osdc_put_request(req);
+}
+
+static void cancel_map_check(struct ceph_osd_request *req)
+{
+ struct ceph_osd_client *osdc = req->r_osdc;
+ struct ceph_osd_request *lookup_req;
+
+ verify_osdc_wrlocked(osdc);
+
+ lookup_req = lookup_request_mc(&osdc->map_checks, req->r_tid);
+ if (!lookup_req)
return;
+
+ WARN_ON(lookup_req != req);
+ erase_request_mc(&osdc->map_checks, req);
+ ceph_osdc_put_request(req);
+}
+
+static void cancel_request(struct ceph_osd_request *req)
+{
+ dout("%s req %p tid %llu\n", __func__, req, req->r_tid);
+
+ cancel_map_check(req);
+ finish_request(req);
+}
+
+static void check_pool_dne(struct ceph_osd_request *req)
+{
+ struct ceph_osd_client *osdc = req->r_osdc;
+ struct ceph_osdmap *map = osdc->osdmap;
+
+ verify_osdc_wrlocked(osdc);
+ WARN_ON(!map->epoch);
+
+ if (req->r_attempts) {
+ /*
+ * We sent a request earlier, which means that
+ * previously the pool existed, and now it does not
+ * (i.e., it was deleted).
+ */
+ req->r_map_dne_bound = map->epoch;
+ dout("%s req %p tid %llu pool disappeared\n", __func__, req,
+ req->r_tid);
+ } else {
+ dout("%s req %p tid %llu map_dne_bound %u have %u\n", __func__,
+ req, req->r_tid, req->r_map_dne_bound, map->epoch);
+ }
+
+ if (req->r_map_dne_bound) {
+ if (map->epoch >= req->r_map_dne_bound) {
+ /* we had a new enough map */
+ pr_info_ratelimited("tid %llu pool does not exist\n",
+ req->r_tid);
+ complete_request(req, -ENOENT);
+ }
+ } else {
+ send_map_check(req);
}
+}
+
+static void map_check_cb(struct ceph_mon_generic_request *greq)
+{
+ struct ceph_osd_client *osdc = &greq->monc->client->osdc;
+ struct ceph_osd_request *req;
+ u64 tid = greq->private_data;
- dout("%s %p tid %llu\n", __func__, req, req->r_tid);
- list_del_init(&req->r_linger_item);
+ WARN_ON(greq->result || !greq->u.newest);
- if (req->r_osd) {
- list_del_init(&req->r_linger_osd_item);
- maybe_move_osd_to_lru(osdc, req->r_osd);
- if (list_empty(&req->r_osd_item))
- req->r_osd = NULL;
+ down_write(&osdc->lock);
+ req = lookup_request_mc(&osdc->map_checks, tid);
+ if (!req) {
+ dout("%s tid %llu dne\n", __func__, tid);
+ goto out_unlock;
}
+
+ dout("%s req %p tid %llu map_dne_bound %u newest %llu\n", __func__,
+ req, req->r_tid, req->r_map_dne_bound, greq->u.newest);
+ if (!req->r_map_dne_bound)
+ req->r_map_dne_bound = greq->u.newest;
+ erase_request_mc(&osdc->map_checks, req);
+ check_pool_dne(req);
+
ceph_osdc_put_request(req);
+out_unlock:
+ up_write(&osdc->lock);
}
-void ceph_osdc_set_request_linger(struct ceph_osd_client *osdc,
- struct ceph_osd_request *req)
+static void send_map_check(struct ceph_osd_request *req)
{
- if (!req->r_linger) {
- dout("set_request_linger %p\n", req);
- req->r_linger = 1;
+ struct ceph_osd_client *osdc = req->r_osdc;
+ struct ceph_osd_request *lookup_req;
+ int ret;
+
+ verify_osdc_wrlocked(osdc);
+
+ lookup_req = lookup_request_mc(&osdc->map_checks, req->r_tid);
+ if (lookup_req) {
+ WARN_ON(lookup_req != req);
+ return;
}
+
+ ceph_osdc_get_request(req);
+ insert_request_mc(&osdc->map_checks, req);
+ ret = ceph_monc_get_version_async(&osdc->client->monc, "osdmap",
+ map_check_cb, req->r_tid);
+ WARN_ON(ret);
}
-EXPORT_SYMBOL(ceph_osdc_set_request_linger);
/*
- * Returns whether a request should be blocked from being sent
- * based on the current osdmap and osd_client settings.
- *
- * Caller should hold map_sem for read.
+ * lingering requests, watch/notify v2 infrastructure
*/
-static bool __req_should_be_paused(struct ceph_osd_client *osdc,
- struct ceph_osd_request *req)
+static void linger_release(struct kref *kref)
+{
+ struct ceph_osd_linger_request *lreq =
+ container_of(kref, struct ceph_osd_linger_request, kref);
+
+ dout("%s lreq %p reg_req %p ping_req %p\n", __func__, lreq,
+ lreq->reg_req, lreq->ping_req);
+ WARN_ON(!RB_EMPTY_NODE(&lreq->node));
+ WARN_ON(!RB_EMPTY_NODE(&lreq->osdc_node));
+ WARN_ON(!RB_EMPTY_NODE(&lreq->mc_node));
+ WARN_ON(!list_empty(&lreq->scan_item));
+ WARN_ON(!list_empty(&lreq->pending_lworks));
+ WARN_ON(lreq->osd);
+
+ if (lreq->reg_req)
+ ceph_osdc_put_request(lreq->reg_req);
+ if (lreq->ping_req)
+ ceph_osdc_put_request(lreq->ping_req);
+ target_destroy(&lreq->t);
+ kfree(lreq);
+}
+
+static void linger_put(struct ceph_osd_linger_request *lreq)
{
- bool pauserd = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSERD);
- bool pausewr = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSEWR) ||
- ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL);
- return (req->r_flags & CEPH_OSD_FLAG_READ && pauserd) ||
- (req->r_flags & CEPH_OSD_FLAG_WRITE && pausewr);
+ if (lreq)
+ kref_put(&lreq->kref, linger_release);
+}
+
+static struct ceph_osd_linger_request *
+linger_get(struct ceph_osd_linger_request *lreq)
+{
+ kref_get(&lreq->kref);
+ return lreq;
+}
+
+static struct ceph_osd_linger_request *
+linger_alloc(struct ceph_osd_client *osdc)
+{
+ struct ceph_osd_linger_request *lreq;
+
+ lreq = kzalloc(sizeof(*lreq), GFP_NOIO);
+ if (!lreq)
+ return NULL;
+
+ kref_init(&lreq->kref);
+ mutex_init(&lreq->lock);
+ RB_CLEAR_NODE(&lreq->node);
+ RB_CLEAR_NODE(&lreq->osdc_node);
+ RB_CLEAR_NODE(&lreq->mc_node);
+ INIT_LIST_HEAD(&lreq->scan_item);
+ INIT_LIST_HEAD(&lreq->pending_lworks);
+ init_completion(&lreq->reg_commit_wait);
+ init_completion(&lreq->notify_finish_wait);
+
+ lreq->osdc = osdc;
+ target_init(&lreq->t);
+
+ dout("%s lreq %p\n", __func__, lreq);
+ return lreq;
}
+DEFINE_RB_INSDEL_FUNCS(linger, struct ceph_osd_linger_request, linger_id, node)
+DEFINE_RB_FUNCS(linger_osdc, struct ceph_osd_linger_request, linger_id, osdc_node)
+DEFINE_RB_FUNCS(linger_mc, struct ceph_osd_linger_request, linger_id, mc_node)
+
/*
- * Calculate mapping of a request to a PG. Takes tiering into account.
+ * Create linger request <-> OSD session relation.
+ *
+ * @lreq has to be registered, @osd may be homeless.
*/
-static int __calc_request_pg(struct ceph_osdmap *osdmap,
- struct ceph_osd_request *req,
- struct ceph_pg *pg_out)
+static void link_linger(struct ceph_osd *osd,
+ struct ceph_osd_linger_request *lreq)
{
- bool need_check_tiering;
+ verify_osd_locked(osd);
+ WARN_ON(!lreq->linger_id || lreq->osd);
+ dout("%s osd %p osd%d lreq %p linger_id %llu\n", __func__, osd,
+ osd->o_osd, lreq, lreq->linger_id);
- need_check_tiering = false;
- if (req->r_target_oloc.pool == -1) {
- req->r_target_oloc = req->r_base_oloc; /* struct */
- need_check_tiering = true;
+ if (!osd_homeless(osd))
+ __remove_osd_from_lru(osd);
+ else
+ atomic_inc(&osd->o_osdc->num_homeless);
+
+ get_osd(osd);
+ insert_linger(&osd->o_linger_requests, lreq);
+ lreq->osd = osd;
+}
+
+static void unlink_linger(struct ceph_osd *osd,
+ struct ceph_osd_linger_request *lreq)
+{
+ verify_osd_locked(osd);
+ WARN_ON(lreq->osd != osd);
+ dout("%s osd %p osd%d lreq %p linger_id %llu\n", __func__, osd,
+ osd->o_osd, lreq, lreq->linger_id);
+
+ lreq->osd = NULL;
+ erase_linger(&osd->o_linger_requests, lreq);
+ put_osd(osd);
+
+ if (!osd_homeless(osd))
+ maybe_move_osd_to_lru(osd);
+ else
+ atomic_dec(&osd->o_osdc->num_homeless);
+}
+
+static bool __linger_registered(struct ceph_osd_linger_request *lreq)
+{
+ verify_osdc_locked(lreq->osdc);
+
+ return !RB_EMPTY_NODE(&lreq->osdc_node);
+}
+
+static bool linger_registered(struct ceph_osd_linger_request *lreq)
+{
+ struct ceph_osd_client *osdc = lreq->osdc;
+ bool registered;
+
+ down_read(&osdc->lock);
+ registered = __linger_registered(lreq);
+ up_read(&osdc->lock);
+
+ return registered;
+}
+
+static void linger_register(struct ceph_osd_linger_request *lreq)
+{
+ struct ceph_osd_client *osdc = lreq->osdc;
+
+ verify_osdc_wrlocked(osdc);
+ WARN_ON(lreq->linger_id);
+
+ linger_get(lreq);
+ lreq->linger_id = ++osdc->last_linger_id;
+ insert_linger_osdc(&osdc->linger_requests, lreq);
+}
+
+static void linger_unregister(struct ceph_osd_linger_request *lreq)
+{
+ struct ceph_osd_client *osdc = lreq->osdc;
+
+ verify_osdc_wrlocked(osdc);
+
+ erase_linger_osdc(&osdc->linger_requests, lreq);
+ linger_put(lreq);
+}
+
+static void cancel_linger_request(struct ceph_osd_request *req)
+{
+ struct ceph_osd_linger_request *lreq = req->r_priv;
+
+ WARN_ON(!req->r_linger);
+ cancel_request(req);
+ linger_put(lreq);
+}
+
+struct linger_work {
+ struct work_struct work;
+ struct ceph_osd_linger_request *lreq;
+ struct list_head pending_item;
+ unsigned long queued_stamp;
+
+ union {
+ struct {
+ u64 notify_id;
+ u64 notifier_id;
+ void *payload; /* points into @msg front */
+ size_t payload_len;
+
+ struct ceph_msg *msg; /* for ceph_msg_put() */
+ } notify;
+ struct {
+ int err;
+ } error;
+ };
+};
+
+static struct linger_work *lwork_alloc(struct ceph_osd_linger_request *lreq,
+ work_func_t workfn)
+{
+ struct linger_work *lwork;
+
+ lwork = kzalloc(sizeof(*lwork), GFP_NOIO);
+ if (!lwork)
+ return NULL;
+
+ INIT_WORK(&lwork->work, workfn);
+ INIT_LIST_HEAD(&lwork->pending_item);
+ lwork->lreq = linger_get(lreq);
+
+ return lwork;
+}
+
+static void lwork_free(struct linger_work *lwork)
+{
+ struct ceph_osd_linger_request *lreq = lwork->lreq;
+
+ mutex_lock(&lreq->lock);
+ list_del(&lwork->pending_item);
+ mutex_unlock(&lreq->lock);
+
+ linger_put(lreq);
+ kfree(lwork);
+}
+
+static void lwork_queue(struct linger_work *lwork)
+{
+ struct ceph_osd_linger_request *lreq = lwork->lreq;
+ struct ceph_osd_client *osdc = lreq->osdc;
+
+ verify_lreq_locked(lreq);
+ WARN_ON(!list_empty(&lwork->pending_item));
+
+ lwork->queued_stamp = jiffies;
+ list_add_tail(&lwork->pending_item, &lreq->pending_lworks);
+ queue_work(osdc->notify_wq, &lwork->work);
+}
+
+static void do_watch_notify(struct work_struct *w)
+{
+ struct linger_work *lwork = container_of(w, struct linger_work, work);
+ struct ceph_osd_linger_request *lreq = lwork->lreq;
+
+ if (!linger_registered(lreq)) {
+ dout("%s lreq %p not registered\n", __func__, lreq);
+ goto out;
}
- if (req->r_target_oid.name_len == 0) {
- ceph_oid_copy(&req->r_target_oid, &req->r_base_oid);
- need_check_tiering = true;
+
+ WARN_ON(!lreq->is_watch);
+ dout("%s lreq %p notify_id %llu notifier_id %llu payload_len %zu\n",
+ __func__, lreq, lwork->notify.notify_id, lwork->notify.notifier_id,
+ lwork->notify.payload_len);
+ lreq->wcb(lreq->data, lwork->notify.notify_id, lreq->linger_id,
+ lwork->notify.notifier_id, lwork->notify.payload,
+ lwork->notify.payload_len);
+
+out:
+ ceph_msg_put(lwork->notify.msg);
+ lwork_free(lwork);
+}
+
+static void do_watch_error(struct work_struct *w)
+{
+ struct linger_work *lwork = container_of(w, struct linger_work, work);
+ struct ceph_osd_linger_request *lreq = lwork->lreq;
+
+ if (!linger_registered(lreq)) {
+ dout("%s lreq %p not registered\n", __func__, lreq);
+ goto out;
}
- if (need_check_tiering &&
- (req->r_flags & CEPH_OSD_FLAG_IGNORE_OVERLAY) == 0) {
- struct ceph_pg_pool_info *pi;
-
- pi = ceph_pg_pool_by_id(osdmap, req->r_target_oloc.pool);
- if (pi) {
- if ((req->r_flags & CEPH_OSD_FLAG_READ) &&
- pi->read_tier >= 0)
- req->r_target_oloc.pool = pi->read_tier;
- if ((req->r_flags & CEPH_OSD_FLAG_WRITE) &&
- pi->write_tier >= 0)
- req->r_target_oloc.pool = pi->write_tier;
+ dout("%s lreq %p err %d\n", __func__, lreq, lwork->error.err);
+ lreq->errcb(lreq->data, lreq->linger_id, lwork->error.err);
+
+out:
+ lwork_free(lwork);
+}
+
+static void queue_watch_error(struct ceph_osd_linger_request *lreq)
+{
+ struct linger_work *lwork;
+
+ lwork = lwork_alloc(lreq, do_watch_error);
+ if (!lwork) {
+ pr_err("failed to allocate error-lwork\n");
+ return;
+ }
+
+ lwork->error.err = lreq->last_error;
+ lwork_queue(lwork);
+}
+
+static void linger_reg_commit_complete(struct ceph_osd_linger_request *lreq,
+ int result)
+{
+ if (!completion_done(&lreq->reg_commit_wait)) {
+ lreq->reg_commit_error = (result <= 0 ? result : 0);
+ complete_all(&lreq->reg_commit_wait);
+ }
+}
+
+static void linger_commit_cb(struct ceph_osd_request *req)
+{
+ struct ceph_osd_linger_request *lreq = req->r_priv;
+
+ mutex_lock(&lreq->lock);
+ dout("%s lreq %p linger_id %llu result %d\n", __func__, lreq,
+ lreq->linger_id, req->r_result);
+ WARN_ON(!__linger_registered(lreq));
+ linger_reg_commit_complete(lreq, req->r_result);
+ lreq->committed = true;
+
+ if (!lreq->is_watch) {
+ struct ceph_osd_data *osd_data =
+ osd_req_op_data(req, 0, notify, response_data);
+ void *p = page_address(osd_data->pages[0]);
+
+ WARN_ON(req->r_ops[0].op != CEPH_OSD_OP_NOTIFY ||
+ osd_data->type != CEPH_OSD_DATA_TYPE_PAGES);
+
+ /* make note of the notify_id */
+ if (req->r_ops[0].outdata_len >= sizeof(u64)) {
+ lreq->notify_id = ceph_decode_64(&p);
+ dout("lreq %p notify_id %llu\n", lreq,
+ lreq->notify_id);
+ } else {
+ dout("lreq %p no notify_id\n", lreq);
}
- /* !pi is caught in ceph_oloc_oid_to_pg() */
}
- return ceph_oloc_oid_to_pg(osdmap, &req->r_target_oloc,
- &req->r_target_oid, pg_out);
+ mutex_unlock(&lreq->lock);
+ linger_put(lreq);
}
-static void __enqueue_request(struct ceph_osd_request *req)
+static int normalize_watch_error(int err)
{
- struct ceph_osd_client *osdc = req->r_osdc;
+ /*
+ * Translate ENOENT -> ENOTCONN so that a delete->disconnection
+ * notification and a failure to reconnect because we raced with
+ * the delete appear the same to the user.
+ */
+ if (err == -ENOENT)
+ err = -ENOTCONN;
+
+ return err;
+}
+
+static void linger_reconnect_cb(struct ceph_osd_request *req)
+{
+ struct ceph_osd_linger_request *lreq = req->r_priv;
+
+ mutex_lock(&lreq->lock);
+ dout("%s lreq %p linger_id %llu result %d last_error %d\n", __func__,
+ lreq, lreq->linger_id, req->r_result, lreq->last_error);
+ if (req->r_result < 0) {
+ if (!lreq->last_error) {
+ lreq->last_error = normalize_watch_error(req->r_result);
+ queue_watch_error(lreq);
+ }
+ }
+
+ mutex_unlock(&lreq->lock);
+ linger_put(lreq);
+}
+
+static void send_linger(struct ceph_osd_linger_request *lreq)
+{
+ struct ceph_osd_request *req = lreq->reg_req;
+ struct ceph_osd_req_op *op = &req->r_ops[0];
- dout("%s %p tid %llu to osd%d\n", __func__, req, req->r_tid,
- req->r_osd ? req->r_osd->o_osd : -1);
+ verify_osdc_wrlocked(req->r_osdc);
+ dout("%s lreq %p linger_id %llu\n", __func__, lreq, lreq->linger_id);
- if (req->r_osd) {
- __remove_osd_from_lru(req->r_osd);
- list_add_tail(&req->r_osd_item, &req->r_osd->o_requests);
- list_move_tail(&req->r_req_lru_item, &osdc->req_unsent);
+ if (req->r_osd)
+ cancel_linger_request(req);
+
+ request_reinit(req);
+ ceph_oid_copy(&req->r_base_oid, &lreq->t.base_oid);
+ ceph_oloc_copy(&req->r_base_oloc, &lreq->t.base_oloc);
+ req->r_flags = lreq->t.flags;
+ req->r_mtime = lreq->mtime;
+
+ mutex_lock(&lreq->lock);
+ if (lreq->is_watch && lreq->committed) {
+ WARN_ON(op->op != CEPH_OSD_OP_WATCH ||
+ op->watch.cookie != lreq->linger_id);
+ op->watch.op = CEPH_OSD_WATCH_OP_RECONNECT;
+ op->watch.gen = ++lreq->register_gen;
+ dout("lreq %p reconnect register_gen %u\n", lreq,
+ op->watch.gen);
+ req->r_callback = linger_reconnect_cb;
} else {
- list_move_tail(&req->r_req_lru_item, &osdc->req_notarget);
+ if (!lreq->is_watch)
+ lreq->notify_id = 0;
+ else
+ WARN_ON(op->watch.op != CEPH_OSD_WATCH_OP_WATCH);
+ dout("lreq %p register\n", lreq);
+ req->r_callback = linger_commit_cb;
}
+ mutex_unlock(&lreq->lock);
+
+ req->r_priv = linger_get(lreq);
+ req->r_linger = true;
+
+ submit_request(req, true);
}
-/*
- * Pick an osd (the first 'up' osd in the pg), allocate the osd struct
- * (as needed), and set the request r_osd appropriately. If there is
- * no up osd, set r_osd to NULL. Move the request to the appropriate list
- * (unsent, homeless) or leave on in-flight lru.
- *
- * Return 0 if unchanged, 1 if changed, or negative on error.
- *
- * Caller should hold map_sem for read and request_mutex.
- */
-static int __map_request(struct ceph_osd_client *osdc,
- struct ceph_osd_request *req, int force_resend)
+static void linger_ping_cb(struct ceph_osd_request *req)
{
- struct ceph_pg pgid;
- int acting[CEPH_PG_MAX_SIZE];
- int num, o;
- int err;
- bool was_paused;
-
- dout("map_request %p tid %lld\n", req, req->r_tid);
-
- err = __calc_request_pg(osdc->osdmap, req, &pgid);
- if (err) {
- list_move(&req->r_req_lru_item, &osdc->req_notarget);
- return err;
- }
- req->r_pgid = pgid;
-
- num = ceph_calc_pg_acting(osdc->osdmap, pgid, acting, &o);
- if (num < 0)
- num = 0;
-
- was_paused = req->r_paused;
- req->r_paused = __req_should_be_paused(osdc, req);
- if (was_paused && !req->r_paused)
- force_resend = 1;
-
- if ((!force_resend &&
- req->r_osd && req->r_osd->o_osd == o &&
- req->r_sent >= req->r_osd->o_incarnation &&
- req->r_num_pg_osds == num &&
- memcmp(req->r_pg_osds, acting, sizeof(acting[0])*num) == 0) ||
- (req->r_osd == NULL && o == -1) ||
- req->r_paused)
- return 0; /* no change */
-
- dout("map_request tid %llu pgid %lld.%x osd%d (was osd%d)\n",
- req->r_tid, pgid.pool, pgid.seed, o,
- req->r_osd ? req->r_osd->o_osd : -1);
-
- /* record full pg acting set */
- memcpy(req->r_pg_osds, acting, sizeof(acting[0]) * num);
- req->r_num_pg_osds = num;
-
- if (req->r_osd) {
- __cancel_request(req);
- list_del_init(&req->r_osd_item);
- list_del_init(&req->r_linger_osd_item);
- req->r_osd = NULL;
- }
-
- req->r_osd = __lookup_osd(osdc, o);
- if (!req->r_osd && o >= 0) {
- err = -ENOMEM;
- req->r_osd = create_osd(osdc, o);
- if (!req->r_osd) {
- list_move(&req->r_req_lru_item, &osdc->req_notarget);
- goto out;
+ struct ceph_osd_linger_request *lreq = req->r_priv;
+
+ mutex_lock(&lreq->lock);
+ dout("%s lreq %p linger_id %llu result %d ping_sent %lu last_error %d\n",
+ __func__, lreq, lreq->linger_id, req->r_result, lreq->ping_sent,
+ lreq->last_error);
+ if (lreq->register_gen == req->r_ops[0].watch.gen) {
+ if (!req->r_result) {
+ lreq->watch_valid_thru = lreq->ping_sent;
+ } else if (!lreq->last_error) {
+ lreq->last_error = normalize_watch_error(req->r_result);
+ queue_watch_error(lreq);
}
+ } else {
+ dout("lreq %p register_gen %u ignoring old pong %u\n", lreq,
+ lreq->register_gen, req->r_ops[0].watch.gen);
+ }
+
+ mutex_unlock(&lreq->lock);
+ linger_put(lreq);
+}
- dout("map_request osd %p is osd%d\n", req->r_osd, o);
- __insert_osd(osdc, req->r_osd);
+static void send_linger_ping(struct ceph_osd_linger_request *lreq)
+{
+ struct ceph_osd_client *osdc = lreq->osdc;
+ struct ceph_osd_request *req = lreq->ping_req;
+ struct ceph_osd_req_op *op = &req->r_ops[0];
- ceph_con_open(&req->r_osd->o_con,
- CEPH_ENTITY_TYPE_OSD, o,
- &osdc->osdmap->osd_addr[o]);
+ if (ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSERD)) {
+ dout("%s PAUSERD\n", __func__);
+ return;
}
- __enqueue_request(req);
- err = 1; /* osd or pg changed */
+ lreq->ping_sent = jiffies;
+ dout("%s lreq %p linger_id %llu ping_sent %lu register_gen %u\n",
+ __func__, lreq, lreq->linger_id, lreq->ping_sent,
+ lreq->register_gen);
-out:
- return err;
+ if (req->r_osd)
+ cancel_linger_request(req);
+
+ request_reinit(req);
+ target_copy(&req->r_t, &lreq->t);
+
+ WARN_ON(op->op != CEPH_OSD_OP_WATCH ||
+ op->watch.cookie != lreq->linger_id ||
+ op->watch.op != CEPH_OSD_WATCH_OP_PING);
+ op->watch.gen = lreq->register_gen;
+ req->r_callback = linger_ping_cb;
+ req->r_priv = linger_get(lreq);
+ req->r_linger = true;
+
+ ceph_osdc_get_request(req);
+ account_request(req);
+ req->r_tid = atomic64_inc_return(&osdc->last_tid);
+ link_request(lreq->osd, req);
+ send_request(req);
}
-/*
- * caller should hold map_sem (for read) and request_mutex
- */
-static void __send_request(struct ceph_osd_client *osdc,
- struct ceph_osd_request *req)
+static void linger_submit(struct ceph_osd_linger_request *lreq)
{
- void *p;
+ struct ceph_osd_client *osdc = lreq->osdc;
+ struct ceph_osd *osd;
- dout("send_request %p tid %llu to osd%d flags %d pg %lld.%x\n",
- req, req->r_tid, req->r_osd->o_osd, req->r_flags,
- (unsigned long long)req->r_pgid.pool, req->r_pgid.seed);
+ calc_target(osdc, &lreq->t, &lreq->last_force_resend, false);
+ osd = lookup_create_osd(osdc, lreq->t.osd, true);
+ link_linger(osd, lreq);
- /* fill in message content that changes each time we send it */
- put_unaligned_le32(osdc->osdmap->epoch, req->r_request_osdmap_epoch);
- put_unaligned_le32(req->r_flags, req->r_request_flags);
- put_unaligned_le64(req->r_target_oloc.pool, req->r_request_pool);
- p = req->r_request_pgid;
- ceph_encode_64(&p, req->r_pgid.pool);
- ceph_encode_32(&p, req->r_pgid.seed);
- put_unaligned_le64(1, req->r_request_attempts); /* FIXME */
- memcpy(req->r_request_reassert_version, &req->r_reassert_version,
- sizeof(req->r_reassert_version));
+ send_linger(lreq);
+}
- req->r_stamp = jiffies;
- list_move_tail(&req->r_req_lru_item, &osdc->req_lru);
+static void cancel_linger_map_check(struct ceph_osd_linger_request *lreq)
+{
+ struct ceph_osd_client *osdc = lreq->osdc;
+ struct ceph_osd_linger_request *lookup_lreq;
- ceph_msg_get(req->r_request); /* send consumes a ref */
+ verify_osdc_wrlocked(osdc);
- req->r_sent = req->r_osd->o_incarnation;
+ lookup_lreq = lookup_linger_mc(&osdc->linger_map_checks,
+ lreq->linger_id);
+ if (!lookup_lreq)
+ return;
- ceph_con_send(&req->r_osd->o_con, req->r_request);
+ WARN_ON(lookup_lreq != lreq);
+ erase_linger_mc(&osdc->linger_map_checks, lreq);
+ linger_put(lreq);
}
/*
- * Send any requests in the queue (req_unsent).
+ * @lreq has to be both registered and linked.
*/
-static void __send_queued(struct ceph_osd_client *osdc)
+static void __linger_cancel(struct ceph_osd_linger_request *lreq)
{
- struct ceph_osd_request *req, *tmp;
+ if (lreq->is_watch && lreq->ping_req->r_osd)
+ cancel_linger_request(lreq->ping_req);
+ if (lreq->reg_req->r_osd)
+ cancel_linger_request(lreq->reg_req);
+ cancel_linger_map_check(lreq);
+ unlink_linger(lreq->osd, lreq);
+ linger_unregister(lreq);
+}
+
+static void linger_cancel(struct ceph_osd_linger_request *lreq)
+{
+ struct ceph_osd_client *osdc = lreq->osdc;
- dout("__send_queued\n");
- list_for_each_entry_safe(req, tmp, &osdc->req_unsent, r_req_lru_item)
- __send_request(osdc, req);
+ down_write(&osdc->lock);
+ if (__linger_registered(lreq))
+ __linger_cancel(lreq);
+ up_write(&osdc->lock);
}
-/*
- * Caller should hold map_sem for read and request_mutex.
- */
-static int __ceph_osdc_start_request(struct ceph_osd_client *osdc,
- struct ceph_osd_request *req,
- bool nofail)
-{
- int rc;
-
- __register_request(osdc, req);
- req->r_sent = 0;
- req->r_got_reply = 0;
- rc = __map_request(osdc, req, 0);
- if (rc < 0) {
- if (nofail) {
- dout("osdc_start_request failed map, "
- " will retry %lld\n", req->r_tid);
- rc = 0;
- } else {
- __unregister_request(osdc, req);
- }
- return rc;
+static void send_linger_map_check(struct ceph_osd_linger_request *lreq);
+
+static void check_linger_pool_dne(struct ceph_osd_linger_request *lreq)
+{
+ struct ceph_osd_client *osdc = lreq->osdc;
+ struct ceph_osdmap *map = osdc->osdmap;
+
+ verify_osdc_wrlocked(osdc);
+ WARN_ON(!map->epoch);
+
+ if (lreq->register_gen) {
+ lreq->map_dne_bound = map->epoch;
+ dout("%s lreq %p linger_id %llu pool disappeared\n", __func__,
+ lreq, lreq->linger_id);
+ } else {
+ dout("%s lreq %p linger_id %llu map_dne_bound %u have %u\n",
+ __func__, lreq, lreq->linger_id, lreq->map_dne_bound,
+ map->epoch);
}
- if (req->r_osd == NULL) {
- dout("send_request %p no up osds in pg\n", req);
- ceph_monc_request_next_osdmap(&osdc->client->monc);
+ if (lreq->map_dne_bound) {
+ if (map->epoch >= lreq->map_dne_bound) {
+ /* we had a new enough map */
+ pr_info("linger_id %llu pool does not exist\n",
+ lreq->linger_id);
+ linger_reg_commit_complete(lreq, -ENOENT);
+ __linger_cancel(lreq);
+ }
} else {
- __send_queued(osdc);
+ send_linger_map_check(lreq);
}
+}
- return 0;
+static void linger_map_check_cb(struct ceph_mon_generic_request *greq)
+{
+ struct ceph_osd_client *osdc = &greq->monc->client->osdc;
+ struct ceph_osd_linger_request *lreq;
+ u64 linger_id = greq->private_data;
+
+ WARN_ON(greq->result || !greq->u.newest);
+
+ down_write(&osdc->lock);
+ lreq = lookup_linger_mc(&osdc->linger_map_checks, linger_id);
+ if (!lreq) {
+ dout("%s linger_id %llu dne\n", __func__, linger_id);
+ goto out_unlock;
+ }
+
+ dout("%s lreq %p linger_id %llu map_dne_bound %u newest %llu\n",
+ __func__, lreq, lreq->linger_id, lreq->map_dne_bound,
+ greq->u.newest);
+ if (!lreq->map_dne_bound)
+ lreq->map_dne_bound = greq->u.newest;
+ erase_linger_mc(&osdc->linger_map_checks, lreq);
+ check_linger_pool_dne(lreq);
+
+ linger_put(lreq);
+out_unlock:
+ up_write(&osdc->lock);
+}
+
+static void send_linger_map_check(struct ceph_osd_linger_request *lreq)
+{
+ struct ceph_osd_client *osdc = lreq->osdc;
+ struct ceph_osd_linger_request *lookup_lreq;
+ int ret;
+
+ verify_osdc_wrlocked(osdc);
+
+ lookup_lreq = lookup_linger_mc(&osdc->linger_map_checks,
+ lreq->linger_id);
+ if (lookup_lreq) {
+ WARN_ON(lookup_lreq != lreq);
+ return;
+ }
+
+ linger_get(lreq);
+ insert_linger_mc(&osdc->linger_map_checks, lreq);
+ ret = ceph_monc_get_version_async(&osdc->client->monc, "osdmap",
+ linger_map_check_cb, lreq->linger_id);
+ WARN_ON(ret);
+}
+
+static int linger_reg_commit_wait(struct ceph_osd_linger_request *lreq)
+{
+ int ret;
+
+ dout("%s lreq %p linger_id %llu\n", __func__, lreq, lreq->linger_id);
+ ret = wait_for_completion_interruptible(&lreq->reg_commit_wait);
+ return ret ?: lreq->reg_commit_error;
+}
+
+static int linger_notify_finish_wait(struct ceph_osd_linger_request *lreq)
+{
+ int ret;
+
+ dout("%s lreq %p linger_id %llu\n", __func__, lreq, lreq->linger_id);
+ ret = wait_for_completion_interruptible(&lreq->notify_finish_wait);
+ return ret ?: lreq->notify_finish_error;
}
/*
- * Timeout callback, called every N seconds when 1 or more osd
- * requests has been active for more than N seconds. When this
- * happens, we ping all OSDs with requests who have timed out to
- * ensure any communications channel reset is detected. Reset the
- * request timeouts another N seconds in the future as we go.
- * Reschedule the timeout event another N seconds in future (unless
- * there are no open requests).
+ * Timeout callback, called every N seconds. When 1 or more OSD
+ * requests has been active for more than N seconds, we send a keepalive
+ * (tag + timestamp) to its OSD to ensure any communications channel
+ * reset is detected.
*/
static void handle_timeout(struct work_struct *work)
{
struct ceph_osd_client *osdc =
container_of(work, struct ceph_osd_client, timeout_work.work);
struct ceph_options *opts = osdc->client->options;
- struct ceph_osd_request *req;
- struct ceph_osd *osd;
- struct list_head slow_osds;
- dout("timeout\n");
- down_read(&osdc->map_sem);
-
- ceph_monc_request_next_osdmap(&osdc->client->monc);
+ unsigned long cutoff = jiffies - opts->osd_keepalive_timeout;
+ LIST_HEAD(slow_osds);
+ struct rb_node *n, *p;
- mutex_lock(&osdc->request_mutex);
+ dout("%s osdc %p\n", __func__, osdc);
+ down_write(&osdc->lock);
/*
* ping osds that are a bit slow. this ensures that if there
* is a break in the TCP connection we will notice, and reopen
* a connection with that osd (from the fault callback).
*/
- INIT_LIST_HEAD(&slow_osds);
- list_for_each_entry(req, &osdc->req_lru, r_req_lru_item) {
- if (time_before(jiffies,
- req->r_stamp + opts->osd_keepalive_timeout))
- break;
+ for (n = rb_first(&osdc->osds); n; n = rb_next(n)) {
+ struct ceph_osd *osd = rb_entry(n, struct ceph_osd, o_node);
+ bool found = false;
+
+ for (p = rb_first(&osd->o_requests); p; p = rb_next(p)) {
+ struct ceph_osd_request *req =
+ rb_entry(p, struct ceph_osd_request, r_node);
+
+ if (time_before(req->r_stamp, cutoff)) {
+ dout(" req %p tid %llu on osd%d is laggy\n",
+ req, req->r_tid, osd->o_osd);
+ found = true;
+ }
+ }
+ for (p = rb_first(&osd->o_linger_requests); p; p = rb_next(p)) {
+ struct ceph_osd_linger_request *lreq =
+ rb_entry(p, struct ceph_osd_linger_request, node);
+
+ dout(" lreq %p linger_id %llu is served by osd%d\n",
+ lreq, lreq->linger_id, osd->o_osd);
+ found = true;
+
+ mutex_lock(&lreq->lock);
+ if (lreq->is_watch && lreq->committed && !lreq->last_error)
+ send_linger_ping(lreq);
+ mutex_unlock(&lreq->lock);
+ }
- osd = req->r_osd;
- BUG_ON(!osd);
- dout(" tid %llu is slow, will send keepalive on osd%d\n",
- req->r_tid, osd->o_osd);
- list_move_tail(&osd->o_keepalive_item, &slow_osds);
+ if (found)
+ list_move_tail(&osd->o_keepalive_item, &slow_osds);
}
+
+ if (atomic_read(&osdc->num_homeless) || !list_empty(&slow_osds))
+ maybe_request_map(osdc);
+
while (!list_empty(&slow_osds)) {
- osd = list_entry(slow_osds.next, struct ceph_osd,
- o_keepalive_item);
+ struct ceph_osd *osd = list_first_entry(&slow_osds,
+ struct ceph_osd,
+ o_keepalive_item);
list_del_init(&osd->o_keepalive_item);
ceph_con_keepalive(&osd->o_con);
}
- __schedule_osd_timeout(osdc);
- __send_queued(osdc);
- mutex_unlock(&osdc->request_mutex);
- up_read(&osdc->map_sem);
+ up_write(&osdc->lock);
+ schedule_delayed_work(&osdc->timeout_work,
+ osdc->client->options->osd_keepalive_timeout);
}
static void handle_osds_timeout(struct work_struct *work)
@@ -1663,12 +2541,20 @@ static void handle_osds_timeout(struct work_struct *work)
container_of(work, struct ceph_osd_client,
osds_timeout_work.work);
unsigned long delay = osdc->client->options->osd_idle_ttl / 4;
+ struct ceph_osd *osd, *nosd;
- dout("osds timeout\n");
- down_read(&osdc->map_sem);
- remove_old_osds(osdc);
- up_read(&osdc->map_sem);
+ dout("%s osdc %p\n", __func__, osdc);
+ down_write(&osdc->lock);
+ list_for_each_entry_safe(osd, nosd, &osdc->osd_lru, o_osd_lru) {
+ if (time_before(jiffies, osd->lru_ttl))
+ break;
+ WARN_ON(!RB_EMPTY_ROOT(&osd->o_requests));
+ WARN_ON(!RB_EMPTY_ROOT(&osd->o_linger_requests));
+ close_osd(osd);
+ }
+
+ up_write(&osdc->lock);
schedule_delayed_work(&osdc->osds_timeout_work,
round_jiffies_relative(delay));
}
@@ -1776,107 +2662,76 @@ e_inval:
goto out;
}
-static void complete_request(struct ceph_osd_request *req)
-{
- complete_all(&req->r_safe_completion); /* fsync waiter */
-}
+struct MOSDOpReply {
+ struct ceph_pg pgid;
+ u64 flags;
+ int result;
+ u32 epoch;
+ int num_ops;
+ u32 outdata_len[CEPH_OSD_MAX_OPS];
+ s32 rval[CEPH_OSD_MAX_OPS];
+ int retry_attempt;
+ struct ceph_eversion replay_version;
+ u64 user_version;
+ struct ceph_request_redirect redirect;
+};
-/*
- * handle osd op reply. either call the callback if it is specified,
- * or do the completion to wake up the waiting thread.
- */
-static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg)
+static int decode_MOSDOpReply(const struct ceph_msg *msg, struct MOSDOpReply *m)
{
- void *p, *end;
- struct ceph_osd_request *req;
- struct ceph_request_redirect redir;
- u64 tid;
- int object_len;
- unsigned int numops;
- int payload_len, flags;
- s32 result;
- s32 retry_attempt;
- struct ceph_pg pg;
- int err;
- u32 reassert_epoch;
- u64 reassert_version;
- u32 osdmap_epoch;
- int already_completed;
- u32 bytes;
+ void *p = msg->front.iov_base;
+ void *const end = p + msg->front.iov_len;
+ u16 version = le16_to_cpu(msg->hdr.version);
+ struct ceph_eversion bad_replay_version;
u8 decode_redir;
- unsigned int i;
-
- tid = le64_to_cpu(msg->hdr.tid);
- dout("handle_reply %p tid %llu\n", msg, tid);
+ u32 len;
+ int ret;
+ int i;
- p = msg->front.iov_base;
- end = p + msg->front.iov_len;
+ ceph_decode_32_safe(&p, end, len, e_inval);
+ ceph_decode_need(&p, end, len, e_inval);
+ p += len; /* skip oid */
- ceph_decode_need(&p, end, 4, bad);
- object_len = ceph_decode_32(&p);
- ceph_decode_need(&p, end, object_len, bad);
- p += object_len;
+ ret = ceph_decode_pgid(&p, end, &m->pgid);
+ if (ret)
+ return ret;
- err = ceph_decode_pgid(&p, end, &pg);
- if (err)
- goto bad;
+ ceph_decode_64_safe(&p, end, m->flags, e_inval);
+ ceph_decode_32_safe(&p, end, m->result, e_inval);
+ ceph_decode_need(&p, end, sizeof(bad_replay_version), e_inval);
+ memcpy(&bad_replay_version, p, sizeof(bad_replay_version));
+ p += sizeof(bad_replay_version);
+ ceph_decode_32_safe(&p, end, m->epoch, e_inval);
- ceph_decode_need(&p, end, 8 + 4 + 4 + 8 + 4, bad);
- flags = ceph_decode_64(&p);
- result = ceph_decode_32(&p);
- reassert_epoch = ceph_decode_32(&p);
- reassert_version = ceph_decode_64(&p);
- osdmap_epoch = ceph_decode_32(&p);
-
- /* lookup */
- down_read(&osdc->map_sem);
- mutex_lock(&osdc->request_mutex);
- req = __lookup_request(osdc, tid);
- if (req == NULL) {
- dout("handle_reply tid %llu dne\n", tid);
- goto bad_mutex;
- }
- ceph_osdc_get_request(req);
+ ceph_decode_32_safe(&p, end, m->num_ops, e_inval);
+ if (m->num_ops > ARRAY_SIZE(m->outdata_len))
+ goto e_inval;
- dout("handle_reply %p tid %llu req %p result %d\n", msg, tid,
- req, result);
-
- ceph_decode_need(&p, end, 4, bad_put);
- numops = ceph_decode_32(&p);
- if (numops > CEPH_OSD_MAX_OPS)
- goto bad_put;
- if (numops != req->r_num_ops)
- goto bad_put;
- payload_len = 0;
- ceph_decode_need(&p, end, numops * sizeof(struct ceph_osd_op), bad_put);
- for (i = 0; i < numops; i++) {
+ ceph_decode_need(&p, end, m->num_ops * sizeof(struct ceph_osd_op),
+ e_inval);
+ for (i = 0; i < m->num_ops; i++) {
struct ceph_osd_op *op = p;
- int len;
- len = le32_to_cpu(op->payload_len);
- req->r_ops[i].outdata_len = len;
- dout(" op %d has %d bytes\n", i, len);
- payload_len += len;
+ m->outdata_len[i] = le32_to_cpu(op->payload_len);
p += sizeof(*op);
}
- bytes = le32_to_cpu(msg->hdr.data_len);
- if (payload_len != bytes) {
- pr_warn("sum of op payload lens %d != data_len %d\n",
- payload_len, bytes);
- goto bad_put;
- }
- ceph_decode_need(&p, end, 4 + numops * 4, bad_put);
- retry_attempt = ceph_decode_32(&p);
- for (i = 0; i < numops; i++)
- req->r_ops[i].rval = ceph_decode_32(&p);
+ ceph_decode_32_safe(&p, end, m->retry_attempt, e_inval);
+ for (i = 0; i < m->num_ops; i++)
+ ceph_decode_32_safe(&p, end, m->rval[i], e_inval);
- if (le16_to_cpu(msg->hdr.version) >= 6) {
- p += 8 + 4; /* skip replay_version */
- p += 8; /* skip user_version */
+ if (version >= 5) {
+ ceph_decode_need(&p, end, sizeof(m->replay_version), e_inval);
+ memcpy(&m->replay_version, p, sizeof(m->replay_version));
+ p += sizeof(m->replay_version);
+ ceph_decode_64_safe(&p, end, m->user_version, e_inval);
+ } else {
+ m->replay_version = bad_replay_version; /* struct */
+ m->user_version = le64_to_cpu(m->replay_version.version);
+ }
- if (le16_to_cpu(msg->hdr.version) >= 7)
- ceph_decode_8_safe(&p, end, decode_redir, bad_put);
+ if (version >= 6) {
+ if (version >= 7)
+ ceph_decode_8_safe(&p, end, decode_redir, e_inval);
else
decode_redir = 1;
} else {
@@ -1884,228 +2739,410 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg)
}
if (decode_redir) {
- err = ceph_redirect_decode(&p, end, &redir);
- if (err)
- goto bad_put;
+ ret = ceph_redirect_decode(&p, end, &m->redirect);
+ if (ret)
+ return ret;
} else {
- redir.oloc.pool = -1;
+ ceph_oloc_init(&m->redirect.oloc);
}
- if (redir.oloc.pool != -1) {
- dout("redirect pool %lld\n", redir.oloc.pool);
+ return 0;
- __unregister_request(osdc, req);
+e_inval:
+ return -EINVAL;
+}
- req->r_target_oloc = redir.oloc; /* struct */
+/*
+ * We are done with @req if
+ * - @m is a safe reply, or
+ * - @m is an unsafe reply and we didn't want a safe one
+ */
+static bool done_request(const struct ceph_osd_request *req,
+ const struct MOSDOpReply *m)
+{
+ return (m->result < 0 ||
+ (m->flags & CEPH_OSD_FLAG_ONDISK) ||
+ !(req->r_flags & CEPH_OSD_FLAG_ONDISK));
+}
- /*
- * Start redirect requests with nofail=true. If
- * mapping fails, request will end up on the notarget
- * list, waiting for the new osdmap (which can take
- * a while), even though the original request mapped
- * successfully. In the future we might want to follow
- * original request's nofail setting here.
- */
- err = __ceph_osdc_start_request(osdc, req, true);
- BUG_ON(err);
+/*
+ * handle osd op reply. either call the callback if it is specified,
+ * or do the completion to wake up the waiting thread.
+ *
+ * ->r_unsafe_callback is set? yes no
+ *
+ * first reply is OK (needed r_cb/r_completion, r_cb/r_completion,
+ * any or needed/got safe) r_safe_completion r_safe_completion
+ *
+ * first reply is unsafe r_unsafe_cb(true) (nothing)
+ *
+ * when we get the safe reply r_unsafe_cb(false), r_cb/r_completion,
+ * r_safe_completion r_safe_completion
+ */
+static void handle_reply(struct ceph_osd *osd, struct ceph_msg *msg)
+{
+ struct ceph_osd_client *osdc = osd->o_osdc;
+ struct ceph_osd_request *req;
+ struct MOSDOpReply m;
+ u64 tid = le64_to_cpu(msg->hdr.tid);
+ u32 data_len = 0;
+ bool already_acked;
+ int ret;
+ int i;
- goto out_unlock;
- }
+ dout("%s msg %p tid %llu\n", __func__, msg, tid);
- already_completed = req->r_got_reply;
- if (!req->r_got_reply) {
- req->r_result = result;
- dout("handle_reply result %d bytes %d\n", req->r_result,
- bytes);
- if (req->r_result == 0)
- req->r_result = bytes;
+ down_read(&osdc->lock);
+ if (!osd_registered(osd)) {
+ dout("%s osd%d unknown\n", __func__, osd->o_osd);
+ goto out_unlock_osdc;
+ }
+ WARN_ON(osd->o_osd != le64_to_cpu(msg->hdr.src.num));
- /* in case this is a write and we need to replay, */
- req->r_reassert_version.epoch = cpu_to_le32(reassert_epoch);
- req->r_reassert_version.version = cpu_to_le64(reassert_version);
+ mutex_lock(&osd->lock);
+ req = lookup_request(&osd->o_requests, tid);
+ if (!req) {
+ dout("%s osd%d tid %llu unknown\n", __func__, osd->o_osd, tid);
+ goto out_unlock_session;
+ }
- req->r_got_reply = 1;
- } else if ((flags & CEPH_OSD_FLAG_ONDISK) == 0) {
- dout("handle_reply tid %llu dup ack\n", tid);
- goto out_unlock;
+ ret = decode_MOSDOpReply(msg, &m);
+ if (ret) {
+ pr_err("failed to decode MOSDOpReply for tid %llu: %d\n",
+ req->r_tid, ret);
+ ceph_msg_dump(msg);
+ goto fail_request;
+ }
+ dout("%s req %p tid %llu flags 0x%llx pgid %llu.%x epoch %u attempt %d v %u'%llu uv %llu\n",
+ __func__, req, req->r_tid, m.flags, m.pgid.pool, m.pgid.seed,
+ m.epoch, m.retry_attempt, le32_to_cpu(m.replay_version.epoch),
+ le64_to_cpu(m.replay_version.version), m.user_version);
+
+ if (m.retry_attempt >= 0) {
+ if (m.retry_attempt != req->r_attempts - 1) {
+ dout("req %p tid %llu retry_attempt %d != %d, ignoring\n",
+ req, req->r_tid, m.retry_attempt,
+ req->r_attempts - 1);
+ goto out_unlock_session;
+ }
+ } else {
+ WARN_ON(1); /* MOSDOpReply v4 is assumed */
}
- dout("handle_reply tid %llu flags %d\n", tid, flags);
+ if (!ceph_oloc_empty(&m.redirect.oloc)) {
+ dout("req %p tid %llu redirect pool %lld\n", req, req->r_tid,
+ m.redirect.oloc.pool);
+ unlink_request(osd, req);
+ mutex_unlock(&osd->lock);
+
+ ceph_oloc_copy(&req->r_t.target_oloc, &m.redirect.oloc);
+ req->r_flags |= CEPH_OSD_FLAG_REDIRECTED;
+ req->r_tid = 0;
+ __submit_request(req, false);
+ goto out_unlock_osdc;
+ }
- if (req->r_linger && (flags & CEPH_OSD_FLAG_ONDISK))
- __register_linger_request(osdc, req);
+ if (m.num_ops != req->r_num_ops) {
+ pr_err("num_ops %d != %d for tid %llu\n", m.num_ops,
+ req->r_num_ops, req->r_tid);
+ goto fail_request;
+ }
+ for (i = 0; i < req->r_num_ops; i++) {
+ dout(" req %p tid %llu op %d rval %d len %u\n", req,
+ req->r_tid, i, m.rval[i], m.outdata_len[i]);
+ req->r_ops[i].rval = m.rval[i];
+ req->r_ops[i].outdata_len = m.outdata_len[i];
+ data_len += m.outdata_len[i];
+ }
+ if (data_len != le32_to_cpu(msg->hdr.data_len)) {
+ pr_err("sum of lens %u != %u for tid %llu\n", data_len,
+ le32_to_cpu(msg->hdr.data_len), req->r_tid);
+ goto fail_request;
+ }
+ dout("%s req %p tid %llu acked %d result %d data_len %u\n", __func__,
+ req, req->r_tid, req->r_got_reply, m.result, data_len);
+
+ already_acked = req->r_got_reply;
+ if (!already_acked) {
+ req->r_result = m.result ?: data_len;
+ req->r_replay_version = m.replay_version; /* struct */
+ req->r_got_reply = true;
+ } else if (!(m.flags & CEPH_OSD_FLAG_ONDISK)) {
+ dout("req %p tid %llu dup ack\n", req, req->r_tid);
+ goto out_unlock_session;
+ }
- /* either this is a read, or we got the safe response */
- if (result < 0 ||
- (flags & CEPH_OSD_FLAG_ONDISK) ||
- ((flags & CEPH_OSD_FLAG_WRITE) == 0))
- __unregister_request(osdc, req);
+ if (done_request(req, &m)) {
+ __finish_request(req);
+ if (req->r_linger) {
+ WARN_ON(req->r_unsafe_callback);
+ dout("req %p tid %llu cb (locked)\n", req, req->r_tid);
+ __complete_request(req);
+ }
+ }
- mutex_unlock(&osdc->request_mutex);
- up_read(&osdc->map_sem);
+ mutex_unlock(&osd->lock);
+ up_read(&osdc->lock);
- if (!already_completed) {
- if (req->r_unsafe_callback &&
- result >= 0 && !(flags & CEPH_OSD_FLAG_ONDISK))
+ if (done_request(req, &m)) {
+ if (already_acked && req->r_unsafe_callback) {
+ dout("req %p tid %llu safe-cb\n", req, req->r_tid);
+ req->r_unsafe_callback(req, false);
+ } else if (!req->r_linger) {
+ dout("req %p tid %llu cb\n", req, req->r_tid);
+ __complete_request(req);
+ }
+ if (m.flags & CEPH_OSD_FLAG_ONDISK)
+ complete_all(&req->r_safe_completion);
+ ceph_osdc_put_request(req);
+ } else {
+ if (req->r_unsafe_callback) {
+ dout("req %p tid %llu unsafe-cb\n", req, req->r_tid);
req->r_unsafe_callback(req, true);
- if (req->r_callback)
- req->r_callback(req, msg);
- else
- complete_all(&req->r_completion);
+ } else {
+ WARN_ON(1);
+ }
}
- if (flags & CEPH_OSD_FLAG_ONDISK) {
- if (req->r_unsafe_callback && already_completed)
- req->r_unsafe_callback(req, false);
- complete_request(req);
+ return;
+
+fail_request:
+ complete_request(req, -EIO);
+out_unlock_session:
+ mutex_unlock(&osd->lock);
+out_unlock_osdc:
+ up_read(&osdc->lock);
+}
+
+static void set_pool_was_full(struct ceph_osd_client *osdc)
+{
+ struct rb_node *n;
+
+ for (n = rb_first(&osdc->osdmap->pg_pools); n; n = rb_next(n)) {
+ struct ceph_pg_pool_info *pi =
+ rb_entry(n, struct ceph_pg_pool_info, node);
+
+ pi->was_full = __pool_full(pi);
}
+}
-out:
- dout("req=%p req->r_linger=%d\n", req, req->r_linger);
- ceph_osdc_put_request(req);
- return;
-out_unlock:
- mutex_unlock(&osdc->request_mutex);
- up_read(&osdc->map_sem);
- goto out;
+static bool pool_cleared_full(struct ceph_osd_client *osdc, s64 pool_id)
+{
+ struct ceph_pg_pool_info *pi;
-bad_put:
- req->r_result = -EIO;
- __unregister_request(osdc, req);
- if (req->r_callback)
- req->r_callback(req, msg);
- else
- complete_all(&req->r_completion);
- complete_request(req);
- ceph_osdc_put_request(req);
-bad_mutex:
- mutex_unlock(&osdc->request_mutex);
- up_read(&osdc->map_sem);
-bad:
- pr_err("corrupt osd_op_reply got %d %d\n",
- (int)msg->front.iov_len, le32_to_cpu(msg->hdr.front_len));
- ceph_msg_dump(msg);
+ pi = ceph_pg_pool_by_id(osdc->osdmap, pool_id);
+ if (!pi)
+ return false;
+
+ return pi->was_full && !__pool_full(pi);
}
-static void reset_changed_osds(struct ceph_osd_client *osdc)
+static enum calc_target_result
+recalc_linger_target(struct ceph_osd_linger_request *lreq)
{
- struct rb_node *p, *n;
+ struct ceph_osd_client *osdc = lreq->osdc;
+ enum calc_target_result ct_res;
- dout("%s %p\n", __func__, osdc);
- for (p = rb_first(&osdc->osds); p; p = n) {
- struct ceph_osd *osd = rb_entry(p, struct ceph_osd, o_node);
+ ct_res = calc_target(osdc, &lreq->t, &lreq->last_force_resend, true);
+ if (ct_res == CALC_TARGET_NEED_RESEND) {
+ struct ceph_osd *osd;
- n = rb_next(p);
- if (!ceph_osd_is_up(osdc->osdmap, osd->o_osd) ||
- memcmp(&osd->o_con.peer_addr,
- ceph_osd_addr(osdc->osdmap,
- osd->o_osd),
- sizeof(struct ceph_entity_addr)) != 0)
- __reset_osd(osdc, osd);
+ osd = lookup_create_osd(osdc, lreq->t.osd, true);
+ if (osd != lreq->osd) {
+ unlink_linger(lreq->osd, lreq);
+ link_linger(osd, lreq);
+ }
}
+
+ return ct_res;
}
/*
- * Requeue requests whose mapping to an OSD has changed. If requests map to
- * no osd, request a new map.
- *
- * Caller should hold map_sem for read.
+ * Requeue requests whose mapping to an OSD has changed.
*/
-static void kick_requests(struct ceph_osd_client *osdc, bool force_resend,
- bool force_resend_writes)
+static void scan_requests(struct ceph_osd *osd,
+ bool force_resend,
+ bool cleared_full,
+ bool check_pool_cleared_full,
+ struct rb_root *need_resend,
+ struct list_head *need_resend_linger)
{
- struct ceph_osd_request *req, *nreq;
- struct rb_node *p;
- int needmap = 0;
- int err;
- bool force_resend_req;
+ struct ceph_osd_client *osdc = osd->o_osdc;
+ struct rb_node *n;
+ bool force_resend_writes;
+
+ for (n = rb_first(&osd->o_linger_requests); n; ) {
+ struct ceph_osd_linger_request *lreq =
+ rb_entry(n, struct ceph_osd_linger_request, node);
+ enum calc_target_result ct_res;
+
+ n = rb_next(n); /* recalc_linger_target() */
+
+ dout("%s lreq %p linger_id %llu\n", __func__, lreq,
+ lreq->linger_id);
+ ct_res = recalc_linger_target(lreq);
+ switch (ct_res) {
+ case CALC_TARGET_NO_ACTION:
+ force_resend_writes = cleared_full ||
+ (check_pool_cleared_full &&
+ pool_cleared_full(osdc, lreq->t.base_oloc.pool));
+ if (!force_resend && !force_resend_writes)
+ break;
+
+ /* fall through */
+ case CALC_TARGET_NEED_RESEND:
+ cancel_linger_map_check(lreq);
+ /*
+ * scan_requests() for the previous epoch(s)
+ * may have already added it to the list, since
+ * it's not unlinked here.
+ */
+ if (list_empty(&lreq->scan_item))
+ list_add_tail(&lreq->scan_item, need_resend_linger);
+ break;
+ case CALC_TARGET_POOL_DNE:
+ check_linger_pool_dne(lreq);
+ break;
+ }
+ }
- dout("kick_requests %s %s\n", force_resend ? " (force resend)" : "",
- force_resend_writes ? " (force resend writes)" : "");
- mutex_lock(&osdc->request_mutex);
- for (p = rb_first(&osdc->requests); p; ) {
- req = rb_entry(p, struct ceph_osd_request, r_node);
- p = rb_next(p);
+ for (n = rb_first(&osd->o_requests); n; ) {
+ struct ceph_osd_request *req =
+ rb_entry(n, struct ceph_osd_request, r_node);
+ enum calc_target_result ct_res;
+
+ n = rb_next(n); /* unlink_request(), check_pool_dne() */
+
+ dout("%s req %p tid %llu\n", __func__, req, req->r_tid);
+ ct_res = calc_target(osdc, &req->r_t,
+ &req->r_last_force_resend, false);
+ switch (ct_res) {
+ case CALC_TARGET_NO_ACTION:
+ force_resend_writes = cleared_full ||
+ (check_pool_cleared_full &&
+ pool_cleared_full(osdc, req->r_t.base_oloc.pool));
+ if (!force_resend &&
+ (!(req->r_flags & CEPH_OSD_FLAG_WRITE) ||
+ !force_resend_writes))
+ break;
+
+ /* fall through */
+ case CALC_TARGET_NEED_RESEND:
+ cancel_map_check(req);
+ unlink_request(osd, req);
+ insert_request(need_resend, req);
+ break;
+ case CALC_TARGET_POOL_DNE:
+ check_pool_dne(req);
+ break;
+ }
+ }
+}
+static int handle_one_map(struct ceph_osd_client *osdc,
+ void *p, void *end, bool incremental,
+ struct rb_root *need_resend,
+ struct list_head *need_resend_linger)
+{
+ struct ceph_osdmap *newmap;
+ struct rb_node *n;
+ bool skipped_map = false;
+ bool was_full;
+
+ was_full = ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL);
+ set_pool_was_full(osdc);
+
+ if (incremental)
+ newmap = osdmap_apply_incremental(&p, end, osdc->osdmap);
+ else
+ newmap = ceph_osdmap_decode(&p, end);
+ if (IS_ERR(newmap))
+ return PTR_ERR(newmap);
+
+ if (newmap != osdc->osdmap) {
/*
- * For linger requests that have not yet been
- * registered, move them to the linger list; they'll
- * be sent to the osd in the loop below. Unregister
- * the request before re-registering it as a linger
- * request to ensure the __map_request() below
- * will decide it needs to be sent.
+ * Preserve ->was_full before destroying the old map.
+ * For pools that weren't in the old map, ->was_full
+ * should be false.
*/
- if (req->r_linger && list_empty(&req->r_linger_item)) {
- dout("%p tid %llu restart on osd%d\n",
- req, req->r_tid,
- req->r_osd ? req->r_osd->o_osd : -1);
- ceph_osdc_get_request(req);
- __unregister_request(osdc, req);
- __register_linger_request(osdc, req);
- ceph_osdc_put_request(req);
- continue;
+ for (n = rb_first(&newmap->pg_pools); n; n = rb_next(n)) {
+ struct ceph_pg_pool_info *pi =
+ rb_entry(n, struct ceph_pg_pool_info, node);
+ struct ceph_pg_pool_info *old_pi;
+
+ old_pi = ceph_pg_pool_by_id(osdc->osdmap, pi->id);
+ if (old_pi)
+ pi->was_full = old_pi->was_full;
+ else
+ WARN_ON(pi->was_full);
}
- force_resend_req = force_resend ||
- (force_resend_writes &&
- req->r_flags & CEPH_OSD_FLAG_WRITE);
- err = __map_request(osdc, req, force_resend_req);
- if (err < 0)
- continue; /* error */
- if (req->r_osd == NULL) {
- dout("%p tid %llu maps to no osd\n", req, req->r_tid);
- needmap++; /* request a newer map */
- } else if (err > 0) {
- if (!req->r_linger) {
- dout("%p tid %llu requeued on osd%d\n", req,
- req->r_tid,
- req->r_osd ? req->r_osd->o_osd : -1);
- req->r_flags |= CEPH_OSD_FLAG_RETRY;
- }
+ if (osdc->osdmap->epoch &&
+ osdc->osdmap->epoch + 1 < newmap->epoch) {
+ WARN_ON(incremental);
+ skipped_map = true;
}
+
+ ceph_osdmap_destroy(osdc->osdmap);
+ osdc->osdmap = newmap;
}
- list_for_each_entry_safe(req, nreq, &osdc->req_linger,
- r_linger_item) {
- dout("linger req=%p req->r_osd=%p\n", req, req->r_osd);
-
- err = __map_request(osdc, req,
- force_resend || force_resend_writes);
- dout("__map_request returned %d\n", err);
- if (err < 0)
- continue; /* hrm! */
- if (req->r_osd == NULL || err > 0) {
- if (req->r_osd == NULL) {
- dout("lingering %p tid %llu maps to no osd\n",
- req, req->r_tid);
- /*
- * A homeless lingering request makes
- * no sense, as it's job is to keep
- * a particular OSD connection open.
- * Request a newer map and kick the
- * request, knowing that it won't be
- * resent until we actually get a map
- * that can tell us where to send it.
- */
- needmap++;
- }
+ was_full &= !ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL);
+ scan_requests(&osdc->homeless_osd, skipped_map, was_full, true,
+ need_resend, need_resend_linger);
+
+ for (n = rb_first(&osdc->osds); n; ) {
+ struct ceph_osd *osd = rb_entry(n, struct ceph_osd, o_node);
+
+ n = rb_next(n); /* close_osd() */
+
+ scan_requests(osd, skipped_map, was_full, true, need_resend,
+ need_resend_linger);
+ if (!ceph_osd_is_up(osdc->osdmap, osd->o_osd) ||
+ memcmp(&osd->o_con.peer_addr,
+ ceph_osd_addr(osdc->osdmap, osd->o_osd),
+ sizeof(struct ceph_entity_addr)))
+ close_osd(osd);
+ }
- dout("kicking lingering %p tid %llu osd%d\n", req,
- req->r_tid, req->r_osd ? req->r_osd->o_osd : -1);
- __register_request(osdc, req);
- __unregister_linger_request(osdc, req);
+ return 0;
+}
+
+static void kick_requests(struct ceph_osd_client *osdc,
+ struct rb_root *need_resend,
+ struct list_head *need_resend_linger)
+{
+ struct ceph_osd_linger_request *lreq, *nlreq;
+ struct rb_node *n;
+
+ for (n = rb_first(need_resend); n; ) {
+ struct ceph_osd_request *req =
+ rb_entry(n, struct ceph_osd_request, r_node);
+ struct ceph_osd *osd;
+
+ n = rb_next(n);
+ erase_request(need_resend, req); /* before link_request() */
+
+ WARN_ON(req->r_osd);
+ calc_target(osdc, &req->r_t, NULL, false);
+ osd = lookup_create_osd(osdc, req->r_t.osd, true);
+ link_request(osd, req);
+ if (!req->r_linger) {
+ if (!osd_homeless(osd) && !req->r_t.paused)
+ send_request(req);
+ } else {
+ cancel_linger_request(req);
}
}
- reset_changed_osds(osdc);
- mutex_unlock(&osdc->request_mutex);
- if (needmap) {
- dout("%d requests for down osds, need new map\n", needmap);
- ceph_monc_request_next_osdmap(&osdc->client->monc);
+ list_for_each_entry_safe(lreq, nlreq, need_resend_linger, scan_item) {
+ if (!osd_homeless(lreq->osd))
+ send_linger(lreq);
+
+ list_del_init(&lreq->scan_item);
}
}
-
/*
* Process updated osd map.
*
@@ -2115,27 +3152,31 @@ static void kick_requests(struct ceph_osd_client *osdc, bool force_resend,
*/
void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg)
{
- void *p, *end, *next;
+ void *p = msg->front.iov_base;
+ void *const end = p + msg->front.iov_len;
u32 nr_maps, maplen;
u32 epoch;
- struct ceph_osdmap *newmap = NULL, *oldmap;
- int err;
struct ceph_fsid fsid;
- bool was_full;
+ struct rb_root need_resend = RB_ROOT;
+ LIST_HEAD(need_resend_linger);
+ bool handled_incremental = false;
+ bool was_pauserd, was_pausewr;
+ bool pauserd, pausewr;
+ int err;
- dout("handle_map have %u\n", osdc->osdmap ? osdc->osdmap->epoch : 0);
- p = msg->front.iov_base;
- end = p + msg->front.iov_len;
+ dout("%s have %u\n", __func__, osdc->osdmap->epoch);
+ down_write(&osdc->lock);
/* verify fsid */
ceph_decode_need(&p, end, sizeof(fsid), bad);
ceph_decode_copy(&p, &fsid, sizeof(fsid));
if (ceph_check_fsid(osdc->client, &fsid) < 0)
- return;
-
- down_write(&osdc->map_sem);
+ goto bad;
- was_full = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL);
+ was_pauserd = ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSERD);
+ was_pausewr = ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSEWR) ||
+ ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) ||
+ have_pool_full(osdc);
/* incremental maps */
ceph_decode_32_safe(&p, end, nr_maps, bad);
@@ -2145,34 +3186,23 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg)
epoch = ceph_decode_32(&p);
maplen = ceph_decode_32(&p);
ceph_decode_need(&p, end, maplen, bad);
- next = p + maplen;
- if (osdc->osdmap && osdc->osdmap->epoch+1 == epoch) {
+ if (osdc->osdmap->epoch &&
+ osdc->osdmap->epoch + 1 == epoch) {
dout("applying incremental map %u len %d\n",
epoch, maplen);
- newmap = osdmap_apply_incremental(&p, next,
- osdc->osdmap,
- &osdc->client->msgr);
- if (IS_ERR(newmap)) {
- err = PTR_ERR(newmap);
+ err = handle_one_map(osdc, p, p + maplen, true,
+ &need_resend, &need_resend_linger);
+ if (err)
goto bad;
- }
- BUG_ON(!newmap);
- if (newmap != osdc->osdmap) {
- ceph_osdmap_destroy(osdc->osdmap);
- osdc->osdmap = newmap;
- }
- was_full = was_full ||
- ceph_osdmap_flag(osdc->osdmap,
- CEPH_OSDMAP_FULL);
- kick_requests(osdc, 0, was_full);
+ handled_incremental = true;
} else {
dout("ignoring incremental map %u len %d\n",
epoch, maplen);
}
- p = next;
+ p += maplen;
nr_maps--;
}
- if (newmap)
+ if (handled_incremental)
goto done;
/* full maps */
@@ -2186,455 +3216,647 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg)
if (nr_maps > 1) {
dout("skipping non-latest full map %u len %d\n",
epoch, maplen);
- } else if (osdc->osdmap && osdc->osdmap->epoch >= epoch) {
+ } else if (osdc->osdmap->epoch >= epoch) {
dout("skipping full map %u len %d, "
"older than our %u\n", epoch, maplen,
osdc->osdmap->epoch);
} else {
- int skipped_map = 0;
-
dout("taking full map %u len %d\n", epoch, maplen);
- newmap = ceph_osdmap_decode(&p, p+maplen);
- if (IS_ERR(newmap)) {
- err = PTR_ERR(newmap);
+ err = handle_one_map(osdc, p, p + maplen, false,
+ &need_resend, &need_resend_linger);
+ if (err)
goto bad;
- }
- BUG_ON(!newmap);
- oldmap = osdc->osdmap;
- osdc->osdmap = newmap;
- if (oldmap) {
- if (oldmap->epoch + 1 < newmap->epoch)
- skipped_map = 1;
- ceph_osdmap_destroy(oldmap);
- }
- was_full = was_full ||
- ceph_osdmap_flag(osdc->osdmap,
- CEPH_OSDMAP_FULL);
- kick_requests(osdc, skipped_map, was_full);
}
p += maplen;
nr_maps--;
}
- if (!osdc->osdmap)
- goto bad;
done:
- downgrade_write(&osdc->map_sem);
- ceph_monc_got_map(&osdc->client->monc, CEPH_SUB_OSDMAP,
- osdc->osdmap->epoch);
-
/*
* subscribe to subsequent osdmap updates if full to ensure
* we find out when we are no longer full and stop returning
* ENOSPC.
*/
- if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL) ||
- ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSERD) ||
- ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSEWR))
- ceph_monc_request_next_osdmap(&osdc->client->monc);
-
- mutex_lock(&osdc->request_mutex);
- __send_queued(osdc);
- mutex_unlock(&osdc->request_mutex);
- up_read(&osdc->map_sem);
+ pauserd = ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSERD);
+ pausewr = ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSEWR) ||
+ ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) ||
+ have_pool_full(osdc);
+ if (was_pauserd || was_pausewr || pauserd || pausewr)
+ maybe_request_map(osdc);
+
+ kick_requests(osdc, &need_resend, &need_resend_linger);
+
+ ceph_monc_got_map(&osdc->client->monc, CEPH_SUB_OSDMAP,
+ osdc->osdmap->epoch);
+ up_write(&osdc->lock);
wake_up_all(&osdc->client->auth_wq);
return;
bad:
pr_err("osdc handle_map corrupt msg\n");
ceph_msg_dump(msg);
- up_write(&osdc->map_sem);
+ up_write(&osdc->lock);
}
/*
- * watch/notify callback event infrastructure
- *
- * These callbacks are used both for watch and notify operations.
+ * Resubmit requests pending on the given osd.
*/
-static void __release_event(struct kref *kref)
+static void kick_osd_requests(struct ceph_osd *osd)
{
- struct ceph_osd_event *event =
- container_of(kref, struct ceph_osd_event, kref);
+ struct rb_node *n;
- dout("__release_event %p\n", event);
- kfree(event);
-}
+ for (n = rb_first(&osd->o_requests); n; ) {
+ struct ceph_osd_request *req =
+ rb_entry(n, struct ceph_osd_request, r_node);
-static void get_event(struct ceph_osd_event *event)
-{
- kref_get(&event->kref);
-}
+ n = rb_next(n); /* cancel_linger_request() */
-void ceph_osdc_put_event(struct ceph_osd_event *event)
-{
- kref_put(&event->kref, __release_event);
+ if (!req->r_linger) {
+ if (!req->r_t.paused)
+ send_request(req);
+ } else {
+ cancel_linger_request(req);
+ }
+ }
+ for (n = rb_first(&osd->o_linger_requests); n; n = rb_next(n)) {
+ struct ceph_osd_linger_request *lreq =
+ rb_entry(n, struct ceph_osd_linger_request, node);
+
+ send_linger(lreq);
+ }
}
-EXPORT_SYMBOL(ceph_osdc_put_event);
-static void __insert_event(struct ceph_osd_client *osdc,
- struct ceph_osd_event *new)
+/*
+ * If the osd connection drops, we need to resubmit all requests.
+ */
+static void osd_fault(struct ceph_connection *con)
{
- struct rb_node **p = &osdc->event_tree.rb_node;
- struct rb_node *parent = NULL;
- struct ceph_osd_event *event = NULL;
+ struct ceph_osd *osd = con->private;
+ struct ceph_osd_client *osdc = osd->o_osdc;
- while (*p) {
- parent = *p;
- event = rb_entry(parent, struct ceph_osd_event, node);
- if (new->cookie < event->cookie)
- p = &(*p)->rb_left;
- else if (new->cookie > event->cookie)
- p = &(*p)->rb_right;
- else
- BUG();
+ dout("%s osd %p osd%d\n", __func__, osd, osd->o_osd);
+
+ down_write(&osdc->lock);
+ if (!osd_registered(osd)) {
+ dout("%s osd%d unknown\n", __func__, osd->o_osd);
+ goto out_unlock;
}
- rb_link_node(&new->node, parent, p);
- rb_insert_color(&new->node, &osdc->event_tree);
+ if (!reopen_osd(osd))
+ kick_osd_requests(osd);
+ maybe_request_map(osdc);
+
+out_unlock:
+ up_write(&osdc->lock);
}
-static struct ceph_osd_event *__find_event(struct ceph_osd_client *osdc,
- u64 cookie)
+/*
+ * Process osd watch notifications
+ */
+static void handle_watch_notify(struct ceph_osd_client *osdc,
+ struct ceph_msg *msg)
{
- struct rb_node **p = &osdc->event_tree.rb_node;
- struct rb_node *parent = NULL;
- struct ceph_osd_event *event = NULL;
+ void *p = msg->front.iov_base;
+ void *const end = p + msg->front.iov_len;
+ struct ceph_osd_linger_request *lreq;
+ struct linger_work *lwork;
+ u8 proto_ver, opcode;
+ u64 cookie, notify_id;
+ u64 notifier_id = 0;
+ s32 return_code = 0;
+ void *payload = NULL;
+ u32 payload_len = 0;
- while (*p) {
- parent = *p;
- event = rb_entry(parent, struct ceph_osd_event, node);
- if (cookie < event->cookie)
- p = &(*p)->rb_left;
- else if (cookie > event->cookie)
- p = &(*p)->rb_right;
- else
- return event;
+ ceph_decode_8_safe(&p, end, proto_ver, bad);
+ ceph_decode_8_safe(&p, end, opcode, bad);
+ ceph_decode_64_safe(&p, end, cookie, bad);
+ p += 8; /* skip ver */
+ ceph_decode_64_safe(&p, end, notify_id, bad);
+
+ if (proto_ver >= 1) {
+ ceph_decode_32_safe(&p, end, payload_len, bad);
+ ceph_decode_need(&p, end, payload_len, bad);
+ payload = p;
+ p += payload_len;
}
- return NULL;
-}
-static void __remove_event(struct ceph_osd_event *event)
-{
- struct ceph_osd_client *osdc = event->osdc;
+ if (le16_to_cpu(msg->hdr.version) >= 2)
+ ceph_decode_32_safe(&p, end, return_code, bad);
+
+ if (le16_to_cpu(msg->hdr.version) >= 3)
+ ceph_decode_64_safe(&p, end, notifier_id, bad);
- if (!RB_EMPTY_NODE(&event->node)) {
- dout("__remove_event removed %p\n", event);
- rb_erase(&event->node, &osdc->event_tree);
- ceph_osdc_put_event(event);
+ down_read(&osdc->lock);
+ lreq = lookup_linger_osdc(&osdc->linger_requests, cookie);
+ if (!lreq) {
+ dout("%s opcode %d cookie %llu dne\n", __func__, opcode,
+ cookie);
+ goto out_unlock_osdc;
+ }
+
+ mutex_lock(&lreq->lock);
+ dout("%s opcode %d cookie %llu lreq %p is_watch %d\n", __func__,
+ opcode, cookie, lreq, lreq->is_watch);
+ if (opcode == CEPH_WATCH_EVENT_DISCONNECT) {
+ if (!lreq->last_error) {
+ lreq->last_error = -ENOTCONN;
+ queue_watch_error(lreq);
+ }
+ } else if (!lreq->is_watch) {
+ /* CEPH_WATCH_EVENT_NOTIFY_COMPLETE */
+ if (lreq->notify_id && lreq->notify_id != notify_id) {
+ dout("lreq %p notify_id %llu != %llu, ignoring\n", lreq,
+ lreq->notify_id, notify_id);
+ } else if (!completion_done(&lreq->notify_finish_wait)) {
+ struct ceph_msg_data *data =
+ list_first_entry_or_null(&msg->data,
+ struct ceph_msg_data,
+ links);
+
+ if (data) {
+ if (lreq->preply_pages) {
+ WARN_ON(data->type !=
+ CEPH_MSG_DATA_PAGES);
+ *lreq->preply_pages = data->pages;
+ *lreq->preply_len = data->length;
+ } else {
+ ceph_release_page_vector(data->pages,
+ calc_pages_for(0, data->length));
+ }
+ }
+ lreq->notify_finish_error = return_code;
+ complete_all(&lreq->notify_finish_wait);
+ }
} else {
- dout("__remove_event didn't remove %p\n", event);
+ /* CEPH_WATCH_EVENT_NOTIFY */
+ lwork = lwork_alloc(lreq, do_watch_notify);
+ if (!lwork) {
+ pr_err("failed to allocate notify-lwork\n");
+ goto out_unlock_lreq;
+ }
+
+ lwork->notify.notify_id = notify_id;
+ lwork->notify.notifier_id = notifier_id;
+ lwork->notify.payload = payload;
+ lwork->notify.payload_len = payload_len;
+ lwork->notify.msg = ceph_msg_get(msg);
+ lwork_queue(lwork);
}
+
+out_unlock_lreq:
+ mutex_unlock(&lreq->lock);
+out_unlock_osdc:
+ up_read(&osdc->lock);
+ return;
+
+bad:
+ pr_err("osdc handle_watch_notify corrupt msg\n");
}
-int ceph_osdc_create_event(struct ceph_osd_client *osdc,
- void (*event_cb)(u64, u64, u8, void *),
- void *data, struct ceph_osd_event **pevent)
+/*
+ * Register request, send initial attempt.
+ */
+int ceph_osdc_start_request(struct ceph_osd_client *osdc,
+ struct ceph_osd_request *req,
+ bool nofail)
{
- struct ceph_osd_event *event;
+ down_read(&osdc->lock);
+ submit_request(req, false);
+ up_read(&osdc->lock);
- event = kmalloc(sizeof(*event), GFP_NOIO);
- if (!event)
- return -ENOMEM;
-
- dout("create_event %p\n", event);
- event->cb = event_cb;
- event->one_shot = 0;
- event->data = data;
- event->osdc = osdc;
- INIT_LIST_HEAD(&event->osd_node);
- RB_CLEAR_NODE(&event->node);
- kref_init(&event->kref); /* one ref for us */
- kref_get(&event->kref); /* one ref for the caller */
-
- spin_lock(&osdc->event_lock);
- event->cookie = ++osdc->event_count;
- __insert_event(osdc, event);
- spin_unlock(&osdc->event_lock);
-
- *pevent = event;
return 0;
}
-EXPORT_SYMBOL(ceph_osdc_create_event);
+EXPORT_SYMBOL(ceph_osdc_start_request);
-void ceph_osdc_cancel_event(struct ceph_osd_event *event)
+/*
+ * Unregister a registered request. The request is not completed (i.e.
+ * no callbacks or wakeups) - higher layers are supposed to know what
+ * they are canceling.
+ */
+void ceph_osdc_cancel_request(struct ceph_osd_request *req)
{
- struct ceph_osd_client *osdc = event->osdc;
+ struct ceph_osd_client *osdc = req->r_osdc;
- dout("cancel_event %p\n", event);
- spin_lock(&osdc->event_lock);
- __remove_event(event);
- spin_unlock(&osdc->event_lock);
- ceph_osdc_put_event(event); /* caller's */
+ down_write(&osdc->lock);
+ if (req->r_osd)
+ cancel_request(req);
+ up_write(&osdc->lock);
}
-EXPORT_SYMBOL(ceph_osdc_cancel_event);
-
+EXPORT_SYMBOL(ceph_osdc_cancel_request);
-static void do_event_work(struct work_struct *work)
+/*
+ * @timeout: in jiffies, 0 means "wait forever"
+ */
+static int wait_request_timeout(struct ceph_osd_request *req,
+ unsigned long timeout)
{
- struct ceph_osd_event_work *event_work =
- container_of(work, struct ceph_osd_event_work, work);
- struct ceph_osd_event *event = event_work->event;
- u64 ver = event_work->ver;
- u64 notify_id = event_work->notify_id;
- u8 opcode = event_work->opcode;
+ long left;
+
+ dout("%s req %p tid %llu\n", __func__, req, req->r_tid);
+ left = wait_for_completion_killable_timeout(&req->r_completion,
+ ceph_timeout_jiffies(timeout));
+ if (left <= 0) {
+ left = left ?: -ETIMEDOUT;
+ ceph_osdc_cancel_request(req);
- dout("do_event_work completing %p\n", event);
- event->cb(ver, notify_id, opcode, event->data);
- dout("do_event_work completed %p\n", event);
- ceph_osdc_put_event(event);
- kfree(event_work);
+ /* kludge - need to to wake ceph_osdc_sync() */
+ complete_all(&req->r_safe_completion);
+ } else {
+ left = req->r_result; /* completed */
+ }
+
+ return left;
}
+/*
+ * wait for a request to complete
+ */
+int ceph_osdc_wait_request(struct ceph_osd_client *osdc,
+ struct ceph_osd_request *req)
+{
+ return wait_request_timeout(req, 0);
+}
+EXPORT_SYMBOL(ceph_osdc_wait_request);
/*
- * Process osd watch notifications
+ * sync - wait for all in-flight requests to flush. avoid starvation.
*/
-static void handle_watch_notify(struct ceph_osd_client *osdc,
- struct ceph_msg *msg)
+void ceph_osdc_sync(struct ceph_osd_client *osdc)
{
- void *p, *end;
- u8 proto_ver;
- u64 cookie, ver, notify_id;
- u8 opcode;
- struct ceph_osd_event *event;
- struct ceph_osd_event_work *event_work;
+ struct rb_node *n, *p;
+ u64 last_tid = atomic64_read(&osdc->last_tid);
- p = msg->front.iov_base;
- end = p + msg->front.iov_len;
+again:
+ down_read(&osdc->lock);
+ for (n = rb_first(&osdc->osds); n; n = rb_next(n)) {
+ struct ceph_osd *osd = rb_entry(n, struct ceph_osd, o_node);
- ceph_decode_8_safe(&p, end, proto_ver, bad);
- ceph_decode_8_safe(&p, end, opcode, bad);
- ceph_decode_64_safe(&p, end, cookie, bad);
- ceph_decode_64_safe(&p, end, ver, bad);
- ceph_decode_64_safe(&p, end, notify_id, bad);
+ mutex_lock(&osd->lock);
+ for (p = rb_first(&osd->o_requests); p; p = rb_next(p)) {
+ struct ceph_osd_request *req =
+ rb_entry(p, struct ceph_osd_request, r_node);
+
+ if (req->r_tid > last_tid)
+ break;
- spin_lock(&osdc->event_lock);
- event = __find_event(osdc, cookie);
- if (event) {
- BUG_ON(event->one_shot);
- get_event(event);
- }
- spin_unlock(&osdc->event_lock);
- dout("handle_watch_notify cookie %lld ver %lld event %p\n",
- cookie, ver, event);
- if (event) {
- event_work = kmalloc(sizeof(*event_work), GFP_NOIO);
- if (!event_work) {
- pr_err("couldn't allocate event_work\n");
- ceph_osdc_put_event(event);
- return;
+ if (!(req->r_flags & CEPH_OSD_FLAG_WRITE))
+ continue;
+
+ ceph_osdc_get_request(req);
+ mutex_unlock(&osd->lock);
+ up_read(&osdc->lock);
+ dout("%s waiting on req %p tid %llu last_tid %llu\n",
+ __func__, req, req->r_tid, last_tid);
+ wait_for_completion(&req->r_safe_completion);
+ ceph_osdc_put_request(req);
+ goto again;
}
- INIT_WORK(&event_work->work, do_event_work);
- event_work->event = event;
- event_work->ver = ver;
- event_work->notify_id = notify_id;
- event_work->opcode = opcode;
- queue_work(osdc->notify_wq, &event_work->work);
+ mutex_unlock(&osd->lock);
}
- return;
+ up_read(&osdc->lock);
+ dout("%s done last_tid %llu\n", __func__, last_tid);
+}
+EXPORT_SYMBOL(ceph_osdc_sync);
-bad:
- pr_err("osdc handle_watch_notify corrupt msg\n");
+static struct ceph_osd_request *
+alloc_linger_request(struct ceph_osd_linger_request *lreq)
+{
+ struct ceph_osd_request *req;
+
+ req = ceph_osdc_alloc_request(lreq->osdc, NULL, 1, false, GFP_NOIO);
+ if (!req)
+ return NULL;
+
+ ceph_oid_copy(&req->r_base_oid, &lreq->t.base_oid);
+ ceph_oloc_copy(&req->r_base_oloc, &lreq->t.base_oloc);
+
+ if (ceph_osdc_alloc_messages(req, GFP_NOIO)) {
+ ceph_osdc_put_request(req);
+ return NULL;
+ }
+
+ return req;
}
/*
- * build new request AND message
- *
+ * Returns a handle, caller owns a ref.
*/
-void ceph_osdc_build_request(struct ceph_osd_request *req, u64 off,
- struct ceph_snap_context *snapc, u64 snap_id,
- struct timespec *mtime)
-{
- struct ceph_msg *msg = req->r_request;
- void *p;
- size_t msg_size;
- int flags = req->r_flags;
- u64 data_len;
- unsigned int i;
-
- req->r_snapid = snap_id;
- req->r_snapc = ceph_get_snap_context(snapc);
-
- /* encode request */
- msg->hdr.version = cpu_to_le16(4);
-
- p = msg->front.iov_base;
- ceph_encode_32(&p, 1); /* client_inc is always 1 */
- req->r_request_osdmap_epoch = p;
- p += 4;
- req->r_request_flags = p;
- p += 4;
- if (req->r_flags & CEPH_OSD_FLAG_WRITE)
- ceph_encode_timespec(p, mtime);
- p += sizeof(struct ceph_timespec);
- req->r_request_reassert_version = p;
- p += sizeof(struct ceph_eversion); /* will get filled in */
-
- /* oloc */
- ceph_encode_8(&p, 4);
- ceph_encode_8(&p, 4);
- ceph_encode_32(&p, 8 + 4 + 4);
- req->r_request_pool = p;
- p += 8;
- ceph_encode_32(&p, -1); /* preferred */
- ceph_encode_32(&p, 0); /* key len */
+struct ceph_osd_linger_request *
+ceph_osdc_watch(struct ceph_osd_client *osdc,
+ struct ceph_object_id *oid,
+ struct ceph_object_locator *oloc,
+ rados_watchcb2_t wcb,
+ rados_watcherrcb_t errcb,
+ void *data)
+{
+ struct ceph_osd_linger_request *lreq;
+ int ret;
- ceph_encode_8(&p, 1);
- req->r_request_pgid = p;
- p += 8 + 4;
- ceph_encode_32(&p, -1); /* preferred */
+ lreq = linger_alloc(osdc);
+ if (!lreq)
+ return ERR_PTR(-ENOMEM);
- /* oid */
- ceph_encode_32(&p, req->r_base_oid.name_len);
- memcpy(p, req->r_base_oid.name, req->r_base_oid.name_len);
- dout("oid '%.*s' len %d\n", req->r_base_oid.name_len,
- req->r_base_oid.name, req->r_base_oid.name_len);
- p += req->r_base_oid.name_len;
-
- /* ops--can imply data */
- ceph_encode_16(&p, (u16)req->r_num_ops);
- data_len = 0;
- for (i = 0; i < req->r_num_ops; i++) {
- data_len += osd_req_encode_op(req, p, i);
- p += sizeof(struct ceph_osd_op);
+ lreq->is_watch = true;
+ lreq->wcb = wcb;
+ lreq->errcb = errcb;
+ lreq->data = data;
+ lreq->watch_valid_thru = jiffies;
+
+ ceph_oid_copy(&lreq->t.base_oid, oid);
+ ceph_oloc_copy(&lreq->t.base_oloc, oloc);
+ lreq->t.flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
+ lreq->mtime = CURRENT_TIME;
+
+ lreq->reg_req = alloc_linger_request(lreq);
+ if (!lreq->reg_req) {
+ ret = -ENOMEM;
+ goto err_put_lreq;
}
- /* snaps */
- ceph_encode_64(&p, req->r_snapid);
- ceph_encode_64(&p, req->r_snapc ? req->r_snapc->seq : 0);
- ceph_encode_32(&p, req->r_snapc ? req->r_snapc->num_snaps : 0);
- if (req->r_snapc) {
- for (i = 0; i < snapc->num_snaps; i++) {
- ceph_encode_64(&p, req->r_snapc->snaps[i]);
- }
+ lreq->ping_req = alloc_linger_request(lreq);
+ if (!lreq->ping_req) {
+ ret = -ENOMEM;
+ goto err_put_lreq;
}
- req->r_request_attempts = p;
- p += 4;
-
- /* data */
- if (flags & CEPH_OSD_FLAG_WRITE) {
- u16 data_off;
-
- /*
- * The header "data_off" is a hint to the receiver
- * allowing it to align received data into its
- * buffers such that there's no need to re-copy
- * it before writing it to disk (direct I/O).
- */
- data_off = (u16) (off & 0xffff);
- req->r_request->hdr.data_off = cpu_to_le16(data_off);
+ down_write(&osdc->lock);
+ linger_register(lreq); /* before osd_req_op_* */
+ osd_req_op_watch_init(lreq->reg_req, 0, lreq->linger_id,
+ CEPH_OSD_WATCH_OP_WATCH);
+ osd_req_op_watch_init(lreq->ping_req, 0, lreq->linger_id,
+ CEPH_OSD_WATCH_OP_PING);
+ linger_submit(lreq);
+ up_write(&osdc->lock);
+
+ ret = linger_reg_commit_wait(lreq);
+ if (ret) {
+ linger_cancel(lreq);
+ goto err_put_lreq;
}
- req->r_request->hdr.data_len = cpu_to_le32(data_len);
- BUG_ON(p > msg->front.iov_base + msg->front.iov_len);
- msg_size = p - msg->front.iov_base;
- msg->front.iov_len = msg_size;
- msg->hdr.front_len = cpu_to_le32(msg_size);
+ return lreq;
- dout("build_request msg_size was %d\n", (int)msg_size);
+err_put_lreq:
+ linger_put(lreq);
+ return ERR_PTR(ret);
}
-EXPORT_SYMBOL(ceph_osdc_build_request);
+EXPORT_SYMBOL(ceph_osdc_watch);
/*
- * Register request, send initial attempt.
+ * Releases a ref.
+ *
+ * Times out after mount_timeout to preserve rbd unmap behaviour
+ * introduced in 2894e1d76974 ("rbd: timeout watch teardown on unmap
+ * with mount_timeout").
*/
-int ceph_osdc_start_request(struct ceph_osd_client *osdc,
- struct ceph_osd_request *req,
- bool nofail)
+int ceph_osdc_unwatch(struct ceph_osd_client *osdc,
+ struct ceph_osd_linger_request *lreq)
{
- int rc;
+ struct ceph_options *opts = osdc->client->options;
+ struct ceph_osd_request *req;
+ int ret;
- down_read(&osdc->map_sem);
- mutex_lock(&osdc->request_mutex);
+ req = ceph_osdc_alloc_request(osdc, NULL, 1, false, GFP_NOIO);
+ if (!req)
+ return -ENOMEM;
- rc = __ceph_osdc_start_request(osdc, req, nofail);
+ ceph_oid_copy(&req->r_base_oid, &lreq->t.base_oid);
+ ceph_oloc_copy(&req->r_base_oloc, &lreq->t.base_oloc);
+ req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
+ req->r_mtime = CURRENT_TIME;
+ osd_req_op_watch_init(req, 0, lreq->linger_id,
+ CEPH_OSD_WATCH_OP_UNWATCH);
- mutex_unlock(&osdc->request_mutex);
- up_read(&osdc->map_sem);
+ ret = ceph_osdc_alloc_messages(req, GFP_NOIO);
+ if (ret)
+ goto out_put_req;
- return rc;
+ ceph_osdc_start_request(osdc, req, false);
+ linger_cancel(lreq);
+ linger_put(lreq);
+ ret = wait_request_timeout(req, opts->mount_timeout);
+
+out_put_req:
+ ceph_osdc_put_request(req);
+ return ret;
}
-EXPORT_SYMBOL(ceph_osdc_start_request);
+EXPORT_SYMBOL(ceph_osdc_unwatch);
-/*
- * Unregister a registered request. The request is not completed (i.e.
- * no callbacks or wakeups) - higher layers are supposed to know what
- * they are canceling.
- */
-void ceph_osdc_cancel_request(struct ceph_osd_request *req)
+static int osd_req_op_notify_ack_init(struct ceph_osd_request *req, int which,
+ u64 notify_id, u64 cookie, void *payload,
+ size_t payload_len)
{
- struct ceph_osd_client *osdc = req->r_osdc;
+ struct ceph_osd_req_op *op;
+ struct ceph_pagelist *pl;
+ int ret;
+
+ op = _osd_req_op_init(req, which, CEPH_OSD_OP_NOTIFY_ACK, 0);
+
+ pl = kmalloc(sizeof(*pl), GFP_NOIO);
+ if (!pl)
+ return -ENOMEM;
- mutex_lock(&osdc->request_mutex);
- if (req->r_linger)
- __unregister_linger_request(osdc, req);
- __unregister_request(osdc, req);
- mutex_unlock(&osdc->request_mutex);
+ ceph_pagelist_init(pl);
+ ret = ceph_pagelist_encode_64(pl, notify_id);
+ ret |= ceph_pagelist_encode_64(pl, cookie);
+ if (payload) {
+ ret |= ceph_pagelist_encode_32(pl, payload_len);
+ ret |= ceph_pagelist_append(pl, payload, payload_len);
+ } else {
+ ret |= ceph_pagelist_encode_32(pl, 0);
+ }
+ if (ret) {
+ ceph_pagelist_release(pl);
+ return -ENOMEM;
+ }
- dout("%s %p tid %llu canceled\n", __func__, req, req->r_tid);
+ ceph_osd_data_pagelist_init(&op->notify_ack.request_data, pl);
+ op->indata_len = pl->length;
+ return 0;
}
-EXPORT_SYMBOL(ceph_osdc_cancel_request);
-/*
- * wait for a request to complete
- */
-int ceph_osdc_wait_request(struct ceph_osd_client *osdc,
- struct ceph_osd_request *req)
+int ceph_osdc_notify_ack(struct ceph_osd_client *osdc,
+ struct ceph_object_id *oid,
+ struct ceph_object_locator *oloc,
+ u64 notify_id,
+ u64 cookie,
+ void *payload,
+ size_t payload_len)
+{
+ struct ceph_osd_request *req;
+ int ret;
+
+ req = ceph_osdc_alloc_request(osdc, NULL, 1, false, GFP_NOIO);
+ if (!req)
+ return -ENOMEM;
+
+ ceph_oid_copy(&req->r_base_oid, oid);
+ ceph_oloc_copy(&req->r_base_oloc, oloc);
+ req->r_flags = CEPH_OSD_FLAG_READ;
+
+ ret = ceph_osdc_alloc_messages(req, GFP_NOIO);
+ if (ret)
+ goto out_put_req;
+
+ ret = osd_req_op_notify_ack_init(req, 0, notify_id, cookie, payload,
+ payload_len);
+ if (ret)
+ goto out_put_req;
+
+ ceph_osdc_start_request(osdc, req, false);
+ ret = ceph_osdc_wait_request(osdc, req);
+
+out_put_req:
+ ceph_osdc_put_request(req);
+ return ret;
+}
+EXPORT_SYMBOL(ceph_osdc_notify_ack);
+
+static int osd_req_op_notify_init(struct ceph_osd_request *req, int which,
+ u64 cookie, u32 prot_ver, u32 timeout,
+ void *payload, size_t payload_len)
{
- int rc;
+ struct ceph_osd_req_op *op;
+ struct ceph_pagelist *pl;
+ int ret;
- dout("%s %p tid %llu\n", __func__, req, req->r_tid);
+ op = _osd_req_op_init(req, which, CEPH_OSD_OP_NOTIFY, 0);
+ op->notify.cookie = cookie;
- rc = wait_for_completion_interruptible(&req->r_completion);
- if (rc < 0) {
- dout("%s %p tid %llu interrupted\n", __func__, req, req->r_tid);
- ceph_osdc_cancel_request(req);
- complete_request(req);
- return rc;
+ pl = kmalloc(sizeof(*pl), GFP_NOIO);
+ if (!pl)
+ return -ENOMEM;
+
+ ceph_pagelist_init(pl);
+ ret = ceph_pagelist_encode_32(pl, 1); /* prot_ver */
+ ret |= ceph_pagelist_encode_32(pl, timeout);
+ ret |= ceph_pagelist_encode_32(pl, payload_len);
+ ret |= ceph_pagelist_append(pl, payload, payload_len);
+ if (ret) {
+ ceph_pagelist_release(pl);
+ return -ENOMEM;
}
- dout("%s %p tid %llu result %d\n", __func__, req, req->r_tid,
- req->r_result);
- return req->r_result;
+ ceph_osd_data_pagelist_init(&op->notify.request_data, pl);
+ op->indata_len = pl->length;
+ return 0;
}
-EXPORT_SYMBOL(ceph_osdc_wait_request);
/*
- * sync - wait for all in-flight requests to flush. avoid starvation.
+ * @timeout: in seconds
+ *
+ * @preply_{pages,len} are initialized both on success and error.
+ * The caller is responsible for:
+ *
+ * ceph_release_page_vector(reply_pages, calc_pages_for(0, reply_len))
*/
-void ceph_osdc_sync(struct ceph_osd_client *osdc)
+int ceph_osdc_notify(struct ceph_osd_client *osdc,
+ struct ceph_object_id *oid,
+ struct ceph_object_locator *oloc,
+ void *payload,
+ size_t payload_len,
+ u32 timeout,
+ struct page ***preply_pages,
+ size_t *preply_len)
{
- struct ceph_osd_request *req;
- u64 last_tid, next_tid = 0;
+ struct ceph_osd_linger_request *lreq;
+ struct page **pages;
+ int ret;
- mutex_lock(&osdc->request_mutex);
- last_tid = osdc->last_tid;
- while (1) {
- req = __lookup_request_ge(osdc, next_tid);
- if (!req)
- break;
- if (req->r_tid > last_tid)
- break;
+ WARN_ON(!timeout);
+ if (preply_pages) {
+ *preply_pages = NULL;
+ *preply_len = 0;
+ }
- next_tid = req->r_tid + 1;
- if ((req->r_flags & CEPH_OSD_FLAG_WRITE) == 0)
- continue;
+ lreq = linger_alloc(osdc);
+ if (!lreq)
+ return -ENOMEM;
- ceph_osdc_get_request(req);
- mutex_unlock(&osdc->request_mutex);
- dout("sync waiting on tid %llu (last is %llu)\n",
- req->r_tid, last_tid);
- wait_for_completion(&req->r_safe_completion);
- mutex_lock(&osdc->request_mutex);
- ceph_osdc_put_request(req);
+ lreq->preply_pages = preply_pages;
+ lreq->preply_len = preply_len;
+
+ ceph_oid_copy(&lreq->t.base_oid, oid);
+ ceph_oloc_copy(&lreq->t.base_oloc, oloc);
+ lreq->t.flags = CEPH_OSD_FLAG_READ;
+
+ lreq->reg_req = alloc_linger_request(lreq);
+ if (!lreq->reg_req) {
+ ret = -ENOMEM;
+ goto out_put_lreq;
}
- mutex_unlock(&osdc->request_mutex);
- dout("sync done (thru tid %llu)\n", last_tid);
+
+ /* for notify_id */
+ pages = ceph_alloc_page_vector(1, GFP_NOIO);
+ if (IS_ERR(pages)) {
+ ret = PTR_ERR(pages);
+ goto out_put_lreq;
+ }
+
+ down_write(&osdc->lock);
+ linger_register(lreq); /* before osd_req_op_* */
+ ret = osd_req_op_notify_init(lreq->reg_req, 0, lreq->linger_id, 1,
+ timeout, payload, payload_len);
+ if (ret) {
+ linger_unregister(lreq);
+ up_write(&osdc->lock);
+ ceph_release_page_vector(pages, 1);
+ goto out_put_lreq;
+ }
+ ceph_osd_data_pages_init(osd_req_op_data(lreq->reg_req, 0, notify,
+ response_data),
+ pages, PAGE_SIZE, 0, false, true);
+ linger_submit(lreq);
+ up_write(&osdc->lock);
+
+ ret = linger_reg_commit_wait(lreq);
+ if (!ret)
+ ret = linger_notify_finish_wait(lreq);
+ else
+ dout("lreq %p failed to initiate notify %d\n", lreq, ret);
+
+ linger_cancel(lreq);
+out_put_lreq:
+ linger_put(lreq);
+ return ret;
+}
+EXPORT_SYMBOL(ceph_osdc_notify);
+
+/*
+ * Return the number of milliseconds since the watch was last
+ * confirmed, or an error. If there is an error, the watch is no
+ * longer valid, and should be destroyed with ceph_osdc_unwatch().
+ */
+int ceph_osdc_watch_check(struct ceph_osd_client *osdc,
+ struct ceph_osd_linger_request *lreq)
+{
+ unsigned long stamp, age;
+ int ret;
+
+ down_read(&osdc->lock);
+ mutex_lock(&lreq->lock);
+ stamp = lreq->watch_valid_thru;
+ if (!list_empty(&lreq->pending_lworks)) {
+ struct linger_work *lwork =
+ list_first_entry(&lreq->pending_lworks,
+ struct linger_work,
+ pending_item);
+
+ if (time_before(lwork->queued_stamp, stamp))
+ stamp = lwork->queued_stamp;
+ }
+ age = jiffies - stamp;
+ dout("%s lreq %p linger_id %llu age %lu last_error %d\n", __func__,
+ lreq, lreq->linger_id, age, lreq->last_error);
+ /* we are truncating to msecs, so return a safe upper bound */
+ ret = lreq->last_error ?: 1 + jiffies_to_msecs(age);
+
+ mutex_unlock(&lreq->lock);
+ up_read(&osdc->lock);
+ return ret;
}
-EXPORT_SYMBOL(ceph_osdc_sync);
/*
* Call all pending notify callbacks - for use after a watch is
@@ -2646,6 +3868,13 @@ void ceph_osdc_flush_notifies(struct ceph_osd_client *osdc)
}
EXPORT_SYMBOL(ceph_osdc_flush_notifies);
+void ceph_osdc_maybe_request_map(struct ceph_osd_client *osdc)
+{
+ down_read(&osdc->lock);
+ maybe_request_map(osdc);
+ up_read(&osdc->lock);
+}
+EXPORT_SYMBOL(ceph_osdc_maybe_request_map);
/*
* init, shutdown
@@ -2656,43 +3885,35 @@ int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client)
dout("init\n");
osdc->client = client;
- osdc->osdmap = NULL;
- init_rwsem(&osdc->map_sem);
- init_completion(&osdc->map_waiters);
- osdc->last_requested_map = 0;
- mutex_init(&osdc->request_mutex);
- osdc->last_tid = 0;
+ init_rwsem(&osdc->lock);
osdc->osds = RB_ROOT;
INIT_LIST_HEAD(&osdc->osd_lru);
- osdc->requests = RB_ROOT;
- INIT_LIST_HEAD(&osdc->req_lru);
- INIT_LIST_HEAD(&osdc->req_unsent);
- INIT_LIST_HEAD(&osdc->req_notarget);
- INIT_LIST_HEAD(&osdc->req_linger);
- osdc->num_requests = 0;
+ spin_lock_init(&osdc->osd_lru_lock);
+ osd_init(&osdc->homeless_osd);
+ osdc->homeless_osd.o_osdc = osdc;
+ osdc->homeless_osd.o_osd = CEPH_HOMELESS_OSD;
+ osdc->linger_requests = RB_ROOT;
+ osdc->map_checks = RB_ROOT;
+ osdc->linger_map_checks = RB_ROOT;
INIT_DELAYED_WORK(&osdc->timeout_work, handle_timeout);
INIT_DELAYED_WORK(&osdc->osds_timeout_work, handle_osds_timeout);
- spin_lock_init(&osdc->event_lock);
- osdc->event_tree = RB_ROOT;
- osdc->event_count = 0;
-
- schedule_delayed_work(&osdc->osds_timeout_work,
- round_jiffies_relative(osdc->client->options->osd_idle_ttl));
err = -ENOMEM;
+ osdc->osdmap = ceph_osdmap_alloc();
+ if (!osdc->osdmap)
+ goto out;
+
osdc->req_mempool = mempool_create_slab_pool(10,
ceph_osd_request_cache);
if (!osdc->req_mempool)
- goto out;
+ goto out_map;
err = ceph_msgpool_init(&osdc->msgpool_op, CEPH_MSG_OSD_OP,
- OSD_OP_FRONT_LEN, 10, true,
- "osd_op");
+ PAGE_SIZE, 10, true, "osd_op");
if (err < 0)
goto out_mempool;
err = ceph_msgpool_init(&osdc->msgpool_op_reply, CEPH_MSG_OSD_OPREPLY,
- OSD_OPREPLY_FRONT_LEN, 10, true,
- "osd_op_reply");
+ PAGE_SIZE, 10, true, "osd_op_reply");
if (err < 0)
goto out_msgpool;
@@ -2701,6 +3922,11 @@ int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client)
if (!osdc->notify_wq)
goto out_msgpool_reply;
+ schedule_delayed_work(&osdc->timeout_work,
+ osdc->client->options->osd_keepalive_timeout);
+ schedule_delayed_work(&osdc->osds_timeout_work,
+ round_jiffies_relative(osdc->client->options->osd_idle_ttl));
+
return 0;
out_msgpool_reply:
@@ -2709,6 +3935,8 @@ out_msgpool:
ceph_msgpool_destroy(&osdc->msgpool_op);
out_mempool:
mempool_destroy(osdc->req_mempool);
+out_map:
+ ceph_osdmap_destroy(osdc->osdmap);
out:
return err;
}
@@ -2719,11 +3947,25 @@ void ceph_osdc_stop(struct ceph_osd_client *osdc)
destroy_workqueue(osdc->notify_wq);
cancel_delayed_work_sync(&osdc->timeout_work);
cancel_delayed_work_sync(&osdc->osds_timeout_work);
- if (osdc->osdmap) {
- ceph_osdmap_destroy(osdc->osdmap);
- osdc->osdmap = NULL;
+
+ down_write(&osdc->lock);
+ while (!RB_EMPTY_ROOT(&osdc->osds)) {
+ struct ceph_osd *osd = rb_entry(rb_first(&osdc->osds),
+ struct ceph_osd, o_node);
+ close_osd(osd);
}
- remove_all_osds(osdc);
+ up_write(&osdc->lock);
+ WARN_ON(atomic_read(&osdc->homeless_osd.o_ref) != 1);
+ osd_cleanup(&osdc->homeless_osd);
+
+ WARN_ON(!list_empty(&osdc->osd_lru));
+ WARN_ON(!RB_EMPTY_ROOT(&osdc->linger_requests));
+ WARN_ON(!RB_EMPTY_ROOT(&osdc->map_checks));
+ WARN_ON(!RB_EMPTY_ROOT(&osdc->linger_map_checks));
+ WARN_ON(atomic_read(&osdc->num_requests));
+ WARN_ON(atomic_read(&osdc->num_homeless));
+
+ ceph_osdmap_destroy(osdc->osdmap);
mempool_destroy(osdc->req_mempool);
ceph_msgpool_destroy(&osdc->msgpool_op);
ceph_msgpool_destroy(&osdc->msgpool_op_reply);
@@ -2752,15 +3994,12 @@ int ceph_osdc_readpages(struct ceph_osd_client *osdc,
return PTR_ERR(req);
/* it may be a short read due to an object boundary */
-
osd_req_op_extent_osd_data_pages(req, 0,
pages, *plen, page_align, false, false);
dout("readpages final extent is %llu~%llu (%llu bytes align %d)\n",
off, *plen, *plen, page_align);
- ceph_osdc_build_request(req, off, NULL, vino.snap, NULL);
-
rc = ceph_osdc_start_request(osdc, req, false);
if (!rc)
rc = ceph_osdc_wait_request(osdc, req);
@@ -2786,7 +4025,6 @@ int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino,
int rc = 0;
int page_align = off & ~PAGE_MASK;
- BUG_ON(vino.snap != CEPH_NOSNAP); /* snapshots aren't writeable */
req = ceph_osdc_new_request(osdc, layout, vino, off, &len, 0, 1,
CEPH_OSD_OP_WRITE,
CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE,
@@ -2800,8 +4038,7 @@ int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino,
false, false);
dout("writepages %llu~%llu (%llu bytes)\n", off, len, len);
- ceph_osdc_build_request(req, off, snapc, CEPH_NOSNAP, mtime);
-
+ req->r_mtime = *mtime;
rc = ceph_osdc_start_request(osdc, req, true);
if (!rc)
rc = ceph_osdc_wait_request(osdc, req);
@@ -2841,19 +4078,15 @@ EXPORT_SYMBOL(ceph_osdc_cleanup);
static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
{
struct ceph_osd *osd = con->private;
- struct ceph_osd_client *osdc;
+ struct ceph_osd_client *osdc = osd->o_osdc;
int type = le16_to_cpu(msg->hdr.type);
- if (!osd)
- goto out;
- osdc = osd->o_osdc;
-
switch (type) {
case CEPH_MSG_OSD_MAP:
ceph_osdc_handle_map(osdc, msg);
break;
case CEPH_MSG_OSD_OPREPLY:
- handle_reply(osdc, msg);
+ handle_reply(osd, msg);
break;
case CEPH_MSG_WATCH_NOTIFY:
handle_watch_notify(osdc, msg);
@@ -2863,7 +4096,7 @@ static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
pr_err("received unknown message type %d %s\n", type,
ceph_msg_type_name(type));
}
-out:
+
ceph_msg_put(msg);
}
@@ -2878,21 +4111,27 @@ static struct ceph_msg *get_reply(struct ceph_connection *con,
{
struct ceph_osd *osd = con->private;
struct ceph_osd_client *osdc = osd->o_osdc;
- struct ceph_msg *m;
+ struct ceph_msg *m = NULL;
struct ceph_osd_request *req;
int front_len = le32_to_cpu(hdr->front_len);
int data_len = le32_to_cpu(hdr->data_len);
- u64 tid;
+ u64 tid = le64_to_cpu(hdr->tid);
- tid = le64_to_cpu(hdr->tid);
- mutex_lock(&osdc->request_mutex);
- req = __lookup_request(osdc, tid);
+ down_read(&osdc->lock);
+ if (!osd_registered(osd)) {
+ dout("%s osd%d unknown, skipping\n", __func__, osd->o_osd);
+ *skip = 1;
+ goto out_unlock_osdc;
+ }
+ WARN_ON(osd->o_osd != le64_to_cpu(hdr->src.num));
+
+ mutex_lock(&osd->lock);
+ req = lookup_request(&osd->o_requests, tid);
if (!req) {
dout("%s osd%d tid %llu unknown, skipping\n", __func__,
osd->o_osd, tid);
- m = NULL;
*skip = 1;
- goto out;
+ goto out_unlock_session;
}
ceph_msg_revoke_incoming(req->r_reply);
@@ -2904,7 +4143,7 @@ static struct ceph_msg *get_reply(struct ceph_connection *con,
m = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, front_len, GFP_NOFS,
false);
if (!m)
- goto out;
+ goto out_unlock_session;
ceph_msg_put(req->r_reply);
req->r_reply = m;
}
@@ -2915,14 +4154,49 @@ static struct ceph_msg *get_reply(struct ceph_connection *con,
req->r_reply->data_length);
m = NULL;
*skip = 1;
- goto out;
+ goto out_unlock_session;
}
m = ceph_msg_get(req->r_reply);
dout("get_reply tid %lld %p\n", tid, m);
-out:
- mutex_unlock(&osdc->request_mutex);
+out_unlock_session:
+ mutex_unlock(&osd->lock);
+out_unlock_osdc:
+ up_read(&osdc->lock);
+ return m;
+}
+
+/*
+ * TODO: switch to a msg-owned pagelist
+ */
+static struct ceph_msg *alloc_msg_with_page_vector(struct ceph_msg_header *hdr)
+{
+ struct ceph_msg *m;
+ int type = le16_to_cpu(hdr->type);
+ u32 front_len = le32_to_cpu(hdr->front_len);
+ u32 data_len = le32_to_cpu(hdr->data_len);
+
+ m = ceph_msg_new(type, front_len, GFP_NOIO, false);
+ if (!m)
+ return NULL;
+
+ if (data_len) {
+ struct page **pages;
+ struct ceph_osd_data osd_data;
+
+ pages = ceph_alloc_page_vector(calc_pages_for(0, data_len),
+ GFP_NOIO);
+ if (!pages) {
+ ceph_msg_put(m);
+ return NULL;
+ }
+
+ ceph_osd_data_pages_init(&osd_data, pages, data_len, 0, false,
+ false);
+ ceph_osdc_msg_data_add(m, &osd_data);
+ }
+
return m;
}
@@ -2932,18 +4206,17 @@ static struct ceph_msg *alloc_msg(struct ceph_connection *con,
{
struct ceph_osd *osd = con->private;
int type = le16_to_cpu(hdr->type);
- int front = le32_to_cpu(hdr->front_len);
*skip = 0;
switch (type) {
case CEPH_MSG_OSD_MAP:
case CEPH_MSG_WATCH_NOTIFY:
- return ceph_msg_new(type, front, GFP_NOFS, false);
+ return alloc_msg_with_page_vector(hdr);
case CEPH_MSG_OSD_OPREPLY:
return get_reply(con, hdr, skip);
default:
- pr_info("alloc_msg unexpected msg type %d from osd%d\n", type,
- osd->o_osd);
+ pr_warn("%s osd%d unknown msg type %d, skipping\n", __func__,
+ osd->o_osd, type);
*skip = 1;
return NULL;
}
@@ -3047,5 +4320,5 @@ static const struct ceph_connection_operations osd_con_ops = {
.alloc_msg = alloc_msg,
.sign_message = osd_sign_message,
.check_message_signature = osd_check_message_signature,
- .fault = osd_reset,
+ .fault = osd_fault,
};
diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c
index 243574c8cf33..03062bb763b3 100644
--- a/net/ceph/osdmap.c
+++ b/net/ceph/osdmap.c
@@ -380,23 +380,24 @@ bad:
return ERR_PTR(err);
}
-/*
- * rbtree of pg_mapping for handling pg_temp (explicit mapping of pgid
- * to a set of osds) and primary_temp (explicit primary setting)
- */
-static int pgid_cmp(struct ceph_pg l, struct ceph_pg r)
+int ceph_pg_compare(const struct ceph_pg *lhs, const struct ceph_pg *rhs)
{
- if (l.pool < r.pool)
+ if (lhs->pool < rhs->pool)
return -1;
- if (l.pool > r.pool)
+ if (lhs->pool > rhs->pool)
return 1;
- if (l.seed < r.seed)
+ if (lhs->seed < rhs->seed)
return -1;
- if (l.seed > r.seed)
+ if (lhs->seed > rhs->seed)
return 1;
+
return 0;
}
+/*
+ * rbtree of pg_mapping for handling pg_temp (explicit mapping of pgid
+ * to a set of osds) and primary_temp (explicit primary setting)
+ */
static int __insert_pg_mapping(struct ceph_pg_mapping *new,
struct rb_root *root)
{
@@ -409,7 +410,7 @@ static int __insert_pg_mapping(struct ceph_pg_mapping *new,
while (*p) {
parent = *p;
pg = rb_entry(parent, struct ceph_pg_mapping, node);
- c = pgid_cmp(new->pgid, pg->pgid);
+ c = ceph_pg_compare(&new->pgid, &pg->pgid);
if (c < 0)
p = &(*p)->rb_left;
else if (c > 0)
@@ -432,7 +433,7 @@ static struct ceph_pg_mapping *__lookup_pg_mapping(struct rb_root *root,
while (n) {
pg = rb_entry(n, struct ceph_pg_mapping, node);
- c = pgid_cmp(pgid, pg->pgid);
+ c = ceph_pg_compare(&pgid, &pg->pgid);
if (c < 0) {
n = n->rb_left;
} else if (c > 0) {
@@ -596,7 +597,9 @@ static int decode_pool(void **p, void *end, struct ceph_pg_pool_info *pi)
*p += 4; /* skip crash_replay_interval */
if (ev >= 7)
- *p += 1; /* skip min_size */
+ pi->min_size = ceph_decode_8(p);
+ else
+ pi->min_size = pi->size - pi->size / 2;
if (ev >= 8)
*p += 8 + 8; /* skip quota_max_* */
@@ -616,6 +619,50 @@ static int decode_pool(void **p, void *end, struct ceph_pg_pool_info *pi)
pi->write_tier = -1;
}
+ if (ev >= 10) {
+ /* skip properties */
+ num = ceph_decode_32(p);
+ while (num--) {
+ len = ceph_decode_32(p);
+ *p += len; /* key */
+ len = ceph_decode_32(p);
+ *p += len; /* val */
+ }
+ }
+
+ if (ev >= 11) {
+ /* skip hit_set_params */
+ *p += 1 + 1; /* versions */
+ len = ceph_decode_32(p);
+ *p += len;
+
+ *p += 4; /* skip hit_set_period */
+ *p += 4; /* skip hit_set_count */
+ }
+
+ if (ev >= 12)
+ *p += 4; /* skip stripe_width */
+
+ if (ev >= 13) {
+ *p += 8; /* skip target_max_bytes */
+ *p += 8; /* skip target_max_objects */
+ *p += 4; /* skip cache_target_dirty_ratio_micro */
+ *p += 4; /* skip cache_target_full_ratio_micro */
+ *p += 4; /* skip cache_min_flush_age */
+ *p += 4; /* skip cache_min_evict_age */
+ }
+
+ if (ev >= 14) {
+ /* skip erasure_code_profile */
+ len = ceph_decode_32(p);
+ *p += len;
+ }
+
+ if (ev >= 15)
+ pi->last_force_request_resend = ceph_decode_32(p);
+ else
+ pi->last_force_request_resend = 0;
+
/* ignore the rest */
*p = pool_end;
@@ -660,6 +707,23 @@ bad:
/*
* osd map
*/
+struct ceph_osdmap *ceph_osdmap_alloc(void)
+{
+ struct ceph_osdmap *map;
+
+ map = kzalloc(sizeof(*map), GFP_NOIO);
+ if (!map)
+ return NULL;
+
+ map->pg_pools = RB_ROOT;
+ map->pool_max = -1;
+ map->pg_temp = RB_ROOT;
+ map->primary_temp = RB_ROOT;
+ mutex_init(&map->crush_scratch_mutex);
+
+ return map;
+}
+
void ceph_osdmap_destroy(struct ceph_osdmap *map)
{
dout("osdmap_destroy %p\n", map);
@@ -1183,14 +1247,10 @@ struct ceph_osdmap *ceph_osdmap_decode(void **p, void *end)
struct ceph_osdmap *map;
int ret;
- map = kzalloc(sizeof(*map), GFP_NOFS);
+ map = ceph_osdmap_alloc();
if (!map)
return ERR_PTR(-ENOMEM);
- map->pg_temp = RB_ROOT;
- map->primary_temp = RB_ROOT;
- mutex_init(&map->crush_scratch_mutex);
-
ret = osdmap_decode(p, end, map);
if (ret) {
ceph_osdmap_destroy(map);
@@ -1204,8 +1264,7 @@ struct ceph_osdmap *ceph_osdmap_decode(void **p, void *end)
* decode and apply an incremental map update.
*/
struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
- struct ceph_osdmap *map,
- struct ceph_messenger *msgr)
+ struct ceph_osdmap *map)
{
struct crush_map *newcrush = NULL;
struct ceph_fsid fsid;
@@ -1381,8 +1440,252 @@ bad:
return ERR_PTR(err);
}
+void ceph_oid_copy(struct ceph_object_id *dest,
+ const struct ceph_object_id *src)
+{
+ WARN_ON(!ceph_oid_empty(dest));
+
+ if (src->name != src->inline_name) {
+ /* very rare, see ceph_object_id definition */
+ dest->name = kmalloc(src->name_len + 1,
+ GFP_NOIO | __GFP_NOFAIL);
+ }
+
+ memcpy(dest->name, src->name, src->name_len + 1);
+ dest->name_len = src->name_len;
+}
+EXPORT_SYMBOL(ceph_oid_copy);
+
+static __printf(2, 0)
+int oid_printf_vargs(struct ceph_object_id *oid, const char *fmt, va_list ap)
+{
+ int len;
+
+ WARN_ON(!ceph_oid_empty(oid));
+
+ len = vsnprintf(oid->inline_name, sizeof(oid->inline_name), fmt, ap);
+ if (len >= sizeof(oid->inline_name))
+ return len;
+
+ oid->name_len = len;
+ return 0;
+}
+
+/*
+ * If oid doesn't fit into inline buffer, BUG.
+ */
+void ceph_oid_printf(struct ceph_object_id *oid, const char *fmt, ...)
+{
+ va_list ap;
+
+ va_start(ap, fmt);
+ BUG_ON(oid_printf_vargs(oid, fmt, ap));
+ va_end(ap);
+}
+EXPORT_SYMBOL(ceph_oid_printf);
+
+static __printf(3, 0)
+int oid_aprintf_vargs(struct ceph_object_id *oid, gfp_t gfp,
+ const char *fmt, va_list ap)
+{
+ va_list aq;
+ int len;
+
+ va_copy(aq, ap);
+ len = oid_printf_vargs(oid, fmt, aq);
+ va_end(aq);
+
+ if (len) {
+ char *external_name;
+
+ external_name = kmalloc(len + 1, gfp);
+ if (!external_name)
+ return -ENOMEM;
+
+ oid->name = external_name;
+ WARN_ON(vsnprintf(oid->name, len + 1, fmt, ap) != len);
+ oid->name_len = len;
+ }
+
+ return 0;
+}
+
+/*
+ * If oid doesn't fit into inline buffer, allocate.
+ */
+int ceph_oid_aprintf(struct ceph_object_id *oid, gfp_t gfp,
+ const char *fmt, ...)
+{
+ va_list ap;
+ int ret;
+
+ va_start(ap, fmt);
+ ret = oid_aprintf_vargs(oid, gfp, fmt, ap);
+ va_end(ap);
+
+ return ret;
+}
+EXPORT_SYMBOL(ceph_oid_aprintf);
+
+void ceph_oid_destroy(struct ceph_object_id *oid)
+{
+ if (oid->name != oid->inline_name)
+ kfree(oid->name);
+}
+EXPORT_SYMBOL(ceph_oid_destroy);
+
+/*
+ * osds only
+ */
+static bool __osds_equal(const struct ceph_osds *lhs,
+ const struct ceph_osds *rhs)
+{
+ if (lhs->size == rhs->size &&
+ !memcmp(lhs->osds, rhs->osds, rhs->size * sizeof(rhs->osds[0])))
+ return true;
+
+ return false;
+}
+
+/*
+ * osds + primary
+ */
+static bool osds_equal(const struct ceph_osds *lhs,
+ const struct ceph_osds *rhs)
+{
+ if (__osds_equal(lhs, rhs) &&
+ lhs->primary == rhs->primary)
+ return true;
+
+ return false;
+}
+
+static bool osds_valid(const struct ceph_osds *set)
+{
+ /* non-empty set */
+ if (set->size > 0 && set->primary >= 0)
+ return true;
+
+ /* empty can_shift_osds set */
+ if (!set->size && set->primary == -1)
+ return true;
+
+ /* empty !can_shift_osds set - all NONE */
+ if (set->size > 0 && set->primary == -1) {
+ int i;
+
+ for (i = 0; i < set->size; i++) {
+ if (set->osds[i] != CRUSH_ITEM_NONE)
+ break;
+ }
+ if (i == set->size)
+ return true;
+ }
+
+ return false;
+}
+
+void ceph_osds_copy(struct ceph_osds *dest, const struct ceph_osds *src)
+{
+ memcpy(dest->osds, src->osds, src->size * sizeof(src->osds[0]));
+ dest->size = src->size;
+ dest->primary = src->primary;
+}
+
+static bool is_split(const struct ceph_pg *pgid,
+ u32 old_pg_num,
+ u32 new_pg_num)
+{
+ int old_bits = calc_bits_of(old_pg_num);
+ int old_mask = (1 << old_bits) - 1;
+ int n;
+
+ WARN_ON(pgid->seed >= old_pg_num);
+ if (new_pg_num <= old_pg_num)
+ return false;
+
+ for (n = 1; ; n++) {
+ int next_bit = n << (old_bits - 1);
+ u32 s = next_bit | pgid->seed;
+
+ if (s < old_pg_num || s == pgid->seed)
+ continue;
+ if (s >= new_pg_num)
+ break;
+
+ s = ceph_stable_mod(s, old_pg_num, old_mask);
+ if (s == pgid->seed)
+ return true;
+ }
+
+ return false;
+}
+
+bool ceph_is_new_interval(const struct ceph_osds *old_acting,
+ const struct ceph_osds *new_acting,
+ const struct ceph_osds *old_up,
+ const struct ceph_osds *new_up,
+ int old_size,
+ int new_size,
+ int old_min_size,
+ int new_min_size,
+ u32 old_pg_num,
+ u32 new_pg_num,
+ bool old_sort_bitwise,
+ bool new_sort_bitwise,
+ const struct ceph_pg *pgid)
+{
+ return !osds_equal(old_acting, new_acting) ||
+ !osds_equal(old_up, new_up) ||
+ old_size != new_size ||
+ old_min_size != new_min_size ||
+ is_split(pgid, old_pg_num, new_pg_num) ||
+ old_sort_bitwise != new_sort_bitwise;
+}
+
+static int calc_pg_rank(int osd, const struct ceph_osds *acting)
+{
+ int i;
+
+ for (i = 0; i < acting->size; i++) {
+ if (acting->osds[i] == osd)
+ return i;
+ }
+
+ return -1;
+}
+
+static bool primary_changed(const struct ceph_osds *old_acting,
+ const struct ceph_osds *new_acting)
+{
+ if (!old_acting->size && !new_acting->size)
+ return false; /* both still empty */
+ if (!old_acting->size ^ !new_acting->size)
+ return true; /* was empty, now not, or vice versa */
+ if (old_acting->primary != new_acting->primary)
+ return true; /* primary changed */
+
+ if (calc_pg_rank(old_acting->primary, old_acting) !=
+ calc_pg_rank(new_acting->primary, new_acting))
+ return true;
+
+ return false; /* same primary (tho replicas may have changed) */
+}
+
+bool ceph_osds_changed(const struct ceph_osds *old_acting,
+ const struct ceph_osds *new_acting,
+ bool any_change)
+{
+ if (primary_changed(old_acting, new_acting))
+ return true;
+
+ if (any_change && !__osds_equal(old_acting, new_acting))
+ return true;
+
+ return false;
+}
/*
* calculate file layout from given offset, length.
@@ -1455,30 +1758,71 @@ invalid:
EXPORT_SYMBOL(ceph_calc_file_object_mapping);
/*
- * Calculate mapping of a (oloc, oid) pair to a PG. Should only be
- * called with target's (oloc, oid), since tiering isn't taken into
- * account.
+ * Map an object into a PG.
+ *
+ * Should only be called with target_oid and target_oloc (as opposed to
+ * base_oid and base_oloc), since tiering isn't taken into account.
*/
-int ceph_oloc_oid_to_pg(struct ceph_osdmap *osdmap,
- struct ceph_object_locator *oloc,
- struct ceph_object_id *oid,
- struct ceph_pg *pg_out)
+int ceph_object_locator_to_pg(struct ceph_osdmap *osdmap,
+ struct ceph_object_id *oid,
+ struct ceph_object_locator *oloc,
+ struct ceph_pg *raw_pgid)
{
struct ceph_pg_pool_info *pi;
- pi = __lookup_pg_pool(&osdmap->pg_pools, oloc->pool);
+ pi = ceph_pg_pool_by_id(osdmap, oloc->pool);
if (!pi)
- return -EIO;
+ return -ENOENT;
- pg_out->pool = oloc->pool;
- pg_out->seed = ceph_str_hash(pi->object_hash, oid->name,
- oid->name_len);
+ raw_pgid->pool = oloc->pool;
+ raw_pgid->seed = ceph_str_hash(pi->object_hash, oid->name,
+ oid->name_len);
- dout("%s '%.*s' pgid %llu.%x\n", __func__, oid->name_len, oid->name,
- pg_out->pool, pg_out->seed);
+ dout("%s %s -> raw_pgid %llu.%x\n", __func__, oid->name,
+ raw_pgid->pool, raw_pgid->seed);
return 0;
}
-EXPORT_SYMBOL(ceph_oloc_oid_to_pg);
+EXPORT_SYMBOL(ceph_object_locator_to_pg);
+
+/*
+ * Map a raw PG (full precision ps) into an actual PG.
+ */
+static void raw_pg_to_pg(struct ceph_pg_pool_info *pi,
+ const struct ceph_pg *raw_pgid,
+ struct ceph_pg *pgid)
+{
+ pgid->pool = raw_pgid->pool;
+ pgid->seed = ceph_stable_mod(raw_pgid->seed, pi->pg_num,
+ pi->pg_num_mask);
+}
+
+/*
+ * Map a raw PG (full precision ps) into a placement ps (placement
+ * seed). Include pool id in that value so that different pools don't
+ * use the same seeds.
+ */
+static u32 raw_pg_to_pps(struct ceph_pg_pool_info *pi,
+ const struct ceph_pg *raw_pgid)
+{
+ if (pi->flags & CEPH_POOL_FLAG_HASHPSPOOL) {
+ /* hash pool id and seed so that pool PGs do not overlap */
+ return crush_hash32_2(CRUSH_HASH_RJENKINS1,
+ ceph_stable_mod(raw_pgid->seed,
+ pi->pgp_num,
+ pi->pgp_num_mask),
+ raw_pgid->pool);
+ } else {
+ /*
+ * legacy behavior: add ps and pool together. this is
+ * not a great approach because the PGs from each pool
+ * will overlap on top of each other: 0.5 == 1.4 ==
+ * 2.3 == ...
+ */
+ return ceph_stable_mod(raw_pgid->seed, pi->pgp_num,
+ pi->pgp_num_mask) +
+ (unsigned)raw_pgid->pool;
+ }
+}
static int do_crush(struct ceph_osdmap *map, int ruleno, int x,
int *result, int result_max,
@@ -1497,84 +1841,92 @@ static int do_crush(struct ceph_osdmap *map, int ruleno, int x,
}
/*
- * Calculate raw (crush) set for given pgid.
+ * Calculate raw set (CRUSH output) for given PG. The result may
+ * contain nonexistent OSDs. ->primary is undefined for a raw set.
*
- * Return raw set length, or error.
+ * Placement seed (CRUSH input) is returned through @ppps.
*/
-static int pg_to_raw_osds(struct ceph_osdmap *osdmap,
- struct ceph_pg_pool_info *pool,
- struct ceph_pg pgid, u32 pps, int *osds)
+static void pg_to_raw_osds(struct ceph_osdmap *osdmap,
+ struct ceph_pg_pool_info *pi,
+ const struct ceph_pg *raw_pgid,
+ struct ceph_osds *raw,
+ u32 *ppps)
{
+ u32 pps = raw_pg_to_pps(pi, raw_pgid);
int ruleno;
int len;
- /* crush */
- ruleno = crush_find_rule(osdmap->crush, pool->crush_ruleset,
- pool->type, pool->size);
+ ceph_osds_init(raw);
+ if (ppps)
+ *ppps = pps;
+
+ ruleno = crush_find_rule(osdmap->crush, pi->crush_ruleset, pi->type,
+ pi->size);
if (ruleno < 0) {
pr_err("no crush rule: pool %lld ruleset %d type %d size %d\n",
- pgid.pool, pool->crush_ruleset, pool->type,
- pool->size);
- return -ENOENT;
+ pi->id, pi->crush_ruleset, pi->type, pi->size);
+ return;
}
- len = do_crush(osdmap, ruleno, pps, osds,
- min_t(int, pool->size, CEPH_PG_MAX_SIZE),
+ len = do_crush(osdmap, ruleno, pps, raw->osds,
+ min_t(int, pi->size, ARRAY_SIZE(raw->osds)),
osdmap->osd_weight, osdmap->max_osd);
if (len < 0) {
pr_err("error %d from crush rule %d: pool %lld ruleset %d type %d size %d\n",
- len, ruleno, pgid.pool, pool->crush_ruleset,
- pool->type, pool->size);
- return len;
+ len, ruleno, pi->id, pi->crush_ruleset, pi->type,
+ pi->size);
+ return;
}
- return len;
+ raw->size = len;
}
/*
- * Given raw set, calculate up set and up primary.
+ * Given raw set, calculate up set and up primary. By definition of an
+ * up set, the result won't contain nonexistent or down OSDs.
*
- * Return up set length. *primary is set to up primary osd id, or -1
- * if up set is empty.
+ * This is done in-place - on return @set is the up set. If it's
+ * empty, ->primary will remain undefined.
*/
-static int raw_to_up_osds(struct ceph_osdmap *osdmap,
- struct ceph_pg_pool_info *pool,
- int *osds, int len, int *primary)
+static void raw_to_up_osds(struct ceph_osdmap *osdmap,
+ struct ceph_pg_pool_info *pi,
+ struct ceph_osds *set)
{
- int up_primary = -1;
int i;
- if (ceph_can_shift_osds(pool)) {
+ /* ->primary is undefined for a raw set */
+ BUG_ON(set->primary != -1);
+
+ if (ceph_can_shift_osds(pi)) {
int removed = 0;
- for (i = 0; i < len; i++) {
- if (ceph_osd_is_down(osdmap, osds[i])) {
+ /* shift left */
+ for (i = 0; i < set->size; i++) {
+ if (ceph_osd_is_down(osdmap, set->osds[i])) {
removed++;
continue;
}
if (removed)
- osds[i - removed] = osds[i];
+ set->osds[i - removed] = set->osds[i];
}
-
- len -= removed;
- if (len > 0)
- up_primary = osds[0];
+ set->size -= removed;
+ if (set->size > 0)
+ set->primary = set->osds[0];
} else {
- for (i = len - 1; i >= 0; i--) {
- if (ceph_osd_is_down(osdmap, osds[i]))
- osds[i] = CRUSH_ITEM_NONE;
+ /* set down/dne devices to NONE */
+ for (i = set->size - 1; i >= 0; i--) {
+ if (ceph_osd_is_down(osdmap, set->osds[i]))
+ set->osds[i] = CRUSH_ITEM_NONE;
else
- up_primary = osds[i];
+ set->primary = set->osds[i];
}
}
-
- *primary = up_primary;
- return len;
}
-static void apply_primary_affinity(struct ceph_osdmap *osdmap, u32 pps,
- struct ceph_pg_pool_info *pool,
- int *osds, int len, int *primary)
+static void apply_primary_affinity(struct ceph_osdmap *osdmap,
+ struct ceph_pg_pool_info *pi,
+ u32 pps,
+ struct ceph_osds *up)
{
int i;
int pos = -1;
@@ -1586,8 +1938,8 @@ static void apply_primary_affinity(struct ceph_osdmap *osdmap, u32 pps,
if (!osdmap->osd_primary_affinity)
return;
- for (i = 0; i < len; i++) {
- int osd = osds[i];
+ for (i = 0; i < up->size; i++) {
+ int osd = up->osds[i];
if (osd != CRUSH_ITEM_NONE &&
osdmap->osd_primary_affinity[osd] !=
@@ -1595,7 +1947,7 @@ static void apply_primary_affinity(struct ceph_osdmap *osdmap, u32 pps,
break;
}
}
- if (i == len)
+ if (i == up->size)
return;
/*
@@ -1603,8 +1955,8 @@ static void apply_primary_affinity(struct ceph_osdmap *osdmap, u32 pps,
* osd into the hash/rng so that a proportional fraction of an
* osd's pgs get rejected as primary.
*/
- for (i = 0; i < len; i++) {
- int osd = osds[i];
+ for (i = 0; i < up->size; i++) {
+ int osd = up->osds[i];
u32 aff;
if (osd == CRUSH_ITEM_NONE)
@@ -1629,135 +1981,110 @@ static void apply_primary_affinity(struct ceph_osdmap *osdmap, u32 pps,
if (pos < 0)
return;
- *primary = osds[pos];
+ up->primary = up->osds[pos];
- if (ceph_can_shift_osds(pool) && pos > 0) {
+ if (ceph_can_shift_osds(pi) && pos > 0) {
/* move the new primary to the front */
for (i = pos; i > 0; i--)
- osds[i] = osds[i - 1];
- osds[0] = *primary;
+ up->osds[i] = up->osds[i - 1];
+ up->osds[0] = up->primary;
}
}
/*
- * Given up set, apply pg_temp and primary_temp mappings.
+ * Get pg_temp and primary_temp mappings for given PG.
*
- * Return acting set length. *primary is set to acting primary osd id,
- * or -1 if acting set is empty.
+ * Note that a PG may have none, only pg_temp, only primary_temp or
+ * both pg_temp and primary_temp mappings. This means @temp isn't
+ * always a valid OSD set on return: in the "only primary_temp" case,
+ * @temp will have its ->primary >= 0 but ->size == 0.
*/
-static int apply_temps(struct ceph_osdmap *osdmap,
- struct ceph_pg_pool_info *pool, struct ceph_pg pgid,
- int *osds, int len, int *primary)
+static void get_temp_osds(struct ceph_osdmap *osdmap,
+ struct ceph_pg_pool_info *pi,
+ const struct ceph_pg *raw_pgid,
+ struct ceph_osds *temp)
{
+ struct ceph_pg pgid;
struct ceph_pg_mapping *pg;
- int temp_len;
- int temp_primary;
int i;
- /* raw_pg -> pg */
- pgid.seed = ceph_stable_mod(pgid.seed, pool->pg_num,
- pool->pg_num_mask);
+ raw_pg_to_pg(pi, raw_pgid, &pgid);
+ ceph_osds_init(temp);
/* pg_temp? */
pg = __lookup_pg_mapping(&osdmap->pg_temp, pgid);
if (pg) {
- temp_len = 0;
- temp_primary = -1;
-
for (i = 0; i < pg->pg_temp.len; i++) {
if (ceph_osd_is_down(osdmap, pg->pg_temp.osds[i])) {
- if (ceph_can_shift_osds(pool))
+ if (ceph_can_shift_osds(pi))
continue;
- else
- osds[temp_len++] = CRUSH_ITEM_NONE;
+
+ temp->osds[temp->size++] = CRUSH_ITEM_NONE;
} else {
- osds[temp_len++] = pg->pg_temp.osds[i];
+ temp->osds[temp->size++] = pg->pg_temp.osds[i];
}
}
/* apply pg_temp's primary */
- for (i = 0; i < temp_len; i++) {
- if (osds[i] != CRUSH_ITEM_NONE) {
- temp_primary = osds[i];
+ for (i = 0; i < temp->size; i++) {
+ if (temp->osds[i] != CRUSH_ITEM_NONE) {
+ temp->primary = temp->osds[i];
break;
}
}
- } else {
- temp_len = len;
- temp_primary = *primary;
}
/* primary_temp? */
pg = __lookup_pg_mapping(&osdmap->primary_temp, pgid);
if (pg)
- temp_primary = pg->primary_temp.osd;
-
- *primary = temp_primary;
- return temp_len;
+ temp->primary = pg->primary_temp.osd;
}
/*
- * Calculate acting set for given pgid.
+ * Map a PG to its acting set as well as its up set.
*
- * Return acting set length, or error. *primary is set to acting
- * primary osd id, or -1 if acting set is empty or on error.
+ * Acting set is used for data mapping purposes, while up set can be
+ * recorded for detecting interval changes and deciding whether to
+ * resend a request.
*/
-int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
- int *osds, int *primary)
+void ceph_pg_to_up_acting_osds(struct ceph_osdmap *osdmap,
+ const struct ceph_pg *raw_pgid,
+ struct ceph_osds *up,
+ struct ceph_osds *acting)
{
- struct ceph_pg_pool_info *pool;
+ struct ceph_pg_pool_info *pi;
u32 pps;
- int len;
- pool = __lookup_pg_pool(&osdmap->pg_pools, pgid.pool);
- if (!pool) {
- *primary = -1;
- return -ENOENT;
+ pi = ceph_pg_pool_by_id(osdmap, raw_pgid->pool);
+ if (!pi) {
+ ceph_osds_init(up);
+ ceph_osds_init(acting);
+ goto out;
}
- if (pool->flags & CEPH_POOL_FLAG_HASHPSPOOL) {
- /* hash pool id and seed so that pool PGs do not overlap */
- pps = crush_hash32_2(CRUSH_HASH_RJENKINS1,
- ceph_stable_mod(pgid.seed, pool->pgp_num,
- pool->pgp_num_mask),
- pgid.pool);
- } else {
- /*
- * legacy behavior: add ps and pool together. this is
- * not a great approach because the PGs from each pool
- * will overlap on top of each other: 0.5 == 1.4 ==
- * 2.3 == ...
- */
- pps = ceph_stable_mod(pgid.seed, pool->pgp_num,
- pool->pgp_num_mask) +
- (unsigned)pgid.pool;
- }
-
- len = pg_to_raw_osds(osdmap, pool, pgid, pps, osds);
- if (len < 0) {
- *primary = -1;
- return len;
+ pg_to_raw_osds(osdmap, pi, raw_pgid, up, &pps);
+ raw_to_up_osds(osdmap, pi, up);
+ apply_primary_affinity(osdmap, pi, pps, up);
+ get_temp_osds(osdmap, pi, raw_pgid, acting);
+ if (!acting->size) {
+ memcpy(acting->osds, up->osds, up->size * sizeof(up->osds[0]));
+ acting->size = up->size;
+ if (acting->primary == -1)
+ acting->primary = up->primary;
}
-
- len = raw_to_up_osds(osdmap, pool, osds, len, primary);
-
- apply_primary_affinity(osdmap, pps, pool, osds, len, primary);
-
- len = apply_temps(osdmap, pool, pgid, osds, len, primary);
-
- return len;
+out:
+ WARN_ON(!osds_valid(up) || !osds_valid(acting));
}
/*
- * Return primary osd for given pgid, or -1 if none.
+ * Return acting primary for given PG, or -1 if none.
*/
-int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, struct ceph_pg pgid)
+int ceph_pg_to_acting_primary(struct ceph_osdmap *osdmap,
+ const struct ceph_pg *raw_pgid)
{
- int osds[CEPH_PG_MAX_SIZE];
- int primary;
-
- ceph_calc_pg_acting(osdmap, pgid, osds, &primary);
+ struct ceph_osds up, acting;
- return primary;
+ ceph_pg_to_up_acting_osds(osdmap, raw_pgid, &up, &acting);
+ return acting.primary;
}
-EXPORT_SYMBOL(ceph_calc_pg_primary);
+EXPORT_SYMBOL(ceph_pg_to_acting_primary);
diff --git a/net/compat.c b/net/compat.c
index 5cfd26a0006f..1cd2ec046164 100644
--- a/net/compat.c
+++ b/net/compat.c
@@ -309,8 +309,8 @@ void scm_detach_fds_compat(struct msghdr *kmsg, struct scm_cookie *scm)
__scm_destroy(scm);
}
-static int do_set_attach_filter(struct socket *sock, int level, int optname,
- char __user *optval, unsigned int optlen)
+/* allocate a 64-bit sock_fprog on the user stack for duration of syscall. */
+struct sock_fprog __user *get_compat_bpf_fprog(char __user *optval)
{
struct compat_sock_fprog __user *fprog32 = (struct compat_sock_fprog __user *)optval;
struct sock_fprog __user *kfprog = compat_alloc_user_space(sizeof(struct sock_fprog));
@@ -323,6 +323,19 @@ static int do_set_attach_filter(struct socket *sock, int level, int optname,
__get_user(ptr, &fprog32->filter) ||
__put_user(len, &kfprog->len) ||
__put_user(compat_ptr(ptr), &kfprog->filter))
+ return NULL;
+
+ return kfprog;
+}
+EXPORT_SYMBOL_GPL(get_compat_bpf_fprog);
+
+static int do_set_attach_filter(struct socket *sock, int level, int optname,
+ char __user *optval, unsigned int optlen)
+{
+ struct sock_fprog __user *kfprog;
+
+ kfprog = get_compat_bpf_fprog(optval);
+ if (!kfprog)
return -EFAULT;
return sock_setsockopt(sock, level, optname, (char __user *)kfprog,
@@ -354,7 +367,8 @@ static int do_set_sock_timeout(struct socket *sock, int level,
static int compat_sock_setsockopt(struct socket *sock, int level, int optname,
char __user *optval, unsigned int optlen)
{
- if (optname == SO_ATTACH_FILTER)
+ if (optname == SO_ATTACH_FILTER ||
+ optname == SO_ATTACH_REUSEPORT_CBPF)
return do_set_attach_filter(sock, level, optname,
optval, optlen);
if (optname == SO_RCVTIMEO || optname == SO_SNDTIMEO)
diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index bdb4013581b1..f4034817d255 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -84,8 +84,8 @@ static const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN]
[NETIF_F_FSO_BIT] = "tx-fcoe-segmentation",
[NETIF_F_GSO_GRE_BIT] = "tx-gre-segmentation",
[NETIF_F_GSO_GRE_CSUM_BIT] = "tx-gre-csum-segmentation",
- [NETIF_F_GSO_IPIP_BIT] = "tx-ipip-segmentation",
- [NETIF_F_GSO_SIT_BIT] = "tx-sit-segmentation",
+ [NETIF_F_GSO_IPXIP4_BIT] = "tx-ipxip4-segmentation",
+ [NETIF_F_GSO_IPXIP6_BIT] = "tx-ipxip6-segmentation",
[NETIF_F_GSO_UDP_TUNNEL_BIT] = "tx-udp_tnl-segmentation",
[NETIF_F_GSO_UDP_TUNNEL_CSUM_BIT] = "tx-udp_tnl-csum-segmentation",
[NETIF_F_GSO_PARTIAL_BIT] = "tx-gso-partial",
diff --git a/net/core/gen_stats.c b/net/core/gen_stats.c
index f96ee8b9478d..be873e4e3125 100644
--- a/net/core/gen_stats.c
+++ b/net/core/gen_stats.c
@@ -47,6 +47,7 @@ nla_put_failure:
* @xstats_type: TLV type for backward compatibility xstats TLV
* @lock: statistics lock
* @d: dumping handle
+ * @padattr: padding attribute
*
* Initializes the dumping handle, grabs the statistic lock and appends
* an empty TLV header to the socket buffer for use a container for all
@@ -87,6 +88,7 @@ EXPORT_SYMBOL(gnet_stats_start_copy_compat);
* @type: TLV type for top level statistic TLV
* @lock: statistics lock
* @d: dumping handle
+ * @padattr: padding attribute
*
* Initializes the dumping handle, grabs the statistic lock and appends
* an empty TLV header to the socket buffer for use a container for all
diff --git a/net/core/hwbm.c b/net/core/hwbm.c
index 941c28486896..2cab489ae62e 100644
--- a/net/core/hwbm.c
+++ b/net/core/hwbm.c
@@ -55,18 +55,21 @@ int hwbm_pool_add(struct hwbm_pool *bm_pool, unsigned int buf_num, gfp_t gfp)
spin_lock_irqsave(&bm_pool->lock, flags);
if (bm_pool->buf_num == bm_pool->size) {
pr_warn("pool already filled\n");
+ spin_unlock_irqrestore(&bm_pool->lock, flags);
return bm_pool->buf_num;
}
if (buf_num + bm_pool->buf_num > bm_pool->size) {
pr_warn("cannot allocate %d buffers for pool\n",
buf_num);
+ spin_unlock_irqrestore(&bm_pool->lock, flags);
return 0;
}
if ((buf_num + bm_pool->buf_num) < bm_pool->buf_num) {
pr_warn("Adding %d buffers to the %d current buffers will overflow\n",
buf_num, bm_pool->buf_num);
+ spin_unlock_irqrestore(&bm_pool->lock, flags);
return 0;
}
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index 2b3f76fe65f4..7a0b616557ab 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -24,6 +24,7 @@
#include <linux/jiffies.h>
#include <linux/pm_runtime.h>
#include <linux/of.h>
+#include <linux/of_net.h>
#include "net-sysfs.h"
diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index 8604ae245960..8b02df0d354d 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -2245,10 +2245,8 @@ static void spin(struct pktgen_dev *pkt_dev, ktime_t spin_until)
hrtimer_set_expires(&t.timer, spin_until);
remaining = ktime_to_ns(hrtimer_expires_remaining(&t.timer));
- if (remaining <= 0) {
- pkt_dev->next_tx = ktime_add_ns(spin_until, pkt_dev->delay);
- return;
- }
+ if (remaining <= 0)
+ goto out;
start_time = ktime_get();
if (remaining < 100000) {
@@ -2273,7 +2271,9 @@ static void spin(struct pktgen_dev *pkt_dev, ktime_t spin_until)
}
pkt_dev->idle_acc += ktime_to_ns(ktime_sub(end_time, start_time));
+out:
pkt_dev->next_tx = ktime_add_ns(spin_until, pkt_dev->delay);
+ destroy_hrtimer_on_stack(&t.timer);
}
static inline void set_pkt_overhead(struct pktgen_dev *pkt_dev)
diff --git a/net/dns_resolver/dns_key.c b/net/dns_resolver/dns_key.c
index c79b85eb4d4c..8737412c7b27 100644
--- a/net/dns_resolver/dns_key.c
+++ b/net/dns_resolver/dns_key.c
@@ -281,7 +281,7 @@ static int __init init_dns_resolver(void)
GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, cred,
(KEY_POS_ALL & ~KEY_POS_SETATTR) |
KEY_USR_VIEW | KEY_USR_READ,
- KEY_ALLOC_NOT_IN_QUOTA, NULL);
+ KEY_ALLOC_NOT_IN_QUOTA, NULL, NULL);
if (IS_ERR(keyring)) {
ret = PTR_ERR(keyring);
goto failed_put_cred;
diff --git a/net/ieee802154/nl802154.c b/net/ieee802154/nl802154.c
index ca207dbf673b..116187b5c267 100644
--- a/net/ieee802154/nl802154.c
+++ b/net/ieee802154/nl802154.c
@@ -1289,8 +1289,8 @@ ieee802154_llsec_parse_dev_addr(struct nlattr *nla,
nl802154_dev_addr_policy))
return -EINVAL;
- if (!attrs[NL802154_DEV_ADDR_ATTR_PAN_ID] &&
- !attrs[NL802154_DEV_ADDR_ATTR_MODE] &&
+ if (!attrs[NL802154_DEV_ADDR_ATTR_PAN_ID] ||
+ !attrs[NL802154_DEV_ADDR_ATTR_MODE] ||
!(attrs[NL802154_DEV_ADDR_ATTR_SHORT] ||
attrs[NL802154_DEV_ADDR_ATTR_EXTENDED]))
return -EINVAL;
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 2e6e65fc4d20..d39e9e47a26e 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1192,8 +1192,8 @@ int inet_sk_rebuild_header(struct sock *sk)
}
EXPORT_SYMBOL(inet_sk_rebuild_header);
-static struct sk_buff *inet_gso_segment(struct sk_buff *skb,
- netdev_features_t features)
+struct sk_buff *inet_gso_segment(struct sk_buff *skb,
+ netdev_features_t features)
{
bool udpfrag = false, fixedid = false, encap;
struct sk_buff *segs = ERR_PTR(-EINVAL);
@@ -1205,24 +1205,6 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb,
int ihl;
int id;
- if (unlikely(skb_shinfo(skb)->gso_type &
- ~(SKB_GSO_TCPV4 |
- SKB_GSO_UDP |
- SKB_GSO_DODGY |
- SKB_GSO_TCP_ECN |
- SKB_GSO_GRE |
- SKB_GSO_GRE_CSUM |
- SKB_GSO_IPIP |
- SKB_GSO_SIT |
- SKB_GSO_TCPV6 |
- SKB_GSO_UDP_TUNNEL |
- SKB_GSO_UDP_TUNNEL_CSUM |
- SKB_GSO_TCP_FIXEDID |
- SKB_GSO_TUNNEL_REMCSUM |
- SKB_GSO_PARTIAL |
- 0)))
- goto out;
-
skb_reset_network_header(skb);
nhoff = skb_network_header(skb) - skb_mac_header(skb);
if (unlikely(!pskb_may_pull(skb, sizeof(*iph))))
@@ -1298,9 +1280,9 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb,
out:
return segs;
}
+EXPORT_SYMBOL(inet_gso_segment);
-static struct sk_buff **inet_gro_receive(struct sk_buff **head,
- struct sk_buff *skb)
+struct sk_buff **inet_gro_receive(struct sk_buff **head, struct sk_buff *skb)
{
const struct net_offload *ops;
struct sk_buff **pp = NULL;
@@ -1416,6 +1398,7 @@ out:
return pp;
}
+EXPORT_SYMBOL(inet_gro_receive);
static struct sk_buff **ipip_gro_receive(struct sk_buff **head,
struct sk_buff *skb)
@@ -1467,7 +1450,7 @@ int inet_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len)
return -EINVAL;
}
-static int inet_gro_complete(struct sk_buff *skb, int nhoff)
+int inet_gro_complete(struct sk_buff *skb, int nhoff)
{
__be16 newlen = htons(skb->len - nhoff);
struct iphdr *iph = (struct iphdr *)(skb->data + nhoff);
@@ -1497,11 +1480,12 @@ out_unlock:
return err;
}
+EXPORT_SYMBOL(inet_gro_complete);
static int ipip_gro_complete(struct sk_buff *skb, int nhoff)
{
skb->encapsulation = 1;
- skb_shinfo(skb)->gso_type |= SKB_GSO_IPIP;
+ skb_shinfo(skb)->gso_type |= SKB_GSO_IPXIP4;
return inet_gro_complete(skb, nhoff);
}
@@ -1697,6 +1681,14 @@ static __net_init int inet_init_net(struct net *net)
*/
net->ipv4.ping_group_range.range[0] = make_kgid(&init_user_ns, 1);
net->ipv4.ping_group_range.range[1] = make_kgid(&init_user_ns, 0);
+
+ /* Default values for sysctl-controlled parameters.
+ * We set them here, in case sysctl is not compiled.
+ */
+ net->ipv4.sysctl_ip_default_ttl = IPDEFTTL;
+ net->ipv4.sysctl_ip_dynaddr = 0;
+ net->ipv4.sysctl_ip_early_demux = 1;
+
return 0;
}
diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c
index eeec7d60e5fd..5f9207c039e7 100644
--- a/net/ipv4/fou.c
+++ b/net/ipv4/fou.c
@@ -21,6 +21,7 @@ struct fou {
u8 protocol;
u8 flags;
__be16 port;
+ u8 family;
u16 type;
struct list_head list;
struct rcu_head rcu;
@@ -47,14 +48,17 @@ static inline struct fou *fou_from_sock(struct sock *sk)
return sk->sk_user_data;
}
-static int fou_recv_pull(struct sk_buff *skb, size_t len)
+static int fou_recv_pull(struct sk_buff *skb, struct fou *fou, size_t len)
{
- struct iphdr *iph = ip_hdr(skb);
-
/* Remove 'len' bytes from the packet (UDP header and
* FOU header if present).
*/
- iph->tot_len = htons(ntohs(iph->tot_len) - len);
+ if (fou->family == AF_INET)
+ ip_hdr(skb)->tot_len = htons(ntohs(ip_hdr(skb)->tot_len) - len);
+ else
+ ipv6_hdr(skb)->payload_len =
+ htons(ntohs(ipv6_hdr(skb)->payload_len) - len);
+
__skb_pull(skb, len);
skb_postpull_rcsum(skb, udp_hdr(skb), len);
skb_reset_transport_header(skb);
@@ -68,7 +72,7 @@ static int fou_udp_recv(struct sock *sk, struct sk_buff *skb)
if (!fou)
return 1;
- if (fou_recv_pull(skb, sizeof(struct udphdr)))
+ if (fou_recv_pull(skb, fou, sizeof(struct udphdr)))
goto drop;
return -fou->protocol;
@@ -141,7 +145,11 @@ static int gue_udp_recv(struct sock *sk, struct sk_buff *skb)
hdrlen = sizeof(struct guehdr) + optlen;
- ip_hdr(skb)->tot_len = htons(ntohs(ip_hdr(skb)->tot_len) - len);
+ if (fou->family == AF_INET)
+ ip_hdr(skb)->tot_len = htons(ntohs(ip_hdr(skb)->tot_len) - len);
+ else
+ ipv6_hdr(skb)->payload_len =
+ htons(ntohs(ipv6_hdr(skb)->payload_len) - len);
/* Pull csum through the guehdr now . This can be used if
* there is a remote checksum offload.
@@ -426,7 +434,8 @@ static int fou_add_to_port_list(struct net *net, struct fou *fou)
mutex_lock(&fn->fou_lock);
list_for_each_entry(fout, &fn->fou_list, list) {
- if (fou->port == fout->port) {
+ if (fou->port == fout->port &&
+ fou->family == fout->family) {
mutex_unlock(&fn->fou_lock);
return -EALREADY;
}
@@ -448,31 +457,13 @@ static void fou_release(struct fou *fou)
kfree_rcu(fou, rcu);
}
-static int fou_encap_init(struct sock *sk, struct fou *fou, struct fou_cfg *cfg)
-{
- udp_sk(sk)->encap_rcv = fou_udp_recv;
- udp_sk(sk)->gro_receive = fou_gro_receive;
- udp_sk(sk)->gro_complete = fou_gro_complete;
- fou_from_sock(sk)->protocol = cfg->protocol;
-
- return 0;
-}
-
-static int gue_encap_init(struct sock *sk, struct fou *fou, struct fou_cfg *cfg)
-{
- udp_sk(sk)->encap_rcv = gue_udp_recv;
- udp_sk(sk)->gro_receive = gue_gro_receive;
- udp_sk(sk)->gro_complete = gue_gro_complete;
-
- return 0;
-}
-
static int fou_create(struct net *net, struct fou_cfg *cfg,
struct socket **sockp)
{
struct socket *sock = NULL;
struct fou *fou = NULL;
struct sock *sk;
+ struct udp_tunnel_sock_cfg tunnel_cfg;
int err;
/* Open UDP socket */
@@ -489,35 +480,36 @@ static int fou_create(struct net *net, struct fou_cfg *cfg,
sk = sock->sk;
- fou->flags = cfg->flags;
fou->port = cfg->udp_config.local_udp_port;
+ fou->family = cfg->udp_config.family;
+ fou->flags = cfg->flags;
+ fou->type = cfg->type;
+ fou->sock = sock;
+
+ memset(&tunnel_cfg, 0, sizeof(tunnel_cfg));
+ tunnel_cfg.encap_type = 1;
+ tunnel_cfg.sk_user_data = fou;
+ tunnel_cfg.encap_destroy = NULL;
/* Initial for fou type */
switch (cfg->type) {
case FOU_ENCAP_DIRECT:
- err = fou_encap_init(sk, fou, cfg);
- if (err)
- goto error;
+ tunnel_cfg.encap_rcv = fou_udp_recv;
+ tunnel_cfg.gro_receive = fou_gro_receive;
+ tunnel_cfg.gro_complete = fou_gro_complete;
+ fou->protocol = cfg->protocol;
break;
case FOU_ENCAP_GUE:
- err = gue_encap_init(sk, fou, cfg);
- if (err)
- goto error;
+ tunnel_cfg.encap_rcv = gue_udp_recv;
+ tunnel_cfg.gro_receive = gue_gro_receive;
+ tunnel_cfg.gro_complete = gue_gro_complete;
break;
default:
err = -EINVAL;
goto error;
}
- fou->type = cfg->type;
-
- udp_sk(sk)->encap_type = 1;
- udp_encap_enable();
-
- sk->sk_user_data = fou;
- fou->sock = sock;
-
- inet_inc_convert_csum(sk);
+ setup_udp_tunnel_sock(net, sock, &tunnel_cfg);
sk->sk_allocation = GFP_ATOMIC;
@@ -542,12 +534,13 @@ static int fou_destroy(struct net *net, struct fou_cfg *cfg)
{
struct fou_net *fn = net_generic(net, fou_net_id);
__be16 port = cfg->udp_config.local_udp_port;
+ u8 family = cfg->udp_config.family;
int err = -EINVAL;
struct fou *fou;
mutex_lock(&fn->fou_lock);
list_for_each_entry(fou, &fn->fou_list, list) {
- if (fou->port == port) {
+ if (fou->port == port && fou->family == family) {
fou_release(fou);
err = 0;
break;
@@ -585,8 +578,15 @@ static int parse_nl_config(struct genl_info *info,
if (info->attrs[FOU_ATTR_AF]) {
u8 family = nla_get_u8(info->attrs[FOU_ATTR_AF]);
- if (family != AF_INET)
- return -EINVAL;
+ switch (family) {
+ case AF_INET:
+ break;
+ case AF_INET6:
+ cfg->udp_config.ipv6_v6only = 1;
+ break;
+ default:
+ return -EAFNOSUPPORT;
+ }
cfg->udp_config.family = family;
}
@@ -677,6 +677,7 @@ static int fou_nl_cmd_get_port(struct sk_buff *skb, struct genl_info *info)
struct fou_cfg cfg;
struct fou *fout;
__be16 port;
+ u8 family;
int ret;
ret = parse_nl_config(info, &cfg);
@@ -686,6 +687,10 @@ static int fou_nl_cmd_get_port(struct sk_buff *skb, struct genl_info *info)
if (port == 0)
return -EINVAL;
+ family = cfg.udp_config.family;
+ if (family != AF_INET && family != AF_INET6)
+ return -EINVAL;
+
msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
if (!msg)
return -ENOMEM;
@@ -693,7 +698,7 @@ static int fou_nl_cmd_get_port(struct sk_buff *skb, struct genl_info *info)
ret = -ESRCH;
mutex_lock(&fn->fou_lock);
list_for_each_entry(fout, &fn->fou_list, list) {
- if (port == fout->port) {
+ if (port == fout->port && family == fout->family) {
ret = fou_dump_info(fout, info->snd_portid,
info->snd_seq, 0, msg,
info->genlhdr->cmd);
@@ -798,6 +803,22 @@ static void fou_build_udp(struct sk_buff *skb, struct ip_tunnel_encap *e,
*protocol = IPPROTO_UDP;
}
+int __fou_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e,
+ u8 *protocol, __be16 *sport, int type)
+{
+ int err;
+
+ err = iptunnel_handle_offloads(skb, type);
+ if (err)
+ return err;
+
+ *sport = e->sport ? : udp_flow_src_port(dev_net(skb->dev),
+ skb, 0, 0, false);
+
+ return 0;
+}
+EXPORT_SYMBOL(__fou_build_header);
+
int fou_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e,
u8 *protocol, struct flowi4 *fl4)
{
@@ -806,26 +827,21 @@ int fou_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e,
__be16 sport;
int err;
- err = iptunnel_handle_offloads(skb, type);
+ err = __fou_build_header(skb, e, protocol, &sport, type);
if (err)
return err;
- sport = e->sport ? : udp_flow_src_port(dev_net(skb->dev),
- skb, 0, 0, false);
fou_build_udp(skb, e, fl4, protocol, sport);
return 0;
}
EXPORT_SYMBOL(fou_build_header);
-int gue_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e,
- u8 *protocol, struct flowi4 *fl4)
+int __gue_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e,
+ u8 *protocol, __be16 *sport, int type)
{
- int type = e->flags & TUNNEL_ENCAP_FLAG_CSUM ? SKB_GSO_UDP_TUNNEL_CSUM :
- SKB_GSO_UDP_TUNNEL;
struct guehdr *guehdr;
size_t hdrlen, optlen = 0;
- __be16 sport;
void *data;
bool need_priv = false;
int err;
@@ -844,8 +860,8 @@ int gue_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e,
return err;
/* Get source port (based on flow hash) before skb_push */
- sport = e->sport ? : udp_flow_src_port(dev_net(skb->dev),
- skb, 0, 0, false);
+ *sport = e->sport ? : udp_flow_src_port(dev_net(skb->dev),
+ skb, 0, 0, false);
hdrlen = sizeof(struct guehdr) + optlen;
@@ -890,6 +906,22 @@ int gue_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e,
}
+ return 0;
+}
+EXPORT_SYMBOL(__gue_build_header);
+
+int gue_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e,
+ u8 *protocol, struct flowi4 *fl4)
+{
+ int type = e->flags & TUNNEL_ENCAP_FLAG_CSUM ? SKB_GSO_UDP_TUNNEL_CSUM :
+ SKB_GSO_UDP_TUNNEL;
+ __be16 sport;
+ int err;
+
+ err = __gue_build_header(skb, e, protocol, &sport, type);
+ if (err)
+ return err;
+
fou_build_udp(skb, e, fl4, protocol, sport);
return 0;
diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c
index e88190a8699a..ecd1e09dbbf1 100644
--- a/net/ipv4/gre_offload.c
+++ b/net/ipv4/gre_offload.c
@@ -26,20 +26,6 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb,
int gre_offset, outer_hlen;
bool need_csum, ufo;
- if (unlikely(skb_shinfo(skb)->gso_type &
- ~(SKB_GSO_TCPV4 |
- SKB_GSO_TCPV6 |
- SKB_GSO_UDP |
- SKB_GSO_DODGY |
- SKB_GSO_TCP_ECN |
- SKB_GSO_TCP_FIXEDID |
- SKB_GSO_GRE |
- SKB_GSO_GRE_CSUM |
- SKB_GSO_IPIP |
- SKB_GSO_SIT |
- SKB_GSO_PARTIAL)))
- goto out;
-
if (!skb->encapsulation)
goto out;
diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c
index a69ed94bda1b..d8f5e0a269f5 100644
--- a/net/ipv4/ip_tunnel.c
+++ b/net/ipv4/ip_tunnel.c
@@ -443,29 +443,6 @@ drop:
}
EXPORT_SYMBOL_GPL(ip_tunnel_rcv);
-static int ip_encap_hlen(struct ip_tunnel_encap *e)
-{
- const struct ip_tunnel_encap_ops *ops;
- int hlen = -EINVAL;
-
- if (e->type == TUNNEL_ENCAP_NONE)
- return 0;
-
- if (e->type >= MAX_IPTUN_ENCAP_OPS)
- return -EINVAL;
-
- rcu_read_lock();
- ops = rcu_dereference(iptun_encaps[e->type]);
- if (likely(ops && ops->encap_hlen))
- hlen = ops->encap_hlen(e);
- rcu_read_unlock();
-
- return hlen;
-}
-
-const struct ip_tunnel_encap_ops __rcu *
- iptun_encaps[MAX_IPTUN_ENCAP_OPS] __read_mostly;
-
int ip_tunnel_encap_add_ops(const struct ip_tunnel_encap_ops *ops,
unsigned int num)
{
@@ -519,28 +496,6 @@ int ip_tunnel_encap_setup(struct ip_tunnel *t,
}
EXPORT_SYMBOL_GPL(ip_tunnel_encap_setup);
-int ip_tunnel_encap(struct sk_buff *skb, struct ip_tunnel *t,
- u8 *protocol, struct flowi4 *fl4)
-{
- const struct ip_tunnel_encap_ops *ops;
- int ret = -EINVAL;
-
- if (t->encap.type == TUNNEL_ENCAP_NONE)
- return 0;
-
- if (t->encap.type >= MAX_IPTUN_ENCAP_OPS)
- return -EINVAL;
-
- rcu_read_lock();
- ops = rcu_dereference(iptun_encaps[t->encap.type]);
- if (likely(ops && ops->build_header))
- ret = ops->build_header(skb, &t->encap, protocol, fl4);
- rcu_read_unlock();
-
- return ret;
-}
-EXPORT_SYMBOL(ip_tunnel_encap);
-
static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb,
struct rtable *rt, __be16 df,
const struct iphdr *inner_iph)
diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c
index 9118b0e640ba..afd6b5968caf 100644
--- a/net/ipv4/ip_tunnel_core.c
+++ b/net/ipv4/ip_tunnel_core.c
@@ -37,6 +37,7 @@
#include <net/icmp.h>
#include <net/protocol.h>
#include <net/ip_tunnels.h>
+#include <net/ip6_tunnel.h>
#include <net/arp.h>
#include <net/checksum.h>
#include <net/dsfield.h>
@@ -47,6 +48,14 @@
#include <net/rtnetlink.h>
#include <net/dst_metadata.h>
+const struct ip_tunnel_encap_ops __rcu *
+ iptun_encaps[MAX_IPTUN_ENCAP_OPS] __read_mostly;
+EXPORT_SYMBOL(iptun_encaps);
+
+const struct ip6_tnl_encap_ops __rcu *
+ ip6tun_encaps[MAX_IPTUN_ENCAP_OPS] __read_mostly;
+EXPORT_SYMBOL(ip6tun_encaps);
+
void iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb,
__be32 src, __be32 dst, __u8 proto,
__u8 tos, __u8 ttl, __be16 df, bool xnet)
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index 92827483ee3d..978370132f29 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -219,7 +219,7 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
if (unlikely(skb->protocol != htons(ETH_P_IP)))
goto tx_error;
- if (iptunnel_handle_offloads(skb, SKB_GSO_IPIP))
+ if (iptunnel_handle_offloads(skb, SKB_GSO_IPXIP4))
goto tx_error;
skb_set_inner_ipproto(skb, IPPROTO_IPIP);
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index bb0419582b8d..1cb67de106fe 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -999,10 +999,6 @@ static __net_init int ipv4_sysctl_init_net(struct net *net)
if (!net->ipv4.sysctl_local_reserved_ports)
goto err_ports;
- net->ipv4.sysctl_ip_default_ttl = IPDEFTTL;
- net->ipv4.sysctl_ip_dynaddr = 0;
- net->ipv4.sysctl_ip_early_demux = 1;
-
return 0;
err_ports:
diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c
index 02737b607aa7..5c5964962d0c 100644
--- a/net/ipv4/tcp_offload.c
+++ b/net/ipv4/tcp_offload.c
@@ -83,25 +83,6 @@ struct sk_buff *tcp_gso_segment(struct sk_buff *skb,
if (skb_gso_ok(skb, features | NETIF_F_GSO_ROBUST)) {
/* Packet is from an untrusted source, reset gso_segs. */
- int type = skb_shinfo(skb)->gso_type;
-
- if (unlikely(type &
- ~(SKB_GSO_TCPV4 |
- SKB_GSO_DODGY |
- SKB_GSO_TCP_ECN |
- SKB_GSO_TCP_FIXEDID |
- SKB_GSO_TCPV6 |
- SKB_GSO_GRE |
- SKB_GSO_GRE_CSUM |
- SKB_GSO_IPIP |
- SKB_GSO_SIT |
- SKB_GSO_UDP_TUNNEL |
- SKB_GSO_UDP_TUNNEL_CSUM |
- SKB_GSO_TUNNEL_REMCSUM |
- 0) ||
- !(type & (SKB_GSO_TCPV4 |
- SKB_GSO_TCPV6))))
- goto out;
skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(skb->len, mss);
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 2e3ebfe5549e..0ff31d97d485 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1565,7 +1565,7 @@ int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
/* if we're overly short, let UDP handle it */
encap_rcv = ACCESS_ONCE(up->encap_rcv);
- if (skb->len > sizeof(struct udphdr) && encap_rcv) {
+ if (encap_rcv) {
int ret;
/* Verify checksum before giving to encap */
@@ -1618,12 +1618,12 @@ int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
}
}
- if (rcu_access_pointer(sk->sk_filter)) {
- if (udp_lib_checksum_complete(skb))
+ if (rcu_access_pointer(sk->sk_filter) &&
+ udp_lib_checksum_complete(skb))
goto csum_error;
- if (sk_filter(sk, skb))
- goto drop;
- }
+
+ if (sk_filter(sk, skb))
+ goto drop;
udp_csum_pull_header(skb);
if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
index 6b7459c92bb2..81f253b6ff36 100644
--- a/net/ipv4/udp_offload.c
+++ b/net/ipv4/udp_offload.c
@@ -209,16 +209,6 @@ static struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb,
if (skb_gso_ok(skb, features | NETIF_F_GSO_ROBUST)) {
/* Packet is from an untrusted source, reset gso_segs. */
- int type = skb_shinfo(skb)->gso_type;
-
- if (unlikely(type & ~(SKB_GSO_UDP | SKB_GSO_DODGY |
- SKB_GSO_UDP_TUNNEL |
- SKB_GSO_UDP_TUNNEL_CSUM |
- SKB_GSO_TUNNEL_REMCSUM |
- SKB_GSO_IPIP |
- SKB_GSO_GRE | SKB_GSO_GRE_CSUM) ||
- !(type & (SKB_GSO_UDP))))
- goto out;
skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(skb->len, mss);
diff --git a/net/ipv6/Kconfig b/net/ipv6/Kconfig
index 3f8411328de5..2343e4f2e0bf 100644
--- a/net/ipv6/Kconfig
+++ b/net/ipv6/Kconfig
@@ -232,6 +232,15 @@ config IPV6_GRE
Saying M here will produce a module called ip6_gre. If unsure, say N.
+config IPV6_FOU
+ tristate
+ default NET_FOU && IPV6
+
+config IPV6_FOU_TUNNEL
+ tristate
+ default NET_FOU_IP_TUNNELS && IPV6_FOU
+ select IPV6_TUNNEL
+
config IPV6_MULTIPLE_TABLES
bool "IPv6: Multiple Routing Tables"
select FIB_RULES
diff --git a/net/ipv6/Makefile b/net/ipv6/Makefile
index 5e9d6bf4aaca..6d8ea099213e 100644
--- a/net/ipv6/Makefile
+++ b/net/ipv6/Makefile
@@ -42,6 +42,7 @@ obj-$(CONFIG_IPV6_VTI) += ip6_vti.o
obj-$(CONFIG_IPV6_SIT) += sit.o
obj-$(CONFIG_IPV6_TUNNEL) += ip6_tunnel.o
obj-$(CONFIG_IPV6_GRE) += ip6_gre.o
+obj-$(CONFIG_IPV6_FOU) += fou6.o
obj-y += addrconf_core.o exthdrs_core.o ip6_checksum.o ip6_icmp.o
obj-$(CONFIG_INET) += output_core.o protocol.o $(ipv6-offload)
diff --git a/net/ipv6/fou6.c b/net/ipv6/fou6.c
new file mode 100644
index 000000000000..9ea249b9451e
--- /dev/null
+++ b/net/ipv6/fou6.c
@@ -0,0 +1,140 @@
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/socket.h>
+#include <linux/skbuff.h>
+#include <linux/ip.h>
+#include <linux/udp.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <net/fou.h>
+#include <net/ip.h>
+#include <net/ip6_tunnel.h>
+#include <net/ip6_checksum.h>
+#include <net/protocol.h>
+#include <net/udp.h>
+#include <net/udp_tunnel.h>
+
+static void fou6_build_udp(struct sk_buff *skb, struct ip_tunnel_encap *e,
+ struct flowi6 *fl6, u8 *protocol, __be16 sport)
+{
+ struct udphdr *uh;
+
+ skb_push(skb, sizeof(struct udphdr));
+ skb_reset_transport_header(skb);
+
+ uh = udp_hdr(skb);
+
+ uh->dest = e->dport;
+ uh->source = sport;
+ uh->len = htons(skb->len);
+ udp6_set_csum(!(e->flags & TUNNEL_ENCAP_FLAG_CSUM6), skb,
+ &fl6->saddr, &fl6->daddr, skb->len);
+
+ *protocol = IPPROTO_UDP;
+}
+
+int fou6_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e,
+ u8 *protocol, struct flowi6 *fl6)
+{
+ __be16 sport;
+ int err;
+ int type = e->flags & TUNNEL_ENCAP_FLAG_CSUM6 ?
+ SKB_GSO_UDP_TUNNEL_CSUM : SKB_GSO_UDP_TUNNEL;
+
+ err = __fou_build_header(skb, e, protocol, &sport, type);
+ if (err)
+ return err;
+
+ fou6_build_udp(skb, e, fl6, protocol, sport);
+
+ return 0;
+}
+EXPORT_SYMBOL(fou6_build_header);
+
+int gue6_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e,
+ u8 *protocol, struct flowi6 *fl6)
+{
+ __be16 sport;
+ int err;
+ int type = e->flags & TUNNEL_ENCAP_FLAG_CSUM6 ?
+ SKB_GSO_UDP_TUNNEL_CSUM : SKB_GSO_UDP_TUNNEL;
+
+ err = __gue_build_header(skb, e, protocol, &sport, type);
+ if (err)
+ return err;
+
+ fou6_build_udp(skb, e, fl6, protocol, sport);
+
+ return 0;
+}
+EXPORT_SYMBOL(gue6_build_header);
+
+#if IS_ENABLED(CONFIG_IPV6_FOU_TUNNEL)
+
+static const struct ip6_tnl_encap_ops fou_ip6tun_ops = {
+ .encap_hlen = fou_encap_hlen,
+ .build_header = fou6_build_header,
+};
+
+static const struct ip6_tnl_encap_ops gue_ip6tun_ops = {
+ .encap_hlen = gue_encap_hlen,
+ .build_header = gue6_build_header,
+};
+
+static int ip6_tnl_encap_add_fou_ops(void)
+{
+ int ret;
+
+ ret = ip6_tnl_encap_add_ops(&fou_ip6tun_ops, TUNNEL_ENCAP_FOU);
+ if (ret < 0) {
+ pr_err("can't add fou6 ops\n");
+ return ret;
+ }
+
+ ret = ip6_tnl_encap_add_ops(&gue_ip6tun_ops, TUNNEL_ENCAP_GUE);
+ if (ret < 0) {
+ pr_err("can't add gue6 ops\n");
+ ip6_tnl_encap_del_ops(&fou_ip6tun_ops, TUNNEL_ENCAP_FOU);
+ return ret;
+ }
+
+ return 0;
+}
+
+static void ip6_tnl_encap_del_fou_ops(void)
+{
+ ip6_tnl_encap_del_ops(&fou_ip6tun_ops, TUNNEL_ENCAP_FOU);
+ ip6_tnl_encap_del_ops(&gue_ip6tun_ops, TUNNEL_ENCAP_GUE);
+}
+
+#else
+
+static int ip6_tnl_encap_add_fou_ops(void)
+{
+ return 0;
+}
+
+static void ip6_tnl_encap_del_fou_ops(void)
+{
+}
+
+#endif
+
+static int __init fou6_init(void)
+{
+ int ret;
+
+ ret = ip6_tnl_encap_add_fou_ops();
+
+ return ret;
+}
+
+static void __exit fou6_fini(void)
+{
+ ip6_tnl_encap_del_fou_ops();
+}
+
+module_init(fou6_init);
+module_exit(fou6_fini);
+MODULE_AUTHOR("Tom Herbert <therbert@google.com>");
+MODULE_LICENSE("GPL");
diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c
index 4541fa54035e..fdc9de276ab1 100644
--- a/net/ipv6/ip6_gre.c
+++ b/net/ipv6/ip6_gre.c
@@ -712,6 +712,7 @@ static void ip6gre_tnl_link_config(struct ip6_tnl *t, int set_mtu)
fl6->daddr = p->raddr;
fl6->flowi6_oif = p->link;
fl6->flowlabel = 0;
+ fl6->flowi6_proto = IPPROTO_GRE;
if (!(p->flags&IP6_TNL_F_USE_ORIG_TCLASS))
fl6->flowlabel |= IPV6_TCLASS_MASK & p->flowinfo;
@@ -729,7 +730,7 @@ static void ip6gre_tnl_link_config(struct ip6_tnl *t, int set_mtu)
t->tun_hlen = gre_calc_hlen(t->parms.o_flags);
- t->hlen = t->tun_hlen;
+ t->hlen = t->encap_hlen + t->tun_hlen;
t_hlen = t->hlen + sizeof(struct ipv6hdr);
@@ -1022,13 +1023,13 @@ static int ip6gre_tunnel_init_common(struct net_device *dev)
}
tunnel->tun_hlen = gre_calc_hlen(tunnel->parms.o_flags);
-
- tunnel->hlen = tunnel->tun_hlen;
-
+ tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen;
t_hlen = tunnel->hlen + sizeof(struct ipv6hdr);
dev->hard_header_len = LL_MAX_HEADER + t_hlen;
dev->mtu = ETH_DATA_LEN - t_hlen;
+ if (dev->type == ARPHRD_ETHER)
+ dev->mtu -= ETH_HLEN;
if (!(tunnel->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT))
dev->mtu -= 8;
@@ -1255,6 +1256,8 @@ static int ip6gre_tap_init(struct net_device *dev)
if (ret)
return ret;
+ dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
+
tunnel = netdev_priv(dev);
ip6gre_tnl_link_config(tunnel, 1);
@@ -1288,6 +1291,40 @@ static void ip6gre_tap_setup(struct net_device *dev)
dev->features |= NETIF_F_NETNS_LOCAL;
dev->priv_flags &= ~IFF_TX_SKB_SHARING;
+ dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
+}
+
+static bool ip6gre_netlink_encap_parms(struct nlattr *data[],
+ struct ip_tunnel_encap *ipencap)
+{
+ bool ret = false;
+
+ memset(ipencap, 0, sizeof(*ipencap));
+
+ if (!data)
+ return ret;
+
+ if (data[IFLA_GRE_ENCAP_TYPE]) {
+ ret = true;
+ ipencap->type = nla_get_u16(data[IFLA_GRE_ENCAP_TYPE]);
+ }
+
+ if (data[IFLA_GRE_ENCAP_FLAGS]) {
+ ret = true;
+ ipencap->flags = nla_get_u16(data[IFLA_GRE_ENCAP_FLAGS]);
+ }
+
+ if (data[IFLA_GRE_ENCAP_SPORT]) {
+ ret = true;
+ ipencap->sport = nla_get_be16(data[IFLA_GRE_ENCAP_SPORT]);
+ }
+
+ if (data[IFLA_GRE_ENCAP_DPORT]) {
+ ret = true;
+ ipencap->dport = nla_get_be16(data[IFLA_GRE_ENCAP_DPORT]);
+ }
+
+ return ret;
}
static int ip6gre_newlink(struct net *src_net, struct net_device *dev,
@@ -1296,9 +1333,18 @@ static int ip6gre_newlink(struct net *src_net, struct net_device *dev,
struct ip6_tnl *nt;
struct net *net = dev_net(dev);
struct ip6gre_net *ign = net_generic(net, ip6gre_net_id);
+ struct ip_tunnel_encap ipencap;
int err;
nt = netdev_priv(dev);
+
+ if (ip6gre_netlink_encap_parms(data, &ipencap)) {
+ int err = ip6_tnl_encap_setup(nt, &ipencap);
+
+ if (err < 0)
+ return err;
+ }
+
ip6gre_netlink_parms(data, &nt->parms);
if (ip6gre_tunnel_find(net, &nt->parms, dev->type))
@@ -1315,11 +1361,15 @@ static int ip6gre_newlink(struct net *src_net, struct net_device *dev,
dev->hw_features |= GRE6_FEATURES;
if (!(nt->parms.o_flags & TUNNEL_SEQ)) {
- /* TCP segmentation offload is not supported when we
- * generate output sequences.
+ /* TCP offload with GRE SEQ is not supported, nor
+ * can we support 2 levels of outer headers requiring
+ * an update.
*/
- dev->features |= NETIF_F_GSO_SOFTWARE;
- dev->hw_features |= NETIF_F_GSO_SOFTWARE;
+ if (!(nt->parms.o_flags & TUNNEL_CSUM) ||
+ (nt->encap.type == TUNNEL_ENCAP_NONE)) {
+ dev->features |= NETIF_F_GSO_SOFTWARE;
+ dev->hw_features |= NETIF_F_GSO_SOFTWARE;
+ }
/* Can use a lockless transmit, unless we generate
* output sequences
@@ -1345,10 +1395,18 @@ static int ip6gre_changelink(struct net_device *dev, struct nlattr *tb[],
struct net *net = nt->net;
struct ip6gre_net *ign = net_generic(net, ip6gre_net_id);
struct __ip6_tnl_parm p;
+ struct ip_tunnel_encap ipencap;
if (dev == ign->fb_tunnel_dev)
return -EINVAL;
+ if (ip6gre_netlink_encap_parms(data, &ipencap)) {
+ int err = ip6_tnl_encap_setup(nt, &ipencap);
+
+ if (err < 0)
+ return err;
+ }
+
ip6gre_netlink_parms(data, &p);
t = ip6gre_tunnel_locate(net, &p, 0);
@@ -1400,6 +1458,14 @@ static size_t ip6gre_get_size(const struct net_device *dev)
nla_total_size(4) +
/* IFLA_GRE_FLAGS */
nla_total_size(4) +
+ /* IFLA_GRE_ENCAP_TYPE */
+ nla_total_size(2) +
+ /* IFLA_GRE_ENCAP_FLAGS */
+ nla_total_size(2) +
+ /* IFLA_GRE_ENCAP_SPORT */
+ nla_total_size(2) +
+ /* IFLA_GRE_ENCAP_DPORT */
+ nla_total_size(2) +
0;
}
@@ -1422,6 +1488,17 @@ static int ip6gre_fill_info(struct sk_buff *skb, const struct net_device *dev)
nla_put_be32(skb, IFLA_GRE_FLOWINFO, p->flowinfo) ||
nla_put_u32(skb, IFLA_GRE_FLAGS, p->flags))
goto nla_put_failure;
+
+ if (nla_put_u16(skb, IFLA_GRE_ENCAP_TYPE,
+ t->encap.type) ||
+ nla_put_be16(skb, IFLA_GRE_ENCAP_SPORT,
+ t->encap.sport) ||
+ nla_put_be16(skb, IFLA_GRE_ENCAP_DPORT,
+ t->encap.dport) ||
+ nla_put_u16(skb, IFLA_GRE_ENCAP_FLAGS,
+ t->encap.flags))
+ goto nla_put_failure;
+
return 0;
nla_put_failure:
@@ -1440,6 +1517,10 @@ static const struct nla_policy ip6gre_policy[IFLA_GRE_MAX + 1] = {
[IFLA_GRE_ENCAP_LIMIT] = { .type = NLA_U8 },
[IFLA_GRE_FLOWINFO] = { .type = NLA_U32 },
[IFLA_GRE_FLAGS] = { .type = NLA_U32 },
+ [IFLA_GRE_ENCAP_TYPE] = { .type = NLA_U16 },
+ [IFLA_GRE_ENCAP_FLAGS] = { .type = NLA_U16 },
+ [IFLA_GRE_ENCAP_SPORT] = { .type = NLA_U16 },
+ [IFLA_GRE_ENCAP_DPORT] = { .type = NLA_U16 },
};
static struct rtnl_link_ops ip6gre_link_ops __read_mostly = {
diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c
index f185cbcda114..94611e450ec9 100644
--- a/net/ipv6/ip6_input.c
+++ b/net/ipv6/ip6_input.c
@@ -223,6 +223,7 @@ static int ip6_input_finish(struct net *net, struct sock *sk, struct sk_buff *sk
unsigned int nhoff;
int nexthdr;
bool raw;
+ bool have_final = false;
/*
* Parse extension headers
@@ -236,14 +237,27 @@ resubmit:
nhoff = IP6CB(skb)->nhoff;
nexthdr = skb_network_header(skb)[nhoff];
+resubmit_final:
raw = raw6_local_deliver(skb, nexthdr);
ipprot = rcu_dereference(inet6_protos[nexthdr]);
if (ipprot) {
int ret;
- if (ipprot->flags & INET6_PROTO_FINAL) {
+ if (have_final) {
+ if (!(ipprot->flags & INET6_PROTO_FINAL)) {
+ /* Once we've seen a final protocol don't
+ * allow encapsulation on any non-final
+ * ones. This allows foo in UDP encapsulation
+ * to work.
+ */
+ goto discard;
+ }
+ } else if (ipprot->flags & INET6_PROTO_FINAL) {
const struct ipv6hdr *hdr;
+ /* Only do this once for first final protocol */
+ have_final = true;
+
/* Free reference early: we don't need it any more,
and it may hold ip_conntrack module loaded
indefinitely. */
@@ -263,10 +277,21 @@ resubmit:
goto discard;
ret = ipprot->handler(skb);
- if (ret > 0)
- goto resubmit;
- else if (ret == 0)
+ if (ret > 0) {
+ if (ipprot->flags & INET6_PROTO_FINAL) {
+ /* Not an extension header, most likely UDP
+ * encapsulation. Use return value as nexthdr
+ * protocol not nhoff (which presumably is
+ * not set by handler).
+ */
+ nexthdr = ret;
+ goto resubmit_final;
+ } else {
+ goto resubmit;
+ }
+ } else if (ret == 0) {
__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDELIVERS);
+ }
} else {
if (!raw) {
if (xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) {
diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c
index f5eb184e1093..22e90e56b5a9 100644
--- a/net/ipv6/ip6_offload.c
+++ b/net/ipv6/ip6_offload.c
@@ -16,6 +16,7 @@
#include <net/protocol.h>
#include <net/ipv6.h>
+#include <net/inet_common.h>
#include "ip6_offload.h"
@@ -69,24 +70,6 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb,
bool encap, udpfrag;
int nhoff;
- if (unlikely(skb_shinfo(skb)->gso_type &
- ~(SKB_GSO_TCPV4 |
- SKB_GSO_UDP |
- SKB_GSO_DODGY |
- SKB_GSO_TCP_ECN |
- SKB_GSO_TCP_FIXEDID |
- SKB_GSO_TCPV6 |
- SKB_GSO_GRE |
- SKB_GSO_GRE_CSUM |
- SKB_GSO_IPIP |
- SKB_GSO_SIT |
- SKB_GSO_UDP_TUNNEL |
- SKB_GSO_UDP_TUNNEL_CSUM |
- SKB_GSO_TUNNEL_REMCSUM |
- SKB_GSO_PARTIAL |
- 0)))
- goto out;
-
skb_reset_network_header(skb);
nhoff = skb_network_header(skb) - skb_mac_header(skb);
if (unlikely(!pskb_may_pull(skb, sizeof(*ipv6h))))
@@ -104,7 +87,7 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb,
proto = ipv6_gso_pull_exthdrs(skb, ipv6h->nexthdr);
if (skb->encapsulation &&
- skb_shinfo(skb)->gso_type & (SKB_GSO_SIT|SKB_GSO_IPIP))
+ skb_shinfo(skb)->gso_type & (SKB_GSO_IPXIP4 | SKB_GSO_IPXIP6))
udpfrag = proto == IPPROTO_UDP && encap;
else
udpfrag = proto == IPPROTO_UDP && !skb->encapsulation;
@@ -271,9 +254,11 @@ out:
return pp;
}
-static struct sk_buff **sit_gro_receive(struct sk_buff **head,
- struct sk_buff *skb)
+static struct sk_buff **sit_ip6ip6_gro_receive(struct sk_buff **head,
+ struct sk_buff *skb)
{
+ /* Common GRO receive for SIT and IP6IP6 */
+
if (NAPI_GRO_CB(skb)->encap_mark) {
NAPI_GRO_CB(skb)->flush = 1;
return NULL;
@@ -284,6 +269,21 @@ static struct sk_buff **sit_gro_receive(struct sk_buff **head,
return ipv6_gro_receive(head, skb);
}
+static struct sk_buff **ip4ip6_gro_receive(struct sk_buff **head,
+ struct sk_buff *skb)
+{
+ /* Common GRO receive for SIT and IP6IP6 */
+
+ if (NAPI_GRO_CB(skb)->encap_mark) {
+ NAPI_GRO_CB(skb)->flush = 1;
+ return NULL;
+ }
+
+ NAPI_GRO_CB(skb)->encap_mark = 1;
+
+ return inet_gro_receive(head, skb);
+}
+
static int ipv6_gro_complete(struct sk_buff *skb, int nhoff)
{
const struct net_offload *ops;
@@ -312,10 +312,24 @@ out_unlock:
static int sit_gro_complete(struct sk_buff *skb, int nhoff)
{
skb->encapsulation = 1;
- skb_shinfo(skb)->gso_type |= SKB_GSO_SIT;
+ skb_shinfo(skb)->gso_type |= SKB_GSO_IPXIP4;
return ipv6_gro_complete(skb, nhoff);
}
+static int ip6ip6_gro_complete(struct sk_buff *skb, int nhoff)
+{
+ skb->encapsulation = 1;
+ skb_shinfo(skb)->gso_type |= SKB_GSO_IPXIP6;
+ return ipv6_gro_complete(skb, nhoff);
+}
+
+static int ip4ip6_gro_complete(struct sk_buff *skb, int nhoff)
+{
+ skb->encapsulation = 1;
+ skb_shinfo(skb)->gso_type |= SKB_GSO_IPXIP6;
+ return inet_gro_complete(skb, nhoff);
+}
+
static struct packet_offload ipv6_packet_offload __read_mostly = {
.type = cpu_to_be16(ETH_P_IPV6),
.callbacks = {
@@ -328,11 +342,26 @@ static struct packet_offload ipv6_packet_offload __read_mostly = {
static const struct net_offload sit_offload = {
.callbacks = {
.gso_segment = ipv6_gso_segment,
- .gro_receive = sit_gro_receive,
+ .gro_receive = sit_ip6ip6_gro_receive,
.gro_complete = sit_gro_complete,
},
};
+static const struct net_offload ip4ip6_offload = {
+ .callbacks = {
+ .gso_segment = inet_gso_segment,
+ .gro_receive = ip4ip6_gro_receive,
+ .gro_complete = ip4ip6_gro_complete,
+ },
+};
+
+static const struct net_offload ip6ip6_offload = {
+ .callbacks = {
+ .gso_segment = ipv6_gso_segment,
+ .gro_receive = sit_ip6ip6_gro_receive,
+ .gro_complete = ip6ip6_gro_complete,
+ },
+};
static int __init ipv6_offload_init(void)
{
@@ -344,6 +373,8 @@ static int __init ipv6_offload_init(void)
dev_add_offload(&ipv6_packet_offload);
inet_add_offload(&sit_offload, IPPROTO_IPV6);
+ inet6_add_offload(&ip6ip6_offload, IPPROTO_IPV6);
+ inet6_add_offload(&ip4ip6_offload, IPPROTO_IPIP);
return 0;
}
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index cbf127ae7c67..635b8d340cdb 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -1071,17 +1071,12 @@ struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
const struct in6_addr *final_dst)
{
struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
- int err;
dst = ip6_sk_dst_check(sk, dst, fl6);
+ if (!dst)
+ dst = ip6_dst_lookup_flow(sk, fl6, final_dst);
- err = ip6_dst_lookup_tail(sock_net(sk), sk, &dst, fl6);
- if (err)
- return ERR_PTR(err);
- if (final_dst)
- fl6->daddr = *final_dst;
-
- return xfrm_lookup_route(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
+ return dst;
}
EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c
index e79330f214bd..7b0481e3738f 100644
--- a/net/ipv6/ip6_tunnel.c
+++ b/net/ipv6/ip6_tunnel.c
@@ -1010,7 +1010,8 @@ int ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev, __u8 dsfield,
struct dst_entry *dst = NULL, *ndst = NULL;
struct net_device *tdev;
int mtu;
- unsigned int max_headroom = sizeof(struct ipv6hdr);
+ unsigned int psh_hlen = sizeof(struct ipv6hdr) + t->encap_hlen;
+ unsigned int max_headroom = psh_hlen;
int err = -1;
/* NBMA tunnel */
@@ -1063,7 +1064,7 @@ int ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev, __u8 dsfield,
t->parms.name);
goto tx_err_dst_release;
}
- mtu = dst_mtu(dst) - sizeof(*ipv6h);
+ mtu = dst_mtu(dst) - psh_hlen;
if (encap_limit >= 0) {
max_headroom += 8;
mtu -= 8;
@@ -1119,16 +1120,18 @@ int ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev, __u8 dsfield,
ipv6_push_nfrag_opts(skb, &opt.ops, &proto, NULL);
}
- if (likely(!skb->encapsulation)) {
- skb_reset_inner_headers(skb);
- skb->encapsulation = 1;
- }
-
+ /* Calculate max headroom for all the headers and adjust
+ * needed_headroom if necessary.
+ */
max_headroom = LL_RESERVED_SPACE(dst->dev) + sizeof(struct ipv6hdr)
- + dst->header_len;
+ + dst->header_len + t->hlen;
if (max_headroom > dev->needed_headroom)
dev->needed_headroom = max_headroom;
+ err = ip6_tnl_encap(skb, t, &proto, fl6);
+ if (err)
+ return err;
+
skb_push(skb, sizeof(struct ipv6hdr));
skb_reset_network_header(skb);
ipv6h = ipv6_hdr(skb);
@@ -1180,6 +1183,11 @@ ip4ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev)
if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK)
fl6.flowi6_mark = skb->mark;
+ if (iptunnel_handle_offloads(skb, SKB_GSO_IPXIP6))
+ return -1;
+
+ skb_set_inner_ipproto(skb, IPPROTO_IPIP);
+
err = ip6_tnl_xmit(skb, dev, dsfield, &fl6, encap_limit, &mtu,
IPPROTO_IPIP);
if (err != 0) {
@@ -1234,6 +1242,11 @@ ip6ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev)
if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK)
fl6.flowi6_mark = skb->mark;
+ if (iptunnel_handle_offloads(skb, SKB_GSO_IPXIP6))
+ return -1;
+
+ skb_set_inner_ipproto(skb, IPPROTO_IPV6);
+
err = ip6_tnl_xmit(skb, dev, dsfield, &fl6, encap_limit, &mtu,
IPPROTO_IPV6);
if (err != 0) {
@@ -1280,6 +1293,7 @@ static void ip6_tnl_link_config(struct ip6_tnl *t)
struct net_device *dev = t->dev;
struct __ip6_tnl_parm *p = &t->parms;
struct flowi6 *fl6 = &t->fl.u.ip6;
+ int t_hlen;
memcpy(dev->dev_addr, &p->laddr, sizeof(struct in6_addr));
memcpy(dev->broadcast, &p->raddr, sizeof(struct in6_addr));
@@ -1303,6 +1317,10 @@ static void ip6_tnl_link_config(struct ip6_tnl *t)
else
dev->flags &= ~IFF_POINTOPOINT;
+ t->tun_hlen = 0;
+ t->hlen = t->encap_hlen + t->tun_hlen;
+ t_hlen = t->hlen + sizeof(struct ipv6hdr);
+
if (p->flags & IP6_TNL_F_CAP_XMIT) {
int strict = (ipv6_addr_type(&p->raddr) &
(IPV6_ADDR_MULTICAST|IPV6_ADDR_LINKLOCAL));
@@ -1316,9 +1334,9 @@ static void ip6_tnl_link_config(struct ip6_tnl *t)
if (rt->dst.dev) {
dev->hard_header_len = rt->dst.dev->hard_header_len +
- sizeof(struct ipv6hdr);
+ t_hlen;
- dev->mtu = rt->dst.dev->mtu - sizeof(struct ipv6hdr);
+ dev->mtu = rt->dst.dev->mtu - t_hlen;
if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT))
dev->mtu -= 8;
@@ -1564,6 +1582,59 @@ int ip6_tnl_get_iflink(const struct net_device *dev)
}
EXPORT_SYMBOL(ip6_tnl_get_iflink);
+int ip6_tnl_encap_add_ops(const struct ip6_tnl_encap_ops *ops,
+ unsigned int num)
+{
+ if (num >= MAX_IPTUN_ENCAP_OPS)
+ return -ERANGE;
+
+ return !cmpxchg((const struct ip6_tnl_encap_ops **)
+ &ip6tun_encaps[num],
+ NULL, ops) ? 0 : -1;
+}
+EXPORT_SYMBOL(ip6_tnl_encap_add_ops);
+
+int ip6_tnl_encap_del_ops(const struct ip6_tnl_encap_ops *ops,
+ unsigned int num)
+{
+ int ret;
+
+ if (num >= MAX_IPTUN_ENCAP_OPS)
+ return -ERANGE;
+
+ ret = (cmpxchg((const struct ip6_tnl_encap_ops **)
+ &ip6tun_encaps[num],
+ ops, NULL) == ops) ? 0 : -1;
+
+ synchronize_net();
+
+ return ret;
+}
+EXPORT_SYMBOL(ip6_tnl_encap_del_ops);
+
+int ip6_tnl_encap_setup(struct ip6_tnl *t,
+ struct ip_tunnel_encap *ipencap)
+{
+ int hlen;
+
+ memset(&t->encap, 0, sizeof(t->encap));
+
+ hlen = ip6_encap_hlen(ipencap);
+ if (hlen < 0)
+ return hlen;
+
+ t->encap.type = ipencap->type;
+ t->encap.sport = ipencap->sport;
+ t->encap.dport = ipencap->dport;
+ t->encap.flags = ipencap->flags;
+
+ t->encap_hlen = hlen;
+ t->hlen = t->encap_hlen + t->tun_hlen;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(ip6_tnl_encap_setup);
+
static const struct net_device_ops ip6_tnl_netdev_ops = {
.ndo_init = ip6_tnl_dev_init,
.ndo_uninit = ip6_tnl_dev_uninit,
@@ -1574,6 +1645,11 @@ static const struct net_device_ops ip6_tnl_netdev_ops = {
.ndo_get_iflink = ip6_tnl_get_iflink,
};
+#define IPXIPX_FEATURES (NETIF_F_SG | \
+ NETIF_F_FRAGLIST | \
+ NETIF_F_HIGHDMA | \
+ NETIF_F_GSO_SOFTWARE | \
+ NETIF_F_HW_CSUM)
/**
* ip6_tnl_dev_setup - setup virtual tunnel device
@@ -1585,20 +1661,18 @@ static const struct net_device_ops ip6_tnl_netdev_ops = {
static void ip6_tnl_dev_setup(struct net_device *dev)
{
- struct ip6_tnl *t;
-
dev->netdev_ops = &ip6_tnl_netdev_ops;
dev->destructor = ip6_dev_free;
dev->type = ARPHRD_TUNNEL6;
- dev->hard_header_len = LL_MAX_HEADER + sizeof(struct ipv6hdr);
- dev->mtu = ETH_DATA_LEN - sizeof(struct ipv6hdr);
- t = netdev_priv(dev);
- if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT))
- dev->mtu -= 8;
dev->flags |= IFF_NOARP;
dev->addr_len = sizeof(struct in6_addr);
+ dev->features |= NETIF_F_LLTX;
netif_keep_dst(dev);
+
+ dev->features |= IPXIPX_FEATURES;
+ dev->hw_features |= IPXIPX_FEATURES;
+
/* This perm addr will be used as interface identifier by IPv6 */
dev->addr_assign_type = NET_ADDR_RANDOM;
eth_random_addr(dev->perm_addr);
@@ -1615,6 +1689,7 @@ ip6_tnl_dev_init_gen(struct net_device *dev)
{
struct ip6_tnl *t = netdev_priv(dev);
int ret;
+ int t_hlen;
t->dev = dev;
t->net = dev_net(dev);
@@ -1630,8 +1705,15 @@ ip6_tnl_dev_init_gen(struct net_device *dev)
if (ret)
goto destroy_dst;
- t->hlen = 0;
t->tun_hlen = 0;
+ t->hlen = t->encap_hlen + t->tun_hlen;
+ t_hlen = t->hlen + sizeof(struct ipv6hdr);
+
+ dev->type = ARPHRD_TUNNEL6;
+ dev->hard_header_len = LL_MAX_HEADER + t_hlen;
+ dev->mtu = ETH_DATA_LEN - t_hlen;
+ if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT))
+ dev->mtu -= 8;
return 0;
@@ -1729,13 +1811,55 @@ static void ip6_tnl_netlink_parms(struct nlattr *data[],
parms->proto = nla_get_u8(data[IFLA_IPTUN_PROTO]);
}
+static bool ip6_tnl_netlink_encap_parms(struct nlattr *data[],
+ struct ip_tunnel_encap *ipencap)
+{
+ bool ret = false;
+
+ memset(ipencap, 0, sizeof(*ipencap));
+
+ if (!data)
+ return ret;
+
+ if (data[IFLA_IPTUN_ENCAP_TYPE]) {
+ ret = true;
+ ipencap->type = nla_get_u16(data[IFLA_IPTUN_ENCAP_TYPE]);
+ }
+
+ if (data[IFLA_IPTUN_ENCAP_FLAGS]) {
+ ret = true;
+ ipencap->flags = nla_get_u16(data[IFLA_IPTUN_ENCAP_FLAGS]);
+ }
+
+ if (data[IFLA_IPTUN_ENCAP_SPORT]) {
+ ret = true;
+ ipencap->sport = nla_get_be16(data[IFLA_IPTUN_ENCAP_SPORT]);
+ }
+
+ if (data[IFLA_IPTUN_ENCAP_DPORT]) {
+ ret = true;
+ ipencap->dport = nla_get_be16(data[IFLA_IPTUN_ENCAP_DPORT]);
+ }
+
+ return ret;
+}
+
static int ip6_tnl_newlink(struct net *src_net, struct net_device *dev,
struct nlattr *tb[], struct nlattr *data[])
{
struct net *net = dev_net(dev);
struct ip6_tnl *nt, *t;
+ struct ip_tunnel_encap ipencap;
nt = netdev_priv(dev);
+
+ if (ip6_tnl_netlink_encap_parms(data, &ipencap)) {
+ int err = ip6_tnl_encap_setup(nt, &ipencap);
+
+ if (err < 0)
+ return err;
+ }
+
ip6_tnl_netlink_parms(data, &nt->parms);
t = ip6_tnl_locate(net, &nt->parms, 0);
@@ -1752,10 +1876,17 @@ static int ip6_tnl_changelink(struct net_device *dev, struct nlattr *tb[],
struct __ip6_tnl_parm p;
struct net *net = t->net;
struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id);
+ struct ip_tunnel_encap ipencap;
if (dev == ip6n->fb_tnl_dev)
return -EINVAL;
+ if (ip6_tnl_netlink_encap_parms(data, &ipencap)) {
+ int err = ip6_tnl_encap_setup(t, &ipencap);
+
+ if (err < 0)
+ return err;
+ }
ip6_tnl_netlink_parms(data, &p);
t = ip6_tnl_locate(net, &p, 0);
@@ -1796,6 +1927,14 @@ static size_t ip6_tnl_get_size(const struct net_device *dev)
nla_total_size(4) +
/* IFLA_IPTUN_PROTO */
nla_total_size(1) +
+ /* IFLA_IPTUN_ENCAP_TYPE */
+ nla_total_size(2) +
+ /* IFLA_IPTUN_ENCAP_FLAGS */
+ nla_total_size(2) +
+ /* IFLA_IPTUN_ENCAP_SPORT */
+ nla_total_size(2) +
+ /* IFLA_IPTUN_ENCAP_DPORT */
+ nla_total_size(2) +
0;
}
@@ -1813,6 +1952,17 @@ static int ip6_tnl_fill_info(struct sk_buff *skb, const struct net_device *dev)
nla_put_u32(skb, IFLA_IPTUN_FLAGS, parm->flags) ||
nla_put_u8(skb, IFLA_IPTUN_PROTO, parm->proto))
goto nla_put_failure;
+
+ if (nla_put_u16(skb, IFLA_IPTUN_ENCAP_TYPE,
+ tunnel->encap.type) ||
+ nla_put_be16(skb, IFLA_IPTUN_ENCAP_SPORT,
+ tunnel->encap.sport) ||
+ nla_put_be16(skb, IFLA_IPTUN_ENCAP_DPORT,
+ tunnel->encap.dport) ||
+ nla_put_u16(skb, IFLA_IPTUN_ENCAP_FLAGS,
+ tunnel->encap.flags))
+ goto nla_put_failure;
+
return 0;
nla_put_failure:
@@ -1836,6 +1986,10 @@ static const struct nla_policy ip6_tnl_policy[IFLA_IPTUN_MAX + 1] = {
[IFLA_IPTUN_FLOWINFO] = { .type = NLA_U32 },
[IFLA_IPTUN_FLAGS] = { .type = NLA_U32 },
[IFLA_IPTUN_PROTO] = { .type = NLA_U8 },
+ [IFLA_IPTUN_ENCAP_TYPE] = { .type = NLA_U16 },
+ [IFLA_IPTUN_ENCAP_FLAGS] = { .type = NLA_U16 },
+ [IFLA_IPTUN_ENCAP_SPORT] = { .type = NLA_U16 },
+ [IFLA_IPTUN_ENCAP_DPORT] = { .type = NLA_U16 },
};
static struct rtnl_link_ops ip6_link_ops __read_mostly = {
diff --git a/net/ipv6/netfilter/nf_dup_ipv6.c b/net/ipv6/netfilter/nf_dup_ipv6.c
index 6989c70ae29f..4a84b5ad9ecb 100644
--- a/net/ipv6/netfilter/nf_dup_ipv6.c
+++ b/net/ipv6/netfilter/nf_dup_ipv6.c
@@ -33,6 +33,7 @@ static bool nf_dup_ipv6_route(struct net *net, struct sk_buff *skb,
fl6.daddr = *gw;
fl6.flowlabel = (__force __be32)(((iph->flow_lbl[0] & 0xF) << 16) |
(iph->flow_lbl[1] << 8) | iph->flow_lbl[2]);
+ fl6.flowi6_flags = FLOWI_FLAG_KNOWN_NH;
dst = ip6_route_output(net, NULL, &fl6);
if (dst->error) {
dst_release(dst);
diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c
index a13d8c114ccb..0a5a255277e5 100644
--- a/net/ipv6/sit.c
+++ b/net/ipv6/sit.c
@@ -913,7 +913,7 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb,
goto tx_error;
}
- if (iptunnel_handle_offloads(skb, SKB_GSO_SIT)) {
+ if (iptunnel_handle_offloads(skb, SKB_GSO_IPXIP4)) {
ip_rt_put(rt);
goto tx_error;
}
@@ -1000,7 +1000,7 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
struct ip_tunnel *tunnel = netdev_priv(dev);
const struct iphdr *tiph = &tunnel->parms.iph;
- if (iptunnel_handle_offloads(skb, SKB_GSO_IPIP))
+ if (iptunnel_handle_offloads(skb, SKB_GSO_IPXIP4))
goto tx_error;
skb_set_inner_ipproto(skb, IPPROTO_IPIP);
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 79e33e02f11a..f36c2d076fce 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1721,7 +1721,9 @@ static void get_tcp6_sock(struct seq_file *seq, struct sock *sp, int i)
destp = ntohs(inet->inet_dport);
srcp = ntohs(inet->inet_sport);
- if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
+ if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
+ icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
+ icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
timer_active = 1;
timer_expires = icsk->icsk_timeout;
} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 2ba6a77a8815..f421c9f23c5b 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -617,7 +617,7 @@ int udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
/* if we're overly short, let UDP handle it */
encap_rcv = ACCESS_ONCE(up->encap_rcv);
- if (skb->len > sizeof(struct udphdr) && encap_rcv) {
+ if (encap_rcv) {
int ret;
/* Verify checksum before giving to encap */
@@ -653,12 +653,12 @@ int udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
}
}
- if (rcu_access_pointer(sk->sk_filter)) {
- if (udp_lib_checksum_complete(skb))
- goto csum_error;
- if (sk_filter(sk, skb))
- goto drop;
- }
+ if (rcu_access_pointer(sk->sk_filter) &&
+ udp_lib_checksum_complete(skb))
+ goto csum_error;
+
+ if (sk_filter(sk, skb))
+ goto drop;
udp_csum_pull_header(skb);
if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c
index 5429f6bcf047..ac858c480f2f 100644
--- a/net/ipv6/udp_offload.c
+++ b/net/ipv6/udp_offload.c
@@ -36,19 +36,6 @@ static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb,
if (skb_gso_ok(skb, features | NETIF_F_GSO_ROBUST)) {
/* Packet is from an untrusted source, reset gso_segs. */
- int type = skb_shinfo(skb)->gso_type;
-
- if (unlikely(type & ~(SKB_GSO_UDP |
- SKB_GSO_DODGY |
- SKB_GSO_UDP_TUNNEL |
- SKB_GSO_UDP_TUNNEL_CSUM |
- SKB_GSO_TUNNEL_REMCSUM |
- SKB_GSO_GRE |
- SKB_GSO_GRE_CSUM |
- SKB_GSO_IPIP |
- SKB_GSO_SIT) ||
- !(type & (SKB_GSO_UDP))))
- goto out;
skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(skb->len, mss);
diff --git a/net/irda/ircomm/ircomm_tty.c b/net/irda/ircomm/ircomm_tty.c
index da126ee6d218..873c4b707d6a 100644
--- a/net/irda/ircomm/ircomm_tty.c
+++ b/net/irda/ircomm/ircomm_tty.c
@@ -220,10 +220,11 @@ static int ircomm_tty_startup(struct ircomm_tty_cb *self)
IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return -1;);
/* Check if already open */
- if (test_and_set_bit(ASYNCB_INITIALIZED, &self->port.flags)) {
+ if (tty_port_initialized(&self->port)) {
pr_debug("%s(), already open so break out!\n", __func__);
return 0;
}
+ tty_port_set_initialized(&self->port, 1);
/* Register with IrCOMM */
irda_notify_init(&notify);
@@ -257,7 +258,7 @@ static int ircomm_tty_startup(struct ircomm_tty_cb *self)
return 0;
err:
- clear_bit(ASYNCB_INITIALIZED, &self->port.flags);
+ tty_port_set_initialized(&self->port, 0);
return ret;
}
@@ -280,8 +281,8 @@ static int ircomm_tty_block_til_ready(struct ircomm_tty_cb *self,
* If non-blocking mode is set, or the port is not enabled,
* then make the check up front and then exit.
*/
- if (test_bit(TTY_IO_ERROR, &tty->flags)) {
- port->flags |= ASYNC_NORMAL_ACTIVE;
+ if (tty_io_error(tty)) {
+ tty_port_set_active(port, 1);
return 0;
}
@@ -289,7 +290,7 @@ static int ircomm_tty_block_til_ready(struct ircomm_tty_cb *self,
/* nonblock mode is set */
if (C_BAUD(tty))
tty_port_raise_dtr_rts(port);
- port->flags |= ASYNC_NORMAL_ACTIVE;
+ tty_port_set_active(port, 1);
pr_debug("%s(), O_NONBLOCK requested!\n", __func__);
return 0;
}
@@ -318,13 +319,12 @@ static int ircomm_tty_block_til_ready(struct ircomm_tty_cb *self,
spin_unlock_irqrestore(&port->lock, flags);
while (1) {
- if (C_BAUD(tty) && test_bit(ASYNCB_INITIALIZED, &port->flags))
+ if (C_BAUD(tty) && tty_port_initialized(port))
tty_port_raise_dtr_rts(port);
set_current_state(TASK_INTERRUPTIBLE);
- if (tty_hung_up_p(filp) ||
- !test_bit(ASYNCB_INITIALIZED, &port->flags)) {
+ if (tty_hung_up_p(filp) || !tty_port_initialized(port)) {
retval = (port->flags & ASYNC_HUP_NOTIFY) ?
-EAGAIN : -ERESTARTSYS;
break;
@@ -365,7 +365,7 @@ static int ircomm_tty_block_til_ready(struct ircomm_tty_cb *self,
__FILE__, __LINE__, tty->driver->name, port->count);
if (!retval)
- port->flags |= ASYNC_NORMAL_ACTIVE;
+ tty_port_set_active(port, 1);
return retval;
}
@@ -876,8 +876,9 @@ static void ircomm_tty_shutdown(struct ircomm_tty_cb *self)
IRDA_ASSERT(self != NULL, return;);
IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return;);
- if (!test_and_clear_bit(ASYNCB_INITIALIZED, &self->port.flags))
+ if (!tty_port_initialized(&self->port))
return;
+ tty_port_set_initialized(&self->port, 0);
ircomm_tty_detach_cable(self);
@@ -925,7 +926,6 @@ static void ircomm_tty_hangup(struct tty_struct *tty)
ircomm_tty_shutdown(self);
spin_lock_irqsave(&port->lock, flags);
- port->flags &= ~ASYNC_NORMAL_ACTIVE;
if (port->tty) {
set_bit(TTY_IO_ERROR, &port->tty->flags);
tty_kref_put(port->tty);
@@ -933,6 +933,7 @@ static void ircomm_tty_hangup(struct tty_struct *tty)
port->tty = NULL;
port->count = 0;
spin_unlock_irqrestore(&port->lock, flags);
+ tty_port_set_active(port, 0);
wake_up_interruptible(&port->open_wait);
}
@@ -999,7 +1000,7 @@ void ircomm_tty_check_modem_status(struct ircomm_tty_cb *self)
if (status & IRCOMM_DCE_DELTA_ANY) {
/*wake_up_interruptible(&self->delta_msr_wait);*/
}
- if ((self->port.flags & ASYNC_CHECK_CD) && (status & IRCOMM_DELTA_CD)) {
+ if (tty_port_check_carrier(&self->port) && (status & IRCOMM_DELTA_CD)) {
pr_debug("%s(), ircomm%d CD now %s...\n", __func__ , self->line,
(status & IRCOMM_CD) ? "on" : "off");
@@ -1255,11 +1256,11 @@ static void ircomm_tty_line_info(struct ircomm_tty_cb *self, struct seq_file *m)
seq_printf(m, "%cASYNC_CTS_FLOW", sep);
sep = '|';
}
- if (self->port.flags & ASYNC_CHECK_CD) {
+ if (tty_port_check_carrier(&self->port)) {
seq_printf(m, "%cASYNC_CHECK_CD", sep);
sep = '|';
}
- if (self->port.flags & ASYNC_INITIALIZED) {
+ if (tty_port_initialized(&self->port)) {
seq_printf(m, "%cASYNC_INITIALIZED", sep);
sep = '|';
}
@@ -1267,7 +1268,7 @@ static void ircomm_tty_line_info(struct ircomm_tty_cb *self, struct seq_file *m)
seq_printf(m, "%cASYNC_LOW_LATENCY", sep);
sep = '|';
}
- if (self->port.flags & ASYNC_NORMAL_ACTIVE) {
+ if (tty_port_active(&self->port)) {
seq_printf(m, "%cASYNC_NORMAL_ACTIVE", sep);
sep = '|';
}
diff --git a/net/irda/ircomm/ircomm_tty_attach.c b/net/irda/ircomm/ircomm_tty_attach.c
index 61137f8b5293..0a411019c098 100644
--- a/net/irda/ircomm/ircomm_tty_attach.c
+++ b/net/irda/ircomm/ircomm_tty_attach.c
@@ -968,7 +968,7 @@ static int ircomm_tty_state_ready(struct ircomm_tty_cb *self,
ircomm_tty_next_state(self, IRCOMM_TTY_SEARCH);
ircomm_tty_start_watchdog_timer(self, 3*HZ);
- if (self->port.flags & ASYNC_CHECK_CD) {
+ if (tty_port_check_carrier(&self->port)) {
/* Drop carrier */
self->settings.dce = IRCOMM_DELTA_CD;
ircomm_tty_check_modem_status(self);
diff --git a/net/irda/ircomm/ircomm_tty_ioctl.c b/net/irda/ircomm/ircomm_tty_ioctl.c
index d3687aaa23de..d4fdf8f7b471 100644
--- a/net/irda/ircomm/ircomm_tty_ioctl.c
+++ b/net/irda/ircomm/ircomm_tty_ioctl.c
@@ -86,21 +86,17 @@ static void ircomm_tty_change_speed(struct ircomm_tty_cb *self,
ircomm_param_request(self, IRCOMM_DATA_RATE, FALSE);
/* CTS flow control flag and modem status interrupts */
+ tty_port_set_cts_flow(&self->port, cflag & CRTSCTS);
if (cflag & CRTSCTS) {
- self->port.flags |= ASYNC_CTS_FLOW;
self->settings.flow_control |= IRCOMM_RTS_CTS_IN;
/* This got me. Bummer. Jean II */
if (self->service_type == IRCOMM_3_WIRE_RAW)
net_warn_ratelimited("%s(), enabling RTS/CTS on link that doesn't support it (3-wire-raw)\n",
__func__);
} else {
- self->port.flags &= ~ASYNC_CTS_FLOW;
self->settings.flow_control &= ~IRCOMM_RTS_CTS_IN;
}
- if (cflag & CLOCAL)
- self->port.flags &= ~ASYNC_CHECK_CD;
- else
- self->port.flags |= ASYNC_CHECK_CD;
+ tty_port_set_check_carrier(&self->port, ~cflag & CLOCAL);
#if 0
/*
* Set up parity check flag
@@ -166,7 +162,7 @@ void ircomm_tty_set_termios(struct tty_struct *tty,
/* Handle transition away from B0 status */
if (!(old_termios->c_cflag & CBAUD) && (cflag & CBAUD)) {
self->settings.dte |= IRCOMM_DTR;
- if (!C_CRTSCTS(tty) || !test_bit(TTY_THROTTLED, &tty->flags))
+ if (!C_CRTSCTS(tty) || !tty_throttled(tty))
self->settings.dte |= IRCOMM_RTS;
ircomm_param_request(self, IRCOMM_DTE, TRUE);
}
@@ -190,7 +186,7 @@ int ircomm_tty_tiocmget(struct tty_struct *tty)
struct ircomm_tty_cb *self = (struct ircomm_tty_cb *) tty->driver_data;
unsigned int result;
- if (tty->flags & (1 << TTY_IO_ERROR))
+ if (tty_io_error(tty))
return -EIO;
result = ((self->settings.dte & IRCOMM_RTS) ? TIOCM_RTS : 0)
@@ -213,7 +209,7 @@ int ircomm_tty_tiocmset(struct tty_struct *tty,
{
struct ircomm_tty_cb *self = (struct ircomm_tty_cb *) tty->driver_data;
- if (tty->flags & (1 << TTY_IO_ERROR))
+ if (tty_io_error(tty))
return -EIO;
IRDA_ASSERT(self != NULL, return -1;);
@@ -328,7 +324,7 @@ static int ircomm_tty_set_serial_info(struct ircomm_tty_cb *self,
check_and_exit:
- if (self->flags & ASYNC_INITIALIZED) {
+ if (tty_port_initialized(self)) {
if (((old_state.flags & ASYNC_SPD_MASK) !=
(self->flags & ASYNC_SPD_MASK)) ||
(old_driver.custom_divisor != driver->custom_divisor)) {
@@ -362,7 +358,7 @@ int ircomm_tty_ioctl(struct tty_struct *tty,
if ((cmd != TIOCGSERIAL) && (cmd != TIOCSSERIAL) &&
(cmd != TIOCSERCONFIG) && (cmd != TIOCSERGSTRUCT) &&
(cmd != TIOCMIWAIT) && (cmd != TIOCGICOUNT)) {
- if (tty->flags & (1 << TTY_IO_ERROR))
+ if (tty_io_error(tty))
return -EIO;
}
diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c
index 40662d73204f..0b68ba730a06 100644
--- a/net/kcm/kcmsock.c
+++ b/net/kcm/kcmsock.c
@@ -1483,7 +1483,7 @@ static ssize_t kcm_splice_read(struct socket *sock, loff_t *ppos,
long timeo;
struct kcm_rx_msg *rxm;
int err = 0;
- size_t copied;
+ ssize_t copied;
struct sk_buff *skb;
/* Only support splice for SOCKSEQPACKET */
diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c
index 6edfa9980314..1e40dacaa137 100644
--- a/net/l2tp/l2tp_core.c
+++ b/net/l2tp/l2tp_core.c
@@ -1581,7 +1581,7 @@ int l2tp_tunnel_create(struct net *net, int fd, int version, u32 tunnel_id, u32
/* Mark socket as an encapsulation socket. See net/ipv4/udp.c */
tunnel->encap = encap;
if (encap == L2TP_ENCAPTYPE_UDP) {
- struct udp_tunnel_sock_cfg udp_cfg;
+ struct udp_tunnel_sock_cfg udp_cfg = { };
udp_cfg.sk_user_data = tunnel;
udp_cfg.encap_type = UDP_ENCAP_L2TPINUDP;
diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c
index c6f5df1bed12..6c54e03fe9c1 100644
--- a/net/l2tp/l2tp_ip6.c
+++ b/net/l2tp/l2tp_ip6.c
@@ -128,6 +128,7 @@ static inline struct sock *l2tp_ip6_bind_lookup(struct net *net,
*/
static int l2tp_ip6_recv(struct sk_buff *skb)
{
+ struct net *net = dev_net(skb->dev);
struct sock *sk;
u32 session_id;
u32 tunnel_id;
@@ -154,7 +155,7 @@ static int l2tp_ip6_recv(struct sk_buff *skb)
}
/* Ok, this is a data packet. Lookup the session. */
- session = l2tp_session_find(&init_net, NULL, session_id);
+ session = l2tp_session_find(net, NULL, session_id);
if (session == NULL)
goto discard;
@@ -188,14 +189,14 @@ pass_up:
goto discard;
tunnel_id = ntohl(*(__be32 *) &skb->data[4]);
- tunnel = l2tp_tunnel_find(&init_net, tunnel_id);
+ tunnel = l2tp_tunnel_find(net, tunnel_id);
if (tunnel != NULL)
sk = tunnel->sock;
else {
struct ipv6hdr *iph = ipv6_hdr(skb);
read_lock_bh(&l2tp_ip6_lock);
- sk = __l2tp_ip6_bind_lookup(&init_net, &iph->daddr,
+ sk = __l2tp_ip6_bind_lookup(net, &iph->daddr,
0, tunnel_id);
read_unlock_bh(&l2tp_ip6_lock);
}
@@ -263,6 +264,7 @@ static int l2tp_ip6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len)
struct inet_sock *inet = inet_sk(sk);
struct ipv6_pinfo *np = inet6_sk(sk);
struct sockaddr_l2tpip6 *addr = (struct sockaddr_l2tpip6 *) uaddr;
+ struct net *net = sock_net(sk);
__be32 v4addr = 0;
int addr_type;
int err;
@@ -286,7 +288,7 @@ static int l2tp_ip6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len)
err = -EADDRINUSE;
read_lock_bh(&l2tp_ip6_lock);
- if (__l2tp_ip6_bind_lookup(&init_net, &addr->l2tp_addr,
+ if (__l2tp_ip6_bind_lookup(net, &addr->l2tp_addr,
sk->sk_bound_dev_if, addr->l2tp_conn_id))
goto out_in_use;
read_unlock_bh(&l2tp_ip6_lock);
@@ -456,7 +458,7 @@ static int l2tp_ip6_backlog_recv(struct sock *sk, struct sk_buff *skb)
return 0;
drop:
- IP_INC_STATS(&init_net, IPSTATS_MIB_INDISCARDS);
+ IP_INC_STATS(sock_net(sk), IPSTATS_MIB_INDISCARDS);
kfree_skb(skb);
return -1;
}
diff --git a/net/lapb/lapb_in.c b/net/lapb/lapb_in.c
index 5dba899131b3..182470847fcf 100644
--- a/net/lapb/lapb_in.c
+++ b/net/lapb/lapb_in.c
@@ -444,10 +444,9 @@ static void lapb_state3_machine(struct lapb_cb *lapb, struct sk_buff *skb,
break;
case LAPB_FRMR:
- lapb_dbg(1, "(%p) S3 RX FRMR(%d) %02X %02X %02X %02X %02X\n",
+ lapb_dbg(1, "(%p) S3 RX FRMR(%d) %5ph\n",
lapb->dev, frame->pf,
- skb->data[0], skb->data[1], skb->data[2],
- skb->data[3], skb->data[4]);
+ skb->data);
lapb_establish_data_link(lapb);
lapb_dbg(0, "(%p) S3 -> S1\n", lapb->dev);
lapb_requeue_frames(lapb);
diff --git a/net/lapb/lapb_out.c b/net/lapb/lapb_out.c
index ba4d015bd1a6..482c94d9d958 100644
--- a/net/lapb/lapb_out.c
+++ b/net/lapb/lapb_out.c
@@ -148,9 +148,7 @@ void lapb_transmit_buffer(struct lapb_cb *lapb, struct sk_buff *skb, int type)
}
}
- lapb_dbg(2, "(%p) S%d TX %02X %02X %02X\n",
- lapb->dev, lapb->state,
- skb->data[0], skb->data[1], skb->data[2]);
+ lapb_dbg(2, "(%p) S%d TX %3ph\n", lapb->dev, lapb->state, skb->data);
if (!lapb_data_transmit(lapb, skb))
kfree_skb(skb);
diff --git a/net/lapb/lapb_subr.c b/net/lapb/lapb_subr.c
index 9d0a426eccbb..3c1914df641f 100644
--- a/net/lapb/lapb_subr.c
+++ b/net/lapb/lapb_subr.c
@@ -113,9 +113,7 @@ int lapb_decode(struct lapb_cb *lapb, struct sk_buff *skb,
{
frame->type = LAPB_ILLEGAL;
- lapb_dbg(2, "(%p) S%d RX %02X %02X %02X\n",
- lapb->dev, lapb->state,
- skb->data[0], skb->data[1], skb->data[2]);
+ lapb_dbg(2, "(%p) S%d RX %3ph\n", lapb->dev, lapb->state, skb->data);
/* We always need to look at 2 bytes, sometimes we need
* to look at 3 and those cases are handled below.
@@ -284,10 +282,9 @@ void lapb_transmit_frmr(struct lapb_cb *lapb)
dptr++;
*dptr++ = lapb->frmr_type;
- lapb_dbg(1, "(%p) S%d TX FRMR %02X %02X %02X %02X %02X\n",
+ lapb_dbg(1, "(%p) S%d TX FRMR %5ph\n",
lapb->dev, lapb->state,
- skb->data[1], skb->data[2], skb->data[3],
- skb->data[4], skb->data[5]);
+ &skb->data[1]);
} else {
dptr = skb_put(skb, 4);
*dptr++ = LAPB_FRMR;
@@ -299,9 +296,8 @@ void lapb_transmit_frmr(struct lapb_cb *lapb)
dptr++;
*dptr++ = lapb->frmr_type;
- lapb_dbg(1, "(%p) S%d TX FRMR %02X %02X %02X\n",
- lapb->dev, lapb->state, skb->data[1],
- skb->data[2], skb->data[3]);
+ lapb_dbg(1, "(%p) S%d TX FRMR %3ph\n",
+ lapb->dev, lapb->state, &skb->data[1]);
}
lapb_transmit_buffer(lapb, skb, LAPB_RESPONSE);
diff --git a/net/mac80211/mesh.c b/net/mac80211/mesh.c
index 4c6404e1ad6e..21b1fdf5d01d 100644
--- a/net/mac80211/mesh.c
+++ b/net/mac80211/mesh.c
@@ -161,6 +161,10 @@ void mesh_sta_cleanup(struct sta_info *sta)
del_timer_sync(&sta->mesh->plink_timer);
}
+ /* make sure no readers can access nexthop sta from here on */
+ mesh_path_flush_by_nexthop(sta);
+ synchronize_net();
+
if (changed)
ieee80211_mbss_info_change_notify(sdata, changed);
}
diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h
index c8b8ccc370eb..78b0ef32dddd 100644
--- a/net/mac80211/sta_info.h
+++ b/net/mac80211/sta_info.h
@@ -280,7 +280,7 @@ struct ieee80211_fast_tx {
u8 sa_offs, da_offs, pn_offs;
u8 band;
u8 hdr[30 + 2 + IEEE80211_FAST_XMIT_MAX_IV +
- sizeof(rfc1042_header)];
+ sizeof(rfc1042_header)] __aligned(2);
struct rcu_head rcu_head;
};
diff --git a/net/mpls/mpls_gso.c b/net/mpls/mpls_gso.c
index bbcf60465e5c..2055e57ed1c3 100644
--- a/net/mpls/mpls_gso.c
+++ b/net/mpls/mpls_gso.c
@@ -26,15 +26,6 @@ static struct sk_buff *mpls_gso_segment(struct sk_buff *skb,
netdev_features_t mpls_features;
__be16 mpls_protocol;
- if (unlikely(skb_shinfo(skb)->gso_type &
- ~(SKB_GSO_TCPV4 |
- SKB_GSO_TCPV6 |
- SKB_GSO_UDP |
- SKB_GSO_DODGY |
- SKB_GSO_TCP_FIXEDID |
- SKB_GSO_TCP_ECN)))
- goto out;
-
/* Setup inner SKB. */
mpls_protocol = skb->protocol;
skb->protocol = skb->inner_protocol;
@@ -57,7 +48,7 @@ static struct sk_buff *mpls_gso_segment(struct sk_buff *skb,
* skb_mac_gso_segment(), an indirect caller of this function.
*/
__skb_pull(skb, skb->data - skb_mac_header(skb));
-out:
+
return segs;
}
diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c
index 2cb3c626cd43..096a45103f14 100644
--- a/net/netfilter/ipvs/ip_vs_conn.c
+++ b/net/netfilter/ipvs/ip_vs_conn.c
@@ -762,7 +762,7 @@ static int expire_quiescent_template(struct netns_ipvs *ipvs,
* If available, return 1, otherwise invalidate this connection
* template and return 0.
*/
-int ip_vs_check_template(struct ip_vs_conn *ct)
+int ip_vs_check_template(struct ip_vs_conn *ct, struct ip_vs_dest *cdest)
{
struct ip_vs_dest *dest = ct->dest;
struct netns_ipvs *ipvs = ct->ipvs;
@@ -772,7 +772,8 @@ int ip_vs_check_template(struct ip_vs_conn *ct)
*/
if ((dest == NULL) ||
!(dest->flags & IP_VS_DEST_F_AVAILABLE) ||
- expire_quiescent_template(ipvs, dest)) {
+ expire_quiescent_template(ipvs, dest) ||
+ (cdest && (dest != cdest))) {
IP_VS_DBG_BUF(9, "check_template: dest not available for "
"protocol %s s:%s:%d v:%s:%d "
"-> d:%s:%d\n",
diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c
index 1207f20d24e4..2c1b498a7a27 100644
--- a/net/netfilter/ipvs/ip_vs_core.c
+++ b/net/netfilter/ipvs/ip_vs_core.c
@@ -321,7 +321,7 @@ ip_vs_sched_persist(struct ip_vs_service *svc,
/* Check if a template already exists */
ct = ip_vs_ct_in_get(&param);
- if (!ct || !ip_vs_check_template(ct)) {
+ if (!ct || !ip_vs_check_template(ct, NULL)) {
struct ip_vs_scheduler *sched;
/*
@@ -1154,7 +1154,8 @@ struct ip_vs_conn *ip_vs_new_conn_out(struct ip_vs_service *svc,
vport, &param) < 0)
return NULL;
ct = ip_vs_ct_in_get(&param);
- if (!ct) {
+ /* check if template exists and points to the same dest */
+ if (!ct || !ip_vs_check_template(ct, dest)) {
ct = ip_vs_conn_new(&param, dest->af, daddr, dport,
IP_VS_CONN_F_TEMPLATE, dest, 0);
if (!ct) {
diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c
index 6d19d2eeaa60..01d3d894de46 100644
--- a/net/netfilter/ipvs/ip_vs_xmit.c
+++ b/net/netfilter/ipvs/ip_vs_xmit.c
@@ -932,17 +932,14 @@ error:
static inline int __tun_gso_type_mask(int encaps_af, int orig_af)
{
- if (encaps_af == AF_INET) {
- if (orig_af == AF_INET)
- return SKB_GSO_IPIP;
-
- return SKB_GSO_SIT;
+ switch (encaps_af) {
+ case AF_INET:
+ return SKB_GSO_IPXIP4;
+ case AF_INET6:
+ return SKB_GSO_IPXIP6;
+ default:
+ return 0;
}
-
- /* GSO: we need to provide proper SKB_GSO_ value for IPv6:
- * SKB_GSO_SIT/IPV6
- */
- return 0;
}
/*
diff --git a/net/netfilter/nf_conntrack_ftp.c b/net/netfilter/nf_conntrack_ftp.c
index 883c691ec8d0..19efeba02abb 100644
--- a/net/netfilter/nf_conntrack_ftp.c
+++ b/net/netfilter/nf_conntrack_ftp.c
@@ -632,6 +632,7 @@ static int __init nf_conntrack_ftp_init(void)
if (ret) {
pr_err("failed to register helper for pf: %d port: %d\n",
ftp[i][j].tuple.src.l3num, ports[i]);
+ ports_c = i;
nf_conntrack_ftp_fini();
return ret;
}
diff --git a/net/netfilter/nf_conntrack_helper.c b/net/netfilter/nf_conntrack_helper.c
index f703adb7e5f7..196cb39649e1 100644
--- a/net/netfilter/nf_conntrack_helper.c
+++ b/net/netfilter/nf_conntrack_helper.c
@@ -361,9 +361,10 @@ EXPORT_SYMBOL_GPL(nf_ct_helper_log);
int nf_conntrack_helper_register(struct nf_conntrack_helper *me)
{
- int ret = 0;
- struct nf_conntrack_helper *cur;
+ struct nf_conntrack_tuple_mask mask = { .src.u.all = htons(0xFFFF) };
unsigned int h = helper_hash(&me->tuple);
+ struct nf_conntrack_helper *cur;
+ int ret = 0;
BUG_ON(me->expect_policy == NULL);
BUG_ON(me->expect_class_max >= NF_CT_MAX_EXPECT_CLASSES);
@@ -371,9 +372,7 @@ int nf_conntrack_helper_register(struct nf_conntrack_helper *me)
mutex_lock(&nf_ct_helper_mutex);
hlist_for_each_entry(cur, &nf_ct_helper_hash[h], hnode) {
- if (strncmp(cur->name, me->name, NF_CT_HELPER_NAME_LEN) == 0 &&
- cur->tuple.src.l3num == me->tuple.src.l3num &&
- cur->tuple.dst.protonum == me->tuple.dst.protonum) {
+ if (nf_ct_tuple_src_mask_cmp(&cur->tuple, &me->tuple, &mask)) {
ret = -EEXIST;
goto out;
}
diff --git a/net/netfilter/nf_conntrack_irc.c b/net/netfilter/nf_conntrack_irc.c
index 8b6da2719600..f97ac61d2536 100644
--- a/net/netfilter/nf_conntrack_irc.c
+++ b/net/netfilter/nf_conntrack_irc.c
@@ -271,6 +271,7 @@ static int __init nf_conntrack_irc_init(void)
if (ret) {
pr_err("failed to register helper for pf: %u port: %u\n",
irc[i].tuple.src.l3num, ports[i]);
+ ports_c = i;
nf_conntrack_irc_fini();
return ret;
}
diff --git a/net/netfilter/nf_conntrack_sane.c b/net/netfilter/nf_conntrack_sane.c
index 7523a575f6d1..3fcbaab83b3d 100644
--- a/net/netfilter/nf_conntrack_sane.c
+++ b/net/netfilter/nf_conntrack_sane.c
@@ -223,6 +223,7 @@ static int __init nf_conntrack_sane_init(void)
if (ret) {
pr_err("failed to register helper for pf: %d port: %d\n",
sane[i][j].tuple.src.l3num, ports[i]);
+ ports_c = i;
nf_conntrack_sane_fini();
return ret;
}
diff --git a/net/netfilter/nf_conntrack_sip.c b/net/netfilter/nf_conntrack_sip.c
index 3e06402739e0..f72ba5587588 100644
--- a/net/netfilter/nf_conntrack_sip.c
+++ b/net/netfilter/nf_conntrack_sip.c
@@ -1669,6 +1669,7 @@ static int __init nf_conntrack_sip_init(void)
if (ret) {
pr_err("failed to register helper for pf: %u port: %u\n",
sip[i][j].tuple.src.l3num, ports[i]);
+ ports_c = i;
nf_conntrack_sip_fini();
return ret;
}
diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c
index f87e84ebcec3..c026c472ea80 100644
--- a/net/netfilter/nf_conntrack_standalone.c
+++ b/net/netfilter/nf_conntrack_standalone.c
@@ -487,8 +487,6 @@ static struct ctl_table nf_ct_sysctl_table[] = {
{ }
};
-#define NET_NF_CONNTRACK_MAX 2089
-
static struct ctl_table nf_ct_netfilter_table[] = {
{
.procname = "nf_conntrack_max",
diff --git a/net/netfilter/nf_conntrack_tftp.c b/net/netfilter/nf_conntrack_tftp.c
index 36f964066461..2e65b5430fba 100644
--- a/net/netfilter/nf_conntrack_tftp.c
+++ b/net/netfilter/nf_conntrack_tftp.c
@@ -142,6 +142,7 @@ static int __init nf_conntrack_tftp_init(void)
if (ret) {
pr_err("failed to register helper for pf: %u port: %u\n",
tftp[i][j].tuple.src.l3num, ports[i]);
+ ports_c = i;
nf_conntrack_tftp_fini();
return ret;
}
diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c
index 5baa8e24e6ac..b19ad20a705c 100644
--- a/net/netfilter/nf_queue.c
+++ b/net/netfilter/nf_queue.c
@@ -26,23 +26,21 @@
* Once the queue is registered it must reinject all packets it
* receives, no matter what.
*/
-static const struct nf_queue_handler __rcu *queue_handler __read_mostly;
/* return EBUSY when somebody else is registered, return EEXIST if the
* same handler is registered, return 0 in case of success. */
-void nf_register_queue_handler(const struct nf_queue_handler *qh)
+void nf_register_queue_handler(struct net *net, const struct nf_queue_handler *qh)
{
/* should never happen, we only have one queueing backend in kernel */
- WARN_ON(rcu_access_pointer(queue_handler));
- rcu_assign_pointer(queue_handler, qh);
+ WARN_ON(rcu_access_pointer(net->nf.queue_handler));
+ rcu_assign_pointer(net->nf.queue_handler, qh);
}
EXPORT_SYMBOL(nf_register_queue_handler);
/* The caller must flush their queue before this */
-void nf_unregister_queue_handler(void)
+void nf_unregister_queue_handler(struct net *net)
{
- RCU_INIT_POINTER(queue_handler, NULL);
- synchronize_rcu();
+ RCU_INIT_POINTER(net->nf.queue_handler, NULL);
}
EXPORT_SYMBOL(nf_unregister_queue_handler);
@@ -103,7 +101,7 @@ void nf_queue_nf_hook_drop(struct net *net, struct nf_hook_ops *ops)
const struct nf_queue_handler *qh;
rcu_read_lock();
- qh = rcu_dereference(queue_handler);
+ qh = rcu_dereference(net->nf.queue_handler);
if (qh)
qh->nf_hook_drop(net, ops);
rcu_read_unlock();
@@ -122,9 +120,10 @@ int nf_queue(struct sk_buff *skb,
struct nf_queue_entry *entry = NULL;
const struct nf_afinfo *afinfo;
const struct nf_queue_handler *qh;
+ struct net *net = state->net;
/* QUEUE == DROP if no one is waiting, to be safe. */
- qh = rcu_dereference(queue_handler);
+ qh = rcu_dereference(net->nf.queue_handler);
if (!qh) {
status = -ESRCH;
goto err;
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 4d292b933b5c..7b7aa871a174 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -2647,6 +2647,8 @@ static int nf_tables_getset(struct net *net, struct sock *nlsk,
/* Only accept unspec with dump */
if (nfmsg->nfgen_family == NFPROTO_UNSPEC)
return -EAFNOSUPPORT;
+ if (!nla[NFTA_SET_TABLE])
+ return -EINVAL;
set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_NAME]);
if (IS_ERR(set))
diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c
index aa93877ab6e2..5d36a0926b4a 100644
--- a/net/netfilter/nfnetlink_queue.c
+++ b/net/netfilter/nfnetlink_queue.c
@@ -557,7 +557,7 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue,
if (entskb->tstamp.tv64) {
struct nfqnl_msg_packet_timestamp ts;
- struct timespec64 kts = ktime_to_timespec64(skb->tstamp);
+ struct timespec64 kts = ktime_to_timespec64(entskb->tstamp);
ts.sec = cpu_to_be64(kts.tv_sec);
ts.usec = cpu_to_be64(kts.tv_nsec / NSEC_PER_USEC);
@@ -1482,21 +1482,29 @@ static int __net_init nfnl_queue_net_init(struct net *net)
net->nf.proc_netfilter, &nfqnl_file_ops))
return -ENOMEM;
#endif
+ nf_register_queue_handler(net, &nfqh);
return 0;
}
static void __net_exit nfnl_queue_net_exit(struct net *net)
{
+ nf_unregister_queue_handler(net);
#ifdef CONFIG_PROC_FS
remove_proc_entry("nfnetlink_queue", net->nf.proc_netfilter);
#endif
}
+static void nfnl_queue_net_exit_batch(struct list_head *net_exit_list)
+{
+ synchronize_rcu();
+}
+
static struct pernet_operations nfnl_queue_net_ops = {
- .init = nfnl_queue_net_init,
- .exit = nfnl_queue_net_exit,
- .id = &nfnl_queue_net_id,
- .size = sizeof(struct nfnl_queue_net),
+ .init = nfnl_queue_net_init,
+ .exit = nfnl_queue_net_exit,
+ .exit_batch = nfnl_queue_net_exit_batch,
+ .id = &nfnl_queue_net_id,
+ .size = sizeof(struct nfnl_queue_net),
};
static int __init nfnetlink_queue_init(void)
@@ -1517,7 +1525,6 @@ static int __init nfnetlink_queue_init(void)
}
register_netdevice_notifier(&nfqnl_dev_notifier);
- nf_register_queue_handler(&nfqh);
return status;
cleanup_netlink_notifier:
@@ -1529,7 +1536,6 @@ out:
static void __exit nfnetlink_queue_fini(void)
{
- nf_unregister_queue_handler();
unregister_netdevice_notifier(&nfqnl_dev_notifier);
nfnetlink_subsys_unregister(&nfqnl_subsys);
netlink_unregister_notifier(&nfqnl_rtnl_notifier);
diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c
index c69c892231d7..2675d580c490 100644
--- a/net/netfilter/x_tables.c
+++ b/net/netfilter/x_tables.c
@@ -612,7 +612,7 @@ int xt_compat_check_entry_offsets(const void *base, const char *elems,
return -EINVAL;
if (strcmp(t->u.user.name, XT_STANDARD_TARGET) == 0 &&
- target_offset + sizeof(struct compat_xt_standard_target) != next_offset)
+ COMPAT_XT_ALIGN(target_offset + sizeof(struct compat_xt_standard_target)) != next_offset)
return -EINVAL;
/* compat_xt_entry match has less strict aligment requirements,
@@ -694,7 +694,7 @@ int xt_check_entry_offsets(const void *base,
return -EINVAL;
if (strcmp(t->u.user.name, XT_STANDARD_TARGET) == 0 &&
- target_offset + sizeof(struct xt_standard_target) != next_offset)
+ XT_ALIGN(target_offset + sizeof(struct xt_standard_target)) != next_offset)
return -EINVAL;
return xt_check_entry_match(elems, base + target_offset,
diff --git a/net/netlabel/netlabel_kapi.c b/net/netlabel/netlabel_kapi.c
index 28cddc85b700..1325776daa27 100644
--- a/net/netlabel/netlabel_kapi.c
+++ b/net/netlabel/netlabel_kapi.c
@@ -677,7 +677,7 @@ int netlbl_catmap_setrng(struct netlbl_lsm_catmap **catmap,
u32 spot = start;
while (rc == 0 && spot <= end) {
- if (((spot & (BITS_PER_LONG - 1)) != 0) &&
+ if (((spot & (BITS_PER_LONG - 1)) == 0) &&
((end - spot) > BITS_PER_LONG)) {
rc = netlbl_catmap_setlong(catmap,
spot,
diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
index 879185fe183f..9a3eb7a0ebf4 100644
--- a/net/openvswitch/actions.c
+++ b/net/openvswitch/actions.c
@@ -137,11 +137,23 @@ static bool is_flow_key_valid(const struct sw_flow_key *key)
return !!key->eth.type;
}
+static void update_ethertype(struct sk_buff *skb, struct ethhdr *hdr,
+ __be16 ethertype)
+{
+ if (skb->ip_summed == CHECKSUM_COMPLETE) {
+ __be16 diff[] = { ~(hdr->h_proto), ethertype };
+
+ skb->csum = ~csum_partial((char *)diff, sizeof(diff),
+ ~skb->csum);
+ }
+
+ hdr->h_proto = ethertype;
+}
+
static int push_mpls(struct sk_buff *skb, struct sw_flow_key *key,
const struct ovs_action_push_mpls *mpls)
{
__be32 *new_mpls_lse;
- struct ethhdr *hdr;
/* Networking stack do not allow simultaneous Tunnel and MPLS GSO. */
if (skb->encapsulation)
@@ -160,9 +172,7 @@ static int push_mpls(struct sk_buff *skb, struct sw_flow_key *key,
skb_postpush_rcsum(skb, new_mpls_lse, MPLS_HLEN);
- hdr = eth_hdr(skb);
- hdr->h_proto = mpls->mpls_ethertype;
-
+ update_ethertype(skb, eth_hdr(skb), mpls->mpls_ethertype);
if (!skb->inner_protocol)
skb_set_inner_protocol(skb, skb->protocol);
skb->protocol = mpls->mpls_ethertype;
@@ -193,7 +203,7 @@ static int pop_mpls(struct sk_buff *skb, struct sw_flow_key *key,
* field correctly in the presence of VLAN tags.
*/
hdr = (struct ethhdr *)(skb_mpls_header(skb) - ETH_HLEN);
- hdr->h_proto = ethertype;
+ update_ethertype(skb, hdr, ethertype);
if (eth_p_mpls(skb->protocol))
skb->protocol = ethertype;
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 4040eb92d9c9..9bff6ef16fa7 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -93,6 +93,7 @@
#include <net/inet_common.h>
#endif
#include <linux/bpf.h>
+#include <net/compat.h>
#include "internal.h"
@@ -3940,6 +3941,27 @@ static int packet_getsockopt(struct socket *sock, int level, int optname,
}
+#ifdef CONFIG_COMPAT
+static int compat_packet_setsockopt(struct socket *sock, int level, int optname,
+ char __user *optval, unsigned int optlen)
+{
+ struct packet_sock *po = pkt_sk(sock->sk);
+
+ if (level != SOL_PACKET)
+ return -ENOPROTOOPT;
+
+ if (optname == PACKET_FANOUT_DATA &&
+ po->fanout && po->fanout->type == PACKET_FANOUT_CBPF) {
+ optval = (char __user *)get_compat_bpf_fprog(optval);
+ if (!optval)
+ return -EFAULT;
+ optlen = sizeof(struct sock_fprog);
+ }
+
+ return packet_setsockopt(sock, level, optname, optval, optlen);
+}
+#endif
+
static int packet_notifier(struct notifier_block *this,
unsigned long msg, void *ptr)
{
@@ -4416,6 +4438,9 @@ static const struct proto_ops packet_ops = {
.shutdown = sock_no_shutdown,
.setsockopt = packet_setsockopt,
.getsockopt = packet_getsockopt,
+#ifdef CONFIG_COMPAT
+ .compat_setsockopt = compat_packet_setsockopt,
+#endif
.sendmsg = packet_sendmsg,
.recvmsg = packet_recvmsg,
.mmap = packet_mmap,
diff --git a/net/rds/ib_frmr.c b/net/rds/ib_frmr.c
index 93ff038ea9d1..d921adc62765 100644
--- a/net/rds/ib_frmr.c
+++ b/net/rds/ib_frmr.c
@@ -111,7 +111,7 @@ static int rds_ib_post_reg_frmr(struct rds_ib_mr *ibmr)
cpu_relax();
}
- ret = ib_map_mr_sg_zbva(frmr->mr, ibmr->sg, ibmr->sg_len, PAGE_SIZE);
+ ret = ib_map_mr_sg_zbva(frmr->mr, ibmr->sg, ibmr->sg_len, 0, PAGE_SIZE);
if (unlikely(ret != ibmr->sg_len))
return ret < 0 ? ret : -EINVAL;
diff --git a/net/rds/rds.h b/net/rds/rds.h
index 80256b08eac0..387df5f32e49 100644
--- a/net/rds/rds.h
+++ b/net/rds/rds.h
@@ -74,6 +74,7 @@ enum {
RDS_CONN_CONNECTING,
RDS_CONN_DISCONNECTING,
RDS_CONN_UP,
+ RDS_CONN_RESETTING,
RDS_CONN_ERROR,
};
@@ -813,6 +814,7 @@ void rds_connect_worker(struct work_struct *);
void rds_shutdown_worker(struct work_struct *);
void rds_send_worker(struct work_struct *);
void rds_recv_worker(struct work_struct *);
+void rds_connect_path_complete(struct rds_connection *conn, int curr);
void rds_connect_complete(struct rds_connection *conn);
/* transport.c */
diff --git a/net/rds/recv.c b/net/rds/recv.c
index c0be1ecd11c9..8413f6c99e13 100644
--- a/net/rds/recv.c
+++ b/net/rds/recv.c
@@ -561,5 +561,7 @@ void rds_inc_info_copy(struct rds_incoming *inc,
minfo.fport = inc->i_hdr.h_dport;
}
+ minfo.flags = 0;
+
rds_info_copy(iter, &minfo, sizeof(minfo));
}
diff --git a/net/rds/send.c b/net/rds/send.c
index c9cdb358ea88..b1962f8e30f7 100644
--- a/net/rds/send.c
+++ b/net/rds/send.c
@@ -99,6 +99,7 @@ void rds_send_reset(struct rds_connection *conn)
list_splice_init(&conn->c_retrans, &conn->c_send_queue);
spin_unlock_irqrestore(&conn->c_lock, flags);
}
+EXPORT_SYMBOL_GPL(rds_send_reset);
static int acquire_in_xmit(struct rds_connection *conn)
{
diff --git a/net/rds/tcp.c b/net/rds/tcp.c
index 86187dad1440..74ee126a6fe6 100644
--- a/net/rds/tcp.c
+++ b/net/rds/tcp.c
@@ -126,9 +126,81 @@ void rds_tcp_restore_callbacks(struct socket *sock,
}
/*
- * This is the only path that sets tc->t_sock. Send and receive trust that
- * it is set. The RDS_CONN_UP bit protects those paths from being
- * called while it isn't set.
+ * rds_tcp_reset_callbacks() switches the to the new sock and
+ * returns the existing tc->t_sock.
+ *
+ * The only functions that set tc->t_sock are rds_tcp_set_callbacks
+ * and rds_tcp_reset_callbacks. Send and receive trust that
+ * it is set. The absence of RDS_CONN_UP bit protects those paths
+ * from being called while it isn't set.
+ */
+void rds_tcp_reset_callbacks(struct socket *sock,
+ struct rds_connection *conn)
+{
+ struct rds_tcp_connection *tc = conn->c_transport_data;
+ struct socket *osock = tc->t_sock;
+
+ if (!osock)
+ goto newsock;
+
+ /* Need to resolve a duelling SYN between peers.
+ * We have an outstanding SYN to this peer, which may
+ * potentially have transitioned to the RDS_CONN_UP state,
+ * so we must quiesce any send threads before resetting
+ * c_transport_data. We quiesce these threads by setting
+ * c_state to something other than RDS_CONN_UP, and then
+ * waiting for any existing threads in rds_send_xmit to
+ * complete release_in_xmit(). (Subsequent threads entering
+ * rds_send_xmit() will bail on !rds_conn_up().
+ *
+ * However an incoming syn-ack at this point would end up
+ * marking the conn as RDS_CONN_UP, and would again permit
+ * rds_send_xmi() threads through, so ideally we would
+ * synchronize on RDS_CONN_UP after lock_sock(), but cannot
+ * do that: waiting on !RDS_IN_XMIT after lock_sock() may
+ * end up deadlocking with tcp_sendmsg(), and the RDS_IN_XMIT
+ * would not get set. As a result, we set c_state to
+ * RDS_CONN_RESETTTING, to ensure that rds_tcp_state_change
+ * cannot mark rds_conn_path_up() in the window before lock_sock()
+ */
+ atomic_set(&conn->c_state, RDS_CONN_RESETTING);
+ wait_event(conn->c_waitq, !test_bit(RDS_IN_XMIT, &conn->c_flags));
+ lock_sock(osock->sk);
+ /* reset receive side state for rds_tcp_data_recv() for osock */
+ if (tc->t_tinc) {
+ rds_inc_put(&tc->t_tinc->ti_inc);
+ tc->t_tinc = NULL;
+ }
+ tc->t_tinc_hdr_rem = sizeof(struct rds_header);
+ tc->t_tinc_data_rem = 0;
+ tc->t_sock = NULL;
+
+ write_lock_bh(&osock->sk->sk_callback_lock);
+
+ osock->sk->sk_user_data = NULL;
+ osock->sk->sk_data_ready = tc->t_orig_data_ready;
+ osock->sk->sk_write_space = tc->t_orig_write_space;
+ osock->sk->sk_state_change = tc->t_orig_state_change;
+ write_unlock_bh(&osock->sk->sk_callback_lock);
+ release_sock(osock->sk);
+ sock_release(osock);
+newsock:
+ rds_send_reset(conn);
+ lock_sock(sock->sk);
+ write_lock_bh(&sock->sk->sk_callback_lock);
+ tc->t_sock = sock;
+ sock->sk->sk_user_data = conn;
+ sock->sk->sk_data_ready = rds_tcp_data_ready;
+ sock->sk->sk_write_space = rds_tcp_write_space;
+ sock->sk->sk_state_change = rds_tcp_state_change;
+
+ write_unlock_bh(&sock->sk->sk_callback_lock);
+ release_sock(sock->sk);
+}
+
+/* Add tc to rds_tcp_tc_list and set tc->t_sock. See comments
+ * above rds_tcp_reset_callbacks for notes about synchronization
+ * with data path
*/
void rds_tcp_set_callbacks(struct socket *sock, struct rds_connection *conn)
{
diff --git a/net/rds/tcp.h b/net/rds/tcp.h
index 41c228300525..ec0602b0dc24 100644
--- a/net/rds/tcp.h
+++ b/net/rds/tcp.h
@@ -50,6 +50,7 @@ struct rds_tcp_statistics {
void rds_tcp_tune(struct socket *sock);
void rds_tcp_nonagle(struct socket *sock);
void rds_tcp_set_callbacks(struct socket *sock, struct rds_connection *conn);
+void rds_tcp_reset_callbacks(struct socket *sock, struct rds_connection *conn);
void rds_tcp_restore_callbacks(struct socket *sock,
struct rds_tcp_connection *tc);
u32 rds_tcp_snd_nxt(struct rds_tcp_connection *tc);
diff --git a/net/rds/tcp_connect.c b/net/rds/tcp_connect.c
index 49a3fcfed360..fba13d0305fb 100644
--- a/net/rds/tcp_connect.c
+++ b/net/rds/tcp_connect.c
@@ -43,7 +43,7 @@ void rds_tcp_state_change(struct sock *sk)
struct rds_connection *conn;
struct rds_tcp_connection *tc;
- read_lock(&sk->sk_callback_lock);
+ read_lock_bh(&sk->sk_callback_lock);
conn = sk->sk_user_data;
if (!conn) {
state_change = sk->sk_state_change;
@@ -60,7 +60,7 @@ void rds_tcp_state_change(struct sock *sk)
case TCP_SYN_RECV:
break;
case TCP_ESTABLISHED:
- rds_connect_complete(conn);
+ rds_connect_path_complete(conn, RDS_CONN_CONNECTING);
break;
case TCP_CLOSE_WAIT:
case TCP_CLOSE:
@@ -69,7 +69,7 @@ void rds_tcp_state_change(struct sock *sk)
break;
}
out:
- read_unlock(&sk->sk_callback_lock);
+ read_unlock_bh(&sk->sk_callback_lock);
state_change(sk);
}
diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c
index be263cdf268b..686b1d03a558 100644
--- a/net/rds/tcp_listen.c
+++ b/net/rds/tcp_listen.c
@@ -78,7 +78,9 @@ int rds_tcp_accept_one(struct socket *sock)
struct inet_sock *inet;
struct rds_tcp_connection *rs_tcp = NULL;
int conn_state;
- struct sock *nsk;
+
+ if (!sock) /* module unload or netns delete in progress */
+ return -ENETUNREACH;
ret = sock_create_kern(sock_net(sock->sk), sock->sk->sk_family,
sock->sk->sk_type, sock->sk->sk_protocol,
@@ -129,28 +131,25 @@ int rds_tcp_accept_one(struct socket *sock)
* so we must quiesce any send threads before resetting
* c_transport_data.
*/
- wait_event(conn->c_waitq,
- !test_bit(RDS_IN_XMIT, &conn->c_flags));
- if (ntohl(inet->inet_saddr) < ntohl(inet->inet_daddr)) {
+ if (ntohl(inet->inet_saddr) < ntohl(inet->inet_daddr) ||
+ !conn->c_outgoing) {
goto rst_nsk;
- } else if (rs_tcp->t_sock) {
- rds_tcp_restore_callbacks(rs_tcp->t_sock, rs_tcp);
+ } else {
+ rds_tcp_reset_callbacks(new_sock, conn);
conn->c_outgoing = 0;
+ /* rds_connect_path_complete() marks RDS_CONN_UP */
+ rds_connect_path_complete(conn, RDS_CONN_DISCONNECTING);
}
+ } else {
+ rds_tcp_set_callbacks(new_sock, conn);
+ rds_connect_path_complete(conn, RDS_CONN_CONNECTING);
}
- rds_tcp_set_callbacks(new_sock, conn);
- rds_connect_complete(conn); /* marks RDS_CONN_UP */
new_sock = NULL;
ret = 0;
goto out;
rst_nsk:
/* reset the newly returned accept sock and bail */
- nsk = new_sock->sk;
- rds_tcp_stats_inc(s_tcp_listen_closed_stale);
- nsk->sk_user_data = NULL;
- nsk->sk_prot->disconnect(nsk, 0);
- tcp_done(nsk);
- new_sock = NULL;
+ kernel_sock_shutdown(new_sock, SHUT_RDWR);
ret = 0;
out:
if (rs_tcp)
@@ -166,7 +165,7 @@ void rds_tcp_listen_data_ready(struct sock *sk)
rdsdebug("listen data ready sk %p\n", sk);
- read_lock(&sk->sk_callback_lock);
+ read_lock_bh(&sk->sk_callback_lock);
ready = sk->sk_user_data;
if (!ready) { /* check for teardown race */
ready = sk->sk_data_ready;
@@ -183,7 +182,7 @@ void rds_tcp_listen_data_ready(struct sock *sk)
rds_tcp_accept_work(sk);
out:
- read_unlock(&sk->sk_callback_lock);
+ read_unlock_bh(&sk->sk_callback_lock);
ready(sk);
}
diff --git a/net/rds/tcp_recv.c b/net/rds/tcp_recv.c
index d75d8b56a9e3..c3196f9d070a 100644
--- a/net/rds/tcp_recv.c
+++ b/net/rds/tcp_recv.c
@@ -301,7 +301,7 @@ void rds_tcp_data_ready(struct sock *sk)
rdsdebug("data ready sk %p\n", sk);
- read_lock(&sk->sk_callback_lock);
+ read_lock_bh(&sk->sk_callback_lock);
conn = sk->sk_user_data;
if (!conn) { /* check for teardown race */
ready = sk->sk_data_ready;
@@ -315,7 +315,7 @@ void rds_tcp_data_ready(struct sock *sk)
if (rds_tcp_read_sock(conn, GFP_ATOMIC) == -ENOMEM)
queue_delayed_work(rds_wq, &conn->c_recv_w, 0);
out:
- read_unlock(&sk->sk_callback_lock);
+ read_unlock_bh(&sk->sk_callback_lock);
ready(sk);
}
diff --git a/net/rds/tcp_send.c b/net/rds/tcp_send.c
index 2894e6095e3b..22d0f2020a79 100644
--- a/net/rds/tcp_send.c
+++ b/net/rds/tcp_send.c
@@ -180,7 +180,7 @@ void rds_tcp_write_space(struct sock *sk)
struct rds_connection *conn;
struct rds_tcp_connection *tc;
- read_lock(&sk->sk_callback_lock);
+ read_lock_bh(&sk->sk_callback_lock);
conn = sk->sk_user_data;
if (!conn) {
write_space = sk->sk_write_space;
@@ -200,7 +200,7 @@ void rds_tcp_write_space(struct sock *sk)
queue_delayed_work(rds_wq, &conn->c_send_w, 0);
out:
- read_unlock(&sk->sk_callback_lock);
+ read_unlock_bh(&sk->sk_callback_lock);
/*
* write_space is only called when data leaves tcp's send queue if
diff --git a/net/rds/threads.c b/net/rds/threads.c
index 454aa6d23327..4a323045719b 100644
--- a/net/rds/threads.c
+++ b/net/rds/threads.c
@@ -71,9 +71,9 @@
struct workqueue_struct *rds_wq;
EXPORT_SYMBOL_GPL(rds_wq);
-void rds_connect_complete(struct rds_connection *conn)
+void rds_connect_path_complete(struct rds_connection *conn, int curr)
{
- if (!rds_conn_transition(conn, RDS_CONN_CONNECTING, RDS_CONN_UP)) {
+ if (!rds_conn_transition(conn, curr, RDS_CONN_UP)) {
printk(KERN_WARNING "%s: Cannot transition to state UP, "
"current state is %d\n",
__func__,
@@ -90,6 +90,12 @@ void rds_connect_complete(struct rds_connection *conn)
queue_delayed_work(rds_wq, &conn->c_send_w, 0);
queue_delayed_work(rds_wq, &conn->c_recv_w, 0);
}
+EXPORT_SYMBOL_GPL(rds_connect_path_complete);
+
+void rds_connect_complete(struct rds_connection *conn)
+{
+ rds_connect_path_complete(conn, RDS_CONN_CONNECTING);
+}
EXPORT_SYMBOL_GPL(rds_connect_complete);
/*
diff --git a/net/rxrpc/ar-key.c b/net/rxrpc/ar-key.c
index 3fb492eedeb9..1021b4c0bdd2 100644
--- a/net/rxrpc/ar-key.c
+++ b/net/rxrpc/ar-key.c
@@ -965,7 +965,7 @@ int rxrpc_get_server_data_key(struct rxrpc_connection *conn,
key = key_alloc(&key_type_rxrpc, "x",
GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, cred, 0,
- KEY_ALLOC_NOT_IN_QUOTA);
+ KEY_ALLOC_NOT_IN_QUOTA, NULL);
if (IS_ERR(key)) {
_leave(" = -ENOMEM [alloc %ld]", PTR_ERR(key));
return -ENOMEM;
@@ -1012,7 +1012,7 @@ struct key *rxrpc_get_null_key(const char *keyname)
key = key_alloc(&key_type_rxrpc, keyname,
GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, cred,
- KEY_POS_SEARCH, KEY_ALLOC_NOT_IN_QUOTA);
+ KEY_POS_SEARCH, KEY_ALLOC_NOT_IN_QUOTA, NULL);
if (IS_ERR(key))
return key;
diff --git a/net/rxrpc/rxkad.c b/net/rxrpc/rxkad.c
index 6b726a046a7d..bab56ed649ba 100644
--- a/net/rxrpc/rxkad.c
+++ b/net/rxrpc/rxkad.c
@@ -1162,9 +1162,7 @@ static int rxkad_init(void)
/* pin the cipher we need so that the crypto layer doesn't invoke
* keventd to go get it */
rxkad_ci = crypto_alloc_skcipher("pcbc(fcrypt)", 0, CRYPTO_ALG_ASYNC);
- if (IS_ERR(rxkad_ci))
- return PTR_ERR(rxkad_ci);
- return 0;
+ return PTR_ERR_OR_ZERO(rxkad_ci);
}
/*
diff --git a/net/sched/act_police.c b/net/sched/act_police.c
index 330f14e302e8..c557789765dc 100644
--- a/net/sched/act_police.c
+++ b/net/sched/act_police.c
@@ -38,7 +38,7 @@ struct tcf_police {
bool peak_present;
};
#define to_police(pc) \
- container_of(pc, struct tcf_police, common)
+ container_of(pc->priv, struct tcf_police, common)
#define POL_TAB_MASK 15
@@ -119,14 +119,12 @@ static int tcf_act_police_locate(struct net *net, struct nlattr *nla,
struct nlattr *est, struct tc_action *a,
int ovr, int bind)
{
- unsigned int h;
int ret = 0, err;
struct nlattr *tb[TCA_POLICE_MAX + 1];
struct tc_police *parm;
struct tcf_police *police;
struct qdisc_rate_table *R_tab = NULL, *P_tab = NULL;
struct tc_action_net *tn = net_generic(net, police_net_id);
- struct tcf_hashinfo *hinfo = tn->hinfo;
int size;
if (nla == NULL)
@@ -145,7 +143,7 @@ static int tcf_act_police_locate(struct net *net, struct nlattr *nla,
if (parm->index) {
if (tcf_hash_search(tn, a, parm->index)) {
- police = to_police(a->priv);
+ police = to_police(a);
if (bind) {
police->tcf_bindcnt += 1;
police->tcf_refcnt += 1;
@@ -156,16 +154,15 @@ static int tcf_act_police_locate(struct net *net, struct nlattr *nla,
/* not replacing */
return -EEXIST;
}
+ } else {
+ ret = tcf_hash_create(tn, parm->index, NULL, a,
+ sizeof(*police), bind, false);
+ if (ret)
+ return ret;
+ ret = ACT_P_CREATED;
}
- police = kzalloc(sizeof(*police), GFP_KERNEL);
- if (police == NULL)
- return -ENOMEM;
- ret = ACT_P_CREATED;
- police->tcf_refcnt = 1;
- spin_lock_init(&police->tcf_lock);
- if (bind)
- police->tcf_bindcnt = 1;
+ police = to_police(a);
override:
if (parm->rate.rate) {
err = -ENOMEM;
@@ -237,14 +234,8 @@ override:
return ret;
police->tcfp_t_c = ktime_get_ns();
- police->tcf_index = parm->index ? parm->index :
- tcf_hash_new_index(tn);
- h = tcf_hash(police->tcf_index, POL_TAB_MASK);
- spin_lock_bh(&hinfo->lock);
- hlist_add_head(&police->tcf_head, &hinfo->htab[h]);
- spin_unlock_bh(&hinfo->lock);
+ tcf_hash_insert(tn, a);
- a->priv = police;
return ret;
failure_unlock:
@@ -253,7 +244,7 @@ failure:
qdisc_put_rtab(P_tab);
qdisc_put_rtab(R_tab);
if (ret == ACT_P_CREATED)
- kfree(police);
+ tcf_hash_cleanup(a, est);
return err;
}
@@ -268,6 +259,7 @@ static int tcf_act_police(struct sk_buff *skb, const struct tc_action *a,
spin_lock(&police->tcf_lock);
bstats_update(&police->tcf_bstats, skb);
+ tcf_lastuse_update(&police->tcf_tm);
if (police->tcfp_ewma_rate &&
police->tcf_rate_est.bps >= police->tcfp_ewma_rate) {
@@ -327,6 +319,7 @@ tcf_act_police_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref)
.refcnt = police->tcf_refcnt - ref,
.bindcnt = police->tcf_bindcnt - bind,
};
+ struct tcf_t t;
if (police->rate_present)
psched_ratecfg_getrate(&opt.rate, &police->rate);
@@ -340,6 +333,13 @@ tcf_act_police_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref)
if (police->tcfp_ewma_rate &&
nla_put_u32(skb, TCA_POLICE_AVRATE, police->tcfp_ewma_rate))
goto nla_put_failure;
+
+ t.install = jiffies_to_clock_t(jiffies - police->tcf_tm.install);
+ t.lastuse = jiffies_to_clock_t(jiffies - police->tcf_tm.lastuse);
+ t.expires = jiffies_to_clock_t(police->tcf_tm.expires);
+ if (nla_put_64bit(skb, TCA_POLICE_TM, sizeof(t), &t, TCA_POLICE_PAD))
+ goto nla_put_failure;
+
return skb->len;
nla_put_failure:
diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index 730aacafc22d..b3b7978f4182 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -171,7 +171,7 @@ static void fl_hw_destroy_filter(struct tcf_proto *tp, unsigned long cookie)
struct tc_cls_flower_offload offload = {0};
struct tc_to_netdev tc;
- if (!tc_should_offload(dev, 0))
+ if (!tc_should_offload(dev, tp, 0))
return;
offload.command = TC_CLSFLOWER_DESTROY;
@@ -194,7 +194,7 @@ static void fl_hw_replace_filter(struct tcf_proto *tp,
struct tc_cls_flower_offload offload = {0};
struct tc_to_netdev tc;
- if (!tc_should_offload(dev, flags))
+ if (!tc_should_offload(dev, tp, flags))
return;
offload.command = TC_CLSFLOWER_REPLACE;
@@ -216,7 +216,7 @@ static void fl_hw_update_stats(struct tcf_proto *tp, struct cls_fl_filter *f)
struct tc_cls_flower_offload offload = {0};
struct tc_to_netdev tc;
- if (!tc_should_offload(dev, 0))
+ if (!tc_should_offload(dev, tp, 0))
return;
offload.command = TC_CLSFLOWER_STATS;
diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c
index 079b43b3c5d2..ffe593efe930 100644
--- a/net/sched/cls_u32.c
+++ b/net/sched/cls_u32.c
@@ -440,7 +440,7 @@ static void u32_remove_hw_knode(struct tcf_proto *tp, u32 handle)
offload.type = TC_SETUP_CLSU32;
offload.cls_u32 = &u32_offload;
- if (tc_should_offload(dev, 0)) {
+ if (tc_should_offload(dev, tp, 0)) {
offload.cls_u32->command = TC_CLSU32_DELETE_KNODE;
offload.cls_u32->knode.handle = handle;
dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle,
@@ -457,20 +457,21 @@ static int u32_replace_hw_hnode(struct tcf_proto *tp,
struct tc_to_netdev offload;
int err;
+ if (!tc_should_offload(dev, tp, flags))
+ return tc_skip_sw(flags) ? -EINVAL : 0;
+
offload.type = TC_SETUP_CLSU32;
offload.cls_u32 = &u32_offload;
- if (tc_should_offload(dev, flags)) {
- offload.cls_u32->command = TC_CLSU32_NEW_HNODE;
- offload.cls_u32->hnode.divisor = h->divisor;
- offload.cls_u32->hnode.handle = h->handle;
- offload.cls_u32->hnode.prio = h->prio;
+ offload.cls_u32->command = TC_CLSU32_NEW_HNODE;
+ offload.cls_u32->hnode.divisor = h->divisor;
+ offload.cls_u32->hnode.handle = h->handle;
+ offload.cls_u32->hnode.prio = h->prio;
- err = dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle,
- tp->protocol, &offload);
- if (tc_skip_sw(flags))
- return err;
- }
+ err = dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle,
+ tp->protocol, &offload);
+ if (tc_skip_sw(flags))
+ return err;
return 0;
}
@@ -484,7 +485,7 @@ static void u32_clear_hw_hnode(struct tcf_proto *tp, struct tc_u_hnode *h)
offload.type = TC_SETUP_CLSU32;
offload.cls_u32 = &u32_offload;
- if (tc_should_offload(dev, 0)) {
+ if (tc_should_offload(dev, tp, 0)) {
offload.cls_u32->command = TC_CLSU32_DELETE_HNODE;
offload.cls_u32->hnode.divisor = h->divisor;
offload.cls_u32->hnode.handle = h->handle;
@@ -507,27 +508,28 @@ static int u32_replace_hw_knode(struct tcf_proto *tp,
offload.type = TC_SETUP_CLSU32;
offload.cls_u32 = &u32_offload;
- if (tc_should_offload(dev, flags)) {
- offload.cls_u32->command = TC_CLSU32_REPLACE_KNODE;
- offload.cls_u32->knode.handle = n->handle;
- offload.cls_u32->knode.fshift = n->fshift;
+ if (!tc_should_offload(dev, tp, flags))
+ return tc_skip_sw(flags) ? -EINVAL : 0;
+
+ offload.cls_u32->command = TC_CLSU32_REPLACE_KNODE;
+ offload.cls_u32->knode.handle = n->handle;
+ offload.cls_u32->knode.fshift = n->fshift;
#ifdef CONFIG_CLS_U32_MARK
- offload.cls_u32->knode.val = n->val;
- offload.cls_u32->knode.mask = n->mask;
+ offload.cls_u32->knode.val = n->val;
+ offload.cls_u32->knode.mask = n->mask;
#else
- offload.cls_u32->knode.val = 0;
- offload.cls_u32->knode.mask = 0;
+ offload.cls_u32->knode.val = 0;
+ offload.cls_u32->knode.mask = 0;
#endif
- offload.cls_u32->knode.sel = &n->sel;
- offload.cls_u32->knode.exts = &n->exts;
- if (n->ht_down)
- offload.cls_u32->knode.link_handle = n->ht_down->handle;
-
- err = dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle,
- tp->protocol, &offload);
- if (tc_skip_sw(flags))
- return err;
- }
+ offload.cls_u32->knode.sel = &n->sel;
+ offload.cls_u32->knode.exts = &n->exts;
+ if (n->ht_down)
+ offload.cls_u32->knode.link_handle = n->ht_down->handle;
+
+ err = dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle,
+ tp->protocol, &offload);
+ if (tc_skip_sw(flags))
+ return err;
return 0;
}
@@ -863,7 +865,7 @@ static int u32_change(struct net *net, struct sk_buff *in_skb,
if (tb[TCA_U32_FLAGS]) {
flags = nla_get_u32(tb[TCA_U32_FLAGS]);
if (!tc_flags_valid(flags))
- return err;
+ return -EINVAL;
}
n = (struct tc_u_knode *)*arg;
@@ -921,11 +923,17 @@ static int u32_change(struct net *net, struct sk_buff *in_skb,
ht->divisor = divisor;
ht->handle = handle;
ht->prio = tp->prio;
+
+ err = u32_replace_hw_hnode(tp, ht, flags);
+ if (err) {
+ kfree(ht);
+ return err;
+ }
+
RCU_INIT_POINTER(ht->next, tp_c->hlist);
rcu_assign_pointer(tp_c->hlist, ht);
*arg = (unsigned long)ht;
- u32_replace_hw_hnode(tp, ht, flags);
return 0;
}
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index 64f71a2155f3..ddf047df5361 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -607,6 +607,10 @@ void qdisc_watchdog_schedule_ns(struct qdisc_watchdog *wd, u64 expires, bool thr
if (throttle)
qdisc_throttled(wd->qdisc);
+ if (wd->last_expires == expires)
+ return;
+
+ wd->last_expires = expires;
hrtimer_start(&wd->timer,
ns_to_ktime(expires),
HRTIMER_MODE_ABS_PINNED);
diff --git a/net/sched/sch_drr.c b/net/sched/sch_drr.c
index a63e879e8975..bf8af2c43c2c 100644
--- a/net/sched/sch_drr.c
+++ b/net/sched/sch_drr.c
@@ -375,6 +375,7 @@ static int drr_enqueue(struct sk_buff *skb, struct Qdisc *sch)
cl->deficit = cl->quantum;
}
+ qdisc_qstats_backlog_inc(sch, skb);
sch->q.qlen++;
return err;
}
@@ -407,6 +408,7 @@ static struct sk_buff *drr_dequeue(struct Qdisc *sch)
bstats_update(&cl->bstats, skb);
qdisc_bstats_update(sch, skb);
+ qdisc_qstats_backlog_dec(sch, skb);
sch->q.qlen--;
return skb;
}
@@ -428,6 +430,7 @@ static unsigned int drr_drop(struct Qdisc *sch)
if (cl->qdisc->ops->drop) {
len = cl->qdisc->ops->drop(cl->qdisc);
if (len > 0) {
+ sch->qstats.backlog -= len;
sch->q.qlen--;
if (cl->qdisc->q.qlen == 0)
list_del(&cl->alist);
@@ -463,6 +466,7 @@ static void drr_reset_qdisc(struct Qdisc *sch)
qdisc_reset(cl->qdisc);
}
}
+ sch->qstats.backlog = 0;
sch->q.qlen = 0;
}
diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c
index 6883a8971562..da250b2e06ae 100644
--- a/net/sched/sch_fq_codel.c
+++ b/net/sched/sch_fq_codel.c
@@ -199,6 +199,7 @@ static int fq_codel_enqueue(struct sk_buff *skb, struct Qdisc *sch)
unsigned int idx, prev_backlog, prev_qlen;
struct fq_codel_flow *flow;
int uninitialized_var(ret);
+ unsigned int pkt_len;
bool memory_limited;
idx = fq_codel_classify(skb, sch, &ret);
@@ -230,6 +231,8 @@ static int fq_codel_enqueue(struct sk_buff *skb, struct Qdisc *sch)
prev_backlog = sch->qstats.backlog;
prev_qlen = sch->q.qlen;
+ /* save this packet length as it might be dropped by fq_codel_drop() */
+ pkt_len = qdisc_pkt_len(skb);
/* fq_codel_drop() is quite expensive, as it performs a linear search
* in q->backlogs[] to find a fat flow.
* So instead of dropping a single packet, drop half of its backlog
@@ -237,14 +240,23 @@ static int fq_codel_enqueue(struct sk_buff *skb, struct Qdisc *sch)
*/
ret = fq_codel_drop(sch, q->drop_batch_size);
- q->drop_overlimit += prev_qlen - sch->q.qlen;
+ prev_qlen -= sch->q.qlen;
+ prev_backlog -= sch->qstats.backlog;
+ q->drop_overlimit += prev_qlen;
if (memory_limited)
- q->drop_overmemory += prev_qlen - sch->q.qlen;
- /* As we dropped packet(s), better let upper stack know this */
- qdisc_tree_reduce_backlog(sch, prev_qlen - sch->q.qlen,
- prev_backlog - sch->qstats.backlog);
+ q->drop_overmemory += prev_qlen;
- return ret == idx ? NET_XMIT_CN : NET_XMIT_SUCCESS;
+ /* As we dropped packet(s), better let upper stack know this.
+ * If we dropped a packet for this flow, return NET_XMIT_CN,
+ * but in this case, our parents wont increase their backlogs.
+ */
+ if (ret == idx) {
+ qdisc_tree_reduce_backlog(sch, prev_qlen - 1,
+ prev_backlog - pkt_len);
+ return NET_XMIT_CN;
+ }
+ qdisc_tree_reduce_backlog(sch, prev_qlen, prev_backlog);
+ return NET_XMIT_SUCCESS;
}
/* This is the specific function called from codel_dequeue()
@@ -649,7 +661,7 @@ static int fq_codel_dump_class_stats(struct Qdisc *sch, unsigned long cl,
qs.backlog = q->backlogs[idx];
qs.drops = flow->dropped;
}
- if (gnet_stats_copy_queue(d, NULL, &qs, 0) < 0)
+ if (gnet_stats_copy_queue(d, NULL, &qs, qs.qlen) < 0)
return -1;
if (idx < q->flows_cnt)
return gnet_stats_copy_app(d, &xstats, sizeof(xstats));
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 269dd71b3828..f9e0e9c03d0a 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -49,6 +49,7 @@ static inline int dev_requeue_skb(struct sk_buff *skb, struct Qdisc *q)
{
q->gso_skb = skb;
q->qstats.requeues++;
+ qdisc_qstats_backlog_inc(q, skb);
q->q.qlen++; /* it's still part of the queue */
__netif_schedule(q);
@@ -92,6 +93,7 @@ static struct sk_buff *dequeue_skb(struct Qdisc *q, bool *validate,
txq = skb_get_tx_queue(txq->dev, skb);
if (!netif_xmit_frozen_or_stopped(txq)) {
q->gso_skb = NULL;
+ qdisc_qstats_backlog_dec(q, skb);
q->q.qlen--;
} else
skb = NULL;
diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c
index d783d7cc3348..1ac9f9f03fe3 100644
--- a/net/sched/sch_hfsc.c
+++ b/net/sched/sch_hfsc.c
@@ -1529,6 +1529,7 @@ hfsc_reset_qdisc(struct Qdisc *sch)
q->eligible = RB_ROOT;
INIT_LIST_HEAD(&q->droplist);
qdisc_watchdog_cancel(&q->watchdog);
+ sch->qstats.backlog = 0;
sch->q.qlen = 0;
}
@@ -1559,14 +1560,6 @@ hfsc_dump_qdisc(struct Qdisc *sch, struct sk_buff *skb)
struct hfsc_sched *q = qdisc_priv(sch);
unsigned char *b = skb_tail_pointer(skb);
struct tc_hfsc_qopt qopt;
- struct hfsc_class *cl;
- unsigned int i;
-
- sch->qstats.backlog = 0;
- for (i = 0; i < q->clhash.hashsize; i++) {
- hlist_for_each_entry(cl, &q->clhash.hash[i], cl_common.hnode)
- sch->qstats.backlog += cl->qdisc->qstats.backlog;
- }
qopt.defcls = q->defcls;
if (nla_put(skb, TCA_OPTIONS, sizeof(qopt), &qopt))
@@ -1604,6 +1597,7 @@ hfsc_enqueue(struct sk_buff *skb, struct Qdisc *sch)
if (cl->qdisc->q.qlen == 1)
set_active(cl, qdisc_pkt_len(skb));
+ qdisc_qstats_backlog_inc(sch, skb);
sch->q.qlen++;
return NET_XMIT_SUCCESS;
@@ -1672,6 +1666,7 @@ hfsc_dequeue(struct Qdisc *sch)
qdisc_unthrottled(sch);
qdisc_bstats_update(sch, skb);
+ qdisc_qstats_backlog_dec(sch, skb);
sch->q.qlen--;
return skb;
@@ -1695,6 +1690,7 @@ hfsc_drop(struct Qdisc *sch)
}
cl->qstats.drops++;
qdisc_qstats_drop(sch);
+ sch->qstats.backlog -= len;
sch->q.qlen--;
return len;
}
diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c
index f6bf5818ed4d..d4b4218af6b1 100644
--- a/net/sched/sch_htb.c
+++ b/net/sched/sch_htb.c
@@ -928,17 +928,10 @@ ok:
}
}
qdisc_qstats_overlimit(sch);
- if (likely(next_event > q->now)) {
- if (!test_bit(__QDISC_STATE_DEACTIVATED,
- &qdisc_root_sleeping(q->watchdog.qdisc)->state)) {
- ktime_t time = ns_to_ktime(next_event);
- qdisc_throttled(q->watchdog.qdisc);
- hrtimer_start(&q->watchdog.timer, time,
- HRTIMER_MODE_ABS_PINNED);
- }
- } else {
+ if (likely(next_event > q->now))
+ qdisc_watchdog_schedule_ns(&q->watchdog, next_event, true);
+ else
schedule_work(&q->work);
- }
fin:
return skb;
}
diff --git a/net/sched/sch_ingress.c b/net/sched/sch_ingress.c
index 10adbc617905..8fe6999b642a 100644
--- a/net/sched/sch_ingress.c
+++ b/net/sched/sch_ingress.c
@@ -27,6 +27,11 @@ static unsigned long ingress_get(struct Qdisc *sch, u32 classid)
return TC_H_MIN(classid) + 1;
}
+static bool ingress_cl_offload(u32 classid)
+{
+ return true;
+}
+
static unsigned long ingress_bind_filter(struct Qdisc *sch,
unsigned long parent, u32 classid)
{
@@ -86,6 +91,7 @@ static const struct Qdisc_class_ops ingress_class_ops = {
.put = ingress_put,
.walk = ingress_walk,
.tcf_chain = ingress_find_tcf,
+ .tcf_cl_offload = ingress_cl_offload,
.bind_tcf = ingress_bind_filter,
.unbind_tcf = ingress_put,
};
@@ -110,6 +116,11 @@ static unsigned long clsact_get(struct Qdisc *sch, u32 classid)
}
}
+static bool clsact_cl_offload(u32 classid)
+{
+ return TC_H_MIN(classid) == TC_H_MIN(TC_H_MIN_INGRESS);
+}
+
static unsigned long clsact_bind_filter(struct Qdisc *sch,
unsigned long parent, u32 classid)
{
@@ -158,6 +169,7 @@ static const struct Qdisc_class_ops clsact_class_ops = {
.put = ingress_put,
.walk = ingress_walk,
.tcf_chain = clsact_find_tcf,
+ .tcf_cl_offload = clsact_cl_offload,
.bind_tcf = clsact_bind_filter,
.unbind_tcf = ingress_put,
};
diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c
index fee1b15506b2..4b0a82191bc4 100644
--- a/net/sched/sch_prio.c
+++ b/net/sched/sch_prio.c
@@ -85,6 +85,7 @@ prio_enqueue(struct sk_buff *skb, struct Qdisc *sch)
ret = qdisc_enqueue(skb, qdisc);
if (ret == NET_XMIT_SUCCESS) {
+ qdisc_qstats_backlog_inc(sch, skb);
sch->q.qlen++;
return NET_XMIT_SUCCESS;
}
@@ -117,6 +118,7 @@ static struct sk_buff *prio_dequeue(struct Qdisc *sch)
struct sk_buff *skb = qdisc_dequeue_peeked(qdisc);
if (skb) {
qdisc_bstats_update(sch, skb);
+ qdisc_qstats_backlog_dec(sch, skb);
sch->q.qlen--;
return skb;
}
@@ -135,6 +137,7 @@ static unsigned int prio_drop(struct Qdisc *sch)
for (prio = q->bands-1; prio >= 0; prio--) {
qdisc = q->queues[prio];
if (qdisc->ops->drop && (len = qdisc->ops->drop(qdisc)) != 0) {
+ sch->qstats.backlog -= len;
sch->q.qlen--;
return len;
}
@@ -151,6 +154,7 @@ prio_reset(struct Qdisc *sch)
for (prio = 0; prio < q->bands; prio++)
qdisc_reset(q->queues[prio]);
+ sch->qstats.backlog = 0;
sch->q.qlen = 0;
}
diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c
index 8d2d8d953432..f18857febdad 100644
--- a/net/sched/sch_qfq.c
+++ b/net/sched/sch_qfq.c
@@ -1235,8 +1235,10 @@ static int qfq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
cl->agg->lmax, qdisc_pkt_len(skb), cl->common.classid);
err = qfq_change_agg(sch, cl, cl->agg->class_weight,
qdisc_pkt_len(skb));
- if (err)
- return err;
+ if (err) {
+ cl->qstats.drops++;
+ return qdisc_drop(skb, sch);
+ }
}
err = qdisc_enqueue(skb, cl->qdisc);
diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c
index 8c0508c0e287..91578bdd378c 100644
--- a/net/sched/sch_red.c
+++ b/net/sched/sch_red.c
@@ -97,6 +97,7 @@ static int red_enqueue(struct sk_buff *skb, struct Qdisc *sch)
ret = qdisc_enqueue(skb, child);
if (likely(ret == NET_XMIT_SUCCESS)) {
+ qdisc_qstats_backlog_inc(sch, skb);
sch->q.qlen++;
} else if (net_xmit_drop_count(ret)) {
q->stats.pdrop++;
@@ -118,6 +119,7 @@ static struct sk_buff *red_dequeue(struct Qdisc *sch)
skb = child->dequeue(child);
if (skb) {
qdisc_bstats_update(sch, skb);
+ qdisc_qstats_backlog_dec(sch, skb);
sch->q.qlen--;
} else {
if (!red_is_idling(&q->vars))
@@ -143,6 +145,7 @@ static unsigned int red_drop(struct Qdisc *sch)
if (child->ops->drop && (len = child->ops->drop(child)) > 0) {
q->stats.other++;
qdisc_qstats_drop(sch);
+ sch->qstats.backlog -= len;
sch->q.qlen--;
return len;
}
@@ -158,6 +161,7 @@ static void red_reset(struct Qdisc *sch)
struct red_sched_data *q = qdisc_priv(sch);
qdisc_reset(q->qdisc);
+ sch->qstats.backlog = 0;
sch->q.qlen = 0;
red_restart(&q->vars);
}
diff --git a/net/sched/sch_tbf.c b/net/sched/sch_tbf.c
index 83b90b584fae..3161e491990b 100644
--- a/net/sched/sch_tbf.c
+++ b/net/sched/sch_tbf.c
@@ -207,6 +207,7 @@ static int tbf_enqueue(struct sk_buff *skb, struct Qdisc *sch)
return ret;
}
+ qdisc_qstats_backlog_inc(sch, skb);
sch->q.qlen++;
return NET_XMIT_SUCCESS;
}
@@ -217,6 +218,7 @@ static unsigned int tbf_drop(struct Qdisc *sch)
unsigned int len = 0;
if (q->qdisc->ops->drop && (len = q->qdisc->ops->drop(q->qdisc)) != 0) {
+ sch->qstats.backlog -= len;
sch->q.qlen--;
qdisc_qstats_drop(sch);
}
@@ -263,6 +265,7 @@ static struct sk_buff *tbf_dequeue(struct Qdisc *sch)
q->t_c = now;
q->tokens = toks;
q->ptokens = ptoks;
+ qdisc_qstats_backlog_dec(sch, skb);
sch->q.qlen--;
qdisc_unthrottled(sch);
qdisc_bstats_update(sch, skb);
@@ -294,6 +297,7 @@ static void tbf_reset(struct Qdisc *sch)
struct tbf_sched_data *q = qdisc_priv(sch);
qdisc_reset(q->qdisc);
+ sch->qstats.backlog = 0;
sch->q.qlen = 0;
q->t_c = ktime_get_ns();
q->tokens = q->buffer;
diff --git a/net/sctp/sctp_diag.c b/net/sctp/sctp_diag.c
index 8e3e769dc9ea..1ce724b87618 100644
--- a/net/sctp/sctp_diag.c
+++ b/net/sctp/sctp_diag.c
@@ -356,6 +356,9 @@ static int sctp_ep_dump(struct sctp_endpoint *ep, void *p)
if (cb->args[4] < cb->args[1])
goto next;
+ if ((r->idiag_states & ~TCPF_LISTEN) && !list_empty(&ep->asocs))
+ goto next;
+
if (r->sdiag_family != AF_UNSPEC &&
sk->sk_family != r->sdiag_family)
goto next;
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 777d0324594a..67154b848aa9 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -4220,6 +4220,7 @@ int sctp_get_sctp_info(struct sock *sk, struct sctp_association *asoc,
info->sctpi_s_disable_fragments = sp->disable_fragments;
info->sctpi_s_v4mapped = sp->v4mapped;
info->sctpi_s_frag_interleave = sp->frag_interleave;
+ info->sctpi_s_type = sp->type;
return 0;
}
diff --git a/net/socket.c b/net/socket.c
index e7793f5601ae..a1bd16106625 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -2168,7 +2168,8 @@ int __sys_recvmmsg(int fd, struct mmsghdr __user *mmsg, unsigned int vlen,
struct mmsghdr __user *entry;
struct compat_mmsghdr __user *compat_entry;
struct msghdr msg_sys;
- struct timespec end_time;
+ struct timespec64 end_time;
+ struct timespec64 timeout64;
if (timeout &&
poll_select_set_timeout(&end_time, timeout->tv_sec,
@@ -2220,8 +2221,9 @@ int __sys_recvmmsg(int fd, struct mmsghdr __user *mmsg, unsigned int vlen,
flags |= MSG_DONTWAIT;
if (timeout) {
- ktime_get_ts(timeout);
- *timeout = timespec_sub(end_time, *timeout);
+ ktime_get_ts64(&timeout64);
+ *timeout = timespec64_to_timespec(
+ timespec64_sub(end_time, timeout64));
if (timeout->tv_sec < 0) {
timeout->tv_sec = timeout->tv_nsec = 0;
break;
diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c
index 02f53674dc39..040ff627c18a 100644
--- a/net/sunrpc/auth.c
+++ b/net/sunrpc/auth.c
@@ -543,7 +543,7 @@ rpcauth_cache_enforce_limit(void)
*/
struct rpc_cred *
rpcauth_lookup_credcache(struct rpc_auth *auth, struct auth_cred * acred,
- int flags)
+ int flags, gfp_t gfp)
{
LIST_HEAD(free);
struct rpc_cred_cache *cache = auth->au_credcache;
@@ -580,7 +580,7 @@ rpcauth_lookup_credcache(struct rpc_auth *auth, struct auth_cred * acred,
if (flags & RPCAUTH_LOOKUP_RCU)
return ERR_PTR(-ECHILD);
- new = auth->au_ops->crcreate(auth, acred, flags);
+ new = auth->au_ops->crcreate(auth, acred, flags, gfp);
if (IS_ERR(new)) {
cred = new;
goto out;
@@ -703,8 +703,7 @@ rpcauth_bindcred(struct rpc_task *task, struct rpc_cred *cred, int flags)
new = rpcauth_bind_new_cred(task, lookupflags);
if (IS_ERR(new))
return PTR_ERR(new);
- if (req->rq_cred != NULL)
- put_rpccred(req->rq_cred);
+ put_rpccred(req->rq_cred);
req->rq_cred = new;
return 0;
}
@@ -712,6 +711,8 @@ rpcauth_bindcred(struct rpc_task *task, struct rpc_cred *cred, int flags)
void
put_rpccred(struct rpc_cred *cred)
{
+ if (cred == NULL)
+ return;
/* Fast path for unhashed credentials */
if (test_bit(RPCAUTH_CRED_HASHED, &cred->cr_flags) == 0) {
if (atomic_dec_and_test(&cred->cr_count))
diff --git a/net/sunrpc/auth_generic.c b/net/sunrpc/auth_generic.c
index 41248b1820c7..54dd3fdead54 100644
--- a/net/sunrpc/auth_generic.c
+++ b/net/sunrpc/auth_generic.c
@@ -38,6 +38,13 @@ struct rpc_cred *rpc_lookup_cred(void)
}
EXPORT_SYMBOL_GPL(rpc_lookup_cred);
+struct rpc_cred *
+rpc_lookup_generic_cred(struct auth_cred *acred, int flags, gfp_t gfp)
+{
+ return rpcauth_lookup_credcache(&generic_auth, acred, flags, gfp);
+}
+EXPORT_SYMBOL_GPL(rpc_lookup_generic_cred);
+
struct rpc_cred *rpc_lookup_cred_nonblock(void)
{
return rpcauth_lookupcred(&generic_auth, RPCAUTH_LOOKUP_RCU);
@@ -77,15 +84,15 @@ static struct rpc_cred *generic_bind_cred(struct rpc_task *task,
static struct rpc_cred *
generic_lookup_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags)
{
- return rpcauth_lookup_credcache(&generic_auth, acred, flags);
+ return rpcauth_lookup_credcache(&generic_auth, acred, flags, GFP_KERNEL);
}
static struct rpc_cred *
-generic_create_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags)
+generic_create_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags, gfp_t gfp)
{
struct generic_cred *gcred;
- gcred = kmalloc(sizeof(*gcred), GFP_KERNEL);
+ gcred = kmalloc(sizeof(*gcred), gfp);
if (gcred == NULL)
return ERR_PTR(-ENOMEM);
diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c
index 15612ffa8d57..e64ae93d5b4f 100644
--- a/net/sunrpc/auth_gss/auth_gss.c
+++ b/net/sunrpc/auth_gss/auth_gss.c
@@ -1299,11 +1299,11 @@ gss_destroy_cred(struct rpc_cred *cred)
static struct rpc_cred *
gss_lookup_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags)
{
- return rpcauth_lookup_credcache(auth, acred, flags);
+ return rpcauth_lookup_credcache(auth, acred, flags, GFP_NOFS);
}
static struct rpc_cred *
-gss_create_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags)
+gss_create_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags, gfp_t gfp)
{
struct gss_auth *gss_auth = container_of(auth, struct gss_auth, rpc_auth);
struct gss_cred *cred = NULL;
@@ -1313,7 +1313,7 @@ gss_create_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags)
__func__, from_kuid(&init_user_ns, acred->uid),
auth->au_flavor);
- if (!(cred = kzalloc(sizeof(*cred), GFP_NOFS)))
+ if (!(cred = kzalloc(sizeof(*cred), gfp)))
goto out_err;
rpcauth_init_cred(&cred->gc_base, acred, auth, &gss_credops);
diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c
index 1095be9c80ab..e085f5ae1548 100644
--- a/net/sunrpc/auth_gss/svcauth_gss.c
+++ b/net/sunrpc/auth_gss/svcauth_gss.c
@@ -569,10 +569,9 @@ gss_svc_searchbyctx(struct cache_detail *cd, struct xdr_netobj *handle)
struct rsc *found;
memset(&rsci, 0, sizeof(rsci));
- if (dup_to_netobj(&rsci.handle, handle->data, handle->len))
- return NULL;
+ rsci.handle.data = handle->data;
+ rsci.handle.len = handle->len;
found = rsc_lookup(cd, &rsci);
- rsc_free(&rsci);
if (!found)
return NULL;
if (cache_check(cd, &found->h, NULL))
@@ -857,8 +856,8 @@ unwrap_integ_data(struct svc_rqst *rqstp, struct xdr_buf *buf, u32 seq, struct g
goto out;
if (svc_getnl(&buf->head[0]) != seq)
goto out;
- /* trim off the mic at the end before returning */
- xdr_buf_trim(buf, mic.len + 4);
+ /* trim off the mic and padding at the end before returning */
+ xdr_buf_trim(buf, round_up_to_quad(mic.len) + 4);
stat = 0;
out:
kfree(mic.data);
diff --git a/net/sunrpc/auth_unix.c b/net/sunrpc/auth_unix.c
index 0d3dd364c22f..9f65452b7cbc 100644
--- a/net/sunrpc/auth_unix.c
+++ b/net/sunrpc/auth_unix.c
@@ -52,11 +52,11 @@ unx_destroy(struct rpc_auth *auth)
static struct rpc_cred *
unx_lookup_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags)
{
- return rpcauth_lookup_credcache(auth, acred, flags);
+ return rpcauth_lookup_credcache(auth, acred, flags, GFP_NOFS);
}
static struct rpc_cred *
-unx_create_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags)
+unx_create_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags, gfp_t gfp)
{
struct unx_cred *cred;
unsigned int groups = 0;
@@ -66,7 +66,7 @@ unx_create_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags)
from_kuid(&init_user_ns, acred->uid),
from_kgid(&init_user_ns, acred->gid));
- if (!(cred = kmalloc(sizeof(*cred), GFP_NOFS)))
+ if (!(cred = kmalloc(sizeof(*cred), gfp)))
return ERR_PTR(-ENOMEM);
rpcauth_init_cred(&cred->uc_base, acred, auth, &unix_credops);
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 7e0c9bf22df8..2808d550d273 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -446,16 +446,27 @@ out_no_rpciod:
return ERR_PTR(err);
}
-struct rpc_clnt *rpc_create_xprt(struct rpc_create_args *args,
+static struct rpc_clnt *rpc_create_xprt(struct rpc_create_args *args,
struct rpc_xprt *xprt)
{
struct rpc_clnt *clnt = NULL;
struct rpc_xprt_switch *xps;
- xps = xprt_switch_alloc(xprt, GFP_KERNEL);
- if (xps == NULL)
- return ERR_PTR(-ENOMEM);
-
+ if (args->bc_xprt && args->bc_xprt->xpt_bc_xps) {
+ WARN_ON(args->protocol != XPRT_TRANSPORT_BC_TCP);
+ xps = args->bc_xprt->xpt_bc_xps;
+ xprt_switch_get(xps);
+ } else {
+ xps = xprt_switch_alloc(xprt, GFP_KERNEL);
+ if (xps == NULL) {
+ xprt_put(xprt);
+ return ERR_PTR(-ENOMEM);
+ }
+ if (xprt->bc_xprt) {
+ xprt_switch_get(xps);
+ xprt->bc_xprt->xpt_bc_xps = xps;
+ }
+ }
clnt = rpc_new_client(args, xps, xprt, NULL);
if (IS_ERR(clnt))
return clnt;
@@ -483,7 +494,6 @@ struct rpc_clnt *rpc_create_xprt(struct rpc_create_args *args,
return clnt;
}
-EXPORT_SYMBOL_GPL(rpc_create_xprt);
/**
* rpc_create - create an RPC client and transport with one call
@@ -509,6 +519,15 @@ struct rpc_clnt *rpc_create(struct rpc_create_args *args)
};
char servername[48];
+ if (args->bc_xprt) {
+ WARN_ON(args->protocol != XPRT_TRANSPORT_BC_TCP);
+ xprt = args->bc_xprt->xpt_bc_xprt;
+ if (xprt) {
+ xprt_get(xprt);
+ return rpc_create_xprt(args, xprt);
+ }
+ }
+
if (args->flags & RPC_CLNT_CREATE_INFINITE_SLOTS)
xprtargs.flags |= XPRT_CREATE_INFINITE_SLOTS;
if (args->flags & RPC_CLNT_CREATE_NO_IDLE_TIMEOUT)
@@ -1414,6 +1433,23 @@ size_t rpc_max_payload(struct rpc_clnt *clnt)
EXPORT_SYMBOL_GPL(rpc_max_payload);
/**
+ * rpc_max_bc_payload - Get maximum backchannel payload size, in bytes
+ * @clnt: RPC client to query
+ */
+size_t rpc_max_bc_payload(struct rpc_clnt *clnt)
+{
+ struct rpc_xprt *xprt;
+ size_t ret;
+
+ rcu_read_lock();
+ xprt = rcu_dereference(clnt->cl_xprt);
+ ret = xprt->ops->bc_maxpayload(xprt);
+ rcu_read_unlock();
+ return ret;
+}
+EXPORT_SYMBOL_GPL(rpc_max_bc_payload);
+
+/**
* rpc_get_timeout - Get timeout for transport in units of HZ
* @clnt: RPC client to query
*/
diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c
index 7422f28818b2..4f01f63102ee 100644
--- a/net/sunrpc/svc_xprt.c
+++ b/net/sunrpc/svc_xprt.c
@@ -136,6 +136,8 @@ static void svc_xprt_free(struct kref *kref)
/* See comment on corresponding get in xs_setup_bc_tcp(): */
if (xprt->xpt_bc_xprt)
xprt_put(xprt->xpt_bc_xprt);
+ if (xprt->xpt_bc_xps)
+ xprt_switch_put(xprt->xpt_bc_xps);
xprt->xpt_ops->xpo_free(xprt);
module_put(owner);
}
@@ -244,13 +246,12 @@ void svc_add_new_perm_xprt(struct svc_serv *serv, struct svc_xprt *new)
svc_xprt_received(new);
}
-int svc_create_xprt(struct svc_serv *serv, const char *xprt_name,
+int _svc_create_xprt(struct svc_serv *serv, const char *xprt_name,
struct net *net, const int family,
const unsigned short port, int flags)
{
struct svc_xprt_class *xcl;
- dprintk("svc: creating transport %s[%d]\n", xprt_name, port);
spin_lock(&svc_xprt_class_lock);
list_for_each_entry(xcl, &svc_xprt_class_list, xcl_list) {
struct svc_xprt *newxprt;
@@ -274,12 +275,28 @@ int svc_create_xprt(struct svc_serv *serv, const char *xprt_name,
}
err:
spin_unlock(&svc_xprt_class_lock);
- dprintk("svc: transport %s not found\n", xprt_name);
-
/* This errno is exposed to user space. Provide a reasonable
* perror msg for a bad transport. */
return -EPROTONOSUPPORT;
}
+
+int svc_create_xprt(struct svc_serv *serv, const char *xprt_name,
+ struct net *net, const int family,
+ const unsigned short port, int flags)
+{
+ int err;
+
+ dprintk("svc: creating transport %s[%d]\n", xprt_name, port);
+ err = _svc_create_xprt(serv, xprt_name, net, family, port, flags);
+ if (err == -EPROTONOSUPPORT) {
+ request_module("svc%s", xprt_name);
+ err = _svc_create_xprt(serv, xprt_name, net, family, port, flags);
+ }
+ if (err)
+ dprintk("svc: transport %s not found, err %d\n",
+ xprt_name, err);
+ return err;
+}
EXPORT_SYMBOL_GPL(svc_create_xprt);
/*
diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c
index 6bdb3865212d..c4f3cc0c0775 100644
--- a/net/sunrpc/xdr.c
+++ b/net/sunrpc/xdr.c
@@ -797,6 +797,8 @@ void xdr_init_decode(struct xdr_stream *xdr, struct xdr_buf *buf, __be32 *p)
xdr_set_iov(xdr, buf->head, buf->len);
else if (buf->page_len != 0)
xdr_set_page_base(xdr, 0, buf->len);
+ else
+ xdr_set_iov(xdr, buf->head, buf->len);
if (p != NULL && p > xdr->p && xdr->end >= p) {
xdr->nwords -= p - xdr->p;
xdr->p = p;
diff --git a/net/sunrpc/xprtrdma/backchannel.c b/net/sunrpc/xprtrdma/backchannel.c
index 2dcd7640eeb5..87762d976b63 100644
--- a/net/sunrpc/xprtrdma/backchannel.c
+++ b/net/sunrpc/xprtrdma/backchannel.c
@@ -192,6 +192,22 @@ int xprt_rdma_bc_up(struct svc_serv *serv, struct net *net)
}
/**
+ * xprt_rdma_bc_maxpayload - Return maximum backchannel message size
+ * @xprt: transport
+ *
+ * Returns maximum size, in bytes, of a backchannel message
+ */
+size_t xprt_rdma_bc_maxpayload(struct rpc_xprt *xprt)
+{
+ struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
+ struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data;
+ size_t maxmsg;
+
+ maxmsg = min_t(unsigned int, cdata->inline_rsize, cdata->inline_wsize);
+ return maxmsg - RPCRDMA_HDRLEN_MIN;
+}
+
+/**
* rpcrdma_bc_marshal_reply - Send backwards direction reply
* @rqst: buffer containing RPC reply data
*
diff --git a/net/sunrpc/xprtrdma/fmr_ops.c b/net/sunrpc/xprtrdma/fmr_ops.c
index b289e106540b..6326ebe8b595 100644
--- a/net/sunrpc/xprtrdma/fmr_ops.c
+++ b/net/sunrpc/xprtrdma/fmr_ops.c
@@ -35,10 +35,71 @@
/* Maximum scatter/gather per FMR */
#define RPCRDMA_MAX_FMR_SGES (64)
+static struct workqueue_struct *fmr_recovery_wq;
+
+#define FMR_RECOVERY_WQ_FLAGS (WQ_UNBOUND)
+
+int
+fmr_alloc_recovery_wq(void)
+{
+ fmr_recovery_wq = alloc_workqueue("fmr_recovery", WQ_UNBOUND, 0);
+ return !fmr_recovery_wq ? -ENOMEM : 0;
+}
+
+void
+fmr_destroy_recovery_wq(void)
+{
+ struct workqueue_struct *wq;
+
+ if (!fmr_recovery_wq)
+ return;
+
+ wq = fmr_recovery_wq;
+ fmr_recovery_wq = NULL;
+ destroy_workqueue(wq);
+}
+
+static int
+__fmr_unmap(struct rpcrdma_mw *mw)
+{
+ LIST_HEAD(l);
+
+ list_add(&mw->fmr.fmr->list, &l);
+ return ib_unmap_fmr(&l);
+}
+
+/* Deferred reset of a single FMR. Generate a fresh rkey by
+ * replacing the MR. There's no recovery if this fails.
+ */
+static void
+__fmr_recovery_worker(struct work_struct *work)
+{
+ struct rpcrdma_mw *mw = container_of(work, struct rpcrdma_mw,
+ mw_work);
+ struct rpcrdma_xprt *r_xprt = mw->mw_xprt;
+
+ __fmr_unmap(mw);
+ rpcrdma_put_mw(r_xprt, mw);
+ return;
+}
+
+/* A broken MR was discovered in a context that can't sleep.
+ * Defer recovery to the recovery worker.
+ */
+static void
+__fmr_queue_recovery(struct rpcrdma_mw *mw)
+{
+ INIT_WORK(&mw->mw_work, __fmr_recovery_worker);
+ queue_work(fmr_recovery_wq, &mw->mw_work);
+}
+
static int
fmr_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep,
struct rpcrdma_create_data_internal *cdata)
{
+ rpcrdma_set_max_header_sizes(ia, cdata, max_t(unsigned int, 1,
+ RPCRDMA_MAX_DATA_SEGS /
+ RPCRDMA_MAX_FMR_SGES));
return 0;
}
@@ -48,7 +109,7 @@ static size_t
fmr_op_maxpages(struct rpcrdma_xprt *r_xprt)
{
return min_t(unsigned int, RPCRDMA_MAX_DATA_SEGS,
- rpcrdma_max_segments(r_xprt) * RPCRDMA_MAX_FMR_SGES);
+ RPCRDMA_MAX_HDR_SEGS * RPCRDMA_MAX_FMR_SGES);
}
static int
@@ -89,6 +150,7 @@ fmr_op_init(struct rpcrdma_xprt *r_xprt)
if (IS_ERR(r->fmr.fmr))
goto out_fmr_err;
+ r->mw_xprt = r_xprt;
list_add(&r->mw_list, &buf->rb_mws);
list_add(&r->mw_all, &buf->rb_all);
}
@@ -104,15 +166,6 @@ out:
return rc;
}
-static int
-__fmr_unmap(struct rpcrdma_mw *r)
-{
- LIST_HEAD(l);
-
- list_add(&r->fmr.fmr->list, &l);
- return ib_unmap_fmr(&l);
-}
-
/* Use the ib_map_phys_fmr() verb to register a memory region
* for remote access via RDMA READ or RDMA WRITE.
*/
@@ -183,15 +236,10 @@ static void
__fmr_dma_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg)
{
struct ib_device *device = r_xprt->rx_ia.ri_device;
- struct rpcrdma_mw *mw = seg->rl_mw;
int nsegs = seg->mr_nsegs;
- seg->rl_mw = NULL;
-
while (nsegs--)
rpcrdma_unmap_one(device, seg++);
-
- rpcrdma_put_mw(r_xprt, mw);
}
/* Invalidate all memory regions that were registered for "req".
@@ -234,42 +282,50 @@ fmr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
seg = &req->rl_segments[i];
__fmr_dma_unmap(r_xprt, seg);
+ rpcrdma_put_mw(r_xprt, seg->rl_mw);
i += seg->mr_nsegs;
seg->mr_nsegs = 0;
+ seg->rl_mw = NULL;
}
req->rl_nchunks = 0;
}
-/* Use the ib_unmap_fmr() verb to prevent further remote
- * access via RDMA READ or RDMA WRITE.
+/* Use a slow, safe mechanism to invalidate all memory regions
+ * that were registered for "req".
+ *
+ * In the asynchronous case, DMA unmapping occurs first here
+ * because the rpcrdma_mr_seg is released immediately after this
+ * call. It's contents won't be available in __fmr_dma_unmap later.
+ * FIXME.
*/
-static int
-fmr_op_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg)
+static void
+fmr_op_unmap_safe(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
+ bool sync)
{
- struct rpcrdma_ia *ia = &r_xprt->rx_ia;
- struct rpcrdma_mr_seg *seg1 = seg;
- struct rpcrdma_mw *mw = seg1->rl_mw;
- int rc, nsegs = seg->mr_nsegs;
+ struct rpcrdma_mr_seg *seg;
+ struct rpcrdma_mw *mw;
+ unsigned int i;
- dprintk("RPC: %s: FMR %p\n", __func__, mw);
+ for (i = 0; req->rl_nchunks; req->rl_nchunks--) {
+ seg = &req->rl_segments[i];
+ mw = seg->rl_mw;
- seg1->rl_mw = NULL;
- while (seg1->mr_nsegs--)
- rpcrdma_unmap_one(ia->ri_device, seg++);
- rc = __fmr_unmap(mw);
- if (rc)
- goto out_err;
- rpcrdma_put_mw(r_xprt, mw);
- return nsegs;
+ if (sync) {
+ /* ORDER */
+ __fmr_unmap(mw);
+ __fmr_dma_unmap(r_xprt, seg);
+ rpcrdma_put_mw(r_xprt, mw);
+ } else {
+ __fmr_dma_unmap(r_xprt, seg);
+ __fmr_queue_recovery(mw);
+ }
-out_err:
- /* The FMR is abandoned, but remains in rb_all. fmr_op_destroy
- * will attempt to release it when the transport is destroyed.
- */
- dprintk("RPC: %s: ib_unmap_fmr status %i\n", __func__, rc);
- return nsegs;
+ i += seg->mr_nsegs;
+ seg->mr_nsegs = 0;
+ seg->rl_mw = NULL;
+ }
}
static void
@@ -295,7 +351,7 @@ fmr_op_destroy(struct rpcrdma_buffer *buf)
const struct rpcrdma_memreg_ops rpcrdma_fmr_memreg_ops = {
.ro_map = fmr_op_map,
.ro_unmap_sync = fmr_op_unmap_sync,
- .ro_unmap = fmr_op_unmap,
+ .ro_unmap_safe = fmr_op_unmap_safe,
.ro_open = fmr_op_open,
.ro_maxpages = fmr_op_maxpages,
.ro_init = fmr_op_init,
diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c
index c250924a9fd3..c0947544babe 100644
--- a/net/sunrpc/xprtrdma/frwr_ops.c
+++ b/net/sunrpc/xprtrdma/frwr_ops.c
@@ -98,6 +98,47 @@ frwr_destroy_recovery_wq(void)
destroy_workqueue(wq);
}
+static int
+__frwr_reset_mr(struct rpcrdma_ia *ia, struct rpcrdma_mw *r)
+{
+ struct rpcrdma_frmr *f = &r->frmr;
+ int rc;
+
+ rc = ib_dereg_mr(f->fr_mr);
+ if (rc) {
+ pr_warn("rpcrdma: ib_dereg_mr status %d, frwr %p orphaned\n",
+ rc, r);
+ return rc;
+ }
+
+ f->fr_mr = ib_alloc_mr(ia->ri_pd, IB_MR_TYPE_MEM_REG,
+ ia->ri_max_frmr_depth);
+ if (IS_ERR(f->fr_mr)) {
+ pr_warn("rpcrdma: ib_alloc_mr status %ld, frwr %p orphaned\n",
+ PTR_ERR(f->fr_mr), r);
+ return PTR_ERR(f->fr_mr);
+ }
+
+ dprintk("RPC: %s: recovered FRMR %p\n", __func__, r);
+ f->fr_state = FRMR_IS_INVALID;
+ return 0;
+}
+
+static void
+__frwr_reset_and_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mw *mw)
+{
+ struct rpcrdma_ia *ia = &r_xprt->rx_ia;
+ struct rpcrdma_frmr *f = &mw->frmr;
+ int rc;
+
+ rc = __frwr_reset_mr(ia, mw);
+ ib_dma_unmap_sg(ia->ri_device, f->fr_sg, f->fr_nents, f->fr_dir);
+ if (rc)
+ return;
+
+ rpcrdma_put_mw(r_xprt, mw);
+}
+
/* Deferred reset of a single FRMR. Generate a fresh rkey by
* replacing the MR.
*
@@ -109,26 +150,10 @@ static void
__frwr_recovery_worker(struct work_struct *work)
{
struct rpcrdma_mw *r = container_of(work, struct rpcrdma_mw,
- frmr.fr_work);
- struct rpcrdma_xprt *r_xprt = r->frmr.fr_xprt;
- unsigned int depth = r_xprt->rx_ia.ri_max_frmr_depth;
- struct ib_pd *pd = r_xprt->rx_ia.ri_pd;
-
- if (ib_dereg_mr(r->frmr.fr_mr))
- goto out_fail;
+ mw_work);
- r->frmr.fr_mr = ib_alloc_mr(pd, IB_MR_TYPE_MEM_REG, depth);
- if (IS_ERR(r->frmr.fr_mr))
- goto out_fail;
-
- dprintk("RPC: %s: recovered FRMR %p\n", __func__, r);
- r->frmr.fr_state = FRMR_IS_INVALID;
- rpcrdma_put_mw(r_xprt, r);
+ __frwr_reset_and_unmap(r->mw_xprt, r);
return;
-
-out_fail:
- pr_warn("RPC: %s: FRMR %p unrecovered\n",
- __func__, r);
}
/* A broken MR was discovered in a context that can't sleep.
@@ -137,8 +162,8 @@ out_fail:
static void
__frwr_queue_recovery(struct rpcrdma_mw *r)
{
- INIT_WORK(&r->frmr.fr_work, __frwr_recovery_worker);
- queue_work(frwr_recovery_wq, &r->frmr.fr_work);
+ INIT_WORK(&r->mw_work, __frwr_recovery_worker);
+ queue_work(frwr_recovery_wq, &r->mw_work);
}
static int
@@ -152,11 +177,11 @@ __frwr_init(struct rpcrdma_mw *r, struct ib_pd *pd, struct ib_device *device,
if (IS_ERR(f->fr_mr))
goto out_mr_err;
- f->sg = kcalloc(depth, sizeof(*f->sg), GFP_KERNEL);
- if (!f->sg)
+ f->fr_sg = kcalloc(depth, sizeof(*f->fr_sg), GFP_KERNEL);
+ if (!f->fr_sg)
goto out_list_err;
- sg_init_table(f->sg, depth);
+ sg_init_table(f->fr_sg, depth);
init_completion(&f->fr_linv_done);
@@ -185,7 +210,7 @@ __frwr_release(struct rpcrdma_mw *r)
if (rc)
dprintk("RPC: %s: ib_dereg_mr status %i\n",
__func__, rc);
- kfree(r->frmr.sg);
+ kfree(r->frmr.fr_sg);
}
static int
@@ -231,6 +256,9 @@ frwr_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep,
depth;
}
+ rpcrdma_set_max_header_sizes(ia, cdata, max_t(unsigned int, 1,
+ RPCRDMA_MAX_DATA_SEGS /
+ ia->ri_max_frmr_depth));
return 0;
}
@@ -243,7 +271,7 @@ frwr_op_maxpages(struct rpcrdma_xprt *r_xprt)
struct rpcrdma_ia *ia = &r_xprt->rx_ia;
return min_t(unsigned int, RPCRDMA_MAX_DATA_SEGS,
- rpcrdma_max_segments(r_xprt) * ia->ri_max_frmr_depth);
+ RPCRDMA_MAX_HDR_SEGS * ia->ri_max_frmr_depth);
}
static void
@@ -350,9 +378,9 @@ frwr_op_init(struct rpcrdma_xprt *r_xprt)
return rc;
}
+ r->mw_xprt = r_xprt;
list_add(&r->mw_list, &buf->rb_mws);
list_add(&r->mw_all, &buf->rb_all);
- r->frmr.fr_xprt = r_xprt;
}
return 0;
@@ -396,12 +424,12 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
for (i = 0; i < nsegs;) {
if (seg->mr_page)
- sg_set_page(&frmr->sg[i],
+ sg_set_page(&frmr->fr_sg[i],
seg->mr_page,
seg->mr_len,
offset_in_page(seg->mr_offset));
else
- sg_set_buf(&frmr->sg[i], seg->mr_offset,
+ sg_set_buf(&frmr->fr_sg[i], seg->mr_offset,
seg->mr_len);
++seg;
@@ -412,25 +440,26 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
break;
}
- frmr->sg_nents = i;
+ frmr->fr_nents = i;
+ frmr->fr_dir = direction;
- dma_nents = ib_dma_map_sg(device, frmr->sg, frmr->sg_nents, direction);
+ dma_nents = ib_dma_map_sg(device, frmr->fr_sg, frmr->fr_nents, direction);
if (!dma_nents) {
pr_err("RPC: %s: failed to dma map sg %p sg_nents %u\n",
- __func__, frmr->sg, frmr->sg_nents);
+ __func__, frmr->fr_sg, frmr->fr_nents);
return -ENOMEM;
}
- n = ib_map_mr_sg(mr, frmr->sg, frmr->sg_nents, PAGE_SIZE);
- if (unlikely(n != frmr->sg_nents)) {
+ n = ib_map_mr_sg(mr, frmr->fr_sg, frmr->fr_nents, NULL, PAGE_SIZE);
+ if (unlikely(n != frmr->fr_nents)) {
pr_err("RPC: %s: failed to map mr %p (%u/%u)\n",
- __func__, frmr->fr_mr, n, frmr->sg_nents);
+ __func__, frmr->fr_mr, n, frmr->fr_nents);
rc = n < 0 ? n : -EINVAL;
goto out_senderr;
}
dprintk("RPC: %s: Using frmr %p to map %u segments (%u bytes)\n",
- __func__, mw, frmr->sg_nents, mr->length);
+ __func__, mw, frmr->fr_nents, mr->length);
key = (u8)(mr->rkey & 0x000000FF);
ib_update_fast_reg_key(mr, ++key);
@@ -452,18 +481,16 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
if (rc)
goto out_senderr;
- seg1->mr_dir = direction;
seg1->rl_mw = mw;
seg1->mr_rkey = mr->rkey;
seg1->mr_base = mr->iova;
- seg1->mr_nsegs = frmr->sg_nents;
+ seg1->mr_nsegs = frmr->fr_nents;
seg1->mr_len = mr->length;
- return frmr->sg_nents;
+ return frmr->fr_nents;
out_senderr:
dprintk("RPC: %s: ib_post_send status %i\n", __func__, rc);
- ib_dma_unmap_sg(device, frmr->sg, dma_nents, direction);
__frwr_queue_recovery(mw);
return rc;
}
@@ -487,24 +514,6 @@ __frwr_prepare_linv_wr(struct rpcrdma_mr_seg *seg)
return invalidate_wr;
}
-static void
-__frwr_dma_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
- int rc)
-{
- struct ib_device *device = r_xprt->rx_ia.ri_device;
- struct rpcrdma_mw *mw = seg->rl_mw;
- struct rpcrdma_frmr *f = &mw->frmr;
-
- seg->rl_mw = NULL;
-
- ib_dma_unmap_sg(device, f->sg, f->sg_nents, seg->mr_dir);
-
- if (!rc)
- rpcrdma_put_mw(r_xprt, mw);
- else
- __frwr_queue_recovery(mw);
-}
-
/* Invalidate all memory regions that were registered for "req".
*
* Sleeps until it is safe for the host CPU to access the
@@ -518,6 +527,7 @@ frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
struct rpcrdma_mr_seg *seg;
unsigned int i, nchunks;
struct rpcrdma_frmr *f;
+ struct rpcrdma_mw *mw;
int rc;
dprintk("RPC: %s: req %p\n", __func__, req);
@@ -558,11 +568,8 @@ frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
* unless ri_id->qp is a valid pointer.
*/
rc = ib_post_send(ia->ri_id->qp, invalidate_wrs, &bad_wr);
- if (rc) {
- pr_warn("%s: ib_post_send failed %i\n", __func__, rc);
- rdma_disconnect(ia->ri_id);
- goto unmap;
- }
+ if (rc)
+ goto reset_mrs;
wait_for_completion(&f->fr_linv_done);
@@ -572,56 +579,65 @@ frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
unmap:
for (i = 0, nchunks = req->rl_nchunks; nchunks; nchunks--) {
seg = &req->rl_segments[i];
+ mw = seg->rl_mw;
+ seg->rl_mw = NULL;
- __frwr_dma_unmap(r_xprt, seg, rc);
+ ib_dma_unmap_sg(ia->ri_device, f->fr_sg, f->fr_nents,
+ f->fr_dir);
+ rpcrdma_put_mw(r_xprt, mw);
i += seg->mr_nsegs;
seg->mr_nsegs = 0;
}
req->rl_nchunks = 0;
-}
+ return;
-/* Post a LOCAL_INV Work Request to prevent further remote access
- * via RDMA READ or RDMA WRITE.
- */
-static int
-frwr_op_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg)
-{
- struct rpcrdma_mr_seg *seg1 = seg;
- struct rpcrdma_ia *ia = &r_xprt->rx_ia;
- struct rpcrdma_mw *mw = seg1->rl_mw;
- struct rpcrdma_frmr *frmr = &mw->frmr;
- struct ib_send_wr *invalidate_wr, *bad_wr;
- int rc, nsegs = seg->mr_nsegs;
+reset_mrs:
+ pr_warn("%s: ib_post_send failed %i\n", __func__, rc);
- dprintk("RPC: %s: FRMR %p\n", __func__, mw);
+ /* Find and reset the MRs in the LOCAL_INV WRs that did not
+ * get posted. This is synchronous, and slow.
+ */
+ for (i = 0, nchunks = req->rl_nchunks; nchunks; nchunks--) {
+ seg = &req->rl_segments[i];
+ mw = seg->rl_mw;
+ f = &mw->frmr;
- seg1->rl_mw = NULL;
- frmr->fr_state = FRMR_IS_INVALID;
- invalidate_wr = &mw->frmr.fr_invwr;
+ if (mw->frmr.fr_mr->rkey == bad_wr->ex.invalidate_rkey) {
+ __frwr_reset_mr(ia, mw);
+ bad_wr = bad_wr->next;
+ }
- memset(invalidate_wr, 0, sizeof(*invalidate_wr));
- frmr->fr_cqe.done = frwr_wc_localinv;
- invalidate_wr->wr_cqe = &frmr->fr_cqe;
- invalidate_wr->opcode = IB_WR_LOCAL_INV;
- invalidate_wr->ex.invalidate_rkey = frmr->fr_mr->rkey;
- DECR_CQCOUNT(&r_xprt->rx_ep);
+ i += seg->mr_nsegs;
+ }
+ goto unmap;
+}
- ib_dma_unmap_sg(ia->ri_device, frmr->sg, frmr->sg_nents, seg1->mr_dir);
- read_lock(&ia->ri_qplock);
- rc = ib_post_send(ia->ri_id->qp, invalidate_wr, &bad_wr);
- read_unlock(&ia->ri_qplock);
- if (rc)
- goto out_err;
+/* Use a slow, safe mechanism to invalidate all memory regions
+ * that were registered for "req".
+ */
+static void
+frwr_op_unmap_safe(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
+ bool sync)
+{
+ struct rpcrdma_mr_seg *seg;
+ struct rpcrdma_mw *mw;
+ unsigned int i;
- rpcrdma_put_mw(r_xprt, mw);
- return nsegs;
+ for (i = 0; req->rl_nchunks; req->rl_nchunks--) {
+ seg = &req->rl_segments[i];
+ mw = seg->rl_mw;
-out_err:
- dprintk("RPC: %s: ib_post_send status %i\n", __func__, rc);
- __frwr_queue_recovery(mw);
- return nsegs;
+ if (sync)
+ __frwr_reset_and_unmap(r_xprt, mw);
+ else
+ __frwr_queue_recovery(mw);
+
+ i += seg->mr_nsegs;
+ seg->mr_nsegs = 0;
+ seg->rl_mw = NULL;
+ }
}
static void
@@ -643,7 +659,7 @@ frwr_op_destroy(struct rpcrdma_buffer *buf)
const struct rpcrdma_memreg_ops rpcrdma_frwr_memreg_ops = {
.ro_map = frwr_op_map,
.ro_unmap_sync = frwr_op_unmap_sync,
- .ro_unmap = frwr_op_unmap,
+ .ro_unmap_safe = frwr_op_unmap_safe,
.ro_open = frwr_op_open,
.ro_maxpages = frwr_op_maxpages,
.ro_init = frwr_op_init,
diff --git a/net/sunrpc/xprtrdma/physical_ops.c b/net/sunrpc/xprtrdma/physical_ops.c
index 481b9b6f4a15..3750596cc432 100644
--- a/net/sunrpc/xprtrdma/physical_ops.c
+++ b/net/sunrpc/xprtrdma/physical_ops.c
@@ -36,8 +36,11 @@ physical_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep,
__func__, PTR_ERR(mr));
return -ENOMEM;
}
-
ia->ri_dma_mr = mr;
+
+ rpcrdma_set_max_header_sizes(ia, cdata, min_t(unsigned int,
+ RPCRDMA_MAX_DATA_SEGS,
+ RPCRDMA_MAX_HDR_SEGS));
return 0;
}
@@ -47,7 +50,7 @@ static size_t
physical_op_maxpages(struct rpcrdma_xprt *r_xprt)
{
return min_t(unsigned int, RPCRDMA_MAX_DATA_SEGS,
- rpcrdma_max_segments(r_xprt));
+ RPCRDMA_MAX_HDR_SEGS);
}
static int
@@ -71,17 +74,6 @@ physical_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
return 1;
}
-/* Unmap a memory region, but leave it registered.
- */
-static int
-physical_op_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg)
-{
- struct rpcrdma_ia *ia = &r_xprt->rx_ia;
-
- rpcrdma_unmap_one(ia->ri_device, seg);
- return 1;
-}
-
/* DMA unmap all memory regions that were mapped for "req".
*/
static void
@@ -94,6 +86,25 @@ physical_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
rpcrdma_unmap_one(device, &req->rl_segments[i++]);
}
+/* Use a slow, safe mechanism to invalidate all memory regions
+ * that were registered for "req".
+ *
+ * For physical memory registration, there is no good way to
+ * fence a single MR that has been advertised to the server. The
+ * client has already handed the server an R_key that cannot be
+ * invalidated and is shared by all MRs on this connection.
+ * Tearing down the PD might be the only safe choice, but it's
+ * not clear that a freshly acquired DMA R_key would be different
+ * than the one used by the PD that was just destroyed.
+ * FIXME.
+ */
+static void
+physical_op_unmap_safe(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
+ bool sync)
+{
+ physical_op_unmap_sync(r_xprt, req);
+}
+
static void
physical_op_destroy(struct rpcrdma_buffer *buf)
{
@@ -102,7 +113,7 @@ physical_op_destroy(struct rpcrdma_buffer *buf)
const struct rpcrdma_memreg_ops rpcrdma_physical_memreg_ops = {
.ro_map = physical_op_map,
.ro_unmap_sync = physical_op_unmap_sync,
- .ro_unmap = physical_op_unmap,
+ .ro_unmap_safe = physical_op_unmap_safe,
.ro_open = physical_op_open,
.ro_maxpages = physical_op_maxpages,
.ro_init = physical_op_init,
diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
index 888823bb6dae..35a81096e83d 100644
--- a/net/sunrpc/xprtrdma/rpc_rdma.c
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -61,26 +61,84 @@ enum rpcrdma_chunktype {
rpcrdma_replych
};
-#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
static const char transfertypes[][12] = {
- "pure inline", /* no chunks */
- " read chunk", /* some argument via rdma read */
- "*read chunk", /* entire request via rdma read */
- "write chunk", /* some result via rdma write */
+ "inline", /* no chunks */
+ "read list", /* some argument via rdma read */
+ "*read list", /* entire request via rdma read */
+ "write list", /* some result via rdma write */
"reply chunk" /* entire reply via rdma write */
};
-#endif
+
+/* Returns size of largest RPC-over-RDMA header in a Call message
+ *
+ * The largest Call header contains a full-size Read list and a
+ * minimal Reply chunk.
+ */
+static unsigned int rpcrdma_max_call_header_size(unsigned int maxsegs)
+{
+ unsigned int size;
+
+ /* Fixed header fields and list discriminators */
+ size = RPCRDMA_HDRLEN_MIN;
+
+ /* Maximum Read list size */
+ maxsegs += 2; /* segment for head and tail buffers */
+ size = maxsegs * sizeof(struct rpcrdma_read_chunk);
+
+ /* Minimal Read chunk size */
+ size += sizeof(__be32); /* segment count */
+ size += sizeof(struct rpcrdma_segment);
+ size += sizeof(__be32); /* list discriminator */
+
+ dprintk("RPC: %s: max call header size = %u\n",
+ __func__, size);
+ return size;
+}
+
+/* Returns size of largest RPC-over-RDMA header in a Reply message
+ *
+ * There is only one Write list or one Reply chunk per Reply
+ * message. The larger list is the Write list.
+ */
+static unsigned int rpcrdma_max_reply_header_size(unsigned int maxsegs)
+{
+ unsigned int size;
+
+ /* Fixed header fields and list discriminators */
+ size = RPCRDMA_HDRLEN_MIN;
+
+ /* Maximum Write list size */
+ maxsegs += 2; /* segment for head and tail buffers */
+ size = sizeof(__be32); /* segment count */
+ size += maxsegs * sizeof(struct rpcrdma_segment);
+ size += sizeof(__be32); /* list discriminator */
+
+ dprintk("RPC: %s: max reply header size = %u\n",
+ __func__, size);
+ return size;
+}
+
+void rpcrdma_set_max_header_sizes(struct rpcrdma_ia *ia,
+ struct rpcrdma_create_data_internal *cdata,
+ unsigned int maxsegs)
+{
+ ia->ri_max_inline_write = cdata->inline_wsize -
+ rpcrdma_max_call_header_size(maxsegs);
+ ia->ri_max_inline_read = cdata->inline_rsize -
+ rpcrdma_max_reply_header_size(maxsegs);
+}
/* The client can send a request inline as long as the RPCRDMA header
* plus the RPC call fit under the transport's inline limit. If the
* combined call message size exceeds that limit, the client must use
* the read chunk list for this operation.
*/
-static bool rpcrdma_args_inline(struct rpc_rqst *rqst)
+static bool rpcrdma_args_inline(struct rpcrdma_xprt *r_xprt,
+ struct rpc_rqst *rqst)
{
- unsigned int callsize = RPCRDMA_HDRLEN_MIN + rqst->rq_snd_buf.len;
+ struct rpcrdma_ia *ia = &r_xprt->rx_ia;
- return callsize <= RPCRDMA_INLINE_WRITE_THRESHOLD(rqst);
+ return rqst->rq_snd_buf.len <= ia->ri_max_inline_write;
}
/* The client can't know how large the actual reply will be. Thus it
@@ -89,11 +147,12 @@ static bool rpcrdma_args_inline(struct rpc_rqst *rqst)
* limit, the client must provide a write list or a reply chunk for
* this request.
*/
-static bool rpcrdma_results_inline(struct rpc_rqst *rqst)
+static bool rpcrdma_results_inline(struct rpcrdma_xprt *r_xprt,
+ struct rpc_rqst *rqst)
{
- unsigned int repsize = RPCRDMA_HDRLEN_MIN + rqst->rq_rcv_buf.buflen;
+ struct rpcrdma_ia *ia = &r_xprt->rx_ia;
- return repsize <= RPCRDMA_INLINE_READ_THRESHOLD(rqst);
+ return rqst->rq_rcv_buf.buflen <= ia->ri_max_inline_read;
}
static int
@@ -226,23 +285,16 @@ rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos,
return n;
}
-/*
- * Create read/write chunk lists, and reply chunks, for RDMA
- *
- * Assume check against THRESHOLD has been done, and chunks are required.
- * Assume only encoding one list entry for read|write chunks. The NFSv3
- * protocol is simple enough to allow this as it only has a single "bulk
- * result" in each procedure - complicated NFSv4 COMPOUNDs are not. (The
- * RDMA/Sessions NFSv4 proposal addresses this for future v4 revs.)
- *
- * When used for a single reply chunk (which is a special write
- * chunk used for the entire reply, rather than just the data), it
- * is used primarily for READDIR and READLINK which would otherwise
- * be severely size-limited by a small rdma inline read max. The server
- * response will come back as an RDMA Write, followed by a message
- * of type RDMA_NOMSG carrying the xid and length. As a result, reply
- * chunks do not provide data alignment, however they do not require
- * "fixup" (moving the response to the upper layer buffer) either.
+static inline __be32 *
+xdr_encode_rdma_segment(__be32 *iptr, struct rpcrdma_mr_seg *seg)
+{
+ *iptr++ = cpu_to_be32(seg->mr_rkey);
+ *iptr++ = cpu_to_be32(seg->mr_len);
+ return xdr_encode_hyper(iptr, seg->mr_base);
+}
+
+/* XDR-encode the Read list. Supports encoding a list of read
+ * segments that belong to a single read chunk.
*
* Encoding key for single-list chunks (HLOO = Handle32 Length32 Offset64):
*
@@ -250,131 +302,190 @@ rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos,
* N elements, position P (same P for all chunks of same arg!):
* 1 - PHLOO - 1 - PHLOO - ... - 1 - PHLOO - 0
*
+ * Returns a pointer to the XDR word in the RDMA header following
+ * the end of the Read list, or an error pointer.
+ */
+static __be32 *
+rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt,
+ struct rpcrdma_req *req, struct rpc_rqst *rqst,
+ __be32 *iptr, enum rpcrdma_chunktype rtype)
+{
+ struct rpcrdma_mr_seg *seg = req->rl_nextseg;
+ unsigned int pos;
+ int n, nsegs;
+
+ if (rtype == rpcrdma_noch) {
+ *iptr++ = xdr_zero; /* item not present */
+ return iptr;
+ }
+
+ pos = rqst->rq_snd_buf.head[0].iov_len;
+ if (rtype == rpcrdma_areadch)
+ pos = 0;
+ nsegs = rpcrdma_convert_iovs(&rqst->rq_snd_buf, pos, rtype, seg,
+ RPCRDMA_MAX_SEGS - req->rl_nchunks);
+ if (nsegs < 0)
+ return ERR_PTR(nsegs);
+
+ do {
+ n = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs, false);
+ if (n <= 0)
+ return ERR_PTR(n);
+
+ *iptr++ = xdr_one; /* item present */
+
+ /* All read segments in this chunk
+ * have the same "position".
+ */
+ *iptr++ = cpu_to_be32(pos);
+ iptr = xdr_encode_rdma_segment(iptr, seg);
+
+ dprintk("RPC: %5u %s: read segment pos %u "
+ "%d@0x%016llx:0x%08x (%s)\n",
+ rqst->rq_task->tk_pid, __func__, pos,
+ seg->mr_len, (unsigned long long)seg->mr_base,
+ seg->mr_rkey, n < nsegs ? "more" : "last");
+
+ r_xprt->rx_stats.read_chunk_count++;
+ req->rl_nchunks++;
+ seg += n;
+ nsegs -= n;
+ } while (nsegs);
+ req->rl_nextseg = seg;
+
+ /* Finish Read list */
+ *iptr++ = xdr_zero; /* Next item not present */
+ return iptr;
+}
+
+/* XDR-encode the Write list. Supports encoding a list containing
+ * one array of plain segments that belong to a single write chunk.
+ *
+ * Encoding key for single-list chunks (HLOO = Handle32 Length32 Offset64):
+ *
* Write chunklist (a list of (one) counted array):
* N elements:
* 1 - N - HLOO - HLOO - ... - HLOO - 0
*
+ * Returns a pointer to the XDR word in the RDMA header following
+ * the end of the Write list, or an error pointer.
+ */
+static __be32 *
+rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
+ struct rpc_rqst *rqst, __be32 *iptr,
+ enum rpcrdma_chunktype wtype)
+{
+ struct rpcrdma_mr_seg *seg = req->rl_nextseg;
+ int n, nsegs, nchunks;
+ __be32 *segcount;
+
+ if (wtype != rpcrdma_writech) {
+ *iptr++ = xdr_zero; /* no Write list present */
+ return iptr;
+ }
+
+ nsegs = rpcrdma_convert_iovs(&rqst->rq_rcv_buf,
+ rqst->rq_rcv_buf.head[0].iov_len,
+ wtype, seg,
+ RPCRDMA_MAX_SEGS - req->rl_nchunks);
+ if (nsegs < 0)
+ return ERR_PTR(nsegs);
+
+ *iptr++ = xdr_one; /* Write list present */
+ segcount = iptr++; /* save location of segment count */
+
+ nchunks = 0;
+ do {
+ n = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs, true);
+ if (n <= 0)
+ return ERR_PTR(n);
+
+ iptr = xdr_encode_rdma_segment(iptr, seg);
+
+ dprintk("RPC: %5u %s: write segment "
+ "%d@0x016%llx:0x%08x (%s)\n",
+ rqst->rq_task->tk_pid, __func__,
+ seg->mr_len, (unsigned long long)seg->mr_base,
+ seg->mr_rkey, n < nsegs ? "more" : "last");
+
+ r_xprt->rx_stats.write_chunk_count++;
+ r_xprt->rx_stats.total_rdma_request += seg->mr_len;
+ req->rl_nchunks++;
+ nchunks++;
+ seg += n;
+ nsegs -= n;
+ } while (nsegs);
+ req->rl_nextseg = seg;
+
+ /* Update count of segments in this Write chunk */
+ *segcount = cpu_to_be32(nchunks);
+
+ /* Finish Write list */
+ *iptr++ = xdr_zero; /* Next item not present */
+ return iptr;
+}
+
+/* XDR-encode the Reply chunk. Supports encoding an array of plain
+ * segments that belong to a single write (reply) chunk.
+ *
+ * Encoding key for single-list chunks (HLOO = Handle32 Length32 Offset64):
+ *
* Reply chunk (a counted array):
* N elements:
* 1 - N - HLOO - HLOO - ... - HLOO
*
- * Returns positive RPC/RDMA header size, or negative errno.
+ * Returns a pointer to the XDR word in the RDMA header following
+ * the end of the Reply chunk, or an error pointer.
*/
-
-static ssize_t
-rpcrdma_create_chunks(struct rpc_rqst *rqst, struct xdr_buf *target,
- struct rpcrdma_msg *headerp, enum rpcrdma_chunktype type)
+static __be32 *
+rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt,
+ struct rpcrdma_req *req, struct rpc_rqst *rqst,
+ __be32 *iptr, enum rpcrdma_chunktype wtype)
{
- struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
- struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(rqst->rq_xprt);
- int n, nsegs, nchunks = 0;
- unsigned int pos;
- struct rpcrdma_mr_seg *seg = req->rl_segments;
- struct rpcrdma_read_chunk *cur_rchunk = NULL;
- struct rpcrdma_write_array *warray = NULL;
- struct rpcrdma_write_chunk *cur_wchunk = NULL;
- __be32 *iptr = headerp->rm_body.rm_chunks;
- int (*map)(struct rpcrdma_xprt *, struct rpcrdma_mr_seg *, int, bool);
-
- if (type == rpcrdma_readch || type == rpcrdma_areadch) {
- /* a read chunk - server will RDMA Read our memory */
- cur_rchunk = (struct rpcrdma_read_chunk *) iptr;
- } else {
- /* a write or reply chunk - server will RDMA Write our memory */
- *iptr++ = xdr_zero; /* encode a NULL read chunk list */
- if (type == rpcrdma_replych)
- *iptr++ = xdr_zero; /* a NULL write chunk list */
- warray = (struct rpcrdma_write_array *) iptr;
- cur_wchunk = (struct rpcrdma_write_chunk *) (warray + 1);
- }
+ struct rpcrdma_mr_seg *seg = req->rl_nextseg;
+ int n, nsegs, nchunks;
+ __be32 *segcount;
- if (type == rpcrdma_replych || type == rpcrdma_areadch)
- pos = 0;
- else
- pos = target->head[0].iov_len;
+ if (wtype != rpcrdma_replych) {
+ *iptr++ = xdr_zero; /* no Reply chunk present */
+ return iptr;
+ }
- nsegs = rpcrdma_convert_iovs(target, pos, type, seg, RPCRDMA_MAX_SEGS);
+ nsegs = rpcrdma_convert_iovs(&rqst->rq_rcv_buf, 0, wtype, seg,
+ RPCRDMA_MAX_SEGS - req->rl_nchunks);
if (nsegs < 0)
- return nsegs;
+ return ERR_PTR(nsegs);
- map = r_xprt->rx_ia.ri_ops->ro_map;
+ *iptr++ = xdr_one; /* Reply chunk present */
+ segcount = iptr++; /* save location of segment count */
+
+ nchunks = 0;
do {
- n = map(r_xprt, seg, nsegs, cur_wchunk != NULL);
+ n = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs, true);
if (n <= 0)
- goto out;
- if (cur_rchunk) { /* read */
- cur_rchunk->rc_discrim = xdr_one;
- /* all read chunks have the same "position" */
- cur_rchunk->rc_position = cpu_to_be32(pos);
- cur_rchunk->rc_target.rs_handle =
- cpu_to_be32(seg->mr_rkey);
- cur_rchunk->rc_target.rs_length =
- cpu_to_be32(seg->mr_len);
- xdr_encode_hyper(
- (__be32 *)&cur_rchunk->rc_target.rs_offset,
- seg->mr_base);
- dprintk("RPC: %s: read chunk "
- "elem %d@0x%llx:0x%x pos %u (%s)\n", __func__,
- seg->mr_len, (unsigned long long)seg->mr_base,
- seg->mr_rkey, pos, n < nsegs ? "more" : "last");
- cur_rchunk++;
- r_xprt->rx_stats.read_chunk_count++;
- } else { /* write/reply */
- cur_wchunk->wc_target.rs_handle =
- cpu_to_be32(seg->mr_rkey);
- cur_wchunk->wc_target.rs_length =
- cpu_to_be32(seg->mr_len);
- xdr_encode_hyper(
- (__be32 *)&cur_wchunk->wc_target.rs_offset,
- seg->mr_base);
- dprintk("RPC: %s: %s chunk "
- "elem %d@0x%llx:0x%x (%s)\n", __func__,
- (type == rpcrdma_replych) ? "reply" : "write",
- seg->mr_len, (unsigned long long)seg->mr_base,
- seg->mr_rkey, n < nsegs ? "more" : "last");
- cur_wchunk++;
- if (type == rpcrdma_replych)
- r_xprt->rx_stats.reply_chunk_count++;
- else
- r_xprt->rx_stats.write_chunk_count++;
- r_xprt->rx_stats.total_rdma_request += seg->mr_len;
- }
+ return ERR_PTR(n);
+
+ iptr = xdr_encode_rdma_segment(iptr, seg);
+
+ dprintk("RPC: %5u %s: reply segment "
+ "%d@0x%016llx:0x%08x (%s)\n",
+ rqst->rq_task->tk_pid, __func__,
+ seg->mr_len, (unsigned long long)seg->mr_base,
+ seg->mr_rkey, n < nsegs ? "more" : "last");
+
+ r_xprt->rx_stats.reply_chunk_count++;
+ r_xprt->rx_stats.total_rdma_request += seg->mr_len;
+ req->rl_nchunks++;
nchunks++;
seg += n;
nsegs -= n;
} while (nsegs);
+ req->rl_nextseg = seg;
- /* success. all failures return above */
- req->rl_nchunks = nchunks;
-
- /*
- * finish off header. If write, marshal discrim and nchunks.
- */
- if (cur_rchunk) {
- iptr = (__be32 *) cur_rchunk;
- *iptr++ = xdr_zero; /* finish the read chunk list */
- *iptr++ = xdr_zero; /* encode a NULL write chunk list */
- *iptr++ = xdr_zero; /* encode a NULL reply chunk */
- } else {
- warray->wc_discrim = xdr_one;
- warray->wc_nchunks = cpu_to_be32(nchunks);
- iptr = (__be32 *) cur_wchunk;
- if (type == rpcrdma_writech) {
- *iptr++ = xdr_zero; /* finish the write chunk list */
- *iptr++ = xdr_zero; /* encode a NULL reply chunk */
- }
- }
-
- /*
- * Return header size.
- */
- return (unsigned char *)iptr - (unsigned char *)headerp;
+ /* Update count of segments in the Reply chunk */
+ *segcount = cpu_to_be32(nchunks);
-out:
- for (pos = 0; nchunks--;)
- pos += r_xprt->rx_ia.ri_ops->ro_unmap(r_xprt,
- &req->rl_segments[pos]);
- return n;
+ return iptr;
}
/*
@@ -440,13 +551,10 @@ static void rpcrdma_inline_pullup(struct rpc_rqst *rqst)
* Marshal a request: the primary job of this routine is to choose
* the transfer modes. See comments below.
*
- * Uses multiple RDMA IOVs for a request:
- * [0] -- RPC RDMA header, which uses memory from the *start* of the
- * preregistered buffer that already holds the RPC data in
- * its middle.
- * [1] -- the RPC header/data, marshaled by RPC and the NFS protocol.
- * [2] -- optional padding.
- * [3] -- if padded, header only in [1] and data here.
+ * Prepares up to two IOVs per Call message:
+ *
+ * [0] -- RPC RDMA header
+ * [1] -- the RPC header/data
*
* Returns zero on success, otherwise a negative errno.
*/
@@ -457,24 +565,17 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
struct rpc_xprt *xprt = rqst->rq_xprt;
struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
- char *base;
- size_t rpclen;
- ssize_t hdrlen;
enum rpcrdma_chunktype rtype, wtype;
struct rpcrdma_msg *headerp;
+ ssize_t hdrlen;
+ size_t rpclen;
+ __be32 *iptr;
#if defined(CONFIG_SUNRPC_BACKCHANNEL)
if (test_bit(RPC_BC_PA_IN_USE, &rqst->rq_bc_pa_state))
return rpcrdma_bc_marshal_reply(rqst);
#endif
- /*
- * rpclen gets amount of data in first buffer, which is the
- * pre-registered buffer.
- */
- base = rqst->rq_svec[0].iov_base;
- rpclen = rqst->rq_svec[0].iov_len;
-
headerp = rdmab_to_msg(req->rl_rdmabuf);
/* don't byte-swap XID, it's already done in request */
headerp->rm_xid = rqst->rq_xid;
@@ -485,15 +586,16 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
/*
* Chunks needed for results?
*
- * o Read ops return data as write chunk(s), header as inline.
* o If the expected result is under the inline threshold, all ops
* return as inline.
+ * o Large read ops return data as write chunk(s), header as
+ * inline.
* o Large non-read ops return as a single reply chunk.
*/
- if (rqst->rq_rcv_buf.flags & XDRBUF_READ)
- wtype = rpcrdma_writech;
- else if (rpcrdma_results_inline(rqst))
+ if (rpcrdma_results_inline(r_xprt, rqst))
wtype = rpcrdma_noch;
+ else if (rqst->rq_rcv_buf.flags & XDRBUF_READ)
+ wtype = rpcrdma_writech;
else
wtype = rpcrdma_replych;
@@ -511,10 +613,14 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
* that both has a data payload, and whose non-data arguments
* by themselves are larger than the inline threshold.
*/
- if (rpcrdma_args_inline(rqst)) {
+ if (rpcrdma_args_inline(r_xprt, rqst)) {
rtype = rpcrdma_noch;
+ rpcrdma_inline_pullup(rqst);
+ rpclen = rqst->rq_svec[0].iov_len;
} else if (rqst->rq_snd_buf.flags & XDRBUF_WRITE) {
rtype = rpcrdma_readch;
+ rpclen = rqst->rq_svec[0].iov_len;
+ rpclen += rpcrdma_tail_pullup(&rqst->rq_snd_buf);
} else {
r_xprt->rx_stats.nomsg_call_count++;
headerp->rm_type = htonl(RDMA_NOMSG);
@@ -522,57 +628,50 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
rpclen = 0;
}
- /* The following simplification is not true forever */
- if (rtype != rpcrdma_noch && wtype == rpcrdma_replych)
- wtype = rpcrdma_noch;
- if (rtype != rpcrdma_noch && wtype != rpcrdma_noch) {
- dprintk("RPC: %s: cannot marshal multiple chunk lists\n",
- __func__);
- return -EIO;
- }
-
- hdrlen = RPCRDMA_HDRLEN_MIN;
-
- /*
- * Pull up any extra send data into the preregistered buffer.
- * When padding is in use and applies to the transfer, insert
- * it and change the message type.
+ /* This implementation supports the following combinations
+ * of chunk lists in one RPC-over-RDMA Call message:
+ *
+ * - Read list
+ * - Write list
+ * - Reply chunk
+ * - Read list + Reply chunk
+ *
+ * It might not yet support the following combinations:
+ *
+ * - Read list + Write list
+ *
+ * It does not support the following combinations:
+ *
+ * - Write list + Reply chunk
+ * - Read list + Write list + Reply chunk
+ *
+ * This implementation supports only a single chunk in each
+ * Read or Write list. Thus for example the client cannot
+ * send a Call message with a Position Zero Read chunk and a
+ * regular Read chunk at the same time.
*/
- if (rtype == rpcrdma_noch) {
-
- rpcrdma_inline_pullup(rqst);
-
- headerp->rm_body.rm_nochunks.rm_empty[0] = xdr_zero;
- headerp->rm_body.rm_nochunks.rm_empty[1] = xdr_zero;
- headerp->rm_body.rm_nochunks.rm_empty[2] = xdr_zero;
- /* new length after pullup */
- rpclen = rqst->rq_svec[0].iov_len;
- } else if (rtype == rpcrdma_readch)
- rpclen += rpcrdma_tail_pullup(&rqst->rq_snd_buf);
- if (rtype != rpcrdma_noch) {
- hdrlen = rpcrdma_create_chunks(rqst, &rqst->rq_snd_buf,
- headerp, rtype);
- wtype = rtype; /* simplify dprintk */
-
- } else if (wtype != rpcrdma_noch) {
- hdrlen = rpcrdma_create_chunks(rqst, &rqst->rq_rcv_buf,
- headerp, wtype);
- }
- if (hdrlen < 0)
- return hdrlen;
+ req->rl_nchunks = 0;
+ req->rl_nextseg = req->rl_segments;
+ iptr = headerp->rm_body.rm_chunks;
+ iptr = rpcrdma_encode_read_list(r_xprt, req, rqst, iptr, rtype);
+ if (IS_ERR(iptr))
+ goto out_unmap;
+ iptr = rpcrdma_encode_write_list(r_xprt, req, rqst, iptr, wtype);
+ if (IS_ERR(iptr))
+ goto out_unmap;
+ iptr = rpcrdma_encode_reply_chunk(r_xprt, req, rqst, iptr, wtype);
+ if (IS_ERR(iptr))
+ goto out_unmap;
+ hdrlen = (unsigned char *)iptr - (unsigned char *)headerp;
+
+ if (hdrlen + rpclen > RPCRDMA_INLINE_WRITE_THRESHOLD(rqst))
+ goto out_overflow;
+
+ dprintk("RPC: %5u %s: %s/%s: hdrlen %zd rpclen %zd\n",
+ rqst->rq_task->tk_pid, __func__,
+ transfertypes[rtype], transfertypes[wtype],
+ hdrlen, rpclen);
- dprintk("RPC: %s: %s: hdrlen %zd rpclen %zd"
- " headerp 0x%p base 0x%p lkey 0x%x\n",
- __func__, transfertypes[wtype], hdrlen, rpclen,
- headerp, base, rdmab_lkey(req->rl_rdmabuf));
-
- /*
- * initialize send_iov's - normally only two: rdma chunk header and
- * single preregistered RPC header buffer, but if padding is present,
- * then use a preregistered (and zeroed) pad buffer between the RPC
- * header and any write data. In all non-rdma cases, any following
- * data has been copied into the RPC header buffer.
- */
req->rl_send_iov[0].addr = rdmab_addr(req->rl_rdmabuf);
req->rl_send_iov[0].length = hdrlen;
req->rl_send_iov[0].lkey = rdmab_lkey(req->rl_rdmabuf);
@@ -587,6 +686,18 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
req->rl_niovs = 2;
return 0;
+
+out_overflow:
+ pr_err("rpcrdma: send overflow: hdrlen %zd rpclen %zu %s/%s\n",
+ hdrlen, rpclen, transfertypes[rtype], transfertypes[wtype]);
+ /* Terminate this RPC. Chunks registered above will be
+ * released by xprt_release -> xprt_rmda_free .
+ */
+ return -EIO;
+
+out_unmap:
+ r_xprt->rx_ia.ri_ops->ro_unmap_safe(r_xprt, req, false);
+ return PTR_ERR(iptr);
}
/*
diff --git a/net/sunrpc/xprtrdma/svc_rdma_marshal.c b/net/sunrpc/xprtrdma/svc_rdma_marshal.c
index 765bca47c74d..0ba9887f3e22 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_marshal.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_marshal.c
@@ -145,19 +145,32 @@ static __be32 *decode_reply_array(__be32 *va, __be32 *vaend)
return (__be32 *)&ary->wc_array[nchunks];
}
-int svc_rdma_xdr_decode_req(struct rpcrdma_msg *rmsgp, struct svc_rqst *rqstp)
+/**
+ * svc_rdma_xdr_decode_req - Parse incoming RPC-over-RDMA header
+ * @rq_arg: Receive buffer
+ *
+ * On entry, xdr->head[0].iov_base points to first byte in the
+ * RPC-over-RDMA header.
+ *
+ * On successful exit, head[0] points to first byte past the
+ * RPC-over-RDMA header. For RDMA_MSG, this is the RPC message.
+ * The length of the RPC-over-RDMA header is returned.
+ */
+int svc_rdma_xdr_decode_req(struct xdr_buf *rq_arg)
{
+ struct rpcrdma_msg *rmsgp;
__be32 *va, *vaend;
unsigned int len;
u32 hdr_len;
/* Verify that there's enough bytes for header + something */
- if (rqstp->rq_arg.len <= RPCRDMA_HDRLEN_ERR) {
+ if (rq_arg->len <= RPCRDMA_HDRLEN_ERR) {
dprintk("svcrdma: header too short = %d\n",
- rqstp->rq_arg.len);
+ rq_arg->len);
return -EINVAL;
}
+ rmsgp = (struct rpcrdma_msg *)rq_arg->head[0].iov_base;
if (rmsgp->rm_vers != rpcrdma_version) {
dprintk("%s: bad version %u\n", __func__,
be32_to_cpu(rmsgp->rm_vers));
@@ -189,10 +202,10 @@ int svc_rdma_xdr_decode_req(struct rpcrdma_msg *rmsgp, struct svc_rqst *rqstp)
be32_to_cpu(rmsgp->rm_body.rm_padded.rm_thresh);
va = &rmsgp->rm_body.rm_padded.rm_pempty[4];
- rqstp->rq_arg.head[0].iov_base = va;
+ rq_arg->head[0].iov_base = va;
len = (u32)((unsigned long)va - (unsigned long)rmsgp);
- rqstp->rq_arg.head[0].iov_len -= len;
- if (len > rqstp->rq_arg.len)
+ rq_arg->head[0].iov_len -= len;
+ if (len > rq_arg->len)
return -EINVAL;
return len;
default:
@@ -205,7 +218,7 @@ int svc_rdma_xdr_decode_req(struct rpcrdma_msg *rmsgp, struct svc_rqst *rqstp)
* chunk list and a reply chunk list.
*/
va = &rmsgp->rm_body.rm_chunks[0];
- vaend = (__be32 *)((unsigned long)rmsgp + rqstp->rq_arg.len);
+ vaend = (__be32 *)((unsigned long)rmsgp + rq_arg->len);
va = decode_read_list(va, vaend);
if (!va) {
dprintk("svcrdma: failed to decode read list\n");
@@ -222,10 +235,9 @@ int svc_rdma_xdr_decode_req(struct rpcrdma_msg *rmsgp, struct svc_rqst *rqstp)
return -EINVAL;
}
- rqstp->rq_arg.head[0].iov_base = va;
+ rq_arg->head[0].iov_base = va;
hdr_len = (unsigned long)va - (unsigned long)rmsgp;
- rqstp->rq_arg.head[0].iov_len -= hdr_len;
-
+ rq_arg->head[0].iov_len -= hdr_len;
return hdr_len;
}
diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
index 3b24a646eb46..2c25606f2561 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
@@ -281,7 +281,7 @@ int rdma_read_chunk_frmr(struct svcxprt_rdma *xprt,
}
atomic_inc(&xprt->sc_dma_used);
- n = ib_map_mr_sg(frmr->mr, frmr->sg, frmr->sg_nents, PAGE_SIZE);
+ n = ib_map_mr_sg(frmr->mr, frmr->sg, frmr->sg_nents, NULL, PAGE_SIZE);
if (unlikely(n != frmr->sg_nents)) {
pr_err("svcrdma: failed to map mr %p (%d/%d elements)\n",
frmr->mr, n, frmr->sg_nents);
@@ -447,10 +447,8 @@ static int rdma_read_chunks(struct svcxprt_rdma *xprt,
head->arg.len = rqstp->rq_arg.len;
head->arg.buflen = rqstp->rq_arg.buflen;
- ch = (struct rpcrdma_read_chunk *)&rmsgp->rm_body.rm_chunks[0];
- position = be32_to_cpu(ch->rc_position);
-
/* RDMA_NOMSG: RDMA READ data should land just after RDMA RECV data */
+ position = be32_to_cpu(ch->rc_position);
if (position == 0) {
head->arg.pages = &head->pages[0];
page_offset = head->byte_len;
@@ -488,7 +486,7 @@ static int rdma_read_chunks(struct svcxprt_rdma *xprt,
if (page_offset & 3) {
u32 pad = 4 - (page_offset & 3);
- head->arg.page_len += pad;
+ head->arg.tail[0].iov_len += pad;
head->arg.len += pad;
head->arg.buflen += pad;
page_offset += pad;
@@ -510,11 +508,10 @@ static int rdma_read_chunks(struct svcxprt_rdma *xprt,
return ret;
}
-static int rdma_read_complete(struct svc_rqst *rqstp,
- struct svc_rdma_op_ctxt *head)
+static void rdma_read_complete(struct svc_rqst *rqstp,
+ struct svc_rdma_op_ctxt *head)
{
int page_no;
- int ret;
/* Copy RPC pages */
for (page_no = 0; page_no < head->count; page_no++) {
@@ -550,23 +547,6 @@ static int rdma_read_complete(struct svc_rqst *rqstp,
rqstp->rq_arg.tail[0] = head->arg.tail[0];
rqstp->rq_arg.len = head->arg.len;
rqstp->rq_arg.buflen = head->arg.buflen;
-
- /* Free the context */
- svc_rdma_put_context(head, 0);
-
- /* XXX: What should this be? */
- rqstp->rq_prot = IPPROTO_MAX;
- svc_xprt_copy_addrs(rqstp, rqstp->rq_xprt);
-
- ret = rqstp->rq_arg.head[0].iov_len
- + rqstp->rq_arg.page_len
- + rqstp->rq_arg.tail[0].iov_len;
- dprintk("svcrdma: deferred read ret=%d, rq_arg.len=%u, "
- "rq_arg.head[0].iov_base=%p, rq_arg.head[0].iov_len=%zu\n",
- ret, rqstp->rq_arg.len, rqstp->rq_arg.head[0].iov_base,
- rqstp->rq_arg.head[0].iov_len);
-
- return ret;
}
/* By convention, backchannel calls arrive via rdma_msg type
@@ -624,7 +604,8 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp)
dto_q);
list_del_init(&ctxt->dto_q);
spin_unlock_bh(&rdma_xprt->sc_rq_dto_lock);
- return rdma_read_complete(rqstp, ctxt);
+ rdma_read_complete(rqstp, ctxt);
+ goto complete;
} else if (!list_empty(&rdma_xprt->sc_rq_dto_q)) {
ctxt = list_entry(rdma_xprt->sc_rq_dto_q.next,
struct svc_rdma_op_ctxt,
@@ -655,7 +636,7 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp)
/* Decode the RDMA header. */
rmsgp = (struct rpcrdma_msg *)rqstp->rq_arg.head[0].iov_base;
- ret = svc_rdma_xdr_decode_req(rmsgp, rqstp);
+ ret = svc_rdma_xdr_decode_req(&rqstp->rq_arg);
if (ret < 0)
goto out_err;
if (ret == 0)
@@ -682,6 +663,7 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp)
return 0;
}
+complete:
ret = rqstp->rq_arg.head[0].iov_len
+ rqstp->rq_arg.page_len
+ rqstp->rq_arg.tail[0].iov_len;
diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
index 4f1b1c4f45f9..54d533300620 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
@@ -463,25 +463,21 @@ static int send_reply(struct svcxprt_rdma *rdma,
struct svc_rqst *rqstp,
struct page *page,
struct rpcrdma_msg *rdma_resp,
- struct svc_rdma_op_ctxt *ctxt,
struct svc_rdma_req_map *vec,
int byte_count)
{
+ struct svc_rdma_op_ctxt *ctxt;
struct ib_send_wr send_wr;
u32 xdr_off;
int sge_no;
int sge_bytes;
int page_no;
int pages;
- int ret;
-
- ret = svc_rdma_repost_recv(rdma, GFP_KERNEL);
- if (ret) {
- svc_rdma_put_context(ctxt, 0);
- return -ENOTCONN;
- }
+ int ret = -EIO;
/* Prepare the context */
+ ctxt = svc_rdma_get_context(rdma);
+ ctxt->direction = DMA_TO_DEVICE;
ctxt->pages[0] = page;
ctxt->count = 1;
@@ -565,8 +561,7 @@ static int send_reply(struct svcxprt_rdma *rdma,
err:
svc_rdma_unmap_dma(ctxt);
svc_rdma_put_context(ctxt, 1);
- pr_err("svcrdma: failed to send reply, rc=%d\n", ret);
- return -EIO;
+ return ret;
}
void svc_rdma_prep_reply_hdr(struct svc_rqst *rqstp)
@@ -585,7 +580,6 @@ int svc_rdma_sendto(struct svc_rqst *rqstp)
int ret;
int inline_bytes;
struct page *res_page;
- struct svc_rdma_op_ctxt *ctxt;
struct svc_rdma_req_map *vec;
dprintk("svcrdma: sending response for rqstp=%p\n", rqstp);
@@ -598,8 +592,6 @@ int svc_rdma_sendto(struct svc_rqst *rqstp)
rp_ary = svc_rdma_get_reply_array(rdma_argp, wr_ary);
/* Build an req vec for the XDR */
- ctxt = svc_rdma_get_context(rdma);
- ctxt->direction = DMA_TO_DEVICE;
vec = svc_rdma_get_req_map(rdma);
ret = svc_rdma_map_xdr(rdma, &rqstp->rq_res, vec, wr_ary != NULL);
if (ret)
@@ -635,7 +627,12 @@ int svc_rdma_sendto(struct svc_rqst *rqstp)
inline_bytes -= ret;
}
- ret = send_reply(rdma, rqstp, res_page, rdma_resp, ctxt, vec,
+ /* Post a fresh Receive buffer _before_ sending the reply */
+ ret = svc_rdma_post_recv(rdma, GFP_KERNEL);
+ if (ret)
+ goto err1;
+
+ ret = send_reply(rdma, rqstp, res_page, rdma_resp, vec,
inline_bytes);
if (ret < 0)
goto err1;
@@ -648,7 +645,8 @@ int svc_rdma_sendto(struct svc_rqst *rqstp)
put_page(res_page);
err0:
svc_rdma_put_req_map(rdma, vec);
- svc_rdma_put_context(ctxt, 0);
+ pr_err("svcrdma: Could not send reply, err=%d. Closing transport.\n",
+ ret);
set_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags);
return -ENOTCONN;
}
diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
index 90668969d559..dd9440137834 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
@@ -789,7 +789,7 @@ static struct svc_xprt *svc_rdma_create(struct svc_serv *serv,
int ret;
dprintk("svcrdma: Creating RDMA socket\n");
- if (sa->sa_family != AF_INET) {
+ if ((sa->sa_family != AF_INET) && (sa->sa_family != AF_INET6)) {
dprintk("svcrdma: Address family %d is not supported.\n", sa->sa_family);
return ERR_PTR(-EAFNOSUPPORT);
}
@@ -805,6 +805,16 @@ static struct svc_xprt *svc_rdma_create(struct svc_serv *serv,
goto err0;
}
+ /* Allow both IPv4 and IPv6 sockets to bind a single port
+ * at the same time.
+ */
+#if IS_ENABLED(CONFIG_IPV6)
+ ret = rdma_set_afonly(listen_id, 1);
+ if (ret) {
+ dprintk("svcrdma: rdma_set_afonly failed = %d\n", ret);
+ goto err1;
+ }
+#endif
ret = rdma_bind_addr(listen_id, sa);
if (ret) {
dprintk("svcrdma: rdma_bind_addr failed = %d\n", ret);
@@ -1073,7 +1083,7 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
newxprt->sc_dev_caps |= SVCRDMA_DEVCAP_READ_W_INV;
/* Post receive buffers */
- for (i = 0; i < newxprt->sc_rq_depth; i++) {
+ for (i = 0; i < newxprt->sc_max_requests; i++) {
ret = svc_rdma_post_recv(newxprt, GFP_KERNEL);
if (ret) {
dprintk("svcrdma: failure posting receive buffers\n");
@@ -1170,6 +1180,9 @@ static void __svc_rdma_free(struct work_struct *work)
dprintk("svcrdma: %s(%p)\n", __func__, rdma);
+ if (rdma->sc_qp && !IS_ERR(rdma->sc_qp))
+ ib_drain_qp(rdma->sc_qp);
+
/* We should only be called from kref_put */
if (atomic_read(&xprt->xpt_ref.refcount) != 0)
pr_err("svcrdma: sc_xprt still in use? (%d)\n",
diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
index b1b009f10ea3..99d2e5b72726 100644
--- a/net/sunrpc/xprtrdma/transport.c
+++ b/net/sunrpc/xprtrdma/transport.c
@@ -73,6 +73,8 @@ static unsigned int xprt_rdma_memreg_strategy = RPCRDMA_FRMR;
static unsigned int min_slot_table_size = RPCRDMA_MIN_SLOT_TABLE;
static unsigned int max_slot_table_size = RPCRDMA_MAX_SLOT_TABLE;
+static unsigned int min_inline_size = RPCRDMA_MIN_INLINE;
+static unsigned int max_inline_size = RPCRDMA_MAX_INLINE;
static unsigned int zero;
static unsigned int max_padding = PAGE_SIZE;
static unsigned int min_memreg = RPCRDMA_BOUNCEBUFFERS;
@@ -96,6 +98,8 @@ static struct ctl_table xr_tunables_table[] = {
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec,
+ .extra1 = &min_inline_size,
+ .extra2 = &max_inline_size,
},
{
.procname = "rdma_max_inline_write",
@@ -103,6 +107,8 @@ static struct ctl_table xr_tunables_table[] = {
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec,
+ .extra1 = &min_inline_size,
+ .extra2 = &max_inline_size,
},
{
.procname = "rdma_inline_write_padding",
@@ -508,6 +514,7 @@ xprt_rdma_allocate(struct rpc_task *task, size_t size)
out:
dprintk("RPC: %s: size %zd, request 0x%p\n", __func__, size, req);
req->rl_connect_cookie = 0; /* our reserved value */
+ req->rl_task = task;
return req->rl_sendbuf->rg_base;
out_rdmabuf:
@@ -564,7 +571,6 @@ xprt_rdma_free(void *buffer)
struct rpcrdma_req *req;
struct rpcrdma_xprt *r_xprt;
struct rpcrdma_regbuf *rb;
- int i;
if (buffer == NULL)
return;
@@ -578,11 +584,8 @@ xprt_rdma_free(void *buffer)
dprintk("RPC: %s: called on 0x%p\n", __func__, req->rl_reply);
- for (i = 0; req->rl_nchunks;) {
- --req->rl_nchunks;
- i += r_xprt->rx_ia.ri_ops->ro_unmap(r_xprt,
- &req->rl_segments[i]);
- }
+ r_xprt->rx_ia.ri_ops->ro_unmap_safe(r_xprt, req,
+ !RPC_IS_ASYNC(req->rl_task));
rpcrdma_buffer_put(req);
}
@@ -707,6 +710,7 @@ static struct rpc_xprt_ops xprt_rdma_procs = {
#if defined(CONFIG_SUNRPC_BACKCHANNEL)
.bc_setup = xprt_rdma_bc_setup,
.bc_up = xprt_rdma_bc_up,
+ .bc_maxpayload = xprt_rdma_bc_maxpayload,
.bc_free_rqst = xprt_rdma_bc_free_rqst,
.bc_destroy = xprt_rdma_bc_destroy,
#endif
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index f5ed9f982cd7..b044d98a1370 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -203,15 +203,6 @@ out_fail:
goto out_schedule;
}
-static void
-rpcrdma_flush_cqs(struct rpcrdma_ep *ep)
-{
- struct ib_wc wc;
-
- while (ib_poll_cq(ep->rep_attr.recv_cq, 1, &wc) > 0)
- rpcrdma_receive_wc(NULL, &wc);
-}
-
static int
rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event)
{
@@ -374,23 +365,6 @@ out:
}
/*
- * Drain any cq, prior to teardown.
- */
-static void
-rpcrdma_clean_cq(struct ib_cq *cq)
-{
- struct ib_wc wc;
- int count = 0;
-
- while (1 == ib_poll_cq(cq, 1, &wc))
- ++count;
-
- if (count)
- dprintk("RPC: %s: flushed %d events (last 0x%x)\n",
- __func__, count, wc.opcode);
-}
-
-/*
* Exported functions.
*/
@@ -459,7 +433,6 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
dprintk("RPC: %s: memory registration strategy is '%s'\n",
__func__, ia->ri_ops->ro_displayname);
- rwlock_init(&ia->ri_qplock);
return 0;
out3:
@@ -515,7 +488,7 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
__func__);
return -ENOMEM;
}
- max_qp_wr = ia->ri_device->attrs.max_qp_wr - RPCRDMA_BACKWARD_WRS;
+ max_qp_wr = ia->ri_device->attrs.max_qp_wr - RPCRDMA_BACKWARD_WRS - 1;
/* check provider's send/recv wr limits */
if (cdata->max_requests > max_qp_wr)
@@ -526,11 +499,13 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
ep->rep_attr.srq = NULL;
ep->rep_attr.cap.max_send_wr = cdata->max_requests;
ep->rep_attr.cap.max_send_wr += RPCRDMA_BACKWARD_WRS;
+ ep->rep_attr.cap.max_send_wr += 1; /* drain cqe */
rc = ia->ri_ops->ro_open(ia, ep, cdata);
if (rc)
return rc;
ep->rep_attr.cap.max_recv_wr = cdata->max_requests;
ep->rep_attr.cap.max_recv_wr += RPCRDMA_BACKWARD_WRS;
+ ep->rep_attr.cap.max_recv_wr += 1; /* drain cqe */
ep->rep_attr.cap.max_send_sge = RPCRDMA_MAX_IOVS;
ep->rep_attr.cap.max_recv_sge = 1;
ep->rep_attr.cap.max_inline_data = 0;
@@ -578,6 +553,7 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
ep->rep_attr.recv_cq = recvcq;
/* Initialize cma parameters */
+ memset(&ep->rep_remote_cma, 0, sizeof(ep->rep_remote_cma));
/* RPC/RDMA does not use private data */
ep->rep_remote_cma.private_data = NULL;
@@ -591,7 +567,16 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
ep->rep_remote_cma.responder_resources =
ia->ri_device->attrs.max_qp_rd_atom;
- ep->rep_remote_cma.retry_count = 7;
+ /* Limit transport retries so client can detect server
+ * GID changes quickly. RPC layer handles re-establishing
+ * transport connection and retransmission.
+ */
+ ep->rep_remote_cma.retry_count = 6;
+
+ /* RPC-over-RDMA handles its own flow control. In addition,
+ * make all RNR NAKs visible so we know that RPC-over-RDMA
+ * flow control is working correctly (no NAKs should be seen).
+ */
ep->rep_remote_cma.flow_control = 0;
ep->rep_remote_cma.rnr_retry_count = 0;
@@ -622,13 +607,8 @@ rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
cancel_delayed_work_sync(&ep->rep_connect_worker);
- if (ia->ri_id->qp)
- rpcrdma_ep_disconnect(ep, ia);
-
- rpcrdma_clean_cq(ep->rep_attr.recv_cq);
- rpcrdma_clean_cq(ep->rep_attr.send_cq);
-
if (ia->ri_id->qp) {
+ rpcrdma_ep_disconnect(ep, ia);
rdma_destroy_qp(ia->ri_id);
ia->ri_id->qp = NULL;
}
@@ -659,7 +639,6 @@ retry:
dprintk("RPC: %s: reconnecting...\n", __func__);
rpcrdma_ep_disconnect(ep, ia);
- rpcrdma_flush_cqs(ep);
xprt = container_of(ia, struct rpcrdma_xprt, rx_ia);
id = rpcrdma_create_id(xprt, ia,
@@ -692,10 +671,8 @@ retry:
goto out;
}
- write_lock(&ia->ri_qplock);
old = ia->ri_id;
ia->ri_id = id;
- write_unlock(&ia->ri_qplock);
rdma_destroy_qp(old);
rpcrdma_destroy_id(old);
@@ -785,7 +762,6 @@ rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
{
int rc;
- rpcrdma_flush_cqs(ep);
rc = rdma_disconnect(ia->ri_id);
if (!rc) {
/* returns without wait if not connected */
@@ -797,6 +773,8 @@ rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
dprintk("RPC: %s: rdma_disconnect %i\n", __func__, rc);
ep->rep_connected = rc;
}
+
+ ib_drain_qp(ia->ri_id->qp);
}
struct rpcrdma_req *
@@ -1271,25 +1249,3 @@ out_rc:
rpcrdma_recv_buffer_put(rep);
return rc;
}
-
-/* How many chunk list items fit within our inline buffers?
- */
-unsigned int
-rpcrdma_max_segments(struct rpcrdma_xprt *r_xprt)
-{
- struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data;
- int bytes, segments;
-
- bytes = min_t(unsigned int, cdata->inline_wsize, cdata->inline_rsize);
- bytes -= RPCRDMA_HDRLEN_MIN;
- if (bytes < sizeof(struct rpcrdma_segment) * 2) {
- pr_warn("RPC: %s: inline threshold too small\n",
- __func__);
- return 0;
- }
-
- segments = 1 << (fls(bytes / sizeof(struct rpcrdma_segment)) - 1);
- dprintk("RPC: %s: max chunk list size = %d segments\n",
- __func__, segments);
- return segments;
-}
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index 2ebc743cb96f..95cdc66225ee 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -65,7 +65,6 @@
*/
struct rpcrdma_ia {
const struct rpcrdma_memreg_ops *ri_ops;
- rwlock_t ri_qplock;
struct ib_device *ri_device;
struct rdma_cm_id *ri_id;
struct ib_pd *ri_pd;
@@ -73,6 +72,8 @@ struct rpcrdma_ia {
struct completion ri_done;
int ri_async_rc;
unsigned int ri_max_frmr_depth;
+ unsigned int ri_max_inline_write;
+ unsigned int ri_max_inline_read;
struct ib_qp_attr ri_qp_attr;
struct ib_qp_init_attr ri_qp_init_attr;
};
@@ -144,6 +145,26 @@ rdmab_to_msg(struct rpcrdma_regbuf *rb)
#define RPCRDMA_DEF_GFP (GFP_NOIO | __GFP_NOWARN)
+/* To ensure a transport can always make forward progress,
+ * the number of RDMA segments allowed in header chunk lists
+ * is capped at 8. This prevents less-capable devices and
+ * memory registrations from overrunning the Send buffer
+ * while building chunk lists.
+ *
+ * Elements of the Read list take up more room than the
+ * Write list or Reply chunk. 8 read segments means the Read
+ * list (or Write list or Reply chunk) cannot consume more
+ * than
+ *
+ * ((8 + 2) * read segment size) + 1 XDR words, or 244 bytes.
+ *
+ * And the fixed part of the header is another 24 bytes.
+ *
+ * The smallest inline threshold is 1024 bytes, ensuring that
+ * at least 750 bytes are available for RPC messages.
+ */
+#define RPCRDMA_MAX_HDR_SEGS (8)
+
/*
* struct rpcrdma_rep -- this structure encapsulates state required to recv
* and complete a reply, asychronously. It needs several pieces of
@@ -162,7 +183,9 @@ rdmab_to_msg(struct rpcrdma_regbuf *rb)
*/
#define RPCRDMA_MAX_DATA_SEGS ((1 * 1024 * 1024) / PAGE_SIZE)
-#define RPCRDMA_MAX_SEGS (RPCRDMA_MAX_DATA_SEGS + 2) /* head+tail = 2 */
+
+/* data segments + head/tail for Call + head/tail for Reply */
+#define RPCRDMA_MAX_SEGS (RPCRDMA_MAX_DATA_SEGS + 4)
struct rpcrdma_buffer;
@@ -198,14 +221,13 @@ enum rpcrdma_frmr_state {
};
struct rpcrdma_frmr {
- struct scatterlist *sg;
- int sg_nents;
+ struct scatterlist *fr_sg;
+ int fr_nents;
+ enum dma_data_direction fr_dir;
struct ib_mr *fr_mr;
struct ib_cqe fr_cqe;
enum rpcrdma_frmr_state fr_state;
struct completion fr_linv_done;
- struct work_struct fr_work;
- struct rpcrdma_xprt *fr_xprt;
union {
struct ib_reg_wr fr_regwr;
struct ib_send_wr fr_invwr;
@@ -222,6 +244,8 @@ struct rpcrdma_mw {
struct rpcrdma_fmr fmr;
struct rpcrdma_frmr frmr;
};
+ struct work_struct mw_work;
+ struct rpcrdma_xprt *mw_xprt;
struct list_head mw_list;
struct list_head mw_all;
};
@@ -270,12 +294,14 @@ struct rpcrdma_req {
unsigned int rl_niovs;
unsigned int rl_nchunks;
unsigned int rl_connect_cookie;
+ struct rpc_task *rl_task;
struct rpcrdma_buffer *rl_buffer;
struct rpcrdma_rep *rl_reply;/* holder for reply buffer */
struct ib_sge rl_send_iov[RPCRDMA_MAX_IOVS];
struct rpcrdma_regbuf *rl_rdmabuf;
struct rpcrdma_regbuf *rl_sendbuf;
struct rpcrdma_mr_seg rl_segments[RPCRDMA_MAX_SEGS];
+ struct rpcrdma_mr_seg *rl_nextseg;
struct ib_cqe rl_cqe;
struct list_head rl_all;
@@ -372,8 +398,8 @@ struct rpcrdma_memreg_ops {
struct rpcrdma_mr_seg *, int, bool);
void (*ro_unmap_sync)(struct rpcrdma_xprt *,
struct rpcrdma_req *);
- int (*ro_unmap)(struct rpcrdma_xprt *,
- struct rpcrdma_mr_seg *);
+ void (*ro_unmap_safe)(struct rpcrdma_xprt *,
+ struct rpcrdma_req *, bool);
int (*ro_open)(struct rpcrdma_ia *,
struct rpcrdma_ep *,
struct rpcrdma_create_data_internal *);
@@ -456,7 +482,6 @@ struct rpcrdma_regbuf *rpcrdma_alloc_regbuf(struct rpcrdma_ia *,
void rpcrdma_free_regbuf(struct rpcrdma_ia *,
struct rpcrdma_regbuf *);
-unsigned int rpcrdma_max_segments(struct rpcrdma_xprt *);
int rpcrdma_ep_post_extra_recv(struct rpcrdma_xprt *, unsigned int);
int frwr_alloc_recovery_wq(void);
@@ -519,6 +544,9 @@ void rpcrdma_reply_handler(struct rpcrdma_rep *);
* RPC/RDMA protocol calls - xprtrdma/rpc_rdma.c
*/
int rpcrdma_marshal_req(struct rpc_rqst *);
+void rpcrdma_set_max_header_sizes(struct rpcrdma_ia *,
+ struct rpcrdma_create_data_internal *,
+ unsigned int);
/* RPC/RDMA module init - xprtrdma/transport.c
*/
@@ -534,6 +562,7 @@ void xprt_rdma_cleanup(void);
#if defined(CONFIG_SUNRPC_BACKCHANNEL)
int xprt_rdma_bc_setup(struct rpc_xprt *, unsigned int);
int xprt_rdma_bc_up(struct svc_serv *, struct net *);
+size_t xprt_rdma_bc_maxpayload(struct rpc_xprt *);
int rpcrdma_bc_post_recv(struct rpcrdma_xprt *, unsigned int);
void rpcrdma_bc_receive_call(struct rpcrdma_xprt *, struct rpcrdma_rep *);
int rpcrdma_bc_marshal_reply(struct rpc_rqst *);
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index b90c5397b5e1..7e2b2fa189c3 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -1364,6 +1364,11 @@ static int xs_tcp_bc_up(struct svc_serv *serv, struct net *net)
return ret;
return 0;
}
+
+static size_t xs_tcp_bc_maxpayload(struct rpc_xprt *xprt)
+{
+ return PAGE_SIZE;
+}
#else
static inline int _xs_tcp_read_data(struct rpc_xprt *xprt,
struct xdr_skb_reader *desc)
@@ -2661,6 +2666,7 @@ static struct rpc_xprt_ops xs_tcp_ops = {
#ifdef CONFIG_SUNRPC_BACKCHANNEL
.bc_setup = xprt_setup_bc,
.bc_up = xs_tcp_bc_up,
+ .bc_maxpayload = xs_tcp_bc_maxpayload,
.bc_free_rqst = xprt_free_bc_rqst,
.bc_destroy = xprt_destroy_bc,
#endif
@@ -3051,6 +3057,7 @@ static struct rpc_xprt *xs_setup_bc_tcp(struct xprt_create *args)
return xprt;
args->bc_xprt->xpt_bc_xprt = NULL;
+ args->bc_xprt->xpt_bc_xps = NULL;
xprt_put(xprt);
ret = ERR_PTR(-EINVAL);
out_err:
diff --git a/net/tipc/netlink_compat.c b/net/tipc/netlink_compat.c
index 4dfc5c14f8c3..3ad9fab1985f 100644
--- a/net/tipc/netlink_compat.c
+++ b/net/tipc/netlink_compat.c
@@ -346,9 +346,15 @@ static int tipc_nl_compat_bearer_dump(struct tipc_nl_compat_msg *msg,
struct nlattr **attrs)
{
struct nlattr *bearer[TIPC_NLA_BEARER_MAX + 1];
+ int err;
+
+ if (!attrs[TIPC_NLA_BEARER])
+ return -EINVAL;
- nla_parse_nested(bearer, TIPC_NLA_BEARER_MAX, attrs[TIPC_NLA_BEARER],
- NULL);
+ err = nla_parse_nested(bearer, TIPC_NLA_BEARER_MAX,
+ attrs[TIPC_NLA_BEARER], NULL);
+ if (err)
+ return err;
return tipc_add_tlv(msg->rep, TIPC_TLV_BEARER_NAME,
nla_data(bearer[TIPC_NLA_BEARER_NAME]),
@@ -460,14 +466,31 @@ static int tipc_nl_compat_link_stat_dump(struct tipc_nl_compat_msg *msg,
struct nlattr *link[TIPC_NLA_LINK_MAX + 1];
struct nlattr *prop[TIPC_NLA_PROP_MAX + 1];
struct nlattr *stats[TIPC_NLA_STATS_MAX + 1];
+ int err;
- nla_parse_nested(link, TIPC_NLA_LINK_MAX, attrs[TIPC_NLA_LINK], NULL);
+ if (!attrs[TIPC_NLA_LINK])
+ return -EINVAL;
- nla_parse_nested(prop, TIPC_NLA_PROP_MAX, link[TIPC_NLA_LINK_PROP],
- NULL);
+ err = nla_parse_nested(link, TIPC_NLA_LINK_MAX, attrs[TIPC_NLA_LINK],
+ NULL);
+ if (err)
+ return err;
+
+ if (!link[TIPC_NLA_LINK_PROP])
+ return -EINVAL;
- nla_parse_nested(stats, TIPC_NLA_STATS_MAX, link[TIPC_NLA_LINK_STATS],
- NULL);
+ err = nla_parse_nested(prop, TIPC_NLA_PROP_MAX,
+ link[TIPC_NLA_LINK_PROP], NULL);
+ if (err)
+ return err;
+
+ if (!link[TIPC_NLA_LINK_STATS])
+ return -EINVAL;
+
+ err = nla_parse_nested(stats, TIPC_NLA_STATS_MAX,
+ link[TIPC_NLA_LINK_STATS], NULL);
+ if (err)
+ return err;
name = (char *)TLV_DATA(msg->req);
if (strcmp(name, nla_data(link[TIPC_NLA_LINK_NAME])) != 0)
@@ -569,12 +592,20 @@ static int tipc_nl_compat_link_dump(struct tipc_nl_compat_msg *msg,
{
struct nlattr *link[TIPC_NLA_LINK_MAX + 1];
struct tipc_link_info link_info;
+ int err;
- nla_parse_nested(link, TIPC_NLA_LINK_MAX, attrs[TIPC_NLA_LINK], NULL);
+ if (!attrs[TIPC_NLA_LINK])
+ return -EINVAL;
+
+ err = nla_parse_nested(link, TIPC_NLA_LINK_MAX, attrs[TIPC_NLA_LINK],
+ NULL);
+ if (err)
+ return err;
link_info.dest = nla_get_flag(link[TIPC_NLA_LINK_DEST]);
link_info.up = htonl(nla_get_flag(link[TIPC_NLA_LINK_UP]));
- strcpy(link_info.str, nla_data(link[TIPC_NLA_LINK_NAME]));
+ nla_strlcpy(link_info.str, nla_data(link[TIPC_NLA_LINK_NAME]),
+ TIPC_MAX_LINK_NAME);
return tipc_add_tlv(msg->rep, TIPC_TLV_LINK_INFO,
&link_info, sizeof(link_info));
@@ -758,12 +789,23 @@ static int tipc_nl_compat_name_table_dump(struct tipc_nl_compat_msg *msg,
u32 node, depth, type, lowbound, upbound;
static const char * const scope_str[] = {"", " zone", " cluster",
" node"};
+ int err;
- nla_parse_nested(nt, TIPC_NLA_NAME_TABLE_MAX,
- attrs[TIPC_NLA_NAME_TABLE], NULL);
+ if (!attrs[TIPC_NLA_NAME_TABLE])
+ return -EINVAL;
- nla_parse_nested(publ, TIPC_NLA_PUBL_MAX, nt[TIPC_NLA_NAME_TABLE_PUBL],
- NULL);
+ err = nla_parse_nested(nt, TIPC_NLA_NAME_TABLE_MAX,
+ attrs[TIPC_NLA_NAME_TABLE], NULL);
+ if (err)
+ return err;
+
+ if (!nt[TIPC_NLA_NAME_TABLE_PUBL])
+ return -EINVAL;
+
+ err = nla_parse_nested(publ, TIPC_NLA_PUBL_MAX,
+ nt[TIPC_NLA_NAME_TABLE_PUBL], NULL);
+ if (err)
+ return err;
ntq = (struct tipc_name_table_query *)TLV_DATA(msg->req);
@@ -815,8 +857,15 @@ static int __tipc_nl_compat_publ_dump(struct tipc_nl_compat_msg *msg,
{
u32 type, lower, upper;
struct nlattr *publ[TIPC_NLA_PUBL_MAX + 1];
+ int err;
- nla_parse_nested(publ, TIPC_NLA_PUBL_MAX, attrs[TIPC_NLA_PUBL], NULL);
+ if (!attrs[TIPC_NLA_PUBL])
+ return -EINVAL;
+
+ err = nla_parse_nested(publ, TIPC_NLA_PUBL_MAX, attrs[TIPC_NLA_PUBL],
+ NULL);
+ if (err)
+ return err;
type = nla_get_u32(publ[TIPC_NLA_PUBL_TYPE]);
lower = nla_get_u32(publ[TIPC_NLA_PUBL_LOWER]);
@@ -876,7 +925,13 @@ static int tipc_nl_compat_sk_dump(struct tipc_nl_compat_msg *msg,
u32 sock_ref;
struct nlattr *sock[TIPC_NLA_SOCK_MAX + 1];
- nla_parse_nested(sock, TIPC_NLA_SOCK_MAX, attrs[TIPC_NLA_SOCK], NULL);
+ if (!attrs[TIPC_NLA_SOCK])
+ return -EINVAL;
+
+ err = nla_parse_nested(sock, TIPC_NLA_SOCK_MAX, attrs[TIPC_NLA_SOCK],
+ NULL);
+ if (err)
+ return err;
sock_ref = nla_get_u32(sock[TIPC_NLA_SOCK_REF]);
tipc_tlv_sprintf(msg->rep, "%u:", sock_ref);
@@ -917,9 +972,15 @@ static int tipc_nl_compat_media_dump(struct tipc_nl_compat_msg *msg,
struct nlattr **attrs)
{
struct nlattr *media[TIPC_NLA_MEDIA_MAX + 1];
+ int err;
+
+ if (!attrs[TIPC_NLA_MEDIA])
+ return -EINVAL;
- nla_parse_nested(media, TIPC_NLA_MEDIA_MAX, attrs[TIPC_NLA_MEDIA],
- NULL);
+ err = nla_parse_nested(media, TIPC_NLA_MEDIA_MAX, attrs[TIPC_NLA_MEDIA],
+ NULL);
+ if (err)
+ return err;
return tipc_add_tlv(msg->rep, TIPC_TLV_MEDIA_NAME,
nla_data(media[TIPC_NLA_MEDIA_NAME]),
@@ -931,8 +992,15 @@ static int tipc_nl_compat_node_dump(struct tipc_nl_compat_msg *msg,
{
struct tipc_node_info node_info;
struct nlattr *node[TIPC_NLA_NODE_MAX + 1];
+ int err;
- nla_parse_nested(node, TIPC_NLA_NODE_MAX, attrs[TIPC_NLA_NODE], NULL);
+ if (!attrs[TIPC_NLA_NODE])
+ return -EINVAL;
+
+ err = nla_parse_nested(node, TIPC_NLA_NODE_MAX, attrs[TIPC_NLA_NODE],
+ NULL);
+ if (err)
+ return err;
node_info.addr = htonl(nla_get_u32(node[TIPC_NLA_NODE_ADDR]));
node_info.up = htonl(nla_get_flag(node[TIPC_NLA_NODE_UP]));
@@ -971,8 +1039,16 @@ static int tipc_nl_compat_net_dump(struct tipc_nl_compat_msg *msg,
{
__be32 id;
struct nlattr *net[TIPC_NLA_NET_MAX + 1];
+ int err;
+
+ if (!attrs[TIPC_NLA_NET])
+ return -EINVAL;
+
+ err = nla_parse_nested(net, TIPC_NLA_NET_MAX, attrs[TIPC_NLA_NET],
+ NULL);
+ if (err)
+ return err;
- nla_parse_nested(net, TIPC_NLA_NET_MAX, attrs[TIPC_NLA_NET], NULL);
id = htonl(nla_get_u32(net[TIPC_NLA_NET_ID]));
return tipc_add_tlv(msg->rep, TIPC_TLV_UNSIGNED, &id, sizeof(id));
diff --git a/net/tipc/server.c b/net/tipc/server.c
index 7a0af2dc0406..272d20a795d5 100644
--- a/net/tipc/server.c
+++ b/net/tipc/server.c
@@ -138,28 +138,28 @@ static void sock_data_ready(struct sock *sk)
{
struct tipc_conn *con;
- read_lock(&sk->sk_callback_lock);
+ read_lock_bh(&sk->sk_callback_lock);
con = sock2con(sk);
if (con && test_bit(CF_CONNECTED, &con->flags)) {
conn_get(con);
if (!queue_work(con->server->rcv_wq, &con->rwork))
conn_put(con);
}
- read_unlock(&sk->sk_callback_lock);
+ read_unlock_bh(&sk->sk_callback_lock);
}
static void sock_write_space(struct sock *sk)
{
struct tipc_conn *con;
- read_lock(&sk->sk_callback_lock);
+ read_lock_bh(&sk->sk_callback_lock);
con = sock2con(sk);
if (con && test_bit(CF_CONNECTED, &con->flags)) {
conn_get(con);
if (!queue_work(con->server->send_wq, &con->swork))
conn_put(con);
}
- read_unlock(&sk->sk_callback_lock);
+ read_unlock_bh(&sk->sk_callback_lock);
}
static void tipc_register_callbacks(struct socket *sock, struct tipc_conn *con)
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 80aa6a3e6817..735362c26c8e 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -315,7 +315,7 @@ static struct sock *unix_find_socket_byinode(struct inode *i)
&unix_socket_table[i->i_ino & (UNIX_HASH_SIZE - 1)]) {
struct dentry *dentry = unix_sk(s)->path.dentry;
- if (dentry && d_backing_inode(dentry) == i) {
+ if (dentry && d_real_inode(dentry) == i) {
sock_hold(s);
goto found;
}
@@ -911,7 +911,7 @@ static struct sock *unix_find_other(struct net *net,
err = kern_path(sunname->sun_path, LOOKUP_FOLLOW, &path);
if (err)
goto fail;
- inode = d_backing_inode(path.dentry);
+ inode = d_real_inode(path.dentry);
err = inode_permission(inode, MAY_WRITE);
if (err)
goto put_fail;
@@ -1048,7 +1048,7 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
goto out_up;
}
addr->hash = UNIX_HASH_SIZE;
- hash = d_backing_inode(dentry)->i_ino & (UNIX_HASH_SIZE - 1);
+ hash = d_real_inode(dentry)->i_ino & (UNIX_HASH_SIZE - 1);
spin_lock(&unix_table_lock);
u->path = u_path;
list = &unix_socket_table[hash];
diff --git a/net/wireless/core.c b/net/wireless/core.c
index d25c82bc1bbe..ecca3896b9f7 100644
--- a/net/wireless/core.c
+++ b/net/wireless/core.c
@@ -363,8 +363,6 @@ struct wiphy *wiphy_new_nm(const struct cfg80211_ops *ops, int sizeof_priv,
WARN_ON(ops->remain_on_channel && !ops->cancel_remain_on_channel);
WARN_ON(ops->tdls_channel_switch && !ops->tdls_cancel_channel_switch);
WARN_ON(ops->add_tx_ts && !ops->del_tx_ts);
- WARN_ON(ops->set_tx_power && !ops->get_tx_power);
- WARN_ON(ops->set_antenna && !ops->get_antenna);
alloc_size = sizeof(*rdev) + sizeof_priv;
diff --git a/net/wireless/util.c b/net/wireless/util.c
index 219bd197039e..4e809e978b7d 100644
--- a/net/wireless/util.c
+++ b/net/wireless/util.c
@@ -651,7 +651,7 @@ __frame_add_frag(struct sk_buff *skb, struct page *page,
struct skb_shared_info *sh = skb_shinfo(skb);
int page_offset;
- atomic_inc(&page->_count);
+ page_ref_inc(page);
page_offset = ptr - page_address(page);
skb_add_rx_frag(skb, sh->nr_frags, page, page_offset, len, size);
}
diff --git a/net/wireless/wext-core.c b/net/wireless/wext-core.c
index 6250b1cfcde5..dbb2738e356a 100644
--- a/net/wireless/wext-core.c
+++ b/net/wireless/wext-core.c
@@ -958,8 +958,29 @@ static int wireless_process_ioctl(struct net *net, struct ifreq *ifr,
return private(dev, iwr, cmd, info, handler);
}
/* Old driver API : call driver ioctl handler */
- if (dev->netdev_ops->ndo_do_ioctl)
- return dev->netdev_ops->ndo_do_ioctl(dev, ifr, cmd);
+ if (dev->netdev_ops->ndo_do_ioctl) {
+#ifdef CONFIG_COMPAT
+ if (info->flags & IW_REQUEST_FLAG_COMPAT) {
+ int ret = 0;
+ struct iwreq iwr_lcl;
+ struct compat_iw_point *iwp_compat = (void *) &iwr->u.data;
+
+ memcpy(&iwr_lcl, iwr, sizeof(struct iwreq));
+ iwr_lcl.u.data.pointer = compat_ptr(iwp_compat->pointer);
+ iwr_lcl.u.data.length = iwp_compat->length;
+ iwr_lcl.u.data.flags = iwp_compat->flags;
+
+ ret = dev->netdev_ops->ndo_do_ioctl(dev, (void *) &iwr_lcl, cmd);
+
+ iwp_compat->pointer = ptr_to_compat(iwr_lcl.u.data.pointer);
+ iwp_compat->length = iwr_lcl.u.data.length;
+ iwp_compat->flags = iwr_lcl.u.data.flags;
+
+ return ret;
+ } else
+#endif
+ return dev->netdev_ops->ndo_do_ioctl(dev, ifr, cmd);
+ }
return -EOPNOTSUPP;
}
OpenPOWER on IntegriCloud